Quick Start#

Install TopMost#

Install TopMost with pip:

$ pip install topmost

Discover topics from your own datasets#

We can get the top words of the discovered topics, topic_top_words, and the topic distributions of the documents, doc_topic_dist. The preprocessing steps are configurable; see our documentation for details.

import torch
import topmost
from topmost.preprocessing import Preprocessing

# Your own documents: one raw-text string per document.
docs = [
    "This is a document about space, including words like space, satellite, launch, orbit.",
    "This is a document about Microsoft Windows, including words like windows, files, dos.",
    # more documents...
]

# Device shared by the dataset tensors and the model.
device = 'cuda' # or 'cpu'

# Wrap the raw documents in a dataset handler, using a default-configured
# preprocessing pipeline.
preprocessing = Preprocessing()
dataset = topmost.data.RawDatasetHandler(
    docs,
    preprocessing,
    device=device,
    as_tensor=True,
)

# Build a two-topic ProdLDA model sized to the dataset vocabulary and move it
# onto the same device as the data.
model = topmost.models.ProdLDA(dataset.vocab_size, num_topics=2).to(device)

trainer = topmost.trainers.BasicTrainer(model)

# Fit the model and get back the top words of each topic together with the
# topic distribution of each document.
topic_top_words, doc_topic_dist = trainer.fit_transform(
    dataset,
    num_top_words=15,
    verbose=False,
)

Usage#

Download a preprocessed dataset#

import topmost
from topmost.data import download_dataset

# Fetch the preprocessed '20NG' dataset, using ./datasets as the cache path.
# NOTE(review): presumably this produces ./datasets/20NG, the path the training
# step loads from — confirm against the download_dataset documentation.
download_dataset('20NG', cache_path='./datasets')

Train a model#

# Device shared by the dataset tensors and the model.
device = "cuda" # or "cpu"

# Load the preprocessed dataset (labels included) as tensors on that device.
dataset = topmost.data.BasicDatasetHandler(
    "./datasets/20NG",
    device=device,
    read_labels=True,
    as_tensor=True,
)

# Create the model, sized to the dataset vocabulary, and place it on the device.
model = topmost.models.ProdLDA(dataset.vocab_size).to(device)

# Create a trainer for the model.
trainer = topmost.trainers.BasicTrainer(model)

# Train the model on the dataset.
trainer.train(dataset)

Evaluate#

# get theta (doc-topic distributions) for the train and test splits
train_theta, test_theta = trainer.export_theta(dataset)
# get top words of topics
topic_top_words = trainer.export_top_words(dataset.vocab)

# evaluate topic diversity on the top words exported above
# (must be `topic_top_words`; no variable named `top_words` exists in this guide)
TD = topmost.evaluations.compute_topic_diversity(topic_top_words)

# evaluate clustering of the test documents against their labels
clustering_results = topmost.evaluations.evaluate_clustering(test_theta, dataset.test_labels)

# evaluate classification using the train and test thetas with their labels
classification_results = topmost.evaluations.evaluate_classification(train_theta, test_theta, dataset.train_labels, dataset.test_labels)

Test new documents#

import torch
from topmost.preprocessing import Preprocessing

new_docs = [
    "This is a new document about space, including words like space, satellite, launch, orbit.",
    "This is a new document about Microsoft Windows, including words like windows, files, dos."
]

# Create the preprocessor here so this snippet is self-contained: the "Usage"
# flow loads an already-preprocessed dataset and never defines `preprocessing`.
preprocessing = Preprocessing()

# Parse the new documents against the vocabulary of the training dataset.
parsed_new_docs, new_bow = preprocessing.parse(new_docs, vocab=dataset.vocab)
# Convert the parsed result to a float tensor on the model's device and get the
# topic distributions of the new documents.
new_doc_topic_dist = trainer.test(torch.as_tensor(new_bow, device=device).float())