Quick Start

Install TopMost

Install TopMost with pip:

$ pip install topmost

We use FASTopic to get the top words of the discovered topics, top_words, and the topic distributions of the documents, doc_topic_dist. The preprocessing steps are configurable; see our documentation.

# Quick-start example: preprocess raw texts, train FASTopic, and infer topics
# for unseen documents.
from topmost import RawDataset, Preprocess, FASTopicTrainer
from sklearn.datasets import fetch_20newsgroups

# Load the raw 20 Newsgroups texts (train and test splits together), with
# headers, footers, and quoted replies stripped from each post.
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
# Preprocessing configuration; vocab_size=10000 is the vocabulary size requested.
preprocess = Preprocess(vocab_size=10000)

# Build the dataset from the raw documents and the preprocessor.
# NOTE(review): device="cuda" needs a GPU; presumably "cpu" is accepted as well — confirm.
dataset = RawDataset(docs, preprocess, device="cuda")

# train() returns the top words of the discovered topics and the topic
# distributions of the documents.
trainer = FASTopicTrainer(dataset, verbose=True)
top_words, doc_topic_dist = trainer.train()

new_docs = [
    "This is a document about space, including words like space, satellite, launch, orbit.",
    "This is a document about Microsoft Windows, including words like windows, files, dos."
]

# Infer topic distributions for the unseen documents, then print the argmax
# along axis 1 (presumably the index of the dominant topic of each document).
new_theta = trainer.test(new_docs)
print(new_theta.argmax(1))

Usage

Download a preprocessed dataset

import topmost

# Download the preprocessed 20NG dataset, using ./datasets as the cache path.
topmost.download_dataset('20NG', cache_path='./datasets')

Train a model

# The same device is used for both the dataset and the model below.
device = "cuda" # or "cpu"

# load a preprocessed dataset
# (read_labels=True presumably loads the labels used later for evaluation — confirm)
dataset = topmost.BasicDataset("./datasets/20NG", device=device, read_labels=True)
# create a model sized to the dataset's vocabulary
model = topmost.ProdLDA(dataset.vocab_size)
model = model.to(device)

# create a trainer
trainer = topmost.BasicTrainer(model, dataset)

# train the model; returns the topics' top words and (judging by the name)
# the doc-topic distributions of the training documents
top_words, train_theta = trainer.train()

Evaluate

# Evaluate the trained model; reuses `dataset`, `trainer`, `top_words`, and
# `train_theta` from the training step.
from topmost import eva

# topic diversity and coherence
# (diversity is computed from the top words alone; coherence also takes the
# training texts and the vocabulary)
TD = eva._diversity(top_words)
TC = eva._coherence(dataset.train_texts, dataset.vocab, top_words)

# get doc-topic distributions of testing samples
test_theta = trainer.test(dataset.test_data)
# clustering of the test doc-topic distributions against the test labels
clustering_results = eva._clustering(test_theta, dataset.test_labels)
# classification using the train and test doc-topic distributions with their labels
cls_results = eva._cls(train_theta, test_theta, dataset.train_labels, dataset.test_labels)

Test new documents

# Infer topic distributions for unseen documents with the trained model;
# reuses `dataset`, `trainer`, and `device` from the training step.
import torch
from topmost import Preprocess

new_docs = [
    "This is a new document about space, including words like space, satellite, launch, orbit.",
    "This is a new document about Microsoft Windows, including words like windows, files, dos."
]

preprocess = Preprocess()
# Parse the new documents against the training vocabulary, presumably so the
# bag-of-words columns line up with what the model was trained on.
new_parsed_docs, new_bow = preprocess.parse(new_docs, vocab=dataset.vocab)
# Densify the bag-of-words (toarray() suggests a sparse matrix — confirm),
# move it to the target device as a float tensor, and run inference.
new_theta = trainer.test(torch.as_tensor(new_bow.toarray(), device=device).float())