basic_dataset

Module Contents

class DocEmbedModel(model: str | Callable = 'all-MiniLM-L6-v2', device: str = 'cpu', verbose: bool = False)
verbose = False
encode(docs: List[str], convert_to_tensor: bool = False)
class RawDataset(docs, preprocess=None, batch_size=200, device='cpu', as_tensor=True, contextual_embed=False, pretrained_WE=False, doc_embed_model='all-MiniLM-L6-v2', embed_model_device=None, verbose=False)
train_data
train_texts
vocab
vocab_size
class BasicDataset(dataset_dir, batch_size=200, read_labels=False, as_tensor=True, contextual_embed=False, doc_embed_model='all-MiniLM-L6-v2', device='cpu')
vocab_size = 0
load_data(path, read_labels)