data
====

.. py:module:: topmost.data


.. toctree::
   :titlesonly:
   :maxdepth: 1

   basic_dataset/index.rst
   crosslingual_dataset/index.rst
   download/index.rst
   download_20ng/index.rst
   dynamic_dataset/index.rst
   file_utils/index.rst


Package Contents
----------------

.. autoapisummary::

   topmost.data.BasicDataset
   topmost.data.RawDataset
   topmost.data.CrosslingualDataset
   topmost.data.DynamicDataset

.. autoapisummary::

   topmost.data.download_dataset


.. py:class:: BasicDataset(dataset_dir, batch_size=200, read_labels=False, as_tensor=True, contextual_embed=False, doc_embed_model='all-MiniLM-L6-v2', device='cpu')

   .. py:attribute:: vocab_size
      :value: 0


   .. py:method:: load_data(path, read_labels)


.. py:class:: RawDataset(docs, preprocess=None, batch_size=200, device='cpu', as_tensor=True, contextual_embed=False, pretrained_WE=False, doc_embed_model='all-MiniLM-L6-v2', embed_model_device=None, verbose=False)

   .. py:attribute:: train_data


   .. py:attribute:: train_texts


   .. py:attribute:: vocab


   .. py:attribute:: vocab_size


.. py:class:: CrosslingualDataset(dataset_dir, lang1, lang2, dict_path, device='cpu', batch_size=200, as_tensor=True)

   .. py:attribute:: batch_size
      :value: 200


   .. py:attribute:: train_size_en
      :value: 0


   .. py:attribute:: train_size_cn
      :value: 0


   .. py:attribute:: vocab_size_en
      :value: 0


   .. py:attribute:: vocab_size_cn
      :value: 0


   .. py:attribute:: pretrained_WE_en


   .. py:attribute:: pretrained_WE_cn


   .. py:attribute:: Map_en2cn


   .. py:attribute:: Map_cn2en


   .. py:method:: move_to_device(bow, device)


   .. py:method:: read_data(dataset_dir, lang)


   .. py:method:: parse_dictionary(dict_path)


   .. py:method:: get_Map(trans_matrix, bow)


.. py:class:: DynamicDataset(dataset_dir, batch_size=200, read_labels=False, device='cpu', as_tensor=True)

   .. py:attribute:: vocab_size
      :value: 0


   .. py:attribute:: train_size


   .. py:attribute:: num_times


   .. py:attribute:: train_time_wordfreq


   .. py:method:: load_data(path, read_labels)


   .. py:method:: get_time_wordfreq(bow, times)


.. py:function:: download_dataset(dataset_name, cache_path='~/.topmost')