data
====

.. py:module:: topmost.data


.. toctree::
   :titlesonly:
   :maxdepth: 1

   basic_dataset/index
   crosslingual_dataset/index
   download/index
   download_20ng/index
   dynamic_dataset/index
   file_utils/index


Package Contents
----------------


.. autoapisummary::


   topmost.data.BasicDataset


   topmost.data.RawDataset


   topmost.data.CrosslingualDataset


   topmost.data.DynamicDataset


.. autoapisummary::


   topmost.data.download_dataset


.. py:class:: BasicDataset(dataset_dir, batch_size=200, read_labels=False, as_tensor=True, contextual_embed=False, doc_embed_model='all-MiniLM-L6-v2', device='cpu')

   .. py:attribute:: vocab_size
      :value: 0


   .. py:method:: load_data(path, read_labels)


.. py:class:: RawDataset(docs, preprocess=None, batch_size=200, device='cpu', as_tensor=True, contextual_embed=False, pretrained_WE=False, doc_embed_model='all-MiniLM-L6-v2', embed_model_device=None, verbose=False)

   .. py:attribute:: train_data


   .. py:attribute:: train_texts


   .. py:attribute:: vocab


   .. py:attribute:: vocab_size


.. py:class:: CrosslingualDataset(dataset_dir, lang1, lang2, dict_path, device='cpu', batch_size=200, as_tensor=True)

   .. py:attribute:: batch_size
      :value: 200


   .. py:attribute:: train_size_en
      :value: 0


   .. py:attribute:: train_size_cn
      :value: 0


   .. py:attribute:: vocab_size_en
      :value: 0


   .. py:attribute:: vocab_size_cn
      :value: 0


   .. py:attribute:: pretrained_WE_en


   .. py:attribute:: pretrained_WE_cn


   .. py:attribute:: Map_en2cn


   .. py:attribute:: Map_cn2en


   .. py:method:: move_to_device(bow, device)


   .. py:method:: read_data(dataset_dir, lang)


   .. py:method:: parse_dictionary(dict_path)


   .. py:method:: get_Map(trans_matrix, bow)


.. py:class:: DynamicDataset(dataset_dir, batch_size=200, read_labels=False, device='cpu', as_tensor=True)

   .. py:attribute:: vocab_size
      :value: 0


   .. py:attribute:: train_size


   .. py:attribute:: num_times


   .. py:attribute:: train_time_wordfreq


   .. py:method:: load_data(path, read_labels)


   .. py:method:: get_time_wordfreq(bow, times)


.. py:function:: download_dataset(dataset_name, cache_path='~/.topmost')