:py:mod:`src.dataset`
=====================

.. py:module:: src.dataset


Module Contents
---------------

Classes
~~~~~~~

.. autoapisummary::

   src.dataset.StandardSplit
   src.dataset.UsePairCache
   src.dataset.Dataset


.. py:class:: StandardSplit


   Bases: :py:obj:`pydantic.BaseModel`

   .. py:attribute:: dev
      :type: list[str]

      
   .. py:attribute:: dev1
      :type: list[str]

      
   .. py:attribute:: dev2
      :type: list[str]

      
   .. py:attribute:: test
      :type: list[str]

      
   .. py:attribute:: full
      :type: list[str]

      
.. py:class:: UsePairCache


   Bases: :py:obj:`pydantic.BaseModel`

   .. py:attribute:: dataset
      :type: str

      
   .. py:attribute:: split
      :type: str

      
   .. py:attribute:: group
      :type: str

      
   .. py:attribute:: sample
      :type: str

      
.. py:class:: Dataset(**data)


   Bases: :py:obj:`pydantic.BaseModel`

   .. py:property:: relative_path
      :type: pathlib.Path


   .. py:property:: absolute_path
      :type: pathlib.Path


   .. py:property:: data_dir
      :type: pathlib.Path


   .. py:property:: stats_groupings_df
      :type: pandas.DataFrame


   .. py:property:: graded_change_labels
      :type: dict[str, float]


   .. py:property:: compare_labels
      :type: dict[str, float]


   .. py:property:: binary_change_labels
      :type: dict[str, int]


   .. py:property:: wic_labels
      :type: dict[tuple[src.use.UseID, src.use.UseID], float]


   .. py:property:: binary_wic_labels
      :type: dict[tuple[src.use.UseID, src.use.UseID], float]


   .. py:property:: wsi_labels
      :type: dict[str, int]


   .. py:property:: stats_agreement_df
      :type: pandas.DataFrame


   .. py:property:: uses_df


   .. py:property:: judgments_df


   .. py:property:: judgments_schema
      :type: pandera.DataFrameSchema


   .. py:property:: clusters_df


   .. py:property:: clusters_schema
      :type: pandera.DataFrameSchema


   .. py:property:: lemmas
      :type: list[src.lemma.Lemma]

      Returns the list of lemmas in the dataset

      :returns: The lemmas in the dataset.
      :rtype: list[Lemma]

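      .. rubric:: Examples

      ..
         Review note: the doctest below appears to be the NumPy ``np.angle``
         docstring sample left over from a template; it does not exercise
         ``lemmas``. Replace it in the source docstring with a real example.

      >>> np.angle([1.0, 1.0j, 1+1j])               # in radians
      array([ 0.        ,  1.57079633,  0.78539816]) # may vary
      >>> np.angle(1+1j, deg=True)                  # in degrees
      45.0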


   .. py:attribute:: groupings
      :type: tuple[str, str]

      
   .. py:attribute:: type
      :type: Literal['dev', 'test']

      
   .. py:attribute:: split
      :type: Literal['dev', 'dev1', 'dev2', 'test', 'full']

      
   .. py:attribute:: exclude_annotators
      :type: list[str]

      
   .. py:attribute:: path
      :type: pathlib.Path

      
   .. py:attribute:: name
      :type: str

      
   .. py:attribute:: url
      :type: pydantic.HttpUrl | None

      
   .. py:attribute:: standard_split
      :type: StandardSplit | None

      
   .. py:attribute:: test_on
      :type: set[str] | int | None

      
   .. py:attribute:: cleaning
      :type: src.cleaning.Cleaning | None

      
   .. py:attribute:: preprocessing
      :type: src.preprocessing.ContextPreprocessor | None

      
   .. py:attribute:: wic_use_pairs
      :type: src.lemma.UsePairOptions | None

      
   .. py:attribute:: _stats_groupings
      :type: pandas.DataFrame

      
   .. py:attribute:: _uses
      :type: pandas.DataFrame

      
   .. py:attribute:: _judgments
      :type: pandas.DataFrame

      
   .. py:attribute:: _agreements
      :type: pandas.DataFrame

      
   .. py:attribute:: _clusters
      :type: pandas.DataFrame

      
   .. py:attribute:: _lemmas
      :type: list[src.lemma.Lemma]

      
   .. py:method:: set_preprocessing(v) -> src.preprocessing.ContextPreprocessor


   .. py:method:: rewrite_config(new_config: dict[str, Any], path: pathlib.Path) -> None


   .. py:method:: __download_from_git() -> None


   .. py:method:: __download_zip() -> None


   .. py:method:: __download() -> None


   .. py:method:: __unzip(zip_file: pathlib.Path) -> None


   .. py:method:: get_stats_groupings_schema(evaluation_task: src.evaluation.EvaluationTask) -> pandera.DataFrameSchema

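      .. rubric:: Examples

      ..
         Review note: this example looks like a template placeholder
         (``self.add`` is not documented on this class); replace it in the
         source docstring with a real call to ``get_stats_groupings_schema``.

      >>> self.add(1, 2)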


   .. py:method:: use_pairs(group: src.lemma.Group, sample: src.lemma.Sample) -> list[tuple[src.use.Use, src.use.Use]]


   .. py:method:: get_labels(evaluation_task: src.evaluation.EvaluationTask) -> dict[Any, Any]


   .. py:method:: get_standard_split() -> StandardSplit


   .. py:method:: get_split() -> list[str]


   .. py:method:: filter_lemmas(lemmas: list[src.lemma.Lemma]) -> list[src.lemma.Lemma]