src.dataset#

Module Contents#

Classes#

StandardSplit

UsePairCache

Dataset

class src.dataset.StandardSplit#

Bases: pydantic.BaseModel

dev: list[str]#
dev1: list[str]#
dev2: list[str]#
test: list[str]#
full: list[str]#
class src.dataset.UsePairCache#

Bases: pydantic.BaseModel

dataset: str#
split: str#
group: str#
sample: str#
class src.dataset.Dataset(**data)#

Bases: pydantic.BaseModel

property relative_path: pathlib.Path#
property absolute_path: pathlib.Path#
property data_dir: pathlib.Path#
property stats_groupings_df: pandas.DataFrame#
property graded_change_labels: dict[str, float]#
property compare_labels: dict[str, float]#
property binary_change_labels: dict[str, int]#
property wic_labels: dict[tuple[src.use.UseID, src.use.UseID], float]#
property binary_wic_labels: dict[tuple[src.use.UseID, src.use.UseID], float]#
property wsi_labels: dict[str, int]#
property stats_agreement_df: pandas.DataFrame#
property uses_df#
property judgments_df#
property judgments_schema: pandera.DataFrameSchema#
property clusters_df#
property clusters_schema: pandera.DataFrameSchema#
property lemmas: list[src.lemma.Lemma]#

Returns the list of lemmas in the dataset

Returns:

list[Lemma]: the lemmas contained in the dataset

Examples#

>>> lemmas = dataset.lemmas                   # `dataset` is a Dataset instance
>>> isinstance(lemmas, list)
True
groupings: tuple[str, str]#
type: Literal['dev', 'test']#
split: Literal['dev', 'dev1', 'dev2', 'test', 'full']#
exclude_annotators: list[str]#
path: pathlib.Path#
name: str#
url: pydantic.HttpUrl | None#
standard_split: StandardSplit | None#
test_on: set[str] | int | None#
cleaning: src.cleaning.Cleaning | None#
preprocessing: src.preprocessing.ContextPreprocessor | None#
wic_use_pairs: src.lemma.UsePairOptions | None#
_stats_groupings: pandas.DataFrame#
_uses: pandas.DataFrame#
_judgments: pandas.DataFrame#
_agreements: pandas.DataFrame#
_clusters: pandas.DataFrame#
_lemmas: list[src.lemma.Lemma]#
set_preprocessing(v) src.preprocessing.ContextPreprocessor#
rewrite_config(new_config: dict[str, Any], path: pathlib.Path) None#
__download_from_git() None#
__download_zip() None#
__download() None#
__unzip(zip_file: pathlib.Path) None#
get_stats_groupings_schema(evaluation_task: src.evaluation.EvaluationTask) pandera.DataFrameSchema#

Examples:

>>> schema = dataset.get_stats_groupings_schema(evaluation_task)  # returns a pandera.DataFrameSchema

use_pairs(group: src.lemma.Group, sample: src.lemma.Sample) list[tuple[src.use.Use, src.use.Use]]#
get_labels(evaluation_task: src.evaluation.EvaluationTask) dict[Any, Any]#
get_standard_split() StandardSplit#
get_split() list[str]#
filter_lemmas(lemmas: list[src.lemma.Lemma]) list[src.lemma.Lemma]#