src.preprocessing#
Module Contents#
Classes#
| ContextPreprocessor | Base class for all kinds of context preprocessing strategies |
| Toklem | Base class for all kinds of context preprocessing strategies |
| Raw | Base class for all kinds of context preprocessing strategies |
| Lemmatize | Base class for all kinds of context preprocessing strategies |
| Tokenize | Base class for all kinds of context preprocessing strategies |
| Normalize | Base class for all kinds of context preprocessing strategies |
Attributes#
- src.preprocessing.log#
- class src.preprocessing.ContextPreprocessor(**data)#
Bases:
pydantic.BaseModel
Base class for all kinds of context preprocessing strategies
- Parameters:
BaseModel (_type_) – _description_
- Raises:
ValueError – _description_
NotImplementedError – _description_
NotImplementedError – _description_
- Returns:
_description_
- Return type:
_type_
- spelling_normalization: dict[str, str] | None#
Dictionary of substring replacements to apply on the contexts
- static start_char_index(token_index: int, tokens: list[str]) int#
Finds the index of the first character of the target token, i.e. tokens[token_index]
- Parameters:
token_index (int) – the index of the target word in the list of tokens
tokens (list[str]) – the list of tokens
- Raises:
ValueError – If the token is not found
- Returns:
the start character index of the target word
- Return type:
int
- normalize_spelling(context: str, start: int) tuple[str, int]#
Applies the preprocessor’s spelling normalization table to the context and returns the new start character index of the target word after all modifications
- Parameters:
context (str) – Context sentence of the target word
start (int) – Start character index of the target word
- Returns:
A tuple consisting of the modified string and the new start character index
- Return type:
tuple[str, int]
- abstract fields_from_series(s: pandas.Series) dict[str, Any]#
Selects fields from a pandas Series
- Parameters:
s (Series) – A row in a uses.csv file
- Raises:
NotImplementedError – _description_
- Returns:
A dictionary of parameters relevant to pass to the preprocess function
- Return type:
dict[str, Any]
- abstract preprocess(*args, **kwargs) tuple[str, int, int]#
- __call__(s: pandas.Series) pandas.Series#
Applies the preprocessing strategy based on a pandas.Series from a uses.csv file
- Parameters:
s (Series) – A row in a uses.csv file
- Returns:
_description_
- Return type:
Series
- class src.preprocessing.Toklem(**data)#
Bases:
ContextPreprocessor
Base class for all kinds of context preprocessing strategies
- Parameters:
BaseModel (_type_) – _description_
- Raises:
ValueError – _description_
NotImplementedError – _description_
NotImplementedError – _description_
- Returns:
_description_
- Return type:
_type_
- fields_from_series(s: pandas.Series) dict[str, str | int]#
Selects fields from a pandas Series
- Parameters:
s (Series) – A row in a uses.csv file
- Raises:
NotImplementedError – _description_
- Returns:
A dictionary of parameters relevant to pass to the preprocess function
- Return type:
dict[str, str | int]
- preprocess(context: str, index: int, lemma: str) tuple[str, int, int]#
Applies the preprocessing strategy in a standalone manner
- Parameters:
context (str) – The context sentence of the target word
index (int) – The start character index of the target word
lemma (str) – The lemma of the target word
- Returns:
A tuple consisting of the modified string, and the start and end character indices of the target word
- Return type:
tuple[str, int, int]
- class src.preprocessing.Raw(**data)#
Bases:
ContextPreprocessor
Base class for all kinds of context preprocessing strategies
- Parameters:
BaseModel (_type_) – _description_
- Raises:
ValueError – _description_
NotImplementedError – _description_
NotImplementedError – _description_
- Returns:
_description_
- Return type:
_type_
- fields_from_series(s: pandas.Series) dict[str, str | int]#
Selects fields from a pandas Series
- Parameters:
s (Series) – A row in a uses.csv file
- Raises:
NotImplementedError – _description_
- Returns:
A dictionary of parameters relevant to pass to the preprocess function
- Return type:
dict[str, str | int]
- preprocess(context: str, start: int, end: int) tuple[str, int, int]#
Returns the unmodified context and the character indices of the target word
- Parameters:
context (str) – The context sentence of the target word
start (int) – The start character index of the target word
end (int) – The end character index of the target word
- Returns:
A tuple consisting of the unmodified string, and the start and end character indices of the target word
- Return type:
tuple[str, int, int]
- class src.preprocessing.Lemmatize(**data)#
Bases:
ContextPreprocessor
Base class for all kinds of context preprocessing strategies
- Parameters:
BaseModel (_type_) – _description_
- Raises:
ValueError – _description_
NotImplementedError – _description_
NotImplementedError – _description_
- Returns:
_description_
- Return type:
_type_
- fields_from_series(s: pandas.Series) dict[str, str | int]#
Selects fields from a pandas Series
- Parameters:
s (Series) – A row in a uses.csv file
- Raises:
NotImplementedError – _description_
- Returns:
A dictionary of parameters relevant to pass to the preprocess function
- Return type:
dict[str, str | int]
- preprocess(context: str, index: int) tuple[str, int, int]#
Applies the preprocessing strategy in a standalone manner
- Parameters:
context (str) – The context sentence of the target word
index (int) – The start character index of the target word
- Returns:
A tuple consisting of the modified string, and the start and end character indices of the target word
- Return type:
tuple[str, int, int]
- class src.preprocessing.Tokenize(**data)#
Bases:
ContextPreprocessor
Base class for all kinds of context preprocessing strategies
- Parameters:
BaseModel (_type_) – _description_
- Raises:
ValueError – _description_
NotImplementedError – _description_
NotImplementedError – _description_
- Returns:
_description_
- Return type:
_type_
- fields_from_series(s: pandas.Series) dict[str, str | int]#
Selects fields from a pandas Series
- Parameters:
s (Series) – A row in a uses.csv file
- Raises:
NotImplementedError – _description_
- Returns:
A dictionary of parameters relevant to pass to the preprocess function
- Return type:
dict[str, str | int]
- preprocess(context: str, index: int) tuple[str, int, int]#
Applies the preprocessing strategy in a standalone manner
- Parameters:
context (str) – The context sentence of the target word
index (int) – The start character index of the target word
- Returns:
A tuple consisting of the modified string, and the start and end character indices of the target word
- Return type:
tuple[str, int, int]
- class src.preprocessing.Normalize(**data)#
Bases:
ContextPreprocessor
Base class for all kinds of context preprocessing strategies
- Parameters:
BaseModel (_type_) – _description_
- Raises:
ValueError – _description_
NotImplementedError – _description_
NotImplementedError – _description_
- Returns:
_description_
- Return type:
_type_
- default: str#
Column to extract from a Series if a given use does not contain a pre-normalized context
- fields_from_series(s: pandas.Series) dict[str, str | int]#
Selects fields from a pandas Series
- Parameters:
s (Series) – A row in a uses.csv file
- Raises:
NotImplementedError – _description_
- Returns:
A dictionary of parameters relevant to pass to the preprocess function
- Return type:
dict[str, str | int]
- preprocess(context: str, index: int) tuple[str, int, int]#
Applies the preprocessing strategy in a standalone manner
- Parameters:
context (str) – The context sentence of the target word
index (int) – The start character index of the target word
- Returns:
A tuple consisting of the modified string, and the start and end character indices of the target word
- Return type:
tuple[str, int, int]