Source code for tidyX.text_normalization

import spacy
from typing import List, Union, Tuple
from spacy.language import Language
from .text_preprocessor import TextPreprocessor
from nltk.stem.snowball import SnowballStemmer
import emoji

class TextNormalization:
    """Token-level normalization helpers: emoji detection, lemmatization, stemming.

    All helpers are static methods, so they can be called either on the class
    (``TextNormalization.stemmer("casas")``) or on an instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def is_emoji(s: str) -> bool:
        """Check if a given string is an emoji.

        Args:
            s (str): The string to test.

        Returns:
            bool: True when the whole string ``s`` is a key of
            ``emoji.EMOJI_DATA``, False otherwise.
        """
        return s in emoji.EMOJI_DATA

    @staticmethod
    def lemmatizer(token: str, model: Language) -> str:
        """Lemmatize a single token with the given Spacy language model.

        Lemmatization is the process of reducing a word to its base or
        dictionary form. For example, the word "running" would be lemmatized
        to "run". Lemmatization takes into account the meaning of the word in
        the sentence, leveraging vocabulary and morphological analysis.

        Empty tokens and emojis are returned unchanged. If anything goes wrong
        while lemmatizing, the error is printed and the original token is
        returned (best-effort behaviour).

        Note:
            Before using this function, a Spacy model should be downloaded.
            Use `python -m spacy download name_of_model` to download a model.
            Available models for Spanish are: "es_core_news_sm",
            "es_core_news_md", "es_core_news_lg", "es_dep_news_trf".
            For more information, visit https://spacy.io/models/

        Args:
            token (str): The token to be lemmatized.
            model (spacy.language.Language): A Spacy language model object.

        Returns:
            str: The lemmatized version of the token, with accents removed,
            or the original token if it is empty, an emoji, or lemmatization
            failed.
        """
        if not token or TextNormalization.is_emoji(token):
            return token

        try:
            # Only the first token of the parsed text is used: the input is
            # expected to be a single word.
            lemma = model(token)[0].lemma_
            lemma = TextPreprocessor.remove_accents(lemma)
            return lemma
        except Exception as e:
            print(f"An error occurred: {e}")
            return token

    # Declared static like the sibling helpers: the function takes no ``self``,
    # so without the decorator an instance call would bind the instance to
    # ``token`` and shift the real token into ``language``.
    @staticmethod
    def stemmer(token: str, language: str = "spanish") -> str:
        """Stem a given token using the Snowball stemmer.

        Stemming is the process of reducing a word to its base or root form,
        often by stripping suffixes. For instance, the word "running" might be
        stemmed to "run". Unlike lemmatization, stemming doesn't always
        produce a valid word and doesn't consider the meaning of a word in
        the context.

        Empty tokens and emojis are returned unchanged. If stemming itself
        fails, the error is printed and the original token is returned.

        Note:
            Before using this function, you might need to install nltk if not
            done already. Use `pip install nltk`.

        Args:
            token (str): The token to be stemmed.
            language (str, optional): The language of the token.
                Defaults to "spanish".

        Returns:
            str: The stemmed version of the token, or the original token if
            it is empty, an emoji, or stemming failed.
        """
        if not token or TextNormalization.is_emoji(token):
            return token

        # Built outside the try block on purpose: an error raised while
        # constructing the stemmer (e.g. for an unsupported language) is a
        # caller mistake and propagates instead of being swallowed.
        stemmer = SnowballStemmer(language)

        try:
            stemmed = stemmer.stem(token)
            return stemmed
        except Exception as e:
            print(f"An error occurred: {e}")
            return token