diff --git a/components/language_filter/Dockerfile b/components/language_filter/Dockerfile new file mode 100644 index 000000000..605adc7e9 --- /dev/null +++ b/components/language_filter/Dockerfile @@ -0,0 +1,18 @@ +FROM --platform=linux/amd64 python:3.8-slim + +## System dependencies +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install git -y + +# install requirements +COPY requirements.txt / +RUN pip3 install --no-cache-dir -r requirements.txt + +# Set the working directory to the component folder +WORKDIR /component/src + +# Copy over src-files +COPY src/ . + +ENTRYPOINT ["python", "main.py"] \ No newline at end of file diff --git a/components/language_filter/README.md b/components/language_filter/README.md new file mode 100644 index 000000000..ba9bd4636 --- /dev/null +++ b/components/language_filter/README.md @@ -0,0 +1,7 @@ +# Language filter + +## Description +This component is based on the `TransformComponent` and is used to filter a dataframe based on language. +It allows you to remove rows that do not match the provided language, thus providing a way to focus +on specific languages within your data. + diff --git a/components/language_filter/fondant_component.yaml b/components/language_filter/fondant_component.yaml new file mode 100644 index 000000000..82d321fda --- /dev/null +++ b/components/language_filter/fondant_component.yaml @@ -0,0 +1,14 @@ +name: Filter languages +description: A component that filters text based on the language. +image: ghcr.io/ml6team/filter_language:latest + +consumes: + text: + fields: + data: + type: string + +args: + language: + description: A valid language code or identifier (e.g., "en", "fr", "de"). 
class LanguageIdentification:
    """Decide whether a text is written in one configured language.

    Wraps a FastText language-identification model: the model predicts a
    label for a text, and that label is compared with the language this
    instance was configured with.
    """

    # FastText reports classes as "<prefix><code>", e.g. "__label__de".
    _LABEL_PREFIX = "__label__"

    def __init__(self, language, model_path: str = "lid.176.ftz"):
        """Load the FastText model and remember the language to filter on.

        Args:
            language (str): Language code to keep (e.g. "en", "fr", "de").
            model_path (str): Path to the FastText language identification
                model file.
        """
        self.language = language
        self.model = fasttext.load_model(model_path)

    def predict_lang(self, text: str):
        """Return the most likely language label for ``text``.

        Args:
            text (str): The text for language detection.

        Returns:
            str: The top predicted label exactly as the model emits it
                (e.g. "__label__de").
        """
        # fastText's predict() handles one line at a time and raises
        # ValueError when the input contains a newline, so flatten
        # multi-line passages before predicting.
        single_line = text.replace("\n", " ")
        predictions = self.model.predict(single_line, k=1)
        # predictions is (labels, probabilities); take the top label.
        return predictions[0][0]

    def is_language(self, row):
        """Tell whether the text of ``row`` is in the configured language.

        Args:
            row: Mapping-like row (for example a pandas Series) that holds
                the text under the "text" key.

        Returns:
            bool: True when the predicted label equals the configured
                language, False otherwise (including rows whose text is
                missing or not a string).
        """
        text = row["text"]
        if not isinstance(text, str):
            # Missing values (None / NaN) cannot be in any language; drop
            # the row instead of crashing the whole partition.
            return False

        language = str(self.language)
        # Accept both "de" and an already prefixed "__label__de".
        if language.startswith(self._LABEL_PREFIX):
            expected_label = language
        else:
            expected_label = self._LABEL_PREFIX + language

        # Compare the full label. A substring test would let codes such as
        # "la" or "el" match the "__label__" marker itself and keep every row.
        return self.predict_lang(text) == expected_label
def test_run_component_test_filter_out_all():
    """Check that filtering on a language absent from the data drops every row."""
    # Given: Dataframe with text in different languages (none of them French)
    data = [
        {"text": "Das hier ist ein Satz in deutscher Sprache"},
        {"text": "This is a sentence in English"},
        {"text": "Dit is een zin in het Nederlands"},
    ]
    dataframe = pd.DataFrame(data)

    # When: The language filter component processes the dataframe
    # and filters out all entries which are not written in French
    spec = ComponentSpec.from_file("../fondant_component.yaml")

    component = LanguageFilterComponent(
        spec,
        input_manifest_path="./dummy_input_manifest.json",
        output_manifest_path="./dummy_input_manifest.json",
        metadata={},
        user_arguments={"language": "fr"},
    )
    # `setup` declares `language` as a required keyword-only argument, so it
    # must be passed explicitly; calling `setup()` bare raises TypeError.
    component.setup(language="fr")
    dataframe = component.transform(dataframe=dataframe)

    # Then: dataframe should contain no rows anymore
    assert len(dataframe) == 0