diff --git a/tests/tests/python-lang-id/.disabled b/tests/tests/python-lang-id/.disabled new file mode 100644 index 00000000..e69de29b diff --git a/tests/tests/python-lang-id/README.md b/tests/tests/python-lang-id/README.md new file mode 100644 index 00000000..a574092d --- /dev/null +++ b/tests/tests/python-lang-id/README.md @@ -0,0 +1,3 @@ +Source: https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/lang_id/python + +See [this README](../python-pii-redactor/README) for details. In addition to what is documented there, in this test, we also had to make some minor changes to [nlp.py](./pail/src/nlp.py) to separate from the TransformUtils class. diff --git a/tests/tests/python-lang-id/pail/.helmignore b/tests/tests/python-lang-id/pail/.helmignore new file mode 100644 index 00000000..5dd7b8f9 --- /dev/null +++ b/tests/tests/python-lang-id/pail/.helmignore @@ -0,0 +1,2 @@ +test-data/ +requirements.txt \ No newline at end of file diff --git a/tests/tests/python-lang-id/pail/app.yaml b/tests/tests/python-lang-id/pail/app.yaml new file mode 100644 index 00000000..b6029384 --- /dev/null +++ b/tests/tests/python-lang-id/pail/app.yaml @@ -0,0 +1,17 @@ +apiVersion: lunchpail.io/v1alpha1 +kind: Application +metadata: + name: lang_id +spec: + role: worker + command: python3 ./main.py + code: + - name: main.py + source: | +{{ .Files.Get "src/main.py" | indent 8 }} + - name: lang_models.py + source: | +{{ .Files.Get "src/lang_models.py" | indent 8 }} + - name: nlp.py + source: | +{{ .Files.Get "src/nlp.py" | indent 8 }} diff --git a/tests/tests/python-lang-id/pail/pool1.yaml b/tests/tests/python-lang-id/pail/pool1.yaml new file mode 100644 index 00000000..f6e3910f --- /dev/null +++ b/tests/tests/python-lang-id/pail/pool1.yaml @@ -0,0 +1,11 @@ +{{- range until (.Values.pools | default 1 | int) }} +--- +apiVersion: lunchpail.io/v1alpha1 +kind: WorkerPool +metadata: + name: {{ print "pool" (add 1 .) 
}} +spec: + workers: + count: {{ $.Values.workers | default 1 }} + size: {{ $.Values.size | default "xxs" }} +{{- end }} diff --git a/tests/tests/python-lang-id/pail/requirements.txt b/tests/tests/python-lang-id/pail/requirements.txt new file mode 100644 index 00000000..56ebcc7e --- /dev/null +++ b/tests/tests/python-lang-id/pail/requirements.txt @@ -0,0 +1,5 @@ +fasttext==0.9.2 +langcodes==3.3.0 +huggingface-hub >= 0.21.4, <1.0.0 +numpy==1.26.4 +pyarrow diff --git a/tests/tests/python-lang-id/pail/src/lang_models.py b/tests/tests/python-lang-id/pail/src/lang_models.py new file mode 100644 index 00000000..dfbf01e6 --- /dev/null +++ b/tests/tests/python-lang-id/pail/src/lang_models.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################
+
+import math
+from abc import ABCMeta, abstractmethod
+
+import fasttext
+from huggingface_hub import hf_hub_download
+from langcodes import standardize_tag
+
+
+KIND_FASTTEXT = "fasttext"
+
+
+class LangModel(metaclass=ABCMeta):
+    # Abstract interface: every language model maps text -> (language tag, score).
+    @abstractmethod
+    def detect_lang(self, text: str) -> tuple[str, float]:
+        pass
+
+
+class NoopModel(LangModel):
+    # Fallback model for unknown kinds; always reports English with zero confidence.
+    def detect_lang(self, text: str) -> tuple[str, float]:
+        return "en", 0.0
+
+
+class FastTextModel(LangModel):
+    def __init__(self, url, credential):
+        model_path = hf_hub_download(repo_id=url, filename="model.bin", token=credential)
+        self.nlp = fasttext.load_model(model_path)
+
+    def detect_lang(self, text: str) -> tuple[str, float]:
+        label, score = self.nlp.predict(
+            text.replace("\n", " "), 1
+        )  # replace newline to avoid ERROR: predict processes one line at a time (remove '\n') skipping the file
+        return standardize_tag(label[0].replace("__label__", "")), math.floor(score[0] * 1000) / 1000
+
+
+class LangModelFactory:
+    def create_model(kind: str, url: str, credential: str) -> LangModel:
+        if kind == KIND_FASTTEXT:
+            return FastTextModel(url, credential)
+        else:
+            return NoopModel()
diff --git a/tests/tests/python-lang-id/pail/src/main.py b/tests/tests/python-lang-id/pail/src/main.py
new file mode 100644
index 00000000..ce19937d
--- /dev/null
+++ b/tests/tests/python-lang-id/pail/src/main.py
@@ -0,0 +1,95 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import sys +import pyarrow.parquet as pq + +from os import getenv +import logging + +import pyarrow as pa +from lang_models import LangModelFactory +from nlp import get_lang_ds_pa + +from lang_models import KIND_FASTTEXT + +short_name = "lang_id" +cli_prefix = f"{short_name}_" +model_credential_key = "model_credential" +model_kind_key = "model_kind" +model_url_key = "model_url" +content_column_name_key = "content_column_name" +output_lang_column_name_key = "output_lang_column_name" +output_score_column_name_key = "output_score_column_name" +model_credential_cli_param = f"{cli_prefix}{model_credential_key}" +model_kind_cli_param = f"{cli_prefix}{model_kind_key}" +model_url_cli_param = f"{cli_prefix}{model_url_key}" +content_column_name_cli_param = f"{cli_prefix}{content_column_name_key}" +output_lang_column_name_cli_param = f"{cli_prefix}{output_lang_column_name_key}" +output_score_column_name_cli_param = f"{cli_prefix}{output_score_column_name_key}" + +default_content_column_name = "text" +default_output_lang_column_name = "lang" +default_output_score_column_name = "score" + +model_kind = getenv(model_kind_key, KIND_FASTTEXT) +model_url = getenv(model_url_key, "facebook/fasttext-language-identification") +model_credential = getenv(model_credential_key, "PUT YOUR OWN HUGGINGFACE CREDENTIAL") + +def validate_columns(table: pa.Table, required: list[str]) -> None: + """ + Check if required columns exist in the table + :param table: table + :param required: list of required columns + :return: None + """ + columns = table.schema.names + result = True + for r in required: + if r not in columns: + result = False + break + if not result: + raise Exception( + f"Not all required columns are present in the table - " f"required {required}, present {columns}" + ) + 
+logger = logging.getLogger(__name__)
+# Build the language-id model once, then run a single read -> transform -> write pass.
+nlp_langid = LangModelFactory.create_model(
+    model_kind, model_url, model_credential
+)
+content_column_name = getenv(content_column_name_key, default_content_column_name)
+output_lang_column_name = getenv(output_lang_column_name_key, default_output_lang_column_name)
+output_score_column_name = getenv(output_score_column_name_key, default_output_score_column_name)
+
+try:
+    print(f"Reading in parquet file {sys.argv[1]}")
+    table = pq.read_table(sys.argv[1])
+except Exception as e:
+    # was: {path} — undefined name that raised NameError instead of reporting the real error
+    print(f"Error reading table from {sys.argv[1]}: {e}", file=sys.stderr)
+    sys.exit(1)
+print(f"Done Reading in parquet file {sys.argv[1]}")
+
+validate_columns(table, [content_column_name])
+if output_lang_column_name in table.schema.names:
+    raise Exception(f"column to store identified language ({output_lang_column_name}) already exist")
+if output_score_column_name in table.schema.names:
+    raise Exception(
+        f"column to store score of language identification ({output_score_column_name}) already exist"
+    )
+print(f"Transforming one table with {len(table)} rows")
+table, stats = get_lang_ds_pa(
+    table, nlp_langid, content_column_name, output_lang_column_name, output_score_column_name)
+print(f"Transformed one table with {len(table)} rows")
+
+print(f"Done. Writing output to {sys.argv[2]}")
+pq.write_table(table, sys.argv[2])
diff --git a/tests/tests/python-lang-id/pail/src/nlp.py b/tests/tests/python-lang-id/pail/src/nlp.py
new file mode 100644
index 00000000..d8b2a152
--- /dev/null
+++ b/tests/tests/python-lang-id/pail/src/nlp.py
@@ -0,0 +1,59 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from typing import Any + +import logging +import pyarrow as pa +from lang_models import LangModel + + +logger = logging.getLogger(__name__) + +def add_column(table: pa.Table, name: str, content: list[Any]) -> pa.Table: + """ + Add column to the table + :param table: original table + :param name: column name + :param content: content of the column + :return: updated table, containing new column + """ + # check if column already exist and drop it + if name in table.schema.names: + table = table.drop(columns=[name]) + # append column + return table.append_column(field_=name, column=[content]) + +def get_lang_ds_pa( + table: pa.table, + nlp: LangModel, + content_column_name: str, + output_lang_column_name: str, + output_score_column_name: str, + ) -> tuple[pa.table, dict[str, Any]]: + detected_language = pa.Table.from_pylist( + list( + map( + lambda r: {"lang": r[0], "score": r[1]}, + map(lambda x: nlp.detect_lang(x), table[content_column_name].to_pylist()), + ) + ) + ) + stats = pa.table([detected_language["lang"]], names=["lang"]).group_by("lang").aggregate([("lang", "count")]) + stats_dict = {} + for batch in stats.to_batches(): + d = batch.to_pydict() + for lang, count in zip(d["lang"], d["lang_count"]): + stats_dict[lang] = count + result = add_column(table=table, name=output_lang_column_name, content=detected_language["lang"]) + result = add_column(table=result, name=output_score_column_name, content=detected_language["score"]) + return result, stats_dict diff --git 
a/tests/tests/python-lang-id/pail/test-data/sm/expected/metadata.json.gz b/tests/tests/python-lang-id/pail/test-data/sm/expected/metadata.json.gz new file mode 100644 index 00000000..41f427b9 Binary files /dev/null and b/tests/tests/python-lang-id/pail/test-data/sm/expected/metadata.json.gz differ diff --git a/tests/tests/python-lang-id/pail/test-data/sm/expected/test_01.parquet.gz b/tests/tests/python-lang-id/pail/test-data/sm/expected/test_01.parquet.gz new file mode 100644 index 00000000..6caba113 Binary files /dev/null and b/tests/tests/python-lang-id/pail/test-data/sm/expected/test_01.parquet.gz differ diff --git a/tests/tests/python-lang-id/pail/test-data/sm/expected/test_02.parquet.gz b/tests/tests/python-lang-id/pail/test-data/sm/expected/test_02.parquet.gz new file mode 100644 index 00000000..f5794f38 Binary files /dev/null and b/tests/tests/python-lang-id/pail/test-data/sm/expected/test_02.parquet.gz differ diff --git a/tests/tests/python-lang-id/pail/test-data/sm/expected/test_03.parquet.gz b/tests/tests/python-lang-id/pail/test-data/sm/expected/test_03.parquet.gz new file mode 100644 index 00000000..16ed7e1c Binary files /dev/null and b/tests/tests/python-lang-id/pail/test-data/sm/expected/test_03.parquet.gz differ diff --git a/tests/tests/python-lang-id/pail/test-data/sm/input/test_01.parquet.gz b/tests/tests/python-lang-id/pail/test-data/sm/input/test_01.parquet.gz new file mode 100644 index 00000000..5964c3ed Binary files /dev/null and b/tests/tests/python-lang-id/pail/test-data/sm/input/test_01.parquet.gz differ diff --git a/tests/tests/python-lang-id/pail/test-data/sm/input/test_02.parquet.gz b/tests/tests/python-lang-id/pail/test-data/sm/input/test_02.parquet.gz new file mode 100644 index 00000000..da07c83e Binary files /dev/null and b/tests/tests/python-lang-id/pail/test-data/sm/input/test_02.parquet.gz differ diff --git a/tests/tests/python-lang-id/pail/test-data/sm/input/test_03.parquet.gz 
b/tests/tests/python-lang-id/pail/test-data/sm/input/test_03.parquet.gz
new file mode 100644
index 00000000..4701f42d
Binary files /dev/null and b/tests/tests/python-lang-id/pail/test-data/sm/input/test_03.parquet.gz differ
diff --git a/tests/tests/python-lang-id/post.sh b/tests/tests/python-lang-id/post.sh
new file mode 100755
index 00000000..f52f8c8e
--- /dev/null
+++ b/tests/tests/python-lang-id/post.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+DATA="$TEST_PATH"/pail/test-data/sm
+
+for i in $(seq 1 3)
+do
+  actual="$DATA"/input/test_0$i.output.parquet
+  expected="$DATA"/expected/test_0$i.parquet.gz
+
+  if [ -f "$actual" ]
+  then echo "✅ PASS found local task output file=$actual test=$TEST_NAME"
+  else echo "❌ FAIL cannot find local task output file=$actual test=$TEST_NAME" && exit 1
+  fi
+
+  actual_sha256=$(cat "$actual" | sha256)
+  expected_sha256=$(gzcat "$expected" | sha256 )
+
+  if [ "$actual_sha256" = "$expected_sha256" ]
+  then echo "✅ PASS matching checksum for task output file=$actual test=$TEST_NAME" && rm -f "$actual"
+  else echo "❌ FAIL mismatched checksum for task output file=$actual test=$TEST_NAME" && exit 1
+  fi
+done
diff --git a/tests/tests/python-lang-id/settings.sh b/tests/tests/python-lang-id/settings.sh
new file mode 100644
index 00000000..38414948
--- /dev/null
+++ b/tests/tests/python-lang-id/settings.sh
@@ -0,0 +1,6 @@
+api=workqueue
+
+expected=("Transforming one table")
+NUM_DESIRED_OUTPUTS=1
+
+up_args='<(gzcat "$TEST_PATH"/pail/test-data/sm/input/test_01.parquet.gz) <(gzcat "$TEST_PATH"/pail/test-data/sm/input/test_02.parquet.gz) <(gzcat "$TEST_PATH"/pail/test-data/sm/input/test_03.parquet.gz)'