From eebd63eec8b0df0eb954220405dab1fd52f775ca Mon Sep 17 00:00:00 2001 From: Andrew Tavis McAllister Date: Mon, 18 Mar 2024 01:46:27 +0100 Subject: [PATCH] #77 formatting for translation, adding dependencies, trying Mac CI --- .github/workflows/python_package_ci.yaml | 6 +- README.md | 2 +- docs/source/conf.py | 7 +- .../scribe_data/extract_transform/index.rst | 1 + .../extract_transform/translation/index.rst | 4 + requirements.txt | 4 +- .../English/translations/translate_words.py | 101 +++++------------- .../Russian/translations/translate_words.py | 34 ++++-- .../extract_transform/translate.py | 84 --------------- src/scribe_data/utils.py | 93 ++++++++++------ 10 files changed, 126 insertions(+), 210 deletions(-) create mode 100644 docs/source/scribe_data/extract_transform/translation/index.rst delete mode 100644 src/scribe_data/extract_transform/translate.py diff --git a/.github/workflows/python_package_ci.yaml b/.github/workflows/python_package_ci.yaml index c2e37b90..cfb459ec 100644 --- a/.github/workflows/python_package_ci.yaml +++ b/.github/workflows/python_package_ci.yaml @@ -12,11 +12,9 @@ jobs: fail-fast: false matrix: os: - # Removing 'macos-latest' for now until build issue is fixed. - # https://github.com/scribe-org/Scribe-Data/issues/61 - # - macos-latest + - macos-latest - ubuntu-latest - python-version: + python-version: - "3.9" runs-on: ${{ matrix.os }} diff --git a/README.md b/README.md index f4975e11..a03a3b56 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Check out Scribe's [architecture diagrams](https://github.com/scribe-org/Organiz [scribe_data/extract_transform/update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) and the notebooks within the [scribe_data/extract_transform](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform) directory are used to update all data for [Scribe-iOS](https://github.com/scribe-org/Scribe-iOS), with this functionality later being expanded to update [Scribe-Android](https://github.com/scribe-org/Scribe-Android) and [Scribe-Desktop](https://github.com/scribe-org/Scribe-Desktop) when they're active. -The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper) as a URI. The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are ran in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being ran in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/gen_emoji_lexicon.ipynb). 
+The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper). The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are run in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being run in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb). Running [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) is done via the following CLI command: diff --git a/docs/source/conf.py b/docs/source/conf.py index 90854084..716465b1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -54,16 +54,21 @@ autodoc_mock_imports = [ "beautifulsoup4", "emoji", + "flax", "langcodes", - "language_data", "mwparserfromhell", "pandas", + "pyarrow", "PyICU", "pytest", "pytest-cov", + "ruff", "sentencepiece", "SPARQLWrapper", "tabulate", + "tensorflow", + "torch", + "tqdm", "transformers", ] diff --git a/docs/source/scribe_data/extract_transform/index.rst b/docs/source/scribe_data/extract_transform/index.rst index 067fae69..a92e70aa 100644 --- a/docs/source/scribe_data/extract_transform/index.rst +++ b/docs/source/scribe_data/extract_transform/index.rst @@ -7,6 +7,7 @@ extract_transform :maxdepth: 1 languages/index + translation/index unicode/index wikidata/index wikipedia/index diff --git a/docs/source/scribe_data/extract_transform/translation/index.rst b/docs/source/scribe_data/extract_transform/translation/index.rst new file mode 100644 index 00000000..8fdd0a3a --- /dev/null +++ b/docs/source/scribe_data/extract_transform/translation/index.rst @@ -0,0 +1,4 @@ +translation +=========== + +`View code on Github `_ diff --git a/requirements.txt b/requirements.txt index a83ef552..d4f8b382 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ beautifulsoup4==4.9.3 certifi>=2020.12.5 defusedxml==0.7.1 emoji>=2.2.0 +flax>=0.8.2 langcodes>=3.0.0 -language_data>=1.0.0 m2r2>=0.3.3 mwparserfromhell>=0.6 numpydoc>=1.6.0 @@ -18,5 +18,7 @@ sentencepiece>=0.1.95 SPARQLWrapper>=2.0.0 sphinx-rtd-theme>=2.0.0 tabulate>=0.8.9 +tensorflow>=2.0 +torch>=2.2.1 tqdm==4.56.1 transformers>=4.12 diff --git a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py index 1efff8aa..944c9cd4 100644 --- a/src/scribe_data/extract_transform/languages/English/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/English/translations/translate_words.py @@ -1,86 +1,41 @@ """ Translates the English words queried from Wikidata to all other Scribe languages.
+ +Example +------- + python3 src/scribe_data/extract_transform/languages/English/translations/translate_words.py """ import json import os -import signal - -from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - - -def translate_words(words_path: str): - with open(words_path, "r", encoding="utf-8") as file: - words_json_data = json.load(file) - - word_list = [] - - for item in words_json_data: - word_list.append(item["word"]) - - model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") - tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") - - with open( - "../../../../../scribe_data/resources/language_meta_data.json", - "r", - encoding="utf-8", - ) as file: - lang_json_data = json.load(file) - iso_list = [lang["iso"] for lang in lang_json_data["languages"]] - - target_languages = iso_list - - translations = [] - - if os.path.exists("../formatted_data/translated_words.json"): - with open( - "../formatted_data/translated_words.json", "r", encoding="utf-8" - ) as file: - translations = json.load(file) - - def signal_handler(sig, frame): - print( - "\nThe interrupt signal has been caught and the current progress is being saved..." - ) - with open( - "../formatted_data/translated_words.json", "w", encoding="utf-8" - ) as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - file.write("\n") - - print("The current progress has been saved to the translated_words.json file.") - exit() - - signal.signal(signal.SIGINT, signal_handler) +import sys - for word in word_list[len(translations) :]: - word_translations = {word: {}} - for lang_code in target_languages: - tokenizer.src_lang = "en" - encoded_word = tokenizer(word, return_tensors="pt") - generated_tokens = model.generate( - **encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code) - ) - translated_word = tokenizer.batch_decode( - generated_tokens, skip_special_tokens=True - )[0] - word_translations[word][lang_code] = translated_word +PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] +PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" +sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) - translations.append(word_translations) +from scribe_data.utils import translate_to_other_languages - with open( - "../formatted_data/translated_words.json", "w", encoding="utf-8" - ) as file: - json.dump(translations, file, ensure_ascii=False, indent=4) - file.write("\n") +SRC_LANG = "English" +translate_script_dir = os.path.dirname(os.path.abspath(__file__)) +words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json") - print(f"Translation results for the word '{word}' have been saved.") +with open(words_to_translate_path, "r", encoding="utf-8") as file: + json_data = json.load(file) - print( - "Translation results for all words are saved to the translated_words.json file." 
- ) +word_list = [item["word"] for item in json_data] +translations = {} +translated_words_path = os.path.join( + translate_script_dir, "../formatted_data/translated_words.json" +) +if os.path.exists(translated_words_path): + with open(translated_words_path, "r", encoding="utf-8") as file: + translations = json.load(file) -if __name__ == "__main__": - translate_words("words_to_translate.json") +translate_to_other_languages( + source_language=SRC_LANG, + word_list=word_list, + translations=translations, + batch_size=100, +) diff --git a/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py b/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py index 103ca2b5..a9c295fc 100644 --- a/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py +++ b/src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py @@ -1,3 +1,11 @@ +""" +Translates the Russian words queried from Wikidata to all other Scribe languages. + +Example +------- + python3 src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py +""" + import json import os import sys @@ -6,24 +14,28 @@ PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) -from scribe_data.utils import translate_to_other_languages, translation_interrupt_handler +from scribe_data.utils import translate_to_other_languages +SRC_LANG = "Russian" translate_script_dir = os.path.dirname(os.path.abspath(__file__)) -words_to_translate_path = os.path.join(translate_script_dir, 'words_to_translate.json') +words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json") -with open(words_to_translate_path, 'r', encoding='utf-8') as file: +with open(words_to_translate_path, "r", encoding="utf-8") as file: json_data = json.load(file) -word_list = [] -for item in json_data: - word_list.append(item["word"]) - -src_lang="Russian" +word_list = [item["word"] for item in json_data] translations = {} -translated_words_path = os.path.join(translate_script_dir, '../formatted_data/translated_words.json') +translated_words_path = os.path.join( + translate_script_dir, "../formatted_data/translated_words.json" +) if os.path.exists(translated_words_path): - with open(translated_words_path, 'r', encoding='utf-8') as file: + with open(translated_words_path, "r", encoding="utf-8") as file: translations = json.load(file) -translate_to_other_languages(src_lang, word_list, translations, batch_size=100) \ No newline at end of file +translate_to_other_languages( + source_language=SRC_LANG, + word_list=word_list, + translations=translations, + batch_size=100, +) diff --git a/src/scribe_data/extract_transform/translate.py b/src/scribe_data/extract_transform/translate.py deleted file mode 100644 index 538e0f21..00000000 --- a/src/scribe_data/extract_transform/translate.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -Translates the words queried from Wikidata using query_words_to_translate.sparql. 
- -Example -------- - python translate.py '["French", "Portuguese"]' '["German"]' -""" - -import collections -import json -import os -import sys - -from tqdm.auto import tqdm -from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer - -PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] -PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" -sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) - -from scribe_data.utils import ( - check_and_return_command_line_args, - get_language_iso, - get_scribe_languages, -) - -# Note: Check whether arguments have been passed to only update a subset of the data. -src_languages, target_languages = check_and_return_command_line_args( - all_args=sys.argv, - first_args_check=get_scribe_languages(), - second_args_check=get_scribe_languages(), -) - -# Assign current_languages and current_word_types if no arguments have been passed. -if src_languages is None: - src_languages = get_scribe_languages() - -if target_languages is None: - target_languages = get_scribe_languages() - -for src_lang in src_languages: - for target_lang in [l for l in target_languages if l != src_lang]: - print( - f"Translating {get_language_iso(src_lang)} to {get_language_iso(target_lang)}" - ) - -""" -Note: Before `target_lang` is defined. - -with open("src_lang/words_to_translate.json", encoding="utf-8") as f: - translations_list = json.load(f) - -words_to_translate = [translation_vals["value"] for translation_vals in translations_list] -words_to_translate = list(set(words_to_translate)) - -translations_formatted = {} - -Note: After `target_lang` is defined. - -for w in tqdm( - words_to_translate[:100], - desc="Words translated", - unit="word", -): - See: https://huggingface.co/facebook/m2m100_418M - Output: - { - book: { - "es": "libro", - "de": "Buch" - } - } - -translations_formatted = collections.OrderedDict(sorted(translations_formatted.items())) - -with open( - "src_lang/formatted_data/translations.json", - "w", - encoding="utf-8", -) as f: - json.dump(translations_formatted, f, ensure_ascii=False, indent=0) - -print(f"Wrote file translations.json with {len(translations_formatted)} translations.") -""" diff --git a/src/scribe_data/utils.py b/src/scribe_data/utils.py index b73bb2ab..303cb1af 100644 --- a/src/scribe_data/utils.py +++ b/src/scribe_data/utils.py @@ -11,10 +11,9 @@ get_language_words_to_remove, get_language_words_to_ignore, get_language_dir_path, - get_path_from_format_file, - get_language_dir_path, load_queried_data, export_formatted_data, + get_path_from_format_file, get_path_from_load_dir, get_path_from_et_dir, get_ios_data_path, @@ -22,8 +21,8 @@ get_desktop_data_path, check_command_line_args, check_and_return_command_line_args, - translation_interrupt_handler, get_target_langcodes, + translation_interrupt_handler, translate_to_other_languages, map_genders """ @@ -39,7 +38,6 @@ import langcodes from langcodes import Language - from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer PROJECT_ROOT = "Scribe-Data" @@ -296,7 +294,7 @@ def load_queried_data(file_path, language, data_type): data_path = queried_data_file else: update_data_in_use = True - data_path = f"{_get_language_dir_path(language)}/{data_type}/{queried_data_file}" + data_path = f"{get_language_dir_path(language)}/{data_type}/{queried_data_file}" with open(data_path, encoding="utf-8") as f: return json.load(f), update_data_in_use, data_path @@ -322,7 +320,9 @@ def export_formatted_data(formatted_data, update_data_in_use, language, data_typ None 
""" if update_data_in_use: - export_path = f"{_get_language_dir_path(language)}/formatted_data/{data_type}.json" + export_path = ( + f"{get_language_dir_path(language)}/formatted_data/{data_type}.json" + ) else: export_path = f"{data_type}.json" @@ -513,6 +513,25 @@ def check_and_return_command_line_args( ) +def get_target_langcodes(source_lang) -> list[str]: + """ + Returns a list of target language ISO codes for translation. + + Parameters + ---------- + source_lang : str + The source language being translated from. + + Returns + ------- + list[str] + A list of target language ISO codes. + """ + return [ + get_language_iso(lang) for lang in get_scribe_languages() if lang != source_lang + ] + + def translation_interrupt_handler(source_language, translations): """ Handles interrupt signals and saves the current translation progress. @@ -525,33 +544,20 @@ def translation_interrupt_handler(source_language, translations): translations : list[dict] The current list of translations. """ - print("\nThe interrupt signal has been caught and the current progress is being saved...") - with open(f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", 'w', encoding='utf-8') as file: + print( + "\nThe interrupt signal has been caught and the current progress is being saved..." + ) + + with open( + f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + "w", + encoding="utf-8", + ) as file: json.dump(translations, file, ensure_ascii=False, indent=4) + print("The current progress is saved to the translated_words.json file.") exit() -def get_target_langcodes(source_lang)->list[str]: - """ - Returns a list of target language ISO codes for translation. - - Parameters - ---------- - source_lang : str - The source language being translated from. - - Returns - ------- - list[str] - A list of target language ISO codes. 
- """ - target_langcodes=[] - for lang in get_scribe_languages(): - if lang!=source_lang: - target_langcodes.append(get_language_iso(lang)) - else: - continue - return target_langcodes def translate_to_other_languages(source_language, word_list, translations, batch_size): """ @@ -574,26 +580,43 @@ def translate_to_other_languages(source_language, word_list, translations, batch model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") - signal.signal(signal.SIGINT, lambda sig, frame: translation_interrupt_handler(source_language, translations)) + signal.signal( + signal.SIGINT, + lambda sig, frame: translation_interrupt_handler(source_language, translations), + ) for i in range(0, len(word_list), batch_size): - batch_words = word_list[i:i+batch_size] + batch_words = word_list[i : i + batch_size] print(f"Translating batch {i//batch_size + 1}: {batch_words}") + for lang_code in get_target_langcodes(source_language): tokenizer.src_lang = get_language_iso(source_language) encoded_words = tokenizer(batch_words, return_tensors="pt", padding=True) - generated_tokens = model.generate(**encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code)) - translated_words = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) + generated_tokens = model.generate( + **encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code) + ) + translated_words = tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + ) + for word, translation in zip(batch_words, translated_words): if word not in translations: translations[word] = {} + translations[word][lang_code] = translation + print(f"Batch {i//batch_size + 1} translation completed.") - with open(f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", 'w', encoding='utf-8') as file: + with open( + f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json", + "w", + encoding="utf-8", + ) as file: json.dump(translations, file, ensure_ascii=False, indent=4) - print("Translation results for all words are saved to the translated_words.json file.") + print( + "Translation results for all words are saved to the translated_words.json file." + ) def map_genders(wikidata_gender):