scribe-org#77 formatting for translation, adding dependencies, trying Mac CI
andrewtavis committed Mar 18, 2024
1 parent cc41160 commit eebd63e
Showing 10 changed files with 126 additions and 210 deletions.
6 changes: 2 additions & 4 deletions .github/workflows/python_package_ci.yaml
@@ -12,11 +12,9 @@ jobs:
fail-fast: false
matrix:
os:
# Removing 'macos-latest' for now until build issue is fixed.
# https://github.com/scribe-org/Scribe-Data/issues/61
# - macos-latest
- macos-latest
- ubuntu-latest
python-version:
python-version:
- "3.9"

runs-on: ${{ matrix.os }}
2 changes: 1 addition & 1 deletion README.md
@@ -41,7 +41,7 @@ Check out Scribe's [architecture diagrams](https://github.com/scribe-org/Organiz

[scribe_data/extract_transform/update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) and the notebooks within the [scribe_data/extract_transform](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform) directory are used to update all data for [Scribe-iOS](https://github.com/scribe-org/Scribe-iOS), with this functionality later being expanded to update [Scribe-Android](https://github.com/scribe-org/Scribe-Android) and [Scribe-Desktop](https://github.com/scribe-org/Scribe-Desktop) when they're active.

The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper) as a URI. The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are run in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being run in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/gen_emoji_lexicon.ipynb).
The main data update process in [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) triggers [SPARQL queries](https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/languages) to query language data from [Wikidata](https://www.wikidata.org/) using [SPARQLWrapper](https://github.com/RDFLib/sparqlwrapper) as a URI. The autosuggestion process derives popular words from [Wikipedia](https://www.wikipedia.org/) as well as those words that normally follow them for an effective baseline feature until natural language processing methods are employed. Functions to generate autosuggestions are run in [gen_autosuggestions.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/wikipedia/gen_autosuggestions.ipynb). Emojis are further sourced from [Unicode CLDR](https://github.com/unicode-org/cldr), with this process being run in [gen_emoji_lexicon.ipynb](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/unicode/gen_emoji_lexicon.ipynb).
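As background for the query flow described above, here is a minimal sketch of pointing SPARQLWrapper at the Wikidata endpoint. The query string below is an illustrative assumption; the project's real queries live under scribe_data/extract_transform/languages/.

from SPARQLWrapper import JSON, SPARQLWrapper

# Point SPARQLWrapper at the Wikidata Query Service endpoint URI.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

# Toy query for ten French lexeme lemmas (illustrative only).
sparql.setQuery(
    """
    SELECT ?lemma WHERE {
      ?lexeme dct:language wd:Q150 ;
              wikibase:lemma ?lemma .
    }
    LIMIT 10
    """
)

results = sparql.query().convert()
for binding in results["results"]["bindings"]:
    print(binding["lemma"]["value"])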

Running [update_data.py](https://github.com/scribe-org/Scribe-Data/blob/main/src/scribe_data/extract_transform/update_data.py) is done via the following CLI command:

7 changes: 6 additions & 1 deletion docs/source/conf.py
@@ -54,16 +54,21 @@
autodoc_mock_imports = [
"beautifulsoup4",
"emoji",
"flax",
"langcodes",
"language_data",
"mwparserfromhell",
"pandas",
"pyarrow",
"PyICU",
"pytest",
"pytest-cov",
"ruff",
"sentencepiece",
"SPARQLWrapper",
"tabulate",
"tensorflow",
"torch",
"tqdm",
"transformers",
]
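For readers unfamiliar with the option: autodoc_mock_imports tells Sphinx to substitute a mock for each listed package at docs-build time, so heavy dependencies such as torch and tensorflow never have to be installed just to build the documentation. A rough sketch of the idea (not Sphinx's actual implementation):

import sys
from unittest import mock

# Register stand-in modules so `import torch` and friends succeed
# even though the real libraries are absent.
for pkg in ["flax", "tensorflow", "torch", "transformers"]:
    sys.modules[pkg] = mock.MagicMock()

import torch  # resolves to the mock, not the real library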

1 change: 1 addition & 0 deletions docs/source/scribe_data/extract_transform/index.rst
@@ -7,6 +7,7 @@ extract_transform
:maxdepth: 1

languages/index
translation/index
unicode/index
wikidata/index
wikipedia/index
4 changes: 4 additions & 0 deletions docs/source/scribe_data/extract_transform/translation/index.rst
@@ -0,0 +1,4 @@
translation
===========

`View code on GitHub <https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/translation>`_
4 changes: 3 additions & 1 deletion requirements.txt
@@ -2,8 +2,8 @@ beautifulsoup4==4.9.3
certifi>=2020.12.5
defusedxml==0.7.1
emoji>=2.2.0
flax>=0.8.2
langcodes>=3.0.0
language_data>=1.0.0
m2r2>=0.3.3
mwparserfromhell>=0.6
numpydoc>=1.6.0
@@ -18,5 +18,7 @@ sentencepiece>=0.1.95
SPARQLWrapper>=2.0.0
sphinx-rtd-theme>=2.0.0
tabulate>=0.8.9
tensorflow>=2.0
torch>=2.2.1
tqdm==4.56.1
transformers>=4.12
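The new flax, tensorflow, torch, and transformers pins support the M2M100 translation pipeline. A minimal sketch of loading and invoking the model, mirroring the checkpoint name and calls the pre-refactor script used:

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# The same checkpoint the old translate_words.py loaded explicitly.
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

tokenizer.src_lang = "en"
encoded = tokenizer("hello", return_tensors="pt")
generated = model.generate(
    **encoded, forced_bos_token_id=tokenizer.get_lang_id("de")
)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])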
src/scribe_data/extract_transform/languages/English/translations/translate_words.py
@@ -1,86 +1,41 @@
"""
Translates the English words queried from Wikidata to all other Scribe languages.
Example
-------
python3 src/scribe_data/extract_transform/languages/English/translations/translate_words.py
"""

import json
import os
import signal

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer


def translate_words(words_path: str):
with open(words_path, "r", encoding="utf-8") as file:
words_json_data = json.load(file)

word_list = []

for item in words_json_data:
word_list.append(item["word"])

model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

with open(
"../../../../../scribe_data/resources/language_meta_data.json",
"r",
encoding="utf-8",
) as file:
lang_json_data = json.load(file)
iso_list = [lang["iso"] for lang in lang_json_data["languages"]]

target_languages = iso_list

translations = []

if os.path.exists("../formatted_data/translated_words.json"):
with open(
"../formatted_data/translated_words.json", "r", encoding="utf-8"
) as file:
translations = json.load(file)

def signal_handler(sig, frame):
print(
"\nThe interrupt signal has been caught and the current progress is being saved..."
)
with open(
"../formatted_data/translated_words.json", "w", encoding="utf-8"
) as file:
json.dump(translations, file, ensure_ascii=False, indent=4)
file.write("\n")

print("The current progress has been saved to the translated_words.json file.")
exit()

signal.signal(signal.SIGINT, signal_handler)
import sys

for word in word_list[len(translations) :]:
word_translations = {word: {}}
for lang_code in target_languages:
tokenizer.src_lang = "en"
encoded_word = tokenizer(word, return_tensors="pt")
generated_tokens = model.generate(
**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code)
)
translated_word = tokenizer.batch_decode(
generated_tokens, skip_special_tokens=True
)[0]
word_translations[word][lang_code] = translated_word
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src"
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)

translations.append(word_translations)
from scribe_data.utils import translate_to_other_languages

with open(
"../formatted_data/translated_words.json", "w", encoding="utf-8"
) as file:
json.dump(translations, file, ensure_ascii=False, indent=4)
file.write("\n")
SRC_LANG = "English"
translate_script_dir = os.path.dirname(os.path.abspath(__file__))
words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json")

print(f"Translation results for the word '{word}' have been saved.")
with open(words_to_translate_path, "r", encoding="utf-8") as file:
json_data = json.load(file)

print(
"Translation results for all words are saved to the translated_words.json file."
)
word_list = [item["word"] for item in json_data]

translations = {}
translated_words_path = os.path.join(
translate_script_dir, "../formatted_data/translated_words.json"
)
if os.path.exists(translated_words_path):
with open(translated_words_path, "r", encoding="utf-8") as file:
translations = json.load(file)

if __name__ == "__main__":
translate_words("words_to_translate.json")
translate_to_other_languages(
source_language=SRC_LANG,
word_list=word_list,
translations=translations,
batch_size=100,
)
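The refactor moves the inline M2M100 loop shown above into scribe_data.utils.translate_to_other_languages, which this diff does not show. Based on the removed code, a plausible reconstruction of such a helper follows; the body and the target-language handling are assumptions, not the actual utils implementation:

import json

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer


def translate_to_other_languages(source_language, word_list, translations, batch_size=100):
    """Hypothetical sketch: translate words with M2M100, saving progress
    every batch_size words as the removed inline loop did per word."""
    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    tokenizer.src_lang = "en"  # assumption: source_language mapped to its ISO code

    # Assumption: target ISO codes come from resources/language_meta_data.json.
    target_languages = ["de", "es", "fr"]

    # Resume after any words already translated on a previous run.
    for i, word in enumerate(word_list[len(translations):], start=1):
        translations[word] = {}
        for lang_code in target_languages:
            encoded = tokenizer(word, return_tensors="pt")
            tokens = model.generate(
                **encoded, forced_bos_token_id=tokenizer.get_lang_id(lang_code)
            )
            translations[word][lang_code] = tokenizer.batch_decode(
                tokens, skip_special_tokens=True
            )[0]

        if i % batch_size == 0:
            with open("translated_words.json", "w", encoding="utf-8") as f:
                json.dump(translations, f, ensure_ascii=False, indent=4)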
src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py
@@ -1,3 +1,11 @@
"""
Translates the Russian words queried from Wikidata to all other Scribe languages.
Example
-------
python3 src/scribe_data/extract_transform/languages/Russian/translations/translate_words.py
"""

import json
import os
import sys
@@ -6,24 +6,28 @@
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src"
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)

from scribe_data.utils import translate_to_other_languages, translation_interrupt_handler
from scribe_data.utils import translate_to_other_languages

SRC_LANG = "Russian"
translate_script_dir = os.path.dirname(os.path.abspath(__file__))
words_to_translate_path = os.path.join(translate_script_dir, 'words_to_translate.json')
words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json")

with open(words_to_translate_path, 'r', encoding='utf-8') as file:
with open(words_to_translate_path, "r", encoding="utf-8") as file:
json_data = json.load(file)

word_list = []
for item in json_data:
word_list.append(item["word"])

src_lang="Russian"
word_list = [item["word"] for item in json_data]

translations = {}
translated_words_path = os.path.join(translate_script_dir, '../formatted_data/translated_words.json')
translated_words_path = os.path.join(
translate_script_dir, "../formatted_data/translated_words.json"
)
if os.path.exists(translated_words_path):
with open(translated_words_path, 'r', encoding='utf-8') as file:
with open(translated_words_path, "r", encoding="utf-8") as file:
translations = json.load(file)

translate_to_other_languages(src_lang, word_list, translations, batch_size=100)
translate_to_other_languages(
source_language=SRC_LANG,
word_list=word_list,
translations=translations,
batch_size=100,
)
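The import change above drops translation_interrupt_handler, suggesting the SIGINT progress-saving behavior from the old English script now also lives inside translate_to_other_languages. A hedged sketch of such a handler, reconstructed from the removed code:

import json
import signal
import sys


def translation_interrupt_handler(translations, save_path):
    """Hypothetical sketch: save accumulated translations when Ctrl+C is caught."""

    def handler(sig, frame):
        print("\nThe interrupt signal has been caught and the current progress is being saved...")
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(translations, f, ensure_ascii=False, indent=4)
        sys.exit(0)

    signal.signal(signal.SIGINT, handler)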
84 changes: 0 additions & 84 deletions src/scribe_data/extract_transform/translate.py

This file was deleted.
