Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Translate words from Russian to other Scribe languages #89

Merged
Merged 14 commits on Mar 18, 2024
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import json
import os
import signal
import sys

# Resolve the Scribe-Data src directory so that scribe_data can be imported
# when this script is run from inside the languages tree.
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src"
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)

from scribe_data.utils import translate_to_other_languages, translation_interrupt_handler

translate_script_dir = os.path.dirname(os.path.abspath(__file__))
words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json")

with open(words_to_translate_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# BUG FIX: word_list was initialized as a dict ({}), which has no .append
# method and raised AttributeError on the first iteration. It must be a
# list of the words to translate.
word_list = [item["word"] for item in json_data]

# The source language to translate from; all other Scribe languages are
# used as translation targets.
src_lang = "Russian"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Modify the src_lang variable to specify the source language for the translations.
Keep the rest of the translate_words.py script unchanged.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔥🔥🔥 Nice, @shashank-iitbhu :) I'll take a bit more of a look at this, and maybe we can move this to an all caps var at the top. I can handle this myself though 😊


# Resume from previously saved progress, if any, so interrupted runs do not
# re-translate words that were already completed.
translations = {}
translated_words_path = os.path.join(
    translate_script_dir, "../formatted_data/translated_words.json"
)
if os.path.exists(translated_words_path):
    with open(translated_words_path, "r", encoding="utf-8") as saved_file:
        translations = json.load(saved_file)

# On Ctrl+C, persist the partial translations before the process exits.
signal.signal(
    signal.SIGINT,
    lambda signum, frame: translation_interrupt_handler(src_lang, translations),
)

translate_to_other_languages(src_lang, word_list, translations)
108 changes: 107 additions & 1 deletion src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,23 @@
get_language_from_iso,
get_language_words_to_remove,
get_language_words_to_ignore,
get_language_dir_path,
get_path_from_format_file,
get_path_from_load_dir,
get_path_from_et_dir,
get_ios_data_path,
get_android_data_path,
get_desktop_data_path,
check_command_line_args,
check_and_return_command_line_args
check_and_return_command_line_args,
translation_interrupt_handler,
get_target_langcodes,
translate_to_other_languages
"""

import ast
import json
import os
import sys
from importlib import resources
from pathlib import Path
Expand All @@ -30,6 +35,8 @@
import langcodes
from langcodes import Language

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

PROJECT_ROOT = "Scribe-Data"


Expand Down Expand Up @@ -240,6 +247,24 @@ def get_language_words_to_ignore(language: str) -> list[str]:
)


def get_language_dir_path(language):
"""
Returns the directory path for a specific language within the Scribe-Data project.

Parameters
----------
language : str
The language for which the directory path is needed.

Returns
-------
str
The directory path for the specified language.
"""
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
return f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages/{language}"


def get_path_from_format_file() -> str:
"""
Returns the directory path from a data formatting file to scribe-org.
Expand Down Expand Up @@ -420,3 +445,84 @@ def check_and_return_command_line_args(
python {all_args[0]} '["comma_separated_sets_in_quotes"]'
"""
)


def translation_interrupt_handler(source_language, translations):
    """
    Handles interrupt signals and saves the current translation progress.

    Parameters
    ----------
    source_language : str
        The source language being translated from.

    translations : dict
        The current dictionary of translations, keyed by source word.
    """
    print("\nThe interrupt signal has been caught and the current progress is being saved...")
    with open(
        f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json",
        "w",
        encoding="utf-8",
    ) as file:
        json.dump(translations, file, ensure_ascii=False, indent=4)

    print("The current progress is saved to the translated_words.json file.")

    # sys.exit raises SystemExit explicitly; the bare exit() helper is added
    # by the site module and is not guaranteed to exist in all contexts.
    sys.exit(0)

def get_target_langcodes(source_lang) -> list[str]:
    """
    Returns a list of target language ISO codes for translation.

    Parameters
    ----------
    source_lang : str
        The source language being translated from.

    Returns
    -------
    list[str]
        The ISO codes of all Scribe languages except the source language.
    """
    # A comprehension replaces the manual append loop and its no-op
    # `else: continue` branch.
    return [
        get_language_iso(lang)
        for lang in get_scribe_languages()
        if lang != source_lang
    ]

def translate_to_other_languages(source_language, word_list, translations, batch_size=10):
    """
    Translates a list of words from the source language to other target languages using batch processing.

    Parameters
    ----------
    source_language : str
        The source language being translated from.

    word_list : list[str]
        The list of words to translate.

    translations : dict
        The current dictionary of translations, updated in place.

    batch_size : int
        The number of words to translate in each batch.
    """
    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

    # The tokenizer's source language is constant for the whole run, so set
    # it once instead of re-assigning it for every target language.
    tokenizer.src_lang = get_language_iso(source_language)

    for i in range(0, len(word_list), batch_size):
        batch_words = word_list[i : i + batch_size]
        batch_number = i // batch_size + 1
        print(f"Translating batch {batch_number}: {batch_words}")

        # The encoded batch depends only on the source language and the batch
        # itself, so tokenize once per batch rather than once per target
        # language (the original re-encoded identical input in the inner loop).
        encoded_words = tokenizer(batch_words, return_tensors="pt", padding=True)

        for lang_code in get_target_langcodes(source_language):
            generated_tokens = model.generate(
                **encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code)
            )
            translated_words = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            for word, translation in zip(batch_words, translated_words):
                if word not in translations:
                    translations[word] = {}
                translations[word][lang_code] = translation

        print(f"Batch {batch_number} translation completed.")

    with open(
        f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json",
        "w",
        encoding="utf-8",
    ) as file:
        json.dump(translations, file, ensure_ascii=False, indent=4)

    print("Translation results for all words are saved to the translated_words.json file.")
Loading