Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Translate words from Russian to other Scribe languages #89

Merged
Merged 14 commits on Mar 18, 2024
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import json
import os
import signal
import sys

# Resolve the Scribe-Data src directory so that scribe_data can be imported
# when this script is run from inside the languages tree.
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src"
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)

from scribe_data.utils import translate_to_other_languages, translation_interrupt_handler

translate_script_dir = os.path.dirname(os.path.abspath(__file__))
words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json")

with open(words_to_translate_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# BUG FIX: word_list was initialized as a dict ({}), which has no .append
# method and raised AttributeError on the first iteration. It must be a
# list of the words to translate.
word_list = [item["word"] for item in json_data]

# The source language to translate from; all other Scribe languages are
# used as translation targets.
src_lang = "Russian"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Modify the src_lang variable to specify the source language for the translations.
Keep the rest of the translate_words.py script unchanged.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔥🔥🔥 Nice, @shashank-iitbhu :) I'll take a bit more of a look at this, and maybe we can move this to an all caps var at the top. I can handle this myself though 😊


# Resume from previously saved progress, if any, so interrupted runs do not
# re-translate words that were already completed.
translations = {}
translated_words_path = os.path.join(
    translate_script_dir, "../formatted_data/translated_words.json"
)
if os.path.exists(translated_words_path):
    with open(translated_words_path, "r", encoding="utf-8") as saved_file:
        translations = json.load(saved_file)

# On Ctrl+C, persist the partial translations before the process exits.
signal.signal(
    signal.SIGINT,
    lambda signum, frame: translation_interrupt_handler(src_lang, translations),
)

translate_to_other_languages(src_lang, word_list, translations)
108 changes: 107 additions & 1 deletion src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,23 @@
get_language_from_iso,
get_language_words_to_remove,
get_language_words_to_ignore,
get_language_dir_path,
get_path_from_format_file,
get_path_from_load_dir,
get_path_from_et_dir,
get_ios_data_path,
get_android_data_path,
get_desktop_data_path,
check_command_line_args,
check_and_return_command_line_args
check_and_return_command_line_args,
translation_interrupt_handler,
get_target_langcodes,
translate_to_other_languages
"""

import ast
import json
import os
import sys
from importlib import resources
from pathlib import Path
Expand All @@ -30,6 +35,8 @@
import langcodes
from langcodes import Language

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

PROJECT_ROOT = "Scribe-Data"


Expand Down Expand Up @@ -240,6 +247,24 @@ def get_language_words_to_ignore(language: str) -> list[str]:
)


def get_language_dir_path(language):
"""
Returns the directory path for a specific language within the Scribe-Data project.

Parameters
----------
language : str
The language for which the directory path is needed.

Returns
-------
str
The directory path for the specified language.
"""
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0]
return f"{PATH_TO_SCRIBE_ORG}/Scribe-Data/src/scribe_data/extract_transform/languages/{language}"


def get_path_from_format_file() -> str:
"""
Returns the directory path from a data formatting file to scribe-org.
Expand Down Expand Up @@ -420,3 +445,84 @@ def check_and_return_command_line_args(
python {all_args[0]} '["comma_separated_sets_in_quotes"]'
"""
)


def translation_interrupt_handler(source_language, translations):
    """
    Handles interrupt signals and saves the current translation progress.

    Parameters
    ----------
    source_language : str
        The source language being translated from.

    translations : dict
        The current dictionary of translations, keyed by source word.
    """
    print("\nThe interrupt signal has been caught and the current progress is being saved...")
    with open(
        f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json",
        "w",
        encoding="utf-8",
    ) as file:
        json.dump(translations, file, ensure_ascii=False, indent=4)

    print("The current progress is saved to the translated_words.json file.")

    # sys.exit raises SystemExit explicitly; the bare exit() helper is added
    # by the site module and is not guaranteed to exist in all contexts.
    sys.exit(0)

def get_target_langcodes(source_lang) -> list[str]:
    """
    Returns a list of target language ISO codes for translation.

    Parameters
    ----------
    source_lang : str
        The source language being translated from.

    Returns
    -------
    list[str]
        The ISO codes of all Scribe languages except the source language.
    """
    # A comprehension replaces the manual append loop and its no-op
    # `else: continue` branch.
    return [
        get_language_iso(lang)
        for lang in get_scribe_languages()
        if lang != source_lang
    ]

def translate_to_other_languages(source_language, word_list, translations, batch_size=10):
    """
    Translates a list of words from the source language to other target languages using batch processing.

    Parameters
    ----------
    source_language : str
        The source language being translated from.

    word_list : list[str]
        The list of words to translate.

    translations : dict
        The current dictionary of translations, updated in place.

    batch_size : int
        The number of words to translate in each batch.
    """
    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

    # The tokenizer's source language is constant for the whole run, so set
    # it once instead of re-assigning it for every target language.
    tokenizer.src_lang = get_language_iso(source_language)

    for i in range(0, len(word_list), batch_size):
        batch_words = word_list[i : i + batch_size]
        batch_number = i // batch_size + 1
        print(f"Translating batch {batch_number}: {batch_words}")

        # The encoded batch depends only on the source language and the batch
        # itself, so tokenize once per batch rather than once per target
        # language (the original re-encoded identical input in the inner loop).
        encoded_words = tokenizer(batch_words, return_tensors="pt", padding=True)

        for lang_code in get_target_langcodes(source_language):
            generated_tokens = model.generate(
                **encoded_words, forced_bos_token_id=tokenizer.get_lang_id(lang_code)
            )
            translated_words = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            for word, translation in zip(batch_words, translated_words):
                if word not in translations:
                    translations[word] = {}
                translations[word][lang_code] = translation

        print(f"Batch {batch_number} translation completed.")

    with open(
        f"{get_language_dir_path(source_language)}/formatted_data/translated_words.json",
        "w",
        encoding="utf-8",
    ) as file:
        json.dump(translations, file, ensure_ascii=False, indent=4)

    print("Translation results for all words are saved to the translated_words.json file.")
Loading