forked from scribe-org/Scribe-Data
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
scribe-org#77 formatting for translation, adding dependencies, trying…
… Mac CI
- Loading branch information
1 parent
cc41160
commit eebd63e
Showing
10 changed files
with
126 additions
and
210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ extract_transform | |
:maxdepth: 1 | ||
|
||
languages/index | ||
translation/index | ||
unicode/index | ||
wikidata/index | ||
wikipedia/index |
4 changes: 4 additions & 0 deletions
4
docs/source/scribe_data/extract_transform/translation/index.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
translation | ||
=========== | ||
|
||
`View code on Github <https://github.com/scribe-org/Scribe-Data/tree/main/src/scribe_data/extract_transform/translation>`_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
101 changes: 28 additions & 73 deletions
101
src/scribe_data/extract_transform/languages/English/translations/translate_words.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,86 +1,41 @@ | ||
""" | ||
Translates the English words queried from Wikidata to all other Scribe languages. | ||
Example | ||
------- | ||
python3 src/scribe_data/extract_transform/languages/English/translations/translate_words.py | ||
""" | ||
|
||
import json | ||
import os | ||
import signal | ||
|
||
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer | ||
|
||
|
||
def translate_words(words_path: str): | ||
with open(words_path, "r", encoding="utf-8") as file: | ||
words_json_data = json.load(file) | ||
|
||
word_list = [] | ||
|
||
for item in words_json_data: | ||
word_list.append(item["word"]) | ||
|
||
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") | ||
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") | ||
|
||
with open( | ||
"../../../../../scribe_data/resources/language_meta_data.json", | ||
"r", | ||
encoding="utf-8", | ||
) as file: | ||
lang_json_data = json.load(file) | ||
iso_list = [lang["iso"] for lang in lang_json_data["languages"]] | ||
|
||
target_languages = iso_list | ||
|
||
translations = [] | ||
|
||
if os.path.exists("../formatted_data/translated_words.json"): | ||
with open( | ||
"../formatted_data/translated_words.json", "r", encoding="utf-8" | ||
) as file: | ||
translations = json.load(file) | ||
|
||
def signal_handler(sig, frame): | ||
print( | ||
"\nThe interrupt signal has been caught and the current progress is being saved..." | ||
) | ||
with open( | ||
"../formatted_data/translated_words.json", "w", encoding="utf-8" | ||
) as file: | ||
json.dump(translations, file, ensure_ascii=False, indent=4) | ||
file.write("\n") | ||
|
||
print("The current progress has been saved to the translated_words.json file.") | ||
exit() | ||
|
||
signal.signal(signal.SIGINT, signal_handler) | ||
import sys | ||
|
||
for word in word_list[len(translations) :]: | ||
word_translations = {word: {}} | ||
for lang_code in target_languages: | ||
tokenizer.src_lang = "en" | ||
encoded_word = tokenizer(word, return_tensors="pt") | ||
generated_tokens = model.generate( | ||
**encoded_word, forced_bos_token_id=tokenizer.get_lang_id(lang_code) | ||
) | ||
translated_word = tokenizer.batch_decode( | ||
generated_tokens, skip_special_tokens=True | ||
)[0] | ||
word_translations[word][lang_code] = translated_word | ||
PATH_TO_SCRIBE_ORG = os.path.dirname(sys.path[0]).split("Scribe-Data")[0] | ||
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src" | ||
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC) | ||
|
||
translations.append(word_translations) | ||
from scribe_data.utils import translate_to_other_languages | ||
|
||
with open( | ||
"../formatted_data/translated_words.json", "w", encoding="utf-8" | ||
) as file: | ||
json.dump(translations, file, ensure_ascii=False, indent=4) | ||
file.write("\n") | ||
SRC_LANG = "English" | ||
translate_script_dir = os.path.dirname(os.path.abspath(__file__)) | ||
words_to_translate_path = os.path.join(translate_script_dir, "words_to_translate.json") | ||
|
||
print(f"Translation results for the word '{word}' have been saved.") | ||
with open(words_to_translate_path, "r", encoding="utf-8") as file: | ||
json_data = json.load(file) | ||
|
||
print( | ||
"Translation results for all words are saved to the translated_words.json file." | ||
) | ||
word_list = [item["word"] for item in json_data] | ||
|
||
translations = {} | ||
translated_words_path = os.path.join( | ||
translate_script_dir, "../formatted_data/translated_words.json" | ||
) | ||
if os.path.exists(translated_words_path): | ||
with open(translated_words_path, "r", encoding="utf-8") as file: | ||
translations = json.load(file) | ||
|
||
if __name__ == "__main__": | ||
translate_words("words_to_translate.json") | ||
translate_to_other_languages( | ||
source_language=SRC_LANG, | ||
word_list=word_list, | ||
translations=translations, | ||
batch_size=100, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.