Skip to content

Commit

Permalink
#39 #44 #23 formatting of files and adding words to translate
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Sep 8, 2023
1 parent a490dd9 commit da6b5c4
Show file tree
Hide file tree
Showing 49 changed files with 1,513,091 additions and 71,292 deletions.
24 changes: 12 additions & 12 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@ channels:
- defaults
dependencies:
- python=3.9
- black>=19.10b0
- beautifulsoup4>=4.12.0
- defusedxml=0.7.1
- pandas>=1.5.3
- pytest-cov>=3.0.0
- sentencepiece>=0.1.95
- tabulate>=0.8.9
- transformers>=4.12
- tqdm=4.59.0
- pandas>=2.0.3
- pytest-cov>=4.0.0
- sentencepiece>=0.1.99
- tabulate>=0.8.10
- transformers>=4.24.0
- tqdm=4.65.0
- pip:
- beautifulsoup4==4.9.3
- emoji>=2.2.0
- mwparserfromhell>=0.6
- black>=23.7.0
- emoji>=2.8.0
- mwparserfromhell>=0.6.5
- PyICU>=2.10.2 # Make sure to fulfill PyICU dependencies, see https://gitlab.pyicu.org/main/pyicu#installing-pyicu
- python-dateutil>=2.8.2
- regex>=2023.3.23
- regex>=2023.8.8
- SPARQLWrapper>=2.0.0
- tensorflow>=2.5.1
- tensorflow>=2.11.0
Empty file.
39 changes: 5 additions & 34 deletions src/scribe_data/extract_transform/English/nouns/format_nouns.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
Formats the nouns queried from Wikidata using query_nouns.sparql.
"""

# pylint: disable=invalid-name

import collections
import json
import os
Expand All @@ -17,7 +15,7 @@
PATH_TO_SCRIBE_DATA_SRC = f"{PATH_TO_SCRIBE_ORG}Scribe-Data/src"
sys.path.insert(0, PATH_TO_SCRIBE_DATA_SRC)

from scribe_data.load.update_utils import get_path_from_et_dir
from scribe_data.utils import get_path_from_et_dir

file_path = sys.argv[0]

Expand All @@ -30,24 +28,6 @@
with open(f"./{LANGUAGE}/nouns/nouns_queried.json", encoding="utf-8") as f:
nouns_list = json.load(f)


def order_annotations(annotation):
    """
    Normalize a "/"-separated annotation so that its parts are unique and sorted.

    Parameters
    ----------
    annotation : str
        The annotation to be returned to the user in the command bar.

    Returns
    -------
    str
        The annotation unchanged when it is one of the stand-alone
        annotations; otherwise its non-empty, de-duplicated parts joined
        by "/" in sorted order.
    """
    # Gender annotations added here when genders are added.
    standalone = ("PL",)
    if annotation in standalone:
        return annotation

    # Drop empty fragments (from leading, trailing or doubled slashes) and
    # collapse repeated parts before ordering them.
    unique_parts = {part for part in annotation.split("/") if part}

    return "/".join(sorted(unique_parts))

nouns_formatted = {}

for noun_vals in nouns_list:
Expand All @@ -74,15 +54,7 @@ def order_annotations(annotation):
}
else:
# Mark plural as a possible form if it isn't already.
if (
"PL" not in nouns_formatted[noun_vals["plural"]]["form"]
and nouns_formatted[noun_vals["plural"]]["form"] != ""
):
nouns_formatted[noun_vals["plural"]]["form"] = (
nouns_formatted[noun_vals["plural"]]["form"] + "/PL"
)

elif nouns_formatted[noun_vals["plural"]]["form"] == "":
if nouns_formatted[noun_vals["plural"]]["form"] == "":
nouns_formatted[noun_vals["plural"]]["form"] = "PL"

# Assign itself as a plural if possible (maybe wasn't for prior versions).
Expand All @@ -95,12 +67,14 @@ def order_annotations(annotation):
"plural": "",
"form": "",
}

elif "plural" in noun_vals.keys():
if noun_vals["plural"] not in nouns_formatted:
nouns_formatted[noun_vals["plural"]] = {
"plural": "isPlural",
"form": "PL",
}

else:
# Mark plural as a possible form if it isn't already.
if (
Expand All @@ -114,9 +88,6 @@ def order_annotations(annotation):
elif nouns_formatted[noun_vals["plural"]]["form"] == "":
nouns_formatted[noun_vals["plural"]]["form"] = "PL"

for k in nouns_formatted:
nouns_formatted[k]["form"] = order_annotations(nouns_formatted[k]["form"])

nouns_formatted = collections.OrderedDict(sorted(nouns_formatted.items()))

org_path = get_path_from_et_dir()
Expand All @@ -131,4 +102,4 @@ def order_annotations(annotation):
with open(export_path, "w", encoding="utf-8",) as file:
json.dump(nouns_formatted, file, ensure_ascii=False, indent=0)

print(f"Wrote file nouns.json with {len(nouns_formatted)} nouns.")
print(f"Wrote file nouns.json with {len(nouns_formatted)} nouns.")
Empty file.
Loading

0 comments on commit da6b5c4

Please sign in to comment.