Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix more bugs in CLI - Interactive Learning module #204

Merged
merged 8 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 20 additions & 16 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,27 +29,26 @@
from scribe_data.cli.cli_utils import language_map
from scribe_data.load.data_to_sqlite import data_to_sqlite
from scribe_data.utils import (
DEFAULT_JSON_EXPORT_DIR,
DEFAULT_SQLITE_EXPORT_DIR,
get_language_iso,
)

DATA_DIR = Path(DEFAULT_JSON_EXPORT_DIR)


def export_json(
language: str, data_type: str, output_dir: Path, overwrite: bool
) -> None:
normalized_language = language_map.get(language.lower())
language_capitalized = language.capitalize()

if not normalized_language:
raise ValueError(f"Language '{language_capitalized}' is not recognized.")
raise ValueError(f"Language '{language.capitalize()}' is not recognized.")

data_type = data_type[0] if isinstance(data_type, list) else data_type
data_file = (
DATA_DIR / normalized_language["language"].capitalize() / f"{data_type}.json"
output_dir / normalized_language["language"].capitalize() / f"{data_type}.json"
)

print(data_file)

if not data_file.exists():
print(
f"No data found for language '{normalized_language['language']}' and data type '{data_type}'."
Expand All @@ -64,11 +63,7 @@ def export_json(
print(f"Error reading '{data_file}': {e}")
return

json_output_dir = (
output_dir
/ DEFAULT_JSON_EXPORT_DIR
/ normalized_language["language"].capitalize()
)
json_output_dir = output_dir / normalized_language["language"].capitalize()
json_output_dir.mkdir(parents=True, exist_ok=True)

output_file = json_output_dir / f"{data_type}.json"
Expand All @@ -80,12 +75,13 @@ def export_json(

try:
with output_file.open("w") as file:
json.dump(data, file, indent=2)
json.dump(data, file, indent=0)

except IOError as e:
raise IOError(f"Error writing to '{output_file}': {e}") from e

print(
f"Data for language '{normalized_language['language']}' and data type '{data_type}' written to '{output_file}'"
f"Data for {normalized_language['language'].capitalize()} {data_type} written to {output_file}"
)


Expand All @@ -98,12 +94,20 @@ def convert_to_csv_or_tsv(
return

for dtype in data_type:
# Swap the CSV/TSV default export directory for the JSON one, since that is where the exported data lives.
file_path = (
DATA_DIR / normalized_language["language"].capitalize() / f"{dtype}.json"
Path(
str(output_dir)
.replace("scribe_data_csv_export", "scribe_data_json_export")
.replace("scribe_data_tsv_export", "scribe_data_json_export")
)
/ normalized_language["language"].capitalize()
/ f"{dtype}.json"
)
if not file_path.exists():
print(f"No data found for {dtype} conversion at '{file_path}'.")
continue
raise FileNotFoundError(
f"No data found for {dtype} conversion at '{file_path}'."
)

try:
with file_path.open("r") as f:
Expand Down
14 changes: 5 additions & 9 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,11 @@ def get_data(
"""
Function for controlling the data get process for the CLI.
"""
if not outputs_per_entry and (data_type in ["emoji-keywords", "emoji_keywords"]):
print(
"\nNo value set for 'outputs-per-entry'. Setting a default value of 3 outputs per entry.\n"
)
outputs_per_entry = 3

languages = [language] if language else None

if all:
print("Updating all languages and data types ...")
query_data()
query_data(None, None, overwrite)

elif data_type in ["emoji-keywords", "emoji_keywords"]:
for lang in languages:
Expand All @@ -80,17 +74,19 @@ def get_data(
os.system(f"python3 {translation_generation_script}")

elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type

data_type = [data_type] if data_type else None
print(f"Updating data for language: {language}, data type: {data_type}")
query_data(languages, data_type)
query_data(languages, data_type, overwrite)

else:
raise ValueError(
"You must provide either at least one of the --language (-l) or --data-type (-dt) options, or use --all (-a)."
)

if output_dir:
output_dir = Path(output_dir)
output_dir = Path(output_dir).resolve()
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)

Expand Down
10 changes: 0 additions & 10 deletions src/scribe_data/cli/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,16 +164,6 @@ def run_interactive_mode():
selected_data_types = select_data_types()
output_options = get_output_options()

if len(selected_languages) == 1:
print(
f"\nGetting {', '.join(selected_data_types)} for {', '.join(selected_languages)}."
)

else:
print(
f"\nQuerying {', '.join(selected_data_types)} for {', '.join(selected_languages)} languages."
)

print(
f"Data will be exported as {output_options['type'].upper()} files to '{output_options['dir']}'."
)
Expand Down
4 changes: 2 additions & 2 deletions src/scribe_data/translation/translation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def translation_interrupt_handler(source_language, translations):
"w",
encoding="utf-8",
) as file:
json.dump(translations, file, ensure_ascii=False, indent=4)
json.dump(translations, file, ensure_ascii=False, indent=0)

print("The current progress is saved to the translations.json file.")
exit()
Expand Down Expand Up @@ -238,7 +238,7 @@ def translate_to_other_languages(
"w",
encoding="utf-8",
) as file:
file.write(json.dumps(translations, ensure_ascii=False, indent=2))
file.write(json.dumps(translations, ensure_ascii=False, indent=0))
file.write("\n")

print(
Expand Down
4 changes: 3 additions & 1 deletion src/scribe_data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ def load_queried_data(language: str, data_type: str) -> tuple[Any, bool, str]:
return json.load(f), data_path


def export_formatted_data(formatted_data: dict, language: str, data_type: str) -> None:
def export_formatted_data(
formatted_data: dict, language: str, data_type: str, query_data_in_use: bool = False
) -> None:
"""
Exports formatted data to a JSON file for a specific language and data type.

Expand Down
54 changes: 27 additions & 27 deletions src/scribe_data/wikidata/query_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from scribe_data.wikidata.wikidata_utils import sparql


def query_data(languages=None, word_types=None):
def query_data(languages=None, word_types=None, overwrite=None):
SCRIBE_DATA_SRC_PATH = Path(__file__).parent.parent
PATH_TO_LANGUAGE_EXTRACTION_FILES = (
SCRIBE_DATA_SRC_PATH / "language_data_extraction"
Expand Down Expand Up @@ -69,8 +69,6 @@ def query_data(languages=None, word_types=None):
]
queries_to_run = sorted(queries_to_run)

print(queries_to_run)

# Run queries and format data.
for q in tqdm(
queries_to_run,
Expand All @@ -87,33 +85,35 @@ def query_data(languages=None, word_types=None):
file_name = f"{target_type}.json"

if existing_files := list(export_dir.glob(f"{target_type}*.json")):
print(
f"Existing file(s) found for {lang} {target_type} in the outputs directory:\n"
)
for i, file in enumerate(existing_files, 1):
print(f"{i}. {file.name}")

# choice = input(
# "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: "
# )
choice = input(
"\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: "
)

print(f"You entered: {choice}")

if choice in ["o", "O"]:
print("Removing existing files...")
if overwrite:
print("Overwrite is enabled. Removing existing files...")
for file in existing_files:
file.unlink()

# elif choice in ["k", "K"]:
# timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# file_name = f"{target_type}_{timestamp}.json"

else:
print(f"Skipping update for {lang} {target_type}.")
continue
print(
f"\nExisting file(s) found for {lang} {target_type} in the outputs directory:\n"
)
for i, file in enumerate(existing_files, 1):
print(f"{i}. {file.name}")
# choice = input(
# "\nChoose an option:\n1. Overwrite existing (press 'o')\n2. Keep all (press 'k')\n3. Skip process (press anything else)\nEnter your choice: "
# )
choice = input(
"\nChoose an option:\n1. Overwrite existing data (press 'o')\n2. Skip process (press anything else)\nEnter your choice: "
)

print(f"You entered: {choice}")

if choice.lower() == "o":
print("Removing existing files...")
for file in existing_files:
file.unlink()
# elif choice in ["k", "K"]:
# timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# file_name = f"{target_type}_{timestamp}.json"
else:
print(f"Skipping update for {lang} {target_type}.")
continue

file_path = export_dir / file_name
print(f"Querying and formatting {lang} {target_type}")
Expand Down
6 changes: 3 additions & 3 deletions tests/cli/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def test_get_command(
self, mock_system, mock_convert, mock_export_json, mock_query_data
):
expected_calls = [
call(["English"], ["nouns"]),
call(["English"], ["nouns"]),
call(),
call(["English"], ["nouns"], False),
call(["English"], ["nouns"], False),
call(None, None, False),
]

# Execute the test
Expand Down
Loading