Skip to content

Commit

Permalink
Merge pull request #263 from SethiShreya/cli
Browse files Browse the repository at this point in the history
Fixed custom user directory issue on get method issue #260
  • Loading branch information
andrewtavis authored Oct 7, 2024
2 parents b7a0f82 + 3613d2f commit bf187a4
Show file tree
Hide file tree
Showing 38 changed files with 551 additions and 203 deletions.
20 changes: 12 additions & 8 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# OS Files
##########
# MARK: OS Files

.DS_Store
.vscode/*
!.vscode/extensions.json
Expand All @@ -8,8 +8,8 @@
*wiki_partitions
*wiki.ndjson

# Python Files
##############
# MARK: Python Files

# setup.py working directory
build
# setup.py dist directory
Expand All @@ -24,11 +24,15 @@ __pycache__
venv
.venv

# NPM Files
###########
# MARK: NPM Files

node_modules
package-lock.json

# Intermerdiary Data Files
##########################
# MARK: Intermediary Files

**/*_queried.json

# MARK: Test Files

tests_output
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Emojis for the following are chosen based on [gitmoji](https://gitmoji.dev/).
- Many files were renamed including `update_data.py` being renamed `query_data.py`
- Paths within the package have been updated to work for all operating systems via `pathlib` ([#125](https://github.com/scribe-org/Scribe-Data/issues/125)).
- The language formatting scripts have been dramatically simplified given changes to export paths all being the same.
- The `update_files` directory was removed in preparation for other means of showing data totals.

## Scribe-Data 3.3.0

Expand Down
2 changes: 1 addition & 1 deletion docs/source/scribe_data/cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ Behavior and Output:

.. code-block:: text
Updating data for language: English, data type: ['verbs']
Updating data for language(s): English; data type(s): verbs
Data updated: 0%|
2. If existing files are found, you'll be prompted to choose an option:
Expand Down
138 changes: 132 additions & 6 deletions src/scribe_data/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import json
import shutil
from pathlib import Path
from typing import Optional

from scribe_data.cli.cli_utils import language_map
from scribe_data.load.data_to_sqlite import data_to_sqlite
Expand All @@ -33,10 +32,33 @@
get_language_iso,
)

# MARK: JSON


def export_json(
language: str, data_type: str, output_dir: Path, overwrite: bool
) -> None:
"""
Export a JSON file from the CLI process.
Parameters
----------
language : str
The language of the file to convert.
data_type : str
The data type of the file to convert.
output_dir : Path
The output directory path for results.
overwrite : bool
Whether to overwrite existing files.
Returns
-------
A JSON file saved in the given location.
"""
normalized_language = language_map.get(language.lower())

if not normalized_language:
Expand Down Expand Up @@ -85,9 +107,40 @@ def export_json(
)


# MARK: CSV or TSV


def convert_to_csv_or_tsv(
language: str, data_type: list, output_dir: Path, overwrite: bool, output_type: str
language: str,
data_type: list,
output_dir: Path,
overwrite: bool,
output_type: str,
) -> None:
"""
Converts a Scribe-Data output file to a CSV or TSV file.
Parameters
----------
output_type : str
The file type to convert to (CSV or TSV).
language : str
The language of the file to convert.
data_type : list
The data type(s) of the file(s) to convert.
output_dir : Path
The output directory path for results.
overwrite : bool
Whether to overwrite existing files.
Returns
-------
A CSV or TSV file saved in the given location.
"""
normalized_language = language_map.get(language.lower())
if not normalized_language:
print(f"Language '{language}' is not recognized.")
Expand Down Expand Up @@ -154,12 +207,36 @@ def convert_to_csv_or_tsv(
print(f"Data for '{dtype}' written to '{output_file}'")


# MARK: SQLITE


def convert_to_sqlite(
language: Optional[str] = None,
data_type: Optional[str] = None,
output_dir: Optional[str] = None,
overwrite: bool = False,
language: str,
data_type: str,
output_dir: Path,
overwrite: bool,
) -> None:
"""
Converts a Scribe-Data output file to an SQLite file.
Parameters
----------
language : str
The language of the file to convert.
data_type : str
The data type of the file to convert.
output_dir : Path
The output directory path for results.
overwrite : bool
Whether to overwrite existing files.
Returns
-------
A SQLite file saved in the given location.
"""
if not language:
raise ValueError("Language must be specified for SQLite conversion.")

Expand Down Expand Up @@ -191,3 +268,52 @@ def convert_to_sqlite(

else:
print("No output directory specified. SQLite file remains in default location.")


# MARK: Convert


def convert(
    language: str, data_type: str, output_dir: str, overwrite: bool, output_type: str
) -> None:
    """
    Converts a Scribe-Data output file to a different file type.

    Dispatches to ``export_json`` for JSON output and to
    ``convert_to_csv_or_tsv`` for CSV or TSV output.

    Parameters
    ----------
        language : str
            The language of the file to convert.

        data_type : str
            The data type of the file to convert.

        output_dir : str
            The output directory path for results. If given, it is resolved to
            an absolute path and created (including parents) when missing.

        overwrite : bool
            Whether to overwrite existing files.

        output_type : str
            The file type to convert to ('json', 'csv' or 'tsv').
            ``None`` is treated as 'json'.

    Returns
    -------
        A file of the requested type saved in the given location.

    Raises
    ------
        ValueError
            If ``output_type`` is not 'json', 'csv', 'tsv' or ``None``.
    """
    if output_dir:
        output_dir = Path(output_dir).resolve()
        # exist_ok=True already covers the "directory exists" case, so a
        # separate exists() check is redundant and race-prone.
        output_dir.mkdir(parents=True, exist_ok=True)

    if output_type == "json" or output_type is None:
        export_json(language, data_type, output_dir, overwrite)

    elif output_type in {"csv", "tsv"}:
        convert_to_csv_or_tsv(
            language, data_type, output_dir, overwrite, output_type
        )

    else:
        raise ValueError(
            "Unsupported output type. Please use 'json', 'csv', or 'tsv'."
        )
106 changes: 72 additions & 34 deletions src/scribe_data/cli/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,37 +22,82 @@

import subprocess
from pathlib import Path
from typing import Optional

from scribe_data.cli.convert import convert_to_csv_or_tsv, export_json
from scribe_data.utils import DEFAULT_JSON_EXPORT_DIR
from scribe_data.utils import (
DEFAULT_CSV_EXPORT_DIR,
DEFAULT_JSON_EXPORT_DIR,
DEFAULT_SQLITE_EXPORT_DIR,
DEFAULT_TSV_EXPORT_DIR,
)
from scribe_data.wikidata.query_data import query_data

DATA_DIR = Path(DEFAULT_JSON_EXPORT_DIR)


def get_data(
language: Optional[str] = None,
data_type: Optional[str] = None,
output_dir: Optional[str] = None,
language: str = None,
data_type: str = None,
output_type: str = None,
output_dir: str = None,
overwrite: bool = False,
output_type: Optional[str] = None,
outputs_per_entry: int = None,
all: bool = False,
) -> None:
"""
Function for controlling the data get process for the CLI.
Parameters
----------
language : str
The language(s) to get.
data_type : str
The data type(s) to get.
output_type : str
The output file type.
output_dir : str
The output directory path for results.
outputs_per_entry : int
How many outputs should be generated per data entry.
overwrite : bool
Whether to overwrite existing files (default: False).
all : bool
Get all languages and data types.
Returns
-------
The requested data saved locally given file type and location arguments.
"""
# MARK: Defaults

output_type = output_type or "json"
if output_dir is None:
if output_type == "csv":
output_dir = DEFAULT_CSV_EXPORT_DIR
elif output_type == "json":
output_dir = DEFAULT_JSON_EXPORT_DIR
elif output_type == "sqlite":
output_dir = DEFAULT_SQLITE_EXPORT_DIR
elif output_type == "tsv":
output_dir = DEFAULT_TSV_EXPORT_DIR

languages = [language] if language else None

subprocess_result = False

# MARK: Get All

if all:
print("Updating all languages and data types ...")
query_data(None, None, overwrite)
subprocess_result = True

elif data_type in ["emoji-keywords", "emoji_keywords"]:
# MARK: Emojis

elif data_type in {"emoji-keywords", "emoji_keywords"}:
for lang in languages:
emoji_keyword_extraction_script = (
Path(__file__).parent.parent
Expand All @@ -66,6 +111,8 @@ def get_data(
["python", emoji_keyword_extraction_script]
)

# MARK: Translations

elif data_type == "translations":
for lang in languages:
translation_generation_script = (
Expand All @@ -80,47 +127,38 @@ def get_data(
["python", translation_generation_script]
)

# MARK: Query Data

elif language or data_type:
data_type = data_type[0] if isinstance(data_type, list) else data_type

data_type = [data_type] if data_type else None
print(f"Updating data for language: {language}, data type: {data_type}")
query_data(languages, data_type, overwrite)
print(
f"Updating data for language(s): {language}; data type(s): {', '.join(data_type)}"
)
query_data(
languages=languages,
data_type=data_type,
output_dir=output_dir,
overwrite=overwrite,
)
subprocess_result = True

else:
raise ValueError(
"You must provide at least one of the --language (-l) or --data-type (-dt) options, or use --all (-a)."
)

if output_dir:
output_dir = Path(output_dir).resolve()
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)

if output_type == "json" or output_type is None:
export_json(language, data_type, output_dir, overwrite)

elif output_type in ["csv", "tsv"]:
convert_to_csv_or_tsv(
language, data_type, output_dir, overwrite, output_type
)

else:
raise ValueError(
"Unsupported output type. Please use 'json', 'csv', or 'tsv'."
)

elif (
if (
isinstance(subprocess_result, subprocess.CompletedProcess)
and subprocess_result.returncode != 1
) or (isinstance(subprocess_result, bool) and subprocess_result is not False):
print(
"No output directory specified for exporting results.",
f"Updated data was saved in: {Path(DEFAULT_JSON_EXPORT_DIR).resolve()}.",
f"Updated data was saved in: {Path(output_dir).resolve()}.",
)

elif data_type in ["emoji-keywords", "emoji_keywords"]:
# The emoji keywords process has failed.
elif data_type in {"emoji-keywords", "emoji_keywords"}:
print(
"\nThe Scribe-Data emoji functionality is powered by PyICU, which is currently not installed."
)
Expand Down
Loading

0 comments on commit bf187a4

Please sign in to comment.