From 4de3692281196063230a9211a94fb88de2e86112 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:31:47 +0100 Subject: [PATCH] update initialise nltk functions --- src/modules/analysis.py | 2 +- src/modules/preprocessing.py | 39 +++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/modules/analysis.py b/src/modules/analysis.py index 6ada289..29fac77 100644 --- a/src/modules/analysis.py +++ b/src/modules/analysis.py @@ -82,7 +82,7 @@ def get_total_feature_count(features: DataFrame) -> DataFrame: return total_feature_count -def retrieve_named_entities(series: Series) -> list[list[str]]: +def retrieve_named_entities(series: Series) -> list: """retrieve any named entities from the series Parameters ---------- diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py index cac9625..943d35a 100644 --- a/src/modules/preprocessing.py +++ b/src/modules/preprocessing.py @@ -196,7 +196,40 @@ def lemmatizer(tokens: list) -> list: def _initialise_nltk_component(extension: str, download_object: str): - """download nltk component from package + """splitter function to determine which initialisation path to run + Parameters + ---------- + extension: str + the filepath extension leading to where the model is saved + download_object: str + the object to download from nltk + Returns + ------- + None + """ + if sys.platform.startswith("linux"): + _initialise_nltk_linux(download_object) + else: + _initialise_nltk_windows(extension, download_object) + + +def _initialise_nltk_linux(download_object: str) -> None: + """initialise nltk component for linux environment (for github actions) + Parameters + ---------- + download_object: str + nltk object to download + Returns + ------- + None + """ + nltk.download(download_object) + nltk.data.path.append("../home/runner/nltk_data") + return None + + +def _initialise_nltk_windows(extension: str, download_object: str): + """initialise nltk component for a windows 
environment Parameters ---------- extension: str @@ -211,10 +244,6 @@ def _initialise_nltk_component(extension: str, download_object: str): path = "C:/Users/" + username + "/AppData/Roaming/nltk_data/" + extension if not os.path.exists(path): nltk.download(download_object) - # Set path for runs on github actions - if sys.platform.startswith("linux"): - nltk.data.path.append("../home/runner/nltk_data") - else: nltk.data.path.append("../local_packages/nltk_data") return None