diff --git a/.github/workflows/CodeCov.yml b/.github/workflows/CodeCov.yml index d7bb499..20b0445 100644 --- a/.github/workflows/CodeCov.yml +++ b/.github/workflows/CodeCov.yml @@ -20,14 +20,21 @@ jobs: python-version: 3.9 cache: 'pip' # caching pip dependencies - - name: Generate Report + - name: Install packages run: | pip install --upgrade pip - pip install -r requirements.txt + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz + python -m nltk.downloader punkt stopwords pip install coverage pip install coverage[toml] + + - name: Run Unit Tests + run: | coverage run -m pytest + + - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 with: diff --git a/requirements.txt b/requirements.txt index 1a7dcac..4e4472c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,69 +1,14 @@ -arrow==1.2.3 -binaryornot==0.4.4 -certifi==2023.5.7 -cfgv==3.3.1 -chardet==5.1.0 -charset-normalizer==3.1.0 -click==8.1.3 -colorama==0.4.6 -contourpy==1.1.0 -cookiecutter==2.1.1 -cycler==0.11.0 -distlib==0.3.6 -docopt==0.6.2 -exceptiongroup==1.1.1 -filelock==3.12.2 -fonttools==4.40.0 -fuzz==0.1.1 -gitdb==4.0.10 -GitPython==3.1.31 -identify==2.5.24 -idna==3.4 -imageio==2.31.1 -inexactsearch==1.0.2 -iniconfig==2.0.0 -Jinja2==3.1.2 -jinja2-time==0.2.0 -joblib==1.2.0 -kiwisolver==1.4.4 -MarkupSafe==2.1.3 matplotlib==3.7.1 -mglearn==0.2.0 nltk==3.8.1 -nodeenv==1.8.0 numpy==1.25.0 -packaging==23.1 pandas==2.0.2 -Pillow==9.5.0 -pipreqs==0.4.13 -platformdirs==3.5.3 -pluggy==1.1.0 -pre-commit==3.3.3 -pyparsing==3.1.0 -pyspellchecker==0.7.2 pytest==7.3.2 -python-dateutil==2.8.2 -python-slugify==8.0.1 -pytz==2023.3 +PyYAML==6.0 PyYAML==6.0 rapidfuzz==3.1.1 -regex==2023.6.3 -requests==2.31.0 -scikit-learn==1.2.2 -scipy==1.10.1 -silpa-common==0.3 -six==1.16.0 -smmap==5.0.0 -soundex==1.1.3 -text-unidecode==1.3 +scikit_learn==1.2.2 +scipy==1.11.1 +setuptools==67.6.1 +spacy==3.6.0 textblob==0.17.1 -threadpoolctl==3.1.0 -tomli==2.0.1 -tqdm==4.65.0 -typer==0.9.0 -typing_extensions==4.6.3 -tzdata==2023.3 -urllib3==2.0.3 -virtualenv==20.23.0 wordcloud==1.9.2 -yarg==0.1.9 diff --git a/src/config.yaml b/src/config.yaml index 99e73c2..5a40e02 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -1,8 +1,24 @@ -raw_data_path: "data/raw/2023_consultation_mock_data.csv" -business_terminology: - - 'dpm' - - 'admin' - - 'timeliness' -additional_stopwords: - - "census" - - "data" +raw_data_path: "data/raw/20230711_consultation_ingest.csv" #str +buisness_terminology: # dictionary of words to update spelling with associated weight + dpm: 1 #int + admin: 1 #int + timeliness: 1 #int + year: 450 #int +additional_stopwords: #list of words to filter; must be type str + - "census" #str + - "data" #str + - "personal" #str + - "use" #str +lemmatize: True #bool; select False to use Stemmer +feature_count: #dict + ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1) + min_df: 2 #float (proportion) or int (count) + max_df: 0.95 #float (proportion) or int (count) + max_features: null #null converts to None, or int value + lowercase: True #whether to convert all words to lowercase +lda: #dict + n_topics: 5 #int + n_top_words: 10 #int + max_iter: 25 #int + title: "Topic Summary" #str + topic_labels: null # also takes a list of strings (see additional stopwords ^) diff --git a/src/processing/__init__.py b/src/modules/__init__.py similarity index 100% rename from 
src/processing/__init__.py rename to src/modules/__init__.py diff --git a/src/modules/analysis.py b/src/modules/analysis.py new file mode 100644 index 0000000..558cc8b --- /dev/null +++ b/src/modules/analysis.py @@ -0,0 +1,131 @@ +import typing + +import spacy +from numpy.typing import ArrayLike +from pandas import DataFrame, Series +from scipy.sparse._csr import csr_matrix +from sklearn.decomposition import LatentDirichletAllocation +from sklearn.feature_extraction.text import CountVectorizer + + +def extract_feature_count( + series: Series, + max_features: int = None, + ngram_range: tuple[float, float] = (1, 1), + stop_words: ArrayLike = None, + lowercase: bool = True, + min_df=1, + max_df=1.0, +) -> typing.Tuple[CountVectorizer, DataFrame]: + """create a text feature count dataframe from series + Paramaters + ---------- + series: Series + Series of text strings + max_features: int, default = None + If not None, build a vocabulary that only consider the top max_features + ordered by term frequency across the corpus. Otherwise, all features are used. + ngram_range: tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different word n-grams + or char n-grams to be extracted. All values of n such such that + min_n <= n <= max_n will be used. + stop_words: list, default=None + list of stopwords to remove from text strings + lowercase: bool, default = True + convert all characters to lowercase before tokenizing + min_df: float or int, default = 1 + When building the vocabulary ignore terms that have a document frequency + strictly lower than the given threshold. This value is also called cut-off + in the literature. If float, the parameter represents a proportion of + documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. + max_df: float or int, default = 1.0 + When building the vocabulary ignore terms that have a document frequency + strictly higher than the given threshold (corpus-specific stop words). + If float, the parameter represents a proportion of documents, integer + absolute counts. This parameter is ignored if vocabulary is not None. 
+ Returns + ------- + DataFrame + A dataframe of text feature counts, displaying the number of times a word + appears in each element of the input series + """ + + vectorizer = CountVectorizer( + max_features=max_features, + ngram_range=ngram_range, + stop_words=stop_words, + lowercase=lowercase, + min_df=min_df, + max_df=max_df, + ) + + fitted_vector = vectorizer.fit_transform(series) + + word_count_df = DataFrame( + fitted_vector.toarray(), columns=vectorizer.get_feature_names_out() + ) + return (fitted_vector, word_count_df) + + +def get_total_feature_count(features: DataFrame) -> DataFrame: + """sum across features to get total number of times word was used + Parameters + ---------- + features: DataFrame + A dataframe of the features with each row corrosponding to a deconstructed + string + Returns + ------- + DataFrame + A dataframe of the total number of times each word is used across all + strings""" + total_feature_count = DataFrame() + for column in features.columns: + total_feature_count[column] = [features[column].sum()] + return total_feature_count + + +def retrieve_named_entities(series: Series) -> list: + """retrieve any named entities from the series + Parameters + ---------- + series:Series + A series of text strings to analyse for named entities + Returns + ------- + list[list[str]] + a list of lists containing strings for each named entitity""" + nlp = spacy.load("en_core_web_sm") + entities = [] + for doc in nlp.pipe(series): + entities.append([str(ent) for ent in doc.ents]) + return entities + + +def latent_dirichlet_allocation( + n_topics: int, max_iter: int, fitted_vector: csr_matrix +) -> LatentDirichletAllocation: + """fit latent direchlet allocation model on fitted vector + Parameters + ---------- + n_topics:int + number of components to include in model + max_iter: int + maximum number of passes over the training data + fitted_vector:csr_matrix + fitted vector from CountVectorizer + Returns + ------- + LatentDirichletAllocation + fitted lda model + """ + lda = LatentDirichletAllocation( + n_components=n_topics, + learning_method="batch", + max_iter=max_iter, + random_state=179, + ) + + lda.fit(fitted_vector) + return lda diff --git a/src/processing/preprocessing.py b/src/modules/preprocessing.py similarity index 54% rename from src/processing/preprocessing.py rename to src/modules/preprocessing.py index b366011..943d35a 100644 --- a/src/processing/preprocessing.py +++ b/src/modules/preprocessing.py @@ -1,6 +1,7 @@ import os import re import string +import sys import nltk import numpy as np @@ -8,8 +9,7 @@ import yaml from nltk.corpus import stopwords as sw from nltk.stem import PorterStemmer, WordNetLemmatizer -from pandas.core.series import Series -from rapidfuzz.fuzz import ratio +from pandas import Series def load_config(filepath: str) -> dict: @@ -30,7 +30,7 @@ def load_config(filepath: str) -> dict: raise TypeError("filepath must be a string") with open(filepath, "r") as file: - config = yaml.safe_load(file) + config = yaml.load(file, Loader=yaml.Loader) return config @@ -69,7 +69,24 @@ def _replace_blanks(series: Series) -> Series: return blanks_replaced -def correct_spelling(string: str, additional_words: list = []) -> str: +def spellcorrect_series(series: Series, additional_words: dict = {}) -> Series: + """fix spelling across series using the norvig spell-correct method + Parameters + ---------- + series: Series + the series of text strings you want to pass your spell checker on + additional_words:dict + a dictionary of words and weights for each word + 
Returns + ------- + Series + a series with words spelling corrected""" + tb.en.spelling = _update_spelling_words(additional_words) + corrected_series = series.apply(lambda str: _correct_spelling(str)) + return corrected_series + + +def _correct_spelling(string: str) -> str: """correct spelling using norvig spell-correct method (it has around 70% accuracy) Parameters @@ -80,46 +97,35 @@ def correct_spelling(string: str, additional_words: list = []) -> str: ------- str string with the spelling fixed""" - _update_spelling_words(additional_words) spelling_fixed = str(tb.TextBlob(string).correct()) return spelling_fixed -def _update_spelling_words(additional_words: list) -> None: +def _update_spelling_words(additional_words: dict) -> None: """update word in the textblob library with commonly used business word Parameters ---------- - additional_words:list - words to add to the textblob dictionary + additional_words:dict + words to add to the textblob dictionary, with associated weights. + higher weights give greater precedence to the weighted word. Returns ------- - None + dict + a dictionary of words and updated weights """ - for word in additional_words: - tb.en.spelling.update({word: 1}) - tb.en.spelling - return None + for word, weight in additional_words.items(): + tb.en.spelling.update({word: weight}) + return tb.en.spelling -def fuzzy_compare_ratio(base: Series, comparison: Series) -> Series: - """compare the base series to the comparison series to get - a similarity ratio between strings in the same column - Parameters - ---------- - base: Series - the base series for comparison - comparison: Series - the series you want to compare against - Returns - ------- - Series - a series of ratios (type:float) with scores closer to 100 - indicating complete match""" - fuzzy_ratio = Series(map(ratio, base, comparison)) - return fuzzy_ratio +def remove_punctuation(series: Series) -> Series: + """Remove punctuation from series of strings""" + _initialise_nltk_component("tokenizers/punkt", "punkt") + punct_removed = series.apply(_remove_punctuation_string) + return punct_removed -def remove_punctuation(text: str) -> str: +def _remove_punctuation_string(text: str) -> str: """Remove punctuation from string Parameters @@ -136,6 +142,21 @@ def remove_punctuation(text: str) -> str: return new_text +def shorten_tokens(word_tokens: list, lemmatize: bool = True) -> list: + """Shorten tokens to root words + Parameters + ---------- + word_tokens:list + list of word tokens to shorten + lemmatize: bool, default = True + whether to use lemmatizer or revert back to False (stemmer)""" + if lemmatize: + short_tokens = word_tokens.apply(lemmatizer) + else: + short_tokens = word_tokens.apply(stemmer) + return short_tokens + + def stemmer(tokens: list) -> list: """Stem works to their root form (e.g. 
flying -> fli, Beautiful -> Beauti) @@ -168,11 +189,65 @@ def lemmatizer(tokens: list) -> list: lemmatized_tokens list of simplified word groupings """ + _initialise_nltk_component("corpora/wordnet.zip", "wordnet") lemmatizer = WordNetLemmatizer() lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens] return lemmatized_tokens +def _initialise_nltk_component(extension: str, download_object: str): + """spliter function to determine which initialisation path to run + Parameters + ---------- + extension: str + the filepath extension leading to where the model is saved + download_object: str + the object to download from nltk + Returns + ------- + None + """ + if sys.platform.startswith("linux"): + _initialise_nltk_linux(download_object) + else: + _initialise_nltk_windows(extension, download_object) + + +def _initialise_nltk_linux(download_object: str) -> None: + """initialise nltk component for linux environment (for github actions) + Parameters + ---------- + download_object: str + nltk object to download + Returns + ------- + None + """ + nltk.download(download_object) + nltk.data.path.append("../home/runner/nltk_data") + return None + + +def _initialise_nltk_windows(extension: str, download_object: str): + """initialise nltk component for a windows environment + Parameters + ---------- + extension: str + the filepath extension leading to where the model is saved + download_object: str + the object to download from nltk + Returns + ------- + None + """ + username = os.getenv("username") + path = "C:/Users/" + username + "/AppData/Roaming/nltk_data/" + extension + if not os.path.exists(path): + nltk.download(download_object) + nltk.data.path.append("../local_packages/nltk_data") + return None + + def remove_nltk_stopwords(tokens: list, additional_stopwords: list = []) -> list: """remove stopwords from series @@ -187,27 +262,27 @@ def remove_nltk_stopwords(tokens: list, additional_stopwords: list = []) -> list list token list without stopwords """ - stopwords = _initialise_nltk_stopwords() - updated_stopwords = _update_nltk_stopwords(stopwords, additional_stopwords) - without_stopwords = [item for item in tokens if item not in updated_stopwords] + stopwords = initialise_update_stopwords(additional_stopwords) + without_stopwords = [item for item in tokens if item not in stopwords] return without_stopwords -def _initialise_nltk_stopwords() -> list: - """fetch nltk stopwords from corpora - +def initialise_update_stopwords(additional_stopwords: list = None) -> list: + """initialise and update stopwords, ise this for efficient retrieval of + stopwords, rather than calling both functions. 
+ Parameters + ---------- + additional_stopwords:list + new words to add to the words to remove list Returns ------- list - list of nltk stopwords + a list of words to remove from corpus """ - username = os.getenv("username") - path = "c:/Users/" + username + "/AppData/Roaming/nltk_data/corpora/stopwords" - if not os.path.exists(path): - nltk.download("stopwords") - nltk.data.path.append("../local_packages/nltk_data") + _initialise_nltk_component("corpora/stopwords", "stopwords") stopwords = sw.words("english") - return stopwords + updated_stopwords = _update_nltk_stopwords(stopwords, additional_stopwords) + return updated_stopwords def _update_nltk_stopwords(stopwords: list, additional_stopwords: list): diff --git a/src/modules/quality_checks.py b/src/modules/quality_checks.py new file mode 100644 index 0000000..4909c36 --- /dev/null +++ b/src/modules/quality_checks.py @@ -0,0 +1,38 @@ +from pandas import Series +from rapidfuzz.fuzz import ratio + + +def fuzzy_compare_ratio(base: Series, comparison: Series) -> Series: + """compare the base series to the comparison series to get + a similarity ratio between strings in the same column + Parameters + ---------- + base: Series + the base series for comparison + comparison: Series + the series you want to compare against + Returns + ------- + Series + a series of ratios (type:float) with scores closer to 100 + indicating complete match""" + fuzzy_ratio = Series(map(ratio, base, comparison)) + return fuzzy_ratio + + +def print_row_by_row(base: Series, comparison: Series) -> None: + """print each pair of words row by row + Parameters + ---------- + base: Series + the base series for comparison + comparison: Series + the series you want to compare against + Returns + ------- + None + """ + for i in base.index: + print(base[i]) + print(comparison[i]) + return None diff --git a/src/modules/visualisation.py b/src/modules/visualisation.py new file mode 100644 index 0000000..ba7d66f --- /dev/null +++ b/src/modules/visualisation.py @@ -0,0 +1,181 @@ +import typing +from datetime import datetime as dt + +import matplotlib.pyplot as plt +from matplotlib.figure import Figure +from sklearn.decomposition import LatentDirichletAllocation +from wordcloud import WordCloud + + +def create_wordcloud(text: str, name: str = "wordcloud") -> None: + """generate a wordcloud with the given filename + Parameters + ---------- + text: str + text for wordcloud + filename: str + the name and path you want to save the wordcloud to + Returns: + None (message to console on location of file) + """ + wordcloud = WordCloud().generate(text) + figure = plt.figure(figsize=(5, 10)) + plt.imshow(wordcloud, interpolation="bilinear") + plt.axis("off") + save_figure(name, figure) + return None + + +def save_figure(name: str, fig: Figure) -> None: + """save figure with datestamp + Parameters + ---------- + name: str + name of the figure + fig + the figure object + Returns + ------- + None (message to console on location of file) + """ + datestamp = dt.strftime(dt.now(), "%Y%m%d") + filename = f"data/outputs/{datestamp}_{name}.jpeg" + fig.savefig(filename, bbox_inches="tight") + print(f"{name} plot saved as {filename}") + return None + + +def plot_top_words( + model: LatentDirichletAllocation, + feature_names: list, + n_topics: int, + title: str, + n_top_words: int = 10, + topic_labels: list = None, +) -> None: + """Plot topics by their most frequent words + Parameters + ---------- + model + the lda model components + feature_names:list + a list of the most frequent words (from bag of 
words model) + n_topics:int + number of topics to include in the chart + title:str + the title for the chart + n_top_words:int, (default = 10) + the number of top words to include in each topic plot + topic_labels:list, (default = None) + a list of labels to override the existing labels + Returns + ------- + None (message to console on location of file) + """ + topic_labels = _generate_topic_labels(n_topics, topic_labels) + labelled_components = dict(zip(topic_labels, model.components_)) + rows, columns = _get_n_columns_and_n_rows(n_topics) + fig, axes = plt.subplots( + rows, columns, figsize=_get_fig_size(columns, rows), sharex=True + ) + axes = axes.flatten() + for number, (topic_label, component) in enumerate(labelled_components.items()): + top_features_ind = component.argsort()[: -n_top_words - 1 : -1] + top_features = [feature_names[i] for i in top_features_ind] + weights = component[top_features_ind] + ax = axes[number] + ax.barh(top_features, weights, height=0.7) + ax.set_title(topic_label, fontdict={"fontsize": 30}) + ax.invert_yaxis() + ax.tick_params(axis="both", which="major", labelsize=20) + for i in "top right left".split(): + ax.spines[i].set_visible(False) + fig.suptitle(title, fontsize=40) + save_figure("lda_top_words", fig) + return None + + +def _generate_topic_labels(n_topics: int, topic_labels: list = None) -> list: + """Generate topic labels from n_topics + Parameters + ---------- + n_topics: int + number of topics + topic_labels:list (default=None) + list of topic_labels + Returns + ------- + list + list of topic labels + """ + if topic_labels is None: + topic_labels = [f"Topic_{n}" for n in range(1, n_topics)] + else: + if len(topic_labels) != n_topics: + raise AttributeError("len(topic_labels) does not equal n_topics") + return topic_labels + + +def _get_n_columns_and_n_rows(n_topics: int) -> int: + """calculate the optimal number of rows and columns for n_topics + Parameters + ---------- + n_topics: int + number of topics + Returns + ------- + int + optimal number of columns + int + optimal number of rows + """ + if n_topics <= 0: + raise ValueError("Value must be an integer greater than 0") + if n_topics <= 5: + n_columns = n_topics + n_rows = 1 + else: + factors = [factor for factor in _get_factors(n_topics) if 1 < factor <= 5] + if len(factors) > 0: + n_columns = factors[-1] + n_rows = int(n_topics / n_columns) + else: + factors = [ + factor for factor in _get_factors(n_topics + 1) if 1 < factor <= 5 + ] + n_columns = factors[-1] + n_rows = int((n_topics / n_columns) + 1) + return n_rows, n_columns + + +def _get_factors(x: int) -> list: + """retrieve factors of a given integer (x) + Parameters + ---------- + x:int + integer + Returns + ------- + list + a list of factors of x + """ + return [i for i in range(1, x + 1) if x % i == 0] + + +def _get_fig_size(columns: int, rows: int) -> typing.Tuple[int, int]: + """get figure size from number of columns and rows + Parameters + ---------- + columns:int + number of columns + rows: int + number of rows + Returns + ------- + int + width of fig + int + height of fig""" + width = columns * 6 + height = (rows * 6) + 3 + return (width, height) diff --git a/src/processing/visualisation.py b/src/processing/visualisation.py deleted file mode 100644 index 0ca08cd..0000000 --- a/src/processing/visualisation.py +++ /dev/null @@ -1,39 +0,0 @@ -import matplotlib.pyplot as plt -from pandas import Series -from wordcloud import WordCloud - - -def print_row_by_row(base: Series, comparison: Series) -> None: - """print each pair of words 
row by row - Parameters - ---------- - base: Series - the base series for comparison - comparison: Series - the series you want to compare against - Returns - ------- - None - """ - for i in base.index: - print(base[i]) - print(comparison[i]) - return None - - -def create_wordcloud(text: str, filename: str = "data/outputs/wordcloud.jpeg"): - """generate a wordcloud with the given filename - Parameters - ---------- - text: str - text for wordcloud - filename: str - the name and path you want to save the wordcloud to - Returns: - prints message to console saying where file is saved - """ - wordcloud = WordCloud().generate(text) - plt.imshow(wordcloud, interpolation="bilinear") - plt.axis("off") - plt.savefig(filename, bbox_inches="tight") - print(f"Wordcloud saved to {filename}") diff --git a/src/run_pipeline.py b/src/run_pipeline.py index 9a06a84..083224b 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -1,149 +1,80 @@ -# import re -# import string -# import matplotlib.pyplot as plt -# import mglearn -# import numpy as np import pandas as pd from nltk.tokenize import word_tokenize -from sklearn.feature_extraction.text import CountVectorizer -from src.processing.preprocessing import ( # stemmer, - correct_spelling, - fuzzy_compare_ratio, - lemmatizer, +from src.modules.analysis import ( + extract_feature_count, + get_total_feature_count, + latent_dirichlet_allocation, + retrieve_named_entities, +) +from src.modules.preprocessing import ( + initialise_update_stopwords, load_config, rejoin_tokens, remove_blank_rows, remove_nltk_stopwords, remove_punctuation, + shorten_tokens, + spellcorrect_series, ) -from src.processing.visualisation import create_wordcloud # print_row_by_row, - -# from sklearn.decomposition import LatentDirichletAllocation -# from importlib import reload -# reload(preprocessing) +from src.modules.quality_checks import fuzzy_compare_ratio # print_row_by_row, +from src.modules.visualisation import create_wordcloud, plot_top_words def run_pipeline(): - """run entire consultation nlp pipeline""" + """run consultation nlp pipeline""" config = load_config("src/config.yaml") - raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252") - raw_series = raw_data["qu_3"] + colnames = [f"qu_{number+1}" for number in range(0, 33)] + raw_data = pd.read_csv( + config["raw_data_path"], encoding="cp1252", names=colnames, skiprows=1 + ) + raw_series = raw_data["qu_11"] # TODO add clean_data parent function - lower_series = raw_series.str.lower() - without_blank_rows = remove_blank_rows(lower_series) - spelling_fixed = without_blank_rows.apply( - correct_spelling, config["business_terminology"] + without_blank_rows = remove_blank_rows(raw_series) + spelling_fixed = spellcorrect_series( + without_blank_rows, config["buisness_terminology"] ) impact_of_spell_correction = fuzzy_compare_ratio(without_blank_rows, spelling_fixed) - # TODO consider whether there are words we need to fix manually? 
i.e timliness + lower_series = spelling_fixed.str.lower() # print_row_by_row(without_blank_rows,spelling_fixed) - no_punctuation_series = spelling_fixed.apply(remove_punctuation) + no_punctuation_series = remove_punctuation(lower_series) word_tokens = no_punctuation_series.apply(word_tokenize) - # stemmed_tokens = word_tokens.apply(stemmer) - lemmatized_tokens = word_tokens.apply(lemmatizer) - without_stopwords = lemmatized_tokens.apply( + short_tokens = shorten_tokens(word_tokens, config["lemmatize"]) + without_stopwords = short_tokens.apply( lambda x: remove_nltk_stopwords(x, config["additional_stopwords"]) ) rejoined_words = without_stopwords.apply(rejoin_tokens) - text = " ".join(rejoined_words) - create_wordcloud(text) - - # just printing to overcome qa aspect - print(rejoined_words, impact_of_spell_correction) - - """#Topic Modelling""" - - vect = CountVectorizer(max_features=5) - coliv_wordsbows = vect.fit(raw_series) + all_text_combined = " ".join(rejoined_words) + create_wordcloud(all_text_combined) + stopwords = initialise_update_stopwords(config["additional_stopwords"]) + fitted_vector, features = extract_feature_count( + series=spelling_fixed, + ngram_range=config["feature_count"]["ngram_range"], + min_df=config["feature_count"]["min_df"], + max_df=config["feature_count"]["max_df"], + max_features=config["feature_count"]["max_features"], + lowercase=config["feature_count"]["lowercase"], + stop_words=stopwords, + ) + total_features = get_total_feature_count(features) + entities = retrieve_named_entities(without_blank_rows) + lda = latent_dirichlet_allocation( + n_topics=config["lda"]["n_topics"], + max_iter=config["lda"]["max_iter"], + fitted_vector=fitted_vector, + ) + plot_top_words( + model=lda, + feature_names=list(features.columns), + n_topics=config["lda"]["n_topics"], + title=config["lda"]["title"], + n_top_words=config["lda"]["n_top_words"], + topic_labels=config["lda"]["topic_labels"], + ) - print(coliv_wordsbows.vocabulary_) + print(impact_of_spell_correction, total_features, entities) -# lda5 = LatentDirichletAllocation( -# n_components=5, learning_method="batch", max_iter=25, random_state=0 -# ) -# -# document_topics5 = lda5.fit_transform(coliv_wordsbows) -# -# topics = np.array([0, 1, 2, 3, 4]) -# -# sorting = np.argsort(lda5.components_, axis=1)[:, ::-1] -# feature_names = np.array(vect.get_feature_names()) -# mglearn.tools.print_topics( -# topics=topics, -# feature_names=feature_names, -# sorting=sorting, -# topics_per_chunk=5, -# n_words=10, -# ) -# -# document_topics5 -# -# -# censtranf_respns = nlp_censtranf[ -# "cens_test_1" -# ] -# censtranf_respns = nlp_censtranf.reset_index(drop=True) -# -# -# -# -# def topic_summary( -# topic_number, -# ): -# -# topics = [topic_number] -# mglearn.tools.print_topics( -# topics=topics, -# feature_names=feature_names, -# sorting=sorting, -# topics_per_chunk=5, -# n_words=10, -# ) -# -# responses = np.argsort(document_topics5[:, topic_number])[::-1] -# -# for i in responses[:5]: -# print(coliv_respns[i], ".\n") -# -# -# for i in range(5): -# topic_summary(i) -# -# fig, ax = plt.subplots(1, 1, figsize=(10, 8)) -# topic_names = [ -# "{:>2} ".format(i) + " ".join(words) -# for i, words in enumerate(feature_names[sorting[:, :2]]) -# ] -# -# ax.barh(np.arange(5), np.sum(document_topics5, axis=0)) -# ax.set_yticks(np.arange(5)) -# ax.set_yticklabels(topic_names, ha="left", va="top") -# ax.invert_yaxis() -# ax.set_xlim(0, 300) -# yax = ax.get_yaxis() -# yax.set_tick_params(pad=130) -# plt.tight_layout() -# -# -# topic_labels = [ -# 
"The first label", -# "The second label", -# "The second label", -# "The third label", -# "The fourth label", -# ] -# -# -# fig, ax = plt.subplots(1, 1, figsize=(10, 8)) -# topic_names = ["{:>2} {}".format(i, label) for i, label in enumerate(topic_labels)] -# -# ax.barh(np.arange(5), np.mean(document_topics5, axis=0)) -# ax.set_yticks(np.arange(5)) -# ax.set_yticklabels(topic_names, ha="right", va="center") -# ax.invert_yaxis() -# ax.set_xlim(0, 0.5) -# yax = ax.get_yaxis() -# yax.set_tick_params(pad=10) -# plt.tight_layout() +# code to execute script from terminal +if __name__ == "__main__": + run_pipeline() diff --git a/tests/processing/__init__.py b/tests/modules/__init__.py similarity index 100% rename from tests/processing/__init__.py rename to tests/modules/__init__.py diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py new file mode 100644 index 0000000..0dd16b1 --- /dev/null +++ b/tests/modules/test_analysis.py @@ -0,0 +1,88 @@ +from itertools import repeat +import pytest +import sys +from pandas import DataFrame, Series +from scipy.sparse._csr import csr_matrix +from sklearn.decomposition import LatentDirichletAllocation +from sklearn.feature_extraction.text import CountVectorizer + +from src.modules.analysis import ( + extract_feature_count, + get_total_feature_count, + latent_dirichlet_allocation, + retrieve_named_entities, +) + + +class TestExtractFeatureCount: + def test_feature_count(self): + data = Series(["My name is elf"]) + expected = DataFrame([[1, 1, 1, 1]], columns=("elf", "is", "my", "name")) + actual = extract_feature_count(data)[1] + assert all(expected == actual), "Does not match expected output" + + def test_remove_stopwords(self): + stopwords = ["is", "my"] + data = Series(["My name is elf"]) + actual = extract_feature_count(data, stop_words=stopwords)[1] + expected = DataFrame([[1, 1]], columns=("elf", "name")) + assert all(expected == actual), "Does not remove stopwords" + + def test_ngrams(self): + data = Series(["My name is elf"]) + actual = extract_feature_count(data, ngram_range=(1, 2))[1] + expected = DataFrame( + [repeat(1, 7)], + columns=["elf", "is", "is elf", "my", "my name", "name", "name is"], + ) + assert all(expected == actual), "Does not handle ngrams" + + def test_get_fitted_vector(self): + data = Series(["My name is elf"]) + actual = extract_feature_count(data)[0] + assert isinstance( + actual, csr_matrix + ), "Does not return a csr_matrix object in position 0" + + +class TestGetTotalFeatureCount: + def test_get_total_feature_count(self): + df = DataFrame( + [[1, 1, 1, 1, 0], [0, 1, 1, 1, 1]], + columns=["elf", "is", "my", "name", "santa"], + ) + expected = DataFrame( + [[1, 2, 2, 2, 1]], columns=["elf", "is", "my", "name", "santa"] + ) + actual = get_total_feature_count(df) + assert all(expected == actual), "Does not correctly sum total features" + + +class TestRetrieveNamedEntities: + @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Unknown error during CI") + def test_retrieve_named_entities(self): + test_data = Series( + [ + "The ONS has just released an article on the UK Government's policy.", + "my own care for nothing", + "Hollywood actors now have their own statue", + ] + ) + actual = retrieve_named_entities(test_data) + expected = [["ONS", "the UK Government's"], [], ["Hollywood"]] + trimmed_actual = [component for component in actual if component != []] + trimmed_expected = [component for component in expected if component != []] + assert ( + trimmed_actual == trimmed_expected + ), "Did not 
successfully retrieve named entities" + + +class TestLatentDirichletAllocation: + def test_latent_dirichlet_allocation(self): + fitted = CountVectorizer().fit_transform( + Series(["My name is Elf and I like ignoble hats"]) + ) + lda = latent_dirichlet_allocation(10, 10, fitted) + assert isinstance( + lda, LatentDirichletAllocation + ), "function did not return an latent dirichlet allocation object" diff --git a/tests/processing/test_preprocessing.py b/tests/modules/test_preprocessing.py similarity index 66% rename from tests/processing/test_preprocessing.py rename to tests/modules/test_preprocessing.py index b307b5a..2f3a2a2 100644 --- a/tests/processing/test_preprocessing.py +++ b/tests/modules/test_preprocessing.py @@ -1,24 +1,24 @@ -import sys -import unittest - import numpy as np import pytest import textblob as tb +from nltk.corpus import stopwords as sw from pandas import Series -from src.processing.preprocessing import ( - _initialise_nltk_stopwords, +from src.modules.preprocessing import ( + _correct_spelling, + _initialise_nltk_component, + _remove_punctuation_string, _replace_blanks, _update_nltk_stopwords, _update_spelling_words, - correct_spelling, - fuzzy_compare_ratio, + initialise_update_stopwords, lemmatizer, load_config, rejoin_tokens, remove_blank_rows, remove_nltk_stopwords, remove_punctuation, + spellcorrect_series, stemmer, ) @@ -82,43 +82,49 @@ def test_return_series(self): ), "output is not " +class TestSpellCorrectSeries: + def test_spell_correct_series(self): + series = Series(["I live in a housr", "I own a housr"]) + actual = spellcorrect_series(series) + expected = Series(["I live in a house", "I own a house"]) + assert all(actual == expected), "Not fixed spelling across series" + + def test_update_spelling_on_series(self): + series = Series(["I live in a housr", "I own a housr"]) + additional_words = {"housr": 1} + actual = spellcorrect_series(series, additional_words) + expected = Series(["I live in a housr", "I own a housr"]) + assert all(actual == expected), "Updated spelling doesn't work across series" + + class TestCorrectSpelling: def test_spelling_fixed(self): - house_str = "I live in a housr" - corrected = correct_spelling(house_str) - assert corrected == "I live in a house", "spelling not fixed correctly" - - def test_word_update(self): - additional_words = ["housr"] - house_str = "I live in a housr" - corrected = correct_spelling(house_str, additional_words) - assert ( - corrected == "I live in a housr" - ), "spelling word list not correctly updated" + house_str = "I live flar away" + corrected = _correct_spelling(house_str) + assert corrected == "I live far away", "spelling not fixed correctly" class TestUpdateSpellingWords: def test_update_word_list(self): - additional_words = ["housr"] - _update_spelling_words(additional_words) + additional_words = {"monsterp": 1} + tb.en.spelling = _update_spelling_words(additional_words) assert ( - "housr" in tb.en.spelling.keys() + "monsterp" in tb.en.spelling.keys() ), "spelling word list not updated correctly" -class TestFuzzyCompareRatio: - def test_ratios(self): - base = Series(["this is", "this isn't"]) - comparison = Series(["this is", "yellow"]) - expected = Series([100.00, 0.0]) - actual = fuzzy_compare_ratio(base, comparison) - assert all(expected == actual), "fuzzy scoring not working correctly" +class TestRemovePunctuation: + def test_remove_punctuation(self): + series = Series(["this is!", "my series?"]) + actual = remove_punctuation(series) + expected = Series(["this is", "my series"]) + assert 
all(actual == expected), "Remove punctuation not working on series" -class TestRemovePunctuation: +class TestRemovePunctuationstring: def test_remove_punctuation(self): test_string = "my #$%&()*+,-./:;<=>?@[]^_`{|}~?name" - actual = remove_punctuation(test_string) + actual = _remove_punctuation_string(test_string) expected = "my name" assert actual == expected, "punctuation not removed correctly" @@ -132,7 +138,6 @@ def test_stemmer(self): class TestLemmatizer: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_lemmatization(self): word_list = ["house", "houses", "housing"] actual = lemmatizer(word_list) @@ -141,14 +146,12 @@ def test_lemmatization(self): class TestRemoveNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_remove_standard_stopwords(self): tokens = ["my", "name", "is", "elf", "who", "are", "you"] - actual = remove_nltk_stopwords(tokens, []) + actual = remove_nltk_stopwords(tokens) expected = ["name", "elf"] assert actual == expected, "core stopwords not being removed correctly" - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_remove_additional_stopwords(self): tokens = ["my", "name", "is", "elf", "who", "are", "you"] actual = remove_nltk_stopwords(tokens, ["elf"]) @@ -156,24 +159,18 @@ def test_remove_additional_stopwords(self): assert actual == expected, "additional stopwords not being removed correctly" -class TestInitialiseNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") - def test_return_stopwords_list(self): - stopwords = _initialise_nltk_stopwords() - assert isinstance(stopwords, list), "Did not return a list of stopwords" - - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") - def test_key_stopwords(self): - stopwords = _initialise_nltk_stopwords() - expected = ["i", "we", "you"] - actual = [word in stopwords for word in expected] - assert all(actual), "expected key words missing from stopwords" +class TestInitialiseUpdateStopwords: + def test_add_word_to_stopwords(self): + additional_words = ["elf", "santa"] + new_stopwords = initialise_update_stopwords(additional_words) + actual = [word in new_stopwords for word in additional_words] + assert all(actual), "new words not added to stopwords" class TestUpdateNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_add_word_to_stopwords(self): - stopwords = _initialise_nltk_stopwords() + _initialise_nltk_component("corpora/stopwords", "stopwords") + stopwords = sw.words("english") additional_words = ["elf", "santa"] new_stopwords = _update_nltk_stopwords(stopwords, additional_words) actual = [word in new_stopwords for word in additional_words] @@ -188,5 +185,6 @@ def test_region_tokens(self): assert actual == expected, "did not rejoin tokens correctly" -if __name__ == "__main__": - unittest.main() +class TestInitialiseNLTKComponent: + def test_initialise_component(self): + pass diff --git a/tests/modules/test_quality_checks.py b/tests/modules/test_quality_checks.py new file mode 100644 index 0000000..5f69bec --- /dev/null +++ b/tests/modules/test_quality_checks.py @@ -0,0 +1,12 @@ +from pandas import Series + +from src.modules.quality_checks import fuzzy_compare_ratio + + +class TestFuzzyCompareRatio: + def test_ratios(self): + base = Series(["this is", "this isn't"]) + comparison = Series(["this is", "yellow"]) + expected = 
Series([100.00, 0.0]) + actual = fuzzy_compare_ratio(base, comparison) + assert all(expected == actual), "fuzzy scoring not working correctly"
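
Usage sketches for the modules introduced in this changeset follow; all snippets are illustrative, with toy data and values that are not taken from the repository. First, the config loader switches from yaml.safe_load to yaml.load(..., Loader=yaml.Loader) because the new feature_count.ngram_range entry uses the !!python/tuple tag, which the safe loader refuses to construct. A minimal standalone sketch of that behaviour:

# Why config loading needs yaml.Loader: the !!python/tuple tag in config.yaml
# produces a real tuple (as CountVectorizer expects), but safe_load rejects it.
import yaml

config_text = """
feature_count:
  ngram_range: !!python/tuple [1, 2]
  min_df: 2
  max_df: 0.95
"""

config = yaml.load(config_text, Loader=yaml.Loader)
assert config["feature_count"]["ngram_range"] == (1, 2)  # a genuine tuple, not a list

try:
    yaml.safe_load(config_text)
except yaml.constructor.ConstructorError as err:
    # safe_load cannot build python/tuple objects and raises here
    print("safe_load rejects python/tuple:", err)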
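
A sketch of the weighted spell-correction helper added to preprocessing.py, assuming the module's textblob import and corpora are available. The example sentences are toy data; the weights mirror the buisness_terminology block in src/config.yaml:

# Spell-correct a series, boosting business terms in textblob's word-frequency table.
from pandas import Series

from src.modules.preprocessing import spellcorrect_series

responses = Series(["The census dta were late", "Timeliness matters"])
business_terms = {"dpm": 1, "admin": 1, "timeliness": 1, "year": 450}

# higher weights give a word greater precedence when candidate corrections are ranked
corrected = spellcorrect_series(responses, additional_words=business_terms)
print(corrected)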
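
A sketch of the quality check split out into quality_checks.py: fuzzy_compare_ratio scores each row pair, with values near 100 meaning the spell correction changed little:

# Compare the series before and after spell correction, row by row.
from pandas import Series

from src.modules.quality_checks import fuzzy_compare_ratio

before = Series(["I live in a housr", "timeliness matters"])
after = Series(["I live in a house", "timeliness matters"])

print(fuzzy_compare_ratio(before, after))  # about 94.1 for the corrected row, 100.0 for the unchanged one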
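
A sketch of how the reworked stopword helpers fit together. The first call downloads the NLTK stopwords corpus via _initialise_nltk_component; the combined list is also what run_pipeline passes to extract_feature_count as stop_words:

# Build the combined stopword list once, then filter a token list with it.
from src.modules.preprocessing import initialise_update_stopwords, remove_nltk_stopwords

additional = ["census", "data"]  # mirrors additional_stopwords in src/config.yaml

stopwords = initialise_update_stopwords(additional)  # NLTK stopwords plus the additions

tokens = ["the", "census", "data", "improved", "timeliness"]
print(remove_nltk_stopwords(tokens, additional))  # ['improved', 'timeliness']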
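
A sketch of the lemmatise-or-stem switch driven by the config's lemmatize flag; the lemmatizer path downloads WordNet on first use, and the token lists below are toy data:

# shorten_tokens expects a Series of token lists, as produced by word_tokenize in run_pipeline.
from pandas import Series

from src.modules.preprocessing import shorten_tokens

word_tokens = Series([["flying", "houses"], ["beautiful", "timeliness"]])

print(shorten_tokens(word_tokens, lemmatize=True))   # WordNet lemmas, e.g. ["flying", "house"]
print(shorten_tokens(word_tokens, lemmatize=False))  # Porter stems, e.g. ["fli", "hous"]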
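
An end-to-end sketch of the new analysis helpers: bag-of-words counts, LDA fitting and the topic plot. It assumes the script runs from the repository root with a data/outputs directory in place, since save_figure writes the chart there; the toy responses, parameter values and topic labels are illustrative only (run_pipeline reads the real ones from the feature_count and lda blocks of config.yaml):

# Count features, fit LDA on the fitted vector, and plot the top words per topic.
from pandas import Series

from src.modules.analysis import (
    extract_feature_count,
    get_total_feature_count,
    latent_dirichlet_allocation,
)
from src.modules.visualisation import plot_top_words

responses = Series(
    [
        "census data should be more timely",
        "admin data could supplement the census",
        "timeliness of outputs matters most",
    ]
)

# bag-of-words counts over unigrams and bigrams (toy thresholds, not the config values)
fitted_vector, features = extract_feature_count(responses, ngram_range=(1, 2), min_df=1, max_df=1.0)
print(get_total_feature_count(features))

lda = latent_dirichlet_allocation(n_topics=2, max_iter=10, fitted_vector=fitted_vector)

# explicit, made-up labels are supplied, one per topic; the figure is saved under data/outputs/
plot_top_words(
    model=lda,
    feature_names=list(features.columns),
    n_topics=2,
    title="Topic Summary",
    n_top_words=5,
    topic_labels=["Data sources", "Timeliness"],
)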
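
A sketch of the named-entity helper, assuming the en_core_web_sm model pinned in CodeCov.yml (3.0.0) is installed; the exact entities returned vary with the model version:

# Extract named entities per response; rows with no entities yield empty lists.
from pandas import Series

from src.modules.analysis import retrieve_named_entities

responses = Series(["The ONS publishes statistics for the UK", "nothing to tag here"])
print(retrieve_named_entities(responses))  # e.g. [["ONS", "the UK"], []]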
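
Finally, an illustration of the private subplot-grid helper used by plot_top_words; note that it returns (rows, columns), and that topic counts with no factor between 2 and 5 fall back to factorising n_topics + 1:

# How the grid size is chosen for different topic counts (private helper, shown for illustration).
from src.modules.visualisation import _get_n_columns_and_n_rows

print(_get_n_columns_and_n_rows(5))  # (1, 5): a single row of five panels
print(_get_n_columns_and_n_rows(6))  # (2, 3): the largest factor no bigger than five sets the column count
print(_get_n_columns_and_n_rows(7))  # (2, 4): a prime count falls back to the factors of eight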