From 9f79bb32d91d14a959517434f1003aa055acab3c Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Fri, 7 Jul 2023 14:20:59 +0100 Subject: [PATCH 01/31] add get text feature count --- src/processing/preprocessing.py | 102 ++++++++++++++++++++++++- src/run_pipeline.py | 19 +++-- tests/processing/test_preprocessing.py | 54 ++++++++++++- 3 files changed, 164 insertions(+), 11 deletions(-) diff --git a/src/processing/preprocessing.py b/src/processing/preprocessing.py index b366011..89f8c99 100644 --- a/src/processing/preprocessing.py +++ b/src/processing/preprocessing.py @@ -8,8 +8,10 @@ import yaml from nltk.corpus import stopwords as sw from nltk.stem import PorterStemmer, WordNetLemmatizer -from pandas.core.series import Series +from numpy.typing import ArrayLike +from pandas import DataFrame, Series from rapidfuzz.fuzz import ratio +from sklearn.feature_extraction.text import CountVectorizer def load_config(filepath: str) -> dict: @@ -187,10 +189,26 @@ def remove_nltk_stopwords(tokens: list, additional_stopwords: list = []) -> list list token list without stopwords """ + stopwords = initialise_update_stopwords(additional_stopwords) + without_stopwords = [item for item in tokens if item not in stopwords] + return without_stopwords + + +def initialise_update_stopwords(additional_stopwords: list = None) -> list: + """initialise and update stopwords, ise this for efficient retrieval of + stopwords, rather than calling both functions. + Parameters + ---------- + additional_stopwords:list + new words to add to the words to remove list + Returns + ------- + list + a list of words to remove from corpus + """ stopwords = _initialise_nltk_stopwords() updated_stopwords = _update_nltk_stopwords(stopwords, additional_stopwords) - without_stopwords = [item for item in tokens if item not in updated_stopwords] - return without_stopwords + return updated_stopwords def _initialise_nltk_stopwords() -> list: @@ -243,3 +261,81 @@ def rejoin_tokens(tokens: list) -> str: """ joined_tokens = " ".join(tokens) return joined_tokens + + +def extract_feature_count( + series: Series, + max_features: int = None, + ngram_range: tuple[float, float] = (1, 1), + stop_words: ArrayLike = None, + lowercase: bool = True, + min_df: float | int = 1, + max_df: float | int = 1.0, +): + """create a text feature count dataframe from series + Paramaters + ---------- + series: Series + Series of text strings + max_features: int, default = None + If not None, build a vocabulary that only consider the top max_features + ordered by term frequency across the corpus. Otherwise, all features are used. + ngram_range: tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different word n-grams + or char n-grams to be extracted. All values of n such such that + min_n <= n <= max_n will be used. + stop_words: list, default=None + list of stopwords to remove from text strings + lowercase: bool, default = True + convert all characters to lowercase before tokenizing + min_df: float or int, default = 1 + When building the vocabulary ignore terms that have a document frequency + strictly lower than the given threshold. This value is also called cut-off + in the literature. If float, the parameter represents a proportion of + documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. + max_df: float or int, default = 1.0 + When building the vocabulary ignore terms that have a document frequency + strictly higher than the given threshold (corpus-specific stop words). + If float, the parameter represents a proportion of documents, integer + absolute counts. This parameter is ignored if vocabulary is not None. + Returns + ------- + DataFrame + A dataframe of text feature counts, displaying the number of times a word + appears in each element of the input series + """ + + vectorizer = CountVectorizer( + max_features=max_features, + ngram_range=ngram_range, + stop_words=stop_words, + lowercase=lowercase, + min_df=min_df, + max_df=max_df, + ) + + fitted_vector = vectorizer.fit_transform(series) + + word_count_df = DataFrame( + fitted_vector.toarray(), columns=vectorizer.get_feature_names_out() + ) + return word_count_df + + +def get_total_feature_count(features: DataFrame) -> DataFrame: + """sum across features to get total number of times word was used + Parameters + ---------- + features: DataFrame + A dataframe of the features with each row corrosponding to a deconstructed + string + Returns + ------- + DataFrame + A dataframe of the total number of times each word is used across all + strings""" + total_feature_count = DataFrame() + for column in features.columns: + total_feature_count[column] = [features[column].sum()] + return total_feature_count diff --git a/src/run_pipeline.py b/src/run_pipeline.py index 9a06a84..c1d0ce0 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -1,15 +1,12 @@ -# import re -# import string -# import matplotlib.pyplot as plt -# import mglearn -# import numpy as np import pandas as pd from nltk.tokenize import word_tokenize from sklearn.feature_extraction.text import CountVectorizer from src.processing.preprocessing import ( # stemmer, correct_spelling, + extract_feature_count, fuzzy_compare_ratio, + initialise_update_stopwords, lemmatizer, load_config, rejoin_tokens, @@ -19,13 +16,18 @@ ) from src.processing.visualisation import create_wordcloud # print_row_by_row, +# import re +# import string +# import matplotlib.pyplot as plt +# import mglearn +# import numpy as np # from sklearn.decomposition import LatentDirichletAllocation # from importlib import reload # reload(preprocessing) def run_pipeline(): - """run entire consultation nlp pipeline""" + """run consultation nlp pipeline""" config = load_config("src/config.yaml") raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252") raw_series = raw_data["qu_3"] @@ -53,6 +55,11 @@ def run_pipeline(): print(rejoined_words, impact_of_spell_correction) """#Topic Modelling""" + stopwords = initialise_update_stopwords(config["additional_stopwords"]) + features = extract_feature_count( + without_blank_rows, ngram_range=(1, 2), min_df=0.2, stop_words=stopwords + ) + print(features) vect = CountVectorizer(max_features=5) coliv_wordsbows = vect.fit(raw_series) diff --git a/tests/processing/test_preprocessing.py b/tests/processing/test_preprocessing.py index b307b5a..9254d67 100644 --- a/tests/processing/test_preprocessing.py +++ b/tests/processing/test_preprocessing.py @@ -1,10 +1,11 @@ import sys import unittest +from itertools import repeat import numpy as np import pytest import textblob as tb -from pandas import Series +from pandas import DataFrame, Series from src.processing.preprocessing import ( _initialise_nltk_stopwords, @@ -12,7 +13,10 @@ _update_nltk_stopwords, _update_spelling_words, correct_spelling, + extract_feature_count, fuzzy_compare_ratio, + get_total_feature_count, + initialise_update_stopwords, lemmatizer, load_config, rejoin_tokens, @@ -144,7 +148,7 @@ class TestRemoveNLTKStopwords: @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_remove_standard_stopwords(self): tokens = ["my", "name", "is", "elf", "who", "are", "you"] - actual = remove_nltk_stopwords(tokens, []) + actual = remove_nltk_stopwords(tokens) expected = ["name", "elf"] assert actual == expected, "core stopwords not being removed correctly" @@ -156,6 +160,15 @@ def test_remove_additional_stopwords(self): assert actual == expected, "additional stopwords not being removed correctly" +class TestInitialiseUpdateStopwords: + @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") + def test_add_word_to_stopwords(self): + additional_words = ["elf", "santa"] + new_stopwords = initialise_update_stopwords(additional_words) + actual = [word in new_stopwords for word in additional_words] + assert all(actual), "new words not added to stopwords" + + class TestInitialiseNLTKStopwords: @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_return_stopwords_list(self): @@ -188,5 +201,42 @@ def test_region_tokens(self): assert actual == expected, "did not rejoin tokens correctly" +class TestExtractFeatureCount: + def test_feature_count(self): + data = Series(["My name is elf"]) + expected = DataFrame([[1, 1, 1, 1]], columns=("elf", "is", "my", "name")) + actual = extract_feature_count(data) + assert all(expected == actual), "Does not match expected output" + + def test_remove_stopwords(self): + stopwords = ["is", "my"] + data = Series(["My name is elf"]) + actual = extract_feature_count(data, stop_words=stopwords) + expected = DataFrame([[1, 1]], columns=("elf", "name")) + assert all(expected == actual), "Does not remove stopwords" + + def test_ngrams(self): + data = Series(["My name is elf"]) + actual = extract_feature_count(data, ngram_range=(1, 2)) + expected = DataFrame( + [repeat(1, 7)], + columns=["elf", "is", "is elf", "my", "my name", "name", "name is"], + ) + assert all(expected == actual), "Does not handle ngrams" + + +class testGetTotalFeatureCount: + def test_get_total_feature_count(self): + df = DataFrame( + [[1, 1, 1, 1, 0], [0, 1, 1, 1, 1]], + columns=["elf", "is", "my", "name", "santa"], + ) + expected = DataFrame( + [1, 2, 2, 2, 1], columns=["elf", "is", "my", "name", "santa"] + ) + actual = get_total_feature_count(df) + assert all(expected == actual), "Does not correctly sum total features" + + if __name__ == "__main__": unittest.main() From 3f31c81691936629ddef9f07bebda20e6dfedffe Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Mon, 10 Jul 2023 10:50:07 +0100 Subject: [PATCH 02/31] Updating config, and spelling functions --- src/config.yaml | 17 +++++--- src/processing/preprocessing.py | 59 +++++++++++++++++++++----- tests/processing/test_preprocessing.py | 38 ++++++++++++----- 3 files changed, 87 insertions(+), 27 deletions(-) diff --git a/src/config.yaml b/src/config.yaml index 99e73c2..c034c91 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -1,8 +1,15 @@ raw_data_path: "data/raw/2023_consultation_mock_data.csv" -business_terminology: - - 'dpm' - - 'admin' - - 'timeliness' -additional_stopwords: +buisness_terminology: #words to update spelling with associated weight + dpm: 1 + admin: 1 #needs higher weight to override amin -> main correction + timeliness: 1 +additional_stopwords: #words to filter - "census" - "data" +lemmatize: True #select False to use Stemmer +feature_count: + ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1) + min_df: 0.2 #float (proportion) or int (count) + max_df: 1.0 #float (proportion) or int (count) + max_features: null #null converts to None, or int value + lowercase: True #whether to convert all words to lowercase diff --git a/src/processing/preprocessing.py b/src/processing/preprocessing.py index 89f8c99..92fb87b 100644 --- a/src/processing/preprocessing.py +++ b/src/processing/preprocessing.py @@ -32,7 +32,7 @@ def load_config(filepath: str) -> dict: raise TypeError("filepath must be a string") with open(filepath, "r") as file: - config = yaml.safe_load(file) + config = yaml.load(file, Loader=yaml.Loader) return config @@ -71,36 +71,58 @@ def _replace_blanks(series: Series) -> Series: return blanks_replaced -def correct_spelling(string: str, additional_words: list = []) -> str: +def spellcorrect_series(series: Series, additional_words: dict = {}) -> Series: + """fix spelling across series using the norvig spell-correct method + Parameters + ---------- + series: Series + the series of text strings you want to pass your spell checker on + additional_words:dict + a dictionary of words and weights for each word + Returns + ------- + Series + a series with words spelling corrected""" + corrected_series = series.apply( + lambda str: _correct_spelling(str, additional_words) + ) + return corrected_series + + +def _correct_spelling(string: str, additional_words: dict = {}) -> str: """correct spelling using norvig spell-correct method (it has around 70% accuracy) Parameters ---------- string:str string you want to fix the spelling in + additional_words:dict, default = None + words to add to the textblob dictionary, with associated weights. + higher weights give greater precedence to the weighted word. Returns ------- str string with the spelling fixed""" - _update_spelling_words(additional_words) + tb.en.spelling = _update_spelling_words(additional_words) spelling_fixed = str(tb.TextBlob(string).correct()) return spelling_fixed -def _update_spelling_words(additional_words: list) -> None: +def _update_spelling_words(additional_words: dict) -> None: """update word in the textblob library with commonly used business word Parameters ---------- - additional_words:list - words to add to the textblob dictionary + additional_words:dict + words to add to the textblob dictionary, with associated weights. + higher weights give greater precedence to the weighted word. Returns ------- - None + dict + a dictionary of words and updated weights """ - for word in additional_words: - tb.en.spelling.update({word: 1}) - tb.en.spelling - return None + for word, weight in additional_words.items(): + tb.en.spelling.update({word: weight}) + return tb.en.spelling def fuzzy_compare_ratio(base: Series, comparison: Series) -> Series: @@ -138,6 +160,21 @@ def remove_punctuation(text: str) -> str: return new_text +def shorten_tokens(word_tokens: list, lemmatize: bool = True) -> list: + """Shorten tokens to root words + Parameters + ---------- + word_tokens:list + list of word tokens to shorten + lemmatize: bool, default = True + whether to use lemmatizer or revert back to False (stemmer)""" + if lemmatize: + short_tokens = word_tokens.apply(lemmatizer) + else: + short_tokens = word_tokens.apply(stemmer) + return short_tokens + + def stemmer(tokens: list) -> list: """Stem works to their root form (e.g. flying -> fli, Beautiful -> Beauti) diff --git a/tests/processing/test_preprocessing.py b/tests/processing/test_preprocessing.py index 9254d67..5eb9330 100644 --- a/tests/processing/test_preprocessing.py +++ b/tests/processing/test_preprocessing.py @@ -8,11 +8,11 @@ from pandas import DataFrame, Series from src.processing.preprocessing import ( + _correct_spelling, _initialise_nltk_stopwords, _replace_blanks, _update_nltk_stopwords, _update_spelling_words, - correct_spelling, extract_feature_count, fuzzy_compare_ratio, get_total_feature_count, @@ -23,6 +23,7 @@ remove_blank_rows, remove_nltk_stopwords, remove_punctuation, + spellcorrect_series, stemmer, ) @@ -86,27 +87,42 @@ def test_return_series(self): ), "output is not " +class TestSpellCorrectSeries: + def test_spell_correct_series(self): + series = Series(["I live in a housr", "I own a housr"]) + actual = spellcorrect_series(series) + expected = Series(["I live in a house", "I own a house"]) + assert all(actual == expected), "Not fixed spelling across series" + + def test_update_spelling_on_series(self): + series = Series(["I live in a housr", "I own a housr"]) + additional_words = {"housr": 1} + actual = spellcorrect_series(series, additional_words) + expected = Series(["I live in a housr", "I own a housr"]) + assert all(actual == expected), "Updated spelling doesn't work across series" + + class TestCorrectSpelling: def test_spelling_fixed(self): - house_str = "I live in a housr" - corrected = correct_spelling(house_str) - assert corrected == "I live in a house", "spelling not fixed correctly" + house_str = "I live flar away" + corrected = _correct_spelling(house_str) + assert corrected == "I live far away", "spelling not fixed correctly" def test_word_update(self): - additional_words = ["housr"] - house_str = "I live in a housr" - corrected = correct_spelling(house_str, additional_words) + additional_words = {"flar": 1} + house_str = "I live flar away" + corrected = _correct_spelling(house_str, additional_words) assert ( - corrected == "I live in a housr" + corrected == "I live flar away" ), "spelling word list not correctly updated" class TestUpdateSpellingWords: def test_update_word_list(self): - additional_words = ["housr"] - _update_spelling_words(additional_words) + additional_words = {"monsterp": 1} + tb.en.spelling = _update_spelling_words(additional_words) assert ( - "housr" in tb.en.spelling.keys() + "monsterp" in tb.en.spelling.keys() ), "spelling word list not updated correctly" From 8bf8b5a528157a33cad0220a7be8c242b6baca9e Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Mon, 10 Jul 2023 11:30:20 +0100 Subject: [PATCH 03/31] Plug in real data feed --- src/config.yaml | 6 +++--- src/run_pipeline.py | 47 +++++++++++++++++++++------------------------ 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/config.yaml b/src/config.yaml index c034c91..8d36cfb 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -1,7 +1,7 @@ -raw_data_path: "data/raw/2023_consultation_mock_data.csv" +raw_data_path: "data/raw/20230710_consultation_ingest.csv" buisness_terminology: #words to update spelling with associated weight dpm: 1 - admin: 1 #needs higher weight to override amin -> main correction + admin: 1 timeliness: 1 additional_stopwords: #words to filter - "census" @@ -9,7 +9,7 @@ additional_stopwords: #words to filter lemmatize: True #select False to use Stemmer feature_count: ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1) - min_df: 0.2 #float (proportion) or int (count) + min_df: 0.1 #float (proportion) or int (count) max_df: 1.0 #float (proportion) or int (count) max_features: null #null converts to None, or int value lowercase: True #whether to convert all words to lowercase diff --git a/src/run_pipeline.py b/src/run_pipeline.py index c1d0ce0..55bb98d 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -1,18 +1,18 @@ import pandas as pd from nltk.tokenize import word_tokenize -from sklearn.feature_extraction.text import CountVectorizer -from src.processing.preprocessing import ( # stemmer, - correct_spelling, +from src.processing.preprocessing import ( extract_feature_count, fuzzy_compare_ratio, + get_total_feature_count, initialise_update_stopwords, - lemmatizer, load_config, rejoin_tokens, remove_blank_rows, remove_nltk_stopwords, remove_punctuation, + shorten_tokens, + spellcorrect_series, ) from src.processing.visualisation import create_wordcloud # print_row_by_row, @@ -29,42 +29,39 @@ def run_pipeline(): """run consultation nlp pipeline""" config = load_config("src/config.yaml") - raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252") - raw_series = raw_data["qu_3"] + colnames = [f"qu_{number+1}" for number in range(0, 33)] + raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252", names=colnames) + raw_series = raw_data["qu_11"] # TODO add clean_data parent function lower_series = raw_series.str.lower() without_blank_rows = remove_blank_rows(lower_series) - spelling_fixed = without_blank_rows.apply( - correct_spelling, config["business_terminology"] + spelling_fixed = spellcorrect_series( + without_blank_rows, config["buisness_terminology"] ) impact_of_spell_correction = fuzzy_compare_ratio(without_blank_rows, spelling_fixed) - # TODO consider whether there are words we need to fix manually? i.e timliness # print_row_by_row(without_blank_rows,spelling_fixed) no_punctuation_series = spelling_fixed.apply(remove_punctuation) word_tokens = no_punctuation_series.apply(word_tokenize) - # stemmed_tokens = word_tokens.apply(stemmer) - lemmatized_tokens = word_tokens.apply(lemmatizer) - without_stopwords = lemmatized_tokens.apply( + short_tokens = shorten_tokens(word_tokens, config["lemmatize"]) + without_stopwords = short_tokens.apply( lambda x: remove_nltk_stopwords(x, config["additional_stopwords"]) ) rejoined_words = without_stopwords.apply(rejoin_tokens) - text = " ".join(rejoined_words) - create_wordcloud(text) - - # just printing to overcome qa aspect - print(rejoined_words, impact_of_spell_correction) - - """#Topic Modelling""" + all_text_combined = " ".join(rejoined_words) + create_wordcloud(all_text_combined) stopwords = initialise_update_stopwords(config["additional_stopwords"]) features = extract_feature_count( - without_blank_rows, ngram_range=(1, 2), min_df=0.2, stop_words=stopwords + series=spelling_fixed, + ngram_range=config["feature_count"]["ngram_range"], + min_df=config["feature_count"]["min_df"], + max_df=config["feature_count"]["max_df"], + max_features=config["feature_count"]["max_features"], + lowercase=config["feature_count"]["lowercase"], + stop_words=stopwords, ) - print(features) - - vect = CountVectorizer(max_features=5) - coliv_wordsbows = vect.fit(raw_series) + total_features = get_total_feature_count(features) - print(coliv_wordsbows.vocabulary_) + print(features, rejoined_words, total_features, impact_of_spell_correction) # lda5 = LatentDirichletAllocation( From 4eff9eb738ae735a4f2e5525a67a755a1c9b7400 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Mon, 10 Jul 2023 11:50:12 +0100 Subject: [PATCH 04/31] fix tests --- src/processing/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/processing/preprocessing.py b/src/processing/preprocessing.py index 92fb87b..e4d1811 100644 --- a/src/processing/preprocessing.py +++ b/src/processing/preprocessing.py @@ -306,8 +306,8 @@ def extract_feature_count( ngram_range: tuple[float, float] = (1, 1), stop_words: ArrayLike = None, lowercase: bool = True, - min_df: float | int = 1, - max_df: float | int = 1.0, + min_df: float in range[0.0, 1.0] or int = 1, + max_df: float in range[0.0, 1.0] or int = 1.0, ): """create a text feature count dataframe from series Paramaters From f2a02a3d414466d9b25d1d1c6d5882a0dc388e89 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Mon, 10 Jul 2023 11:54:10 +0100 Subject: [PATCH 05/31] Fix tests --- src/processing/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/processing/preprocessing.py b/src/processing/preprocessing.py index e4d1811..001939f 100644 --- a/src/processing/preprocessing.py +++ b/src/processing/preprocessing.py @@ -306,8 +306,8 @@ def extract_feature_count( ngram_range: tuple[float, float] = (1, 1), stop_words: ArrayLike = None, lowercase: bool = True, - min_df: float in range[0.0, 1.0] or int = 1, - max_df: float in range[0.0, 1.0] or int = 1.0, + min_df=1, + max_df=1.0, ): """create a text feature count dataframe from series Paramaters From 5f3fd35e33b02d2fbafee2a62cc1fef3e3dd3260 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 10:03:25 +0100 Subject: [PATCH 06/31] Restructure package --- src/{processing => modules}/__init__.py | 0 src/modules/analysis.py | 81 ++++++++++ src/{processing => modules}/preprocessing.py | 143 +++--------------- src/modules/quality_checks.py | 38 +++++ src/{processing => modules}/visualisation.py | 19 --- src/run_pipeline.py | 17 ++- tests/{processing => modules}/__init__.py | 0 tests/modules/test_analysis.py | 42 +++++ .../test_preprocessing.py | 80 ++-------- tests/modules/test_quality_checks.py | 12 ++ 10 files changed, 216 insertions(+), 216 deletions(-) rename src/{processing => modules}/__init__.py (100%) create mode 100644 src/modules/analysis.py rename src/{processing => modules}/preprocessing.py (64%) create mode 100644 src/modules/quality_checks.py rename src/{processing => modules}/visualisation.py (59%) rename tests/{processing => modules}/__init__.py (100%) create mode 100644 tests/modules/test_analysis.py rename tests/{processing => modules}/test_preprocessing.py (71%) create mode 100644 tests/modules/test_quality_checks.py diff --git a/src/processing/__init__.py b/src/modules/__init__.py similarity index 100% rename from src/processing/__init__.py rename to src/modules/__init__.py diff --git a/src/modules/analysis.py b/src/modules/analysis.py new file mode 100644 index 0000000..3e43ade --- /dev/null +++ b/src/modules/analysis.py @@ -0,0 +1,81 @@ +from numpy.typing import ArrayLike +from pandas import DataFrame, Series +from sklearn.feature_extraction.text import CountVectorizer + + +def extract_feature_count( + series: Series, + max_features: int = None, + ngram_range: tuple[float, float] = (1, 1), + stop_words: ArrayLike = None, + lowercase: bool = True, + min_df=1, + max_df=1.0, +): + """create a text feature count dataframe from series + Paramaters + ---------- + series: Series + Series of text strings + max_features: int, default = None + If not None, build a vocabulary that only consider the top max_features + ordered by term frequency across the corpus. Otherwise, all features are used. + ngram_range: tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different word n-grams + or char n-grams to be extracted. All values of n such such that + min_n <= n <= max_n will be used. + stop_words: list, default=None + list of stopwords to remove from text strings + lowercase: bool, default = True + convert all characters to lowercase before tokenizing + min_df: float or int, default = 1 + When building the vocabulary ignore terms that have a document frequency + strictly lower than the given threshold. This value is also called cut-off + in the literature. If float, the parameter represents a proportion of + documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. + max_df: float or int, default = 1.0 + When building the vocabulary ignore terms that have a document frequency + strictly higher than the given threshold (corpus-specific stop words). + If float, the parameter represents a proportion of documents, integer + absolute counts. This parameter is ignored if vocabulary is not None. + Returns + ------- + DataFrame + A dataframe of text feature counts, displaying the number of times a word + appears in each element of the input series + """ + + vectorizer = CountVectorizer( + max_features=max_features, + ngram_range=ngram_range, + stop_words=stop_words, + lowercase=lowercase, + min_df=min_df, + max_df=max_df, + ) + + fitted_vector = vectorizer.fit_transform(series) + + word_count_df = DataFrame( + fitted_vector.toarray(), columns=vectorizer.get_feature_names_out() + ) + return word_count_df + + +def get_total_feature_count(features: DataFrame) -> DataFrame: + """sum across features to get total number of times word was used + Parameters + ---------- + features: DataFrame + A dataframe of the features with each row corrosponding to a deconstructed + string + Returns + ------- + DataFrame + A dataframe of the total number of times each word is used across all + strings""" + total_feature_count = DataFrame() + for column in features.columns: + total_feature_count[column] = [features[column].sum()] + return total_feature_count diff --git a/src/processing/preprocessing.py b/src/modules/preprocessing.py similarity index 64% rename from src/processing/preprocessing.py rename to src/modules/preprocessing.py index 001939f..d89581a 100644 --- a/src/processing/preprocessing.py +++ b/src/modules/preprocessing.py @@ -8,10 +8,7 @@ import yaml from nltk.corpus import stopwords as sw from nltk.stem import PorterStemmer, WordNetLemmatizer -from numpy.typing import ArrayLike -from pandas import DataFrame, Series -from rapidfuzz.fuzz import ratio -from sklearn.feature_extraction.text import CountVectorizer +from pandas import Series def load_config(filepath: str) -> dict: @@ -125,24 +122,6 @@ def _update_spelling_words(additional_words: dict) -> None: return tb.en.spelling -def fuzzy_compare_ratio(base: Series, comparison: Series) -> Series: - """compare the base series to the comparison series to get - a similarity ratio between strings in the same column - Parameters - ---------- - base: Series - the base series for comparison - comparison: Series - the series you want to compare against - Returns - ------- - Series - a series of ratios (type:float) with scores closer to 100 - indicating complete match""" - fuzzy_ratio = Series(map(ratio, base, comparison)) - return fuzzy_ratio - - def remove_punctuation(text: str) -> str: """Remove punctuation from string @@ -156,6 +135,7 @@ def remove_punctuation(text: str) -> str: str text string without punctuation """ + _initialise_nltk_component("tokenizers/punkt", "punkt") new_text = re.sub(string=text, pattern="[{}]".format(string.punctuation), repl="") return new_text @@ -207,11 +187,32 @@ def lemmatizer(tokens: list) -> list: lemmatized_tokens list of simplified word groupings """ + _initialise_nltk_component("corpora/wordnet.zip", "wordnet") lemmatizer = WordNetLemmatizer() lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens] return lemmatized_tokens +def _initialise_nltk_component(extension: str, download_object: str): + """download nltk component from package + Parameters + ---------- + extension: str + the filepath extension leading to where the model is saved + download_object: str + the object to download from nltk + Returns + ------- + None + """ + username = os.getenv("username") + path = "c:/Users/" + username + "/AppData/Roaming/nltk_data" + extension + if not os.path.exists(path): + nltk.download(download_object) + nltk.data.path.append("../local_packages/nltk_data") + return None + + def remove_nltk_stopwords(tokens: list, additional_stopwords: list = []) -> list: """remove stopwords from series @@ -243,28 +244,12 @@ def initialise_update_stopwords(additional_stopwords: list = None) -> list: list a list of words to remove from corpus """ - stopwords = _initialise_nltk_stopwords() + _initialise_nltk_component("corpora/stopwords", "stopwords") + stopwords = sw.words("english") updated_stopwords = _update_nltk_stopwords(stopwords, additional_stopwords) return updated_stopwords -def _initialise_nltk_stopwords() -> list: - """fetch nltk stopwords from corpora - - Returns - ------- - list - list of nltk stopwords - """ - username = os.getenv("username") - path = "c:/Users/" + username + "/AppData/Roaming/nltk_data/corpora/stopwords" - if not os.path.exists(path): - nltk.download("stopwords") - nltk.data.path.append("../local_packages/nltk_data") - stopwords = sw.words("english") - return stopwords - - def _update_nltk_stopwords(stopwords: list, additional_stopwords: list): """add additional words to nltk stopwords Parameters @@ -298,81 +283,3 @@ def rejoin_tokens(tokens: list) -> str: """ joined_tokens = " ".join(tokens) return joined_tokens - - -def extract_feature_count( - series: Series, - max_features: int = None, - ngram_range: tuple[float, float] = (1, 1), - stop_words: ArrayLike = None, - lowercase: bool = True, - min_df=1, - max_df=1.0, -): - """create a text feature count dataframe from series - Paramaters - ---------- - series: Series - Series of text strings - max_features: int, default = None - If not None, build a vocabulary that only consider the top max_features - ordered by term frequency across the corpus. Otherwise, all features are used. - ngram_range: tuple (min_n, max_n), default=(1, 1) - The lower and upper boundary of the range of n-values for different word n-grams - or char n-grams to be extracted. All values of n such such that - min_n <= n <= max_n will be used. - stop_words: list, default=None - list of stopwords to remove from text strings - lowercase: bool, default = True - convert all characters to lowercase before tokenizing - min_df: float or int, default = 1 - When building the vocabulary ignore terms that have a document frequency - strictly lower than the given threshold. This value is also called cut-off - in the literature. If float, the parameter represents a proportion of - documents, integer absolute counts. - This parameter is ignored if vocabulary is not None. - max_df: float or int, default = 1.0 - When building the vocabulary ignore terms that have a document frequency - strictly higher than the given threshold (corpus-specific stop words). - If float, the parameter represents a proportion of documents, integer - absolute counts. This parameter is ignored if vocabulary is not None. - Returns - ------- - DataFrame - A dataframe of text feature counts, displaying the number of times a word - appears in each element of the input series - """ - - vectorizer = CountVectorizer( - max_features=max_features, - ngram_range=ngram_range, - stop_words=stop_words, - lowercase=lowercase, - min_df=min_df, - max_df=max_df, - ) - - fitted_vector = vectorizer.fit_transform(series) - - word_count_df = DataFrame( - fitted_vector.toarray(), columns=vectorizer.get_feature_names_out() - ) - return word_count_df - - -def get_total_feature_count(features: DataFrame) -> DataFrame: - """sum across features to get total number of times word was used - Parameters - ---------- - features: DataFrame - A dataframe of the features with each row corrosponding to a deconstructed - string - Returns - ------- - DataFrame - A dataframe of the total number of times each word is used across all - strings""" - total_feature_count = DataFrame() - for column in features.columns: - total_feature_count[column] = [features[column].sum()] - return total_feature_count diff --git a/src/modules/quality_checks.py b/src/modules/quality_checks.py new file mode 100644 index 0000000..4909c36 --- /dev/null +++ b/src/modules/quality_checks.py @@ -0,0 +1,38 @@ +from pandas import Series +from rapidfuzz.fuzz import ratio + + +def fuzzy_compare_ratio(base: Series, comparison: Series) -> Series: + """compare the base series to the comparison series to get + a similarity ratio between strings in the same column + Parameters + ---------- + base: Series + the base series for comparison + comparison: Series + the series you want to compare against + Returns + ------- + Series + a series of ratios (type:float) with scores closer to 100 + indicating complete match""" + fuzzy_ratio = Series(map(ratio, base, comparison)) + return fuzzy_ratio + + +def print_row_by_row(base: Series, comparison: Series) -> None: + """print each pair of words row by row + Parameters + ---------- + base: Series + the base series for comparison + comparison: Series + the series you want to compare against + Returns + ------- + None + """ + for i in base.index: + print(base[i]) + print(comparison[i]) + return None diff --git a/src/processing/visualisation.py b/src/modules/visualisation.py similarity index 59% rename from src/processing/visualisation.py rename to src/modules/visualisation.py index 0ca08cd..02dd96d 100644 --- a/src/processing/visualisation.py +++ b/src/modules/visualisation.py @@ -1,26 +1,7 @@ import matplotlib.pyplot as plt -from pandas import Series from wordcloud import WordCloud -def print_row_by_row(base: Series, comparison: Series) -> None: - """print each pair of words row by row - Parameters - ---------- - base: Series - the base series for comparison - comparison: Series - the series you want to compare against - Returns - ------- - None - """ - for i in base.index: - print(base[i]) - print(comparison[i]) - return None - - def create_wordcloud(text: str, filename: str = "data/outputs/wordcloud.jpeg"): """generate a wordcloud with the given filename Parameters diff --git a/src/run_pipeline.py b/src/run_pipeline.py index 55bb98d..12d0242 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -1,10 +1,8 @@ import pandas as pd from nltk.tokenize import word_tokenize -from src.processing.preprocessing import ( - extract_feature_count, - fuzzy_compare_ratio, - get_total_feature_count, +from src.modules.analysis import extract_feature_count, get_total_feature_count +from src.modules.preprocessing import ( initialise_update_stopwords, load_config, rejoin_tokens, @@ -14,14 +12,13 @@ shorten_tokens, spellcorrect_series, ) -from src.processing.visualisation import create_wordcloud # print_row_by_row, +from src.modules.quality_checks import fuzzy_compare_ratio # print_row_by_row, +from src.modules.visualisation import create_wordcloud -# import re -# import string # import matplotlib.pyplot as plt # import mglearn -# import numpy as np # from sklearn.decomposition import LatentDirichletAllocation + # from importlib import reload # reload(preprocessing) @@ -64,6 +61,10 @@ def run_pipeline(): print(features, rejoined_words, total_features, impact_of_spell_correction) +# code to execute script from terminal +if __name__ == "__main__": + run_pipeline() + # lda5 = LatentDirichletAllocation( # n_components=5, learning_method="batch", max_iter=25, random_state=0 # ) diff --git a/tests/processing/__init__.py b/tests/modules/__init__.py similarity index 100% rename from tests/processing/__init__.py rename to tests/modules/__init__.py diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py new file mode 100644 index 0000000..078c75d --- /dev/null +++ b/tests/modules/test_analysis.py @@ -0,0 +1,42 @@ +from itertools import repeat + +from pandas import DataFrame, Series + +from src.modules.analysis import extract_feature_count, get_total_feature_count + + +class TestExtractFeatureCount: + def test_feature_count(self): + data = Series(["My name is elf"]) + expected = DataFrame([[1, 1, 1, 1]], columns=("elf", "is", "my", "name")) + actual = extract_feature_count(data) + assert all(expected == actual), "Does not match expected output" + + def test_remove_stopwords(self): + stopwords = ["is", "my"] + data = Series(["My name is elf"]) + actual = extract_feature_count(data, stop_words=stopwords) + expected = DataFrame([[1, 1]], columns=("elf", "name")) + assert all(expected == actual), "Does not remove stopwords" + + def test_ngrams(self): + data = Series(["My name is elf"]) + actual = extract_feature_count(data, ngram_range=(1, 2)) + expected = DataFrame( + [repeat(1, 7)], + columns=["elf", "is", "is elf", "my", "my name", "name", "name is"], + ) + assert all(expected == actual), "Does not handle ngrams" + + +class testGetTotalFeatureCount: + def test_get_total_feature_count(self): + df = DataFrame( + [[1, 1, 1, 1, 0], [0, 1, 1, 1, 1]], + columns=["elf", "is", "my", "name", "santa"], + ) + expected = DataFrame( + [1, 2, 2, 2, 1], columns=["elf", "is", "my", "name", "santa"] + ) + actual = get_total_feature_count(df) + assert all(expected == actual), "Does not correctly sum total features" diff --git a/tests/processing/test_preprocessing.py b/tests/modules/test_preprocessing.py similarity index 71% rename from tests/processing/test_preprocessing.py rename to tests/modules/test_preprocessing.py index 5eb9330..968b6e4 100644 --- a/tests/processing/test_preprocessing.py +++ b/tests/modules/test_preprocessing.py @@ -1,21 +1,17 @@ import sys -import unittest -from itertools import repeat import numpy as np import pytest import textblob as tb -from pandas import DataFrame, Series +from nltk.corpus import stopwords as sw +from pandas import Series -from src.processing.preprocessing import ( +from src.modules.preprocessing import ( _correct_spelling, - _initialise_nltk_stopwords, + _initialise_nltk_component, _replace_blanks, _update_nltk_stopwords, _update_spelling_words, - extract_feature_count, - fuzzy_compare_ratio, - get_total_feature_count, initialise_update_stopwords, lemmatizer, load_config, @@ -126,15 +122,6 @@ def test_update_word_list(self): ), "spelling word list not updated correctly" -class TestFuzzyCompareRatio: - def test_ratios(self): - base = Series(["this is", "this isn't"]) - comparison = Series(["this is", "yellow"]) - expected = Series([100.00, 0.0]) - actual = fuzzy_compare_ratio(base, comparison) - assert all(expected == actual), "fuzzy scoring not working correctly" - - class TestRemovePunctuation: def test_remove_punctuation(self): test_string = "my #$%&()*+,-./:;<=>?@[]^_`{|}~?name" @@ -185,24 +172,11 @@ def test_add_word_to_stopwords(self): assert all(actual), "new words not added to stopwords" -class TestInitialiseNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") - def test_return_stopwords_list(self): - stopwords = _initialise_nltk_stopwords() - assert isinstance(stopwords, list), "Did not return a list of stopwords" - - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") - def test_key_stopwords(self): - stopwords = _initialise_nltk_stopwords() - expected = ["i", "we", "you"] - actual = [word in stopwords for word in expected] - assert all(actual), "expected key words missing from stopwords" - - class TestUpdateNLTKStopwords: @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_add_word_to_stopwords(self): - stopwords = _initialise_nltk_stopwords() + _initialise_nltk_component("corpora/stopwords", "stopwords") + stopwords = sw.words("english") additional_words = ["elf", "santa"] new_stopwords = _update_nltk_stopwords(stopwords, additional_words) actual = [word in new_stopwords for word in additional_words] @@ -217,42 +191,6 @@ def test_region_tokens(self): assert actual == expected, "did not rejoin tokens correctly" -class TestExtractFeatureCount: - def test_feature_count(self): - data = Series(["My name is elf"]) - expected = DataFrame([[1, 1, 1, 1]], columns=("elf", "is", "my", "name")) - actual = extract_feature_count(data) - assert all(expected == actual), "Does not match expected output" - - def test_remove_stopwords(self): - stopwords = ["is", "my"] - data = Series(["My name is elf"]) - actual = extract_feature_count(data, stop_words=stopwords) - expected = DataFrame([[1, 1]], columns=("elf", "name")) - assert all(expected == actual), "Does not remove stopwords" - - def test_ngrams(self): - data = Series(["My name is elf"]) - actual = extract_feature_count(data, ngram_range=(1, 2)) - expected = DataFrame( - [repeat(1, 7)], - columns=["elf", "is", "is elf", "my", "my name", "name", "name is"], - ) - assert all(expected == actual), "Does not handle ngrams" - - -class testGetTotalFeatureCount: - def test_get_total_feature_count(self): - df = DataFrame( - [[1, 1, 1, 1, 0], [0, 1, 1, 1, 1]], - columns=["elf", "is", "my", "name", "santa"], - ) - expected = DataFrame( - [1, 2, 2, 2, 1], columns=["elf", "is", "my", "name", "santa"] - ) - actual = get_total_feature_count(df) - assert all(expected == actual), "Does not correctly sum total features" - - -if __name__ == "__main__": - unittest.main() +class TestInitialiseNLTKComponent: + def test_initialise_component(self): + pass diff --git a/tests/modules/test_quality_checks.py b/tests/modules/test_quality_checks.py new file mode 100644 index 0000000..5f69bec --- /dev/null +++ b/tests/modules/test_quality_checks.py @@ -0,0 +1,12 @@ +from pandas import Series + +from src.modules.quality_checks import fuzzy_compare_ratio + + +class TestFuzzyCompareRatio: + def test_ratios(self): + base = Series(["this is", "this isn't"]) + comparison = Series(["this is", "yellow"]) + expected = Series([100.00, 0.0]) + actual = fuzzy_compare_ratio(base, comparison) + assert all(expected == actual), "fuzzy scoring not working correctly" From fa9578828ec0bad772fe44db05f534f8bbe72a3d Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 14:57:02 +0100 Subject: [PATCH 07/31] add named entity recognition --- src/config.yaml | 4 +++- src/modules/analysis.py | 18 ++++++++++++++++++ src/modules/visualisation.py | 11 ++++++++--- src/run_pipeline.py | 15 +++++++++------ tests/modules/test_analysis.py | 24 +++++++++++++++++++++--- 5 files changed, 59 insertions(+), 13 deletions(-) diff --git a/src/config.yaml b/src/config.yaml index 8d36cfb..56d9967 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -1,4 +1,4 @@ -raw_data_path: "data/raw/20230710_consultation_ingest.csv" +raw_data_path: "data/raw/20230711_consultation_ingest.csv" buisness_terminology: #words to update spelling with associated weight dpm: 1 admin: 1 @@ -6,6 +6,8 @@ buisness_terminology: #words to update spelling with associated weight additional_stopwords: #words to filter - "census" - "data" + - "personal" + - "use" lemmatize: True #select False to use Stemmer feature_count: ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1) diff --git a/src/modules/analysis.py b/src/modules/analysis.py index 3e43ade..6ada289 100644 --- a/src/modules/analysis.py +++ b/src/modules/analysis.py @@ -1,3 +1,4 @@ +import spacy from numpy.typing import ArrayLike from pandas import DataFrame, Series from sklearn.feature_extraction.text import CountVectorizer @@ -79,3 +80,20 @@ def get_total_feature_count(features: DataFrame) -> DataFrame: for column in features.columns: total_feature_count[column] = [features[column].sum()] return total_feature_count + + +def retrieve_named_entities(series: Series) -> list[list[str]]: + """retrieve any named entities from the series + Parameters + ---------- + series:Series + A series of text strings to analyse for named entities + Returns + ------- + list[list[str]] + a list of lists containing strings for each named entitity""" + nlp = spacy.load("en_core_web_sm") + entities = [] + for doc in nlp.pipe(series): + entities.append([str(ent) for ent in doc.ents]) + return entities diff --git a/src/modules/visualisation.py b/src/modules/visualisation.py index 02dd96d..6fc6b5c 100644 --- a/src/modules/visualisation.py +++ b/src/modules/visualisation.py @@ -1,8 +1,10 @@ +from datetime import datetime as dt + import matplotlib.pyplot as plt from wordcloud import WordCloud -def create_wordcloud(text: str, filename: str = "data/outputs/wordcloud.jpeg"): +def create_wordcloud(text: str, filename: str = "wordcloud"): """generate a wordcloud with the given filename Parameters ---------- @@ -16,5 +18,8 @@ def create_wordcloud(text: str, filename: str = "data/outputs/wordcloud.jpeg"): wordcloud = WordCloud().generate(text) plt.imshow(wordcloud, interpolation="bilinear") plt.axis("off") - plt.savefig(filename, bbox_inches="tight") - print(f"Wordcloud saved to {filename}") + datestamp = dt.strftime(dt.now(), "%Y%m%d") + filename_datestamp_ext = "data/outputs/" + datestamp + "_" + filename + ".jpeg" + plt.savefig(filename_datestamp_ext, bbox_inches="tight") + print(f"Wordcloud saved to {filename_datestamp_ext}") + return None diff --git a/src/run_pipeline.py b/src/run_pipeline.py index 12d0242..fe0bba3 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -1,7 +1,11 @@ import pandas as pd from nltk.tokenize import word_tokenize -from src.modules.analysis import extract_feature_count, get_total_feature_count +from src.modules.analysis import ( + extract_feature_count, + get_total_feature_count, + retrieve_named_entities, +) from src.modules.preprocessing import ( initialise_update_stopwords, load_config, @@ -19,15 +23,14 @@ # import mglearn # from sklearn.decomposition import LatentDirichletAllocation -# from importlib import reload -# reload(preprocessing) - def run_pipeline(): """run consultation nlp pipeline""" config = load_config("src/config.yaml") colnames = [f"qu_{number+1}" for number in range(0, 33)] - raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252", names=colnames) + raw_data = pd.read_csv( + config["raw_data_path"], encoding="cp1252", names=colnames, skiprows=1 + ) raw_series = raw_data["qu_11"] # TODO add clean_data parent function lower_series = raw_series.str.lower() @@ -57,7 +60,7 @@ def run_pipeline(): stop_words=stopwords, ) total_features = get_total_feature_count(features) - + retrieve_named_entities(spelling_fixed) print(features, rejoined_words, total_features, impact_of_spell_correction) diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py index 078c75d..a0be45a 100644 --- a/tests/modules/test_analysis.py +++ b/tests/modules/test_analysis.py @@ -2,7 +2,11 @@ from pandas import DataFrame, Series -from src.modules.analysis import extract_feature_count, get_total_feature_count +from src.modules.analysis import ( + extract_feature_count, + get_total_feature_count, + retrieve_named_entities, +) class TestExtractFeatureCount: @@ -29,14 +33,28 @@ def test_ngrams(self): assert all(expected == actual), "Does not handle ngrams" -class testGetTotalFeatureCount: +class TestGetTotalFeatureCount: def test_get_total_feature_count(self): df = DataFrame( [[1, 1, 1, 1, 0], [0, 1, 1, 1, 1]], columns=["elf", "is", "my", "name", "santa"], ) expected = DataFrame( - [1, 2, 2, 2, 1], columns=["elf", "is", "my", "name", "santa"] + [[1, 2, 2, 2, 1]], columns=["elf", "is", "my", "name", "santa"] ) actual = get_total_feature_count(df) assert all(expected == actual), "Does not correctly sum total features" + + +class TestRetrieveNamedEntities: + def test_retrieve_named_entities(self): + test_data = Series( + [ + "The ONS has just released an article on the UK Government's policy.", + "my own care for nothing", + "Hollywood actors now have their own statue", + ] + ) + actual = retrieve_named_entities(test_data) + expected = [["ONS", "the UK Government's"], [], ["Hollywood"]] + assert actual == expected, "Did not successfully retrieve named entities" From 7d5ccec4f93eccf25d0c2d3eb402aca52fcd39c8 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 15:39:26 +0100 Subject: [PATCH 08/31] Fix duplication in code --- src/modules/preprocessing.py | 23 ++++++++++++----------- src/run_pipeline.py | 12 +++++++----- tests/modules/test_preprocessing.py | 19 ++++++++++--------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py index d89581a..ee179a1 100644 --- a/src/modules/preprocessing.py +++ b/src/modules/preprocessing.py @@ -80,27 +80,22 @@ def spellcorrect_series(series: Series, additional_words: dict = {}) -> Series: ------- Series a series with words spelling corrected""" - corrected_series = series.apply( - lambda str: _correct_spelling(str, additional_words) - ) + tb.en.spelling = _update_spelling_words(additional_words) + corrected_series = series.apply(lambda str: _correct_spelling(str)) return corrected_series -def _correct_spelling(string: str, additional_words: dict = {}) -> str: +def _correct_spelling(string: str) -> str: """correct spelling using norvig spell-correct method (it has around 70% accuracy) Parameters ---------- string:str string you want to fix the spelling in - additional_words:dict, default = None - words to add to the textblob dictionary, with associated weights. - higher weights give greater precedence to the weighted word. Returns ------- str string with the spelling fixed""" - tb.en.spelling = _update_spelling_words(additional_words) spelling_fixed = str(tb.TextBlob(string).correct()) return spelling_fixed @@ -122,7 +117,14 @@ def _update_spelling_words(additional_words: dict) -> None: return tb.en.spelling -def remove_punctuation(text: str) -> str: +def remove_punctuation(series: Series) -> Series: + """Remove punctuation from series of strings""" + _initialise_nltk_component("tokenizers/punkt", "punkt") + punct_removed = series.apply(_remove_punctuation_string) + return punct_removed + + +def _remove_punctuation_string(text: str) -> str: """Remove punctuation from string Parameters @@ -135,7 +137,6 @@ def remove_punctuation(text: str) -> str: str text string without punctuation """ - _initialise_nltk_component("tokenizers/punkt", "punkt") new_text = re.sub(string=text, pattern="[{}]".format(string.punctuation), repl="") return new_text @@ -206,7 +207,7 @@ def _initialise_nltk_component(extension: str, download_object: str): None """ username = os.getenv("username") - path = "c:/Users/" + username + "/AppData/Roaming/nltk_data" + extension + path = "C:/Users/" + username + "/AppData/Roaming/nltk_data/" + extension if not os.path.exists(path): nltk.download(download_object) nltk.data.path.append("../local_packages/nltk_data") diff --git a/src/run_pipeline.py b/src/run_pipeline.py index fe0bba3..fd44b86 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -33,14 +33,14 @@ def run_pipeline(): ) raw_series = raw_data["qu_11"] # TODO add clean_data parent function - lower_series = raw_series.str.lower() - without_blank_rows = remove_blank_rows(lower_series) + without_blank_rows = remove_blank_rows(raw_series) spelling_fixed = spellcorrect_series( without_blank_rows, config["buisness_terminology"] ) impact_of_spell_correction = fuzzy_compare_ratio(without_blank_rows, spelling_fixed) + lower_series = spelling_fixed.str.lower() # print_row_by_row(without_blank_rows,spelling_fixed) - no_punctuation_series = spelling_fixed.apply(remove_punctuation) + no_punctuation_series = remove_punctuation(lower_series) word_tokens = no_punctuation_series.apply(word_tokenize) short_tokens = shorten_tokens(word_tokens, config["lemmatize"]) without_stopwords = short_tokens.apply( @@ -60,14 +60,16 @@ def run_pipeline(): stop_words=stopwords, ) total_features = get_total_feature_count(features) - retrieve_named_entities(spelling_fixed) - print(features, rejoined_words, total_features, impact_of_spell_correction) + entities = retrieve_named_entities(without_blank_rows) + + print(impact_of_spell_correction, total_features, entities) # code to execute script from terminal if __name__ == "__main__": run_pipeline() + # lda5 = LatentDirichletAllocation( # n_components=5, learning_method="batch", max_iter=25, random_state=0 # ) diff --git a/tests/modules/test_preprocessing.py b/tests/modules/test_preprocessing.py index 968b6e4..65dbcbc 100644 --- a/tests/modules/test_preprocessing.py +++ b/tests/modules/test_preprocessing.py @@ -9,6 +9,7 @@ from src.modules.preprocessing import ( _correct_spelling, _initialise_nltk_component, + _remove_punctuation_string, _replace_blanks, _update_nltk_stopwords, _update_spelling_words, @@ -104,14 +105,6 @@ def test_spelling_fixed(self): corrected = _correct_spelling(house_str) assert corrected == "I live far away", "spelling not fixed correctly" - def test_word_update(self): - additional_words = {"flar": 1} - house_str = "I live flar away" - corrected = _correct_spelling(house_str, additional_words) - assert ( - corrected == "I live flar away" - ), "spelling word list not correctly updated" - class TestUpdateSpellingWords: def test_update_word_list(self): @@ -123,9 +116,17 @@ def test_update_word_list(self): class TestRemovePunctuation: + def test_remove_punctuation(self): + series = Series(["this is!", "my series?"]) + actual = remove_punctuation(series) + expected = Series(["this is", "my series"]) + assert all(actual == expected), "Remove punctuation not working on series" + + +class TestRemovePunctuationstring: def test_remove_punctuation(self): test_string = "my #$%&()*+,-./:;<=>?@[]^_`{|}~?name" - actual = remove_punctuation(test_string) + actual = _remove_punctuation_string(test_string) expected = "my name" assert actual == expected, "punctuation not removed correctly" From 7156344089584fc5cd658b9a73c555b5a9fd6bac Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 15:49:06 +0100 Subject: [PATCH 09/31] update initialise documentation --- src/modules/preprocessing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py index ee179a1..e253ce7 100644 --- a/src/modules/preprocessing.py +++ b/src/modules/preprocessing.py @@ -1,6 +1,7 @@ import os import re import string +import sys import nltk import numpy as np @@ -210,7 +211,10 @@ def _initialise_nltk_component(extension: str, download_object: str): path = "C:/Users/" + username + "/AppData/Roaming/nltk_data/" + extension if not os.path.exists(path): nltk.download(download_object) - nltk.data.path.append("../local_packages/nltk_data") + if sys.platform.startswith("linux"): + nltk.data.path.append("../usr/share/nltk_data") + else: + nltk.data.path.append("../local_packages/nltk_data") return None From c3ecaf4becdd815b59202f269255d285425a9193 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 15:52:40 +0100 Subject: [PATCH 10/31] new path --- src/modules/preprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py index e253ce7..cac9625 100644 --- a/src/modules/preprocessing.py +++ b/src/modules/preprocessing.py @@ -211,8 +211,9 @@ def _initialise_nltk_component(extension: str, download_object: str): path = "C:/Users/" + username + "/AppData/Roaming/nltk_data/" + extension if not os.path.exists(path): nltk.download(download_object) + # Set path for runs on github actions if sys.platform.startswith("linux"): - nltk.data.path.append("../usr/share/nltk_data") + nltk.data.path.append("../home/runner/nltk_data") else: nltk.data.path.append("../local_packages/nltk_data") return None From eadef671bc7c9960b9b0809be04380b42b94fe94 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:02:02 +0100 Subject: [PATCH 11/31] test ci.yml --- ci.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 ci.yml diff --git a/ci.yml b/ci.yml new file mode 100644 index 0000000..b6ef354 --- /dev/null +++ b/ci.yml @@ -0,0 +1,2 @@ +if [ -f requirements.txt ]; then pip install -r requirements.txt; fi +python -m nltk.downloader punkt stopwords From 847c08fedf89518d72d88f13b2e45ee88e05bc54 Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:05:29 +0100 Subject: [PATCH 12/31] Update CodeCov.yml --- .github/workflows/CodeCov.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CodeCov.yml b/.github/workflows/CodeCov.yml index d7bb499..bcedcd2 100644 --- a/.github/workflows/CodeCov.yml +++ b/.github/workflows/CodeCov.yml @@ -23,7 +23,8 @@ jobs: - name: Generate Report run: | pip install --upgrade pip - pip install -r requirements.txt + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python -m nltk.downloader punkt stopwords pip install coverage pip install coverage[toml] coverage run -m pytest From b38438a51f2707635f6bf3e6389e2f184ea2b6ab Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:07:30 +0100 Subject: [PATCH 13/31] update requirements.txt --- ci.yml | 2 -- requirements.txt | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) delete mode 100644 ci.yml diff --git a/ci.yml b/ci.yml deleted file mode 100644 index b6ef354..0000000 --- a/ci.yml +++ /dev/null @@ -1,2 +0,0 @@ -if [ -f requirements.txt ]; then pip install -r requirements.txt; fi -python -m nltk.downloader punkt stopwords diff --git a/requirements.txt b/requirements.txt index 1a7dcac..c31940f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,21 @@ arrow==1.2.3 binaryornot==0.4.4 +blis==0.7.9 +catalogue==2.0.8 certifi==2023.5.7 cfgv==3.3.1 chardet==5.1.0 charset-normalizer==3.1.0 click==8.1.3 colorama==0.4.6 +confection==0.1.0 contourpy==1.1.0 cookiecutter==2.1.1 cycler==0.11.0 +cymem==2.0.7 distlib==0.3.6 docopt==0.6.2 +en-core-web-sm @ file:///C:/users/daglic/downloads/en_core_web_sm-3.6.0.tar.gz#sha256=7ef2a0090b49aaab02d6eba347186e3d4ff99328334f5504e1da3afe2b3474e0 exceptiongroup==1.1.1 filelock==3.12.2 fonttools==4.40.0 @@ -26,19 +31,24 @@ Jinja2==3.1.2 jinja2-time==0.2.0 joblib==1.2.0 kiwisolver==1.4.4 +langcodes==3.3.0 MarkupSafe==2.1.3 matplotlib==3.7.1 mglearn==0.2.0 +murmurhash==1.0.9 nltk==3.8.1 nodeenv==1.8.0 numpy==1.25.0 packaging==23.1 pandas==2.0.2 +pathy==0.10.2 Pillow==9.5.0 pipreqs==0.4.13 platformdirs==3.5.3 pluggy==1.1.0 pre-commit==3.3.3 +preshed==3.0.8 +pydantic==1.10.11 pyparsing==3.1.0 pyspellchecker==0.7.2 pytest==7.3.2 @@ -53,10 +63,16 @@ scikit-learn==1.2.2 scipy==1.10.1 silpa-common==0.3 six==1.16.0 +smart-open==6.3.0 smmap==5.0.0 soundex==1.1.3 +spacy==3.6.0 +spacy-legacy==3.0.12 +spacy-loggers==1.0.4 +srsly==2.4.6 text-unidecode==1.3 textblob==0.17.1 +thinc==8.1.10 threadpoolctl==3.1.0 tomli==2.0.1 tqdm==4.65.0 @@ -65,5 +81,6 @@ typing_extensions==4.6.3 tzdata==2023.3 urllib3==2.0.3 virtualenv==20.23.0 +wasabi==1.1.2 wordcloud==1.9.2 yarg==0.1.9 From 58305580cf40172f03d49ee48fde06a6d0653f86 Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:11:19 +0100 Subject: [PATCH 14/31] Update CodeCov.yml --- .github/workflows/CodeCov.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CodeCov.yml b/.github/workflows/CodeCov.yml index bcedcd2..d4fb902 100644 --- a/.github/workflows/CodeCov.yml +++ b/.github/workflows/CodeCov.yml @@ -25,6 +25,7 @@ jobs: pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi python -m nltk.downloader punkt stopwords + pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz pip install coverage pip install coverage[toml] coverage run -m pytest From 4f1466a06d66d2212a8b7fae868756fec4022a95 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:11:54 +0100 Subject: [PATCH 15/31] Re-activate tests --- tests/modules/test_preprocessing.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/modules/test_preprocessing.py b/tests/modules/test_preprocessing.py index 65dbcbc..2f3a2a2 100644 --- a/tests/modules/test_preprocessing.py +++ b/tests/modules/test_preprocessing.py @@ -1,5 +1,3 @@ -import sys - import numpy as np import pytest import textblob as tb @@ -140,7 +138,6 @@ def test_stemmer(self): class TestLemmatizer: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_lemmatization(self): word_list = ["house", "houses", "housing"] actual = lemmatizer(word_list) @@ -149,14 +146,12 @@ def test_lemmatization(self): class TestRemoveNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_remove_standard_stopwords(self): tokens = ["my", "name", "is", "elf", "who", "are", "you"] actual = remove_nltk_stopwords(tokens) expected = ["name", "elf"] assert actual == expected, "core stopwords not being removed correctly" - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_remove_additional_stopwords(self): tokens = ["my", "name", "is", "elf", "who", "are", "you"] actual = remove_nltk_stopwords(tokens, ["elf"]) @@ -165,7 +160,6 @@ def test_remove_additional_stopwords(self): class TestInitialiseUpdateStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_add_word_to_stopwords(self): additional_words = ["elf", "santa"] new_stopwords = initialise_update_stopwords(additional_words) @@ -174,7 +168,6 @@ def test_add_word_to_stopwords(self): class TestUpdateNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_add_word_to_stopwords(self): _initialise_nltk_component("corpora/stopwords", "stopwords") stopwords = sw.words("english") From a67b2fca70c912234827d03c4c9516d982677200 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:13:46 +0100 Subject: [PATCH 16/31] update spacy model requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c31940f..bcb8371 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ cycler==0.11.0 cymem==2.0.7 distlib==0.3.6 docopt==0.6.2 -en-core-web-sm @ file:///C:/users/daglic/downloads/en_core_web_sm-3.6.0.tar.gz#sha256=7ef2a0090b49aaab02d6eba347186e3d4ff99328334f5504e1da3afe2b3474e0 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz exceptiongroup==1.1.1 filelock==3.12.2 fonttools==4.40.0 From 42de25079803ee42542dec91dc8ee2bc7e15b935 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:16:27 +0100 Subject: [PATCH 17/31] amend version no --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bcb8371..7821a97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ cycler==0.11.0 cymem==2.0.7 distlib==0.3.6 docopt==0.6.2 -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz exceptiongroup==1.1.1 filelock==3.12.2 fonttools==4.40.0 From 4de3692281196063230a9211a94fb88de2e86112 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:31:47 +0100 Subject: [PATCH 18/31] update initialise nltk functions --- src/modules/analysis.py | 2 +- src/modules/preprocessing.py | 39 +++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/modules/analysis.py b/src/modules/analysis.py index 6ada289..29fac77 100644 --- a/src/modules/analysis.py +++ b/src/modules/analysis.py @@ -82,7 +82,7 @@ def get_total_feature_count(features: DataFrame) -> DataFrame: return total_feature_count -def retrieve_named_entities(series: Series) -> list[list[str]]: +def retrieve_named_entities(series: Series) -> list: """retrieve any named entities from the series Parameters ---------- diff --git a/src/modules/preprocessing.py b/src/modules/preprocessing.py index cac9625..943d35a 100644 --- a/src/modules/preprocessing.py +++ b/src/modules/preprocessing.py @@ -196,7 +196,40 @@ def lemmatizer(tokens: list) -> list: def _initialise_nltk_component(extension: str, download_object: str): - """download nltk component from package + """spliter function to determine which initialisation path to run + Parameters + ---------- + extension: str + the filepath extension leading to where the model is saved + download_object: str + the object to download from nltk + Returns + ------- + None + """ + if sys.platform.startswith("linux"): + _initialise_nltk_linux(download_object) + else: + _initialise_nltk_windows(extension, download_object) + + +def _initialise_nltk_linux(download_object: str) -> None: + """initialise nltk component for linux environment (for github actions) + Parameters + ---------- + download_object: str + nltk object to download + Returns + ------- + None + """ + nltk.download(download_object) + nltk.data.path.append("../home/runner/nltk_data") + return None + + +def _initialise_nltk_windows(extension: str, download_object: str): + """initialise nltk component for a windows environment Parameters ---------- extension: str @@ -211,10 +244,6 @@ def _initialise_nltk_component(extension: str, download_object: str): path = "C:/Users/" + username + "/AppData/Roaming/nltk_data/" + extension if not os.path.exists(path): nltk.download(download_object) - # Set path for runs on github actions - if sys.platform.startswith("linux"): - nltk.data.path.append("../home/runner/nltk_data") - else: nltk.data.path.append("../local_packages/nltk_data") return None From 0c3ccd3078e146e662b4b8136523b2726b43b4bf Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Tue, 11 Jul 2023 16:42:48 +0100 Subject: [PATCH 19/31] block spacy test --- tests/modules/test_analysis.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py index a0be45a..45ec728 100644 --- a/tests/modules/test_analysis.py +++ b/tests/modules/test_analysis.py @@ -1,5 +1,7 @@ +import sys from itertools import repeat +import pytest from pandas import DataFrame, Series from src.modules.analysis import ( @@ -47,6 +49,7 @@ def test_get_total_feature_count(self): class TestRetrieveNamedEntities: + @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Not sure") def test_retrieve_named_entities(self): test_data = Series( [ From 44ff672e64ff4b696ad121c48934ef8ff199971a Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Thu, 13 Jul 2023 09:38:02 +0100 Subject: [PATCH 20/31] Add lda and plot keyword topics --- src/config.yaml | 5 +- src/modules/analysis.py | 30 ++++++- src/modules/visualisation.py | 166 +++++++++++++++++++++++++++++++++-- src/run_pipeline.py | 109 +++-------------------- 4 files changed, 207 insertions(+), 103 deletions(-) diff --git a/src/config.yaml b/src/config.yaml index 56d9967..ff4fc66 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -3,6 +3,7 @@ buisness_terminology: #words to update spelling with associated weight dpm: 1 admin: 1 timeliness: 1 + year: 450 additional_stopwords: #words to filter - "census" - "data" @@ -11,7 +12,7 @@ additional_stopwords: #words to filter lemmatize: True #select False to use Stemmer feature_count: ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1) - min_df: 0.1 #float (proportion) or int (count) - max_df: 1.0 #float (proportion) or int (count) + min_df: 2 #float (proportion) or int (count) + max_df: 0.95 #float (proportion) or int (count) max_features: null #null converts to None, or int value lowercase: True #whether to convert all words to lowercase diff --git a/src/modules/analysis.py b/src/modules/analysis.py index 29fac77..602b118 100644 --- a/src/modules/analysis.py +++ b/src/modules/analysis.py @@ -1,6 +1,8 @@ import spacy from numpy.typing import ArrayLike from pandas import DataFrame, Series +from scipy.sparse._csr import csr_matrix +from sklearn.decomposition import LatentDirichletAllocation from sklearn.feature_extraction.text import CountVectorizer @@ -61,7 +63,7 @@ def extract_feature_count( word_count_df = DataFrame( fitted_vector.toarray(), columns=vectorizer.get_feature_names_out() ) - return word_count_df + return fitted_vector, word_count_df def get_total_feature_count(features: DataFrame) -> DataFrame: @@ -97,3 +99,29 @@ def retrieve_named_entities(series: Series) -> list: for doc in nlp.pipe(series): entities.append([str(ent) for ent in doc.ents]) return entities + + +def latent_dirichlet_allocation( + n_components: int, max_iter: int, fitted_vector: csr_matrix +): + """fit latent direchlet allocation model on fitted vector + Parameters + ---------- + n_components:int + number of components to include in model + max_iter: int + maximum number of passes over the training data + fitted_vector:csr_matrix + fitted vector from CountVectorizer + Returns + ------- + fitted lda model + document_topics + """ + lda = LatentDirichletAllocation( + n_components=10, learning_method="batch", max_iter=25, random_state=179 + ) + + document_topics = lda.fit_transform(fitted_vector) + + return lda, document_topics diff --git a/src/modules/visualisation.py b/src/modules/visualisation.py index 6fc6b5c..ba7d66f 100644 --- a/src/modules/visualisation.py +++ b/src/modules/visualisation.py @@ -1,10 +1,13 @@ +import typing from datetime import datetime as dt import matplotlib.pyplot as plt +from matplotlib.figure import Figure +from sklearn.decomposition import LatentDirichletAllocation from wordcloud import WordCloud -def create_wordcloud(text: str, filename: str = "wordcloud"): +def create_wordcloud(text: str, name: str = "wordcloud") -> None: """generate a wordcloud with the given filename Parameters ---------- @@ -13,13 +16,166 @@ def create_wordcloud(text: str, filename: str = "wordcloud"): filename: str the name and path you want to save the wordcloud to Returns: - prints message to console saying where file is saved + None (message to console on location of file) """ wordcloud = WordCloud().generate(text) + figure = plt.figure(figsize=(5, 10)) plt.imshow(wordcloud, interpolation="bilinear") plt.axis("off") + save_figure(name, figure) + return None + + +def save_figure(name: str, fig: Figure) -> None: + """save figure with datestamp + Parameters + ---------- + name: str + name of the figure + fig + the figure object + Returns + ------- + None (message to console on location of file) + """ datestamp = dt.strftime(dt.now(), "%Y%m%d") - filename_datestamp_ext = "data/outputs/" + datestamp + "_" + filename + ".jpeg" - plt.savefig(filename_datestamp_ext, bbox_inches="tight") - print(f"Wordcloud saved to {filename_datestamp_ext}") + filename = f"data/outputs/{datestamp}_{name}.jpeg" + fig.savefig(filename, bbox_inches="tight") + print(f"{name} plot saved as {filename}") + return None + + +def plot_top_words( + model: LatentDirichletAllocation, + feature_names: list, + n_topics: int, + title: str, + n_top_words: int = 10, + topic_labels: list = None, +) -> None: + """Plot topics by their most frequent words + Parameters + ---------- + model + the lda model components + feature_names:list + a list of the most frequent words (from bag of words model) + n_topics:int + number of topics to include in the chart + title:str + the title for the chart + n_top_words:int, (default = 10) + the number of top words to include in each topic plot + topic_labels:list, (default = None) + a list of labels to override the existing labels + Returns + ------- + None (message to console on location of file) + """ + topic_labels = _generate_topic_labels(n_topics, topic_labels) + labelled_components = dict(zip(topic_labels, model.components_)) + rows, columns = _get_n_columns_and_n_rows(n_topics) + fig, axes = plt.subplots( + rows, columns, figsize=_get_fig_size(columns, rows), sharex=True + ) + axes = axes.flatten() + for number, (topic_label, component) in enumerate(labelled_components.items()): + top_features_ind = component.argsort()[: -n_top_words - 1 : -1] + top_features = [feature_names[i] for i in top_features_ind] + weights = component[top_features_ind] + ax = axes[number] + ax.barh(top_features, weights, height=0.7) + ax.set_title(topic_label, fontdict={"fontsize": 30}) + ax.invert_yaxis() + ax.tick_params(axis="both", which="major", labelsize=20) + for i in "top right left".split(): + ax.spines[i].set_visible(False) + fig.suptitle(title, fontsize=40) + save_figure("lda_top_words", fig) return None + + +def _generate_topic_labels(n_topics: int, topic_labels: list = None) -> list: + """Generate topic labels from n_topics + Parameters + ---------- + n_topics: int + number of topics + topic_labels:list (default=None) + list of topic_labels + Returns + ------- + list + list of topic labels + """ + if topic_labels is None: + topic_labels = [f"Topic_{n}" for n in range(1, n_topics)] + else: + if len(topic_labels) != n_topics: + raise AttributeError("len(topic_labels) does not equal n_topics") + return topic_labels + + +def _get_n_columns_and_n_rows(n_topics: int) -> int: + """calculate the optimal number of rows and columns for n_topics + Parameters + ---------- + n_topics: int + number of topics + Returns + ------- + int + optimal number of columns + int + optimal number of rows + """ + if n_topics <= 0: + raise ValueError("Value must be an integer greater than 0") + if n_topics <= 5: + n_columns = n_topics + n_rows = 1 + else: + factors = [factor for factor in _get_factors(n_topics) if 1 < factor <= 5] + if len(factors) > 0: + n_columns = factors[-1] + n_rows = int(n_topics / n_columns) + else: + factors = [ + factor for factor in _get_factors(n_topics + 1) if 1 < factor <= 5 + ] + n_columns = factors[-1] + n_rows = int((n_topics / n_columns) + 1) + return n_rows, n_columns + + +def _get_factors(x: int) -> list: + """retrieve factors of a given integer (x) + Parameters + ---------- + x:int + integer + Returns + ------- + list + a list of factors of x + """ + return [i for i in range(1, x + 1) if x % i == 0] + + +def _get_fig_size(columns: int, rows: int) -> typing.Tuple[int, int]: + """get figure size from number of columns and rows + Parameters + ---------- + columns:int + number of columns + rows: int + number of rows + Returns + ------- + int + width of fig + int + height of fig""" + width = columns * 6 + height = (rows * 6) + 3 + return (width, height) diff --git a/src/run_pipeline.py b/src/run_pipeline.py index fd44b86..000fe0b 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -4,6 +4,7 @@ from src.modules.analysis import ( extract_feature_count, get_total_feature_count, + latent_dirichlet_allocation, retrieve_named_entities, ) from src.modules.preprocessing import ( @@ -17,11 +18,7 @@ spellcorrect_series, ) from src.modules.quality_checks import fuzzy_compare_ratio # print_row_by_row, -from src.modules.visualisation import create_wordcloud - -# import matplotlib.pyplot as plt -# import mglearn -# from sklearn.decomposition import LatentDirichletAllocation +from src.modules.visualisation import create_wordcloud, plot_top_words def run_pipeline(): @@ -50,7 +47,7 @@ def run_pipeline(): all_text_combined = " ".join(rejoined_words) create_wordcloud(all_text_combined) stopwords = initialise_update_stopwords(config["additional_stopwords"]) - features = extract_feature_count( + fitted_vector, features = extract_feature_count( series=spelling_fixed, ngram_range=config["feature_count"]["ngram_range"], min_df=config["feature_count"]["min_df"], @@ -61,6 +58,17 @@ def run_pipeline(): ) total_features = get_total_feature_count(features) entities = retrieve_named_entities(without_blank_rows) + lda, document_topics = latent_dirichlet_allocation( + n_components=10, max_iter=50, fitted_vector=fitted_vector + ) + plot_top_words( + model=lda, + feature_names=list(features.columns), + n_topics=10, + title="Top words by topic", + n_top_words=10, + topic_labels=None, + ) print(impact_of_spell_correction, total_features, entities) @@ -68,92 +76,3 @@ def run_pipeline(): # code to execute script from terminal if __name__ == "__main__": run_pipeline() - - -# lda5 = LatentDirichletAllocation( -# n_components=5, learning_method="batch", max_iter=25, random_state=0 -# ) -# -# document_topics5 = lda5.fit_transform(coliv_wordsbows) -# -# topics = np.array([0, 1, 2, 3, 4]) -# -# sorting = np.argsort(lda5.components_, axis=1)[:, ::-1] -# feature_names = np.array(vect.get_feature_names()) -# mglearn.tools.print_topics( -# topics=topics, -# feature_names=feature_names, -# sorting=sorting, -# topics_per_chunk=5, -# n_words=10, -# ) -# -# document_topics5 -# -# -# censtranf_respns = nlp_censtranf[ -# "cens_test_1" -# ] -# censtranf_respns = nlp_censtranf.reset_index(drop=True) -# -# -# -# -# def topic_summary( -# topic_number, -# ): -# -# topics = [topic_number] -# mglearn.tools.print_topics( -# topics=topics, -# feature_names=feature_names, -# sorting=sorting, -# topics_per_chunk=5, -# n_words=10, -# ) -# -# responses = np.argsort(document_topics5[:, topic_number])[::-1] -# -# for i in responses[:5]: -# print(coliv_respns[i], ".\n") -# -# -# for i in range(5): -# topic_summary(i) -# -# fig, ax = plt.subplots(1, 1, figsize=(10, 8)) -# topic_names = [ -# "{:>2} ".format(i) + " ".join(words) -# for i, words in enumerate(feature_names[sorting[:, :2]]) -# ] -# -# ax.barh(np.arange(5), np.sum(document_topics5, axis=0)) -# ax.set_yticks(np.arange(5)) -# ax.set_yticklabels(topic_names, ha="left", va="top") -# ax.invert_yaxis() -# ax.set_xlim(0, 300) -# yax = ax.get_yaxis() -# yax.set_tick_params(pad=130) -# plt.tight_layout() -# -# -# topic_labels = [ -# "The first label", -# "The second label", -# "The second label", -# "The third label", -# "The fourth label", -# ] -# -# -# fig, ax = plt.subplots(1, 1, figsize=(10, 8)) -# topic_names = ["{:>2} {}".format(i, label) for i, label in enumerate(topic_labels)] -# -# ax.barh(np.arange(5), np.mean(document_topics5, axis=0)) -# ax.set_yticks(np.arange(5)) -# ax.set_yticklabels(topic_names, ha="right", va="center") -# ax.invert_yaxis() -# ax.set_xlim(0, 0.5) -# yax = ax.get_yaxis() -# yax.set_tick_params(pad=10) -# plt.tight_layout() From 8bec0a4ea884d2bfc854d69a5f0501bc30db3ba6 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Thu, 13 Jul 2023 09:44:16 +0100 Subject: [PATCH 21/31] add type-hints --- src/modules/analysis.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/modules/analysis.py b/src/modules/analysis.py index 602b118..dd1d31d 100644 --- a/src/modules/analysis.py +++ b/src/modules/analysis.py @@ -1,3 +1,5 @@ +import typing + import spacy from numpy.typing import ArrayLike from pandas import DataFrame, Series @@ -14,7 +16,7 @@ def extract_feature_count( lowercase: bool = True, min_df=1, max_df=1.0, -): +) -> typing.Tuple[CountVectorizer, DataFrame]: """create a text feature count dataframe from series Paramaters ---------- @@ -63,7 +65,7 @@ def extract_feature_count( word_count_df = DataFrame( fitted_vector.toarray(), columns=vectorizer.get_feature_names_out() ) - return fitted_vector, word_count_df + return (fitted_vector, word_count_df) def get_total_feature_count(features: DataFrame) -> DataFrame: @@ -103,7 +105,7 @@ def retrieve_named_entities(series: Series) -> list: def latent_dirichlet_allocation( n_components: int, max_iter: int, fitted_vector: csr_matrix -): +) -> LatentDirichletAllocation: """fit latent direchlet allocation model on fitted vector Parameters ---------- @@ -115,13 +117,15 @@ def latent_dirichlet_allocation( fitted vector from CountVectorizer Returns ------- - fitted lda model - document_topics + LatentDirichletAllocation + fitted lda model """ lda = LatentDirichletAllocation( - n_components=10, learning_method="batch", max_iter=25, random_state=179 + n_components=n_components, + learning_method="batch", + max_iter=max_iter, + random_state=179, ) - document_topics = lda.fit_transform(fitted_vector) - - return lda, document_topics + lda.fit(fitted_vector) + return lda From d850567f4c6d06b76122040c3b95d97840bfd552 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Thu, 13 Jul 2023 10:11:26 +0100 Subject: [PATCH 22/31] update config --- src/config.yaml | 32 +++++++++++++++++++------------- src/modules/analysis.py | 6 +++--- src/run_pipeline.py | 14 ++++++++------ 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/config.yaml b/src/config.yaml index ff4fc66..5a40e02 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -1,18 +1,24 @@ -raw_data_path: "data/raw/20230711_consultation_ingest.csv" -buisness_terminology: #words to update spelling with associated weight - dpm: 1 - admin: 1 - timeliness: 1 - year: 450 -additional_stopwords: #words to filter - - "census" - - "data" - - "personal" - - "use" -lemmatize: True #select False to use Stemmer -feature_count: +raw_data_path: "data/raw/20230711_consultation_ingest.csv" #str +buisness_terminology: # dictionary of words to update spelling with associated weight + dpm: 1 #int + admin: 1 #int + timeliness: 1 #int + year: 450 #int +additional_stopwords: #list of words to filter; must be type str + - "census" #str + - "data" #str + - "personal" #str + - "use" #str +lemmatize: True #bool; select False to use Stemmer +feature_count: #dict ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1) min_df: 2 #float (proportion) or int (count) max_df: 0.95 #float (proportion) or int (count) max_features: null #null converts to None, or int value lowercase: True #whether to convert all words to lowercase +lda: #dict + n_topics: 5 #int + n_top_words: 10 #int + max_iter: 25 #int + title: "Topic Summary" #str + topic_labels: null # also takes a list of strings (see additional stopwords ^) diff --git a/src/modules/analysis.py b/src/modules/analysis.py index dd1d31d..558cc8b 100644 --- a/src/modules/analysis.py +++ b/src/modules/analysis.py @@ -104,12 +104,12 @@ def retrieve_named_entities(series: Series) -> list: def latent_dirichlet_allocation( - n_components: int, max_iter: int, fitted_vector: csr_matrix + n_topics: int, max_iter: int, fitted_vector: csr_matrix ) -> LatentDirichletAllocation: """fit latent direchlet allocation model on fitted vector Parameters ---------- - n_components:int + n_topics:int number of components to include in model max_iter: int maximum number of passes over the training data @@ -121,7 +121,7 @@ def latent_dirichlet_allocation( fitted lda model """ lda = LatentDirichletAllocation( - n_components=n_components, + n_components=n_topics, learning_method="batch", max_iter=max_iter, random_state=179, diff --git a/src/run_pipeline.py b/src/run_pipeline.py index 000fe0b..083224b 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -58,16 +58,18 @@ def run_pipeline(): ) total_features = get_total_feature_count(features) entities = retrieve_named_entities(without_blank_rows) - lda, document_topics = latent_dirichlet_allocation( - n_components=10, max_iter=50, fitted_vector=fitted_vector + lda = latent_dirichlet_allocation( + n_topics=config["lda"]["n_topics"], + max_iter=config["lda"]["max_iter"], + fitted_vector=fitted_vector, ) plot_top_words( model=lda, feature_names=list(features.columns), - n_topics=10, - title="Top words by topic", - n_top_words=10, - topic_labels=None, + n_topics=config["lda"]["n_topics"], + title=config["lda"]["title"], + n_top_words=config["lda"]["n_top_words"], + topic_labels=config["lda"]["topic_labels"], ) print(impact_of_spell_correction, total_features, entities) From 905fad77a680fd391bfdf449fb4f3b126c87eae9 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Thu, 13 Jul 2023 10:37:31 +0100 Subject: [PATCH 23/31] lda test --- tests/modules/test_analysis.py | 37 +++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py index 45ec728..d398ef8 100644 --- a/tests/modules/test_analysis.py +++ b/tests/modules/test_analysis.py @@ -1,12 +1,14 @@ -import sys from itertools import repeat -import pytest from pandas import DataFrame, Series +from scipy.sparse._csr import csr_matrix +from sklearn.decomposition import LatentDirichletAllocation +from sklearn.feature_extraction.text import CountVectorizer from src.modules.analysis import ( extract_feature_count, get_total_feature_count, + latent_dirichlet_allocation, retrieve_named_entities, ) @@ -15,25 +17,32 @@ class TestExtractFeatureCount: def test_feature_count(self): data = Series(["My name is elf"]) expected = DataFrame([[1, 1, 1, 1]], columns=("elf", "is", "my", "name")) - actual = extract_feature_count(data) + actual = extract_feature_count(data)[1] assert all(expected == actual), "Does not match expected output" def test_remove_stopwords(self): stopwords = ["is", "my"] data = Series(["My name is elf"]) - actual = extract_feature_count(data, stop_words=stopwords) + actual = extract_feature_count(data, stop_words=stopwords)[1] expected = DataFrame([[1, 1]], columns=("elf", "name")) assert all(expected == actual), "Does not remove stopwords" def test_ngrams(self): data = Series(["My name is elf"]) - actual = extract_feature_count(data, ngram_range=(1, 2)) + actual = extract_feature_count(data, ngram_range=(1, 2))[1] expected = DataFrame( [repeat(1, 7)], columns=["elf", "is", "is elf", "my", "my name", "name", "name is"], ) assert all(expected == actual), "Does not handle ngrams" + def test_get_fitted_vector(self): + data = Series(["My name is elf"]) + actual = extract_feature_count(data)[0] + assert isinstance( + actual, csr_matrix + ), "Does not return a csr_matrix object in position 0" + class TestGetTotalFeatureCount: def test_get_total_feature_count(self): @@ -49,7 +58,6 @@ def test_get_total_feature_count(self): class TestRetrieveNamedEntities: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Not sure") def test_retrieve_named_entities(self): test_data = Series( [ @@ -60,4 +68,19 @@ def test_retrieve_named_entities(self): ) actual = retrieve_named_entities(test_data) expected = [["ONS", "the UK Government's"], [], ["Hollywood"]] - assert actual == expected, "Did not successfully retrieve named entities" + trimmed_actual = [component for component in actual if component != []] + trimmed_expected = [component for component in expected if component != []] + assert ( + trimmed_actual == trimmed_expected + ), "Did not successfully retrieve named entities" + + +class TestLatentDirichletAllocation: + def test_latent_dirichlet_allocation(self): + fitted = CountVectorizer().fit_transform( + Series(["My name is Elf and I like ignoble hats"]) + ) + lda = latent_dirichlet_allocation(10, 10, fitted) + assert isinstance( + lda, LatentDirichletAllocation + ), "function did not return an latent dirichlet allocation object" From a60143a50ff07eed1d19fe233efcb9014378901a Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Thu, 13 Jul 2023 11:32:59 +0100 Subject: [PATCH 24/31] testing earlier version of type extensions --- requirements.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7821a97..26fc40b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ arrow==1.2.3 +asttokens +backcall +backports.functools-lru-cache binaryornot==0.4.4 blis==0.7.9 catalogue==2.0.8 @@ -8,15 +11,21 @@ chardet==5.1.0 charset-normalizer==3.1.0 click==8.1.3 colorama==0.4.6 +comm confection==0.1.0 contourpy==1.1.0 cookiecutter==2.1.1 cycler==0.11.0 cymem==2.0.7 +debugpy +decorator distlib==0.3.6 docopt==0.6.2 -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz +entrypoints exceptiongroup==1.1.1 +executing +fastjsonschema==2.17.1 filelock==3.12.2 fonttools==4.40.0 fuzz==0.1.1 @@ -27,60 +36,93 @@ idna==3.4 imageio==2.31.1 inexactsearch==1.0.2 iniconfig==2.0.0 +ipykernel +ipython +ipython-genutils==0.2.0 +jedi Jinja2==3.1.2 jinja2-time==0.2.0 joblib==1.2.0 +jupyter-client +jupyter-highlight-selected-word==0.2.0 +jupyter_core kiwisolver==1.4.4 langcodes==3.3.0 +Markdown==3.4.3 MarkupSafe==2.1.3 matplotlib==3.7.1 +matplotlib-inline mglearn==0.2.0 murmurhash==1.0.9 +nest-asyncio nltk==3.8.1 nodeenv==1.8.0 numpy==1.25.0 -packaging==23.1 +packaging pandas==2.0.2 +parso pathy==0.10.2 +pickleshare Pillow==9.5.0 pipreqs==0.4.13 -platformdirs==3.5.3 +platformdirs pluggy==1.1.0 pre-commit==3.3.3 preshed==3.0.8 +prompt-toolkit +psutil +pure-eval pydantic==1.10.11 +Pygments pyparsing==3.1.0 pyspellchecker==0.7.2 pytest==7.3.2 python-dateutil==2.8.2 python-slugify==8.0.1 pytz==2023.3 +pywin32==305.1 PyYAML==6.0 +pyzmq==25.1.0 rapidfuzz==3.1.1 regex==2023.6.3 requests==2.31.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.8.10 scikit-learn==1.2.2 scipy==1.10.1 +Send2Trash==1.8.2 silpa-common==0.3 six==1.16.0 smart-open==6.3.0 smmap==5.0.0 +sniffio==1.3.0 soundex==1.1.3 +soupsieve==2.4.1 spacy==3.6.0 spacy-legacy==3.0.12 spacy-loggers==1.0.4 srsly==2.4.6 +stack-data text-unidecode==1.3 textblob==0.17.1 thinc==8.1.10 threadpoolctl==3.1.0 +tinycss2==1.2.1 tomli==2.0.1 +tornado tqdm==4.65.0 +traitlets typer==0.9.0 -typing_extensions==4.6.3 +typing_extensions==4.5.0 tzdata==2023.3 +uri-template==1.3.0 urllib3==2.0.3 virtualenv==20.23.0 wasabi==1.1.2 +wcwidth +webcolors==1.13 +webencodings==0.5.1 +websocket-client==1.6.1 wordcloud==1.9.2 yarg==0.1.9 From bee9fee540116bf0dfa6bb8936f090c61d4a9607 Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Thu, 13 Jul 2023 11:33:34 +0100 Subject: [PATCH 25/31] Update CodeCov.yml --- .github/workflows/CodeCov.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/CodeCov.yml b/.github/workflows/CodeCov.yml index d4fb902..bcedcd2 100644 --- a/.github/workflows/CodeCov.yml +++ b/.github/workflows/CodeCov.yml @@ -25,7 +25,6 @@ jobs: pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi python -m nltk.downloader punkt stopwords - pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz pip install coverage pip install coverage[toml] coverage run -m pytest From 242e00c07a0deb6cc6c671e5472494b2b3f5e627 Mon Sep 17 00:00:00 2001 From: Colin Daglish Date: Thu, 13 Jul 2023 11:48:55 +0100 Subject: [PATCH 26/31] update minimal requirements --- requirements.txt | 122 ++--------------------------------------------- 1 file changed, 4 insertions(+), 118 deletions(-) diff --git a/requirements.txt b/requirements.txt index 26fc40b..4e4472c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,128 +1,14 @@ -arrow==1.2.3 -asttokens -backcall -backports.functools-lru-cache -binaryornot==0.4.4 -blis==0.7.9 -catalogue==2.0.8 -certifi==2023.5.7 -cfgv==3.3.1 -chardet==5.1.0 -charset-normalizer==3.1.0 -click==8.1.3 -colorama==0.4.6 -comm -confection==0.1.0 -contourpy==1.1.0 -cookiecutter==2.1.1 -cycler==0.11.0 -cymem==2.0.7 -debugpy -decorator -distlib==0.3.6 -docopt==0.6.2 -en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz -entrypoints -exceptiongroup==1.1.1 -executing -fastjsonschema==2.17.1 -filelock==3.12.2 -fonttools==4.40.0 -fuzz==0.1.1 -gitdb==4.0.10 -GitPython==3.1.31 -identify==2.5.24 -idna==3.4 -imageio==2.31.1 -inexactsearch==1.0.2 -iniconfig==2.0.0 -ipykernel -ipython -ipython-genutils==0.2.0 -jedi -Jinja2==3.1.2 -jinja2-time==0.2.0 -joblib==1.2.0 -jupyter-client -jupyter-highlight-selected-word==0.2.0 -jupyter_core -kiwisolver==1.4.4 -langcodes==3.3.0 -Markdown==3.4.3 -MarkupSafe==2.1.3 matplotlib==3.7.1 -matplotlib-inline -mglearn==0.2.0 -murmurhash==1.0.9 -nest-asyncio nltk==3.8.1 -nodeenv==1.8.0 numpy==1.25.0 -packaging pandas==2.0.2 -parso -pathy==0.10.2 -pickleshare -Pillow==9.5.0 -pipreqs==0.4.13 -platformdirs -pluggy==1.1.0 -pre-commit==3.3.3 -preshed==3.0.8 -prompt-toolkit -psutil -pure-eval -pydantic==1.10.11 -Pygments -pyparsing==3.1.0 -pyspellchecker==0.7.2 pytest==7.3.2 -python-dateutil==2.8.2 -python-slugify==8.0.1 -pytz==2023.3 -pywin32==305.1 PyYAML==6.0 -pyzmq==25.1.0 +PyYAML==6.0 rapidfuzz==3.1.1 -regex==2023.6.3 -requests==2.31.0 -rfc3339-validator==0.1.4 -rfc3986-validator==0.1.1 -rpds-py==0.8.10 -scikit-learn==1.2.2 -scipy==1.10.1 -Send2Trash==1.8.2 -silpa-common==0.3 -six==1.16.0 -smart-open==6.3.0 -smmap==5.0.0 -sniffio==1.3.0 -soundex==1.1.3 -soupsieve==2.4.1 +scikit_learn==1.2.2 +scipy==1.11.1 +setuptools==67.6.1 spacy==3.6.0 -spacy-legacy==3.0.12 -spacy-loggers==1.0.4 -srsly==2.4.6 -stack-data -text-unidecode==1.3 textblob==0.17.1 -thinc==8.1.10 -threadpoolctl==3.1.0 -tinycss2==1.2.1 -tomli==2.0.1 -tornado -tqdm==4.65.0 -traitlets -typer==0.9.0 -typing_extensions==4.5.0 -tzdata==2023.3 -uri-template==1.3.0 -urllib3==2.0.3 -virtualenv==20.23.0 -wasabi==1.1.2 -wcwidth -webcolors==1.13 -webencodings==0.5.1 -websocket-client==1.6.1 wordcloud==1.9.2 -yarg==0.1.9 From 2c8c20dd59b2a46077ecb888e6202eb8a8532b2f Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Thu, 13 Jul 2023 11:51:46 +0100 Subject: [PATCH 27/31] Update CodeCov.yml --- .github/workflows/CodeCov.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CodeCov.yml b/.github/workflows/CodeCov.yml index bcedcd2..2ca40d6 100644 --- a/.github/workflows/CodeCov.yml +++ b/.github/workflows/CodeCov.yml @@ -24,6 +24,7 @@ jobs: run: | pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz python -m nltk.downloader punkt stopwords pip install coverage pip install coverage[toml] From 4c18d049e0bee3bdb65b0414a7d822eed43229d2 Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Thu, 13 Jul 2023 11:55:46 +0100 Subject: [PATCH 28/31] Update CodeCov.yml --- .github/workflows/CodeCov.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CodeCov.yml b/.github/workflows/CodeCov.yml index 2ca40d6..20b0445 100644 --- a/.github/workflows/CodeCov.yml +++ b/.github/workflows/CodeCov.yml @@ -20,7 +20,7 @@ jobs: python-version: 3.9 cache: 'pip' # caching pip dependencies - - name: Generate Report + - name: Install packages run: | pip install --upgrade pip if [ -f requirements.txt ]; then pip install -r requirements.txt; fi @@ -28,8 +28,13 @@ jobs: python -m nltk.downloader punkt stopwords pip install coverage pip install coverage[toml] + + - name: Run Unit Tests + run: | coverage run -m pytest + + - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 with: From a3d4cebe99d04dc51b3aa49ff7cd5a21101d7fa3 Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Thu, 13 Jul 2023 11:57:11 +0100 Subject: [PATCH 29/31] Add skip test for retrieve named entitites --- tests/modules/test_analysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py index d398ef8..82a86a2 100644 --- a/tests/modules/test_analysis.py +++ b/tests/modules/test_analysis.py @@ -58,6 +58,7 @@ def test_get_total_feature_count(self): class TestRetrieveNamedEntities: + @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Unknown error during CI") def test_retrieve_named_entities(self): test_data = Series( [ From 34072b12ee1802c360f98c274c04c60dcc5d0f02 Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Thu, 13 Jul 2023 12:00:00 +0100 Subject: [PATCH 30/31] add import --- tests/modules/test_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py index 82a86a2..86b090c 100644 --- a/tests/modules/test_analysis.py +++ b/tests/modules/test_analysis.py @@ -1,5 +1,5 @@ from itertools import repeat - +import pytest from pandas import DataFrame, Series from scipy.sparse._csr import csr_matrix from sklearn.decomposition import LatentDirichletAllocation From 6a168c6a938e6175433eacb25592b28e638065b0 Mon Sep 17 00:00:00 2001 From: Colin Daglish <87810570+ColinDaglish@users.noreply.github.com> Date: Thu, 13 Jul 2023 12:01:44 +0100 Subject: [PATCH 31/31] add import sys --- tests/modules/test_analysis.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py index 86b090c..0dd16b1 100644 --- a/tests/modules/test_analysis.py +++ b/tests/modules/test_analysis.py @@ -1,5 +1,6 @@ from itertools import repeat import pytest +import sys from pandas import DataFrame, Series from scipy.sparse._csr import csr_matrix from sklearn.decomposition import LatentDirichletAllocation