
Commit

Plug in real data feed
ColinDaglish committed Jul 10, 2023
1 parent 3f31c81 commit 8bf8b5a
Showing 2 changed files with 25 additions and 28 deletions.
6 changes: 3 additions & 3 deletions src/config.yaml
@@ -1,15 +1,15 @@
-raw_data_path: "data/raw/2023_consultation_mock_data.csv"
+raw_data_path: "data/raw/20230710_consultation_ingest.csv"
 buisness_terminology: #words to update spelling with associated weight
   dpm: 1
-  admin: 1 #needs higher weight to override amin -> main correction
+  admin: 1
   timeliness: 1
 additional_stopwords: #words to filter
   - "census"
   - "data"
 lemmatize: True #select False to use Stemmer
 feature_count:
   ngram_range: !!python/tuple [1,2] #tuple range; defaults to unigram (1,1)
-  min_df: 0.2 #float (proportion) or int (count)
+  min_df: 0.1 #float (proportion) or int (count)
   max_df: 1.0 #float (proportion) or int (count)
   max_features: null #null converts to None, or int value
   lowercase: True #whether to convert all words to lowercase
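
Note that the ngram_range entry relies on PyYAML's !!python/tuple tag, which the plain safe_load loader rejects. A minimal sketch of what load_config presumably does (the loader choice is an assumption, not something this commit confirms):

import yaml

def load_config(path: str) -> dict:
    # UnsafeLoader permits python-specific tags such as !!python/tuple;
    # yaml.safe_load would raise a ConstructorError on this config.
    # The max_features: null entry arrives as Python None automatically.
    with open(path) as file:
        return yaml.load(file, Loader=yaml.UnsafeLoader)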
47 changes: 22 additions & 25 deletions src/run_pipeline.py
@@ -1,18 +1,18 @@
 import pandas as pd
 from nltk.tokenize import word_tokenize
 from sklearn.feature_extraction.text import CountVectorizer

-from src.processing.preprocessing import ( # stemmer,
-    correct_spelling,
+from src.processing.preprocessing import (
     extract_feature_count,
     fuzzy_compare_ratio,
+    get_total_feature_count,
+    initialise_update_stopwords,
-    lemmatizer,
     load_config,
     rejoin_tokens,
     remove_blank_rows,
     remove_nltk_stopwords,
     remove_punctuation,
+    shorten_tokens,
+    spellcorrect_series,
 )
 from src.processing.visualisation import create_wordcloud # print_row_by_row,
@@ -29,42 +29,39 @@
 def run_pipeline():
     """run consultation nlp pipeline"""
     config = load_config("src/config.yaml")
-    raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252")
-    raw_series = raw_data["qu_3"]
+    colnames = [f"qu_{number+1}" for number in range(0, 33)]
+    raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252", names=colnames)
+    raw_series = raw_data["qu_11"]
     # TODO add clean_data parent function
     lower_series = raw_series.str.lower()
     without_blank_rows = remove_blank_rows(lower_series)
-    spelling_fixed = without_blank_rows.apply(
-        correct_spelling, config["business_terminology"]
+    spelling_fixed = spellcorrect_series(
+        without_blank_rows, config["buisness_terminology"]
     )
     impact_of_spell_correction = fuzzy_compare_ratio(without_blank_rows, spelling_fixed)
     # TODO consider whether there are words we need to fix manually? i.e timliness
     # print_row_by_row(without_blank_rows,spelling_fixed)
     no_punctuation_series = spelling_fixed.apply(remove_punctuation)
     word_tokens = no_punctuation_series.apply(word_tokenize)
-    # stemmed_tokens = word_tokens.apply(stemmer)
-    lemmatized_tokens = word_tokens.apply(lemmatizer)
-    without_stopwords = lemmatized_tokens.apply(
+    short_tokens = shorten_tokens(word_tokens, config["lemmatize"])
+    without_stopwords = short_tokens.apply(
         lambda x: remove_nltk_stopwords(x, config["additional_stopwords"])
     )
     rejoined_words = without_stopwords.apply(rejoin_tokens)
-    text = " ".join(rejoined_words)
-    create_wordcloud(text)
-
-    # just printing to overcome qa aspect
-    print(rejoined_words, impact_of_spell_correction)
-
-    """#Topic Modelling"""
+    all_text_combined = " ".join(rejoined_words)
+    create_wordcloud(all_text_combined)
+    stopwords = initialise_update_stopwords(config["additional_stopwords"])
     features = extract_feature_count(
-        without_blank_rows, ngram_range=(1, 2), min_df=0.2, stop_words=stopwords
+        series=spelling_fixed,
+        ngram_range=config["feature_count"]["ngram_range"],
+        min_df=config["feature_count"]["min_df"],
+        max_df=config["feature_count"]["max_df"],
+        max_features=config["feature_count"]["max_features"],
+        lowercase=config["feature_count"]["lowercase"],
+        stop_words=stopwords,
     )
-    print(features)
-
-    vect = CountVectorizer(max_features=5)
-    coliv_wordsbows = vect.fit(raw_series)
+    total_features = get_total_feature_count(features)

-    print(coliv_wordsbows.vocabulary_)
+    print(features, rejoined_words, total_features, impact_of_spell_correction)


 # lda5 = LatentDirichletAllocation(
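
The new keyword-argument call maps the feature_count block of config.yaml straight onto scikit-learn's CountVectorizer parameters. A minimal sketch of the two feature-count helpers as their call sites suggest (signatures and return types are assumptions about src/processing/preprocessing.py, not the repository's actual code):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def extract_feature_count(series: pd.Series, **vectorizer_kwargs) -> pd.DataFrame:
    # Fit a bag-of-words model and return one count column per n-gram feature.
    vectorizer = CountVectorizer(**vectorizer_kwargs)
    counts = vectorizer.fit_transform(series)
    return pd.DataFrame(counts.toarray(), columns=vectorizer.get_feature_names_out())

def get_total_feature_count(features: pd.DataFrame) -> pd.DataFrame:
    # Collapse the per-response counts into a single total per feature.
    return features.sum().to_frame(name="total").T

This also explains why the ad-hoc CountVectorizer(max_features=5) block could be deleted: the vectorizer is now configured once, from config.yaml, inside extract_feature_count.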

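fuzzy_compare_ratio quantifies how much the spell correction changed the text by scoring each original response against its corrected version. A standard-library sketch of that idea (the real helper may well use a dedicated fuzzy-matching package rather than difflib):

from difflib import SequenceMatcher

import pandas as pd

def fuzzy_compare_ratio(base: pd.Series, comparison: pd.Series) -> pd.Series:
    # 1.0 means the correction left the response untouched;
    # lower values indicate heavier rewriting.
    return pd.Series(
        [SequenceMatcher(None, a, b).ratio() for a, b in zip(base, comparison)]
    )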