diff --git a/.github/workflows/CodeCov.yml b/.github/workflows/CodeCov.yml index d7bb499..20b0445 100644 --- a/.github/workflows/CodeCov.yml +++ b/.github/workflows/CodeCov.yml @@ -20,14 +20,21 @@ jobs: python-version: 3.9 cache: 'pip' # caching pip dependencies - - name: Generate Report + - name: Install packages run: | pip install --upgrade pip - pip install -r requirements.txt + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz + python -m nltk.downloader punkt stopwords pip install coverage pip install coverage[toml] + + - name: Run Unit Tests + run: | coverage run -m pytest + + - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 with: diff --git a/requirements.txt b/requirements.txt index 1a7dcac..4e4472c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,69 +1,14 @@ -arrow==1.2.3 -binaryornot==0.4.4 -certifi==2023.5.7 -cfgv==3.3.1 -chardet==5.1.0 -charset-normalizer==3.1.0 -click==8.1.3 -colorama==0.4.6 -contourpy==1.1.0 -cookiecutter==2.1.1 -cycler==0.11.0 -distlib==0.3.6 -docopt==0.6.2 -exceptiongroup==1.1.1 -filelock==3.12.2 -fonttools==4.40.0 -fuzz==0.1.1 -gitdb==4.0.10 -GitPython==3.1.31 -identify==2.5.24 -idna==3.4 -imageio==2.31.1 -inexactsearch==1.0.2 -iniconfig==2.0.0 -Jinja2==3.1.2 -jinja2-time==0.2.0 -joblib==1.2.0 -kiwisolver==1.4.4 -MarkupSafe==2.1.3 matplotlib==3.7.1 -mglearn==0.2.0 nltk==3.8.1 -nodeenv==1.8.0 numpy==1.25.0 -packaging==23.1 pandas==2.0.2 -Pillow==9.5.0 -pipreqs==0.4.13 -platformdirs==3.5.3 -pluggy==1.1.0 -pre-commit==3.3.3 -pyparsing==3.1.0 -pyspellchecker==0.7.2 pytest==7.3.2 -python-dateutil==2.8.2 -python-slugify==8.0.1 -pytz==2023.3 +PyYAML==6.0 PyYAML==6.0 rapidfuzz==3.1.1 -regex==2023.6.3 -requests==2.31.0 -scikit-learn==1.2.2 -scipy==1.10.1 -silpa-common==0.3 -six==1.16.0 -smmap==5.0.0 -soundex==1.1.3 -text-unidecode==1.3 +scikit_learn==1.2.2 +scipy==1.11.1 +setuptools==67.6.1 +spacy==3.6.0 textblob==0.17.1 -threadpoolctl==3.1.0 -tomli==2.0.1 -tqdm==4.65.0 -typer==0.9.0 -typing_extensions==4.6.3 -tzdata==2023.3 -urllib3==2.0.3 -virtualenv==20.23.0 wordcloud==1.9.2 -yarg==0.1.9 diff --git a/src/config.yaml b/src/config.yaml index 99e73c2..5a40e02 100644 --- a/src/config.yaml +++ b/src/config.yaml @@ -1,8 +1,24 @@ -raw_data_path: "data/raw/2023_consultation_mock_data.csv" -business_terminology: - - 'dpm' - - 'admin' - - 'timeliness' -additional_stopwords: - - "census" - - "data" +raw_data_path: "data/raw/20230711_consultation_ingest.csv" #str +buisness_terminology: # dictionary of words to update spelling with associated weight + dpm: 1 #int + admin: 1 #int + timeliness: 1 #int + year: 450 #int +additional_stopwords: #list of words to filter; must be type str + - "census" #str + - "data" #str + - "personal" #str + - "use" #str +lemmatize: True #bool; select False to use Stemmer +feature_count: #dict + ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1) + min_df: 2 #float (proportion) or int (count) + max_df: 0.95 #float (proportion) or int (count) + max_features: null #null converts to None, or int value + lowercase: True #whether to convert all words to lowercase +lda: #dict + n_topics: 5 #int + n_top_words: 10 #int + max_iter: 25 #int + title: "Topic Summary" #str + topic_labels: null # also takes a list of strings (see additional stopwords ^) diff --git a/src/processing/__init__.py b/src/modules/__init__.py similarity index 100% rename from 
src/processing/__init__.py rename to src/modules/__init__.py diff --git a/src/modules/analysis.py b/src/modules/analysis.py new file mode 100644 index 0000000..558cc8b --- /dev/null +++ b/src/modules/analysis.py @@ -0,0 +1,131 @@ +import typing + +import spacy +from numpy.typing import ArrayLike +from pandas import DataFrame, Series +from scipy.sparse._csr import csr_matrix +from sklearn.decomposition import LatentDirichletAllocation +from sklearn.feature_extraction.text import CountVectorizer + + +def extract_feature_count( + series: Series, + max_features: int = None, + ngram_range: tuple[float, float] = (1, 1), + stop_words: ArrayLike = None, + lowercase: bool = True, + min_df=1, + max_df=1.0, +) -> typing.Tuple[CountVectorizer, DataFrame]: + """create a text feature count dataframe from series + Paramaters + ---------- + series: Series + Series of text strings + max_features: int, default = None + If not None, build a vocabulary that only consider the top max_features + ordered by term frequency across the corpus. Otherwise, all features are used. + ngram_range: tuple (min_n, max_n), default=(1, 1) + The lower and upper boundary of the range of n-values for different word n-grams + or char n-grams to be extracted. All values of n such such that + min_n <= n <= max_n will be used. + stop_words: list, default=None + list of stopwords to remove from text strings + lowercase: bool, default = True + convert all characters to lowercase before tokenizing + min_df: float or int, default = 1 + When building the vocabulary ignore terms that have a document frequency + strictly lower than the given threshold. This value is also called cut-off + in the literature. If float, the parameter represents a proportion of + documents, integer absolute counts. + This parameter is ignored if vocabulary is not None. + max_df: float or int, default = 1.0 + When building the vocabulary ignore terms that have a document frequency + strictly higher than the given threshold (corpus-specific stop words). + If float, the parameter represents a proportion of documents, integer + absolute counts. This parameter is ignored if vocabulary is not None. 
+ Returns + ------- + DataFrame + A dataframe of text feature counts, displaying the number of times a word + appears in each element of the input series + """ + + vectorizer = CountVectorizer( + max_features=max_features, + ngram_range=ngram_range, + stop_words=stop_words, + lowercase=lowercase, + min_df=min_df, + max_df=max_df, + ) + + fitted_vector = vectorizer.fit_transform(series) + + word_count_df = DataFrame( + fitted_vector.toarray(), columns=vectorizer.get_feature_names_out() + ) + return (fitted_vector, word_count_df) + + +def get_total_feature_count(features: DataFrame) -> DataFrame: + """sum across features to get total number of times word was used + Parameters + ---------- + features: DataFrame + A dataframe of the features with each row corrosponding to a deconstructed + string + Returns + ------- + DataFrame + A dataframe of the total number of times each word is used across all + strings""" + total_feature_count = DataFrame() + for column in features.columns: + total_feature_count[column] = [features[column].sum()] + return total_feature_count + + +def retrieve_named_entities(series: Series) -> list: + """retrieve any named entities from the series + Parameters + ---------- + series:Series + A series of text strings to analyse for named entities + Returns + ------- + list[list[str]] + a list of lists containing strings for each named entitity""" + nlp = spacy.load("en_core_web_sm") + entities = [] + for doc in nlp.pipe(series): + entities.append([str(ent) for ent in doc.ents]) + return entities + + +def latent_dirichlet_allocation( + n_topics: int, max_iter: int, fitted_vector: csr_matrix +) -> LatentDirichletAllocation: + """fit latent direchlet allocation model on fitted vector + Parameters + ---------- + n_topics:int + number of components to include in model + max_iter: int + maximum number of passes over the training data + fitted_vector:csr_matrix + fitted vector from CountVectorizer + Returns + ------- + LatentDirichletAllocation + fitted lda model + """ + lda = LatentDirichletAllocation( + n_components=n_topics, + learning_method="batch", + max_iter=max_iter, + random_state=179, + ) + + lda.fit(fitted_vector) + return lda diff --git a/src/processing/preprocessing.py b/src/modules/preprocessing.py similarity index 54% rename from src/processing/preprocessing.py rename to src/modules/preprocessing.py index b366011..943d35a 100644 --- a/src/processing/preprocessing.py +++ b/src/modules/preprocessing.py @@ -1,6 +1,7 @@ import os import re import string +import sys import nltk import numpy as np @@ -8,8 +9,7 @@ import yaml from nltk.corpus import stopwords as sw from nltk.stem import PorterStemmer, WordNetLemmatizer -from pandas.core.series import Series -from rapidfuzz.fuzz import ratio +from pandas import Series def load_config(filepath: str) -> dict: @@ -30,7 +30,7 @@ def load_config(filepath: str) -> dict: raise TypeError("filepath must be a string") with open(filepath, "r") as file: - config = yaml.safe_load(file) + config = yaml.load(file, Loader=yaml.Loader) return config @@ -69,7 +69,24 @@ def _replace_blanks(series: Series) -> Series: return blanks_replaced -def correct_spelling(string: str, additional_words: list = []) -> str: +def spellcorrect_series(series: Series, additional_words: dict = {}) -> Series: + """fix spelling across series using the norvig spell-correct method + Parameters + ---------- + series: Series + the series of text strings you want to pass your spell checker on + additional_words:dict + a dictionary of words and weights for each word + 
Returns + ------- + Series + a series with words spelling corrected""" + tb.en.spelling = _update_spelling_words(additional_words) + corrected_series = series.apply(lambda str: _correct_spelling(str)) + return corrected_series + + +def _correct_spelling(string: str) -> str: """correct spelling using norvig spell-correct method (it has around 70% accuracy) Parameters @@ -80,46 +97,35 @@ def correct_spelling(string: str, additional_words: list = []) -> str: ------- str string with the spelling fixed""" - _update_spelling_words(additional_words) spelling_fixed = str(tb.TextBlob(string).correct()) return spelling_fixed -def _update_spelling_words(additional_words: list) -> None: +def _update_spelling_words(additional_words: dict) -> None: """update word in the textblob library with commonly used business word Parameters ---------- - additional_words:list - words to add to the textblob dictionary + additional_words:dict + words to add to the textblob dictionary, with associated weights. + higher weights give greater precedence to the weighted word. Returns ------- - None + dict + a dictionary of words and updated weights """ - for word in additional_words: - tb.en.spelling.update({word: 1}) - tb.en.spelling - return None + for word, weight in additional_words.items(): + tb.en.spelling.update({word: weight}) + return tb.en.spelling -def fuzzy_compare_ratio(base: Series, comparison: Series) -> Series: - """compare the base series to the comparison series to get - a similarity ratio between strings in the same column - Parameters - ---------- - base: Series - the base series for comparison - comparison: Series - the series you want to compare against - Returns - ------- - Series - a series of ratios (type:float) with scores closer to 100 - indicating complete match""" - fuzzy_ratio = Series(map(ratio, base, comparison)) - return fuzzy_ratio +def remove_punctuation(series: Series) -> Series: + """Remove punctuation from series of strings""" + _initialise_nltk_component("tokenizers/punkt", "punkt") + punct_removed = series.apply(_remove_punctuation_string) + return punct_removed -def remove_punctuation(text: str) -> str: +def _remove_punctuation_string(text: str) -> str: """Remove punctuation from string Parameters @@ -136,6 +142,21 @@ def remove_punctuation(text: str) -> str: return new_text +def shorten_tokens(word_tokens: list, lemmatize: bool = True) -> list: + """Shorten tokens to root words + Parameters + ---------- + word_tokens:list + list of word tokens to shorten + lemmatize: bool, default = True + whether to use lemmatizer or revert back to False (stemmer)""" + if lemmatize: + short_tokens = word_tokens.apply(lemmatizer) + else: + short_tokens = word_tokens.apply(stemmer) + return short_tokens + + def stemmer(tokens: list) -> list: """Stem works to their root form (e.g. 
flying -> fli, Beautiful -> Beauti) @@ -168,11 +189,65 @@ def lemmatizer(tokens: list) -> list: lemmatized_tokens list of simplified word groupings """ + _initialise_nltk_component("corpora/wordnet.zip", "wordnet") lemmatizer = WordNetLemmatizer() lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens] return lemmatized_tokens +def _initialise_nltk_component(extension: str, download_object: str): + """spliter function to determine which initialisation path to run + Parameters + ---------- + extension: str + the filepath extension leading to where the model is saved + download_object: str + the object to download from nltk + Returns + ------- + None + """ + if sys.platform.startswith("linux"): + _initialise_nltk_linux(download_object) + else: + _initialise_nltk_windows(extension, download_object) + + +def _initialise_nltk_linux(download_object: str) -> None: + """initialise nltk component for linux environment (for github actions) + Parameters + ---------- + download_object: str + nltk object to download + Returns + ------- + None + """ + nltk.download(download_object) + nltk.data.path.append("../home/runner/nltk_data") + return None + + +def _initialise_nltk_windows(extension: str, download_object: str): + """initialise nltk component for a windows environment + Parameters + ---------- + extension: str + the filepath extension leading to where the model is saved + download_object: str + the object to download from nltk + Returns + ------- + None + """ + username = os.getenv("username") + path = "C:/Users/" + username + "/AppData/Roaming/nltk_data/" + extension + if not os.path.exists(path): + nltk.download(download_object) + nltk.data.path.append("../local_packages/nltk_data") + return None + + def remove_nltk_stopwords(tokens: list, additional_stopwords: list = []) -> list: """remove stopwords from series @@ -187,27 +262,27 @@ def remove_nltk_stopwords(tokens: list, additional_stopwords: list = []) -> list list token list without stopwords """ - stopwords = _initialise_nltk_stopwords() - updated_stopwords = _update_nltk_stopwords(stopwords, additional_stopwords) - without_stopwords = [item for item in tokens if item not in updated_stopwords] + stopwords = initialise_update_stopwords(additional_stopwords) + without_stopwords = [item for item in tokens if item not in stopwords] return without_stopwords -def _initialise_nltk_stopwords() -> list: - """fetch nltk stopwords from corpora - +def initialise_update_stopwords(additional_stopwords: list = None) -> list: + """initialise and update stopwords, ise this for efficient retrieval of + stopwords, rather than calling both functions. 
+ Parameters + ---------- + additional_stopwords:list + new words to add to the words to remove list Returns ------- list - list of nltk stopwords + a list of words to remove from corpus """ - username = os.getenv("username") - path = "c:/Users/" + username + "/AppData/Roaming/nltk_data/corpora/stopwords" - if not os.path.exists(path): - nltk.download("stopwords") - nltk.data.path.append("../local_packages/nltk_data") + _initialise_nltk_component("corpora/stopwords", "stopwords") stopwords = sw.words("english") - return stopwords + updated_stopwords = _update_nltk_stopwords(stopwords, additional_stopwords) + return updated_stopwords def _update_nltk_stopwords(stopwords: list, additional_stopwords: list): diff --git a/src/modules/quality_checks.py b/src/modules/quality_checks.py new file mode 100644 index 0000000..4909c36 --- /dev/null +++ b/src/modules/quality_checks.py @@ -0,0 +1,38 @@ +from pandas import Series +from rapidfuzz.fuzz import ratio + + +def fuzzy_compare_ratio(base: Series, comparison: Series) -> Series: + """compare the base series to the comparison series to get + a similarity ratio between strings in the same column + Parameters + ---------- + base: Series + the base series for comparison + comparison: Series + the series you want to compare against + Returns + ------- + Series + a series of ratios (type:float) with scores closer to 100 + indicating complete match""" + fuzzy_ratio = Series(map(ratio, base, comparison)) + return fuzzy_ratio + + +def print_row_by_row(base: Series, comparison: Series) -> None: + """print each pair of words row by row + Parameters + ---------- + base: Series + the base series for comparison + comparison: Series + the series you want to compare against + Returns + ------- + None + """ + for i in base.index: + print(base[i]) + print(comparison[i]) + return None diff --git a/src/modules/visualisation.py b/src/modules/visualisation.py new file mode 100644 index 0000000..ba7d66f --- /dev/null +++ b/src/modules/visualisation.py @@ -0,0 +1,181 @@ +import typing +from datetime import datetime as dt + +import matplotlib.pyplot as plt +from matplotlib.figure import Figure +from sklearn.decomposition import LatentDirichletAllocation +from wordcloud import WordCloud + + +def create_wordcloud(text: str, name: str = "wordcloud") -> None: + """generate a wordcloud with the given filename + Parameters + ---------- + text: str + text for wordcloud + filename: str + the name and path you want to save the wordcloud to + Returns: + None (message to console on location of file) + """ + wordcloud = WordCloud().generate(text) + figure = plt.figure(figsize=(5, 10)) + plt.imshow(wordcloud, interpolation="bilinear") + plt.axis("off") + save_figure(name, figure) + return None + + +def save_figure(name: str, fig: Figure) -> None: + """save figure with datestamp + Parameters + ---------- + name: str + name of the figure + fig + the figure object + Returns + ------- + None (message to console on location of file) + """ + datestamp = dt.strftime(dt.now(), "%Y%m%d") + filename = f"data/outputs/{datestamp}_{name}.jpeg" + fig.savefig(filename, bbox_inches="tight") + print(f"{name} plot saved as {filename}") + return None + + +def plot_top_words( + model: LatentDirichletAllocation, + feature_names: list, + n_topics: int, + title: str, + n_top_words: int = 10, + topic_labels: list = None, +) -> None: + """Plot topics by their most frequent words + Parameters + ---------- + model + the lda model components + feature_names:list + a list of the most frequent words (from bag of 
words model) + n_topics:int + number of topics to include in the chart + title:str + the title for the chart + n_top_words:int, (default = 10) + the number of top words to include in each topic plot + topic_labels:list, (default = None) + a list of labels to override the existing labels + Returns + ------- + None (message to console on location of file) + """ + topic_labels = _generate_topic_labels(n_topics, topic_labels) + labelled_components = dict(zip(topic_labels, model.components_)) + rows, columns = _get_n_columns_and_n_rows(n_topics) + fig, axes = plt.subplots( + rows, columns, figsize=_get_fig_size(columns, rows), sharex=True + ) + axes = axes.flatten() + for number, (topic_label, component) in enumerate(labelled_components.items()): + top_features_ind = component.argsort()[: -n_top_words - 1 : -1] + top_features = [feature_names[i] for i in top_features_ind] + weights = component[top_features_ind] + ax = axes[number] + ax.barh(top_features, weights, height=0.7) + ax.set_title(topic_label, fontdict={"fontsize": 30}) + ax.invert_yaxis() + ax.tick_params(axis="both", which="major", labelsize=20) + for i in "top right left".split(): + ax.spines[i].set_visible(False) + fig.suptitle(title, fontsize=40) + save_figure("lda_top_words", fig) + return None + + +def _generate_topic_labels(n_topics: int, topic_labels: list = None) -> list: + """Generate topic labels from n_topics + Parameters + ---------- + n_topics: int + number of topics + topic_labels:list (default=None) + list of topic_labels + Returns + ------- + list + list of topic labels + """ + if topic_labels is None: + topic_labels = [f"Topic_{n}" for n in range(1, n_topics)] + else: + if len(topic_labels) != n_topics: + raise AttributeError("len(topic_labels) does not equal n_topics") + return topic_labels + + +def _get_n_columns_and_n_rows(n_topics: int) -> int: + """calculate the optimal number of rows and columns for n_topics + Parameters + ---------- + n_topics: int + number of topics + Returns + ------- + int + optimal number of columns + int + optimal number of rows + """ + if n_topics <= 0: + raise ValueError("Value must be an integer greater than 0") + if n_topics <= 5: + n_columns = n_topics + n_rows = 1 + else: + factors = [factor for factor in _get_factors(n_topics) if 1 < factor <= 5] + if len(factors) > 0: + n_columns = factors[-1] + n_rows = int(n_topics / n_columns) + else: + factors = [ + factor for factor in _get_factors(n_topics + 1) if 1 < factor <= 5 + ] + n_columns = factors[-1] + n_rows = int((n_topics / n_columns) + 1) + return n_rows, n_columns + + +def _get_factors(x: int) -> list: + """retrieve factors of a given integer (x) + Parameters + ---------- + x:int + integer + Returns + ------- + list + a list of factors of x + """ + return [i for i in range(1, x + 1) if x % i == 0] + + +def _get_fig_size(columns: int, rows: int) -> typing.Tuple[int, int]: + """get figure size from number of columns and rows + Parameters + ---------- + columns:int + number of columns + rows: int + number of rows + Returns + ------- + int + width of fig + int + height of fig""" + width = columns * 6 + height = (rows * 6) + 3 + return (width, height) diff --git a/src/processing/visualisation.py b/src/processing/visualisation.py deleted file mode 100644 index 0ca08cd..0000000 --- a/src/processing/visualisation.py +++ /dev/null @@ -1,39 +0,0 @@ -import matplotlib.pyplot as plt -from pandas import Series -from wordcloud import WordCloud - - -def print_row_by_row(base: Series, comparison: Series) -> None: - """print each pair of words 
row by row - Parameters - ---------- - base: Series - the base series for comparison - comparison: Series - the series you want to compare against - Returns - ------- - None - """ - for i in base.index: - print(base[i]) - print(comparison[i]) - return None - - -def create_wordcloud(text: str, filename: str = "data/outputs/wordcloud.jpeg"): - """generate a wordcloud with the given filename - Parameters - ---------- - text: str - text for wordcloud - filename: str - the name and path you want to save the wordcloud to - Returns: - prints message to console saying where file is saved - """ - wordcloud = WordCloud().generate(text) - plt.imshow(wordcloud, interpolation="bilinear") - plt.axis("off") - plt.savefig(filename, bbox_inches="tight") - print(f"Wordcloud saved to {filename}") diff --git a/src/run_pipeline.py b/src/run_pipeline.py index 9a06a84..083224b 100644 --- a/src/run_pipeline.py +++ b/src/run_pipeline.py @@ -1,149 +1,80 @@ -# import re -# import string -# import matplotlib.pyplot as plt -# import mglearn -# import numpy as np import pandas as pd from nltk.tokenize import word_tokenize -from sklearn.feature_extraction.text import CountVectorizer -from src.processing.preprocessing import ( # stemmer, - correct_spelling, - fuzzy_compare_ratio, - lemmatizer, +from src.modules.analysis import ( + extract_feature_count, + get_total_feature_count, + latent_dirichlet_allocation, + retrieve_named_entities, +) +from src.modules.preprocessing import ( + initialise_update_stopwords, load_config, rejoin_tokens, remove_blank_rows, remove_nltk_stopwords, remove_punctuation, + shorten_tokens, + spellcorrect_series, ) -from src.processing.visualisation import create_wordcloud # print_row_by_row, - -# from sklearn.decomposition import LatentDirichletAllocation -# from importlib import reload -# reload(preprocessing) +from src.modules.quality_checks import fuzzy_compare_ratio # print_row_by_row, +from src.modules.visualisation import create_wordcloud, plot_top_words def run_pipeline(): - """run entire consultation nlp pipeline""" + """run consultation nlp pipeline""" config = load_config("src/config.yaml") - raw_data = pd.read_csv(config["raw_data_path"], encoding="cp1252") - raw_series = raw_data["qu_3"] + colnames = [f"qu_{number+1}" for number in range(0, 33)] + raw_data = pd.read_csv( + config["raw_data_path"], encoding="cp1252", names=colnames, skiprows=1 + ) + raw_series = raw_data["qu_11"] # TODO add clean_data parent function - lower_series = raw_series.str.lower() - without_blank_rows = remove_blank_rows(lower_series) - spelling_fixed = without_blank_rows.apply( - correct_spelling, config["business_terminology"] + without_blank_rows = remove_blank_rows(raw_series) + spelling_fixed = spellcorrect_series( + without_blank_rows, config["buisness_terminology"] ) impact_of_spell_correction = fuzzy_compare_ratio(without_blank_rows, spelling_fixed) - # TODO consider whether there are words we need to fix manually? 
i.e timliness + lower_series = spelling_fixed.str.lower() # print_row_by_row(without_blank_rows,spelling_fixed) - no_punctuation_series = spelling_fixed.apply(remove_punctuation) + no_punctuation_series = remove_punctuation(lower_series) word_tokens = no_punctuation_series.apply(word_tokenize) - # stemmed_tokens = word_tokens.apply(stemmer) - lemmatized_tokens = word_tokens.apply(lemmatizer) - without_stopwords = lemmatized_tokens.apply( + short_tokens = shorten_tokens(word_tokens, config["lemmatize"]) + without_stopwords = short_tokens.apply( lambda x: remove_nltk_stopwords(x, config["additional_stopwords"]) ) rejoined_words = without_stopwords.apply(rejoin_tokens) - text = " ".join(rejoined_words) - create_wordcloud(text) - - # just printing to overcome qa aspect - print(rejoined_words, impact_of_spell_correction) - - """#Topic Modelling""" - - vect = CountVectorizer(max_features=5) - coliv_wordsbows = vect.fit(raw_series) + all_text_combined = " ".join(rejoined_words) + create_wordcloud(all_text_combined) + stopwords = initialise_update_stopwords(config["additional_stopwords"]) + fitted_vector, features = extract_feature_count( + series=spelling_fixed, + ngram_range=config["feature_count"]["ngram_range"], + min_df=config["feature_count"]["min_df"], + max_df=config["feature_count"]["max_df"], + max_features=config["feature_count"]["max_features"], + lowercase=config["feature_count"]["lowercase"], + stop_words=stopwords, + ) + total_features = get_total_feature_count(features) + entities = retrieve_named_entities(without_blank_rows) + lda = latent_dirichlet_allocation( + n_topics=config["lda"]["n_topics"], + max_iter=config["lda"]["max_iter"], + fitted_vector=fitted_vector, + ) + plot_top_words( + model=lda, + feature_names=list(features.columns), + n_topics=config["lda"]["n_topics"], + title=config["lda"]["title"], + n_top_words=config["lda"]["n_top_words"], + topic_labels=config["lda"]["topic_labels"], + ) - print(coliv_wordsbows.vocabulary_) + print(impact_of_spell_correction, total_features, entities) -# lda5 = LatentDirichletAllocation( -# n_components=5, learning_method="batch", max_iter=25, random_state=0 -# ) -# -# document_topics5 = lda5.fit_transform(coliv_wordsbows) -# -# topics = np.array([0, 1, 2, 3, 4]) -# -# sorting = np.argsort(lda5.components_, axis=1)[:, ::-1] -# feature_names = np.array(vect.get_feature_names()) -# mglearn.tools.print_topics( -# topics=topics, -# feature_names=feature_names, -# sorting=sorting, -# topics_per_chunk=5, -# n_words=10, -# ) -# -# document_topics5 -# -# -# censtranf_respns = nlp_censtranf[ -# "cens_test_1" -# ] -# censtranf_respns = nlp_censtranf.reset_index(drop=True) -# -# -# -# -# def topic_summary( -# topic_number, -# ): -# -# topics = [topic_number] -# mglearn.tools.print_topics( -# topics=topics, -# feature_names=feature_names, -# sorting=sorting, -# topics_per_chunk=5, -# n_words=10, -# ) -# -# responses = np.argsort(document_topics5[:, topic_number])[::-1] -# -# for i in responses[:5]: -# print(coliv_respns[i], ".\n") -# -# -# for i in range(5): -# topic_summary(i) -# -# fig, ax = plt.subplots(1, 1, figsize=(10, 8)) -# topic_names = [ -# "{:>2} ".format(i) + " ".join(words) -# for i, words in enumerate(feature_names[sorting[:, :2]]) -# ] -# -# ax.barh(np.arange(5), np.sum(document_topics5, axis=0)) -# ax.set_yticks(np.arange(5)) -# ax.set_yticklabels(topic_names, ha="left", va="top") -# ax.invert_yaxis() -# ax.set_xlim(0, 300) -# yax = ax.get_yaxis() -# yax.set_tick_params(pad=130) -# plt.tight_layout() -# -# -# topic_labels = [ -# 
"The first label", -# "The second label", -# "The second label", -# "The third label", -# "The fourth label", -# ] -# -# -# fig, ax = plt.subplots(1, 1, figsize=(10, 8)) -# topic_names = ["{:>2} {}".format(i, label) for i, label in enumerate(topic_labels)] -# -# ax.barh(np.arange(5), np.mean(document_topics5, axis=0)) -# ax.set_yticks(np.arange(5)) -# ax.set_yticklabels(topic_names, ha="right", va="center") -# ax.invert_yaxis() -# ax.set_xlim(0, 0.5) -# yax = ax.get_yaxis() -# yax.set_tick_params(pad=10) -# plt.tight_layout() +# code to execute script from terminal +if __name__ == "__main__": + run_pipeline() diff --git a/tests/processing/__init__.py b/tests/modules/__init__.py similarity index 100% rename from tests/processing/__init__.py rename to tests/modules/__init__.py diff --git a/tests/modules/test_analysis.py b/tests/modules/test_analysis.py new file mode 100644 index 0000000..0dd16b1 --- /dev/null +++ b/tests/modules/test_analysis.py @@ -0,0 +1,88 @@ +from itertools import repeat +import pytest +import sys +from pandas import DataFrame, Series +from scipy.sparse._csr import csr_matrix +from sklearn.decomposition import LatentDirichletAllocation +from sklearn.feature_extraction.text import CountVectorizer + +from src.modules.analysis import ( + extract_feature_count, + get_total_feature_count, + latent_dirichlet_allocation, + retrieve_named_entities, +) + + +class TestExtractFeatureCount: + def test_feature_count(self): + data = Series(["My name is elf"]) + expected = DataFrame([[1, 1, 1, 1]], columns=("elf", "is", "my", "name")) + actual = extract_feature_count(data)[1] + assert all(expected == actual), "Does not match expected output" + + def test_remove_stopwords(self): + stopwords = ["is", "my"] + data = Series(["My name is elf"]) + actual = extract_feature_count(data, stop_words=stopwords)[1] + expected = DataFrame([[1, 1]], columns=("elf", "name")) + assert all(expected == actual), "Does not remove stopwords" + + def test_ngrams(self): + data = Series(["My name is elf"]) + actual = extract_feature_count(data, ngram_range=(1, 2))[1] + expected = DataFrame( + [repeat(1, 7)], + columns=["elf", "is", "is elf", "my", "my name", "name", "name is"], + ) + assert all(expected == actual), "Does not handle ngrams" + + def test_get_fitted_vector(self): + data = Series(["My name is elf"]) + actual = extract_feature_count(data)[0] + assert isinstance( + actual, csr_matrix + ), "Does not return a csr_matrix object in position 0" + + +class TestGetTotalFeatureCount: + def test_get_total_feature_count(self): + df = DataFrame( + [[1, 1, 1, 1, 0], [0, 1, 1, 1, 1]], + columns=["elf", "is", "my", "name", "santa"], + ) + expected = DataFrame( + [[1, 2, 2, 2, 1]], columns=["elf", "is", "my", "name", "santa"] + ) + actual = get_total_feature_count(df) + assert all(expected == actual), "Does not correctly sum total features" + + +class TestRetrieveNamedEntities: + @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Unknown error during CI") + def test_retrieve_named_entities(self): + test_data = Series( + [ + "The ONS has just released an article on the UK Government's policy.", + "my own care for nothing", + "Hollywood actors now have their own statue", + ] + ) + actual = retrieve_named_entities(test_data) + expected = [["ONS", "the UK Government's"], [], ["Hollywood"]] + trimmed_actual = [component for component in actual if component != []] + trimmed_expected = [component for component in expected if component != []] + assert ( + trimmed_actual == trimmed_expected + ), "Did not 
successfully retrieve named entities" + + +class TestLatentDirichletAllocation: + def test_latent_dirichlet_allocation(self): + fitted = CountVectorizer().fit_transform( + Series(["My name is Elf and I like ignoble hats"]) + ) + lda = latent_dirichlet_allocation(10, 10, fitted) + assert isinstance( + lda, LatentDirichletAllocation + ), "function did not return an latent dirichlet allocation object" diff --git a/tests/processing/test_preprocessing.py b/tests/modules/test_preprocessing.py similarity index 66% rename from tests/processing/test_preprocessing.py rename to tests/modules/test_preprocessing.py index b307b5a..2f3a2a2 100644 --- a/tests/processing/test_preprocessing.py +++ b/tests/modules/test_preprocessing.py @@ -1,24 +1,24 @@ -import sys -import unittest - import numpy as np import pytest import textblob as tb +from nltk.corpus import stopwords as sw from pandas import Series -from src.processing.preprocessing import ( - _initialise_nltk_stopwords, +from src.modules.preprocessing import ( + _correct_spelling, + _initialise_nltk_component, + _remove_punctuation_string, _replace_blanks, _update_nltk_stopwords, _update_spelling_words, - correct_spelling, - fuzzy_compare_ratio, + initialise_update_stopwords, lemmatizer, load_config, rejoin_tokens, remove_blank_rows, remove_nltk_stopwords, remove_punctuation, + spellcorrect_series, stemmer, ) @@ -82,43 +82,49 @@ def test_return_series(self): ), "output is not " +class TestSpellCorrectSeries: + def test_spell_correct_series(self): + series = Series(["I live in a housr", "I own a housr"]) + actual = spellcorrect_series(series) + expected = Series(["I live in a house", "I own a house"]) + assert all(actual == expected), "Not fixed spelling across series" + + def test_update_spelling_on_series(self): + series = Series(["I live in a housr", "I own a housr"]) + additional_words = {"housr": 1} + actual = spellcorrect_series(series, additional_words) + expected = Series(["I live in a housr", "I own a housr"]) + assert all(actual == expected), "Updated spelling doesn't work across series" + + class TestCorrectSpelling: def test_spelling_fixed(self): - house_str = "I live in a housr" - corrected = correct_spelling(house_str) - assert corrected == "I live in a house", "spelling not fixed correctly" - - def test_word_update(self): - additional_words = ["housr"] - house_str = "I live in a housr" - corrected = correct_spelling(house_str, additional_words) - assert ( - corrected == "I live in a housr" - ), "spelling word list not correctly updated" + house_str = "I live flar away" + corrected = _correct_spelling(house_str) + assert corrected == "I live far away", "spelling not fixed correctly" class TestUpdateSpellingWords: def test_update_word_list(self): - additional_words = ["housr"] - _update_spelling_words(additional_words) + additional_words = {"monsterp": 1} + tb.en.spelling = _update_spelling_words(additional_words) assert ( - "housr" in tb.en.spelling.keys() + "monsterp" in tb.en.spelling.keys() ), "spelling word list not updated correctly" -class TestFuzzyCompareRatio: - def test_ratios(self): - base = Series(["this is", "this isn't"]) - comparison = Series(["this is", "yellow"]) - expected = Series([100.00, 0.0]) - actual = fuzzy_compare_ratio(base, comparison) - assert all(expected == actual), "fuzzy scoring not working correctly" +class TestRemovePunctuation: + def test_remove_punctuation(self): + series = Series(["this is!", "my series?"]) + actual = remove_punctuation(series) + expected = Series(["this is", "my series"]) + assert 
all(actual == expected), "Remove punctuation not working on series" -class TestRemovePunctuation: +class TestRemovePunctuationstring: def test_remove_punctuation(self): test_string = "my #$%&()*+,-./:;<=>?@[]^_`{|}~?name" - actual = remove_punctuation(test_string) + actual = _remove_punctuation_string(test_string) expected = "my name" assert actual == expected, "punctuation not removed correctly" @@ -132,7 +138,6 @@ def test_stemmer(self): class TestLemmatizer: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_lemmatization(self): word_list = ["house", "houses", "housing"] actual = lemmatizer(word_list) @@ -141,14 +146,12 @@ def test_lemmatization(self): class TestRemoveNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_remove_standard_stopwords(self): tokens = ["my", "name", "is", "elf", "who", "are", "you"] - actual = remove_nltk_stopwords(tokens, []) + actual = remove_nltk_stopwords(tokens) expected = ["name", "elf"] assert actual == expected, "core stopwords not being removed correctly" - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_remove_additional_stopwords(self): tokens = ["my", "name", "is", "elf", "who", "are", "you"] actual = remove_nltk_stopwords(tokens, ["elf"]) @@ -156,24 +159,18 @@ def test_remove_additional_stopwords(self): assert actual == expected, "additional stopwords not being removed correctly" -class TestInitialiseNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") - def test_return_stopwords_list(self): - stopwords = _initialise_nltk_stopwords() - assert isinstance(stopwords, list), "Did not return a list of stopwords" - - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") - def test_key_stopwords(self): - stopwords = _initialise_nltk_stopwords() - expected = ["i", "we", "you"] - actual = [word in stopwords for word in expected] - assert all(actual), "expected key words missing from stopwords" +class TestInitialiseUpdateStopwords: + def test_add_word_to_stopwords(self): + additional_words = ["elf", "santa"] + new_stopwords = initialise_update_stopwords(additional_words) + actual = [word in new_stopwords for word in additional_words] + assert all(actual), "new words not added to stopwords" class TestUpdateNLTKStopwords: - @pytest.mark.skipif(sys.platform.startswith("linux"), reason="Cannot download file") def test_add_word_to_stopwords(self): - stopwords = _initialise_nltk_stopwords() + _initialise_nltk_component("corpora/stopwords", "stopwords") + stopwords = sw.words("english") additional_words = ["elf", "santa"] new_stopwords = _update_nltk_stopwords(stopwords, additional_words) actual = [word in new_stopwords for word in additional_words] @@ -188,5 +185,6 @@ def test_region_tokens(self): assert actual == expected, "did not rejoin tokens correctly" -if __name__ == "__main__": - unittest.main() +class TestInitialiseNLTKComponent: + def test_initialise_component(self): + pass diff --git a/tests/modules/test_quality_checks.py b/tests/modules/test_quality_checks.py new file mode 100644 index 0000000..5f69bec --- /dev/null +++ b/tests/modules/test_quality_checks.py @@ -0,0 +1,12 @@ +from pandas import Series + +from src.modules.quality_checks import fuzzy_compare_ratio + + +class TestFuzzyCompareRatio: + def test_ratios(self): + base = Series(["this is", "this isn't"]) + comparison = Series(["this is", "yellow"]) + expected = 
Series([100.00, 0.0]) + actual = fuzzy_compare_ratio(base, comparison) + assert all(expected == actual), "fuzzy scoring not working correctly"
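
Usage sketches for the modules introduced in this changeset follow; all snippets are illustrative, with toy data and values that are not taken from the repository. First, the config loader switches from yaml.safe_load to yaml.load(..., Loader=yaml.Loader) because the new feature_count.ngram_range entry uses the !!python/tuple tag, which the safe loader refuses to construct. A minimal standalone sketch of that behaviour:

# Why config loading needs yaml.Loader: the !!python/tuple tag in config.yaml
# produces a real tuple (as CountVectorizer expects), but safe_load rejects it.
import yaml

config_text = """
feature_count:
  ngram_range: !!python/tuple [1, 2]
  min_df: 2
  max_df: 0.95
"""

config = yaml.load(config_text, Loader=yaml.Loader)
assert config["feature_count"]["ngram_range"] == (1, 2)  # a genuine tuple, not a list

try:
    yaml.safe_load(config_text)
except yaml.constructor.ConstructorError as err:
    # safe_load cannot build python/tuple objects and raises here
    print("safe_load rejects python/tuple:", err)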
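
A sketch of the weighted spell-correction helper added to preprocessing.py, assuming the module's textblob import and corpora are available. The example sentences are toy data; the weights mirror the buisness_terminology block in src/config.yaml:

# Spell-correct a series, boosting business terms in textblob's word-frequency table.
from pandas import Series

from src.modules.preprocessing import spellcorrect_series

responses = Series(["The census dta were late", "Timeliness matters"])
business_terms = {"dpm": 1, "admin": 1, "timeliness": 1, "year": 450}

# higher weights give a word greater precedence when candidate corrections are ranked
corrected = spellcorrect_series(responses, additional_words=business_terms)
print(corrected)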
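
A sketch of the quality check split out into quality_checks.py: fuzzy_compare_ratio scores each row pair, with values near 100 meaning the spell correction changed little:

# Compare the series before and after spell correction, row by row.
from pandas import Series

from src.modules.quality_checks import fuzzy_compare_ratio

before = Series(["I live in a housr", "timeliness matters"])
after = Series(["I live in a house", "timeliness matters"])

print(fuzzy_compare_ratio(before, after))  # about 94.1 for the corrected row, 100.0 for the unchanged one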
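
A sketch of how the reworked stopword helpers fit together. The first call downloads the NLTK stopwords corpus via _initialise_nltk_component; the combined list is also what run_pipeline passes to extract_feature_count as stop_words:

# Build the combined stopword list once, then filter a token list with it.
from src.modules.preprocessing import initialise_update_stopwords, remove_nltk_stopwords

additional = ["census", "data"]  # mirrors additional_stopwords in src/config.yaml

stopwords = initialise_update_stopwords(additional)  # NLTK stopwords plus the additions

tokens = ["the", "census", "data", "improved", "timeliness"]
print(remove_nltk_stopwords(tokens, additional))  # ['improved', 'timeliness']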
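
A sketch of the lemmatise-or-stem switch driven by the config's lemmatize flag; the lemmatizer path downloads WordNet on first use, and the token lists below are toy data:

# shorten_tokens expects a Series of token lists, as produced by word_tokenize in run_pipeline.
from pandas import Series

from src.modules.preprocessing import shorten_tokens

word_tokens = Series([["flying", "houses"], ["beautiful", "timeliness"]])

print(shorten_tokens(word_tokens, lemmatize=True))   # WordNet lemmas, e.g. ["flying", "house"]
print(shorten_tokens(word_tokens, lemmatize=False))  # Porter stems, e.g. ["fli", "hous"]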
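
An end-to-end sketch of the new analysis helpers: bag-of-words counts, LDA fitting and the topic plot. It assumes the script runs from the repository root with a data/outputs directory in place, since save_figure writes the chart there; the toy responses, parameter values and topic labels are illustrative only (run_pipeline reads the real ones from the feature_count and lda blocks of config.yaml):

# Count features, fit LDA on the fitted vector, and plot the top words per topic.
from pandas import Series

from src.modules.analysis import (
    extract_feature_count,
    get_total_feature_count,
    latent_dirichlet_allocation,
)
from src.modules.visualisation import plot_top_words

responses = Series(
    [
        "census data should be more timely",
        "admin data could supplement the census",
        "timeliness of outputs matters most",
    ]
)

# bag-of-words counts over unigrams and bigrams (toy thresholds, not the config values)
fitted_vector, features = extract_feature_count(responses, ngram_range=(1, 2), min_df=1, max_df=1.0)
print(get_total_feature_count(features))

lda = latent_dirichlet_allocation(n_topics=2, max_iter=10, fitted_vector=fitted_vector)

# explicit, made-up labels are supplied, one per topic; the figure is saved under data/outputs/
plot_top_words(
    model=lda,
    feature_names=list(features.columns),
    n_topics=2,
    title="Topic Summary",
    n_top_words=5,
    topic_labels=["Data sources", "Timeliness"],
)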
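
A sketch of the named-entity helper, assuming the en_core_web_sm model pinned in CodeCov.yml (3.0.0) is installed; the exact entities returned vary with the model version:

# Extract named entities per response; rows with no entities yield empty lists.
from pandas import Series

from src.modules.analysis import retrieve_named_entities

responses = Series(["The ONS publishes statistics for the UK", "nothing to tag here"])
print(retrieve_named_entities(responses))  # e.g. [["ONS", "the UK"], []]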
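
Finally, an illustration of the private subplot-grid helper used by plot_top_words; note that it returns (rows, columns), and that topic counts with no factor between 2 and 5 fall back to factorising n_topics + 1:

# How the grid size is chosen for different topic counts (private helper, shown for illustration).
from src.modules.visualisation import _get_n_columns_and_n_rows

print(_get_n_columns_and_n_rows(5))  # (1, 5): a single row of five panels
print(_get_n_columns_and_n_rows(6))  # (2, 3): the largest factor no bigger than five sets the column count
print(_get_n_columns_and_n_rows(7))  # (2, 4): a prime count falls back to the factors of eight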