Merge pull request #11 from datasciencecampus/feature/named_entity_recognition

Feature/named entity recognition
Showing 14 changed files with 710 additions and 327 deletions.
@@ -1,69 +1,14 @@
arrow==1.2.3
binaryornot==0.4.4
certifi==2023.5.7
cfgv==3.3.1
chardet==5.1.0
charset-normalizer==3.1.0
click==8.1.3
colorama==0.4.6
contourpy==1.1.0
cookiecutter==2.1.1
cycler==0.11.0
distlib==0.3.6
docopt==0.6.2
exceptiongroup==1.1.1
filelock==3.12.2
fonttools==4.40.0
fuzz==0.1.1
gitdb==4.0.10
GitPython==3.1.31
identify==2.5.24
idna==3.4
imageio==2.31.1
inexactsearch==1.0.2
iniconfig==2.0.0
Jinja2==3.1.2
jinja2-time==0.2.0
joblib==1.2.0
kiwisolver==1.4.4
MarkupSafe==2.1.3
matplotlib==3.7.1
mglearn==0.2.0
nltk==3.8.1
nodeenv==1.8.0
numpy==1.25.0
packaging==23.1
pandas==2.0.2
Pillow==9.5.0
pipreqs==0.4.13
platformdirs==3.5.3
pluggy==1.1.0
pre-commit==3.3.3
pyparsing==3.1.0
pyspellchecker==0.7.2
pytest==7.3.2
python-dateutil==2.8.2
python-slugify==8.0.1
pytz==2023.3
PyYAML==6.0
rapidfuzz==3.1.1
regex==2023.6.3
requests==2.31.0
scikit-learn==1.2.2
scipy==1.10.1
silpa-common==0.3
six==1.16.0
smmap==5.0.0
soundex==1.1.3
text-unidecode==1.3
scikit_learn==1.2.2
scipy==1.11.1
setuptools==67.6.1
spacy==3.6.0
textblob==0.17.1
threadpoolctl==3.1.0
tomli==2.0.1
tqdm==4.65.0
typer==0.9.0
typing_extensions==4.6.3
tzdata==2023.3
urllib3==2.0.3
virtualenv==20.23.0
wordcloud==1.9.2
yarg==0.1.9
@@ -1,8 +1,24 @@
-raw_data_path: "data/raw/2023_consultation_mock_data.csv"
-business_terminology:
-  - 'dpm'
-  - 'admin'
-  - 'timeliness'
-additional_stopwords:
-  - "census"
-  - "data"
+raw_data_path: "data/raw/20230711_consultation_ingest.csv" # str
+business_terminology: # dictionary of words to update spelling for, with an associated weight
+  dpm: 1 # int
+  admin: 1 # int
+  timeliness: 1 # int
+  year: 450 # int
+additional_stopwords: # list of words to filter out; must be type str
+  - "census" # str
+  - "data" # str
+  - "personal" # str
+  - "use" # str
+lemmatize: True # bool; select False to use a stemmer instead
+feature_count: # dict
+  ngram_range: !!python/tuple [1, 2] # tuple (min_n, max_n); defaults to unigram (1, 1)
+  min_df: 2 # float (proportion) or int (count)
+  max_df: 0.95 # float (proportion) or int (count)
+  max_features: null # null converts to None, or an int value
+  lowercase: True # bool; whether to convert all words to lowercase
+lda: # dict
+  n_topics: 5 # int
+  n_top_words: 10 # int
+  max_iter: 25 # int
+  title: "Topic Summary" # str
+  topic_labels: null # null or a list of strings (see additional_stopwords above)
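
Because this config uses the Python-specific !!python/tuple tag, a plain yaml.safe_load will reject it; a minimal loading sketch, assuming PyYAML and a hypothetical file name config.yaml (the repository's real path may differ):

import yaml

# "config.yaml" is a hypothetical path for illustration.
# unsafe_load is needed because the file uses the !!python/tuple tag;
# only use it on configuration files you trust.
with open("config.yaml") as f:
    config = yaml.unsafe_load(f)

print(config["feature_count"]["ngram_range"])  # (1, 2), as a Python tuple
print(config["lda"]["n_topics"])  # 5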
File renamed without changes.
@@ -0,0 +1,131 @@
import typing

import spacy
from numpy.typing import ArrayLike
from pandas import DataFrame, Series
from scipy.sparse import csr_matrix
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


def extract_feature_count(
    series: Series,
    max_features: typing.Optional[int] = None,
    ngram_range: tuple[int, int] = (1, 1),
    stop_words: ArrayLike = None,
    lowercase: bool = True,
    min_df: typing.Union[float, int] = 1,
    max_df: typing.Union[float, int] = 1.0,
) -> typing.Tuple[csr_matrix, DataFrame]:
    """Create a text feature count dataframe from a series.

    Parameters
    ----------
    series: Series
        Series of text strings.
    max_features: int, default = None
        If not None, build a vocabulary that only considers the top
        max_features ordered by term frequency across the corpus.
        Otherwise, all features are used.
    ngram_range: tuple (min_n, max_n), default = (1, 1)
        The lower and upper boundary of the range of n-values for different
        word n-grams or char n-grams to be extracted. All values of n such
        that min_n <= n <= max_n will be used.
    stop_words: list, default = None
        List of stopwords to remove from text strings.
    lowercase: bool, default = True
        Convert all characters to lowercase before tokenizing.
    min_df: float or int, default = 1
        When building the vocabulary, ignore terms that have a document
        frequency strictly lower than the given threshold. This value is
        also called cut-off in the literature. If float, the parameter
        represents a proportion of documents; if integer, absolute counts.
        This parameter is ignored if vocabulary is not None.
    max_df: float or int, default = 1.0
        When building the vocabulary, ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words). If float, the parameter represents a proportion of
        documents; if integer, absolute counts. This parameter is ignored
        if vocabulary is not None.

    Returns
    -------
    tuple[csr_matrix, DataFrame]
        The fitted sparse count vector and a dataframe of text feature
        counts, showing the number of times a word appears in each element
        of the input series.
    """
    vectorizer = CountVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words=stop_words,
        lowercase=lowercase,
        min_df=min_df,
        max_df=max_df,
    )

    fitted_vector = vectorizer.fit_transform(series)

    word_count_df = DataFrame(
        fitted_vector.toarray(), columns=vectorizer.get_feature_names_out()
    )
    return (fitted_vector, word_count_df)


def get_total_feature_count(features: DataFrame) -> DataFrame:
    """Sum across features to get the total number of times each word was used.

    Parameters
    ----------
    features: DataFrame
        A dataframe of the features, with each row corresponding to a
        deconstructed string.

    Returns
    -------
    DataFrame
        A dataframe of the total number of times each word is used across
        all strings.
    """
    total_feature_count = DataFrame()
    for column in features.columns:
        total_feature_count[column] = [features[column].sum()]
    return total_feature_count


def retrieve_named_entities(series: Series) -> list:
    """Retrieve any named entities from the series.

    Parameters
    ----------
    series: Series
        A series of text strings to analyse for named entities.

    Returns
    -------
    list[list[str]]
        A list of lists containing the named entities found in each string.
    """
    nlp = spacy.load("en_core_web_sm")
    entities = []
    for doc in nlp.pipe(series):
        entities.append([str(ent) for ent in doc.ents])
    return entities


def latent_dirichlet_allocation(
    n_topics: int, max_iter: int, fitted_vector: csr_matrix
) -> LatentDirichletAllocation:
    """Fit a latent Dirichlet allocation model on the fitted vector.

    Parameters
    ----------
    n_topics: int
        Number of components (topics) to include in the model.
    max_iter: int
        Maximum number of passes over the training data.
    fitted_vector: csr_matrix
        Fitted vector from CountVectorizer.

    Returns
    -------
    LatentDirichletAllocation
        The fitted LDA model.
    """
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method="batch",
        max_iter=max_iter,
        random_state=179,
    )

    lda.fit(fitted_vector)
    return lda
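
A short sketch of how these functions might chain together, assuming they are in scope (the module's import path is not shown in this diff) and using invented toy responses rather than the real raw_data_path data:

from pandas import Series

# Toy data for illustration; the real pipeline reads from raw_data_path.
responses = Series(
    [
        "The census data arrived later than expected.",
        "Admin data from the ONS improved timeliness.",
    ]
)

fitted_vector, counts = extract_feature_count(responses, ngram_range=(1, 2))
print(get_total_feature_count(counts))

# Requires the spaCy model: python -m spacy download en_core_web_sm
print(retrieve_named_entities(responses))  # e.g. [[], ['ONS']]

lda = latent_dirichlet_allocation(n_topics=2, max_iter=25, fitted_vector=fitted_vector)
print(lda.components_.shape)  # (2, n_features)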