
Commit 8951a4c

Merge pull request #11 from datasciencecampus/feature/named_entity_recognition

Feature/named entity recognition

ColinDaglish authored Jul 13, 2023
2 parents 3441d1f + 6a168c6 commit 8951a4c
Showing 14 changed files with 710 additions and 327 deletions.
11 changes: 9 additions & 2 deletions .github/workflows/CodeCov.yml
@@ -20,14 +20,21 @@ jobs:
          python-version: 3.9
          cache: 'pip' # caching pip dependencies

      - name: Generate Report
      - name: Install packages
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
          pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
          python -m nltk.downloader punkt stopwords
          pip install coverage
          pip install coverage[toml]
      - name: Run Unit Tests
        run: |
          coverage run -m pytest
      - name: Upload Coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
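The same model and corpus downloads performed in this workflow can be reproduced locally. A minimal sketch in Python, assuming the en_core_web_sm wheel has already been pip-installed as in the step above (the snippet itself is illustrative and not part of the commit):

import nltk
import spacy

# Fetch the NLTK tokenizer and stopword corpora used by the pipeline
# (equivalent to `python -m nltk.downloader punkt stopwords` in the workflow).
nltk.download("punkt")
nltk.download("stopwords")

# Confirm the small English spaCy model installed from the release tarball loads.
nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)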
65 changes: 5 additions & 60 deletions requirements.txt
@@ -1,69 +1,14 @@
arrow==1.2.3
binaryornot==0.4.4
certifi==2023.5.7
cfgv==3.3.1
chardet==5.1.0
charset-normalizer==3.1.0
click==8.1.3
colorama==0.4.6
contourpy==1.1.0
cookiecutter==2.1.1
cycler==0.11.0
distlib==0.3.6
docopt==0.6.2
exceptiongroup==1.1.1
filelock==3.12.2
fonttools==4.40.0
fuzz==0.1.1
gitdb==4.0.10
GitPython==3.1.31
identify==2.5.24
idna==3.4
imageio==2.31.1
inexactsearch==1.0.2
iniconfig==2.0.0
Jinja2==3.1.2
jinja2-time==0.2.0
joblib==1.2.0
kiwisolver==1.4.4
MarkupSafe==2.1.3
matplotlib==3.7.1
mglearn==0.2.0
nltk==3.8.1
nodeenv==1.8.0
numpy==1.25.0
packaging==23.1
pandas==2.0.2
Pillow==9.5.0
pipreqs==0.4.13
platformdirs==3.5.3
pluggy==1.1.0
pre-commit==3.3.3
pyparsing==3.1.0
pyspellchecker==0.7.2
pytest==7.3.2
python-dateutil==2.8.2
python-slugify==8.0.1
pytz==2023.3
PyYAML==6.0
PyYAML==6.0
rapidfuzz==3.1.1
regex==2023.6.3
requests==2.31.0
scikit-learn==1.2.2
scipy==1.10.1
silpa-common==0.3
six==1.16.0
smmap==5.0.0
soundex==1.1.3
text-unidecode==1.3
scikit_learn==1.2.2
scipy==1.11.1
setuptools==67.6.1
spacy==3.6.0
textblob==0.17.1
threadpoolctl==3.1.0
tomli==2.0.1
tqdm==4.65.0
typer==0.9.0
typing_extensions==4.6.3
tzdata==2023.3
urllib3==2.0.3
virtualenv==20.23.0
wordcloud==1.9.2
yarg==0.1.9
32 changes: 24 additions & 8 deletions src/config.yaml
@@ -1,8 +1,24 @@
raw_data_path: "data/raw/2023_consultation_mock_data.csv"
business_terminology:
- 'dpm'
- 'admin'
- 'timeliness'
additional_stopwords:
- "census"
- "data"
raw_data_path: "data/raw/20230711_consultation_ingest.csv" #str
buisness_terminology: # dictionary of words to update spelling with associated weight
dpm: 1 #int
admin: 1 #int
timeliness: 1 #int
year: 450 #int
additional_stopwords: #list of words to filter; must be type str
- "census" #str
- "data" #str
- "personal" #str
- "use" #str
lemmatize: True #bool; select False to use Stemmer
feature_count: #dict
ngram_range: !!python/tuple [1,2] #tuple range of defaults to unigram (1,1)
min_df: 2 #float (proportion) or int (count)
max_df: 0.95 #float (proportion) or int (count)
max_features: null #null converts to None, or int value
lowercase: True #whether to convert all words to lowercase
lda: #dict
n_topics: 5 #int
n_top_words: 10 #int
max_iter: 25 #int
title: "Topic Summary" #str
topic_labels: null # also takes a list of strings (see additional stopwords ^)
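Because ngram_range uses the !!python/tuple tag, this file cannot be parsed with yaml.safe_load. A minimal loading sketch, assuming PyYAML 5.1+ is installed (the path and variable names here are illustrative, not part of the PR):

import yaml

# unsafe_load is needed because the !!python/tuple tag is rejected by the safe loader.
with open("src/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.unsafe_load(f)

print(config["feature_count"]["ngram_range"])   # (1, 2) as a Python tuple
print(config["feature_count"]["max_features"])  # YAML null loads as None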
File renamed without changes.
131 changes: 131 additions & 0 deletions src/modules/analysis.py
@@ -0,0 +1,131 @@
import typing

import spacy
from numpy.typing import ArrayLike
from pandas import DataFrame, Series
from scipy.sparse._csr import csr_matrix
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer


def extract_feature_count(
    series: Series,
    max_features: int = None,
    ngram_range: tuple[int, int] = (1, 1),
    stop_words: ArrayLike = None,
    lowercase: bool = True,
    min_df=1,
    max_df=1.0,
) -> typing.Tuple[csr_matrix, DataFrame]:
    """Create a text feature count dataframe from a series.

    Parameters
    ----------
    series: Series
        Series of text strings
    max_features: int, default = None
        If not None, build a vocabulary that only considers the top max_features
        ordered by term frequency across the corpus. Otherwise, all features are used.
    ngram_range: tuple (min_n, max_n), default = (1, 1)
        The lower and upper boundary of the range of n-values for different word
        n-grams or char n-grams to be extracted. All values of n such that
        min_n <= n <= max_n will be used.
    stop_words: list, default = None
        List of stopwords to remove from text strings
    lowercase: bool, default = True
        Convert all characters to lowercase before tokenizing
    min_df: float or int, default = 1
        When building the vocabulary, ignore terms that have a document frequency
        strictly lower than the given threshold. This value is also called cut-off
        in the literature. If float, the parameter represents a proportion of
        documents; if integer, absolute counts.
        This parameter is ignored if vocabulary is not None.
    max_df: float or int, default = 1.0
        When building the vocabulary, ignore terms that have a document frequency
        strictly higher than the given threshold (corpus-specific stop words).
        If float, the parameter represents a proportion of documents; if integer,
        absolute counts. This parameter is ignored if vocabulary is not None.

    Returns
    -------
    tuple[csr_matrix, DataFrame]
        The fitted sparse count vector and a dataframe of text feature counts,
        showing the number of times a word appears in each element of the input
        series
    """
    vectorizer = CountVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words=stop_words,
        lowercase=lowercase,
        min_df=min_df,
        max_df=max_df,
    )

    fitted_vector = vectorizer.fit_transform(series)

    word_count_df = DataFrame(
        fitted_vector.toarray(), columns=vectorizer.get_feature_names_out()
    )
    return (fitted_vector, word_count_df)


def get_total_feature_count(features: DataFrame) -> DataFrame:
    """Sum across features to get the total number of times each word was used.

    Parameters
    ----------
    features: DataFrame
        A dataframe of the features, with each row corresponding to a deconstructed
        string

    Returns
    -------
    DataFrame
        A dataframe of the total number of times each word is used across all
        strings
    """
    total_feature_count = DataFrame()
    for column in features.columns:
        total_feature_count[column] = [features[column].sum()]
    return total_feature_count


def retrieve_named_entities(series: Series) -> list:
    """Retrieve any named entities from the series.

    Parameters
    ----------
    series: Series
        A series of text strings to analyse for named entities

    Returns
    -------
    list[list[str]]
        A list of lists containing strings for each named entity
    """
    nlp = spacy.load("en_core_web_sm")
    entities = []
    for doc in nlp.pipe(series):
        entities.append([str(ent) for ent in doc.ents])
    return entities


def latent_dirichlet_allocation(
    n_topics: int, max_iter: int, fitted_vector: csr_matrix
) -> LatentDirichletAllocation:
    """Fit a latent Dirichlet allocation model on the fitted vector.

    Parameters
    ----------
    n_topics: int
        Number of components to include in the model
    max_iter: int
        Maximum number of passes over the training data
    fitted_vector: csr_matrix
        Fitted vector from CountVectorizer

    Returns
    -------
    LatentDirichletAllocation
        Fitted LDA model
    """
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method="batch",
        max_iter=max_iter,
        random_state=179,
    )

    lda.fit(fitted_vector)
    return lda
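A short usage sketch for the new module, assuming the import path follows the file location in this PR and the spaCy model and NLTK corpora from the workflow are installed; the sample responses are invented for illustration and are not part of the commit:

from pandas import Series

from src.modules.analysis import (
    extract_feature_count,
    get_total_feature_count,
    latent_dirichlet_allocation,
    retrieve_named_entities,
)

# Invented example responses, purely for illustration.
responses = Series(
    [
        "The census data should be published sooner.",
        "Timeliness of admin data matters to the ONS.",
    ]
)

# Count word and bigram features, mirroring the ngram_range set in config.yaml.
fitted_vector, counts = extract_feature_count(responses, ngram_range=(1, 2))
totals = get_total_feature_count(counts)

# Named entities per response, e.g. [[], ['ONS']], depending on the model.
entities = retrieve_named_entities(responses)

# Fit a small topic model on the sparse counts.
lda = latent_dirichlet_allocation(n_topics=2, max_iter=10, fitted_vector=fitted_vector)
print(lda.components_.shape)  # (n_topics, n_features)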