Release 2.0.1 (#234)
* Switched away from pyramid's auto_arima (replaced with a statsmodels ARIMA grid search) due to stability issues
* Now supports uncompressed pickle files (rather than just bzip2 compressed)
* Using Python 3.7.3
* Handles imported data when dates are stored as strings rather than Timestamp objects
* Corrected unigram handling
IanGrimstead authored Apr 5, 2019
1 parent 776dda3 commit 84f6905
Showing 25 changed files with 516 additions and 126 deletions.
24 changes: 13 additions & 11 deletions .travis.yml
@@ -2,17 +2,17 @@ language: python

matrix:
include:
# Use the built in venv for linux builds
- os: linux
sudo: required
python: "3.6.6"
dist: trusty
# Use the built in venv for linux builds
- os: linux
sudo: required
python: "3.7.3"
dist: xenial

# Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html
# Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html
# - os: osx
# language: generic
# env: PYTHON=3.6.6

before_install: |
if [ "$TRAVIS_OS_NAME" == "osx" ]; then
brew update
@@ -21,26 +21,26 @@ before_install: |
# See https://docs.travis-ci.com/user/osx-ci-environment/#A-note-on-upgrading-packages.
# I didn't do this above because it works and I'm lazy.
brew outdated pyenv || brew upgrade pyenv
# virtualenv doesn't work without pyenv knowledge. venv in Python 3.3
# doesn't provide Pip by default. So, use `pyenv-virtualenv <https://github.com/yyuu/pyenv-virtualenv/blob/master/README.md>`_.
brew install pyenv-virtualenv
pyenv install $PYTHON
# I would expect something like ``pyenv init; pyenv local $PYTHON`` or
# ``pyenv shell $PYTHON`` would work, but ``pyenv init`` doesn't seem to
# modify the Bash environment. ??? So, I hand-set the variables instead.
export PYENV_VERSION=$PYTHON
export PATH="/Users/travis/.pyenv/shims:${PATH}"
pyenv-virtualenv venv
source venv/bin/activate
# A manual check that the correct version of Python is running.
python --version
fi
export BOTO_CONFIG=/dev/null
install:
- python --version
- python -m pip install -U pip
@@ -53,6 +53,8 @@ install:
script:
# for codecov support
- pip install pytest pytest-cov
# to report installed packages
- pip freeze
# command to run tests
- pytest --cov-config .coveragerc --cov=./ tests/

2 changes: 1 addition & 1 deletion appveyor.yml
@@ -5,7 +5,7 @@ build: none
environment:
matrix:
- PYTHON: "C:\\Python36-x64"
PYTHON_VERSION: 3.6.6
PYTHON_VERSION: 3.7.3
PYTHON_ARCH: 64
init:
- ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%
1 change: 1 addition & 0 deletions config/stopwords_glob.txt
@@ -82,6 +82,7 @@ everybody
everyone
everything
everywhere
excess
f
few
find
3 changes: 2 additions & 1 deletion config/stopwords_n.txt
@@ -1,4 +1,5 @@
situation
consist
first
plurality
plurality
second
4 changes: 3 additions & 1 deletion config/stopwords_uni.txt
@@ -1 +1,3 @@
etc
etc
cover
adjacent
3 changes: 1 addition & 2 deletions pygrams.py
@@ -128,7 +128,6 @@ def get_args(command_line_arguments):

args = parser.parse_args(command_line_arguments)

args.path = 'data'
return args


@@ -165,7 +164,7 @@ def main(supplied_args):
pickled_tf_idf_file_name=pickled_tf_idf_path,
output_name=args.outputs_name, emerging_technology=args.emerging_technology)

pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=50)
pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report)

# emtech integration
if args.emerging_technology:
77 changes: 63 additions & 14 deletions scripts/algorithms/arima.py
@@ -1,27 +1,76 @@
import warnings

import numpy as np
from numpy import clip, inf
from pyramid.arima import auto_arima
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA


class ARIMAForecast(object):

def __init__(self, data_in, num_prediction_periods):
if not all(isinstance(x, float) for x in data_in):
raise ValueError('Time series must be all float values')
def __evaluate_models(self, dataset, p_values, d_values, q_values):
dataset=np.array(dataset)
dataset = dataset.astype('float32')
best_score, best_cfg = float("inf"), None
for p in p_values:
for d in d_values:
for q in q_values:
order = (p, d, q)
try:
mse = self.__evaluate_arima_model(dataset, order, ground_truth_in_history=True)
if mse < best_score:
best_score = mse
best_cfg = order
except:
continue
return best_cfg, best_score

def __evaluate_arima_model(self, X, arima_order, ground_truth_in_history=False):

train_ratio = 0.8
train_size = int(len(X) * train_ratio)
train, test = X[0:train_size], X[train_size:]
history = [x for x in train]
predictions = list()

self.__history = data_in
self.__num_prediction_periods = num_prediction_periods
for t in range(len(test)):
model = ARIMA(history, order=arima_order)
model_fit = model.fit(disp=0, maxiter=200)
yhat = model_fit.forecast()[0][0]
predictions.append(yhat)
history.append(test[t] if ground_truth_in_history else yhat)
error = mean_squared_error(test, predictions)
return error

self.__stepwise_model = auto_arima(
data_in,
seasonal=False,
error_action='ignore', suppress_warnings=True, stepwise=True
)
def __arima_model_predict(self, X, arima_order, steps_ahead):
# make predictions
predictions = list()
try:
for t in range(steps_ahead):
model = ARIMA(X, order=arima_order)
model_fit = model.fit(disp=0)
yhat = model_fit.forecast()[0][0]
predictions.append(yhat)
X = np.append(X, yhat)
except:
predictions.extend([np.nan] * (steps_ahead - len(predictions)))

return predictions

def __init__(self, data_in, num_prediction_periods ):
if not all(isinstance(x, float) for x in data_in):
raise ValueError('Time series must be all float values')

self.__stepwise_model.fit(data_in)
p_values = [0, 1, 2, 4, 6]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
self.__order, score = self.__evaluate_models(data_in, p_values, d_values, q_values)
self.__predictions = self.__arima_model_predict(data_in, self.__order, num_prediction_periods)

@property
def configuration(self):
return self.__stepwise_model.order
return self.__order

def predict_counts(self):
return clip(self.__stepwise_model.predict(n_periods=self.__num_prediction_periods), 0, inf)
return clip(self.__predictions, 0, inf)
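For context, a minimal usage sketch of the replacement forecaster follows. This is a hedged illustration: the input series and the three-period horizon are invented, and only `ARIMAForecast`, `configuration` and `predict_counts` come from the diff above.

```python
# Hedged usage sketch (invented data): fit the grid-searched ARIMA forecaster on a
# short float series and request the next few periods. Expect this to be slow, since
# __init__ fits many candidate (p, d, q) orders before choosing the best one.
from scripts.algorithms.arima import ARIMAForecast

term_counts = [2.0, 3.0, 5.0, 4.0, 6.0, 8.0, 7.0, 9.0, 11.0, 10.0, 12.0, 13.0]

forecast = ARIMAForecast(term_counts, num_prediction_periods=3)

print(forecast.configuration)     # best (p, d, q) order found by the grid search
print(forecast.predict_counts())  # three forecast values, clipped to be non-negative
```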
2 changes: 1 addition & 1 deletion scripts/data_factory.py
@@ -10,7 +10,7 @@ def get(doc_source_file_name):
if not os.path.isfile(doc_source_file_name):
raise PygramsException('file: ' + doc_source_file_name + ' does not exist in data folder')

if doc_source_file_name.endswith('.pkl.bz2'):
if doc_source_file_name.endswith('.pkl.bz2') or doc_source_file_name.endswith('.pkl'):
return read_pickle(doc_source_file_name)
elif doc_source_file_name.endswith('.xls'):
return read_excel(doc_source_file_name)
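A hedged sketch of what the loosened extension check allows; the file names below are placeholders, and `pandas.read_pickle` is relied on to infer compression from the suffix.

```python
# Hedged sketch (placeholder file names): the same factory call now accepts both
# compressed and uncompressed document pickles; read_pickle infers the compression.
import scripts.data_factory as datafactory

df_bz2 = datafactory.get('data/my_docs.pkl.bz2')  # bzip2-compressed pickle, as before
df_raw = datafactory.get('data/my_docs.pkl')      # uncompressed pickle, new in 2.0.1
```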
35 changes: 29 additions & 6 deletions scripts/pipeline.py
@@ -2,7 +2,8 @@
import pickle
from os import makedirs, path

from pandas import read_pickle
from pandas import read_pickle, to_datetime
from pandas.api.types import is_string_dtype
from tqdm import tqdm

import scripts.data_factory as datafactory
@@ -11,7 +12,7 @@
from scripts.documents_filter import DocumentsFilter
from scripts.documents_weights import DocumentsWeights
from scripts.filter_terms import FilterTerms
from scripts.text_processing import LemmaTokenizer
from scripts.text_processing import LemmaTokenizer, WordAnalyzer, lowercase_strip_accents_and_ownership
from scripts.tfidf_mask import TfidfMask
from scripts.tfidf_reduce import TfidfReduce
from scripts.tfidf_wrapper import TFIDF
@@ -21,14 +22,24 @@
from scripts.vandv.predictor import evaluate_prediction


def checkdf( df, emtec, docs_mask_dict, text_header):
def checkdf(df, emtec, docs_mask_dict, text_header, term_counts):
app_exit = False

if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None:
if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None or term_counts:
if docs_mask_dict['date_header'] not in df.columns:
print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
app_exit = True

if docs_mask_dict['date_header'] is not None:
if is_string_dtype(df[docs_mask_dict['date_header']]):
df[docs_mask_dict['date_header']] = to_datetime(df[docs_mask_dict['date_header']])

min_date = min(df[docs_mask_dict['date_header']])
max_date = max(df[docs_mask_dict['date_header']])
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
else:
print('Document dates not specified')

if text_header not in df.columns:
print(f"text_header '{text_header}' not in dataframe")
app_exit = True
@@ -61,7 +72,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
if pickled_tf_idf_file_name is None:

self.__dataframe = datafactory.get(data_filename)
checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header)
checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)

remove_empty_documents(self.__dataframe, text_header)
self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range,
Expand All @@ -70,7 +81,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
self.__text_lengths = self.__dataframe[text_header].map(len).tolist()
self.__dataframe.drop(columns=[text_header], inplace=True)

tfidf_filename = path.join('outputs', 'tfidf', output_name + '-tfidf.pkl.bz2')
tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
makedirs(path.dirname(tfidf_filename), exist_ok=True)
with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
pickle.dump(
@@ -81,6 +92,17 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
else:
print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}')
self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name)
if docs_mask_dict['date_header'] is None:
print('Document dates not specified')
else:
min_date = min(self.__dataframe[docs_mask_dict['date_header']])
max_date = max(self.__dataframe[docs_mask_dict['date_header']])
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')

WordAnalyzer.init(
tokenizer=LemmaTokenizer(),
preprocess=lowercase_strip_accents_and_ownership,
ngram_range=ngram_range)

# todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
# the original. We're really just filtering down.
@@ -140,6 +162,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
docs_mask_dict['date_header'])
# if other outputs
self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)

# todo: no output method; just if statements to call output functions...?
# Only supply what they each directly require
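The date handling added to `checkdf` reduces to the conversion below; a self-contained sketch with invented data (the column name `publication_date` is only an example):

```python
# Self-contained sketch of the string-date handling (invented data): if the date
# column arrives as strings, convert it to Timestamps before reporting the range.
from pandas import DataFrame, to_datetime
from pandas.api.types import is_string_dtype

df = DataFrame({'publication_date': ['2017-01-31', '2018-06-15', '2019-03-02']})

if is_string_dtype(df['publication_date']):
    df['publication_date'] = to_datetime(df['publication_date'])

min_date, max_date = df['publication_date'].min(), df['publication_date'].max()
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
```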
34 changes: 10 additions & 24 deletions scripts/text_processing.py
@@ -31,6 +31,7 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
"""
import scripts.utils.utils as ut
import string

from nltk import word_tokenize, PorterStemmer, pos_tag
@@ -86,6 +87,7 @@ class WordAnalyzer(object):
stemmed_stop_word_set_n = None
stemmed_stop_word_set_uni = None


@staticmethod
def init(tokenizer, preprocess, ngram_range):
WordAnalyzer.tokenizer = tokenizer
@@ -110,39 +112,23 @@ def init(tokenizer, preprocess, ngram_range):
def analyzer(doc):
"""based on VectorizerMixin._word_ngrams in sklearn/feature_extraction/text.py,
from scikit-learn; extended to prevent generation of n-grams containing stop words"""
tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc))

# handle token n-grams
min_n, max_n = WordAnalyzer.ngram_range
if max_n != 1:
original_tokens = tokens
if min_n == 1:
# no need to do any slicing for unigrams
# just iterate through the original tokens
tokens = [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni and not w.isdigit()]
# tokens = list(original_tokens)
min_n += 1
else:
tokens = []
original_tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc))
tokens = original_tokens if min_n == 1 else []

# handle token n-grams
if max_n > 1:
min_phrase = max(min_n, 2)
n_original_tokens = len(original_tokens)

# bind method outside of loop to reduce overhead
tokens_append = tokens.append
space_join = " ".join

for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
for n in range(min_phrase, min(max_n + 1, n_original_tokens + 1)):
for i in range(n_original_tokens - n + 1):
candidate_ngram = original_tokens[i: i + n]
hasdigit = False
for ngram in candidate_ngram:
if ngram.isdigit():
hasdigit = True
tokens_append(space_join(candidate_ngram))

ngram_stop_word_set = set(candidate_ngram) & WordAnalyzer.stemmed_stop_word_set_n
if len(ngram_stop_word_set) == 0 and not hasdigit:
tokens_append(space_join(candidate_ngram))
return ut.stop(tokens,WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)

return tokens
else:
return [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni]
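The corrected unigram handling is easier to see in isolation. Below is a hedged, self-contained re-implementation of the shape the analyzer now has — generate all unigrams and n-grams first, then filter them in one place — where the `stop` function is only an assumed stand-in for `scripts.utils.utils.stop`, whose exact rules are not shown in this diff.

```python
# Hedged, self-contained sketch (not the project's code): generate unigrams and
# n-grams first, then filter afterwards, mirroring the new analyzer + ut.stop split.
def generate_ngrams(tokens, ngram_range):
    min_n, max_n = ngram_range
    out = list(tokens) if min_n == 1 else []
    for n in range(max(min_n, 2), min(max_n, len(tokens)) + 1):
        out.extend(' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    return out

def stop(ngrams, stop_uni, stop_n):  # assumed stand-in for scripts.utils.utils.stop
    kept = []
    for term in ngrams:
        words = term.split()
        if len(words) == 1 and words[0] not in stop_uni and not words[0].isdigit():
            kept.append(term)                      # unigram: not a stop word, not a number
        elif len(words) > 1 and not set(words) & stop_n:
            kept.append(term)                      # phrase: contains no phrase stop word
    return kept

print(stop(generate_ngrams(['first', 'fuel', 'cell'], (1, 2)), {'first'}, {'first'}))
# -> ['fuel', 'cell', 'fuel cell']
```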
6 changes: 3 additions & 3 deletions scripts/tfidf_mask.py
@@ -7,7 +7,7 @@ def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8):
self.__feature_names = tfidf_obj.feature_names
self.__tfidf_mask = self.__tfidf_matrix.copy()
self.__tfidf_mask.data = np.ones(len(self.__tfidf_matrix.data))
self.__vectorizer = tfidf_obj.vectorizer
self.__vocabulary = tfidf_obj.vocabulary
self.__uni_factor = uni_factor
self.__idf = tfidf_obj.idf

@@ -88,8 +88,8 @@ def __unbias_ngrams(self, max_ngram_length):
ngram_minus_front = ' '.join(big_ngram_terms[1:])
ngram_minus_back = ' '.join(big_ngram_terms[:len(big_ngram_terms) - 1])

idx_ngram_minus_front = self.__vectorizer.vocabulary_.get(ngram_minus_front)
idx_ngram_minus_back = self.__vectorizer.vocabulary_.get(ngram_minus_back)
idx_ngram_minus_front = self.__vocabulary.get(ngram_minus_front)
idx_ngram_minus_back = self.__vocabulary.get(ngram_minus_back)

indices_slice = self.__tfidf_matrix.indices[start_idx_ptr:end_idx_ptr]
ngram_counts = self.__tfidf_matrix.data[j] / self.__idf[col_idx]
4 changes: 2 additions & 2 deletions scripts/tfidf_wrapper.py
@@ -34,8 +34,8 @@ def tfidf_matrix(self):
return self.__tfidf_matrix

@property
def vectorizer(self):
return self.__vectorizer
def vocabulary(self):
return self.__vectorizer.vocabulary_

@property
def feature_names(self):
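The last two diffs replace the exposed vectorizer with its term-to-column dictionary. For readers unfamiliar with that attribute, here is a standalone illustration using plain scikit-learn (not the project's TFIDF wrapper; the documents are invented):

```python
# Standalone illustration (plain scikit-learn, not the project's wrapper):
# vocabulary_ maps each n-gram to its column index in the TF-IDF matrix,
# which is all TfidfMask needs from the vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(['fuel cell stack', 'fuel cell module', 'battery pack'])

vocabulary = vectorizer.vocabulary_     # what tfidf_obj.vocabulary now returns
print(vocabulary.get('fuel cell'))      # column index of the bigram, or None if absent
```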