Release 2.0.1 (#234)
* Switched away from pyramid's auto_arima (replaced with a statsmodels ARIMA grid search) due to stability issues
* Now supports uncompressed pickle files (rather than just bzip2 compressed)
* Using Python 3.7.3
* Handles imported data when dates are stored as strings rather than Timestamp objects
* Corrected unigram handling
IanGrimstead authored Apr 5, 2019
1 parent 776dda3 commit 84f6905
Showing 25 changed files with 516 additions and 126 deletions.
24 changes: 13 additions & 11 deletions .travis.yml
@@ -2,17 +2,17 @@ language: python

matrix:
include:
# Use the built in venv for linux builds
- os: linux
sudo: required
python: "3.6.6"
dist: trusty
# Use the built in venv for linux builds
- os: linux
sudo: required
python: "3.7.3"
dist: xenial

# Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html
# Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html
# - os: osx
# language: generic
# env: PYTHON=3.6.6

before_install: |
if [ "$TRAVIS_OS_NAME" == "osx" ]; then
brew update
@@ -21,26 +21,26 @@ before_install: |
# See https://docs.travis-ci.com/user/osx-ci-environment/#A-note-on-upgrading-packages.
# I didn't do this above because it works and I'm lazy.
brew outdated pyenv || brew upgrade pyenv
# virtualenv doesn't work without pyenv knowledge. venv in Python 3.3
# doesn't provide Pip by default. So, use `pyenv-virtualenv <https://github.com/yyuu/pyenv-virtualenv/blob/master/README.md>`_.
brew install pyenv-virtualenv
pyenv install $PYTHON
# I would expect something like ``pyenv init; pyenv local $PYTHON`` or
# ``pyenv shell $PYTHON`` would work, but ``pyenv init`` doesn't seem to
# modify the Bash environment. ??? So, I hand-set the variables instead.
export PYENV_VERSION=$PYTHON
export PATH="/Users/travis/.pyenv/shims:${PATH}"
pyenv-virtualenv venv
source venv/bin/activate
# A manual check that the correct version of Python is running.
python --version
fi
export BOTO_CONFIG=/dev/null
install:
- python --version
- python -m pip install -U pip
@@ -53,6 +53,8 @@ install:
script:
# for codecov support
- pip install pytest pytest-cov
# to report installed packages
- pip freeze
# command to run tests
- pytest --cov-config .coveragerc --cov=./ tests/

2 changes: 1 addition & 1 deletion appveyor.yml
@@ -5,7 +5,7 @@ build: none
environment:
matrix:
- PYTHON: "C:\\Python36-x64"
PYTHON_VERSION: 3.6.6
PYTHON_VERSION: 3.7.3
PYTHON_ARCH: 64
init:
- ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%
1 change: 1 addition & 0 deletions config/stopwords_glob.txt
@@ -82,6 +82,7 @@ everybody
everyone
everything
everywhere
excess
f
few
find
3 changes: 2 additions & 1 deletion config/stopwords_n.txt
@@ -1,4 +1,5 @@
situation
consist
first
plurality
plurality
second
4 changes: 3 additions & 1 deletion config/stopwords_uni.txt
@@ -1 +1,3 @@
etc
etc
cover
adjacent
3 changes: 1 addition & 2 deletions pygrams.py
@@ -128,7 +128,6 @@ def get_args(command_line_arguments):

args = parser.parse_args(command_line_arguments)

args.path = 'data'
return args


@@ -165,7 +164,7 @@ def main(supplied_args):
pickled_tf_idf_file_name=pickled_tf_idf_path,
output_name=args.outputs_name, emerging_technology=args.emerging_technology)

pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=50)
pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report)

# emtech integration
if args.emerging_technology:
77 changes: 63 additions & 14 deletions scripts/algorithms/arima.py
@@ -1,27 +1,76 @@
import warnings

import numpy as np
from numpy import clip, inf
from pyramid.arima import auto_arima
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA


class ARIMAForecast(object):

def __init__(self, data_in, num_prediction_periods):
if not all(isinstance(x, float) for x in data_in):
raise ValueError('Time series must be all float values')
def __evaluate_models(self, dataset, p_values, d_values, q_values):
dataset=np.array(dataset)
dataset = dataset.astype('float32')
best_score, best_cfg = float("inf"), None
for p in p_values:
for d in d_values:
for q in q_values:
order = (p, d, q)
try:
mse = self.__evaluate_arima_model(dataset, order, ground_truth_in_history=True)
if mse < best_score:
best_score = mse
best_cfg = order
except:
continue
return best_cfg, best_score

def __evaluate_arima_model(self, X, arima_order, ground_truth_in_history=False):

train_ratio = 0.8
train_size = int(len(X) * train_ratio)
train, test = X[0:train_size], X[train_size:]
history = [x for x in train]
predictions = list()

self.__history = data_in
self.__num_prediction_periods = num_prediction_periods
for t in range(len(test)):
model = ARIMA(history, order=arima_order)
model_fit = model.fit(disp=0, maxiter=200)
yhat = model_fit.forecast()[0][0]
predictions.append(yhat)
history.append(test[t] if ground_truth_in_history else yhat)
error = mean_squared_error(test, predictions)
return error

self.__stepwise_model = auto_arima(
data_in,
seasonal=False,
error_action='ignore', suppress_warnings=True, stepwise=True
)
def __arima_model_predict(self, X, arima_order, steps_ahead):
# make predictions
predictions = list()
try:
for t in range(steps_ahead):
model = ARIMA(X, order=arima_order)
model_fit = model.fit(disp=0)
yhat = model_fit.forecast()[0][0]
predictions.append(yhat)
X = np.append(X, yhat)
except:
predictions.extend([np.nan] * (steps_ahead - len(predictions)))

return predictions

def __init__(self, data_in, num_prediction_periods ):
if not all(isinstance(x, float) for x in data_in):
raise ValueError('Time series must be all float values')

self.__stepwise_model.fit(data_in)
p_values = [0, 1, 2, 4, 6]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
self.__order, score = self.__evaluate_models(data_in, p_values, d_values, q_values)
self.__predictions = self.__arima_model_predict(data_in, self.__order, num_prediction_periods)

@property
def configuration(self):
return self.__stepwise_model.order
return self.__order

def predict_counts(self):
return clip(self.__stepwise_model.predict(n_periods=self.__num_prediction_periods), 0, inf)
return clip(self.__predictions, 0, inf)
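For context, a minimal usage sketch of the replacement forecaster follows. This is a hedged illustration: the input series and the three-period horizon are invented, and only `ARIMAForecast`, `configuration` and `predict_counts` come from the diff above.

```python
# Hedged usage sketch (invented data): fit the grid-searched ARIMA forecaster on a
# short float series and request the next few periods. Expect this to be slow, since
# __init__ fits many candidate (p, d, q) orders before choosing the best one.
from scripts.algorithms.arima import ARIMAForecast

term_counts = [2.0, 3.0, 5.0, 4.0, 6.0, 8.0, 7.0, 9.0, 11.0, 10.0, 12.0, 13.0]

forecast = ARIMAForecast(term_counts, num_prediction_periods=3)

print(forecast.configuration)     # best (p, d, q) order found by the grid search
print(forecast.predict_counts())  # three forecast values, clipped to be non-negative
```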
2 changes: 1 addition & 1 deletion scripts/data_factory.py
@@ -10,7 +10,7 @@ def get(doc_source_file_name):
if not os.path.isfile(doc_source_file_name):
raise PygramsException('file: ' + doc_source_file_name + ' does not exist in data folder')

if doc_source_file_name.endswith('.pkl.bz2'):
if doc_source_file_name.endswith('.pkl.bz2') or doc_source_file_name.endswith('.pkl'):
return read_pickle(doc_source_file_name)
elif doc_source_file_name.endswith('.xls'):
return read_excel(doc_source_file_name)
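A hedged sketch of what the loosened extension check allows; the file names below are placeholders, and `pandas.read_pickle` is relied on to infer compression from the suffix.

```python
# Hedged sketch (placeholder file names): the same factory call now accepts both
# compressed and uncompressed document pickles; read_pickle infers the compression.
import scripts.data_factory as datafactory

df_bz2 = datafactory.get('data/my_docs.pkl.bz2')  # bzip2-compressed pickle, as before
df_raw = datafactory.get('data/my_docs.pkl')      # uncompressed pickle, new in 2.0.1
```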
35 changes: 29 additions & 6 deletions scripts/pipeline.py
@@ -2,7 +2,8 @@
import pickle
from os import makedirs, path

from pandas import read_pickle
from pandas import read_pickle, to_datetime
from pandas.api.types import is_string_dtype
from tqdm import tqdm

import scripts.data_factory as datafactory
@@ -11,7 +12,7 @@
from scripts.documents_filter import DocumentsFilter
from scripts.documents_weights import DocumentsWeights
from scripts.filter_terms import FilterTerms
from scripts.text_processing import LemmaTokenizer
from scripts.text_processing import LemmaTokenizer, WordAnalyzer, lowercase_strip_accents_and_ownership
from scripts.tfidf_mask import TfidfMask
from scripts.tfidf_reduce import TfidfReduce
from scripts.tfidf_wrapper import TFIDF
@@ -21,14 +22,24 @@
from scripts.vandv.predictor import evaluate_prediction


def checkdf( df, emtec, docs_mask_dict, text_header):
def checkdf(df, emtec, docs_mask_dict, text_header, term_counts):
app_exit = False

if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None:
if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None or term_counts:
if docs_mask_dict['date_header'] not in df.columns:
print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
app_exit = True

if docs_mask_dict['date_header'] is not None:
if is_string_dtype(df[docs_mask_dict['date_header']]):
df[docs_mask_dict['date_header']] = to_datetime(df[docs_mask_dict['date_header']])

min_date = min(df[docs_mask_dict['date_header']])
max_date = max(df[docs_mask_dict['date_header']])
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
else:
print('Document dates not specified')

if text_header not in df.columns:
print(f"text_header '{text_header}' not in dataframe")
app_exit = True
@@ -61,7 +72,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
if pickled_tf_idf_file_name is None:

self.__dataframe = datafactory.get(data_filename)
checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header)
checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)

remove_empty_documents(self.__dataframe, text_header)
self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range,
Expand All @@ -70,7 +81,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
self.__text_lengths = self.__dataframe[text_header].map(len).tolist()
self.__dataframe.drop(columns=[text_header], inplace=True)

tfidf_filename = path.join('outputs', 'tfidf', output_name + '-tfidf.pkl.bz2')
tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
makedirs(path.dirname(tfidf_filename), exist_ok=True)
with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
pickle.dump(
@@ -81,6 +92,17 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
else:
print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}')
self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name)
if docs_mask_dict['date_header'] is None:
print('Document dates not specified')
else:
min_date = min(self.__dataframe[docs_mask_dict['date_header']])
max_date = max(self.__dataframe[docs_mask_dict['date_header']])
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')

WordAnalyzer.init(
tokenizer=LemmaTokenizer(),
preprocess=lowercase_strip_accents_and_ownership,
ngram_range=ngram_range)

# todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
# the original. We're really just filtering down.
@@ -140,6 +162,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
docs_mask_dict['date_header'])
# if other outputs
self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)

# todo: no output method; just if statements to call output functions...?
# Only supply what they each directly require
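The date handling added to `checkdf` reduces to the conversion below; a self-contained sketch with invented data (the column name `publication_date` is only an example):

```python
# Self-contained sketch of the string-date handling (invented data): if the date
# column arrives as strings, convert it to Timestamps before reporting the range.
from pandas import DataFrame, to_datetime
from pandas.api.types import is_string_dtype

df = DataFrame({'publication_date': ['2017-01-31', '2018-06-15', '2019-03-02']})

if is_string_dtype(df['publication_date']):
    df['publication_date'] = to_datetime(df['publication_date'])

min_date, max_date = df['publication_date'].min(), df['publication_date'].max()
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
```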
34 changes: 10 additions & 24 deletions scripts/text_processing.py
@@ -31,6 +31,7 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
"""
import scripts.utils.utils as ut
import string

from nltk import word_tokenize, PorterStemmer, pos_tag
@@ -86,6 +87,7 @@ class WordAnalyzer(object):
stemmed_stop_word_set_n = None
stemmed_stop_word_set_uni = None


@staticmethod
def init(tokenizer, preprocess, ngram_range):
WordAnalyzer.tokenizer = tokenizer
@@ -110,39 +112,23 @@ def init(tokenizer, preprocess, ngram_range):
def analyzer(doc):
"""based on VectorizerMixin._word_ngrams in sklearn/feature_extraction/text.py,
from scikit-learn; extended to prevent generation of n-grams containing stop words"""
tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc))

# handle token n-grams
min_n, max_n = WordAnalyzer.ngram_range
if max_n != 1:
original_tokens = tokens
if min_n == 1:
# no need to do any slicing for unigrams
# just iterate through the original tokens
tokens = [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni and not w.isdigit()]
# tokens = list(original_tokens)
min_n += 1
else:
tokens = []
original_tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc))
tokens = original_tokens if min_n == 1 else []

# handle token n-grams
if max_n > 1:
min_phrase = max(min_n, 2)
n_original_tokens = len(original_tokens)

# bind method outside of loop to reduce overhead
tokens_append = tokens.append
space_join = " ".join

for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
for n in range(min_phrase, min(max_n + 1, n_original_tokens + 1)):
for i in range(n_original_tokens - n + 1):
candidate_ngram = original_tokens[i: i + n]
hasdigit = False
for ngram in candidate_ngram:
if ngram.isdigit():
hasdigit = True
tokens_append(space_join(candidate_ngram))

ngram_stop_word_set = set(candidate_ngram) & WordAnalyzer.stemmed_stop_word_set_n
if len(ngram_stop_word_set) == 0 and not hasdigit:
tokens_append(space_join(candidate_ngram))
return ut.stop(tokens,WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)

return tokens
else:
return [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni]
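The corrected unigram handling is easier to see in isolation. Below is a hedged, self-contained re-implementation of the shape the analyzer now has — generate all unigrams and n-grams first, then filter them in one place — where the `stop` function is only an assumed stand-in for `scripts.utils.utils.stop`, whose exact rules are not shown in this diff.

```python
# Hedged, self-contained sketch (not the project's code): generate unigrams and
# n-grams first, then filter afterwards, mirroring the new analyzer + ut.stop split.
def generate_ngrams(tokens, ngram_range):
    min_n, max_n = ngram_range
    out = list(tokens) if min_n == 1 else []
    for n in range(max(min_n, 2), min(max_n, len(tokens)) + 1):
        out.extend(' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    return out

def stop(ngrams, stop_uni, stop_n):  # assumed stand-in for scripts.utils.utils.stop
    kept = []
    for term in ngrams:
        words = term.split()
        if len(words) == 1 and words[0] not in stop_uni and not words[0].isdigit():
            kept.append(term)                      # unigram: not a stop word, not a number
        elif len(words) > 1 and not set(words) & stop_n:
            kept.append(term)                      # phrase: contains no phrase stop word
    return kept

print(stop(generate_ngrams(['first', 'fuel', 'cell'], (1, 2)), {'first'}, {'first'}))
# -> ['fuel', 'cell', 'fuel cell']
```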
6 changes: 3 additions & 3 deletions scripts/tfidf_mask.py
@@ -7,7 +7,7 @@ def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8):
self.__feature_names = tfidf_obj.feature_names
self.__tfidf_mask = self.__tfidf_matrix.copy()
self.__tfidf_mask.data = np.ones(len(self.__tfidf_matrix.data))
self.__vectorizer = tfidf_obj.vectorizer
self.__vocabulary = tfidf_obj.vocabulary
self.__uni_factor = uni_factor
self.__idf = tfidf_obj.idf

@@ -88,8 +88,8 @@ def __unbias_ngrams(self, max_ngram_length):
ngram_minus_front = ' '.join(big_ngram_terms[1:])
ngram_minus_back = ' '.join(big_ngram_terms[:len(big_ngram_terms) - 1])

idx_ngram_minus_front = self.__vectorizer.vocabulary_.get(ngram_minus_front)
idx_ngram_minus_back = self.__vectorizer.vocabulary_.get(ngram_minus_back)
idx_ngram_minus_front = self.__vocabulary.get(ngram_minus_front)
idx_ngram_minus_back = self.__vocabulary.get(ngram_minus_back)

indices_slice = self.__tfidf_matrix.indices[start_idx_ptr:end_idx_ptr]
ngram_counts = self.__tfidf_matrix.data[j] / self.__idf[col_idx]
4 changes: 2 additions & 2 deletions scripts/tfidf_wrapper.py
@@ -34,8 +34,8 @@ def tfidf_matrix(self):
return self.__tfidf_matrix

@property
def vectorizer(self):
return self.__vectorizer
def vocabulary(self):
return self.__vectorizer.vocabulary_

@property
def feature_names(self):
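The last two diffs replace the exposed vectorizer with its term-to-column dictionary. For readers unfamiliar with that attribute, here is a standalone illustration using plain scikit-learn (not the project's TFIDF wrapper; the documents are invented):

```python
# Standalone illustration (plain scikit-learn, not the project's wrapper):
# vocabulary_ maps each n-gram to its column index in the TF-IDF matrix,
# which is all TfidfMask needs from the vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(['fuel cell stack', 'fuel cell module', 'battery pack'])

vocabulary = vectorizer.vocabulary_     # what tfidf_obj.vocabulary now returns
print(vocabulary.get('fuel cell'))      # column index of the bigram, or None if absent
```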