diff --git a/.travis.yml b/.travis.yml index c6aa177..f113baa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,17 +2,17 @@ language: python matrix: include: - # Use the built in venv for linux builds - - os: linux - sudo: required - python: "3.6.6" - dist: trusty + # Use the built in venv for linux builds + - os: linux + sudo: required + python: "3.7.3" + dist: xenial - # Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html + # Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html # - os: osx # language: generic # env: PYTHON=3.6.6 - + before_install: | if [ "$TRAVIS_OS_NAME" == "osx" ]; then brew update @@ -21,12 +21,12 @@ before_install: | # See https://docs.travis-ci.com/user/osx-ci-environment/#A-note-on-upgrading-packages. # I didn't do this above because it works and I'm lazy. brew outdated pyenv || brew upgrade pyenv - + # virtualenv doesn't work without pyenv knowledge. venv in Python 3.3 # doesn't provide Pip by default. So, use `pyenv-virtualenv `_. brew install pyenv-virtualenv pyenv install $PYTHON - + # I would expect something like ``pyenv init; pyenv local $PYTHON`` or # ``pyenv shell $PYTHON`` would work, but ``pyenv init`` doesn't seem to # modify the Bash environment. ??? So, I hand-set the variables instead. @@ -34,13 +34,13 @@ before_install: | export PATH="/Users/travis/.pyenv/shims:${PATH}" pyenv-virtualenv venv source venv/bin/activate - + # A manual check that the correct version of Python is running. python --version fi export BOTO_CONFIG=/dev/null - + install: - python --version - python -m pip install -U pip @@ -53,6 +53,8 @@ install: script: # for codecov support - pip install pytest pytest-cov + # to report installed packages + - pip freeze # command to run tests - pytest --cov-config .coveragerc --cov=./ tests/ diff --git a/appveyor.yml b/appveyor.yml index 7d909f2..e0b4f07 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -5,7 +5,7 @@ build: none environment: matrix: - PYTHON: "C:\\Python36-x64" - PYTHON_VERSION: 3.6.6 + PYTHON_VERSION: 3.7.3 PYTHON_ARCH: 64 init: - ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH% diff --git a/config/stopwords_glob.txt b/config/stopwords_glob.txt index d05b03a..cf4509a 100644 --- a/config/stopwords_glob.txt +++ b/config/stopwords_glob.txt @@ -82,6 +82,7 @@ everybody everyone everything everywhere +excess f few find diff --git a/config/stopwords_n.txt b/config/stopwords_n.txt index d5a921b..1e34355 100644 --- a/config/stopwords_n.txt +++ b/config/stopwords_n.txt @@ -1,4 +1,5 @@ situation consist first -plurality \ No newline at end of file +plurality +second \ No newline at end of file diff --git a/config/stopwords_uni.txt b/config/stopwords_uni.txt index dd7999b..a1d7522 100644 --- a/config/stopwords_uni.txt +++ b/config/stopwords_uni.txt @@ -1 +1,3 @@ -etc \ No newline at end of file +etc +cover +adjacent \ No newline at end of file diff --git a/pygrams.py b/pygrams.py index fb0b198..f9fa108 100644 --- a/pygrams.py +++ b/pygrams.py @@ -128,7 +128,6 @@ def get_args(command_line_arguments): args = parser.parse_args(command_line_arguments) - args.path = 'data' return args @@ -165,7 +164,7 @@ def main(supplied_args): pickled_tf_idf_file_name=pickled_tf_idf_path, output_name=args.outputs_name, emerging_technology=args.emerging_technology) - pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=50) + pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report) 
     # emtech integration
     if args.emerging_technology:
diff --git a/scripts/algorithms/arima.py b/scripts/algorithms/arima.py
index 1c621c6..39334dc 100644
--- a/scripts/algorithms/arima.py
+++ b/scripts/algorithms/arima.py
@@ -1,27 +1,76 @@
+import warnings
+
+import numpy as np
 from numpy import clip, inf
-from pyramid.arima import auto_arima
+from sklearn.metrics import mean_squared_error
+from statsmodels.tsa.arima_model import ARIMA
 
 
 class ARIMAForecast(object):
 
-    def __init__(self, data_in, num_prediction_periods):
-        if not all(isinstance(x, float) for x in data_in):
-            raise ValueError('Time series must be all float values')
+    def __evaluate_models(self, dataset, p_values, d_values, q_values):
+        dataset=np.array(dataset)
+        dataset = dataset.astype('float32')
+        best_score, best_cfg = float("inf"), None
+        for p in p_values:
+            for d in d_values:
+                for q in q_values:
+                    order = (p, d, q)
+                    try:
+                        mse = self.__evaluate_arima_model(dataset, order, ground_truth_in_history=True)
+                        if mse < best_score:
+                            best_score = mse
+                            best_cfg = order
+                    except:
+                        continue
+        return best_cfg, best_score
+
+    def __evaluate_arima_model(self, X, arima_order, ground_truth_in_history=False):
+
+        train_ratio = 0.8
+        train_size = int(len(X) * train_ratio)
+        train, test = X[0:train_size], X[train_size:]
+        history = [x for x in train]
+        predictions = list()
 
-        self.__history = data_in
-        self.__num_prediction_periods = num_prediction_periods
+        for t in range(len(test)):
+            model = ARIMA(history, order=arima_order)
+            model_fit = model.fit(disp=0, maxiter=200)
+            yhat = model_fit.forecast()[0][0]
+            predictions.append(yhat)
+            history.append(test[t] if ground_truth_in_history else yhat)
+        error = mean_squared_error(test, predictions)
+        return error
 
-        self.__stepwise_model = auto_arima(
-            data_in,
-            seasonal=False,
-            error_action='ignore', suppress_warnings=True, stepwise=True
-        )
+    def __arima_model_predict(self, X, arima_order, steps_ahead):
+        # make predictions
+        predictions = list()
+        try:
+            for t in range(steps_ahead):
+                model = ARIMA(X, order=arima_order)
+                model_fit = model.fit(disp=0)
+                yhat = model_fit.forecast()[0][0]
+                predictions.append(yhat)
+                X = np.append(X, yhat)
+        except:
+            predictions.extend([np.nan] * (steps_ahead - len(predictions)))
+
+        return predictions
+
+    def __init__(self, data_in, num_prediction_periods ):
+        if not all(isinstance(x, float) for x in data_in):
+            raise ValueError('Time series must be all float values')
 
-        self.__stepwise_model.fit(data_in)
+        p_values = [0, 1, 2, 4, 6]
+        d_values = range(0, 3)
+        q_values = range(0, 3)
+        warnings.filterwarnings("ignore")
+        self.__order, score = self.__evaluate_models(data_in, p_values, d_values, q_values)
+        self.__predictions = self.__arima_model_predict(data_in, self.__order, num_prediction_periods)
 
     @property
     def configuration(self):
-        return self.__stepwise_model.order
+        return self.__order
 
     def predict_counts(self):
-        return clip(self.__stepwise_model.predict(n_periods=self.__num_prediction_periods), 0, inf)
+        return clip(self.__predictions, 0, inf)
diff --git a/scripts/data_factory.py b/scripts/data_factory.py
index 9fb00c9..e70fb39 100644
--- a/scripts/data_factory.py
+++ b/scripts/data_factory.py
@@ -10,7 +10,7 @@ def get(doc_source_file_name):
     if not os.path.isfile(doc_source_file_name):
         raise PygramsException('file: ' + doc_source_file_name + ' does not exist in data folder')
 
-    if doc_source_file_name.endswith('.pkl.bz2'):
+    if doc_source_file_name.endswith('.pkl.bz2') or doc_source_file_name.endswith('.pkl'):
        return read_pickle(doc_source_file_name)
elif doc_source_file_name.endswith('.xls'): return read_excel(doc_source_file_name) diff --git a/scripts/pipeline.py b/scripts/pipeline.py index 11e0a2b..495ad79 100644 --- a/scripts/pipeline.py +++ b/scripts/pipeline.py @@ -2,7 +2,8 @@ import pickle from os import makedirs, path -from pandas import read_pickle +from pandas import read_pickle, to_datetime +from pandas.api.types import is_string_dtype from tqdm import tqdm import scripts.data_factory as datafactory @@ -11,7 +12,7 @@ from scripts.documents_filter import DocumentsFilter from scripts.documents_weights import DocumentsWeights from scripts.filter_terms import FilterTerms -from scripts.text_processing import LemmaTokenizer +from scripts.text_processing import LemmaTokenizer, WordAnalyzer, lowercase_strip_accents_and_ownership from scripts.tfidf_mask import TfidfMask from scripts.tfidf_reduce import TfidfReduce from scripts.tfidf_wrapper import TFIDF @@ -21,14 +22,24 @@ from scripts.vandv.predictor import evaluate_prediction -def checkdf( df, emtec, docs_mask_dict, text_header): +def checkdf(df, emtec, docs_mask_dict, text_header, term_counts): app_exit = False - if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None: + if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None or term_counts: if docs_mask_dict['date_header'] not in df.columns: print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe") app_exit = True + if docs_mask_dict['date_header'] is not None: + if is_string_dtype(df[docs_mask_dict['date_header']]): + df[docs_mask_dict['date_header']] = to_datetime(df[docs_mask_dict['date_header']]) + + min_date = min(df[docs_mask_dict['date_header']]) + max_date = max(df[docs_mask_dict['date_header']]) + print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}') + else: + print('Document dates not specified') + if text_header not in df.columns: print(f"text_header '{text_header}' not in dataframe") app_exit = True @@ -61,7 +72,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range if pickled_tf_idf_file_name is None: self.__dataframe = datafactory.get(data_filename) - checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header) + checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts) remove_empty_documents(self.__dataframe, text_header) self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range, @@ -70,7 +81,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range self.__text_lengths = self.__dataframe[text_header].map(len).tolist() self.__dataframe.drop(columns=[text_header], inplace=True) - tfidf_filename = path.join('outputs', 'tfidf', output_name + '-tfidf.pkl.bz2') + tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2') makedirs(path.dirname(tfidf_filename), exist_ok=True) with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file: pickle.dump( @@ -81,6 +92,17 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range else: print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}') self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name) + if docs_mask_dict['date_header'] is None: + print('Document dates not specified') + else: + min_date = min(self.__dataframe[docs_mask_dict['date_header']]) + max_date = max(self.__dataframe[docs_mask_dict['date_header']]) + print(f'Document dates range from 
{min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}') + + WordAnalyzer.init( + tokenizer=LemmaTokenizer(), + preprocess=lowercase_strip_accents_and_ownership, + ngram_range=ngram_range) # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep # the original. We're really just filtering down. @@ -140,6 +162,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range docs_mask_dict['date_header']) # if other outputs self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method) + self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n) # todo: no output method; just if statements to call output functions...? # Only supply what they each directly require diff --git a/scripts/text_processing.py b/scripts/text_processing.py index 9183431..a5b76e3 100644 --- a/scripts/text_processing.py +++ b/scripts/text_processing.py @@ -31,6 +31,7 @@ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ +import scripts.utils.utils as ut import string from nltk import word_tokenize, PorterStemmer, pos_tag @@ -86,6 +87,7 @@ class WordAnalyzer(object): stemmed_stop_word_set_n = None stemmed_stop_word_set_uni = None + @staticmethod def init(tokenizer, preprocess, ngram_range): WordAnalyzer.tokenizer = tokenizer @@ -110,39 +112,23 @@ def init(tokenizer, preprocess, ngram_range): def analyzer(doc): """based on VectorizerMixin._word_ngrams in sklearn/feature_extraction/text.py, from scikit-learn; extended to prevent generation of n-grams containing stop words""" - tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc)) - - # handle token n-grams min_n, max_n = WordAnalyzer.ngram_range - if max_n != 1: - original_tokens = tokens - if min_n == 1: - # no need to do any slicing for unigrams - # just iterate through the original tokens - tokens = [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni and not w.isdigit()] - # tokens = list(original_tokens) - min_n += 1 - else: - tokens = [] + original_tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc)) + tokens = original_tokens if min_n == 1 else [] + # handle token n-grams + if max_n > 1: + min_phrase = max(min_n, 2) n_original_tokens = len(original_tokens) # bind method outside of loop to reduce overhead tokens_append = tokens.append space_join = " ".join - for n in range(min_n, min(max_n + 1, n_original_tokens + 1)): + for n in range(min_phrase, min(max_n + 1, n_original_tokens + 1)): for i in range(n_original_tokens - n + 1): candidate_ngram = original_tokens[i: i + n] - hasdigit = False - for ngram in candidate_ngram: - if ngram.isdigit(): - hasdigit = True + tokens_append(space_join(candidate_ngram)) - ngram_stop_word_set = set(candidate_ngram) & WordAnalyzer.stemmed_stop_word_set_n - if len(ngram_stop_word_set) == 0 and not hasdigit: - tokens_append(space_join(candidate_ngram)) + return ut.stop(tokens,WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n) - return tokens - else: - return [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni] diff --git a/scripts/tfidf_mask.py b/scripts/tfidf_mask.py index 8de5f0c..4451d5a 100644 --- a/scripts/tfidf_mask.py +++ b/scripts/tfidf_mask.py @@ -7,7 +7,7 @@ def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8): self.__feature_names = tfidf_obj.feature_names self.__tfidf_mask = self.__tfidf_matrix.copy() self.__tfidf_mask.data = 
np.ones(len(self.__tfidf_matrix.data)) - self.__vectorizer = tfidf_obj.vectorizer + self.__vocabulary = tfidf_obj.vocabulary self.__uni_factor = uni_factor self.__idf = tfidf_obj.idf @@ -88,8 +88,8 @@ def __unbias_ngrams(self, max_ngram_length): ngram_minus_front = ' '.join(big_ngram_terms[1:]) ngram_minus_back = ' '.join(big_ngram_terms[:len(big_ngram_terms) - 1]) - idx_ngram_minus_front = self.__vectorizer.vocabulary_.get(ngram_minus_front) - idx_ngram_minus_back = self.__vectorizer.vocabulary_.get(ngram_minus_back) + idx_ngram_minus_front = self.__vocabulary.get(ngram_minus_front) + idx_ngram_minus_back = self.__vocabulary.get(ngram_minus_back) indices_slice = self.__tfidf_matrix.indices[start_idx_ptr:end_idx_ptr] ngram_counts = self.__tfidf_matrix.data[j] / self.__idf[col_idx] diff --git a/scripts/tfidf_wrapper.py b/scripts/tfidf_wrapper.py index acc63c9..2c93338 100644 --- a/scripts/tfidf_wrapper.py +++ b/scripts/tfidf_wrapper.py @@ -34,8 +34,8 @@ def tfidf_matrix(self): return self.__tfidf_matrix @property - def vectorizer(self): - return self.__vectorizer + def vocabulary(self): + return self.__vectorizer.vocabulary_ @property def feature_names(self): diff --git a/scripts/utils/argschecker.py b/scripts/utils/argschecker.py index 42fbcec..daf71e7 100644 --- a/scripts/utils/argschecker.py +++ b/scripts/utils/argschecker.py @@ -14,7 +14,8 @@ def __init__(self, args, args_default): def checkargs(self): app_exit = False - if path.isfile(path.join(self.args.path, self.args.doc_source)) is False: + doc_path = path.join(self.args.path, self.args.doc_source) + if path.isfile(doc_path) is False: print(f"File {self.args.doc_source} in path {self.args.path} not found") app_exit = True @@ -71,11 +72,6 @@ def checkargs(self): '[-o] "wordcloud"') app_exit = True - if self.args.num_ngrams_report != self.args_default.num_ngrams_report: - if 'report' not in self.args.output: - print('arguments [-np] can only be used when output includes report [-o] "report"') - app_exit = True - if self.args.num_ngrams_fdg != self.args_default.num_ngrams_fdg: if 'fdg' not in self.args.output: print('argument [-nf] can only be used when output includes fdg [-o] "fdg"') diff --git a/scripts/utils/reduce_existing_data_frame.py b/scripts/utils/reduce_existing_data_frame.py index a04b96d..157259b 100644 --- a/scripts/utils/reduce_existing_data_frame.py +++ b/scripts/utils/reduce_existing_data_frame.py @@ -92,7 +92,7 @@ def main(): subset_size=args.size, fraction=args.fraction, date_range=date_range, date_column_name=args.date_column_name) - print(f'After filtering: {data_frame.shape[0]} rows in data frame') + print(f'After filtering: {data_frame.shape[0]:,} rows in data frame') print(f'Writing sub-sampled data frame in pickle {pickle_file_name}...') data_frame.to_pickle(pickle_file_name) print(f'...written sub-sampled data frame in pickle {pickle_file_name}') diff --git a/scripts/utils/utils.py b/scripts/utils/utils.py index 079847e..9537bbc 100644 --- a/scripts/utils/utils.py +++ b/scripts/utils/utils.py @@ -129,3 +129,39 @@ def normalize(ydata): return np.asarray([(_y - miny) / diff for _y in ydata]) + +def stop(tokensin, unigrams, ngrams, digits=True): + new_tokens=[] + for token in tokensin: + ngram = token.split() + if len(ngram)==1: + if ngram[0] not in unigrams and not ngram[0].isdigit(): + new_tokens.append(token) + else: + word_in_ngrams=False + for word in ngram: + if word in ngrams or (digits and word.isdigit()): + word_in_ngrams=True + break + if not word_in_ngrams: + new_tokens.append(token) + return 
new_tokens + + +def stop_tup(tuples, unigrams, ngrams, digits=True): + new_tuples=[] + for tuple in tuples: + token = tuple[1] + ngram = token.split() + if len(ngram)==1: + if ngram[0] not in unigrams and not ngram[0].isdigit(): + new_tuples.append(tuple) + else: + word_in_ngrams=False + for word in ngram: + if word in ngrams or (digits and word.isdigit()): + word_in_ngrams=True + break + if not word_in_ngrams: + new_tuples.append(tuple) + return new_tuples \ No newline at end of file diff --git a/setup.py b/setup.py index de94875..73da624 100644 --- a/setup.py +++ b/setup.py @@ -53,8 +53,9 @@ def setup_package(): 'License :: MIT License', 'Programming Language :: Python :: 3.6', ], - install_requires=['matplotlib', 'numpy', 'scipy', 'wordcloud', 'pandas', 'tqdm', 'nltk', 'scikit-learn', 'xlrd', - 'python-Levenshtein', 'gensim', 'pyramid-arima>=0.9.0', 'keras', 'tensorflow', 'keras_tqdm', + + install_requires=['matplotlib', 'numpy', 'scipy>=1.2.1', 'wordcloud', 'pandas', 'tqdm', 'nltk', 'scikit-learn', + 'xlrd','python-Levenshtein', 'gensim', 'statsmodels', 'keras', 'tensorflow', 'keras_tqdm', 'patsy', 'humanfriendly', 'psutil', 'jinja2'], # extras_require={'dev': ['check-manifest'],'test': ['coverage'],}, python_requires='>=3.6', diff --git a/tests/algorithms/test_arima.py b/tests/algorithms/test_arima.py index 04769bf..5a5f4e9 100644 --- a/tests/algorithms/test_arima.py +++ b/tests/algorithms/test_arima.py @@ -9,6 +9,15 @@ from scripts.algorithms.arima import ARIMAForecast +import platform; print(platform.platform()) +import sys; print("Python", sys.version) +import os +import pandas as pd +import numpy as np; print("NumPy", np.__version__) +import scipy; print("SciPy", scipy.__version__) +import sklearn; print("Scikit-Learn", sklearn.__version__) +import statsmodels; print("Statsmodels", statsmodels.__version__) + class ArimaTests(unittest.TestCase): @@ -31,12 +40,44 @@ def test_static_sequence(self): np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4) - def test_linearly_increasing_sequence(self): - time_series = [8.9, 11.0, 13.0, 15.1, 17.0, 18.9, 21.0] - num_predicted_periods = 4 - expected_prediction = [23.0, 25.0, 27.0, 29.0] + def test_linear_sequence(self): + time_series = [1.0, 2.0, 3.0, 4.0, 5.0] + num_predicted_periods = 3 + expected_prediction = [6.0, 7.0, 8.0] + arima = ARIMAForecast(time_series, num_predicted_periods) + + actual_prediction = arima.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4) + + def test_flakey_sequence(self): + time_series = [20.0, -20.0] + num_predicted_periods = 3 + expected_prediction = [np.nan] * 3 arima = ARIMAForecast(time_series, num_predicted_periods) actual_prediction = arima.predict_counts() np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=1) + + def test_linearly_increasing_sequence_fuel_cell(self): + time_series = pd.read_csv(os.path.join('tests','data', 'fuel_cell_quarterly.csv')).values.tolist() + time_series = [item for sublist in time_series for item in sublist] + num_predicted_periods = 4 + expected_prediction = [333., 333., 334., 335.] 
+ arima = ARIMAForecast(np.array(time_series).astype(float), num_predicted_periods) + + actual_prediction = arima.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=0) + + def test_linearly_decreasing_sequence_image_data(self): + time_series = pd.read_csv(os.path.join('tests','data', 'image_data_quarterly.csv')).values.tolist() + time_series = [item for sublist in time_series for item in sublist] + num_predicted_periods = 4 + expected_prediction = [562., 561., 558., 556.] + arima = ARIMAForecast(np.array(time_series).astype(float), num_predicted_periods) + + actual_prediction = arima.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=0) diff --git a/tests/data/fuel_cell_quarterly.csv b/tests/data/fuel_cell_quarterly.csv new file mode 100644 index 0000000..91f5489 --- /dev/null +++ b/tests/data/fuel_cell_quarterly.csv @@ -0,0 +1,52 @@ +323 +340 +296 +217 +265 +337 +326 +284 +276 +294 +252 +253 +264 +190 +262 +256 +264 +232 +211 +222 +235 +219 +273 +292 +330 +268 +260 +263 +277 +309 +282 +316 +348 +314 +314 +317 +350 +368 +375 +321 +413 +395 +368 +330 +407 +316 +349 +377 +320 +334 +340 +317 diff --git a/tests/data/image_data_quarterly.csv b/tests/data/image_data_quarterly.csv new file mode 100644 index 0000000..5e2d656 --- /dev/null +++ b/tests/data/image_data_quarterly.csv @@ -0,0 +1,52 @@ +190 +257 +186 +253 +275 +344 +296 +322 +273 +291 +253 +293 +285 +251 +349 +288 +330 +297 +341 +302 +349 +357 +427 +434 +409 +436 +430 +408 +474 +486 +517 +551 +575 +621 +618 +627 +560 +663 +630 +565 +661 +690 +685 +577 +623 +516 +639 +544 +538 +547 +564 +569 diff --git a/tests/test_filter_terms.py b/tests/test_filter_terms.py index 0cea76b..829f581 100644 --- a/tests/test_filter_terms.py +++ b/tests/test_filter_terms.py @@ -17,8 +17,26 @@ def setUp(self): def test_embeddings_filter_binary(self): user_queries = ['pharmacy', 'health', 'chemist'] - weights_vec_expected = [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0] + weights_vec_expected = [1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0] weights_vec_actual = FilterTerms(self.feature_names, user_queries, threshold=0.8).ngram_weights_vec[410:430] self.assertListEqual(weights_vec_expected, weights_vec_actual) @@ -28,7 +46,6 @@ def test_embeddings_filter_cosine_dist(self): user_queries = ['pharmacy', 'health', 'chemist'] weights_vec_actual = FilterTerms(self.feature_names, user_queries).ngram_weights_vec[410:430] weights_vec_expected = [0.5728331683597565, - 0.5728331683597565, 0.5728331683597565, 0.023525821108745026, 0.551300224350135, @@ -46,7 +63,8 @@ def test_embeddings_filter_cosine_dist(self): 0.47060086220739433, -0.10829696922978878, 0.19429777744446344, - 0.19429777744446344] + 0.19429777744446344, + 0.47456806019549364] assert_list_almost_equal(self, weights_vec_expected, weights_vec_actual) diff --git a/tests/test_pygrams.py b/tests/test_pygrams.py index 91e748a..91a3401 100644 --- a/tests/test_pygrams.py +++ b/tests/test_pygrams.py @@ -8,6 +8,7 @@ import pygrams from scripts import FilePaths +from scripts.text_processing import WordAnalyzer from scripts.utils.pygrams_exception import PygramsException @@ -55,8 +56,9 @@ def preparePyGrams(self, fake_df_data, mock_read_pickle, mock_open, mock_bz2file in range(self.number_of_rows)] if self.publication_date_auto_tested: - fake_df_data['publication_date'] = 
[pd.Timestamp('2000-12-28 00:00:00') - pd.DateOffset(weeks=row) for row - in range(self.number_of_rows)] + fake_df_data['publication_date'] = [ + f"{pd.Timestamp('2000-12-28 00:00:00') - pd.DateOffset(weeks=row):%Y-%m-%d}" for row + in range(self.number_of_rows)] if self.invention_title_auto_tested: fake_df_data['invention_title'] = [f'invention_title-{pid}' for pid in range(self.number_of_rows)] @@ -126,14 +128,14 @@ def isfile_fake(file_name): mock_path_isfile.side_effect = isfile_fake - def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs): + def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs, max_df): self.assertTrue(self.publication_date_auto_tested) self.assertTrue(self.patent_id_auto_tested) mock_makedirs.assert_called_with(self.tfidfOutputFolder(), exist_ok=True) results_checked = False for dump_args in mock_pickle_dump.call_args_list: - if dump_args[0][1] == self.tfidfFileName(self.out_name): + if dump_args[0][1] == self.tfidfFileName(self.out_name, max_df): tfidf_pickle = dump_args[0][0] tfidf_obj = tfidf_pickle[0] @@ -168,8 +170,8 @@ def tfidfOutputFolder(): return os.path.join('outputs', 'tfidf') @staticmethod - def tfidfFileName(data_source_name): - return os.path.join(TestPyGrams.tfidfOutputFolder(), data_source_name + '-tfidf.pkl.bz2') + def tfidfFileName(data_source_name, max_df): + return os.path.join(TestPyGrams.tfidfOutputFolder(), data_source_name + f'-tfidf-mdf-{max_df}.pkl.bz2') @staticmethod def termCountsOutputFolder(): @@ -192,9 +194,9 @@ def test_simple_output_tfidf(self, mock_path_isfile, mock_makedirs, mock_bz2file 'abstract' ] } - + max_df = 1.0 self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile) - args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', '1.0'] + args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', str(max_df)] pygrams.main(args) @@ -202,7 +204,7 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names): self.assertEqual(tfidf_matrix.todense(), np.ones(shape=(1, 1)), 'TFIDF should be 1x1 matrix of 1') self.assertListEqual(feature_names, ['abstract']) - self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs) + self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df) @mock.patch("scripts.pipeline.read_pickle", create=True) @mock.patch("scripts.data_factory.read_pickle", create=True) @@ -227,6 +229,13 @@ def test_simple_output_tfidf_pickle_and_unpickle(self, mock_path_isfile, mock_ou args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', '1.0'] pygrams.main(args) + # reset static object + WordAnalyzer.tokenizer = None + WordAnalyzer.preprocess = None + WordAnalyzer.ngram_range = None + WordAnalyzer.stemmed_stop_word_set_n = None + WordAnalyzer.stemmed_stop_word_set_uni = None + # Fail if original data frame is requested from disc def factory_read_pickle_fake(pickle_file_name): self.fail(f'Should not be reading {pickle_file_name} via a factory if TFIDF was requested from pickle') @@ -271,10 +280,10 @@ def test_simple_two_patents_unigrams_only_output_tfidf(self, mock_path_isfile, m 'abstract two' ] } - + max_df=1.0 self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile) args = ['-ds', self.data_source_name, '--date_header', - 'publication_date', '--max_document_frequency', '1.0', '--max_ngrams', '1'] + 'publication_date', 
'--max_document_frequency', str(max_df), '--max_ngrams', '1'] pygrams.main(args) @@ -301,7 +310,7 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names): self.assertListAlmostEqual(tfidf_as_lists[0], [l2norm_tfidf_abstract, l2norm_tfidf_one, 0], places=4) self.assertListAlmostEqual(tfidf_as_lists[1], [l2norm_tfidf_abstract, 0, l2norm_tfidf_one], places=4) - self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs) + self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df) @mock.patch("scripts.data_factory.read_pickle", create=True) @mock.patch("pickle.dump", create=True) diff --git a/tests/test_terms_graph.py b/tests/test_terms_graph.py index 73b4f65..175135f 100644 --- a/tests/test_terms_graph.py +++ b/tests/test_terms_graph.py @@ -55,25 +55,25 @@ def test_num_nodes(self): self.assertEquals(50, len(self.__nodes)) def test_num_links(self): - self.assertEquals(447, len(self.__links)) + self.assertEquals(454, len(self.__links)) def test_terms_in_nodes(self): texts = [x['text'] for x in self.__nodes] self.assertIn('central portion', texts) self.assertIn('fluid commun', texts) - self.assertIn('provid seed', texts) + self.assertIn('phenyl ring', texts) self.assertIn('gate line', texts) idx_1 = texts.index("central portion") idx_2 = texts.index("fluid commun") - idx_3 = texts.index("provid seed") + idx_3 = texts.index("phenyl ring") idx_4 = texts.index("gate line") - self.assertAlmostEqual(0.05478826302293826, self.__nodes[idx_1]['freq']) - self.assertAlmostEqual(0.022815124444693337, self.__nodes[idx_2]['freq']) - self.assertAlmostEqual(0.01193531394736373, self.__nodes[idx_3]['freq']) - self.assertAlmostEqual(0.07963623423011947, self.__nodes[idx_4]['freq']) + self.assertAlmostEqual(0.024110680522099224, self.__nodes[idx_1]['freq']) + self.assertAlmostEqual(0.004707609539032177, self.__nodes[idx_2]['freq']) + self.assertAlmostEqual(0.09743319564023586, self.__nodes[idx_3]['freq']) + self.assertAlmostEqual(0.07346334072037178, self.__nodes[idx_4]['freq']) def test_terms_in_links(self): diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 140429a..21c4936 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,12 +1,9 @@ import unittest -import pandas as pd from nltk import word_tokenize -from scripts import FilePaths -from scripts.text_processing import StemTokenizer, WordAnalyzer, \ - lowercase_strip_accents_and_ownership - +from scripts.text_processing import StemTokenizer, WordAnalyzer, \ + lowercase_strip_accents_and_ownership, LemmaTokenizer # Sample abstracts taken from the USPTO Bulk Download Service: https://bulkdata.uspto.gov # Data used was downloaded from "Patent Grant Full Text Data" @@ -23,8 +20,7 @@ def test_stematizer(self): self.assertListEqual(expected_words, actual_words) - -class Test_lowercase_strip_accents_and_ownership(unittest.TestCase): +class TestLowercaseStripAccentsAndOwnership(unittest.TestCase): def test_lowercase(self): doc = 'Test ABCdefGH IJ. Again' @@ -92,3 +88,129 @@ def test_WordAnalyser_ngrams_dont_cross_punctuation_or_stop_words(self): 'metal fish bucket'] actual_ngrams = WordAnalyzer.analyzer(doc) self.assertListEqual(expected_ngrams, actual_ngrams) + + def test_WordAnalyser_ngrams(self): + ngram_range = (1, 3) + WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=self.preprocess, ngram_range=ngram_range) + + doc = "Conductive structures in features of an insulator layer on a substrate are fabricated by a particular " \ + "process. 
In this process, a layer of conductive material is applied over the insulator layer so that " \ + "the layer of conductive material covers field regions adjacent the features and fills in the features " \ + "themselves. A grain size differential between the conductive material which covers the field regions " \ + "and the conductive material which fills in the feature is then established by annealing the layer of " \ + "conductive material. Excess conductive material is then removed to uncover the field regions and leave " \ + "the conductive structures. The layer of conductive material is applied so as to define a first layer " \ + "thickness over the field regions and a second layer thickness in and over the features. These " \ + "thicknesses are dimensioned such that d 1 ≦0.5d 2 , with d 1 being the first layer thickness and d 2 " \ + "being the second layer thickness. Preferably, the first and second layer thicknesses are dimensioned " \ + "such that d 1 ≦0.3d 2 . " + expected_ngrams = ['conductive', + 'structure', + 'feature', + 'insulator', + 'layer', + 'substrate', + 'fabricate', + 'particular', + 'process', + 'process', + 'layer', + 'conductive', + 'material', + 'apply', + 'insulator', + 'layer', + 'layer', + 'conductive', + 'material', + 'field', + 'region', + 'feature', + 'fill', + 'feature', + 'themselves', + 'grain', + 'differential', + 'conductive', + 'material', + 'field', + 'region', + 'conductive', + 'material', + 'fill', + 'feature', + 'establish', + 'anneal', + 'layer', + 'conductive', + 'material', + 'conductive', + 'material', + 'remove', + 'uncover', + 'field', + 'region', + 'leave', + 'conductive', + 'structure', + 'layer', + 'conductive', + 'material', + 'apply', + 'define', + 'first', + 'layer', + 'thickness', + 'field', + 'region', + 'second', + 'layer', + 'thickness', + 'feature', + 'thickness', + 'dimension', + '0.5d', + 'first', + 'layer', + 'thickness', + 'second', + 'layer', + 'thickness', + 'preferably', + 'first', + 'second', + 'layer', + 'thickness', + 'dimension', + '0.3d', + 'conductive structure', + 'insulator layer', + 'particular process', + 'conductive material', + 'insulator layer', + 'conductive material', + 'material cover', + 'cover field', + 'field region', + 'region adjacent', + 'feature themselves', + 'conductive material', + 'field region', + 'conductive material', + 'conductive material', + 'conductive material', + 'field region', + 'conductive structure', + 'conductive material', + 'layer thickness', + 'field region', + 'layer thickness', + 'layer thickness', + 'layer thickness', + 'layer thickness', + 'conductive material cover', + 'material cover field', + 'cover field region', + 'field region adjacent'] + actual_ngrams = WordAnalyzer.analyzer(doc) + self.assertListEqual(expected_ngrams, actual_ngrams) diff --git a/tests/test_tfidf_mask.py b/tests/test_tfidf_mask.py index 0c6681b..6b0e97b 100644 --- a/tests/test_tfidf_mask.py +++ b/tests/test_tfidf_mask.py @@ -64,7 +64,7 @@ def init_mask(self, cpc, min_n, uni_factor=0.8): def test_num_non_zeros_no_clean_rows(self): self.init_mask('Y02', 2) - self.assertEqual(2059, len(self.__tfidf_mask.data)) + self.assertEqual(2024, len(self.__tfidf_mask.data)) def test_terms(self): self.init_mask('Y02', 2) @@ -119,14 +119,14 @@ def test_no_negative_weights(self): def test_non_zeros_clean_rows(self): self.init_mask('Y02', 2) tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask) - vectorizer = self.__tfidf_obj.vectorizer + vocabulary = self.__tfidf_obj.vocabulary expected_term1_val = 0.25 
expected_term2_val = 0.2962962962962961 term1 = 'exhaust ga' # 0.25 term2 = 'drive region' # 0.2962962962962961 - idx_term1 = vectorizer.vocabulary_.get(term1) - idx_term2 = vectorizer.vocabulary_.get(term2) + idx_term1 = vocabulary.get(term1) + idx_term2 = vocabulary.get(term2) indexof_idx_term1 = tfidf_mask_nozero_rows.indices.tolist().index(idx_term1) indexof_idx_term2 = tfidf_mask_nozero_rows.indices.tolist().index(idx_term2) diff --git a/tests/test_tfidf_reduce.py b/tests/test_tfidf_reduce.py index c7ee153..6d2c1da 100644 --- a/tests/test_tfidf_reduce.py +++ b/tests/test_tfidf_reduce.py @@ -48,7 +48,8 @@ def setUpClass(cls): def test_terms(self): term_score_tuples = self.__term_score_tuples actual_terms = [x for _, x in term_score_tuples] - expected_terms = ['transmit path', + expected_terms = ['mount surfac', + 'transmit path', 'electron element', 'link document', 'amid deriv', @@ -60,14 +61,13 @@ def test_terms(self): 'contact beam', 'angular veloc', 'shorter tuft', + 'conduct materi', 'endodont instrument', 'mass offset', 'section bend', - 'termin channel', - 'stationari household applianc', - 'fault point', - 'adhes strip', - 'handheld electron devic' + 'compon materi', + 'connect portion', + 'termin channel' ] self.assertListEqual(actual_terms[:20], expected_terms) @@ -75,7 +75,8 @@ def test_terms(self): def test_scores(self): term_score_tuples = self.__term_score_tuples actual_scores = [x for x, _ in term_score_tuples] - expected_scores = [0.8259734063804905, + expected_scores = [0.9449111825230679, + 0.8259734063804905, 0.7754588414852185, 0.7276068751089988, 0.7071067811865476, @@ -83,17 +84,16 @@ def test_scores(self): 0.7071067811865475, 0.6666666666666666, 0.6396021490668312, - 0.6172133998483675, + 0.6246950475544241, 0.6031800939323297, 0.6000595413031171, 0.5834599659915781, + 0.5806718350868961, 0.5773502691896257, 0.5773502691896257, 0.5773502691896257, - 0.5597177778726654, - 0.5570860145311556, - 0.5568900989230109, - 0.547722557505166, - 0.5265695940793358] + 0.5669467095138407, + 0.5611088299627696, + 0.5597177778726654] support.assert_list_almost_equal(self, actual_scores[:20], expected_scores)
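
The arima.py change above replaces pyramid-arima's auto_arima with a brute-force (p, d, q) grid search scored by walk-forward mean squared error. Below is a minimal standalone sketch of that pattern, not the committed code: it is written against the current statsmodels API (statsmodels.tsa.arima.model.ARIMA) because the statsmodels.tsa.arima_model module used in the diff has since been removed, and the helper names (evaluate_order, grid_search) are illustrative only.

# Sketch of a walk-forward ARIMA order grid search, assuming numpy,
# scikit-learn and a recent statsmodels are installed.
import warnings
import numpy as np
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA

def evaluate_order(series, order, train_ratio=0.8):
    """Refit on a growing history and forecast one step at a time."""
    split = int(len(series) * train_ratio)
    history, test = list(series[:split]), series[split:]
    predictions = []
    for observed in test:
        fit = ARIMA(history, order=order).fit()
        predictions.append(fit.forecast(steps=1)[0])
        history.append(observed)  # keep ground truth in the history, as in __evaluate_models
    return mean_squared_error(test, predictions)

def grid_search(series, p_values=(0, 1, 2, 4, 6), d_values=range(3), q_values=range(3)):
    best_order, best_mse = None, float('inf')
    warnings.filterwarnings('ignore')  # many candidate orders emit convergence warnings
    for p in p_values:
        for d in d_values:
            for q in q_values:
                try:
                    mse = evaluate_order(series, (p, d, q))
                except Exception:
                    continue  # orders that fail to fit are simply skipped
                if mse < best_mse:
                    best_order, best_mse = (p, d, q), mse
    return best_order, best_mse

if __name__ == '__main__':
    data = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
    print(grid_search(data))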
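The pipeline.py change makes checkdf parse string date columns with pandas.to_datetime and report the document date range. A small sketch of that logic in isolation, using an invented three-row frame:

import pandas as pd
from pandas.api.types import is_string_dtype

df = pd.DataFrame({'publication_date': ['2000-01-07', '2000-06-30', '2000-12-28']})
if is_string_dtype(df['publication_date']):
    # convert string dates to Timestamps before computing the range
    df['publication_date'] = pd.to_datetime(df['publication_date'])
print(f"Document dates range from {df['publication_date'].min():%Y-%m-%d} "
      f"to {df['publication_date'].max():%Y-%m-%d}")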
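The new utils.stop and utils.stop_tup helpers centralise the stop-word filtering that WordAnalyzer.analyzer and the pipeline now share: unigrams are dropped if they appear in the unigram stop list or are digits, and longer n-grams are dropped if any of their words appear in the n-gram stop list (or are digits). An illustrative call follows; the stop-word sets are invented for the example and are not the contents of config/stopwords_*.txt, and it assumes pyGrams is importable from the repository root.

from scripts.utils.utils import stop, stop_tup

unigram_stops = {'etc', 'cover', 'adjacent'}
ngram_stop_words = {'second', 'plurality'}

tokens = ['cover', 'fuel cell', 'second layer', 'gate line', '2019']
print(stop(tokens, unigram_stops, ngram_stop_words))
# ['fuel cell', 'gate line']  (stop-word unigram, digit and stop-word-bearing bigram removed)

scored = [(0.9, 'fuel cell'), (0.4, 'second layer'), (0.2, 'etc')]
print(stop_tup(scored, unigram_stops, ngram_stop_words))
# [(0.9, 'fuel cell')]  (tuples are (score, term), as produced by extract_ngrams_from_docset)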
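Finally, tfidf_wrapper now exposes the fitted vectorizer's vocabulary_ dict instead of the vectorizer itself, which is all TfidfMask.__unbias_ngrams needs: a mapping from term to column index in the TF-IDF matrix. A short scikit-learn sketch (with invented documents) showing what that dict provides:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['conductive material covers field regions',
        'conductive material fills the features']
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
matrix = vectorizer.fit_transform(docs)

vocabulary = vectorizer.vocabulary_      # term -> column index
idx = vocabulary.get('conductive material')
print(idx, matrix[:, idx].toarray().ravel())  # column of TF-IDF weights for that bigram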