Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add starter Data Science Project #133

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
14d2de9
update .gitignore
jeanielim Jul 25, 2018
7fb6426
update .gitignore
jeanielim Jul 25, 2018
9b945b3
Add sample project using titanic dataset
jeanielim Jul 25, 2018
8e624bc
Add sample project using titanic dataset
jeanielim Jul 25, 2018
de9cb37
Add sample project
jeanielim Jul 25, 2018
d51e00a
remove files
jeanielim Jul 25, 2018
e6c5dc2
update reqs for starter proj or remove proj
dmitrypolo Jul 25, 2018
d06e2bc
Merge pull request #1 from jeanielim/project_setup
jeanielim Jul 25, 2018
77ad646
moving code into appropriate spots
dmitrypolo Jul 25, 2018
66a8eec
pickle then dump, also removed old code
dmitrypolo Jul 26, 2018
19fe2f4
train pickle, unpickle predict
dmitrypolo Jul 26, 2018
7710f61
adding some tests and update gitignore
dmitrypolo Jul 26, 2018
5e2c0b6
change hooks
dmitrypolo Jul 26, 2018
4248e9e
save testing data when pickling
dmitrypolo Jul 26, 2018
764dfa8
adding tests for checking the training and test set
jkarlenmm Jul 26, 2018
aabf927
testing the training data
dmitrypolo Jul 27, 2018
bf32822
rename path
dmitrypolo Jul 27, 2018
e94c12c
remove files if not needed with hook
dmitrypolo Jul 27, 2018
ddb7e1e
hook to remove starter kit
dmitrypolo Jul 27, 2018
bc55e9e
not overwrite variables
dmitrypolo Jul 27, 2018
38c7cfb
printing auc
jkarlenmm Jul 27, 2018
65411e7
added everything to make 'make all' run the modeling pipeline
jkarlenmm Jul 27, 2018
840e4fc
adding make model_pipeline to Makefile
jkarlenmm Jul 27, 2018
6b2b2b0
fix test and loop
dmitrypolo Jul 27, 2018
301ce78
Merge branch 'master' of github.com:jeanielim/cookiecutter-data-science
dmitrypolo Jul 27, 2018
756ed68
clean up unused imports and use PEP
dmitrypolo Jul 27, 2018
90041ea
removed superfluous comment
jkarlenmm Jul 27, 2018
bca86b4
merge conflicts
jkarlenmm Jul 27, 2018
c63493c
merge upstream master and resolve conflicts
dmitrypolo Jul 28, 2018
a0b1896
PR changes
dmitrypolo Aug 7, 2018
d554e56
use lower case
dmitrypolo Aug 7, 2018
25f193e
forgot reqs
dmitrypolo Aug 7, 2018
fae2374
forgot to remove os dependency
dmitrypolo Aug 7, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@ docs/site/

# OSX Junk
.DS_Store
.idea

# test cache
.cache/*
tests/__pycache__/*
*.pytest_cache/
*.pytest_cache/

# dataset
*.csv

# environment
venv/
3 changes: 2 additions & 1 deletion cookiecutter.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@
"open_source_license": ["MIT", "BSD-3-Clause", "No license file"],
"s3_bucket": "[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')",
"aws_profile": "default",
"python_interpreter": ["python3", "python"]
"python_interpreter": ["python3", "python"],
"include_starter_proj": ["n", "y"]
}
14 changes: 14 additions & 0 deletions hooks/post_gen_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os
import shutil

# Directories whose generated starter-code .py files should be emptied
# when the user opts out of the starter project.
DIRS = ['src/models']
CWD = os.getcwd()

if '{{ cookiecutter.include_starter_proj|lower }}' == 'n':
    # Drop the starter project's test suite entirely.
    shutil.rmtree(os.path.join(CWD, 'tests'))
    for directory in DIRS:
        files_path = os.path.join(CWD, directory)
        for fin in os.listdir(files_path):
            if fin.endswith('.py'):
                # Truncate the starter implementation but keep the file so
                # the package layout stays intact. files_path is already
                # absolute, so joining CWD onto it again (as the original
                # did) was redundant.
                open(os.path.join(files_path, fin), 'w').close()
4 changes: 4 additions & 0 deletions {{ cookiecutter.repo_name }}/.gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.pytest_cache/
*.py[cod]

# C extensions
Expand Down Expand Up @@ -78,5 +79,8 @@ target/
# exclude data from source control by default
/data/

# exclude pickled models since they can be un-safe
/models/

# Mac OS-specific storage files
.DS_Store
13 changes: 13 additions & 0 deletions {{ cookiecutter.repo_name }}/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,21 @@ requirements: test_environment

## Make Dataset
data: requirements
wget --progress=bar:force -O data/raw/transfusion_data_raw.csv https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data
$(PYTHON_INTERPRETER) src/data/make_dataset.py

{% if cookiecutter.include_starter_proj|lower == 'y' %}

## Train Model
train: data
$(PYTHON_INTERPRETER) src/models/train_model.py

## Run the model pipeline
model_pipeline: train
$(PYTHON_INTERPRETER) src/models/predict_model.py

{% endif %}

## Delete all compiled Python files
clean:
find . -type f -name "*.py[co]" -delete
Expand Down
8 changes: 8 additions & 0 deletions {{ cookiecutter.repo_name }}/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,12 @@ python-dotenv>=0.5.1

# backwards compatibility
pathlib2

{% endif %}

{% if cookiecutter.include_starter_proj|lower == 'y' %}

# requirements needed for starter Data Science project
scikit-learn[alldeps]

{% endif %}
2 changes: 1 addition & 1 deletion {{ cookiecutter.repo_name }}/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
version='0.1.0',
description='{{ cookiecutter.description }}',
author='{{ cookiecutter.author_name }}',
license='{% if cookiecutter.open_source_license == 'MIT' %}MIT{% elif cookiecutter.open_source_license == 'BSD-3-Clause' %}BSD-3{% endif %}',
license='{% if cookiecutter.open_source_license == "MIT" %}MIT{% elif cookiecutter.open_source_license == "BSD-3-Clause" %}BSD-3{% endif %}',
)
42 changes: 26 additions & 16 deletions {{ cookiecutter.repo_name }}/src/data/make_dataset.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,40 @@
# -*- coding: utf-8 -*-
import click
import logging
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

ROOT = Path(__file__).resolve().parents[2]

@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
""" Runs data processing scripts to turn raw data from (../raw) into

def massage_data(raw_data):
    """Preprocess the raw transfusion data for modeling.

    Renames the verbose donation column to ``label`` (in place) and adds
    year- and quarter-granularity versions of the two month-count columns.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Raw UCI transfusion data; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the four derived feature columns appended.
    """
    raw_data.rename(
        index=str,
        columns={"whether he/she donated blood in March 2007": "label"},
        inplace=True,
    )

    # (new column, source column, months per period); creation order matters
    # because downstream tests index the columns positionally.
    derived = [
        ('time_years', 'Time (months)', 12),
        ('recency_years', 'Recency (months)', 12),
        ('time_quarters', 'Time (months)', 3),
        ('recency_quarters', 'Recency (months)', 3),
    ]
    for new_col, source_col, period in derived:
        raw_data[new_col] = (raw_data[source_col] / period).astype('int')

    return raw_data


def main():
    """Fetch the raw transfusion CSV, preprocess it, and write the cleaned
    dataset to ``data/processed`` ready for analysis.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    raw_frame = pd.read_csv(ROOT / 'data/raw/transfusion_data_raw.csv')
    cleaned = massage_data(raw_frame)
    cleaned.to_csv(ROOT / 'data/processed/transfusion_data.csv', index=False)


if __name__ == '__main__':
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)

# not used in this stub but often useful for finding various files
project_dir = Path(__file__).resolve().parents[2]

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables

load_dotenv(find_dotenv())

main()
main()
38 changes: 38 additions & 0 deletions {{ cookiecutter.repo_name }}/src/models/predict_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pickle
import logging
import pandas as pd
from pathlib import Path
from sklearn.metrics import roc_auc_score

ROOT = Path(__file__).resolve().parents[2]


def retrieve_model(model_path=None):
    """Load and return the pickled model object.

    Parameters
    ----------
    model_path : str or pathlib.Path, optional
        Location of the serialized model. Defaults to the project's
        ``models/transfusion.model`` file, preserving the original
        behavior for existing callers.

    Returns
    -------
    object
        The unpickled model.
    """
    if model_path is None:
        model_path = ROOT / 'models/transfusion.model'
    # NOTE: unpickling is only safe for files this project produced itself.
    with open(model_path, 'rb') as fin:
        return pickle.load(fin)


def main():
    """Load the persisted model, predict on the held-out test set, and
    score the predictions.

    Returns
    -------
    tuple
        ``(predictions, auc)`` — the predicted labels for the test set and
        the ROC AUC computed from the positive-class probabilities.
    """
    model = retrieve_model()

    features = pd.read_csv(ROOT / 'data/processed/transfusion_x_test.csv')
    labels = pd.read_csv(ROOT / 'data/processed/transfusion_y_test.csv',
                         header=None)

    predictions = model.predict(features)
    positive_scores = model.predict_proba(features)[:, 1]
    auc = roc_auc_score(labels.astype(int), positive_scores)

    return predictions, auc


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    logger = logging.getLogger(__file__)

    preds, auc = main()
    # Use the logger created above (the original built it but then called
    # logging.info directly, bypassing it) and lazy %-formatting so the
    # message is only rendered when the level is enabled.
    logger.info('The predictions are %s', preds)
    logger.info('The AUC is %s', auc)
58 changes: 58 additions & 0 deletions {{ cookiecutter.repo_name }}/src/models/train_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pickle
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Project root: two directory levels above src/models/, where the data/
# and models/ trees live.
ROOT = Path(__file__).resolve().parents[2]

# Pickle protocol 2 is the highest version Python 2 can read, so fall back
# to it when the generated project targets the legacy interpreter.
if '{{ cookiecutter.python_interpreter }}' == 'python3':
    PROTOCOL = pickle.DEFAULT_PROTOCOL
else:
    PROTOCOL = 2


def fetch_processed(data_path):
    """Read the processed dataset and split it into train/test partitions.

    Parameters
    ----------
    data_path : str
        Path of the processed CSV, relative to the project root.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a fixed 80/20 split.
    """
    frame = pd.read_csv(ROOT / data_path)
    labels = frame.label
    features = frame.drop(['label'], axis=1)

    # Deterministic split (random_state=0) so repeated runs are comparable.
    return train_test_split(features, labels,
                            test_size=0.2, random_state=0)


def fit_model(X_train, y_train):
    """Train a 100-tree random forest on the training partition.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training labels.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    classifier = RandomForestClassifier(n_estimators=100)
    classifier.fit(X_train, y_train)
    return classifier


def main():
    """Train the model on the processed data and persist both the fitted
    model and the held-out test partition for the prediction step.
    """
    x_train, x_test, y_train, y_test = fetch_processed(
        'data/processed/transfusion_data.csv')

    # Train the model
    model = fit_model(x_train, y_train)

    # Persist the fitted model for predict_model.py to load.
    with open(ROOT / 'models/transfusion.model', 'wb') as fout:
        pickle.dump(model, fout, PROTOCOL)

    # Save the test partition so the predictions can be scored later.
    x_test.to_csv(ROOT / 'data/processed/transfusion_x_test.csv',
                  index=False)
    # header=False is pinned explicitly: predict_model.py reads this file
    # with header=None, and the Series.to_csv header default changed across
    # pandas versions — leaving it implicit could prepend a 'label' header
    # row that would then be parsed as data.
    y_test.to_csv(ROOT / 'data/processed/transfusion_y_test.csv',
                  index=False, header=False)


if __name__ == '__main__':
    main()
Empty file.
17 changes: 17 additions & 0 deletions {{ cookiecutter.repo_name }}/tests/test_make_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pandas as pd
from src.data.make_dataset import massage_data


# Minimal raw-data fixture mirroring the UCI transfusion CSV columns that
# massage_data reads: the donation label plus the two month-count fields.
mock_data = {
    'whether he/she donated blood in March 2007': [1, 0, 0, 1],
    'Time (months)': [36, 10, 12, 16],
    'Recency (months)': [10, 20, 15, 22]
}


def test_massage_data():
    """Processed frame keeps raw values and derives the quarter features."""
    frame = pd.DataFrame(mock_data)
    processed = massage_data(frame)

    # Column 2 is the untouched 'Recency (months)' value of the first row.
    assert processed.iloc[0, 2] == 10
    # Column 6 is 'recency_quarters': 22 months // 3 == 7.
    assert processed.iloc[3, 6] == 7

37 changes: 37 additions & 0 deletions {{ cookiecutter.repo_name }}/tests/test_train_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd
from unittest.mock import patch, Mock
from src.models.train_model import fetch_processed, fit_model


# Fixture standing in for the processed CSV: a binary 'label' column (the
# only one fetch_processed treats specially) plus arbitrary feature columns.
mock_data = {
    'label': [1, 0, 0, 1],
    'fizz': ['John', 'Bob', 'Sam', 'Kevin'],
    'buzz': ['foo', 'bar', 'buzz', 'fizz'],
    'foo': ['y', 'n', 'm', 'y'],
    'bar': ['a', 'b', 'c', 'd'],
    'fish': ['nyc', 'la', 'boston', 'amherst']
}


def test_fetch_processed(monkeypatch):
    """fetch_processed splits the (mocked) CSV into non-empty partitions."""
    # Stub out the file read so no real CSV is needed.
    monkeypatch.setattr(pd, 'read_csv', lambda fin: pd.DataFrame(mock_data))

    x_train, x_test, y_train, y_test = fetch_processed('foo')

    # Labels are binary, hence never negative.
    assert all(y_train >= 0)
    assert all(y_test >= 0)
    # Both partitions received at least one row.
    assert x_train.shape[0] > 0
    assert x_test.shape[0] > 0


@patch('src.models.train_model.RandomForestClassifier')
def test_fit_model(mock_forest):
    """fit_model returns the classifier it constructed and fitted.

    The RandomForestClassifier constructor is patched so no real training
    happens; the mock's fit() is configured to return a sentinel value.
    The unused ``monkeypatch`` fixture parameter of the original was
    removed — nothing in the test body used it.
    """
    mock_model = Mock()
    mock_model.configure_mock(**{'fit.return_value': 'foo'})
    mock_forest.return_value = mock_model

    model = fit_model('foo', 'bar')
    # fit_model must hand back the object whose fit() we configured.
    assert model.fit() == 'foo'