Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add starter Data Science Project #133

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
14d2de9
update .gitignore
jeanielim Jul 25, 2018
7fb6426
update .gitignore
jeanielim Jul 25, 2018
9b945b3
Add sample project using titanic dataset
jeanielim Jul 25, 2018
8e624bc
Add sample project using titanic dataset
jeanielim Jul 25, 2018
de9cb37
Add sample project
jeanielim Jul 25, 2018
d51e00a
remove files
jeanielim Jul 25, 2018
e6c5dc2
update reqs for starter proj or remove proj
dmitrypolo Jul 25, 2018
d06e2bc
Merge pull request #1 from jeanielim/project_setup
jeanielim Jul 25, 2018
77ad646
moving code into appropriate spots
dmitrypolo Jul 25, 2018
66a8eec
pickle then dump, also removed old code
dmitrypolo Jul 26, 2018
19fe2f4
train pickle, unpickle predict
dmitrypolo Jul 26, 2018
7710f61
adding some tests and update gitignore
dmitrypolo Jul 26, 2018
5e2c0b6
change hooks
dmitrypolo Jul 26, 2018
4248e9e
save testing data when pickling
dmitrypolo Jul 26, 2018
764dfa8
adding tests for checking the training and test set
jkarlenmm Jul 26, 2018
aabf927
testing the training data
dmitrypolo Jul 27, 2018
bf32822
rename path
dmitrypolo Jul 27, 2018
e94c12c
remove files if not needed with hook
dmitrypolo Jul 27, 2018
ddb7e1e
hook to remove starter kit
dmitrypolo Jul 27, 2018
bc55e9e
not overwrite variables
dmitrypolo Jul 27, 2018
38c7cfb
printing auc
jkarlenmm Jul 27, 2018
65411e7
added everything to make 'make all' run the modeling pipeline
jkarlenmm Jul 27, 2018
840e4fc
adding make model_pipeline to Makefile
jkarlenmm Jul 27, 2018
6b2b2b0
fix test and loop
dmitrypolo Jul 27, 2018
301ce78
Merge branch 'master' of github.com:jeanielim/cookiecutter-data-science
dmitrypolo Jul 27, 2018
756ed68
clean up unused imports and use PEP
dmitrypolo Jul 27, 2018
90041ea
removed superfluous comment
jkarlenmm Jul 27, 2018
bca86b4
merge conflicts
jkarlenmm Jul 27, 2018
c63493c
merge upstream master and resolve conflicts
dmitrypolo Jul 28, 2018
a0b1896
PR changes
dmitrypolo Aug 7, 2018
d554e56
use lower case
dmitrypolo Aug 7, 2018
25f193e
forgot reqs
dmitrypolo Aug 7, 2018
fae2374
forgot to remove os dependency
dmitrypolo Aug 7, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@ docs/site/

# OSX Junk
.DS_Store
.idea

# test cache
.cache/*
tests/__pycache__/*
*.pytest_cache/
*.pytest_cache/

# dataset
*.csv

# environment
venv/
3 changes: 2 additions & 1 deletion cookiecutter.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@
"open_source_license": ["MIT", "BSD-3-Clause", "No license file"],
"s3_bucket": "[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')",
"aws_profile": "default",
"python_interpreter": ["python3", "python"]
"python_interpreter": ["python3", "python"],
"include_starter_proj": ["n", "y"]
}
14 changes: 14 additions & 0 deletions hooks/post_gen_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os
import shutil

# Directories whose generated starter-code .py files should be emptied
# when the user opts out of the starter project.
DIRS = ['src/models']
CWD = os.getcwd()

if '{{ cookiecutter.include_starter_proj|lower }}' == 'n':
    # Drop the starter project's test suite entirely.
    shutil.rmtree(os.path.join(CWD, 'tests'))
    for directory in DIRS:
        files_path = os.path.join(CWD, directory)
        for fin in os.listdir(files_path):
            if fin.endswith('.py'):
                # Truncate the starter implementation but keep the file so
                # the package layout stays intact. files_path is already
                # absolute, so joining CWD onto it again (as the original
                # did) was redundant.
                open(os.path.join(files_path, fin), 'w').close()
4 changes: 4 additions & 0 deletions {{ cookiecutter.repo_name }}/.gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.pytest_cache/
*.py[cod]

# C extensions
Expand Down Expand Up @@ -78,5 +79,8 @@ target/
# exclude data from source control by default
/data/

# exclude pickled models since they can be un-safe
/models/

# Mac OS-specific storage files
.DS_Store
13 changes: 13 additions & 0 deletions {{ cookiecutter.repo_name }}/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,21 @@ requirements: test_environment

## Make Dataset
data: requirements
wget --progress=bar:force -O data/raw/transfusion_data_raw.csv https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data
$(PYTHON_INTERPRETER) src/data/make_dataset.py

{% if cookiecutter.include_starter_proj|lower == 'y' %}

## Train Model
train: data
$(PYTHON_INTERPRETER) src/models/train_model.py

## Run the model pipeline
model_pipeline: train
$(PYTHON_INTERPRETER) src/models/predict_model.py

{% endif %}

## Delete all compiled Python files
clean:
find . -type f -name "*.py[co]" -delete
Expand Down
8 changes: 8 additions & 0 deletions {{ cookiecutter.repo_name }}/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,12 @@ python-dotenv>=0.5.1

# backwards compatibility
pathlib2

{% endif %}

{% if cookiecutter.include_starter_proj|lower == 'y' %}

# requirements needed for starter Data Science project
scikit-learn[alldeps]

{% endif %}
2 changes: 1 addition & 1 deletion {{ cookiecutter.repo_name }}/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
version='0.1.0',
description='{{ cookiecutter.description }}',
author='{{ cookiecutter.author_name }}',
license='{% if cookiecutter.open_source_license == 'MIT' %}MIT{% elif cookiecutter.open_source_license == 'BSD-3-Clause' %}BSD-3{% endif %}',
license='{% if cookiecutter.open_source_license == "MIT" %}MIT{% elif cookiecutter.open_source_license == "BSD-3-Clause" %}BSD-3{% endif %}',
)
42 changes: 26 additions & 16 deletions {{ cookiecutter.repo_name }}/src/data/make_dataset.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,40 @@
# -*- coding: utf-8 -*-
import click
import logging
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

ROOT = Path(__file__).resolve().parents[2]

@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
""" Runs data processing scripts to turn raw data from (../raw) into

def massage_data(raw_data):
    """Preprocess the raw transfusion data for modeling.

    Renames the verbose donation column to ``label`` (in place) and adds
    year- and quarter-granularity versions of the two month-count columns.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Raw UCI transfusion data; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the four derived feature columns appended.
    """
    raw_data.rename(
        index=str,
        columns={"whether he/she donated blood in March 2007": "label"},
        inplace=True,
    )

    # (new column, source column, months per period); creation order matters
    # because downstream tests index the columns positionally.
    derived = [
        ('time_years', 'Time (months)', 12),
        ('recency_years', 'Recency (months)', 12),
        ('time_quarters', 'Time (months)', 3),
        ('recency_quarters', 'Recency (months)', 3),
    ]
    for new_col, source_col, period in derived:
        raw_data[new_col] = (raw_data[source_col] / period).astype('int')

    return raw_data


def main():
    """Fetch the raw transfusion CSV, preprocess it, and write the cleaned
    dataset to ``data/processed`` ready for analysis.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    raw_frame = pd.read_csv(ROOT / 'data/raw/transfusion_data_raw.csv')
    cleaned = massage_data(raw_frame)
    cleaned.to_csv(ROOT / 'data/processed/transfusion_data.csv', index=False)


if __name__ == '__main__':
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)

# not used in this stub but often useful for finding various files
project_dir = Path(__file__).resolve().parents[2]

# find .env automagically by walking up directories until it's found, then
# load up the .env entries as environment variables

load_dotenv(find_dotenv())

main()
main()
38 changes: 38 additions & 0 deletions {{ cookiecutter.repo_name }}/src/models/predict_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pickle
import logging
import pandas as pd
from pathlib import Path
from sklearn.metrics import roc_auc_score

ROOT = Path(__file__).resolve().parents[2]


def retrieve_model(model_path=None):
    """Load and return the pickled model object.

    Parameters
    ----------
    model_path : str or pathlib.Path, optional
        Location of the serialized model. Defaults to the project's
        ``models/transfusion.model`` file, preserving the original
        behavior for existing callers.

    Returns
    -------
    object
        The unpickled model.
    """
    if model_path is None:
        model_path = ROOT / 'models/transfusion.model'
    # NOTE: unpickling is only safe for files this project produced itself.
    with open(model_path, 'rb') as fin:
        return pickle.load(fin)


def main():
    """Load the persisted model, predict on the held-out test set, and
    score the predictions.

    Returns
    -------
    tuple
        ``(predictions, auc)`` — the predicted labels for the test set and
        the ROC AUC computed from the positive-class probabilities.
    """
    model = retrieve_model()

    features = pd.read_csv(ROOT / 'data/processed/transfusion_x_test.csv')
    labels = pd.read_csv(ROOT / 'data/processed/transfusion_y_test.csv',
                         header=None)

    predictions = model.predict(features)
    positive_scores = model.predict_proba(features)[:, 1]
    auc = roc_auc_score(labels.astype(int), positive_scores)

    return predictions, auc


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    logger = logging.getLogger(__file__)

    preds, auc = main()
    # Use the logger created above (the original built it but then called
    # logging.info directly, bypassing it) and lazy %-formatting so the
    # message is only rendered when the level is enabled.
    logger.info('The predictions are %s', preds)
    logger.info('The AUC is %s', auc)
58 changes: 58 additions & 0 deletions {{ cookiecutter.repo_name }}/src/models/train_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pickle
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Project root: two directory levels above src/models/, where the data/
# and models/ trees live.
ROOT = Path(__file__).resolve().parents[2]

# Pickle protocol 2 is the highest version Python 2 can read, so fall back
# to it when the generated project targets the legacy interpreter.
if '{{ cookiecutter.python_interpreter }}' == 'python3':
    PROTOCOL = pickle.DEFAULT_PROTOCOL
else:
    PROTOCOL = 2


def fetch_processed(data_path):
    """Read the processed dataset and split it into train/test partitions.

    Parameters
    ----------
    data_path : str
        Path of the processed CSV, relative to the project root.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` from a fixed 80/20 split.
    """
    frame = pd.read_csv(ROOT / data_path)
    labels = frame.label
    features = frame.drop(['label'], axis=1)

    # Deterministic split (random_state=0) so repeated runs are comparable.
    return train_test_split(features, labels,
                            test_size=0.2, random_state=0)


def fit_model(X_train, y_train):
    """Train a 100-tree random forest on the training partition.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training labels.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    classifier = RandomForestClassifier(n_estimators=100)
    classifier.fit(X_train, y_train)
    return classifier


def main():
    """Train the model on the processed data and persist both the fitted
    model and the held-out test partition for the prediction step.
    """
    x_train, x_test, y_train, y_test = fetch_processed(
        'data/processed/transfusion_data.csv')

    # Train the model
    model = fit_model(x_train, y_train)

    # Persist the fitted model for predict_model.py to load.
    with open(ROOT / 'models/transfusion.model', 'wb') as fout:
        pickle.dump(model, fout, PROTOCOL)

    # Save the test partition so the predictions can be scored later.
    x_test.to_csv(ROOT / 'data/processed/transfusion_x_test.csv',
                  index=False)
    # header=False is pinned explicitly: predict_model.py reads this file
    # with header=None, and the Series.to_csv header default changed across
    # pandas versions — leaving it implicit could prepend a 'label' header
    # row that would then be parsed as data.
    y_test.to_csv(ROOT / 'data/processed/transfusion_y_test.csv',
                  index=False, header=False)


if __name__ == '__main__':
    main()
Empty file.
17 changes: 17 additions & 0 deletions {{ cookiecutter.repo_name }}/tests/test_make_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import pandas as pd
from src.data.make_dataset import massage_data


# Minimal raw-data fixture mirroring the UCI transfusion CSV columns that
# massage_data reads: the donation label plus the two month-count fields.
mock_data = {
    'whether he/she donated blood in March 2007': [1, 0, 0, 1],
    'Time (months)': [36, 10, 12, 16],
    'Recency (months)': [10, 20, 15, 22]
}


def test_massage_data():
    """Processed frame keeps raw values and derives the quarter features."""
    frame = pd.DataFrame(mock_data)
    processed = massage_data(frame)

    # Column 2 is the untouched 'Recency (months)' value of the first row.
    assert processed.iloc[0, 2] == 10
    # Column 6 is 'recency_quarters': 22 months // 3 == 7.
    assert processed.iloc[3, 6] == 7

37 changes: 37 additions & 0 deletions {{ cookiecutter.repo_name }}/tests/test_train_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd
from unittest.mock import patch, Mock
from src.models.train_model import fetch_processed, fit_model


# Fixture standing in for the processed CSV: a binary 'label' column (the
# only one fetch_processed treats specially) plus arbitrary feature columns.
mock_data = {
    'label': [1, 0, 0, 1],
    'fizz': ['John', 'Bob', 'Sam', 'Kevin'],
    'buzz': ['foo', 'bar', 'buzz', 'fizz'],
    'foo': ['y', 'n', 'm', 'y'],
    'bar': ['a', 'b', 'c', 'd'],
    'fish': ['nyc', 'la', 'boston', 'amherst']
}


def test_fetch_processed(monkeypatch):
    """fetch_processed splits the (mocked) CSV into non-empty partitions."""
    # Stub out the file read so no real CSV is needed.
    monkeypatch.setattr(pd, 'read_csv', lambda fin: pd.DataFrame(mock_data))

    x_train, x_test, y_train, y_test = fetch_processed('foo')

    # Labels are binary, hence never negative.
    assert all(y_train >= 0)
    assert all(y_test >= 0)
    # Both partitions received at least one row.
    assert x_train.shape[0] > 0
    assert x_test.shape[0] > 0


@patch('src.models.train_model.RandomForestClassifier')
def test_fit_model(mock_forest):
    """fit_model returns the classifier it constructed and fitted.

    The RandomForestClassifier constructor is patched so no real training
    happens; the mock's fit() is configured to return a sentinel value.
    The unused ``monkeypatch`` fixture parameter of the original was
    removed — nothing in the test body used it.
    """
    mock_model = Mock()
    mock_model.configure_mock(**{'fit.return_value': 'foo'})
    mock_forest.return_value = mock_model

    model = fit_model('foo', 'bar')
    # fit_model must hand back the object whose fit() we configured.
    assert model.fit() == 'foo'