Skip to content

Commit

Permalink
Fill out a few function docstrings (#196)
Browse files Browse the repository at this point in the history
* Fill out function docstrings

* Add CONTRIBUTING.md

* Expand hyperparameter explanation in docstrings

* Update CONTRIBUTING.md
  • Loading branch information
acrellin authored Oct 13, 2016
1 parent d39f4ae commit 3e36983
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 9 deletions.
11 changes: 11 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Contributing



## Contributing Time-Series Features

We gratefully accept contributions of new time-series features, be they
domain-specific or general. Please follow the guidelines below so that your
features can be successfully incorporated into the Cesium feature base.

*Coming soon*
58 changes: 54 additions & 4 deletions cesium/build_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,19 @@


def rectangularize_featureset(featureset):
"""Convert xarray.Dataset into (2d) Pandas.DataFrame for use with sklearn."""
"""Convert xarray.Dataset into (2d) Pandas.DataFrame for use with sklearn.
Params
------
featureset : xarray.Dataset
The xarray.Dataset object containing features.
Returns
-------
Pandas.DataFrame
2-D, sklearn-compatible Dataframe containing features.
"""
featureset = featureset.drop([coord for coord in featureset.coords
if coord not in ['name', 'channel']])
feature_df = featureset.to_dataframe()
Expand Down Expand Up @@ -68,15 +80,53 @@ def fit_model_optimize_hyperparams(data, targets, model, params_to_optimize,


def build_model_from_featureset(featureset, model=None, model_type=None,
model_options={}, params_to_optimize=None,
model_parameters={}, params_to_optimize=None,
cv=None):
"""Build model from (non-rectangular) xarray.Dataset of features."""
"""Build model from (non-rectangular) xarray.Dataset of features.
Parameters
----------
featureset : xarray.Dataset of features
Features for training model.
model : scikit-learn model, optional
Instantiated scikit-learn model. If None, `model_type` must not be.
Defaults to None.
model_type : str, optional
String indicating model to be used, e.g. "RandomForestClassifier".
If None, `model` must not be. Defaults to None.
model_parameters : dict, optional
Dictionary with hyperparameter values to be used in model building.
Keys are parameter names, values are the associated parameter values.
These hyperparameters will be passed to the model constructor as-is
(for hyperparameter optimization, see `params_to_optimize`).
If None, default values will be used (see scikit-learn documentation
for specifics).
params_to_optimize : dict or list of dict, optional
During hyperparameter optimization, various model parameters
are adjusted to give an optimal fit. This dictionary gives the
different values that should be explored for each parameter. E.g.,
`{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
6 combinations of alpha and beta and compare the resulting models'
goodness-of-fit. If None, only those hyperparameters specified in
`model_parameters` will be used (passed to model constructor as-is).
Defaults to None.
cv : int, cross-validation generator or an iterable, optional
Number of folds (defaults to 3 if None) or an iterable yielding
train/test splits. See documentation for `GridSearchCV` for details.
Defaults to None (yielding 3 folds).
Returns
-------
sklearn estimator object
The fitted sklearn model.
"""
if featureset.get('target') is None:
raise ValueError("Cannot build model for unlabeled feature set.")

if model is None:
if model_type:
model = MODELS_TYPE_DICT[model_type](**model_options)
model = MODELS_TYPE_DICT[model_type](**model_parameters)
else:
raise ValueError("If model is None, model_type must be specified")
feature_df = rectangularize_featureset(featureset)
Expand Down
56 changes: 55 additions & 1 deletion cesium/datasets/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,22 @@ def _md5sum_file(path):


def download_file(data_dir, base_url, filename):
"""Download a single file into the given directory."""
"""Download a single file into the given directory.
Parameters
----------
data_dir : str
Path to directory in which to save file.
base_url : str
URL of file to download, minus the file name.
filename : str
Name of file to be downloaded.
Returns
-------
str
The path to the newly downloaded file.
"""
if not os.path.exists(data_dir):
os.makedirs(data_dir)

Expand All @@ -43,6 +58,26 @@ def download_and_extract_archives(data_dir, base_url, filenames, md5sums=None,
remove_archive=True):
"""Download list of data archives, verify md5 checksums (if applicable),
and extract into the given directory.
Parameters
----------
data_dir : str
Path to directory in which to download and extract archives.
base_url : str
URL of files to download, minus the file names.
filenames : list or tuple of str
Name of file to be downloaded.
md5sums : dict, optional
Dictionary whose keys are file names and values are
corresponding hexadecimal md5 checksums to be checked against.
remove_archive : bool, optional
Boolean indicating whether to delete the archive(s) from disk
after the contents have been extracted. Defaults to True.
Returns
-------
list of str
The paths to the newly downloaded and unzipped files.
"""
if not os.path.exists(data_dir):
os.makedirs(data_dir)
Expand All @@ -67,6 +102,13 @@ def download_and_extract_archives(data_dir, base_url, filenames, md5sums=None,
def build_time_series_archive(archive_path, ts_paths):
"""Write a .tar.gz archive containing the given time series files, as
required for data uploaded via the front end.
Parameters
----------
archive_path : str
Path at which to create the tarfile.
ts_paths : list of str
Paths to time-series file to be included in tarfile.
"""
with tarfile.TarFile(archive_path, 'w') as t:
for fname in ts_paths:
Expand All @@ -76,6 +118,18 @@ def build_time_series_archive(archive_path, ts_paths):
def write_header(header_path, filenames, classes, metadata={}):
"""Write a header file for the given time series files, as required for
data uploaded via the front end.
Parameters
----------
header_path : str
Path at which header file will be created.
filenames : list of str
List of time-series file names associated with header file.
classes : list of str
List of class names associated with each time-series file.
metadata : dict, optional
Dictionary describing meta features associated with each time-series.
Keys are time-series file names.
"""
data_dict = {'filename': [util.shorten_fname(f) for f in filenames],
'class': classes}
Expand Down
41 changes: 37 additions & 4 deletions cesium/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,52 @@


def shorten_fname(file_path):
    """Extract the name of a file (omitting directory names and extensions).

    Parameters
    ----------
    file_path : str
        Absolute or relative path to a file.

    Returns
    -------
    str
        The name of the file with directory names and extensions removed.
    """
    # Strip any leading directory components first, then drop the extension.
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem


def make_list(x):
    """Wrap `x` in a list if it isn't already a non-string iterable.

    Parameters
    ----------
    x : any valid object
        The parameter to be wrapped in a list.

    Returns
    -------
    list or tuple
        Returns `[x]` if `x` is not already a non-string, non-dict iterable
        (e.g. a list or tuple), otherwise returns `x` unchanged.
    """
    # `collections.Iterable` was deprecated in Python 3.3 and removed in
    # Python 3.10; the ABC lives in `collections.abc`.
    # Strings and dicts are iterable but are treated as scalars here:
    # a str should not be split into characters, and a dict (e.g. a
    # parameter mapping) should be kept whole rather than iterated by key.
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, (str, dict)):
        return x
    return [x]


def remove_files(paths):
"""Remove specified files from disk."""
"""Remove specified file(s) from disk.
Parameters
----------
paths : str or list of str
Path(s) to file(s) to be removed from disk.
"""
paths = make_list(paths)
for path in paths:
try:
Expand Down

0 comments on commit 3e36983

Please sign in to comment.