From 606e69c6967c6ba1c778c6b8ae36c1b7adf17b4c Mon Sep 17 00:00:00 2001 From: Ari Crellin-Quick Date: Thu, 6 Oct 2016 14:17:25 -0700 Subject: [PATCH] Fill out function docstrings --- cesium/build_model.py | 47 ++++++++++++++++++++++++++++++++-- cesium/datasets/util.py | 56 ++++++++++++++++++++++++++++++++++++++++- cesium/util.py | 41 +++++++++++++++++++++++++++--- 3 files changed, 137 insertions(+), 7 deletions(-) diff --git a/cesium/build_model.py b/cesium/build_model.py index 8aeaccdc..bc8fd653 100644 --- a/cesium/build_model.py +++ b/cesium/build_model.py @@ -20,7 +20,19 @@ def rectangularize_featureset(featureset): - """Convert xarray.Dataset into (2d) Pandas.DataFrame for use with sklearn.""" + """Convert xarray.Dataset into (2d) pandas.DataFrame for use with sklearn. + + Parameters + ---------- + featureset : xarray.Dataset + The xarray.Dataset object containing features. + + Returns + ------- + pandas.DataFrame + 2-D, sklearn-compatible DataFrame containing features. + + """ featureset = featureset.drop([coord for coord in featureset.coords if coord not in ['name', 'channel']]) feature_df = featureset.to_dataframe() @@ -71,7 +83,38 @@ def fit_model_optimize_hyperparams(data, targets, model, params_to_optimize, def build_model_from_featureset(featureset, model=None, model_type=None, model_options={}, params_to_optimize=None, cv=None): - """Build model from (non-rectangular) xarray.Dataset of features.""" + """Build model from (non-rectangular) xarray.Dataset of features. + + Parameters + ---------- + featureset : xarray.Dataset of features + Features for training model. + model : scikit-learn model, optional + Instantiated scikit-learn model. If None, `model_type` must not be. + Defaults to None. + model_type : str, optional + String indicating model to be used, e.g. "RandomForestClassifier". + If None, `model` must not be. Defaults to None. + model_options : dict, optional + Dictionary with hyperparameter values to be used in model building.
+ Keys are parameter names, values are the associated values. + params_to_optimize : list of str, optional + List of parameters to be optimized (whose corresponding entries + in `model_options` would be a list of values to try). If None, + parameters specified in `model_options` will be passed to model + constructor as-is (i.e. they are assumed not to be lists/grids of + values to try). Defaults to None. + cv : int, cross-validation generator or an iterable, optional + Number of folds (defaults to 3 if None) or an iterable yielding + train/test splits. See documentation for `GridSearchCV` for details. + Defaults to None (yielding 3 folds). + + Returns + ------- + sklearn estimator object + The fitted sklearn model. + + """ if featureset.get('target') is None: raise ValueError("Cannot build model for unlabeled feature set.") diff --git a/cesium/datasets/util.py b/cesium/datasets/util.py index d264a2ca..4b26866e 100644 --- a/cesium/datasets/util.py +++ b/cesium/datasets/util.py @@ -28,7 +28,22 @@ def _md5sum_file(path): def download_file(data_dir, base_url, filename): - """Download a single file into the given directory.""" + """Download a single file into the given directory. + + Parameters + ---------- + data_dir : str + Path to directory in which to save file. + base_url : str + URL of file to download, minus the file name. + filename : str + Name of file to be downloaded. + + Returns + ------- + str + The path to the newly downloaded file. + """ if not os.path.exists(data_dir): os.makedirs(data_dir) @@ -43,6 +58,26 @@ def download_and_extract_archives(data_dir, base_url, filenames, md5sums=None, remove_archive=True): """Download list of data archives, verify md5 checksums (if applicable), and extract into the given directory. + + Parameters + ---------- + data_dir : str + Path to directory in which to download and extract archives. + base_url : str + URL of files to download, minus the file names. 
+ filenames : list or tuple of str + Names of files to be downloaded. + md5sums : dict, optional + Dictionary whose keys are file names and values are + corresponding hexadecimal md5 checksums to be checked against. + remove_archive : bool, optional + Boolean indicating whether to delete the archive(s) from disk + after the contents have been extracted. Defaults to True. + + Returns + ------- + list of str + The paths to the newly downloaded and unzipped files. """ if not os.path.exists(data_dir): os.makedirs(data_dir) @@ -67,6 +102,13 @@ def download_and_extract_archives(data_dir, base_url, filenames, md5sums=None, def build_time_series_archive(archive_path, ts_paths): """Write a .tar.gz archive containing the given time series files, as required for data uploaded via the front end. + + Parameters + ---------- + archive_path : str + Path at which to create the tarfile. + ts_paths : list of str + Paths to time-series files to be included in tarfile. """ with tarfile.TarFile(archive_path, 'w') as t: for fname in ts_paths: @@ -76,6 +118,18 @@ def build_time_series_archive(archive_path, ts_paths): def write_header(header_path, filenames, classes, metadata={}): """Write a header file for the given time series files, as required for data uploaded via the front end. + + Parameters + ---------- + header_path : str + Path at which header file will be created. + filenames : list of str + List of time-series file names associated with header file. + classes : list of str + List of class names associated with each time-series file. + metadata : dict, optional + Dictionary describing meta features associated with each time-series. + Keys are time-series file names.
""" data_dict = {'filename': [util.shorten_fname(f) for f in filenames], 'class': classes} diff --git a/cesium/util.py b/cesium/util.py index f192560a..fbdfb900 100644 --- a/cesium/util.py +++ b/cesium/util.py @@ -13,19 +13,52 @@ def shorten_fname(file_path): - """Extract the name of a file (omitting directory names and extensions).""" + """Extract the name of a file (omitting directory names and extensions). + + Parameters + ---------- + file_path : str + Absolute or relative path to a file. + + Returns + ------- + str + The name of the file with directory names and the final extension removed. + + """ return os.path.splitext(os.path.basename(file_path))[0] def make_list(x): - if isinstance(x, collections.Iterable) and not isinstance(x, str): + """Wrap `x` in a list unless it is already a non-str, non-dict iterable. + + Parameters + ---------- + x : any valid object + The parameter to be wrapped in a list. + + Returns + ------- + list or iterable + Returns `[x]` if `x` is a str, a dict, or not iterable, otherwise + returns `x` unchanged. + + """ + if isinstance(x, collections.Iterable) and not isinstance(x, (str, dict)): return x else: - return [x,] + return [x] def remove_files(paths): - """Remove specified files from disk.""" + """Remove specified file(s) from disk. + + Parameters + ---------- + paths : str or list of str + Path(s) to file(s) to be removed from disk. + + """ paths = make_list(paths) for path in paths: try: