Skip to content

Commit

Permalink
Fill out a few function docstrings (#196)
Browse files Browse the repository at this point in the history
* Fill out function docstrings

* Add CONTRIBUTING.md

* Expand hyperparameter explanation in docstrings

* Update CONTRIBUTING.md
  • Loading branch information
acrellin authored Oct 13, 2016
1 parent d39f4ae commit 3e36983
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 9 deletions.
11 changes: 11 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Contributing



## Contributing Time-Series Features

We gratefully accept contributions of new time-series features, be they
domain-specific or general. Please follow the guidelines below so that your
features can be successfully incorporated into the Cesium feature base.

*Coming soon*
58 changes: 54 additions & 4 deletions cesium/build_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,19 @@


def rectangularize_featureset(featureset):
"""Convert xarray.Dataset into (2d) Pandas.DataFrame for use with sklearn."""
"""Convert xarray.Dataset into (2d) Pandas.DataFrame for use with sklearn.
Params
------
featureset : xarray.Dataset
The xarray.Dataset object containing features.
Returns
-------
Pandas.DataFrame
2-D, sklearn-compatible Dataframe containing features.
"""
featureset = featureset.drop([coord for coord in featureset.coords
if coord not in ['name', 'channel']])
feature_df = featureset.to_dataframe()
Expand Down Expand Up @@ -68,15 +80,53 @@ def fit_model_optimize_hyperparams(data, targets, model, params_to_optimize,


def build_model_from_featureset(featureset, model=None, model_type=None,
model_options={}, params_to_optimize=None,
model_parameters={}, params_to_optimize=None,
cv=None):
"""Build model from (non-rectangular) xarray.Dataset of features."""
"""Build model from (non-rectangular) xarray.Dataset of features.
Parameters
----------
featureset : xarray.Dataset of features
Features for training model.
model : scikit-learn model, optional
Instantiated scikit-learn model. If None, `model_type` must not be.
Defaults to None.
model_type : str, optional
String indicating model to be used, e.g. "RandomForestClassifier".
If None, `model` must not be. Defaults to None.
model_parameters : dict, optional
Dictionary with hyperparameter values to be used in model building.
Keys are parameter names, values are the associated parameter values.
These hyperparameters will be passed to the model constructor as-is
(for hyperparameter optimization, see `params_to_optimize`).
If None, default values will be used (see scikit-learn documentation
for specifics).
params_to_optimize : dict or list of dict, optional
During hyperparameter optimization, various model parameters
are adjusted to give an optimal fit. This dictionary gives the
different values that should be explored for each parameter. E.g.,
`{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
6 combinations of alpha and beta and compare the resulting models'
goodness-of-fit. If None, only those hyperparameters specified in
`model_parameters` will be used (passed to model constructor as-is).
Defaults to None.
cv : int, cross-validation generator or an iterable, optional
Number of folds (defaults to 3 if None) or an iterable yielding
train/test splits. See documentation for `GridSearchCV` for details.
Defaults to None (yielding 3 folds).
Returns
-------
sklearn estimator object
The fitted sklearn model.
"""
if featureset.get('target') is None:
raise ValueError("Cannot build model for unlabeled feature set.")

if model is None:
if model_type:
model = MODELS_TYPE_DICT[model_type](**model_options)
model = MODELS_TYPE_DICT[model_type](**model_parameters)
else:
raise ValueError("If model is None, model_type must be specified")
feature_df = rectangularize_featureset(featureset)
Expand Down
56 changes: 55 additions & 1 deletion cesium/datasets/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,22 @@ def _md5sum_file(path):


def download_file(data_dir, base_url, filename):
"""Download a single file into the given directory."""
"""Download a single file into the given directory.
Parameters
----------
data_dir : str
Path to directory in which to save file.
base_url : str
URL of file to download, minus the file name.
filename : str
Name of file to be downloaded.
Returns
-------
str
The path to the newly downloaded file.
"""
if not os.path.exists(data_dir):
os.makedirs(data_dir)

Expand All @@ -43,6 +58,26 @@ def download_and_extract_archives(data_dir, base_url, filenames, md5sums=None,
remove_archive=True):
"""Download list of data archives, verify md5 checksums (if applicable),
and extract into the given directory.
Parameters
----------
data_dir : str
Path to directory in which to download and extract archives.
base_url : str
URL of files to download, minus the file names.
filenames : list or tuple of str
Name of file to be downloaded.
md5sums : dict, optional
Dictionary whose keys are file names and values are
corresponding hexadecimal md5 checksums to be checked against.
remove_archive : bool, optional
Boolean indicating whether to delete the archive(s) from disk
after the contents have been extracted. Defaults to True.
Returns
-------
list of str
The paths to the newly downloaded and unzipped files.
"""
if not os.path.exists(data_dir):
os.makedirs(data_dir)
Expand All @@ -67,6 +102,13 @@ def download_and_extract_archives(data_dir, base_url, filenames, md5sums=None,
def build_time_series_archive(archive_path, ts_paths):
"""Write a .tar.gz archive containing the given time series files, as
required for data uploaded via the front end.
Parameters
----------
archive_path : str
Path at which to create the tarfile.
ts_paths : list of str
Paths to time-series file to be included in tarfile.
"""
with tarfile.TarFile(archive_path, 'w') as t:
for fname in ts_paths:
Expand All @@ -76,6 +118,18 @@ def build_time_series_archive(archive_path, ts_paths):
def write_header(header_path, filenames, classes, metadata={}):
"""Write a header file for the given time series files, as required for
data uploaded via the front end.
Parameters
----------
header_path : str
Path at which header file will be created.
filenames : list of str
List of time-series file names associated with header file.
classes : list of str
List of class names associated with each time-series file.
metadata : dict, optional
Dictionary describing meta features associated with each time-series.
Keys are time-series file names.
"""
data_dict = {'filename': [util.shorten_fname(f) for f in filenames],
'class': classes}
Expand Down
41 changes: 37 additions & 4 deletions cesium/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,52 @@


def shorten_fname(file_path):
    """Extract the name of a file (omitting directory names and extensions).

    Parameters
    ----------
    file_path : str
        Absolute or relative path to a file.

    Returns
    -------
    str
        The name of the file with directory names and extensions removed.
    """
    # Strip any leading directory components first, then drop the extension.
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem


def make_list(x):
    """Wrap `x` in a list if it isn't already a non-string iterable.

    Parameters
    ----------
    x : any valid object
        The parameter to be wrapped in a list.

    Returns
    -------
    list or tuple
        Returns `[x]` if `x` is not already a non-string, non-dict iterable
        (e.g. a list or tuple), otherwise returns `x` unchanged.
    """
    # `collections.Iterable` was deprecated in Python 3.3 and removed in
    # Python 3.10; the ABC lives in `collections.abc`.
    # Strings and dicts are iterable but are treated as scalars here:
    # a str should not be split into characters, and a dict (e.g. a
    # parameter mapping) should be kept whole rather than iterated by key.
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, (str, dict)):
        return x
    return [x]


def remove_files(paths):
"""Remove specified files from disk."""
"""Remove specified file(s) from disk.
Parameters
----------
paths : str or list of str
Path(s) to file(s) to be removed from disk.
"""
paths = make_list(paths)
for path in paths:
try:
Expand Down

0 comments on commit 3e36983

Please sign in to comment.