Skip to content

Commit

Permalink
Merge pull request #162 from bnaul/featureset
Browse files Browse the repository at this point in the history
Add Featureset class extending xr.Dataset
  • Loading branch information
bnaul authored Oct 7, 2016
2 parents b431200 + 5e8304f commit 74a5426
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 4 deletions.
89 changes: 89 additions & 0 deletions cesium/featureset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import numpy as np
from sklearn.preprocessing import Imputer
import xarray as xr


__all__ = ['Featureset']


class Featureset(xr.Dataset):
    """Extension of `xarray.Dataset` class that implements some convenience
    functions specific to featuresets generated from a set of time series.

    In particular, provides a method `impute` for filling missing values and
    overloads indexing so that the `name` attribute becomes the "primary"
    coordinate to simplify extracting features for specific time series.
    """
    def __repr__(self):
        """Replace <xarray.Dataset> when printing."""
        s = xr.Dataset.__repr__(self)
        return s.replace('<xarray.', '<cesium.')

    def impute(self, strategy='constant', value=None):
        """Replace NaN/Inf values with imputed values as defined by `strategy`.

        Output should always satisfy `sklearn.validation.assert_all_finite` so
        that training a model will never produce an error.

        Parameters
        ----------
        strategy : str, optional
            The imputation strategy. Defaults to 'constant'.

            - 'constant': replace all missing with `value`
            - 'mean': replace all missing with mean along `axis`
            - 'median': replace all missing with median along `axis`
            - 'most_frequent': replace all missing with mode along `axis`
        value : float or None, optional
            Replacement value to use for `strategy='constant'`. Defaults to
            `None`, in which case a very large negative value is used (a
            good choice for e.g. random forests).

        Returns
        -------
        cesium.Featureset
            Featureset with no missing/infinite values.

        Raises
        ------
        NotImplementedError
            If `strategy` is not one of the recognized strategies above.
        """
        # Mask out non-finite entries (NaN and +/-Inf) so they can be filled.
        masked = self.where(abs(self) < np.inf)
        if strategy == 'constant':
            if value is None:
                # If no fill-in value is provided, use a large negative value:
                # -2x the largest absolute feature value across all variables.
                max_by_var = abs(masked).max()
                value = -2. * np.array([v.values
                                        for v in max_by_var.values()]).max()
            return masked.fillna(value)
        elif strategy in ('mean', 'median', 'most_frequent'):
            imputer = Imputer(strategy=strategy, axis=1)
            # Impute each feature variable in place along axis 1.
            for var, values in masked.data_vars.items():
                values[:] = imputer.fit_transform(values)
            return masked
        else:
            raise NotImplementedError("Imputation strategy '{}' not "
                                      "recognized.".format(strategy))

    def __getitem__(self, key):
        """Overloads indexing of `xarray.Dataset` to handle special cases for
        extracting features for specific time series. The `name` attribute is
        treated as the "primary" coordinate since this indicates which time
        series the features correspond to.

        - First, if we pass in indices/slice, return data corresponding to
          `name[key]`.
        - Next, if we pass in a set of labels that are all present in `name`,
          return data for those time series with `name`s present in `key`.
        - Otherwise, fall back on standard `xarray.Dataset` indexing.

        NOTE: the warning `FutureWarning: elementwise comparison failed;
        returning scalar instead, but in the future will perform elementwise
        comparison` is due to a bug in `numpy`:
        https://github.com/numpy/numpy/issues/6784
        """
        names = self._construct_dataarray('name').values
        if (isinstance(key, (slice, int))
            or (hasattr(key, '__iter__') and all(isinstance(el, int)
                                                 for el in key))):
            # Positional indexing along the `name` dimension.
            return super().isel(name=key)
        elif ((hasattr(key, '__iter__') and all(el in names for el in key)) or
              key in names):
            # Label-based indexing: all requested labels exist in `name`.
            return super().sel(name=key)
        else:
            # Fall back on default Dataset behavior (e.g. variable access).
            return super().__getitem__(key)
5 changes: 3 additions & 2 deletions cesium/featurize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from . import data_management
from . import time_series
from . import util
from .featureset import Featureset
from .time_series import TimeSeries
from . import obs_feature_tools as oft
from . import science_feature_tools as sft
Expand Down Expand Up @@ -137,7 +138,7 @@ def assemble_featureset(feature_dicts, time_series=None, targets=None,
featureset.coords['name'] = ('name', np.array(names))
if targets is not None and any(targets):
featureset.coords['target'] = ('name', np.array(targets))
return featureset
return Featureset(featureset)


def load_and_store_feature_data(features_path, output_path):
Expand All @@ -147,7 +148,7 @@ def load_and_store_feature_data(features_path, output_path):
featureset = assemble_featureset([], targets=targets,
meta_feature_dicts=meta_feature_dicts)
featureset.to_netcdf(output_path)
return featureset
return Featureset(featureset)


# TODO should this be changed to use TimeSeries objects? or maybe an optional
Expand Down
8 changes: 6 additions & 2 deletions cesium/tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import xarray as xr

from cesium.featureset import Featureset
from cesium.time_series import TimeSeries


Expand Down Expand Up @@ -38,7 +39,8 @@ def sample_ts_files(size, targets=[None]):
shutil.rmtree(temp_dir)


def sample_featureset(size, n_channels=1, features=[], targets=None):
def sample_featureset(size, n_channels=1, features=[], targets=None,
labels=None):
ts_names = np.arange(size).astype('str')
feat_dict = {f: (['channel', 'name'], [np.random.random(size)
for i in range(n_channels)])
Expand All @@ -48,5 +50,7 @@ def sample_featureset(size, n_channels=1, features=[], targets=None):
if targets:
ts_targets = np.array(list(islice(cycle(targets), size)))
fset.coords['target'] = ('name', ts_targets)
if labels:
fset.name.values = labels

return fset
return Featureset(fset)
50 changes: 50 additions & 0 deletions cesium/tests/test_featureset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
from os.path import join as pjoin
import numpy as np
import numpy.testing as npt
import scipy.stats
import xarray as xr
from cesium.tests.fixtures import sample_featureset


def test_repr():
    """Smoke test: printing a Featureset should not raise."""
    features = ['amplitude', 'maximum', 'minimum']
    classes = ['class1', 'class2']
    fset = sample_featureset(10, 3, features, classes)
    repr(fset)


def test_impute():
    """Test imputation of missing Featureset values."""
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    # Plant one Inf and one NaN in the first channel; keep the finite rest.
    amp = fset.amplitude.values
    amp[0, 0] = np.inf
    amp[0, 1] = np.nan
    finite_vals = amp[0, 2:]

    # Constant strategy: both missing entries become the supplied value.
    result = fset.impute(strategy='constant', value=-1e4)
    npt.assert_allclose(-1e4, result.amplitude.values[0, 0:2])

    # Statistical strategies: missing entries take the statistic of the
    # finite values, while finite values are left untouched.
    for strat, expected in [('mean', np.mean(finite_vals)),
                            ('median', np.median(finite_vals)),
                            ('most_frequent',
                             scipy.stats.mode(finite_vals).mode.item())]:
        result = fset.impute(strategy=strat)
        npt.assert_allclose(expected, result.amplitude.values[0, 0:2])
        npt.assert_allclose(finite_vals, result.amplitude.values[0, 2:])


def test_indexing():
    """Test indexing overloading (`__getitem__`)."""
    # Docstring moved to the top of the function (previously it followed the
    # first statement, so it was not actually a docstring) and corrected to
    # reference `__getitem__` rather than `__getattr__`.
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    # Positional indexing by int and slice.
    assert all(fset[0] == fset.isel(name=0))
    assert all(fset[0:2] == fset.isel(name=[0, 1]))
    # Label-based indexing by single name and list of names.
    assert all(fset['a'] == fset.sel(name='a'))
    assert all(fset[['a', 'b']] == fset.sel(name=['a', 'b']))
    # Variable access falls back on standard Dataset behavior.
    npt.assert_allclose(fset['amplitude'].values.ravel(),
                        fset.data_vars['amplitude'].values.ravel())

0 comments on commit 74a5426

Please sign in to comment.