Skip to content

Commit

Permalink
Merge pull request #162 from bnaul/featureset
Browse files Browse the repository at this point in the history
Add Featureset class extending xr.Dataset
  • Loading branch information
bnaul authored Oct 7, 2016
2 parents b431200 + 5e8304f commit 74a5426
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 4 deletions.
89 changes: 89 additions & 0 deletions cesium/featureset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import numpy as np
from sklearn.preprocessing import Imputer
import xarray as xr


__all__ = ['Featureset']


class Featureset(xr.Dataset):
    """Extension of `xarray.Dataset` class that implements some convenience
    functions specific to featuresets generated from a set of time series.

    In particular, provides a method `impute` for filling missing values and
    overloads indexing so that the `name` attribute becomes the "primary"
    coordinate to simplify extracting features for specific time series.
    """
    def __repr__(self):
        """Replace <xarray.Dataset> when printing."""
        s = xr.Dataset.__repr__(self)
        return s.replace('<xarray.', '<cesium.')

    def impute(self, strategy='constant', value=None):
        """Replace NaN/Inf values with imputed values as defined by `strategy`.

        Output should always satisfy `sklearn.validation.assert_all_finite` so
        that training a model will never produce an error.

        Parameters
        ----------
        strategy : str, optional
            The imputation strategy. Defaults to 'constant'.

            - 'constant': replace all missing with `value`
            - 'mean': replace all missing with mean along `axis`
            - 'median': replace all missing with median along `axis`
            - 'most_frequent': replace all missing with mode along `axis`
        value : float or None, optional
            Replacement value to use for `strategy='constant'`. Defaults to
            `None`, in which case a very large negative value is used (a
            good choice for e.g. random forests).

        Returns
        -------
        cesium.Featureset
            Featureset with no missing/infinite values.

        Raises
        ------
        NotImplementedError
            If `strategy` is not one of the recognized strategies above.
        """
        # Mask out non-finite entries (NaN and +/-Inf) so they can be filled.
        masked = self.where(abs(self) < np.inf)
        if strategy == 'constant':
            if value is None:
                # If no fill-in value is provided, use a large negative value:
                # -2x the largest absolute feature value across all variables.
                max_by_var = abs(masked).max()
                value = -2. * np.array([v.values
                                        for v in max_by_var.values()]).max()
            return masked.fillna(value)
        elif strategy in ('mean', 'median', 'most_frequent'):
            imputer = Imputer(strategy=strategy, axis=1)
            # Impute each feature variable in place along axis 1.
            for var, values in masked.data_vars.items():
                values[:] = imputer.fit_transform(values)
            return masked
        else:
            raise NotImplementedError("Imputation strategy '{}' not "
                                      "recognized.".format(strategy))

    def __getitem__(self, key):
        """Overloads indexing of `xarray.Dataset` to handle special cases for
        extracting features for specific time series. The `name` attribute is
        treated as the "primary" coordinate since this indicates which time
        series the features correspond to.

        - First, if we pass in indices/slice, return data corresponding to
          `name[key]`.
        - Next, if we pass in a set of labels that are all present in `name`,
          return data for those time series with `name`s present in `key`.
        - Otherwise, fall back on standard `xarray.Dataset` indexing.

        NOTE: the warning `FutureWarning: elementwise comparison failed;
        returning scalar instead, but in the future will perform elementwise
        comparison` is due to a bug in `numpy`:
        https://github.com/numpy/numpy/issues/6784
        """
        names = self._construct_dataarray('name').values
        if (isinstance(key, (slice, int))
            or (hasattr(key, '__iter__') and all(isinstance(el, int)
                                                 for el in key))):
            # Positional indexing along the `name` dimension.
            return super().isel(name=key)
        elif ((hasattr(key, '__iter__') and all(el in names for el in key)) or
              key in names):
            # Label-based indexing: all requested labels exist in `name`.
            return super().sel(name=key)
        else:
            # Fall back on default Dataset behavior (e.g. variable access).
            return super().__getitem__(key)
5 changes: 3 additions & 2 deletions cesium/featurize.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from . import data_management
from . import time_series
from . import util
from .featureset import Featureset
from .time_series import TimeSeries
from . import obs_feature_tools as oft
from . import science_feature_tools as sft
Expand Down Expand Up @@ -137,7 +138,7 @@ def assemble_featureset(feature_dicts, time_series=None, targets=None,
featureset.coords['name'] = ('name', np.array(names))
if targets is not None and any(targets):
featureset.coords['target'] = ('name', np.array(targets))
return featureset
return Featureset(featureset)


def load_and_store_feature_data(features_path, output_path):
Expand All @@ -147,7 +148,7 @@ def load_and_store_feature_data(features_path, output_path):
featureset = assemble_featureset([], targets=targets,
meta_feature_dicts=meta_feature_dicts)
featureset.to_netcdf(output_path)
return featureset
return Featureset(featureset)


# TODO should this be changed to use TimeSeries objects? or maybe an optional
Expand Down
8 changes: 6 additions & 2 deletions cesium/tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import xarray as xr

from cesium.featureset import Featureset
from cesium.time_series import TimeSeries


Expand Down Expand Up @@ -38,7 +39,8 @@ def sample_ts_files(size, targets=[None]):
shutil.rmtree(temp_dir)


def sample_featureset(size, n_channels=1, features=[], targets=None):
def sample_featureset(size, n_channels=1, features=[], targets=None,
labels=None):
ts_names = np.arange(size).astype('str')
feat_dict = {f: (['channel', 'name'], [np.random.random(size)
for i in range(n_channels)])
Expand All @@ -48,5 +50,7 @@ def sample_featureset(size, n_channels=1, features=[], targets=None):
if targets:
ts_targets = np.array(list(islice(cycle(targets), size)))
fset.coords['target'] = ('name', ts_targets)
if labels:
fset.name.values = labels

return fset
return Featureset(fset)
50 changes: 50 additions & 0 deletions cesium/tests/test_featureset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
from os.path import join as pjoin
import numpy as np
import numpy.testing as npt
import scipy.stats
import xarray as xr
from cesium.tests.fixtures import sample_featureset


def test_repr():
    """Smoke test: printing a Featureset should not raise."""
    features = ['amplitude', 'maximum', 'minimum']
    classes = ['class1', 'class2']
    fset = sample_featureset(10, 3, features, classes)
    repr(fset)


def test_impute():
    """Test imputation of missing Featureset values."""
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    # Plant one Inf and one NaN in the first channel; keep the finite rest.
    amp = fset.amplitude.values
    amp[0, 0] = np.inf
    amp[0, 1] = np.nan
    finite_vals = amp[0, 2:]

    # Constant strategy: both missing entries become the supplied value.
    result = fset.impute(strategy='constant', value=-1e4)
    npt.assert_allclose(-1e4, result.amplitude.values[0, 0:2])

    # Statistical strategies: missing entries take the statistic of the
    # finite values, while finite values are left untouched.
    for strat, expected in [('mean', np.mean(finite_vals)),
                            ('median', np.median(finite_vals)),
                            ('most_frequent',
                             scipy.stats.mode(finite_vals).mode.item())]:
        result = fset.impute(strategy=strat)
        npt.assert_allclose(expected, result.amplitude.values[0, 0:2])
        npt.assert_allclose(finite_vals, result.amplitude.values[0, 2:])


def test_indexing():
    """Test indexing overloading (`__getitem__`)."""
    # Docstring moved to the top of the function (previously it followed the
    # first statement, so it was not actually a docstring) and corrected to
    # reference `__getitem__` rather than `__getattr__`.
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    # Positional indexing by int and slice.
    assert all(fset[0] == fset.isel(name=0))
    assert all(fset[0:2] == fset.isel(name=[0, 1]))
    # Label-based indexing by single name and list of names.
    assert all(fset['a'] == fset.sel(name='a'))
    assert all(fset[['a', 'b']] == fset.sel(name=['a', 'b']))
    # Variable access falls back on standard Dataset behavior.
    npt.assert_allclose(fset['amplitude'].values.ravel(),
                        fset.data_vars['amplitude'].values.ravel())

0 comments on commit 74a5426

Please sign in to comment.