diff --git a/lime/discretize.py b/lime/discretize.py
index 41635198..246c1e7c 100644
--- a/lime/discretize.py
+++ b/lime/discretize.py
@@ -18,7 +18,8 @@ class BaseDiscretizer():
 
     __metaclass__ = ABCMeta  # abstract class
 
-    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
+    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
+                 data_stats=None):
         """Initializer
         Args:
             data: numpy 2d array
@@ -31,9 +32,12 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando
                 column x.
             feature_names: list of names (strings) corresponding to the columns
                 in the training data.
+            data_stats: dict that must contain 'means', 'stds', 'mins' and
+                'maxs'; use this if you do not want these values to be
+                computed from data
         """
         self.to_discretize = ([x for x in range(data.shape[1])
-                              if x not in categorical_features])
+                               if x not in categorical_features])
+        self.data_stats = data_stats
         self.names = {}
         self.lambdas = {}
         self.means = {}
@@ -46,6 +50,13 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando
         bins = self.bins(data, labels)
         bins = [np.unique(x) for x in bins]
 
+        # Read the stats from data_stats if it is provided
+        if data_stats:
+            self.means = self.data_stats.get("means")
+            self.stds = self.data_stats.get("stds")
+            self.mins = self.data_stats.get("mins")
+            self.maxs = self.data_stats.get("maxs")
+
         for feature, qts in zip(self.to_discretize, bins):
             n_bins = qts.shape[0]  # Actually number of borders (= #bins-1)
             boundaries = np.min(data[:, feature]), np.max(data[:, feature])
@@ -60,6 +71,10 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando
             self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)
             discretized = self.lambdas[feature](data[:, feature])
 
+            # If data stats are provided, there is no need to compute the details below
+            if data_stats:
+                continue
+
             self.means[feature] = []
             self.stds[feature] = []
             for x in range(n_bins + 1):
@@ -117,6 +132,31 @@ def get_inverse(q):
         return ret
 
 
+class StatsDiscretizer(BaseDiscretizer):
+    """
+    Class to be used to supply the data stats info when discretize_continuous is True
+    """
+
+    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
+                 data_stats=None):
+
+        BaseDiscretizer.__init__(self, data, categorical_features,
+                                 feature_names, labels=labels,
+                                 random_state=random_state,
+                                 data_stats=data_stats)
+
+    def bins(self, data, labels):
+        bins_from_stats = self.data_stats.get("bins")
+        bins = []
+        if bins_from_stats is not None:
+            for feature in self.to_discretize:
+                bins_from_stats_feature = bins_from_stats.get(feature)
+                if bins_from_stats_feature is not None:
+                    qts = np.array(bins_from_stats_feature)
+                    bins.append(qts)
+        return bins
+
+
 class QuartileDiscretizer(BaseDiscretizer):
     def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
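For context, a minimal sketch of how the new StatsDiscretizer can be fed precomputed statistics. The toy array, feature names and RandomState seed below are invented for illustration; the stats are simply replayed from a previously fitted QuartileDiscretizer, which is the same recipe the new test uses.

import numpy as np
from lime.discretize import QuartileDiscretizer, StatsDiscretizer

# Hypothetical training data: 100 rows, 2 continuous columns.
train = np.random.RandomState(0).normal(size=(100, 2))
feature_names = ['f0', 'f1']

# Fit a quartile discretizer once (e.g. offline) and capture its stats.
quartile = QuartileDiscretizer(train, [], feature_names)
data_stats = {
    'means': quartile.means,  # {feature index: per-bin means}
    'stds': quartile.stds,    # {feature index: per-bin stds}
    'mins': quartile.mins,    # {feature index: per-bin mins}
    'maxs': quartile.maxs,    # {feature index: per-bin maxs}
    # 'bins' maps feature index to its list of quantile boundaries
    'bins': {f: qts.tolist()
             for f, qts in zip(quartile.to_discretize,
                               quartile.bins(train, None))},
}

# Replay the captured stats instead of recomputing them from data.
stats_discretizer = StatsDiscretizer(train, [], feature_names,
                                     data_stats=data_stats)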
diff --git a/lime/lime_tabular.py b/lime/lime_tabular.py
index f745c26a..cdd9ee9d 100644
--- a/lime/lime_tabular.py
+++ b/lime/lime_tabular.py
@@ -16,6 +16,7 @@ from lime.discretize import DecileDiscretizer
 from lime.discretize import EntropyDiscretizer
 from lime.discretize import BaseDiscretizer
+from lime.discretize import StatsDiscretizer
 from . import explanation
 from . import lime_base
@@ -112,7 +113,8 @@ def __init__(self,
                  discretize_continuous=True,
                  discretizer='quartile',
                  sample_around_instance=False,
-                 random_state=None):
+                 random_state=None,
+                 training_data_stats=None):
         """Init function.
 
         Args:
@@ -153,11 +155,21 @@ def __init__(self,
             random_state: an integer or numpy.RandomState that will be used to
                 generate random numbers. If None, the random state will be
                 initialized using the internal numpy seed.
+            training_data_stats: a dict object having the details of training data
+                statistics. If None, training data information will be used; only matters
+                if discretize_continuous is True. Must have the following keys:
+                "means", "mins", "maxs", "stds", "feature_values",
+                "feature_frequencies"
         """
         self.random_state = check_random_state(random_state)
         self.mode = mode
         self.categorical_names = categorical_names or {}
         self.sample_around_instance = sample_around_instance
+        self.training_data_stats = training_data_stats
+
+        # Validate the supplied stats and raise a proper error if keys are missing
+        if self.training_data_stats:
+            self.validate_training_data_stats(self.training_data_stats)
 
         if categorical_features is None:
             categorical_features = []
@@ -169,6 +181,12 @@ def __init__(self,
         self.discretizer = None
         if discretize_continuous:
+            # Set the discretizer if training data stats are provided
+            if self.training_data_stats:
+                discretizer = StatsDiscretizer(training_data, self.categorical_features,
+                                               self.feature_names, labels=training_labels,
+                                               data_stats=self.training_data_stats)
+
             if discretizer == 'quartile':
                 self.discretizer = QuartileDiscretizer(
                         training_data, self.categorical_features,
@@ -188,7 +206,10 @@ def __init__(self,
                                  ''' 'decile', 'entropy' or a''' +
                                  ''' BaseDiscretizer instance''')
             self.categorical_features = list(range(training_data.shape[1]))
-            discretized_training_data = self.discretizer.discretize(
-                training_data)
+
+            # Get the discretized_training_data when the stats are not provided
+            if self.training_data_stats is None:
+                discretized_training_data = self.discretizer.discretize(
+                    training_data)
 
         if kernel_width is None:
@@ -203,21 +224,27 @@ def kernel(d, kernel_width):
         self.feature_selection = feature_selection
         self.base = lime_base.LimeBase(kernel_fn, verbose, random_state=self.random_state)
-        self.scaler = None
         self.class_names = class_names
+
+        # Though the scaler has no role to play if training data stats are provided
+        self.scaler = None
         self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
         self.scaler.fit(training_data)
         self.feature_values = {}
         self.feature_frequencies = {}
 
         for feature in self.categorical_features:
-            if self.discretizer is not None:
-                column = discretized_training_data[:, feature]
-            else:
-                column = training_data[:, feature]
+            if training_data_stats is None:
+                if self.discretizer is not None:
+                    column = discretized_training_data[:, feature]
+                else:
+                    column = training_data[:, feature]
 
-            feature_count = collections.Counter(column)
-            values, frequencies = map(list, zip(*(sorted(feature_count.items()))))
+                feature_count = collections.Counter(column)
+                values, frequencies = map(list, zip(*(sorted(feature_count.items()))))
+            else:
+                values = training_data_stats["feature_values"][feature]
+                frequencies = training_data_stats["feature_frequencies"][feature]
 
             self.feature_values[feature] = values
             self.feature_frequencies[feature] = (np.array(frequencies) /
@@ -229,6 +256,17 @@ def kernel(d, kernel_width):
     def convert_and_round(values):
         return ['%.2f' % v for v in values]
 
+    @staticmethod
+    def validate_training_data_stats(training_data_stats):
+        """
+        Method to validate the structure of training data stats
+        """
+        stat_keys = list(training_data_stats.keys())
+        valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"]
+        missing_keys = list(set(valid_stat_keys) - set(stat_keys))
+        if len(missing_keys) > 0:
+            raise Exception("Missing keys in training_data_stats. Details: %s" % (missing_keys))
+
     def explain_instance(self,
                          data_row,
                          predict_fn,
@@ -414,8 +452,8 @@ def __data_inverse(self,
             categorical_features = range(data_row.shape[0])
             if self.discretizer is None:
                 data = self.random_state.normal(
-                        0, 1, num_samples * data_row.shape[0]).reshape(
-                        num_samples, data_row.shape[0])
+                    0, 1, num_samples * data_row.shape[0]).reshape(
+                    num_samples, data_row.shape[0])
                 if self.sample_around_instance:
                     data = data * self.scaler.scale_ + data_row
                 else:
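With the explainer-side changes in place, the whole pipeline can run from precomputed statistics alone. A hedged sketch, assuming feature_names plus a data_stats dict shaped as above and extended with the 'feature_values' and 'feature_frequencies' entries that validate_training_data_stats requires; as in the test that follows, a zero array serves as a stand-in for the real training data.

import numpy as np
from lime.lime_tabular import LimeTabularExplainer

# data_stats is assumed to already hold 'means', 'stds', 'mins', 'maxs',
# 'bins', 'feature_values' and 'feature_frequencies'.
placeholder = np.zeros((2, len(feature_names)))
explainer = LimeTabularExplainer(placeholder,
                                 feature_names=feature_names,
                                 training_data_stats=data_stats)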
diff --git a/lime/tests/test_lime_tabular.py b/lime/tests/test_lime_tabular.py
index cc860320..306b5cb2 100644
--- a/lime/tests/test_lime_tabular.py
+++ b/lime/tests/test_lime_tabular.py
@@ -1,10 +1,11 @@
 import unittest
 
 import numpy as np
-import sklearn # noqa
+import collections
+import sklearn  # noqa
 import sklearn.datasets
 import sklearn.ensemble
-import sklearn.linear_model # noqa
+import sklearn.linear_model  # noqa
 from numpy.testing import assert_array_equal
 from sklearn.datasets import load_iris, make_classification
 from sklearn.ensemble import RandomForestClassifier
@@ -12,6 +13,7 @@
 from sklearn.linear_model import LinearRegression
 
 from lime.discretize import QuartileDiscretizer, DecileDiscretizer, EntropyDiscretizer
+
 try:
     from sklearn.model_selection import train_test_split
 except ImportError:
@@ -577,6 +579,72 @@ def testFeatureValues(self):
         assert_array_equal(explainer.feature_frequencies[1], np.array([.25, .25, .25, .25]))
         assert_array_equal(explainer.feature_frequencies[2], np.array([.5, .5]))
 
+    def test_lime_explainer_with_data_stats(self):
+        np.random.seed(1)
+
+        rf = RandomForestClassifier(n_estimators=500)
+        rf.fit(self.train, self.labels_train)
+        i = np.random.randint(0, self.test.shape[0])
+
+        # Generate stats using a quartile discretizer
+        discretizer = QuartileDiscretizer(self.train, [], self.feature_names, self.target_names,
+                                          random_state=20)
+
+        d_means = discretizer.means
+        d_stds = discretizer.stds
+        d_mins = discretizer.mins
+        d_maxs = discretizer.maxs
+        d_bins = discretizer.bins(self.train, self.target_names)
+
+        # Compute feature values and frequencies of all columns
+        cat_features = np.arange(self.train.shape[1])
+        discretized_training_data = discretizer.discretize(self.train)
+
+        feature_values = {}
+        feature_frequencies = {}
+        for feature in cat_features:
+            column = discretized_training_data[:, feature]
+            feature_count = collections.Counter(column)
+            values, frequencies = map(list, zip(*(feature_count.items())))
+            feature_values[feature] = values
+            feature_frequencies[feature] = frequencies
+
+        # Convert bins from arrays to lists
+        d_bins_revised = {}
+        for index, qts in enumerate(d_bins):
+            d_bins_revised[index] = qts.tolist()
+
+        # Discretized stats
+        data_stats = {}
+        data_stats["means"] = d_means
+        data_stats["stds"] = d_stds
+        data_stats["maxs"] = d_maxs
+        data_stats["mins"] = d_mins
+        data_stats["bins"] = d_bins_revised
+        data_stats["feature_values"] = feature_values
+        data_stats["feature_frequencies"] = feature_frequencies
+
+        data = np.zeros((2, len(self.feature_names)))
+        explainer = LimeTabularExplainer(
+            data, feature_names=self.feature_names, random_state=10,
+            training_data_stats=data_stats, training_labels=self.target_names)
+
+        exp = explainer.explain_instance(self.test[i],
+                                         rf.predict_proba,
+                                         num_features=2,
+                                         model_regressor=LinearRegression())
+
+        self.assertIsNotNone(exp)
+        keys = [x[0] for x in exp.as_list()]
+        self.assertEqual(1,
+                         sum([1 if 'petal width' in x else 0 for x in keys]),
+                         "Petal Width is a major feature")
+        self.assertEqual(1,
+                         sum([1 if 'petal length' in x else 0 for x in keys]),
+                         "Petal Length is a major feature")
+
 
 if __name__ == '__main__':
     unittest.main()
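The stats-gathering boilerplate in the test could also be packaged as a small helper. A sketch only: build_training_data_stats is a hypothetical convenience function, not part of this diff; it assembles the keys the new training_data_stats argument expects from a quartile discretizer and column counts.

import collections
from lime.discretize import QuartileDiscretizer

def build_training_data_stats(data, categorical_features, feature_names):
    # Hypothetical helper: assembles every key that the new
    # training_data_stats argument of LimeTabularExplainer expects.
    d = QuartileDiscretizer(data, categorical_features, feature_names)
    discretized = d.discretize(data)
    feature_values, feature_frequencies = {}, {}
    for feature in range(data.shape[1]):
        counts = collections.Counter(discretized[:, feature])
        values, frequencies = map(list, zip(*sorted(counts.items())))
        feature_values[feature] = values
        feature_frequencies[feature] = frequencies
    return {
        'means': d.means, 'stds': d.stds, 'mins': d.mins, 'maxs': d.maxs,
        'bins': {f: qts.tolist()
                 for f, qts in zip(d.to_discretize, d.bins(data, None))},
        'feature_values': feature_values,
        'feature_frequencies': feature_frequencies,
    }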