Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix KElbow get_params #1251

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
520 changes: 520 additions & 0 deletions examples/bbengfort/corpus.ipynb

Large diffs are not rendered by default.

649 changes: 248 additions & 401 deletions examples/bbengfort/testing.ipynb

Large diffs are not rendered by default.

33 changes: 26 additions & 7 deletions tests/test_cluster/test_elbow.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,16 @@ def test_invalid_k(self):
"""
Assert that invalid values of K raise exceptions
"""
# Generate a blobs data set
X, y = make_blobs(
n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42
)

with pytest.raises(YellowbrickValueError):
KElbowVisualizer(KMeans(), k=(1, 2, 3, "foo", 5))
KElbowVisualizer(KMeans(), k=(1, 2, 3, "foo", 5)).fit(X)

with pytest.raises(YellowbrickValueError):
KElbowVisualizer(KMeans(), k="foo")
KElbowVisualizer(KMeans(), k="foo").fit(X)

def test_valid_k(self):
"""
Expand All @@ -220,16 +224,21 @@ def test_valid_k(self):
# if k is a tuple of 2 ints, k_values = range(k[0], k[1])
# if k is an iterable, k_values_ = list(k)

visualizer = KElbowVisualizer(KMeans(), k=8)
# Generate a blobs data set
X, y = make_blobs(
n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42
)

visualizer = KElbowVisualizer(KMeans(), k=8).fit(X)
assert visualizer.k_values_ == list(np.arange(2, 8 + 1))

visualizer = KElbowVisualizer(KMeans(), k=(4, 12))
visualizer = KElbowVisualizer(KMeans(), k=(4, 12)).fit(X)
assert visualizer.k_values_ == list(np.arange(4, 12))

visualizer = KElbowVisualizer(KMeans(), k=np.arange(10, 100, 10))
visualizer = KElbowVisualizer(KMeans(), k=np.arange(10, 100, 10)).fit(X)
assert visualizer.k_values_ == list(np.arange(10, 100, 10))

visualizer = KElbowVisualizer(KMeans(), k=[10, 20, 30, 40, 50, 60, 70, 80, 90])
visualizer = KElbowVisualizer(KMeans(), k=[10, 20, 30, 40, 50, 60, 70, 80, 90]).fit(X)
assert visualizer.k_values_ == list(np.arange(10, 100, 10))

@pytest.mark.xfail(sys.platform == "win32", reason="images not close on windows")
Expand Down Expand Up @@ -491,4 +500,14 @@ def test_set_colors_manually(self):
# Execute drawing
oz.draw()
oz.finalize()
self.assert_images_similar(oz, tol=3.2)
self.assert_images_similar(oz, tol=3.2)

def test_get_params(self):
    """
    Ensure the get params works for sklearn-compatibility
    """
    # get_params must succeed on an unfitted visualizer so that
    # sklearn tooling (clone, GridSearchCV) can introspect it
    visualizer = KElbowVisualizer(KMeans(random_state=0), k=5)
    assert len(visualizer.get_params()) > 0
10 changes: 9 additions & 1 deletion tests/test_model_selection/test_dropping_curve.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,12 @@ def test_bad_train_sizes(self):
Test learning curve with bad input for feature size.
"""
with pytest.raises(YellowbrickValueError):
DroppingCurve(SVC(), param_name="gamma", feature_sizes=100)
DroppingCurve(SVC(), param_name="gamma", feature_sizes=100)

def test_get_params(self):
    """
    Ensure dropping curve get params works correctly
    """
    # sklearn-compatibility: introspection must work before fit
    curve = DroppingCurve(MultinomialNB())
    assert len(curve.get_params()) > 0
21 changes: 21 additions & 0 deletions tests/test_utils/test_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@
## Imports
##########################################################################

import pytest

from unittest import mock

from yellowbrick.base import Visualizer
from yellowbrick.utils.wrapper import *
from yellowbrick.exceptions import YellowbrickAttributeError, YellowbrickTypeError
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

Expand Down Expand Up @@ -133,3 +136,21 @@ def test_rewrap_object(self):
obj.predict()
old.predict.assert_called_once()
new.predict.assert_called_once()

def test_wrapper_recursion(self):
    """
    Ensure wrapper recursion isn't possible
    """
    # Point the wrapper at itself; attribute proxying must detect the
    # self-reference and raise rather than recurse forever.
    wrapper = Wrapper("")
    wrapper._wrapped = wrapper
    with pytest.raises(YellowbrickTypeError):
        wrapper.foo

def test_attribute_error(self):
    """
    Attribute errors should return a YellowbrickAttributeError
    """
    estimator = WrappedEstimator()
    # Neither the visualizer nor the mocked estimator defines this
    # attribute, so lookup must fail with the library's own exception.
    pat = r"neither visualizer 'WrappedEstimator' nor wrapped estimator 'MagicMock' have attribute 'notaproperty'"
    with pytest.raises(YellowbrickAttributeError, match=pat):
        estimator.notaproperty
56 changes: 24 additions & 32 deletions yellowbrick/cluster/elbow.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ class KElbowVisualizer(ClusteringScoreVisualizer):
- **calinski_harabasz**: ratio of within to between cluster dispersion

distance_metric : str or callable, default='euclidean'
The metric to use when calculating distance between instances in a
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options allowed
by sklearn's metrics.pairwise.pairwise_distances. If X is the distance array itself,
use metric="precomputed".
Expand Down Expand Up @@ -280,6 +280,7 @@ def __init__(
)

# Store the arguments
self.k = k
self.scoring_metric = KELBOW_SCOREMAP[metric]
self.metric = metric
self.timings = timings
Expand All @@ -293,50 +294,41 @@ def __init__(
CVLINE: LINE_COLOR,
}

# Convert K into a tuple argument if an integer
if isinstance(k, int):
self.k_values_ = list(range(2, k + 1))
def fit(self, X, y=None, **kwargs):
"""
Fits n KMeans models where n is the length of ``self.k_values_``,
storing the silhouette scores in the ``self.k_scores_`` attribute.
The "elbow" and silhouette score corresponding to it are stored in
``self.elbow_value`` and ``self.elbow_score`` respectively.
This method finishes up by calling draw to create the plot.
"""
# Convert K into a tuple argument if an integer
if isinstance(self.k, int):
self.k_values_ = list(range(2, self.k + 1))
elif (
isinstance(k, tuple)
and len(k) == 2
and all(isinstance(x, (int, np.integer)) for x in k)
isinstance(self.k, tuple)
and len(self.k) == 2
and all(isinstance(x, (int, np.integer)) for x in self.k)
):
self.k_values_ = list(range(*k))
elif isinstance(k, Iterable) and all(
isinstance(x, (int, np.integer)) for x in k
self.k_values_ = list(range(*self.k))
elif isinstance(self.k, Iterable) and all(
isinstance(x, (int, np.integer)) for x in self.k
):
self.k_values_ = list(k)
self.k_values_ = list(self.k)
else:
raise YellowbrickValueError(
(
"Specify an iterable of integers, a range, or maximal K value,"
" the value '{}' is not a valid argument for K.".format(k)
" the value '{}' is not a valid argument for K.".format(self.k)
)
)

# Holds the values of the silhoutte scores
self.k_scores_ = None

# Set Default Elbow Value
self.elbow_value_ = None

def fit(self, X, y=None, **kwargs):
"""
Fits n KMeans models where n is the length of ``self.k_values_``,
storing the silhouette scores in the ``self.k_scores_`` attribute.
The "elbow" and silhouette score corresponding to it are stored in
``self.elbow_value`` and ``self.elbow_score`` respectively.
This method finishes up by calling draw to create the plot.
"""

self.k_scores_ = []
self.k_timers_ = []
self.kneedle = None
self.knee_value = None

if self.locate_elbow:
self.elbow_value_ = None
self.elbow_score_ = None
self.elbow_value_ = None
self.elbow_score_ = None

for k in self.k_values_:
# Compute the start time for each model
Expand Down Expand Up @@ -527,7 +519,7 @@ def kelbow_visualizer(
- **calinski_harabasz**: ratio of within to between cluster dispersion

distance_metric : str or callable, default='euclidean'
The metric to use when calculating distance between instances in a
The metric to use when calculating distance between instances in a
feature array. If metric is a string, it must be one of the options allowed
by sklearn's metrics.pairwise.pairwise_distances. If X is the distance array itself,
use metric="precomputed".
Expand Down
2 changes: 1 addition & 1 deletion yellowbrick/model_selection/dropping_curve.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def fit(self, X, y=None):
# compute the mean and standard deviation of the training data
self.train_scores_mean_ = np.mean(self.train_scores_, axis=1)
self.train_scores_std_ = np.std(self.train_scores_, axis=1)

# compute the mean and standard deviation of the validation data
self.valid_scores_mean_ = np.mean(self.valid_scores_, axis=1)
self.valid_scores_std_ = np.std(self.valid_scores_, axis=1)
Expand Down
10 changes: 9 additions & 1 deletion yellowbrick/utils/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
## Wrapper Class
##########################################################################

from yellowbrick.exceptions import YellowbrickAttributeError, YellowbrickTypeError


class Wrapper(object):
"""
Expand All @@ -38,5 +40,11 @@ def __init__(self, obj):
self._wrapped = obj

def __getattr__(self, attr):
if self is self._wrapped:
raise YellowbrickTypeError("wrapper cannot wrap itself or recursion will occur")

# proxy to the wrapped object
return getattr(self._wrapped, attr)
try:
return getattr(self._wrapped, attr)
except AttributeError as e:
raise YellowbrickAttributeError(f"neither visualizer '{self.__class__.__name__}' nor wrapped estimator '{type(self._wrapped).__name__}' have attribute '{attr}'") from e