Commit: Cleanup (#1303)
* old BalanceBatchSampler removed

* extra classification report added

* fix

* fix

* fix

* fix

* fix

* fix
Scitator committed Sep 27, 2021
1 parent 4c8e685 commit 185b892
Showing 17 changed files with 443 additions and 339 deletions.
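
For downstream users, the sampler rename is the main breaking change here. A minimal migration sketch (assuming `labels` and `dataset` are already defined; the argument mapping p to num_classes and k to num_samples follows the diffs below):

```python
from torch.utils.data import DataLoader

from catalyst import data

# old API, removed in this commit:
#   sampler = data.BalanceBatchSampler(labels=labels, p=5, k=10)
#   loader = DataLoader(dataset=dataset, sampler=sampler, batch_size=sampler.batch_size)

# new API: p -> num_classes, k -> num_samples; the sampler is now passed
# as batch_sampler, so DataLoader needs no explicit batch_size
sampler = data.BatchBalanceClassSampler(labels=labels, num_classes=5, num_samples=10)
loader = DataLoader(dataset=dataset, batch_sampler=sampler)
```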
36 changes: 13 additions & 23 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,19 +1,9 @@
## Before submitting (checklist)

- [ ] Was this discussed/approved via a Github issue? (not needed for typo fixes and docs improvements)
- [ ] Did you read the [contribution guide](https://github.com/catalyst-team/catalyst/blob/master/CONTRIBUTING.md)?
- [ ] Did you check the code style? `catalyst-make-codestyle -l 99 && catalyst-check-codestyle -l 99` (`pip install -U catalyst-codestyle`).
- [ ] Did you make sure to update the docs? We use Google format for all the methods and classes.
- [ ] Did you check the docs with `make check-docs`?
- [ ] Did you write any new necessary tests?
- [ ] Did you check that your code passes the unit tests with `pytest .`?
- [ ] Did you add your new functionality to the docs?
- [ ] Did you update the [CHANGELOG](https://github.com/catalyst-team/catalyst/blob/master/CHANGELOG.md)?
- [ ] Did you run [colab minimal CI/CD](https://colab.research.google.com/github/catalyst-team/catalyst/blob/master/examples/notebooks/colab_ci_cd.ipynb) with `latest` and `minimal` requirements?
- [ ] Did you check XLA integration with [single](https://colab.research.google.com/github/catalyst-team/catalyst/blob/master/examples/notebooks/Catalyst_XLA_single_process.ipynb) and [multiple](https://colab.research.google.com/github/catalyst-team/catalyst/blob/master/examples/notebooks/Catalyst_XLA_multi_process.ipynb) processes?

<!-- For CHANGELOG separate each item in unreleased section by blank line to reduce collisions -->

### Pull Request FAQ
- [documentation](https://catalyst-team.github.io/catalyst/)
- [contribution guide](https://github.com/catalyst-team/catalyst/blob/master/CONTRIBUTING.md)
- [minimal examples section](https://github.com/catalyst-team/catalyst#minimal-examples)
- [changelog](https://github.com/catalyst-team/catalyst/blob/master/CHANGELOG.md) for main framework updates
- [Catalyst slack (#__questions channel)](https://join.slack.com/t/catalyst-team-core/shared_invite/zt-d9miirnn-z86oKDzFMKlMG4fgFdZafw) for issue discussion

## Description

@@ -43,11 +33,11 @@ If we didn't discuss your PR in Github issues there's a high chance it will not

<!-- Thank you for your contribution! -->

### Checklist
- [ ] Have you updated tests for the new functionality?
- [ ] Have you added your new classes/functions to the docs?
- [ ] Have you updated the [CHANGELOG](https://github.com/catalyst-team/catalyst/blob/master/CHANGELOG.md)?
- [ ] Have you run [colab minimal CI/CD](https://colab.research.google.com/github/catalyst-team/catalyst/blob/master/examples/notebooks/colab_ci_cd.ipynb) with `latest` and `minimal` requirements?
- [ ] Have you checked XLA integration with [single](https://colab.research.google.com/github/catalyst-team/catalyst/blob/master/examples/notebooks/Catalyst_XLA_single_process.ipynb) and [multiple](https://colab.research.google.com/github/catalyst-team/catalyst/blob/master/examples/notebooks/Catalyst_XLA_multi_process.ipynb) processes?

### FAQ
Please review the FAQ before submitting an issue:
- [ ] I have read the [documentation and FAQ](https://catalyst-team.github.io/catalyst/)
- [ ] I have reviewed the [minimal examples section](https://github.com/catalyst-team/catalyst#minimal-examples)
- [ ] I have checked the [changelog](https://github.com/catalyst-team/catalyst/blob/master/CHANGELOG.md) for main framework updates
- [ ] I have read the [contribution guide](https://github.com/catalyst-team/catalyst/blob/master/CONTRIBUTING.md)
- [ ] I have joined [Catalyst slack (#__questions channel)](https://join.slack.com/t/catalyst-team-core/shared_invite/zt-d9miirnn-z86oKDzFMKlMG4fgFdZafw) for issue discussion
<!-- For CHANGELOG separate each item in unreleased section by blank line to reduce collisions -->
6 changes: 4 additions & 2 deletions README.md
@@ -812,8 +812,10 @@ from catalyst.data.transforms import Compose, Normalize, ToTensor
transforms = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])

train_dataset = datasets.MnistMLDataset(root=os.getcwd(), download=True, transform=transforms)
sampler = data.BalanceBatchSampler(labels=train_dataset.get_labels(), p=5, k=10)
train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=sampler.batch_size)
sampler = data.BatchBalanceClassSampler(
    labels=train_dataset.get_labels(), num_classes=5, num_samples=10, num_batches=10
)
train_loader = DataLoader(dataset=train_dataset, batch_sampler=sampler)

valid_dataset = datasets.MnistQGDataset(root=os.getcwd(), transform=transforms, gallery_fraq=0.2)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=1024)
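
As a quick sanity check, the new sampler is a batch sampler, so iterating it yields one list of dataset indices per batch; each list should contain num_classes * num_samples items (a sketch, assuming these semantics):

```python
# sketch: each batch holds num_classes * num_samples = 5 * 10 = 50 indices
for batch_indices in sampler:
    assert len(batch_indices) == 50
```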
8 changes: 4 additions & 4 deletions catalyst/callbacks/metrics/cmc_score.py
@@ -52,10 +52,10 @@ class CMCScoreCallback(LoaderMetricCallback):
train_dataset = datasets.MnistMLDataset(
    root=os.getcwd(), download=True, transform=transforms
)
sampler = data.BalanceBatchSampler(labels=train_dataset.get_labels(), p=5, k=10)
train_loader = DataLoader(
    dataset=train_dataset, sampler=sampler, batch_size=sampler.batch_size
)
sampler = data.BatchBalanceClassSampler(
    labels=train_dataset.get_labels(), num_classes=5, num_samples=10
)
train_loader = DataLoader(dataset=train_dataset, batch_sampler=sampler)
valid_dataset = datasets.MnistQGDataset(
    root=os.getcwd(), transform=transforms, gallery_fraq=0.2
32 changes: 14 additions & 18 deletions catalyst/callbacks/mixup.py
@@ -10,6 +10,18 @@ class MixupCallback(Callback):
Callback to do mixup augmentation. More details about mixup can be found in the paper
`mixup: Beyond Empirical Risk Minimization`: https://arxiv.org/abs/1710.09412 .

Args:
    keys: batch keys to which you want to apply augmentation
    alpha: beta distribution a=b parameters. Must be >= 0. The closer alpha is to zero,
        the weaker the mixup effect.
    mode: determines how the mixed samples are used. Must be one of ["replace", "add"].
        If "replace", the batch is replaced with its mixed version, so the batch size
        stays the same. If "add", the mixed examples are concatenated to the current
        ones, doubling the batch size.
    on_train_only: apply mixup during training only. Since mixup uses proxy inputs,
        the targets become proxies as well; if ``on_train_only`` is ``True``, the
        standard output/metric is used for validation.
Examples:
.. code-block:: python
@@ -107,24 +119,8 @@ def handle_batch(self, batch):
use ControlFlowCallback in order to evaluate the model (see the example)
"""

def __init__(
    self, keys: Union[str, List[str]], alpha=0.2, mode="replace", on_train_only=True, **kwargs
):
"""
Args:
keys: batch keys to which you want to apply augmentation
alpha: beta distribution a=b parameters. Must be >=0. The more alpha closer to zero the
less effect of the mixup.
mode: mode determines the method of use. Must be in ["replace", "add"]. If "replace"
then replaces the batch with a mixed one, while the batch size is not changed
If "add", concatenates mixed examples to the current ones, the batch size increases
by 2 times.
on_train_only: apply to train only. As the mixup use the proxy inputs, the targets are
also proxy. We are not interested in them, are we? So, if ``on_train_only``
is ``True`` use a standard output/metric for validation.
**kwargs:
"""
def __init__(self, keys: Union[str, List[str]], alpha=0.2, mode="replace", on_train_only=True):
    """Init."""
    assert isinstance(keys, (str, list, tuple)), (
        f"keys must be str or list[str], got: {type(keys)}"
    )
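
For reference, a minimal sketch of the mixup transform described in the docstring above (toy tensors, not the callback's actual implementation):

```python
import numpy as np
import torch

alpha = 0.2
x = torch.randn(8, 3, 32, 32)  # toy input batch

lam = np.random.beta(alpha, alpha)       # mixing coefficient from Beta(alpha, alpha)
perm = torch.randperm(x.size(0))         # random pairing within the batch
x_mixed = lam * x + (1 - lam) * x[perm]  # convex combination of paired inputs

# mode="replace": use x_mixed in place of x (batch size unchanged);
# mode="add": torch.cat([x, x_mixed]) doubles the batch size
```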
18 changes: 8 additions & 10 deletions catalyst/callbacks/sklearn_model.py
@@ -48,11 +48,10 @@ class SklearnModelCallback(Callback):
    download=True,
    transform=transforms
)
sampler = data.BalanceBatchSampler(labels=train_dataset.get_labels(), p=5, k=10)
train_loader = DataLoader(
    dataset=train_dataset,
    sampler=sampler,
    batch_size=sampler.batch_size)
sampler = data.BatchBalanceClassSampler(
    labels=train_dataset.get_labels(), num_classes=5, num_samples=10
)
train_loader = DataLoader(dataset=train_dataset, batch_sampler=sampler)
valid_dataset = datasets.MNIST(root=os.getcwd(), transform=transforms, train=False)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=1024)
@@ -137,11 +136,10 @@ def handle_batch(self, batch) -> None:
    download=True,
    transform=transforms
)
sampler = data.BalanceBatchSampler(labels=train_dataset.get_labels(), p=5, k=10)
train_loader = DataLoader(
    dataset=train_dataset,
    sampler=sampler,
    batch_size=sampler.batch_size)
sampler = data.BatchBalanceClassSampler(
    labels=train_dataset.get_labels(), num_classes=5, num_samples=10
)
train_loader = DataLoader(dataset=train_dataset, batch_sampler=sampler)
valid_dataset = datasets.MNIST(root=os.getcwd(), transform=transforms, train=False)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=1024)
2 changes: 2 additions & 0 deletions catalyst/contrib/utils/__init__.py
@@ -43,6 +43,8 @@
    get_pool,
)

if SETTINGS.ml_required:
    from catalyst.contrib.utils.report import get_classification_report

from catalyst.contrib.utils.serialization import deserialize, serialize

125 changes: 125 additions & 0 deletions catalyst/contrib/utils/report.py
@@ -0,0 +1,125 @@
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score


def get_classification_report(
    y_true: np.ndarray, y_pred: np.ndarray, y_scores: np.ndarray = None, beta: float = None
) -> pd.DataFrame:
    """Generates pandas-based per-class and aggregated classification metrics.

    Args:
        y_true (np.ndarray): ground truth labels
        y_pred (np.ndarray): predicted model labels
        y_scores (np.ndarray, optional): predicted model scores. Defaults to None.
        beta (float, optional): beta parameter for custom Fbeta score computation.
            Defaults to None.

    Returns:
        pd.DataFrame: pandas dataframe with main classification metrics.

    Examples:

    .. code-block:: python

        from sklearn import datasets, linear_model
        from sklearn.model_selection import train_test_split

        from catalyst import utils

        digits = datasets.load_digits()

        # flatten the images
        n_samples = len(digits.images)
        data = digits.images.reshape((n_samples, -1))

        # create a classifier
        clf = linear_model.LogisticRegression(multi_class="ovr")

        # split data into 50% train and 50% test subsets
        X_train, X_test, y_train, y_test = train_test_split(
            data, digits.target, test_size=0.5, shuffle=False
        )

        # learn the digits on the train subset
        clf.fit(X_train, y_train)

        # predict the value of the digit on the test subset
        y_scores = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)

        utils.get_classification_report(
            y_true=y_test,
            y_pred=y_pred,
            y_scores=y_scores,
            beta=0.5,
        )
    """
    metrics = defaultdict(lambda: {})
    metrics_names = [
        "precision",
        "recall",
        "f1-score",
        "auc",
        "support",
        "support (%)",
    ]
    avg_names = ["macro", "micro", "weighted"]
    labels = sorted(set(y_true).union(y_pred))

    # per-class one-vs-rest AUC, if scores are provided
    auc = np.zeros(len(labels))
    if y_scores is not None:
        for i, label in enumerate(labels):
            auc[i] = roc_auc_score((y_true == label).astype(int), y_scores[:, i])

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true=y_true, y_pred=y_pred, average=None, labels=labels
    )

    r_support = support / support.sum()
    for average in avg_names:
        avg_precision, avg_recall, avg_f1, _ = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, average=average, labels=labels
        )
        # only precision, recall and f1-score have averaged variants here
        avg_metrics = avg_precision, avg_recall, avg_f1
        for k, v in zip(metrics_names[:3], avg_metrics):
            metrics[k][average] = v

    report = pd.DataFrame(
        [precision, recall, f1, auc, support, r_support], columns=labels, index=metrics_names
    ).T

    if beta is not None:
        _, _, fbeta, _ = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, average=None, beta=beta, labels=labels
        )
        avg_fbeta = np.zeros(len(avg_names))
        for i, average in enumerate(avg_names):
            _, _, avg_beta, _ = precision_recall_fscore_support(
                y_true=y_true, y_pred=y_pred, average=average, beta=beta, labels=labels
            )
            avg_fbeta[i] = avg_beta
        report.insert(3, "f-beta", fbeta, True)

    metrics["support"]["macro"] = support.sum()
    metrics["precision"]["accuracy"] = accuracy
    if y_scores is not None:
        metrics["auc"]["macro"] = roc_auc_score(
            y_true, y_scores, multi_class="ovr", average="macro"
        )
        metrics["auc"]["weighted"] = roc_auc_score(
            y_true, y_scores, multi_class="ovr", average="weighted"
        )
    metrics = pd.DataFrame(metrics, index=avg_names + ["accuracy"])

    result = pd.concat((report, metrics)).fillna("")

    if beta is not None:
        # use .loc to avoid pandas chained-assignment issues
        result.loc["macro", "f-beta"] = avg_fbeta[0]
        result.loc["micro", "f-beta"] = avg_fbeta[1]
        result.loc["weighted", "f-beta"] = avg_fbeta[2]
    return result


__all__ = ["get_classification_report"]
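
A quick smoke test for the new helper (a sketch with made-up labels; `y_scores` and `beta` omitted for brevity):

```python
import numpy as np

from catalyst.contrib.utils.report import get_classification_report

rng = np.random.default_rng(42)
y_true = rng.integers(0, 3, size=200)  # made-up 3-class ground truth
y_pred = rng.integers(0, 3, size=200)  # made-up predictions

report = get_classification_report(y_true=y_true, y_pred=y_pred)
print(report)  # per-class rows plus macro/micro/weighted/accuracy rows
```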
2 changes: 1 addition & 1 deletion catalyst/data/__init__.py
@@ -16,8 +16,8 @@
    BatchPrefetchLoaderWrapper,
)
from catalyst.data.sampler import (
    BalanceClassSampler,
    BalanceBatchSampler,
    BalanceClassSampler,
    BatchBalanceClassSampler,
    DistributedSamplerWrapper,
    DynamicLenBatchSampler,
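
After this change the import list keeps `BalanceClassSampler` and replaces `BalanceBatchSampler` with `BatchBalanceClassSampler`; the two remaining samplers differ in DataLoader wiring (a sketch, assuming `labels` and `dataset` are defined and the semantics shown in the examples above):

```python
from torch.utils.data import DataLoader

from catalyst import data

# BalanceClassSampler yields one index at a time -> pass it via `sampler`
loader_a = DataLoader(
    dataset, sampler=data.BalanceClassSampler(labels), batch_size=32
)

# BatchBalanceClassSampler yields whole batches of indices -> pass it via `batch_sampler`
loader_b = DataLoader(
    dataset,
    batch_sampler=data.BatchBalanceClassSampler(labels, num_classes=5, num_samples=10),
)
```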