Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add anomaly detection support to TensorboardLogger #854

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions tests/utils/loggers/test_anomaly_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import math
import unittest
from unittest.mock import call, MagicMock, patch

import torch

from torchtnt.utils.anomaly_evaluation import (
IsNaNEvaluator,
MetricAnomalyEvaluator,
ThresholdEvaluator,
)

from torchtnt.utils.loggers.anomaly_logger import AnomalyLogger, TrackedMetric


class DummyEvaluator(MetricAnomalyEvaluator):
    """Test double whose anomaly check always fires, regardless of the value."""

    def _evaluate_anomaly(self, value: float) -> bool:
        # Unconditionally report an anomaly so tests can exercise anomaly-handling paths.
        return True


class TestAnomalyLogger(unittest.TestCase):
    """Unit tests for AnomalyLogger: config dedup, per-step evaluation gating,
    and the on_anomaly_detected callback contract."""

    def test_init(self) -> None:
        """Duplicate configs for the same metric name are dropped with a warning;
        only uniquely-named metrics remain tracked."""
        tracked_metrics = [
            TrackedMetric(
                name="accuracy",
                anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
            ),
            # Second config for the same name — expected to cause BOTH "accuracy"
            # entries to be skipped (see assertion below).
            TrackedMetric(
                name="accuracy",
                anomaly_evaluators=[IsNaNEvaluator()],
            ),
            TrackedMetric(name="loss", anomaly_evaluators=[IsNaNEvaluator()]),
        ]

        # Capture the module logger's warnings instead of emitting them.
        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger = AnomalyLogger(
                tracked_metrics=tracked_metrics,
            )

        self.assertEqual(
            warning_container,
            ["Found multiple configs for metric 'accuracy'. Skipping."],
        )
        # Only "loss" survives; the duplicated "accuracy" configs are discarded.
        self.assertEqual(set(logger._tracked_metrics.keys()), {"loss"})

    @patch(
        "torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
    )
    def test_log(self, mock_on_anomaly_detected: MagicMock) -> None:
        """log() honors warmup_steps and evaluate_every_n_steps, warns on
        unresolvable values, and invokes on_anomaly_detected for anomalies."""
        logger = AnomalyLogger(
            tracked_metrics=[
                TrackedMetric(
                    name="accuracy",
                    anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
                    warmup_steps=4,
                    evaluate_every_n_steps=2,
                )
            ]
        )

        # Log value that can't be resolved to a single numerical.
        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log(step=1, name="accuracy", data=torch.Tensor([0.5, 0.9]))

        self.assertEqual(
            warning_container,
            [
                "Error when extracting a single numerical value from the provided metric: Scalar tensor must contain a single item, 2 given."
            ],
        )
        # An unresolvable value is itself surfaced through the anomaly callback.
        mock_on_anomaly_detected.assert_called_once()

        # Log anomalous value during warmup: no-op
        mock_on_anomaly_detected.reset_mock()
        logger.log(step=4, name="accuracy", data=0.2)
        mock_on_anomaly_detected.assert_not_called()

        # Log anomalous value on non-evaluate step: no-op
        # (step 5 is not a multiple of evaluate_every_n_steps=2)
        logger.log(step=5, name="accuracy", data=0.1)
        mock_on_anomaly_detected.assert_not_called()

        # Log metric that is not tracked: no-op
        mock_on_anomaly_detected.reset_mock()
        logger.log(step=6, name="loss", data=math.nan)
        mock_on_anomaly_detected.assert_not_called()

        # Log metric within threshold: no-op
        logger.log(step=6, name="accuracy", data=0.6)
        mock_on_anomaly_detected.assert_not_called()

        # Log metric outside threshold: warning + callback with (name, value, step).
        warning_container = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log(step=8, name="accuracy", data=0.95)

        self.assertEqual(
            warning_container,
            [
                "Found anomaly in metric: accuracy, with value: 0.95, using evaluator: ThresholdEvaluator"
            ],
        )
        mock_on_anomaly_detected.assert_called_with("accuracy", 0.95, 8)

    @patch(
        "torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
    )
    def test_log_dict(self, mock_on_anomaly_detected: MagicMock) -> None:
        """log_dict() evaluates every tracked key in the payload and silently
        ignores untracked keys (e.g. "precision")."""
        logger = AnomalyLogger(
            tracked_metrics=[
                TrackedMetric(
                    name="accuracy",
                    anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
                ),
                TrackedMetric(
                    name="loss",
                    anomaly_evaluators=[IsNaNEvaluator()],
                ),
                TrackedMetric(
                    name="f1_score",
                    anomaly_evaluators=[
                        IsNaNEvaluator(),
                        ThresholdEvaluator(min_val=0.2),
                    ],
                ),
            ]
        )

        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log_dict(
                step=1,
                payload={
                    "loss": math.nan,
                    "accuracy": 0.63,
                    "precision": 0.7,
                    "f1_score": 0.05,
                },
            )

        # Compared as a set: payload iteration order (and hence warning order)
        # is not part of the contract.
        self.assertEqual(
            set(warning_container),
            {
                "Found anomaly in metric: f1_score, with value: 0.05, using evaluator: ThresholdEvaluator",
                "Found anomaly in metric: loss, with value: nan, using evaluator: IsNaNEvaluator",
            },
        )

        expected_anomaly_callback_calls = [
            call("f1_score", 0.05, 1),
            call("loss", math.nan, 1),
        ]
        mock_on_anomaly_detected.assert_has_calls(
            expected_anomaly_callback_calls, any_order=True
        )

    @patch(
        "torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
        side_effect=Exception("test exception"),
    )
    def test_on_anomaly_callback_exception(self, _) -> None:
        """An exception raised inside on_anomaly_detected is caught and logged
        as a warning rather than propagated to the caller of log()."""
        logger = AnomalyLogger(
            tracked_metrics=[
                TrackedMetric(
                    name="accuracy",
                    anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
                ),
            ]
        )

        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log(step=1, name="accuracy", data=0.95)

        self.assertEqual(
            warning_container,
            [
                "Found anomaly in metric: accuracy, with value: 0.95, using evaluator: ThresholdEvaluator",
                "Exception when calling on_anomaly_hook: test exception",
            ],
        )
45 changes: 39 additions & 6 deletions tests/utils/loggers/test_tensorboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,53 @@

import tempfile
import unittest
from unittest.mock import Mock, patch
from unittest.mock import MagicMock, Mock, patch

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from torchtnt.utils.anomaly_evaluation import ThresholdEvaluator
from torchtnt.utils.loggers.anomaly_logger import TrackedMetric

from torchtnt.utils.loggers.tensorboard import TensorBoardLogger


class TensorBoardLoggerTest(unittest.TestCase):
def test_log(self: TensorBoardLoggerTest) -> None:

@patch(
"torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
)
def test_log(
self: TensorBoardLoggerTest, mock_on_anomaly_detected: MagicMock
) -> None:
with tempfile.TemporaryDirectory() as log_dir:
logger = TensorBoardLogger(path=log_dir)
for i in range(5):
logger.log("test_log", float(i) ** 2, i)
logger.close()
logger = TensorBoardLogger(
path=log_dir,
tracked_metrics=[
TrackedMetric(
name="test_log",
anomaly_evaluators=[
ThresholdEvaluator(min_val=25),
],
evaluate_every_n_steps=2,
warmup_steps=2,
)
],
)
warning_container = []
with patch(
"torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
side_effect=warning_container.append,
):
for i in range(5):
logger.log("test_log", float(i) ** 2, i)
logger.close()

self.assertEqual(
warning_container,
[
"Found anomaly in metric: test_log, with value: 16.0, using evaluator: ThresholdEvaluator"
],
)
mock_on_anomaly_detected.assert_called_with("test_log", 16.0, 4)

acc = EventAccumulator(log_dir)
acc.Reload()
Expand Down
47 changes: 47 additions & 0 deletions tests/utils/test_anomaly_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import math
import unittest

from torchtnt.utils.anomaly_evaluation import IsNaNEvaluator, ThresholdEvaluator


class TestAnomalyLogger(unittest.TestCase):
    """Unit tests for the stateful metric anomaly evaluators.

    NOTE(review): the class name looks copy-pasted from test_anomaly_logger.py;
    `TestAnomalyEvaluation` would fit better — left unchanged to keep the
    public identifier stable.
    """

    def test_threshold(self) -> None:
        """ThresholdEvaluator flags values outside [min_val, max_val]."""
        evaluator = ThresholdEvaluator(min_val=0.5, max_val=0.9)
        # Before any update, nothing should be flagged.
        self.assertFalse(evaluator.is_anomaly())

        # Each update replaces the tracked value; check both sides of the band.
        for value, expect_anomaly in ((0.4, True), (0.6, False), (0.95, True)):
            evaluator.update(value)
            self.assertEqual(evaluator.is_anomaly(), expect_anomaly)

        # With only max_val set, there is effectively no lower bound.
        upper_only = ThresholdEvaluator(max_val=1)

        upper_only.update(100.0)
        self.assertTrue(upper_only.is_anomaly())

        upper_only.update(-500.0)
        self.assertFalse(upper_only.is_anomaly())

    def test_isnan(self) -> None:
        """IsNaNEvaluator flags a value only when it is NaN."""
        evaluator = IsNaNEvaluator()
        # Before any update, nothing should be flagged.
        self.assertFalse(evaluator.is_anomaly())

        evaluator.update(0.4)
        self.assertFalse(evaluator.is_anomaly())

        evaluator.update(math.nan)
        self.assertTrue(evaluator.is_anomaly())
3 changes: 3 additions & 0 deletions torchtnt/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

# pyre-strict

from .anomaly_evaluation import IsNaNEvaluator, ThresholdEvaluator
from .checkpoint import (
BestCheckpointConfig,
CheckpointManager,
Expand Down Expand Up @@ -88,6 +89,8 @@
)

__all__ = [
"IsNaNEvaluator",
"ThresholdEvaluator",
"CheckpointPath",
"MetricData",
"get_best_checkpoint_path",
Expand Down
Loading
Loading