Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add anomaly detection support to TensorboardLogger #854

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions tests/utils/loggers/test_anomaly_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import math
import unittest
from unittest.mock import call, MagicMock, patch

import torch

from torchtnt.utils.anomaly_evaluation import (
IsNaNEvaluator,
MetricAnomalyEvaluator,
ThresholdEvaluator,
)

from torchtnt.utils.loggers.anomaly_logger import AnomalyLogger, TrackedMetric


class DummyEvaluator(MetricAnomalyEvaluator):
    """Test double whose anomaly check always fires, regardless of the value."""

    def _evaluate_anomaly(self, value: float) -> bool:
        # Unconditionally report an anomaly so tests can exercise anomaly-handling paths.
        return True


class TestAnomalyLogger(unittest.TestCase):
    """Unit tests for AnomalyLogger: config dedup, per-step evaluation gating,
    and the on_anomaly_detected callback contract."""

    def test_init(self) -> None:
        """Duplicate configs for the same metric name are dropped with a warning;
        only uniquely-named metrics remain tracked."""
        tracked_metrics = [
            TrackedMetric(
                name="accuracy",
                anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
            ),
            # Second config for the same name — expected to cause BOTH "accuracy"
            # entries to be skipped (see assertion below).
            TrackedMetric(
                name="accuracy",
                anomaly_evaluators=[IsNaNEvaluator()],
            ),
            TrackedMetric(name="loss", anomaly_evaluators=[IsNaNEvaluator()]),
        ]

        # Capture the module logger's warnings instead of emitting them.
        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger = AnomalyLogger(
                tracked_metrics=tracked_metrics,
            )

        self.assertEqual(
            warning_container,
            ["Found multiple configs for metric 'accuracy'. Skipping."],
        )
        # Only "loss" survives; the duplicated "accuracy" configs are discarded.
        self.assertEqual(set(logger._tracked_metrics.keys()), {"loss"})

    @patch(
        "torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
    )
    def test_log(self, mock_on_anomaly_detected: MagicMock) -> None:
        """log() honors warmup_steps and evaluate_every_n_steps, warns on
        unresolvable values, and invokes on_anomaly_detected for anomalies."""
        logger = AnomalyLogger(
            tracked_metrics=[
                TrackedMetric(
                    name="accuracy",
                    anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
                    warmup_steps=4,
                    evaluate_every_n_steps=2,
                )
            ]
        )

        # Log value that can't be resolved to a single numerical.
        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log(step=1, name="accuracy", data=torch.Tensor([0.5, 0.9]))

        self.assertEqual(
            warning_container,
            [
                "Error when extracting a single numerical value from the provided metric: Scalar tensor must contain a single item, 2 given."
            ],
        )
        # An unresolvable value is itself surfaced through the anomaly callback.
        mock_on_anomaly_detected.assert_called_once()

        # Log anomalous value during warmup: no-op
        mock_on_anomaly_detected.reset_mock()
        logger.log(step=4, name="accuracy", data=0.2)
        mock_on_anomaly_detected.assert_not_called()

        # Log anomalous value on non-evaluate step: no-op
        # (step 5 is not a multiple of evaluate_every_n_steps=2)
        logger.log(step=5, name="accuracy", data=0.1)
        mock_on_anomaly_detected.assert_not_called()

        # Log metric that is not tracked: no-op
        mock_on_anomaly_detected.reset_mock()
        logger.log(step=6, name="loss", data=math.nan)
        mock_on_anomaly_detected.assert_not_called()

        # Log metric within threshold: no-op
        logger.log(step=6, name="accuracy", data=0.6)
        mock_on_anomaly_detected.assert_not_called()

        # Log metric outside threshold: warning + callback with (name, value, step).
        warning_container = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log(step=8, name="accuracy", data=0.95)

        self.assertEqual(
            warning_container,
            [
                "Found anomaly in metric: accuracy, with value: 0.95, using evaluator: ThresholdEvaluator"
            ],
        )
        mock_on_anomaly_detected.assert_called_with("accuracy", 0.95, 8)

    @patch(
        "torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
    )
    def test_log_dict(self, mock_on_anomaly_detected: MagicMock) -> None:
        """log_dict() evaluates every tracked key in the payload and silently
        ignores untracked keys (e.g. "precision")."""
        logger = AnomalyLogger(
            tracked_metrics=[
                TrackedMetric(
                    name="accuracy",
                    anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
                ),
                TrackedMetric(
                    name="loss",
                    anomaly_evaluators=[IsNaNEvaluator()],
                ),
                TrackedMetric(
                    name="f1_score",
                    anomaly_evaluators=[
                        IsNaNEvaluator(),
                        ThresholdEvaluator(min_val=0.2),
                    ],
                ),
            ]
        )

        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log_dict(
                step=1,
                payload={
                    "loss": math.nan,
                    "accuracy": 0.63,
                    "precision": 0.7,
                    "f1_score": 0.05,
                },
            )

        # Compared as a set: payload iteration order (and hence warning order)
        # is not part of the contract.
        self.assertEqual(
            set(warning_container),
            {
                "Found anomaly in metric: f1_score, with value: 0.05, using evaluator: ThresholdEvaluator",
                "Found anomaly in metric: loss, with value: nan, using evaluator: IsNaNEvaluator",
            },
        )

        expected_anomaly_callback_calls = [
            call("f1_score", 0.05, 1),
            call("loss", math.nan, 1),
        ]
        mock_on_anomaly_detected.assert_has_calls(
            expected_anomaly_callback_calls, any_order=True
        )

    @patch(
        "torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
        side_effect=Exception("test exception"),
    )
    def test_on_anomaly_callback_exception(self, _) -> None:
        """An exception raised inside on_anomaly_detected is caught and logged
        as a warning rather than propagated to the caller of log()."""
        logger = AnomalyLogger(
            tracked_metrics=[
                TrackedMetric(
                    name="accuracy",
                    anomaly_evaluators=[ThresholdEvaluator(min_val=0.5, max_val=0.9)],
                ),
            ]
        )

        warning_container: list[str] = []
        with patch(
            "torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
            side_effect=warning_container.append,
        ):
            logger.log(step=1, name="accuracy", data=0.95)

        self.assertEqual(
            warning_container,
            [
                "Found anomaly in metric: accuracy, with value: 0.95, using evaluator: ThresholdEvaluator",
                "Exception when calling on_anomaly_hook: test exception",
            ],
        )
45 changes: 39 additions & 6 deletions tests/utils/loggers/test_tensorboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,53 @@

import tempfile
import unittest
from unittest.mock import Mock, patch
from unittest.mock import MagicMock, Mock, patch

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from torchtnt.utils.anomaly_evaluation import ThresholdEvaluator
from torchtnt.utils.loggers.anomaly_logger import TrackedMetric

from torchtnt.utils.loggers.tensorboard import TensorBoardLogger


class TensorBoardLoggerTest(unittest.TestCase):
def test_log(self: TensorBoardLoggerTest) -> None:

@patch(
"torchtnt.utils.loggers.anomaly_logger.AnomalyLogger.on_anomaly_detected",
)
def test_log(
self: TensorBoardLoggerTest, mock_on_anomaly_detected: MagicMock
) -> None:
with tempfile.TemporaryDirectory() as log_dir:
logger = TensorBoardLogger(path=log_dir)
for i in range(5):
logger.log("test_log", float(i) ** 2, i)
logger.close()
logger = TensorBoardLogger(
path=log_dir,
tracked_metrics=[
TrackedMetric(
name="test_log",
anomaly_evaluators=[
ThresholdEvaluator(min_val=25),
],
evaluate_every_n_steps=2,
warmup_steps=2,
)
],
)
warning_container = []
with patch(
"torchtnt.utils.loggers.anomaly_logger.logging.Logger.warning",
side_effect=warning_container.append,
):
for i in range(5):
logger.log("test_log", float(i) ** 2, i)
logger.close()

self.assertEqual(
warning_container,
[
"Found anomaly in metric: test_log, with value: 16.0, using evaluator: ThresholdEvaluator"
],
)
mock_on_anomaly_detected.assert_called_with("test_log", 16.0, 4)

acc = EventAccumulator(log_dir)
acc.Reload()
Expand Down
47 changes: 47 additions & 0 deletions tests/utils/test_anomaly_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import math
import unittest

from torchtnt.utils.anomaly_evaluation import IsNaNEvaluator, ThresholdEvaluator


class TestAnomalyLogger(unittest.TestCase):
    """Unit tests for the stateful metric anomaly evaluators.

    NOTE(review): the class name looks copy-pasted from test_anomaly_logger.py;
    `TestAnomalyEvaluation` would fit better — left unchanged to keep the
    public identifier stable.
    """

    def test_threshold(self) -> None:
        """ThresholdEvaluator flags values outside [min_val, max_val]."""
        evaluator = ThresholdEvaluator(min_val=0.5, max_val=0.9)
        # Before any update, nothing should be flagged.
        self.assertFalse(evaluator.is_anomaly())

        # Each update replaces the tracked value; check both sides of the band.
        for value, expect_anomaly in ((0.4, True), (0.6, False), (0.95, True)):
            evaluator.update(value)
            self.assertEqual(evaluator.is_anomaly(), expect_anomaly)

        # With only max_val set, there is effectively no lower bound.
        upper_only = ThresholdEvaluator(max_val=1)

        upper_only.update(100.0)
        self.assertTrue(upper_only.is_anomaly())

        upper_only.update(-500.0)
        self.assertFalse(upper_only.is_anomaly())

    def test_isnan(self) -> None:
        """IsNaNEvaluator flags a value only when it is NaN."""
        evaluator = IsNaNEvaluator()
        # Before any update, nothing should be flagged.
        self.assertFalse(evaluator.is_anomaly())

        evaluator.update(0.4)
        self.assertFalse(evaluator.is_anomaly())

        evaluator.update(math.nan)
        self.assertTrue(evaluator.is_anomaly())
3 changes: 3 additions & 0 deletions torchtnt/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

# pyre-strict

from .anomaly_evaluation import IsNaNEvaluator, ThresholdEvaluator
from .checkpoint import (
BestCheckpointConfig,
CheckpointManager,
Expand Down Expand Up @@ -88,6 +89,8 @@
)

__all__ = [
"IsNaNEvaluator",
"ThresholdEvaluator",
"CheckpointPath",
"MetricData",
"get_best_checkpoint_path",
Expand Down
Loading
Loading