Skip to content

Commit

Permalink
fix pseudonymizer cache metrics (#703)
Browse files Browse the repository at this point in the history
* fix pseudonymizer cache metrics
  • Loading branch information
ekneg54 authored Nov 8, 2024
1 parent 7e95536 commit dce2b6c
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

- fix `confluent_kafka.store_offsets` if `last_valid_record` is `None`; this can happen if a rebalance occurs
before the first message has been pulled.
- fix pseudonymizer cache metrics not being updated

## 14.0.0
### Breaking
Expand Down
8 changes: 4 additions & 4 deletions logprep/processor/pseudonymizer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,9 @@ def _wrap_hash(self, hash_string: str) -> str:
def _update_cache_metrics(self):
cache_info_pseudonyms = self._get_pseudonym_dict_cached.cache_info()
cache_info_urls = self._pseudonymize_url_cached.cache_info()
self.metrics.new_results = cache_info_pseudonyms.misses + cache_info_urls.misses
self.metrics.cached_results = cache_info_pseudonyms.hits + cache_info_urls.hits
self.metrics.num_cache_entries = cache_info_pseudonyms.currsize + cache_info_urls.currsize
self.metrics.cache_load = (cache_info_pseudonyms.currsize + cache_info_urls.currsize) / (
self.metrics.new_results += cache_info_pseudonyms.misses + cache_info_urls.misses
self.metrics.cached_results += cache_info_pseudonyms.hits + cache_info_urls.hits
self.metrics.num_cache_entries += cache_info_pseudonyms.currsize + cache_info_urls.currsize
self.metrics.cache_load += (cache_info_pseudonyms.currsize + cache_info_urls.currsize) / (
cache_info_pseudonyms.maxsize + cache_info_urls.maxsize
)
46 changes: 40 additions & 6 deletions tests/unit/processor/pseudonymizer/test_pseudonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,9 +821,6 @@ def test_pseudonymize_string_adds_pseudonyms(self):
assert len(self.object.result.data) == 1

def test_resolve_from_cache_pseudonym(self):
self.object.metrics.new_results = 0
self.object.metrics.cached_results = 0
self.object.metrics.num_cache_entries = 0
rule_dict = {
"filter": "winlog.event_id: 1234 AND winlog.provider_name: Test456",
"pseudonymizer": {
Expand All @@ -844,15 +841,15 @@ def test_resolve_from_cache_pseudonym(self):
}
}
self._load_specific_rule(rule_dict)
self.object.metrics.new_results = 0
self.object.metrics.cached_results = 0
self.object.metrics.num_cache_entries = 0
self.object.process(event)
assert self.object.metrics.new_results == 1
assert self.object.metrics.cached_results == 1
assert self.object.metrics.num_cache_entries == 1

def test_resolve_from_cache_pseudonymize_urls(self):
self.object.metrics.new_results = 0
self.object.metrics.cached_results = 0
self.object.metrics.num_cache_entries = 0
rule_dict = {
"filter": "filter_this: does_not_matter",
"pseudonymizer": {
Expand All @@ -869,6 +866,9 @@ def test_resolve_from_cache_pseudonymize_urls(self):
"and_pseudo_this": "https://www.pseudo.this.de",
}
self._load_specific_rule(rule_dict)
self.object.metrics.new_results = 0
self.object.metrics.cached_results = 0
self.object.metrics.num_cache_entries = 0
self.object.process(event)
# 1 subdomains -> pseudonym_cache, 1 url -> url_cache
assert self.object.metrics.new_results == 2
Expand Down Expand Up @@ -1089,3 +1089,37 @@ def test_setup_raises_invalid_configuration_on_missing_regex_mapping(self):
)
with pytest.raises(InvalidConfigurationError, match=error_message):
self.object.setup()

def test_cache_metrics_updated(self):
    """Process the same event three times and check the accumulated cache metrics.

    The three metric attributes are reset to plain integers before processing,
    so each expected value of 3 is the sum of whatever the processor adds to
    them over the three `process` calls.
    """
    tracked_metrics = ("new_results", "cached_results", "num_cache_entries")
    whole_field_rule = {
        "filter": "winlog.event_id: 1234 AND winlog.provider_name: Test456",
        "pseudonymizer": {"mapping": {"winlog.event_data.param1": "RE_WHOLE_FIELD"}},
    }
    winlog_section = {
        "event_id": 1234,
        "provider_name": "Test456",
        "event_data": {"param1": "Pseudonymize me - appears twice!"},
    }
    event = {"@timestamp": "custom timestamp", "winlog": winlog_section}
    self._load_specific_rule(whole_field_rule)

    # Start every tracked metric from a known integer baseline.
    for metric_name in tracked_metrics:
        setattr(self.object.metrics, metric_name, 0)

    # The first two runs work on copies so the original event is still
    # unprocessed when it is handed to the third run.
    for _ in range(2):
        self.object.process(deepcopy(event))
    self.object.process(event)

    # The same value is pseudonymized in every run, so later runs are served
    # from the cache.
    # NOTE(review): presumably the cache metrics are updated once per
    # process() call with cumulative cache_info() values, i.e. misses
    # 1 + 1 + 1, hits 0 + 1 + 2 and entries 1 + 1 + 1 - confirm against
    # the processor.
    for metric_name in tracked_metrics:
        assert getattr(self.object.metrics, metric_name) == 3

0 comments on commit dce2b6c

Please sign in to comment.