From 9923f462312b6471c30e050b39786b438aee58ba Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 23 Aug 2024 09:14:25 -0400 Subject: [PATCH] allow monthly metrics reports on multiple subjects replace `DailyReport.DAILY_UNIQUE_FIELDS` with `UNIQUE_TOGETHER_FIELDS` on both `DailyReport` and `MonthlyReport`, so we can have (for example) monthly reports for each institution or each institutional user account --- osf/metrics/reports.py | 53 ++++++++++------- osf/metrics/utils.py | 16 +++--- osf_tests/metrics/test_daily_report.py | 19 ++++--- osf_tests/metrics/test_monthly_report.py | 72 ++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 37 deletions(-) create mode 100644 osf_tests/metrics/test_monthly_report.py diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index 609e79fc324..22758f91aea 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -1,3 +1,4 @@ +from collections import abc import datetime from django.dispatch import receiver @@ -20,10 +21,14 @@ class DailyReport(metrics.Metric): There's something we'd like to know about every so often, so let's regularly run a report and stash the results here. """ - DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day + UNIQUE_TOGETHER_FIELDS = ('report_date',) # override in subclasses for multiple reports per day report_date = metrics.Date(format='strict_date', required=True) + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + class Meta: abstract = True dynamic = metrics.MetaField('strict') @@ -58,6 +63,7 @@ def serialize(self, data): class MonthlyReport(metrics.Metric): """MonthlyReport (abstract base for report-based metrics that run monthly) """ + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth',) # override in subclasses for multiple reports per month report_yearmonth = YearmonthField() @@ -66,26 +72,31 @@ class Meta: dynamic = metrics.MetaField('strict') source = metrics.MetaField(enabled=True) + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + @receiver(metrics_pre_save) def set_report_id(sender, instance, **kwargs): - # Set the document id to a hash of "unique together" - # values (just `report_date` by default) to get - # "ON CONFLICT UPDATE" behavior -- if the document - # already exists, it will be updated rather than duplicated. - # Cannot detect/avoid conflicts this way, but that's ok. - - if issubclass(sender, DailyReport): - duf_name = instance.DAILY_UNIQUE_FIELD - if duf_name is None: - instance.meta.id = stable_key(instance.report_date) - else: - duf_value = getattr(instance, duf_name) - if not duf_value or not isinstance(duf_value, str): - raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})') - instance.meta.id = stable_key(instance.report_date, duf_value) - elif issubclass(sender, MonthlyReport): - instance.meta.id = stable_key(instance.report_yearmonth) + try: + _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS + except AttributeError: + pass + else: + # Set the document id to a hash of "unique together" fields + # for "ON CONFLICT UPDATE" behavior -- if the document + # already exists, it will be updated rather than duplicated. + # Cannot detect/avoid conflicts this way, but that's ok. + _key_values = [] + for _field_name in _unique_together_fields: + _field_value = getattr(instance, _field_name) + if not _field_value or ( + isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str) + ): + raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})') + _key_values.append(_field_value) + instance.meta.id = stable_key(*_key_values) #### BEGIN reusable inner objects ##### @@ -157,7 +168,7 @@ class DownloadCountReport(DailyReport): class InstitutionSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'institution_id' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',) institution_id = metrics.Keyword() institution_name = metrics.Keyword() @@ -169,7 +180,7 @@ class InstitutionSummaryReport(DailyReport): class NewUserDomainReport(DailyReport): - DAILY_UNIQUE_FIELD = 'domain_name' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',) domain_name = metrics.Keyword() new_user_count = metrics.Integer() @@ -187,7 +198,7 @@ class OsfstorageFileCountReport(DailyReport): class PreprintSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'provider_key' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',) provider_key = metrics.Keyword() preprint_count = metrics.Integer() diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 5ea397fef39..0d402ad56fb 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -1,9 +1,8 @@ +import dataclasses import re import datetime -import typing from hashlib import sha256 - -import pytz +from typing import ClassVar def stable_key(*key_parts): @@ -20,11 +19,12 @@ def stable_key(*key_parts): return sha256(bytes(plain_key, encoding='utf')).hexdigest() -class YearMonth(typing.NamedTuple): +@dataclasses.dataclass(frozen=True) +class YearMonth: year: int month: int - YEARMONTH_RE = re.compile(r'(?P\d{4})-(?P\d{2})') + YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P\d{4})-(?P\d{2})') @classmethod def from_date(cls, date): @@ -46,9 +46,9 @@ def __str__(self): return f'{self.year}-{self.month:0>2}' def target_month(self): - return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc) + return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC) def next_month(self): if self.month == 12: - return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc) - return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc) + return datetime.datetime(self.year + 1, 1, 1, tzinfo=datetime.UTC) + return datetime.datetime(self.year, self.month + 1, 1, tzinfo=datetime.UTC) diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 2089e7279c9..3840f5dba21 100644 --- a/osf_tests/metrics/test_daily_report.py +++ b/osf_tests/metrics/test_daily_report.py @@ -37,11 +37,11 @@ class Meta: assert report.meta.id == expected_key mock_save.reset_mock() - def test_with_duf(self, mock_save): + def test_with_unique_together(self, mock_save): # multiple reports of this type per day, unique by given field class UniqueByDateAndField(DailyReport): - DAILY_UNIQUE_FIELD = 'duf' - duf = metrics.Keyword() + UNIQUE_TOGETHER_FIELDS = ('report_date', 'uniquefield',) + uniquefield = metrics.Keyword() class Meta: app_label = 'osf' @@ -49,7 +49,7 @@ class Meta: today = date(2022, 5, 18) expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151' - blah_report = UniqueByDateAndField(report_date=today, duf='blah') + blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah') blah_report.save() assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is blah_report @@ -57,13 +57,16 @@ class Meta: mock_save.reset_mock() expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99' - fleh_report = UniqueByDateAndField(report_date=today, duf='fleh') + fleh_report = UniqueByDateAndField(report_date=today, uniquefield='fleh') fleh_report.save() assert mock_save.call_count == 1 assert mock_save.call_args[0][0] is fleh_report assert fleh_report.meta.id == expected_fleh mock_save.reset_mock() - bad_report = UniqueByDateAndField(report_date=today) - with pytest.raises(ReportInvalid): - bad_report.save() + for _bad_report in ( + UniqueByDateAndField(report_date=today), + UniqueByDateAndField(report_date=today, uniquefield=['list', 'of', 'things']), + ): + with pytest.raises(ReportInvalid): + _bad_report.save() diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py new file mode 100644 index 00000000000..bc8d482a605 --- /dev/null +++ b/osf_tests/metrics/test_monthly_report.py @@ -0,0 +1,72 @@ +from unittest import mock + +import pytest +from elasticsearch_metrics import metrics + +from osf.metrics.reports import MonthlyReport, ReportInvalid +from osf.metrics.utils import YearMonth + + +class TestMonthlyReportKey: + @pytest.fixture + def mock_save(self): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save + + def test_default(self, mock_save): + # only one of this type of report per month + class UniqueByMonth(MonthlyReport): + blah = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + + reports = [ + UniqueByMonth(report_yearmonth=yearmonth), + UniqueByMonth(report_yearmonth=yearmonth, blah='blah'), + UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'), + ] + expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2' + + for report in reports: + report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is report + assert report.meta.id == expected_key + mock_save.reset_mock() + + def test_with_unique_together(self, mock_save): + # multiple reports of this type per day, unique by given field + class UniqueByMonthAndField(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',) + uniquefield = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + + expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08' + blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah') + blah_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is blah_report + assert blah_report.meta.id == expected_blah + mock_save.reset_mock() + + expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f' + fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh') + fleh_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is fleh_report + assert fleh_report.meta.id == expected_fleh + mock_save.reset_mock() + + for _bad_report in ( + UniqueByMonthAndField(report_yearmonth=yearmonth), + UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']), + ): + with pytest.raises(ReportInvalid): + _bad_report.save()