Skip to content

Commit

Permalink
allow monthly metrics reports on multiple subjects
Browse files Browse the repository at this point in the history
replace `DailyReport.DAILY_UNIQUE_FIELDS` with `UNIQUE_TOGETHER_FIELDS`
on both `DailyReport` and `MonthlyReport`, so we can have (for example)
monthly reports for each institution or each institutional user account
  • Loading branch information
aaxelb committed Aug 23, 2024
1 parent 5bf4ee4 commit 9923f46
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 37 deletions.
53 changes: 32 additions & 21 deletions osf/metrics/reports.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import abc
import datetime

from django.dispatch import receiver
Expand All @@ -20,10 +21,14 @@ class DailyReport(metrics.Metric):
There's something we'd like to know about every so often,
so let's regularly run a report and stash the results here.
"""
DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day
UNIQUE_TOGETHER_FIELDS = ('report_date',) # override in subclasses for multiple reports per day

report_date = metrics.Date(format='strict_date', required=True)

def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})'

class Meta:
abstract = True
dynamic = metrics.MetaField('strict')
Expand Down Expand Up @@ -58,6 +63,7 @@ def serialize(self, data):
class MonthlyReport(metrics.Metric):
"""MonthlyReport (abstract base for report-based metrics that run monthly)
"""
UNIQUE_TOGETHER_FIELDS = ('report_yearmonth',) # override in subclasses for multiple reports per month

report_yearmonth = YearmonthField()

Expand All @@ -66,26 +72,31 @@ class Meta:
dynamic = metrics.MetaField('strict')
source = metrics.MetaField(enabled=True)

def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})'


@receiver(metrics_pre_save)
def set_report_id(sender, instance, **kwargs):
# Set the document id to a hash of "unique together"
# values (just `report_date` by default) to get
# "ON CONFLICT UPDATE" behavior -- if the document
# already exists, it will be updated rather than duplicated.
# Cannot detect/avoid conflicts this way, but that's ok.

if issubclass(sender, DailyReport):
duf_name = instance.DAILY_UNIQUE_FIELD
if duf_name is None:
instance.meta.id = stable_key(instance.report_date)
else:
duf_value = getattr(instance, duf_name)
if not duf_value or not isinstance(duf_value, str):
raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})')
instance.meta.id = stable_key(instance.report_date, duf_value)
elif issubclass(sender, MonthlyReport):
instance.meta.id = stable_key(instance.report_yearmonth)
try:
_unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS
except AttributeError:
pass
else:
# Set the document id to a hash of "unique together" fields
# for "ON CONFLICT UPDATE" behavior -- if the document
# already exists, it will be updated rather than duplicated.
# Cannot detect/avoid conflicts this way, but that's ok.
_key_values = []
for _field_name in _unique_together_fields:
_field_value = getattr(instance, _field_name)
if not _field_value or (
isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str)
):
raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})')
_key_values.append(_field_value)
instance.meta.id = stable_key(*_key_values)


#### BEGIN reusable inner objects #####
Expand Down Expand Up @@ -157,7 +168,7 @@ class DownloadCountReport(DailyReport):


class InstitutionSummaryReport(DailyReport):
DAILY_UNIQUE_FIELD = 'institution_id'
UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',)

institution_id = metrics.Keyword()
institution_name = metrics.Keyword()
Expand All @@ -169,7 +180,7 @@ class InstitutionSummaryReport(DailyReport):


class NewUserDomainReport(DailyReport):
DAILY_UNIQUE_FIELD = 'domain_name'
UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',)

domain_name = metrics.Keyword()
new_user_count = metrics.Integer()
Expand All @@ -187,7 +198,7 @@ class OsfstorageFileCountReport(DailyReport):


class PreprintSummaryReport(DailyReport):
DAILY_UNIQUE_FIELD = 'provider_key'
UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',)

provider_key = metrics.Keyword()
preprint_count = metrics.Integer()
Expand Down
16 changes: 8 additions & 8 deletions osf/metrics/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import dataclasses
import re
import datetime
import typing
from hashlib import sha256

import pytz
from typing import ClassVar


def stable_key(*key_parts):
Expand All @@ -20,11 +19,12 @@ def stable_key(*key_parts):
return sha256(bytes(plain_key, encoding='utf')).hexdigest()


class YearMonth(typing.NamedTuple):
@dataclasses.dataclass(frozen=True)
class YearMonth:
year: int
month: int

YEARMONTH_RE = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')
YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')

@classmethod
def from_date(cls, date):
Expand All @@ -46,9 +46,9 @@ def __str__(self):
return f'{self.year}-{self.month:0>2}'

def target_month(self):
return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc)
return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC)

def next_month(self):
if self.month == 12:
return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc)
return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc)
return datetime.datetime(self.year + 1, 1, 1, tzinfo=datetime.UTC)
return datetime.datetime(self.year, self.month + 1, 1, tzinfo=datetime.UTC)
19 changes: 11 additions & 8 deletions osf_tests/metrics/test_daily_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,36 @@ class Meta:
assert report.meta.id == expected_key
mock_save.reset_mock()

def test_with_duf(self, mock_save):
def test_with_unique_together(self, mock_save):
# multiple reports of this type per day, unique by given field
class UniqueByDateAndField(DailyReport):
DAILY_UNIQUE_FIELD = 'duf'
duf = metrics.Keyword()
UNIQUE_TOGETHER_FIELDS = ('report_date', 'uniquefield',)
uniquefield = metrics.Keyword()

class Meta:
app_label = 'osf'

today = date(2022, 5, 18)

expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151'
blah_report = UniqueByDateAndField(report_date=today, duf='blah')
blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah')
blah_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is blah_report
assert blah_report.meta.id == expected_blah
mock_save.reset_mock()

expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99'
fleh_report = UniqueByDateAndField(report_date=today, duf='fleh')
fleh_report = UniqueByDateAndField(report_date=today, uniquefield='fleh')
fleh_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is fleh_report
assert fleh_report.meta.id == expected_fleh
mock_save.reset_mock()

bad_report = UniqueByDateAndField(report_date=today)
with pytest.raises(ReportInvalid):
bad_report.save()
for _bad_report in (
UniqueByDateAndField(report_date=today),
UniqueByDateAndField(report_date=today, uniquefield=['list', 'of', 'things']),
):
with pytest.raises(ReportInvalid):
_bad_report.save()
72 changes: 72 additions & 0 deletions osf_tests/metrics/test_monthly_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from unittest import mock

import pytest
from elasticsearch_metrics import metrics

from osf.metrics.reports import MonthlyReport, ReportInvalid
from osf.metrics.utils import YearMonth


class TestMonthlyReportKey:
@pytest.fixture
def mock_save(self):
with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
yield mock_save

def test_default(self, mock_save):
# only one of this type of report per month
class UniqueByMonth(MonthlyReport):
blah = metrics.Keyword()

class Meta:
app_label = 'osf'

yearmonth = YearMonth(2022, 5)

reports = [
UniqueByMonth(report_yearmonth=yearmonth),
UniqueByMonth(report_yearmonth=yearmonth, blah='blah'),
UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'),
]
expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2'

for report in reports:
report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is report
assert report.meta.id == expected_key
mock_save.reset_mock()

def test_with_unique_together(self, mock_save):
# multiple reports of this type per day, unique by given field
class UniqueByMonthAndField(MonthlyReport):
UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',)
uniquefield = metrics.Keyword()

class Meta:
app_label = 'osf'

yearmonth = YearMonth(2022, 5)

expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08'
blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah')
blah_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is blah_report
assert blah_report.meta.id == expected_blah
mock_save.reset_mock()

expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f'
fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh')
fleh_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is fleh_report
assert fleh_report.meta.id == expected_fleh
mock_save.reset_mock()

for _bad_report in (
UniqueByMonthAndField(report_yearmonth=yearmonth),
UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']),
):
with pytest.raises(ReportInvalid):
_bad_report.save()

0 comments on commit 9923f46

Please sign in to comment.