-
Notifications
You must be signed in to change notification settings - Fork 330
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
332 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
from __future__ import annotations | ||
import typing | ||
|
||
if typing.TYPE_CHECKING: | ||
import elasticsearch_dsl | ||
|
||
from osf.metrics.counted_usage import CountedAuthUsage | ||
from osf.metrics.reports import PublicItemUsageReport | ||
from osf.metrics.utils import YearMonth | ||
from ._base import MonthlyReporter | ||
|
||
|
||
# page size for the composite aggregation over items: the number of item
# buckets requested per search, also used to recognize a full page
_CHUNK_SIZE = 500
|
||
|
||
class PublicItemUsageReporter(MonthlyReporter):
    '''build a PublicItemUsageReport for each public item with counted usage in a given month

    includes projects, project components, registrations, registration components, and preprints
    '''

    def report(self, yearmonth: YearMonth):
        '''yield one (unsaved) PublicItemUsageReport per item bucket found for the given month'''
        for _itembucket in self._iter_itembuckets(yearmonth):
            yield self._report_from_itembucket(_itembucket)

    def _item_page_search(self, yearmonth) -> elasticsearch_dsl.Search:
        '''build an aggregations-only search over the month's public-item usage

        the returned search requests one page (up to `_CHUNK_SIZE` buckets) of the
        `agg_items` composite aggregation; each item bucket has nested aggregations
        for platform_iri, provider_id, item_type, and per-action counts
        '''
        _usage_search = (
            CountedAuthUsage.search()
            .filter('term', item_public=True)
            .filter('range', timestamp={
                'gte': yearmonth.target_month(),
                'lt': yearmonth.next_month(),
            })
            .update_from_dict({'size': 0})  # only aggregations, no hits
        )
        # the main agg: use a composite aggregation to page thru *every* item
        _agg_items = _usage_search.aggs.bucket(
            'agg_items',
            'composite',
            sources=[{'item_osfid': {'terms': {'field': 'item_guid'}}}],
            size=_CHUNK_SIZE,
        )
        # nested agg: for each item, get platform_iri values
        _agg_items.bucket('agg_platform_iri', 'terms', field='platform_iri')
        # nested agg: for each item, get provider_id values
        _agg_items.bucket('agg_provider_id', 'terms', field='provider_id')
        # nested agg: for each item, get item_type values
        _agg_items.bucket('agg_item_type', 'terms', field='item_type')
        # nested agg: for each item, get view and download count
        _agg_action = _agg_items.bucket(
            'agg_action',
            'terms',
            field='action_labels',
            include=[
                CountedAuthUsage.ActionLabel.VIEW.value,
                CountedAuthUsage.ActionLabel.DOWNLOAD.value,
            ],
        )
        # nested nested agg: for each item-action pair, get a session count
        _agg_action.bucket(
            'agg_session_count',
            'cardinality',
            field='session_id',
            precision_threshold=40000,  # maximum precision
        )
        return _usage_search

    def _iter_itembuckets(self, yearmonth: YearMonth):
        '''yield every bucket of the `agg_items` composite aggregation, page by page'''
        _search = self._item_page_search(yearmonth)
        while _search is not None:
            # the same search object is re-executed for each page, so bypass its
            # cached response -- otherwise a full first page would be returned
            # again and again, and this loop would never end
            _page_response = _search.execute(ignore_cache=True)
            _agg_items = _page_response.aggregations.agg_items
            yield from _agg_items.buckets
            # update the search for the next page
            if len(_agg_items.buckets) == _CHUNK_SIZE:
                # a full page means there may be more: resume after the last key seen
                _search.aggs['agg_items'].after = _agg_items.after_key
            else:
                _search = None  # a short (or empty) page is the last page

    def _report_from_itembucket(self, itembucket):
        '''convert one `agg_items` bucket into a PublicItemUsageReport

        view/download counts come from the bucket's `agg_action` sub-buckets
        (doc count and session cardinality); an action with no sub-bucket stays zero
        '''
        _report = PublicItemUsageReport(
            item_osfid=itembucket.key.item_osfid,
            item_type=_agg_keys(itembucket.agg_item_type),
            provider_id=_agg_keys(itembucket.agg_provider_id),
            platform_iri=_agg_keys(itembucket.agg_platform_iri),
            # default counts to zero, will be updated if non-zero
            view_count=0,
            view_session_count=0,
            download_count=0,
            download_session_count=0,
        )
        for _actionbucket in itembucket.agg_action:
            if _actionbucket.key == CountedAuthUsage.ActionLabel.VIEW.value:
                _report.view_count = _actionbucket.doc_count
                _report.view_session_count = _actionbucket.agg_session_count.value
            elif _actionbucket.key == CountedAuthUsage.ActionLabel.DOWNLOAD.value:
                _report.download_count = _actionbucket.doc_count
                _report.download_session_count = _actionbucket.agg_session_count.value
        return _report
|
||
|
||
### | ||
# local helpers | ||
|
||
def _agg_keys(bucket_agg_result) -> list: | ||
return [_bucket.key for _bucket in bucket_agg_result] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
203 changes: 203 additions & 0 deletions
203
osf_tests/metrics/reporters/test_public_item_usage_reporter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
from datetime import timedelta | ||
from operator import attrgetter | ||
from unittest import mock | ||
|
||
import pytest | ||
|
||
from osf.metrics.counted_usage import CountedAuthUsage | ||
from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter | ||
from osf.metrics.reports import PublicItemUsageReport | ||
from osf.metrics.utils import YearMonth | ||
|
||
|
||
@pytest.mark.es_metrics
class TestPublicItemUsageReport:
    @pytest.fixture(autouse=True)
    def _mocks(self):
        # HACK: make `Guid.load` return None to skip auto-filling fields from the database
        _guid_load_patch = mock.patch('osf.metrics.counted_usage.Guid.load', return_value=None)
        with _guid_load_patch:
            yield

    @pytest.fixture
    def ym_empty(self) -> YearMonth:
        # a month with no usage saved by any fixture
        return YearMonth(2012, 7)

    @pytest.fixture
    def ym_sparse(self) -> YearMonth:
        # the month filled by `sparse_month_usage`
        return YearMonth(2017, 7)

    @pytest.fixture
    def ym_busy(self) -> YearMonth:
        # the month filled by the `busy_month_item*` fixtures
        return YearMonth(2023, 7)

    @pytest.fixture
    def sparse_month_usage(self, ym_sparse):
        # the "sparse" month has two items:
        #   item0: 3 views, 0 downloads, 2 sessions
        #   item1: 1 view, 1 download, 1 session
        _start = ym_sparse.target_month()
        _usages = (
            # (time since month start, item, session, action)
            (timedelta(), 'item0', 'sesh0', 'view'),
            (timedelta(), 'item1', 'sesh0', 'view'),
            (timedelta(minutes=2), 'item0', 'sesh0', 'view'),
            (timedelta(minutes=3), 'item1', 'sesh0', 'download'),
            (timedelta(days=17), 'item0', 'sesh1', 'view'),
        )
        for _offset, _item, _session, _action in _usages:
            _save_usage(
                timestamp=_start + _offset,
                item_guid=_item,
                session_id=_session,
                action_labels=[_action],
            )

    @pytest.fixture
    def busy_month_item0(self, ym_busy):
        # item0: 4 sessions (one per day), each with 7 views and then 5 downloads
        _start = ym_busy.target_month()
        for _day in range(4):
            _minute_actions = [
                *((_minute, 'view') for _minute in range(7)),
                *((_minute, 'download') for _minute in range(10, 15)),
            ]
            for _minute, _action in _minute_actions:
                _save_usage(
                    timestamp=_start + timedelta(days=_day, minutes=_minute),
                    item_guid='item0',
                    session_id=f'sesh0{_day}',
                    action_labels=[_action],
                )

    @pytest.fixture
    def busy_month_item1(self, ym_busy):
        # item1: 10 sessions, 6*9 views, 5*7 downloads, 2 providers
        _start = ym_busy.target_month()
        # views: sessions 0 thru 5, nine views each (default provider)
        _views = [(_day, _minute) for _day in range(6) for _minute in range(9)]
        for _day, _minute in _views:
            _save_usage(
                timestamp=_start + timedelta(days=_day, minutes=_minute),
                item_guid='item1',
                session_id=f'sesh1{_day}',
                action_labels=['view'],
            )
        # downloads: sessions 5 thru 9, seven downloads each
        _downloads = [(_day, _minute) for _day in range(5, 10) for _minute in range(10, 17)]
        for _day, _minute in _downloads:
            _save_usage(
                timestamp=_start + timedelta(days=_day, minutes=_minute),
                item_guid='item1',
                session_id=f'sesh1{_day}',
                action_labels=['download'],
                provider_id='prov1',  # additional provider_id
            )

    @pytest.fixture
    def busy_month_item2(self, ym_busy):
        # item2: 11 sessions, each with one view followed by one download
        _start = ym_busy.target_month()
        for _day in range(1, 12):
            for _action, _hours in (('view', 0), ('download', _day)):
                _save_usage(
                    timestamp=_start + timedelta(days=_day, hours=_hours),
                    item_guid='item2',
                    session_id=f'sesh2{_day}',
                    action_labels=[_action],
                )

    def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2):
        _reporter = PublicItemUsageReporter()
        # run the reporter for each month, in order (each run fully consumed)
        _empty, _sparse, _busy = (
            list(_reporter.report(_yearmonth))
            for _yearmonth in (ym_empty, ym_sparse, ym_busy)
        )

        # empty month: no reports at all
        assert _empty == []

        # expected values for each report, ordered by item_osfid:
        # (item_osfid, provider_id, view_count, view_session_count, download_count, download_session_count)
        _sparse_expected = [
            ('item0', ['prov0'], 3, 2, 0, 0),
            ('item1', ['prov0'], 1, 1, 1, 1),
        ]
        _busy_expected = [
            ('item0', ['prov0'], 4 * 7, 4, 4 * 5, 4),
            ('item1', ['prov0', 'prov1'], 6 * 9, 6, 5 * 7, 5),
            ('item2', ['prov0'], 11, 11, 11, 11),
        ]
        for _reports, _expected in ((_sparse, _sparse_expected), (_busy, _busy_expected)):
            assert len(_reports) == len(_expected)
            _sorted_reports = sorted(_reports, key=attrgetter('item_osfid'))
            for _report, _expected_values in zip(_sorted_reports, _expected):
                (
                    _osfid,
                    _provider_ids,
                    _views,
                    _view_sessions,
                    _downloads,
                    _download_sessions,
                ) = _expected_values
                assert isinstance(_report, PublicItemUsageReport)
                assert _report.item_osfid == _osfid
                assert _report.provider_id == _provider_ids
                assert _report.platform_iri == ['http://osf.example']
                assert _report.view_count == _views
                assert _report.view_session_count == _view_sessions
                assert _report.download_count == _downloads
                assert _report.download_session_count == _download_sessions
|
||
|
||
def _save_usage(**kwargs):
    # save one CountedAuthUsage record (refreshing on save), filling in
    # defaults for any of these fields the caller did not pass
    _fields = dict(
        platform_iri='http://osf.example',
        item_public=True,
        provider_id='prov0',
    )
    _fields.update(kwargs)  # explicit kwargs override the defaults
    _usage = CountedAuthUsage(**_fields)
    _usage.save(refresh=True)