Skip to content

Commit

Permalink
PublicItemUsageReport(er) + tests
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Sep 23, 2024
1 parent 884bdfc commit 4119193
Show file tree
Hide file tree
Showing 3 changed files with 332 additions and 0 deletions.
106 changes: 106 additions & 0 deletions osf/metrics/reporters/public_item_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import annotations
import typing

if typing.TYPE_CHECKING:
import elasticsearch_dsl

from osf.metrics.counted_usage import CountedAuthUsage
from osf.metrics.reports import PublicItemUsageReport
from osf.metrics.utils import YearMonth
from ._base import MonthlyReporter


_CHUNK_SIZE = 500


class PublicItemUsageReporter(MonthlyReporter):
    '''build a PublicItemUsageReport for each public item

    includes projects, project components, registrations, registration components, and preprints
    '''

    def report(self, yearmonth: YearMonth):
        '''yield one PublicItemUsageReport for each item bucket found for the given month'''
        for _itembucket in self._iter_itembuckets(yearmonth):
            yield self._report_from_itembucket(_itembucket)

    def _item_page_search(self, yearmonth) -> elasticsearch_dsl.Search:
        '''build the aggregation-only search for one page of per-item usage

        filters CountedAuthUsage to records with `item_public=True` and a
        `timestamp` within the given month, then buckets by `item_guid`
        (at most `_CHUNK_SIZE` items per page)
        '''
        _usage_search = (
            CountedAuthUsage.search()
            .filter('term', item_public=True)
            .filter('range', timestamp={
                'gte': yearmonth.target_month(),
                'lt': yearmonth.next_month(),
            })
            .update_from_dict({'size': 0})  # only aggregations, no hits
        )
        # the main agg: use a composite aggregation to page thru *every* item
        _agg_items = _usage_search.aggs.bucket(
            'agg_items',
            'composite',
            sources=[{'item_osfid': {'terms': {'field': 'item_guid'}}}],
            size=_CHUNK_SIZE,
        )
        # nested agg: for each item, get platform_iri values
        _agg_items.bucket('agg_platform_iri', 'terms', field='platform_iri')
        # nested agg: for each item, get provider_id values
        _agg_items.bucket('agg_provider_id', 'terms', field='provider_id')
        # nested agg: for each item, get item_type values
        _agg_items.bucket('agg_item_type', 'terms', field='item_type')
        # nested agg: for each item, get view and download count
        _agg_action = _agg_items.bucket(
            'agg_action',
            'terms',
            field='action_labels',
            include=[
                CountedAuthUsage.ActionLabel.VIEW.value,
                CountedAuthUsage.ActionLabel.DOWNLOAD.value,
            ],
        )
        # nested nested agg: for each item-action pair, get a session count
        _agg_action.bucket(
            'agg_session_count',
            'cardinality',
            field='session_id',
            precision_threshold=40000,  # maximum precision
        )
        return _usage_search

    def _iter_itembuckets(self, yearmonth: YearMonth):
        '''yield every `agg_items` bucket for the given month, requesting pages as needed

        one search object is reused for all pages; its composite aggregation's
        `after` is updated in place between requests
        '''
        _search = self._item_page_search(yearmonth)
        while True:
            # `Search.execute` caches its response on the search object -- since
            # the same object is reused for each page, the cache must be bypassed
            # (otherwise every "next page" is the first page again, without end)
            _page_response = _search.execute(ignore_cache=True)
            _agg_items = _page_response.aggregations.agg_items
            _buckets = _agg_items.buckets
            yield from _buckets
            if len(_buckets) != _CHUNK_SIZE:
                return  # a page that is not full is the last page
            _after_key = getattr(_agg_items, 'after_key', None)
            if _after_key is None:
                return  # nothing to continue from
            # update the search for the next page
            _search.aggs['agg_items'].after = _after_key

    def _report_from_itembucket(self, itembucket):
        '''convert one `agg_items` bucket into a PublicItemUsageReport

        `itembucket` is expected to carry the nested aggregations added in
        `_item_page_search` (agg_item_type, agg_provider_id, agg_platform_iri,
        agg_action with agg_session_count)
        '''
        _report = PublicItemUsageReport(
            item_osfid=itembucket.key.item_osfid,
            item_type=_agg_keys(itembucket.agg_item_type),
            provider_id=_agg_keys(itembucket.agg_provider_id),
            platform_iri=_agg_keys(itembucket.agg_platform_iri),
            # default counts to zero, will be updated if non-zero
            view_count=0,
            view_session_count=0,
            download_count=0,
            download_session_count=0,
        )
        for _actionbucket in itembucket.agg_action:
            if _actionbucket.key == CountedAuthUsage.ActionLabel.VIEW.value:
                _report.view_count = _actionbucket.doc_count
                _report.view_session_count = _actionbucket.agg_session_count.value
            elif _actionbucket.key == CountedAuthUsage.ActionLabel.DOWNLOAD.value:
                _report.download_count = _actionbucket.doc_count
                _report.download_session_count = _actionbucket.agg_session_count.value
        return _report


###
# local helpers

def _agg_keys(bucket_agg_result) -> list:
    '''collect the `key` of each bucket in the given aggregation result, in iteration order'''
    _keys = []
    for _each_bucket in bucket_agg_result:
        _keys.append(_each_bucket.key)
    return _keys
23 changes: 23 additions & 0 deletions osf/metrics/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,26 @@ class InstitutionalUserReport(MonthlyReport):
published_preprint_count = metrics.Integer()
public_file_count = metrics.Long()
storage_byte_count = metrics.Long()


class PublicItemUsageReport(MonthlyReport):
    '''monthly usage counts (views and downloads) for a single item

    where noted, fields correspond to defined terms from COUNTER
    https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
    '''
    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid')

    ###
    # what was used

    # counter:Item
    item_osfid = metrics.Keyword()
    # counter:Data-Type
    item_type = metrics.Keyword(multi=True)
    # counter:Database(?)
    provider_id = metrics.Keyword(multi=True)
    # counter:Platform
    platform_iri = metrics.Keyword(multi=True)

    ###
    # counts for this item only (not including components or files)

    # counter:Total_Item_Investigations
    view_count = metrics.Long()
    # counter:Unique_Item_Investigations
    view_session_count = metrics.Long()
    # counter:Total_Item_Requests
    download_count = metrics.Long()
    # counter:Unique_Item_Requests
    download_session_count = metrics.Long()

    ###
    # combined with counts for contained components and files

    combined_view_count = metrics.Long()
    combined_view_session_count = metrics.Long()
    combined_download_count = metrics.Long()
    combined_download_session_count = metrics.Long()
203 changes: 203 additions & 0 deletions osf_tests/metrics/reporters/test_public_item_usage_reporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
from datetime import timedelta
from operator import attrgetter
from unittest import mock

import pytest

from osf.metrics.counted_usage import CountedAuthUsage
from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter
from osf.metrics.reports import PublicItemUsageReport
from osf.metrics.utils import YearMonth


@pytest.mark.es_metrics
class TestPublicItemUsageReport:
    '''run PublicItemUsageReporter against saved usage for an empty, a sparse, and a busy month'''

    @pytest.fixture(autouse=True)
    def _mocks(self):
        # HACK: skip auto-filling fields from the database
        _patched_guid_load = mock.patch(
            'osf.metrics.counted_usage.Guid.load',
            return_value=None,
        )
        with _patched_guid_load:
            yield

    @pytest.fixture
    def ym_empty(self) -> YearMonth:
        # no fixture saves any usage in this month
        return YearMonth(2012, 7)

    @pytest.fixture
    def ym_sparse(self) -> YearMonth:
        # usage for this month is saved by `sparse_month_usage`
        return YearMonth(2017, 7)

    @pytest.fixture
    def ym_busy(self) -> YearMonth:
        # usage for this month is saved by the `busy_month_item*` fixtures
        return YearMonth(2023, 7)

    @pytest.fixture
    def sparse_month_usage(self, ym_sparse):
        # "sparse" month:
        #   item0: 3 views, 0 downloads, 2 sessions
        #   item1: 1 views, 1 download, 1 session
        _start = ym_sparse.target_month()
        _each_usage = [
            {
                'timestamp': _start,
                'item_guid': 'item0',
                'session_id': 'sesh0',
                'action_labels': ['view'],
            },
            {
                'timestamp': _start,
                'item_guid': 'item1',
                'session_id': 'sesh0',
                'action_labels': ['view'],
            },
            {
                'timestamp': _start + timedelta(minutes=2),
                'item_guid': 'item0',
                'session_id': 'sesh0',
                'action_labels': ['view'],
            },
            {
                'timestamp': _start + timedelta(minutes=3),
                'item_guid': 'item1',
                'session_id': 'sesh0',
                'action_labels': ['download'],
            },
            {
                'timestamp': _start + timedelta(days=17),
                'item_guid': 'item0',
                'session_id': 'sesh1',
                'action_labels': ['view'],
            },
        ]
        for _usage_kwargs in _each_usage:
            _save_usage(**_usage_kwargs)

    @pytest.fixture
    def busy_month_item0(self, ym_busy):
        # item0: 4 sessions, 4*7 views, 4*5 downloads
        _start = ym_busy.target_month()
        # within each session: seven minutes of views, then five minutes of downloads
        _minutes_by_action = (
            ('view', range(0, 7)),
            ('download', range(10, 15)),
        )
        for _sesh in range(0, 4):
            _sesh_start = _start + timedelta(days=_sesh)
            _session_id = f'sesh0{_sesh}'
            for _action, _minutes in _minutes_by_action:
                for _minute in _minutes:
                    _save_usage(
                        timestamp=_sesh_start + timedelta(minutes=_minute),
                        item_guid='item0',
                        session_id=_session_id,
                        action_labels=[_action],
                    )

    @pytest.fixture
    def busy_month_item1(self, ym_busy):
        # item1: 10 sessions, 6*9 views, 5*7 downloads, 2 providers
        _start = ym_busy.target_month()
        # six viewing sessions (with the default provider_id)
        for _sesh in range(0, 6):
            _view_times = [
                _start + timedelta(days=_sesh) + timedelta(minutes=_minute)
                for _minute in range(0, 9)
            ]
            for _timestamp in _view_times:
                _save_usage(
                    timestamp=_timestamp,
                    item_guid='item1',
                    session_id=f'sesh1{_sesh}',
                    action_labels=['view'],
                )
        # five downloading sessions (the first overlaps the last viewing session)
        for _sesh in range(5, 10):
            _download_times = [
                _start + timedelta(days=_sesh) + timedelta(minutes=_minute)
                for _minute in range(10, 17)
            ]
            for _timestamp in _download_times:
                _save_usage(
                    timestamp=_timestamp,
                    item_guid='item1',
                    session_id=f'sesh1{_sesh}',
                    action_labels=['download'],
                    provider_id='prov1',  # additional provider_id
                )

    @pytest.fixture
    def busy_month_item2(self, ym_busy):
        # item2: 11 sessions, 11 views, 11 downloads
        _start = ym_busy.target_month()
        for _sesh in range(1, 12):
            _session_id = f'sesh2{_sesh}'
            _offset_by_action = (
                ('view', timedelta(days=_sesh)),
                ('download', timedelta(days=_sesh, hours=_sesh)),
            )
            for _action, _offset in _offset_by_action:
                _save_usage(
                    timestamp=_start + _offset,
                    item_guid='item2',
                    session_id=_session_id,
                    action_labels=[_action],
                )

    @staticmethod
    def _check_report(
        report,
        *,
        item_osfid,
        provider_id,
        view_count,
        view_session_count,
        download_count,
        download_session_count,
    ):
        # shared assertions for one report; every saved usage has the same platform_iri
        assert isinstance(report, PublicItemUsageReport)
        assert report.item_osfid == item_osfid
        assert report.provider_id == provider_id
        assert report.platform_iri == ['http://osf.example']
        assert report.view_count == view_count
        assert report.view_session_count == view_session_count
        assert report.download_count == download_count
        assert report.download_session_count == download_session_count

    def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2):
        _reporter = PublicItemUsageReporter()
        _empty = list(_reporter.report(ym_empty))
        _sparse = list(_reporter.report(ym_sparse))
        _busy = list(_reporter.report(ym_busy))

        # empty month:
        assert _empty == []

        # sparse month:
        assert len(_sparse) == 2
        _sparse_item0, _sparse_item1 = sorted(_sparse, key=lambda _report: _report.item_osfid)
        self._check_report(
            _sparse_item0,
            item_osfid='item0',
            provider_id=['prov0'],
            view_count=3,
            view_session_count=2,
            download_count=0,
            download_session_count=0,
        )
        self._check_report(
            _sparse_item1,
            item_osfid='item1',
            provider_id=['prov0'],
            view_count=1,
            view_session_count=1,
            download_count=1,
            download_session_count=1,
        )

        # busy month:
        assert len(_busy) == 3
        _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=lambda _report: _report.item_osfid)
        self._check_report(
            _busy_item0,
            item_osfid='item0',
            provider_id=['prov0'],
            view_count=4 * 7,
            view_session_count=4,
            download_count=4 * 5,
            download_session_count=4,
        )
        self._check_report(
            _busy_item1,
            item_osfid='item1',
            provider_id=['prov0', 'prov1'],
            view_count=6 * 9,
            view_session_count=6,
            download_count=5 * 7,
            download_session_count=5,
        )
        self._check_report(
            _busy_item2,
            item_osfid='item2',
            provider_id=['prov0'],
            view_count=11,
            view_session_count=11,
            download_count=11,
            download_session_count=11,
        )


def _save_usage(**kwargs):
    '''save one CountedAuthUsage record (with `refresh=True`)

    any given keyword argument overrides the matching default below
    '''
    _defaults = {
        'platform_iri': 'http://osf.example',
        'item_public': True,
        'provider_id': 'prov0',
    }
    _usage = CountedAuthUsage(**{**_defaults, **kwargs})
    _usage.save(refresh=True)

0 comments on commit 4119193

Please sign in to comment.