From 6cf406e0b9c68ee0b687abc813cbe9d01f086d39 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 9 Sep 2024 14:11:47 -0400 Subject: [PATCH] institution-user metrics view - add base `ElasticsearchListView` - use `elasticsearch_dsl.Search` as queryset-analogue for code reuse - allows filtering (with 'eq' and 'ne' operators) following `filterable_fields` on the serializer - allows sorting, following `default_ordering` and `ordering_fields` on the view - allows pagination (with `page` and `page[size]` params) - update "new" institution user metrics view with ElasticsearchListView - add `MonthlyReport.most_recent_yearmonth` --- api/base/elasticsearch_dsl_views.py | 98 +++++++++++ api/institutions/serializers.py | 8 + api/institutions/views.py | 31 +++- .../test_institution_user_metric_list.py | 160 +++++++++++++++++- osf/metrics/reports.py | 25 ++- osf/metrics/utils.py | 5 +- 6 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 api/base/elasticsearch_dsl_views.py diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py new file mode 100644 index 00000000000..e593a63de7b --- /dev/null +++ b/api/base/elasticsearch_dsl_views.py @@ -0,0 +1,98 @@ +from __future__ import annotations +import abc + +import elasticsearch_dsl as edsl +from rest_framework import generics +from rest_framework import exceptions as drf_exceptions +from rest_framework.settings import api_settings as drf_settings + +from api.base.filters import FilterMixin +from api.base.views import JSONAPIBaseView + + +class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC): + '''abstract view class using `elasticsearch_dsl.Search` as a queryset-analogue + ''' + default_ordering: str | None = None + ordering_fields: frozenset[str] = frozenset() + + @abc.abstractmethod + def get_default_search(self) -> edsl.Search | None: + ... + + ### + # beware! inheritance shenanigans below + + # override FilterMixin to disable all operators besides 'eq' and 'ne' + MATCHABLE_FIELDS = () + COMPARABLE_FIELDS = () + DEFAULT_OPERATOR_OVERRIDES = {} + # (if you want to add fulltext-search or range-filter support, remove the override + # and update `__add_search_filter` to handle those operators -- tho note that the + # underlying elasticsearch field mapping will need to be compatible with the query) + + # override DEFAULT_FILTER_BACKENDS rest_framework setting + # (filtering handled in-view to reuse logic from FilterMixin) + filter_backends = () + + # note: because elasticsearch_dsl.Search supports slicing and gives results when iterated on, + # it works fine with default pagination + + # override rest_framework.generics.GenericAPIView + def get_queryset(self): + _search = self.get_default_search() + if _search is None: + return [] + # using parsing logic from FilterMixin (oddly nested dict and all) + for _parsed_param in self.parse_query_params(self.request.query_params).values(): + for _parsed_filter in _parsed_param.values(): + _search = self.__add_search_filter( + _search, + elastic_field_name=_parsed_filter['source_field_name'], + operator=_parsed_filter['op'], + value=_parsed_filter['value'], + ) + return self.__add_sort(_search) + + ### + # private methods + + def __add_sort(self, search: edsl.Search) -> edsl.Search: + _elastic_sort = self.__get_elastic_sort() + return (search if _elastic_sort is None else search.sort(_elastic_sort)) + + def __get_elastic_sort(self) -> str | None: + _sort_param = self.request.query_params.get(drf_settings.ORDERING_PARAM, self.default_ordering) + if not _sort_param: + return None + _sort_field, _ascending = ( + (_sort_param[1:], False) + if _sort_param.startswith('-') + else (_sort_param, True) + ) + if _sort_field not in self.ordering_fields: + raise drf_exceptions.ValidationError( + f'invalid value for {drf_settings.ORDERING_PARAM} query param (valid values: {", ".join(self.ordering_fields)})', + ) + _serializer_field = self.get_serializer().fields[_sort_field] + _elastic_sort_field = _serializer_field.source + return (_elastic_sort_field if _ascending else f'-{_elastic_sort_field}') + + def __add_search_filter( + self, + search: edsl.Search, + elastic_field_name: str, + operator: str, + value: str, + ) -> edsl.Search: + match operator: # operators from FilterMixin + case 'eq': + if value == '': + return search.exclude('exists', field=elastic_field_name) + return search.filter('term', **{elastic_field_name: value}) + case 'ne': + if value == '': + return search.filter('exists', field=elastic_field_name) + return search.exclude('term', **{elastic_field_name: value}) + case _: + raise NotImplementedError(f'unsupported filter operator "{operator}"') diff --git a/api/institutions/serializers.py b/api/institutions/serializers.py index 9e161fb5d52..0421e9348f6 100644 --- a/api/institutions/serializers.py +++ b/api/institutions/serializers.py @@ -314,6 +314,11 @@ class NewInstitutionUserMetricsSerializer(JSONAPISerializer): class Meta: type_ = 'institution-users' + filterable_fields = frozenset({ + 'department', + 'orcid_id', + }) + id = IDField(source='meta.id', read_only=True) user_name = ser.CharField(read_only=True) department = ser.CharField(read_only=True, source='department_name') @@ -339,3 +344,6 @@ class Meta: ) links = LinksField({}) + + def get_absolute_url(self): + return None # there is no detail view for institution-users diff --git a/api/institutions/views.py b/api/institutions/views.py index 8287a615136..ae6de06ca1a 100644 --- a/api/institutions/views.py +++ b/api/institutions/views.py @@ -12,10 +12,12 @@ from osf.metrics import InstitutionProjectCounts from osf.models import OSFUser, Node, Institution, Registration from osf.metrics import UserInstitutionProjectCounts +from osf.metrics.reports import InstitutionalUserReport from osf.utils import permissions as osf_permissions from api.base import permissions as base_permissions -from api.base.filters import ListFilterMixin, FilterMixin +from api.base.elasticsearch_dsl_views import ElasticsearchListView +from api.base.filters import ListFilterMixin from api.base.views import JSONAPIBaseView from api.base.serializers import JSONAPISerializer from api.base.utils import get_object_or_error, get_user_auth @@ -528,7 +530,7 @@ def get_default_queryset(self): return self._make_elasticsearch_results_filterable(search, id=institution._id, department=DEFAULT_ES_NULL_VALUE) -class _NewInstitutionUserMetricsList(InstitutionMixin, FilterMixin, JSONAPIBaseView): +class _NewInstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): permission_classes = ( drf_permissions.IsAuthenticatedOrReadOnly, base_permissions.TokenHasScope, @@ -543,6 +545,31 @@ class _NewInstitutionUserMetricsList(InstitutionMixin, FilterMixin, JSONAPIBaseV serializer_class = NewInstitutionUserMetricsSerializer + default_ordering = '-storage_byte_count' + ordering_fields = frozenset(( + 'user_name', + 'department', + 'month_last_login', + 'account_creation_date', + 'public_projects', + 'private_projects', + 'public_registration_count', + 'embargoed_registration_count', + 'published_preprint_count', + 'public_file_count', + 'storage_byte_count', + )) + + def get_default_search(self): + _yearmonth = InstitutionalUserReport.most_recent_yearmonth() + if _yearmonth is None: + return None + return ( + InstitutionalUserReport.search() + .filter('term', report_yearmonth=str(_yearmonth)) + .filter('term', institution_id=self.get_institution()._id) + ) + institution_user_metrics_list_view = view_toggled_by_feature_flag( flag_name=osf.features.INSTITUTIONAL_DASHBOARD_2024, diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index 22e0066b3d2..34727f71523 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -2,6 +2,7 @@ import csv from io import StringIO from random import random +from urllib.parse import urlencode import pytest from waffle.testutils import override_flag @@ -14,10 +15,11 @@ ) from osf.metrics import UserInstitutionProjectCounts +from osf.metrics.reports import InstitutionalUserReport @pytest.mark.es @pytest.mark.django_db -class TestInstitutionUserMetricList: +class TestOldInstitutionUserMetricList: @pytest.fixture(autouse=True) def _waffled(self): @@ -262,3 +264,159 @@ def test_filter_and_sort(self, app, url, user, user2, user3, admin, user4, popul assert data[0]['attributes']['department'] == 'Biology dept' assert data[1]['attributes']['department'] == 'N/A' assert data[2]['attributes']['department'] == 'Psychology dept' + + +@pytest.mark.django_db +class TestNewInstitutionUserMetricList: + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(osf.features.INSTITUTIONAL_DASHBOARD_2024, active=True): + yield # these tests apply only after institution dashboard improvements + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + _admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(_admin_user) + return _admin_user + + @pytest.fixture() + def unshown_reports(self, institution): + # unshown because another institution + _another_institution = InstitutionFactory() + _report_factory('2024-08', _another_institution, user_id='nother_inst') + # unshown because old + _report_factory('2024-07', institution, user_id='old') + + @pytest.fixture() + def reports(self, institution): + return [ + _report_factory( + '2024-08', institution, + user_id='u_sparse', + storage_byte_count=53, + ), + _report_factory( + '2024-08', institution, + user_id='u_orc', + orcid_id='5555-4444-3333-2222', + storage_byte_count=8277, + ), + _report_factory( + '2024-08', institution, + user_id='u_blargl', + department_name='blargl', + storage_byte_count=34834834, + ), + _report_factory( + '2024-08', institution, + user_id='u_orcomma', + orcid_id='4444-3333-2222-1111', + department_name='a department, or so, that happens, incidentally, to have commas', + storage_byte_count=736662999298, + ), + ] + + @pytest.fixture() + def url(self, institution): + return f'/{API_BASE}institutions/{institution._id}/metrics/users/' + + def test_anon(self, app, url): + _resp = app.get(url, expect_errors=True) + assert _resp.status_code == 401 + + def test_rando(self, app, url, rando): + _resp = app.get(url, auth=rando.auth, expect_errors=True) + assert _resp.status_code == 403 + + @pytest.mark.es + def test_get_empty(self, app, url, institutional_admin): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert _resp.json['data'] == [] + + @pytest.mark.es + def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert len(_resp.json['data']) == len(reports) + _expected_user_ids = {_report.user_id for _report in reports} + assert set(_user_ids(_resp)) == _expected_user_ids + + @pytest.mark.es + def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_ids in ( + ({'filter[department]': 'nunavum'}, set()), + ({'filter[department]': 'incidentally'}, set()), + ({'filter[department]': 'blargl'}, {'u_blargl'}), + ({'filter[department]': 'a department, or so, that happens, incidentally, to have commas'}, {'u_orcomma'}), + ({'filter[department][eq]': 'nunavum'}, set()), + ({'filter[department][eq]': 'blargl'}, {'u_blargl'}), + ({'filter[department][eq]': 'a department, or so, that happens, incidentally, to have commas'}, {'u_orcomma'}), + ({'filter[department][ne]': 'nunavum'}, {'u_sparse', 'u_blargl', 'u_orc', 'u_orcomma'}), + + ({'filter[orcid_id][eq]': '5555-4444-3333-2222'}, {'u_orc'}), + ({'filter[orcid_id][ne]': ''}, {'u_orc', 'u_orcomma'}), + ({'filter[orcid_id][eq]': ''}, {'u_sparse', 'u_blargl'}), + ({ + 'filter[orcid_id]': '', + 'filter[department]': 'blargl', + }, {'u_blargl'}), + ({ + 'filter[orcid_id]': '', + 'filter[department][ne]': 'blargl', + }, {'u_sparse'}), + ({ + 'filter[orcid_id]': '5555-4444-3333-2222', + 'filter[department][ne]': 'blargl', + }, {'u_orc'}), + ({ + 'filter[orcid_id]': '5555-4444-3333-2222', + 'filter[department][ne]': '', + }, set()), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert set(_user_ids(_resp)) == _expected_user_ids + + @pytest.mark.es + def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_id_list in ( + ({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']), + ({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert list(_user_ids(_resp)) == _expected_user_id_list + + @pytest.mark.es + def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_id_list in ( + ({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']), + ({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']), + ({'sort': '-storage_byte_count', 'page[size]': 3}, ['u_orcomma', 'u_blargl', 'u_orc']), + ({'sort': '-storage_byte_count', 'page[size]': 3, 'page': 2}, ['u_sparse']), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert list(_user_ids(_resp)) == _expected_user_id_list + +def _user_ids(api_response): + for _datum in api_response.json['data']: + yield _datum['relationships']['user']['data']['id'] + +def _report_factory(yearmonth, institution, **kwargs): + _report = InstitutionalUserReport( + report_yearmonth=yearmonth, + institution_id=institution._id, + **kwargs, + ) + _report.save(refresh=True) + return _report diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index cd85471cf81..cee4efc7c02 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -21,7 +21,7 @@ class DailyReport(metrics.Metric): There's something we'd like to know about every so often, so let's regularly run a report and stash the results here. """ - UNIQUE_TOGETHER_FIELDS = ('report_date',) # override in subclasses for multiple reports per day + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_date',) # override in subclasses for multiple reports per day report_date = metrics.Date(format='strict_date', required=True) @@ -46,6 +46,10 @@ def deserialize(self, data): return YearMonth.from_str(data) elif isinstance(data, (datetime.datetime, datetime.date)): return YearMonth.from_date(data) + elif isinstance(data, int): + # elasticsearch stores dates in milliseconds since the unix epoch + _as_datetime = datetime.datetime.fromtimestamp(data // 1000) + return YearMonth.from_date(_as_datetime) elif data is None: return None else: @@ -67,7 +71,7 @@ def serialize(self, data): class MonthlyReport(metrics.Metric): """MonthlyReport (abstract base for report-based metrics that run monthly) """ - UNIQUE_TOGETHER_FIELDS = ('report_yearmonth',) # override in subclasses for multiple reports per month + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_yearmonth',) # override in subclasses for multiple reports per month report_yearmonth = YearmonthField(required=True) @@ -76,6 +80,23 @@ class Meta: dynamic = metrics.MetaField('strict') source = metrics.MetaField(enabled=True) + @classmethod + def most_recent_yearmonth(cls, base_search=None) -> YearMonth | None: + _search = base_search or cls.search() + _search = _search.update_from_dict({'size': 0}) # omit hits + _search.aggs.bucket( + 'agg_most_recent_yearmonth', + 'terms', + field='report_yearmonth', + order={'_key': 'desc'}, + size=1, + ) + _response = _search.execute() + if not _response.aggregations: + return None + (_bucket,) = _response.aggregations.agg_most_recent_yearmonth.buckets + return _bucket.key + def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 0d402ad56fb..e52e0c08c45 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -1,3 +1,4 @@ +from __future__ import annotations import dataclasses import re import datetime @@ -27,12 +28,12 @@ class YearMonth: YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P\d{4})-(?P\d{2})') @classmethod - def from_date(cls, date): + def from_date(cls, date: datetime.date) -> YearMonth: assert isinstance(date, (datetime.datetime, datetime.date)) return cls(date.year, date.month) @classmethod - def from_str(cls, input_str): + def from_str(cls, input_str: str) -> YearMonth: match = cls.YEARMONTH_RE.fullmatch(input_str) if match: return cls(