[ENG-6283] User table allows CSV and TSV downloads #10782

Open · wants to merge 6 commits into base: feature/insti-dash-improv
Changes from 3 commits
1 change: 1 addition & 0 deletions api/base/settings/defaults.py
@@ -359,6 +359,7 @@

MAX_SIZE_OF_ES_QUERY = 10000
DEFAULT_ES_NULL_VALUE = 'N/A'
USER_INSTITUTION_REPORT_FILENAME = 'institution_user_report_{institution_id}_{date_created}.{format_type}'

CI_ENV = False

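For reference, the new template yields filenames like the following; the institution id and date below are illustrative values, not taken from this PR:

```python
# Illustrative rendering of the new filename template; 'cos' and '2024-10'
# are example values only.
USER_INSTITUTION_REPORT_FILENAME = 'institution_user_report_{institution_id}_{date_created}.{format_type}'

print(USER_INSTITUTION_REPORT_FILENAME.format(
    institution_id='cos',
    date_created='2024-10',
    format_type='csv',
))
# -> institution_user_report_cos_2024-10.csv
```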
7 changes: 7 additions & 0 deletions api/institutions/views.py
@@ -34,6 +34,7 @@
)
from api.base.settings import DEFAULT_ES_NULL_VALUE
from api.metrics.permissions import IsInstitutionalMetricsUser
from api.metrics.renderers import MetricsReportsCsvRenderer, MetricsReportsTsvRenderer, MetricsReportsJsonRenderer
from api.nodes.serializers import NodeSerializer
from api.nodes.filters import NodesFilterMixin
from api.users.serializers import UserSerializer
@@ -553,6 +554,12 @@ class _NewInstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView):

    view_category = 'institutions'
    view_name = 'institution-user-metrics'
    renderer_classes = (
        *api_settings.DEFAULT_RENDERER_CLASSES,
        MetricsReportsCsvRenderer,
        MetricsReportsTsvRenderer,
        MetricsReportsJsonRenderer,
    )

    serializer_class = NewInstitutionUserMetricsSerializer

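With these renderer classes registered on the view, the output format is chosen by DRF content negotiation via the `format` query parameter, which the tests further down exercise. A hypothetical client-side usage sketch (the URL path shape is assumed, not confirmed by this diff; only the `format` values are confirmed by the tests):

```python
import requests  # assumed to be available in the client environment

# URL shape assumed for illustration.
BASE = 'https://api.osf.io/v2/institutions/{institution_id}/metrics/users/'

for fmt in ('csv', 'tsv', 'json_file'):
    resp = requests.get(BASE.format(institution_id='<your-institution-id>'), params={'format': fmt})
    # Each response should carry a Content-Disposition attachment header,
    # e.g. institution_user_report_<institution_id>_<YYYY-MM>.<ext>
    print(fmt, resp.headers.get('Content-Disposition'))
```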
125 changes: 99 additions & 26 deletions api/metrics/renderers.py
@@ -1,5 +1,8 @@
import csv
import io
import json
from api.base.settings.defaults import USER_INSTITUTION_REPORT_FILENAME, MAX_SIZE_OF_ES_QUERY
import datetime

from django.http import Http404

@@ -16,16 +19,25 @@ def csv_fieldname_sortkey(fieldname):


def get_nested_keys(report_attrs):
    for attr_key in sorted(report_attrs.keys(), key=csv_fieldname_sortkey):
        attr_value = report_attrs[attr_key]
        if isinstance(attr_value, dict):
            for subkey in get_nested_keys(attr_value):
                yield f'{attr_key}.{subkey}'
        else:
            yield attr_key
    """
    Recursively retrieves all nested keys from the report attributes.
    Handles both dictionaries and lists of attributes.
    """
    if isinstance(report_attrs, dict):
        for attr_key in sorted(report_attrs.keys(), key=csv_fieldname_sortkey):
            attr_value = report_attrs[attr_key]
            if isinstance(attr_value, dict):
                for subkey in get_nested_keys(attr_value):
                    yield f'{attr_key}.{subkey}'
            else:
                yield attr_key
    elif isinstance(report_attrs, list):
        for item in report_attrs:
            yield from get_nested_keys(item)


def get_key_value(nested_key, report_attrs):
    report_attrs = report_attrs.to_dict() if hasattr(report_attrs, 'to_dict') else report_attrs
    (key, _, next_nested_key) = nested_key.partition('.')
    attr_value = report_attrs.get(key, {})
    return (
@@ -42,32 +54,93 @@ def get_csv_row(keys_list, report_attrs):
    ]


class MetricsReportsCsvRenderer(renderers.BaseRenderer):
    media_type = 'text/csv'
    format = 'csv'
    CSV_DIALECT = csv.excel
class MetricsReportsBaseRenderer(renderers.BaseRenderer):
    media_type: str
    format: str
    CSV_DIALECT: csv.Dialect
    extension: str

    def get_filename(self, renderer_context: dict, format_type: str) -> str:
        """Generate the filename for the CSV/TSV file based on institution and current date."""
        if renderer_context and 'view' in renderer_context:
            current_date = datetime.datetime.now().strftime('%Y-%m')  # Format as 'YYYY-MM'
            return USER_INSTITUTION_REPORT_FILENAME.format(
                date_created=current_date,
                institution_id=renderer_context['view'].kwargs['institution_id'],
                format_type=format_type
            )
        else:
            raise NotImplementedError('Missing format filename')

    def render(self, json_response, accepted_media_type=None, renderer_context=None):
        serialized_reports = (
            jsonapi_resource['attributes']
            for jsonapi_resource in json_response['data']
        )
        try:
            first_row = next(serialized_reports)
        except StopIteration:
    def get_all_data(self, view, request):
        """Bypass pagination by fetching all the data."""
        view.pagination_class = None  # Disable pagination
        return view.get_default_search().extra(size=MAX_SIZE_OF_ES_QUERY).execute()

    def render(self, data: dict, accepted_media_type: str = None, renderer_context: dict = None) -> str:
        """Render the full dataset as CSV or TSV format."""
        data = self.get_all_data(renderer_context['view'], renderer_context['request'])
        hits = data.hits
        if not hits:
            raise Http404('<h1>none found</h1>')
        csv_fieldnames = list(get_nested_keys(first_row))

        # Assuming each hit contains '_source' with the relevant data
        first_row = hits[0].to_dict()
        csv_fieldnames = list(first_row)
        csv_filecontent = io.StringIO(newline='')
        csv_writer = csv.writer(csv_filecontent, dialect=self.CSV_DIALECT)
        csv_writer.writerow(csv_fieldnames)
        for serialized_report in (first_row, *serialized_reports):
            csv_writer.writerow(
                get_csv_row(csv_fieldnames, serialized_report),
            )

        # Write each hit's '_source' as a row in the CSV
        for hit in hits:
            csv_writer.writerow(get_csv_row(csv_fieldnames, hit.to_dict()))

        # Set response headers for file download
        response = renderer_context['response']
        filename = self.get_filename(renderer_context, self.extension)
        response['Content-Disposition'] = f'attachment; filename="{filename}"'

        return csv_filecontent.getvalue()


class MetricsReportsTsvRenderer(MetricsReportsCsvRenderer):
    format = 'tsv'
class MetricsReportsCsvRenderer(MetricsReportsBaseRenderer):
    media_type = 'text/csv'
    format = 'csv'
    CSV_DIALECT = csv.excel
    extension = 'csv'


class MetricsReportsTsvRenderer(MetricsReportsBaseRenderer):
    media_type = 'text/tab-separated-values'
    format = 'tsv'
    CSV_DIALECT = csv.excel_tab
    extension = 'tsv'


class MetricsReportsJsonRenderer(MetricsReportsBaseRenderer):
Contributor:
i don't think we need yet another json renderer; we only need a way to download the existing json as a file (instead of opening in the browser)

how about accepting a forDownload query param in ElasticsearchListView (or even JSONAPIBaseView?) that toggles a Content-Disposition header? could work the same way for any format, reducing special-casing
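A minimal sketch of what that `forDownload` toggle could look like, assuming a DRF-style view; the mixin name is hypothetical and this code is not part of the PR:

```python
# Hypothetical sketch: a mixin that adds a Content-Disposition header whenever
# the request carries a `forDownload` query param, regardless of the renderer.
class DownloadToggleMixin:
    def finalize_response(self, request, response, *args, **kwargs):
        response = super().finalize_response(request, response, *args, **kwargs)
        if request.query_params.get('forDownload'):
            renderer = getattr(request, 'accepted_renderer', None)
            extension = getattr(renderer, 'format', 'txt')
            response['Content-Disposition'] = f'attachment; filename="report.{extension}"'
        return response
```

Mixed into ElasticsearchListView (or JSONAPIBaseView), any `?forDownload=true` request would then download in whatever format was negotiated, without a dedicated renderer per format.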

Contributor Author:
In considering this I think the user probably doesn't want meta and general pagination information that would come from this endpoint. It should be a pure list, like the tabular formats are. But still considering options here.

Contributor:
is that based on user feedback? (not sure where "it should be a pure list" came from, when talking about a non-tabular format like json)

total counts and pagination information seem like helpful info to include (especially if they can choose a page size, or if there are more than 10000), and easy to ignore if they don't need it

Contributor:
(could understand wanting to omit relationships, but i'd argue the effort would be better spent making ElasticsearchListView support sparse fieldsets (so the download could reflect selected columns from the frontend, implicitly omitting relationships))
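A rough sketch of how the export could honor a JSON:API sparse-fieldsets parameter; the helper name and resource-type string are illustrative, not from this PR:

```python
# Hypothetical helper: narrow the exported columns to a JSON:API
# `fields[<resource-type>]` selection, falling back to all columns.
def requested_fieldnames(request, all_fieldnames, resource_type='institution-users'):
    requested = request.query_params.get(f'fields[{resource_type}]')
    if not requested:
        return list(all_fieldnames)
    wanted = {name.strip() for name in requested.split(',')}
    return [name for name in all_fieldnames if name in wanted]
```

The renderer could then pass `csv_fieldnames` through this before writing the header row, so a download reflects the columns selected on the frontend.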

    media_type = 'application/json'
    format = 'json_file'
Contributor Author (@Johnetordoff, Oct 23, 2024):
Maybe change format param to fix collision with normal json format?

    extension = 'json'

    def default_serializer(self, obj):
        """Custom serializer to handle non-serializable objects like datetime."""
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()  # Convert datetime to ISO format string
        raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')

    def render(self, data, accepted_media_type=None, renderer_context=None):
        """Render the response as JSON format and trigger browser download as a binary file."""
        data = self.get_all_data(renderer_context['view'], renderer_context['request'])
        hits = data.hits
        if not hits:
            raise Http404('<h1>none found</h1>')

        serialized_hits = [hit.to_dict() for hit in hits]

        # Set response headers for file download
        response = renderer_context['response']
        filename = self.get_filename(renderer_context, self.extension)
        response['Content-Disposition'] = f'attachment; filename="{filename}"'

        # Use custom serializer for non-serializable types (like datetime)
        return json.dumps(serialized_hits, default=self.default_serializer, indent=4).encode('utf-8')
81 changes: 79 additions & 2 deletions api_tests/institutions/views/test_institution_user_metric_list.py
@@ -1,13 +1,14 @@
import datetime
import csv
import json
import datetime
from io import StringIO
from random import random
from urllib.parse import urlencode

import pytest
from waffle.testutils import override_flag

from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE
from api.base.settings.defaults import API_BASE, DEFAULT_ES_NULL_VALUE, USER_INSTITUTION_REPORT_FILENAME
import osf.features
from osf_tests.factories import (
    InstitutionFactory,
@@ -404,6 +405,82 @@ def test_paginate_reports(self, app, url, institutional_admin, institution, repo
        assert _resp.status_code == 200
        assert list(_user_ids(_resp)) == _expected_user_id_list

    @pytest.mark.parametrize('format_type, delimiter, content_type', [
        ('csv', ',', 'text/csv; charset=utf-8'),
        ('tsv', '\t', 'text/tab-separated-values; charset=utf-8'),
        ('json_file', None, 'application/json; charset=utf-8')
    ])
    def test_get_report_formats(self, app, url, institutional_admin, institution, format_type, delimiter, content_type):
        _report_factory(
            '2024-08',
            institution,
            user_id=f'u_orcomma',
            account_creation_date='2018-02',
            user_name=f'Jason Kelce',
            orcid_id='4444-3333-2222-1111',
            department_name='Center \t Greatest Ever',
            storage_byte_count=736662999298,
            embargoed_registration_count=1,
            published_preprint_count=1,
            public_registration_count=2,
            public_project_count=3,
            public_file_count=4,
            private_project_count=5,
            month_last_active='2018-02',
            month_last_login='2018-02',
        )

        resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth)
        assert resp.status_code == 200
        assert resp.headers['Content-Type'] == content_type

        current_date = datetime.datetime.now().strftime('%Y-%m')
        expected_filename = USER_INSTITUTION_REPORT_FILENAME.format(
            date_created=current_date,
            institution_id=institution._id,
            format_type='json' if format_type == 'json_file' else format_type
        )
        assert resp.headers['Content-Disposition'] == f'attachment; filename="{expected_filename}"'

        if format_type == 'json_file':
            # Validate JSON structure and content
            response_data = json.loads(resp.body.decode('utf-8'))
            expected_data = [
                {
                    'account_creation_date': '2018-02',
                    'department_name': 'Center \t Greatest Ever',
                    'embargoed_registration_count': 1,
                    'month_last_active': '2018-02',
                    'month_last_login': '2018-02',
                    'orcid_id': '4444-3333-2222-1111',
                    'private_project_count': 5,
                    'public_file_count': 4,
                    'public_project_count': 3,
                    'public_registration_count': 2,
                    'published_preprint_count': 1,
                    'storage_byte_count': 736662999298,
                    'user_name': 'Jason Kelce'
                }
            ]
            assert response_data == expected_data
        else:
            response_body = resp.text
            expected_response = [
                ['account_creation_date', 'department_name', 'embargoed_registration_count', 'month_last_active',
                 'month_last_login', 'orcid_id', 'private_projects', 'public_file_count', 'public_projects',
                 'public_registration_count', 'published_preprint_count', 'storage_byte_count', 'user_name'],
                ['2018-02', 'Center \t Greatest Ever', '1', '2018-02', '2018-02', '4444-3333-2222-1111', '5', '4', '3',
                 '2', '1', '736662999298', 'Jason Kelce'],
            ]

            if delimiter:
                with StringIO(response_body) as file:
                    reader = csv.reader(file, delimiter=delimiter)
                    response_rows = list(reader)
                    assert response_rows[0] == expected_response[0]
                    assert sorted(response_rows[1:]) == sorted(expected_response[1:])


def _user_ids(api_response):
    for _datum in api_response.json['data']:
        yield _datum['relationships']['user']['data']['id']