Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6283] User table allows CSV and TSV downloads #10782

Open
wants to merge 6 commits into
base: feature/insti-dash-improv
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/base/settings/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@

MAX_SIZE_OF_ES_QUERY = 10000
DEFAULT_ES_NULL_VALUE = 'N/A'
REPORT_FILENAME_FORMAT = 'osf_report_{date_created}.{format_type}'
Johnetordoff marked this conversation as resolved.
Show resolved Hide resolved

CI_ENV = False

Expand Down
7 changes: 7 additions & 0 deletions api/institutions/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
)
from api.base.settings import DEFAULT_ES_NULL_VALUE
from api.metrics.permissions import IsInstitutionalMetricsUser
from api.metrics.renderers import MetricsReportsCsvRenderer, MetricsReportsTsvRenderer, MetricsReportsJsonRenderer
from api.nodes.serializers import NodeSerializer
from api.nodes.filters import NodesFilterMixin
from api.users.serializers import UserSerializer
Expand Down Expand Up @@ -553,6 +554,12 @@ class _NewInstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView):

view_category = 'institutions'
view_name = 'institution-user-metrics'
renderer_classes = (
*api_settings.DEFAULT_RENDERER_CLASSES,
MetricsReportsCsvRenderer,
MetricsReportsTsvRenderer,
MetricsReportsJsonRenderer,
)

serializer_class = NewInstitutionUserMetricsSerializer

Expand Down
121 changes: 95 additions & 26 deletions api/metrics/renderers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import csv
import io
import json
from api.base.settings.defaults import REPORT_FILENAME_FORMAT, MAX_SIZE_OF_ES_QUERY
import datetime

from django.http import Http404

Expand All @@ -16,16 +19,21 @@ def csv_fieldname_sortkey(fieldname):


def get_nested_keys(report_attrs):
for attr_key in sorted(report_attrs.keys(), key=csv_fieldname_sortkey):
attr_value = report_attrs[attr_key]
if isinstance(attr_value, dict):
for subkey in get_nested_keys(attr_value):
yield f'{attr_key}.{subkey}'
else:
yield attr_key
if isinstance(report_attrs, dict):
for attr_key in sorted(report_attrs.keys(), key=csv_fieldname_sortkey):
attr_value = report_attrs[attr_key]
if isinstance(attr_value, dict):
for subkey in get_nested_keys(attr_value):
yield f'{attr_key}.{subkey}'
else:
yield attr_key
elif isinstance(report_attrs, list):
for item in report_attrs:
yield from get_nested_keys(item)


def get_key_value(nested_key, report_attrs):
report_attrs = report_attrs.to_dict() if hasattr(report_attrs, 'to_dict') else report_attrs
(key, _, next_nested_key) = nested_key.partition('.')
attr_value = report_attrs.get(key, {})
return (
Expand All @@ -42,32 +50,93 @@ def get_csv_row(keys_list, report_attrs):
]


class MetricsReportsCsvRenderer(renderers.BaseRenderer):
media_type = 'text/csv'
format = 'csv'
CSV_DIALECT = csv.excel
class MetricsReportsBaseRenderer(renderers.BaseRenderer):
"""
This renderer should override the format parameter to send a Content-Disposition attachment of the file data via
the browser.
"""
media_type: str
format: str
CSV_DIALECT: csv.Dialect
extension: str

def get_filename(self, renderer_context: dict, format_type: str) -> str:
"""Generate the filename for the file based on format_type REPORT_FILENAME_FORMAT and current date."""
if renderer_context and 'view' in renderer_context:
current_date = datetime.datetime.now().strftime('%Y-%m')
return REPORT_FILENAME_FORMAT.format(
date_created=current_date,
format_type=format_type,
)
else:
raise NotImplementedError('Missing format filename')

def render(self, json_response, accepted_media_type=None, renderer_context=None):
serialized_reports = (
jsonapi_resource['attributes']
for jsonapi_resource in json_response['data']
)
try:
first_row = next(serialized_reports)
except StopIteration:
def render(self, data: dict, accepted_media_type: str = None, renderer_context: dict = None) -> str:
"""Render the full dataset as CSV or TSV format."""
view = renderer_context['view']
view.pagination_class = None # Disable pagination
data = view.get_default_search().extra(size=MAX_SIZE_OF_ES_QUERY).execute()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be good to keep a logical separation of concerns -- a renderer should work with the data it's given; this is logic that belongs in the view

(tho might be even better to skip the special casing and let the client set page[size]=10000)

hits = data.hits
if not hits:
raise Http404('<h1>none found</h1>')
csv_fieldnames = list(get_nested_keys(first_row))

first_row = hits[0].to_dict()
csv_fieldnames = list(first_row)
csv_filecontent = io.StringIO(newline='')
csv_writer = csv.writer(csv_filecontent, dialect=self.CSV_DIALECT)
csv_writer.writerow(csv_fieldnames)
for serialized_report in (first_row, *serialized_reports):
csv_writer.writerow(
get_csv_row(csv_fieldnames, serialized_report),
)

for hit in hits:
csv_writer.writerow(get_csv_row(csv_fieldnames, hit.to_dict()))

response = renderer_context['response']
filename = self.get_filename(renderer_context, self.extension)
response['Content-Disposition'] = f'attachment; filename="{filename}"'

return csv_filecontent.getvalue()


class MetricsReportsTsvRenderer(MetricsReportsCsvRenderer):
format = 'tsv'
class MetricsReportsCsvRenderer(MetricsReportsBaseRenderer):
media_type = 'text/csv'
format = 'csv'
CSV_DIALECT = csv.excel
extension = 'csv'


class MetricsReportsTsvRenderer(MetricsReportsBaseRenderer):
media_type = 'text/tab-separated-values'
format = 'tsv'
CSV_DIALECT = csv.excel_tab
extension = 'tsv'


class MetricsReportsJsonRenderer(MetricsReportsBaseRenderer):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't think we need yet another json renderer; we only need a way to download the existing json as a file (instead of opening in the browser)

how about accepting a forDownload query param in ElasticsearchListView (or even JSONAPIBaseView?) that toggles a Content-Disposition header? could work the same way for any format, reducing special-casing

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In considering this I think the user probably doesn't want meta and general pagination information that would come from this endpoint. It should be a pure list, like the tabular formats are. But still considering options here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is that based on user feedback? (not sure where "it should be a pure list" came from, when talking about a non-tabular format like json)

total counts and pagination information seem like helpful info to include (especially if they can choose a page size, or if there are more than 10000), and easy to ignore if they don't need it

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(could understand wanting to omit relationships, but i'd argue the effort would be better spent making ElasticsearchListView support sparse fieldsets (so the download could reflect selected columns from the frontend, implicitly omitting relationships))

media_type = 'application/json'
format = 'json_file'
Copy link
Contributor Author

@Johnetordoff Johnetordoff Oct 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe change format param to fix collision with normal json format?

extension = 'json'

def default_serializer(self, obj):
"""Custom serializer to handle non-serializable objects like datetime."""
if isinstance(obj, datetime.datetime):
return obj.isoformat() # Convert datetime to ISO format string
raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')

def render(self, data, accepted_media_type=None, renderer_context=None):
"""Render the response as JSON format and trigger browser download as a binary file."""
view = renderer_context['view']
view.pagination_class = None # Disable pagination
data = view.get_default_search().extra(size=MAX_SIZE_OF_ES_QUERY).execute()

hits = data.hits
if not hits:
raise Http404('<h1>none found</h1>')

serialized_hits = [hit.to_dict() for hit in hits]

# Set response headers for file download
response = renderer_context['response']
filename = self.get_filename(renderer_context, self.extension)
response['Content-Disposition'] = f'attachment; filename="{filename}"'

# Use custom serializer for non-serializable types (like datetime)
return json.dumps(serialized_hits, default=self.default_serializer, indent=4).encode('utf-8')
60 changes: 37 additions & 23 deletions api/metrics/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,43 +320,56 @@ class RecentReportList(JSONAPIBaseView):
MetricsReportsTsvRenderer,
)

def get(self, request, *args, report_name):
def get_default_search(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is getting beyond the scope of the ticket, but it'd be nice for RecentReportList to inherit ElasticsearchListView, now that it exists (instead of only duck-typing get_default_search) -- could be fuller-featured and more consistent, but might also rabbit-hole, so don't bother unless you want to

try:
report_class = VIEWABLE_REPORTS[report_name]
report_class = VIEWABLE_REPORTS[self.kwargs['report_name']]
except KeyError:
return Response(
{
'errors': [{
'title': 'unknown report name',
'detail': f'unknown report: "{report_name}"',
}],
},
status=404,
)
return None

is_daily = issubclass(report_class, reports.DailyReport)
days_back = request.GET.get('days_back', self.DEFAULT_DAYS_BACK if is_daily else None)
is_monthly = issubclass(report_class, reports.MonthlyReport)

request = self.get_renderer_context()['request']
days_back = request.GET.get('days_back', self.DEFAULT_DAYS_BACK if is_daily else None)

if is_daily:
serializer_class = DailyReportSerializer
range_field_name = 'report_date'
elif is_monthly:
serializer_class = MonthlyReportSerializer
range_field_name = 'report_yearmonth'
else:
raise ValueError(f'report class must subclass DailyReport or MonthlyReport: {report_class}')

range_filter = parse_date_range(request.GET, is_monthly=is_monthly)
search_recent = (
report_class.search()
.filter('range', **{range_field_name: range_filter})
.sort(range_field_name)
[:self.MAX_COUNT]
search_recent = report_class.search().filter(
'range',
**{range_field_name: range_filter},
).sort(range_field_name)[:self.MAX_COUNT]

if is_daily and days_back:
search_recent = search_recent.filter('range', report_date={'gte': f'now/d-{days_back}d'})

return search_recent

def get(self, request, *args, report_name):
search_response = self.get_default_search()

if search_response is None:
return Response(
{
'errors': [{
'title': 'unknown report name',
'detail': f'unknown report: "{report_name}"',
}],
},
status=404,
)

report_class = VIEWABLE_REPORTS[report_name]
serializer_class = (
DailyReportSerializer if issubclass(report_class, reports.DailyReport)
else MonthlyReportSerializer
)
if days_back:
search_recent.filter('range', report_date={'gte': f'now/d-{days_back}d'})

report_date_range = parse_date_range(request.GET)
search_response = search_recent.execute()
serializer = serializer_class(
search_response,
many=True,
Expand All @@ -365,6 +378,7 @@ def get(self, request, *args, report_name):
accepted_format = request.accepted_renderer.format
response_headers = {}
if accepted_format in ('tsv', 'csv'):
report_date_range = parse_date_range(request.GET)
from_date = report_date_range['gte']
until_date = report_date_range['lte']
filename = (
Expand Down
Loading
Loading