-
Notifications
You must be signed in to change notification settings - Fork 330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ENG-6283] User table allows CSV and TSV downloads #10782
Open
Johnetordoff
wants to merge
6
commits into
CenterForOpenScience:feature/insti-dash-improv
Choose a base branch
from
Johnetordoff:user-table-downloads
base: feature/insti-dash-improv
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 3 commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
8cb64b5
allow csv and tsv downloads from the institutional user endpoint
e2f7e47
add formatted filename for user institutional dashboard report CSV/TS…
62907cf
include all results in downloaded data beyond default page size and a…
09370fe
add better tests for ignoring pagination on downloadable data
4472a44
incorporate new renderer improvements into old views and make file fi…
c1dac13
add view name to file download
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,8 @@ | ||
import csv | ||
import io | ||
import json | ||
from api.base.settings.defaults import USER_INSTITUTION_REPORT_FILENAME, MAX_SIZE_OF_ES_QUERY | ||
import datetime | ||
|
||
from django.http import Http404 | ||
|
||
|
@@ -16,16 +19,25 @@ def csv_fieldname_sortkey(fieldname): | |
|
||
|
||
def get_nested_keys(report_attrs):
    """Recursively yield all dotted key paths found in the report attributes.

    Dicts are walked depth-first with keys at each level ordered by
    ``csv_fieldname_sortkey``; a nested dict contributes joined paths like
    ``parent.child``.  Lists are flattened by recursing into each item.
    Any other value (including a plain scalar passed at the top level)
    terminates the recursion, so e.g. a string input yields nothing.

    Note: for a list of similarly-shaped dicts this yields one pass of key
    paths per item, so duplicates are possible -- callers needing unique
    fieldnames must de-duplicate.
    """
    if isinstance(report_attrs, dict):
        for attr_key in sorted(report_attrs.keys(), key=csv_fieldname_sortkey):
            attr_value = report_attrs[attr_key]
            if isinstance(attr_value, dict):
                for subkey in get_nested_keys(attr_value):
                    yield f'{attr_key}.{subkey}'
            else:
                yield attr_key
    elif isinstance(report_attrs, list):
        for item in report_attrs:
            yield from get_nested_keys(item)
|
||
|
||
def get_key_value(nested_key, report_attrs): | ||
report_attrs = report_attrs.to_dict() if hasattr(report_attrs, 'to_dict') else report_attrs | ||
(key, _, next_nested_key) = nested_key.partition('.') | ||
attr_value = report_attrs.get(key, {}) | ||
return ( | ||
|
@@ -42,32 +54,93 @@ def get_csv_row(keys_list, report_attrs): | |
] | ||
|
||
|
||
class MetricsReportsBaseRenderer(renderers.BaseRenderer):
    """Shared base for downloadable metrics-report renderers (CSV/TSV/JSON).

    Subclasses must define ``media_type``, ``format``, ``CSV_DIALECT`` and
    ``extension``.  ``render`` bypasses pagination, fetches the full
    elasticsearch result set, and returns it as a tabular file attachment.
    """
    media_type: str
    format: str
    CSV_DIALECT: csv.Dialect
    extension: str

    def get_filename(self, renderer_context: dict, format_type: str) -> str:
        """Build the download filename from the institution id and current month.

        Raises NotImplementedError when no view is available in the renderer
        context (there is then no institution to name the file after).
        """
        if renderer_context and 'view' in renderer_context:
            current_date = datetime.datetime.now().strftime('%Y-%m')  # 'YYYY-MM'
            return USER_INSTITUTION_REPORT_FILENAME.format(
                date_created=current_date,
                institution_id=renderer_context['view'].kwargs['institution_id'],
                format_type=format_type,
            )
        raise NotImplementedError('Missing format filename')

    def get_all_data(self, view, request):
        """Bypass pagination by fetching every result in a single ES query."""
        view.pagination_class = None  # disable DRF pagination for downloads
        return view.get_default_search().extra(size=MAX_SIZE_OF_ES_QUERY).execute()

    def render(self, data: dict, accepted_media_type: str = None, renderer_context: dict = None) -> str:
        """Render the full dataset as CSV/TSV and mark the response as an attachment."""
        data = self.get_all_data(renderer_context['view'], renderer_context['request'])
        hits = data.hits
        if not hits:
            raise Http404('<h1>none found</h1>')

        # Column headers come from the first hit; assumes every hit shares
        # the same schema -- TODO confirm (fields appearing only in later
        # hits would be silently dropped).
        first_row = hits[0].to_dict()
        csv_fieldnames = list(first_row)
        csv_filecontent = io.StringIO(newline='')
        csv_writer = csv.writer(csv_filecontent, dialect=self.CSV_DIALECT)
        csv_writer.writerow(csv_fieldnames)

        # Write each hit as one row, in the header's column order.
        for hit in hits:
            csv_writer.writerow(get_csv_row(csv_fieldnames, hit.to_dict()))

        # Attachment header triggers a browser download instead of inline
        # display.  Use the computed filename (the scraped diff showed a
        # placeholder here, but `filename` is clearly built for this header).
        response = renderer_context['response']
        filename = self.get_filename(renderer_context, self.extension)
        response['Content-Disposition'] = f'attachment; filename="{filename}"'

        return csv_filecontent.getvalue()
|
||
|
||
class MetricsReportsCsvRenderer(MetricsReportsBaseRenderer):
    """Comma-separated (Excel dialect) download renderer."""
    media_type = 'text/csv'
    format = 'csv'
    CSV_DIALECT = csv.excel
    extension = 'csv'
|
||
|
||
class MetricsReportsTsvRenderer(MetricsReportsBaseRenderer):
    """Tab-separated (Excel tab dialect) download renderer."""
    media_type = 'text/tab-separated-values'
    format = 'tsv'
    CSV_DIALECT = csv.excel_tab
    extension = 'tsv'
|
||
|
||
class MetricsReportsJsonRenderer(MetricsReportsBaseRenderer):
    """JSON download renderer: serializes all hits and forces a file download."""
    media_type = 'application/json'
    # NOTE(review): 'json_file' is distinct from DRF's normal 'json' format --
    # reviewer suggested confirming this avoids a format-name collision.
    format = 'json_file'
    extension = 'json'

    def default_serializer(self, obj):
        """``json.dumps`` fallback: ISO-format datetimes, reject anything else."""
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()  # e.g. '2024-01-31T12:00:00'
        raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')

    def render(self, data, accepted_media_type=None, renderer_context=None):
        """Render every hit as pretty-printed JSON bytes with an attachment header."""
        data = self.get_all_data(renderer_context['view'], renderer_context['request'])
        hits = data.hits
        if not hits:
            raise Http404('<h1>none found</h1>')

        serialized_hits = [hit.to_dict() for hit in hits]

        # Attachment header triggers a browser download instead of inline
        # display.  Use the computed filename (the scraped diff showed a
        # placeholder here, but `filename` is clearly built for this header).
        response = renderer_context['response']
        filename = self.get_filename(renderer_context, self.extension)
        response['Content-Disposition'] = f'attachment; filename="{filename}"'

        # Custom `default=` handles datetimes; bytes return matches the
        # binary-download intent described in the method docstring.
        return json.dumps(serialized_hits, default=self.default_serializer, indent=4).encode('utf-8')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i don't think we need yet another json renderer; we only need a way to download the existing json as a file (instead of opening in the browser)
how about accepting a
forDownload
query param inElasticsearchListView
(or evenJSONAPIBaseView
?) that toggles aContent-Disposition
header? could work the same way for any format, reducing special-casing.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In considering this I think the user probably doesn't want
meta
and general pagination information that would come from this endpoint. It should be a pure list, like the tabular formats are. But still considering options here.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is that based on user feedback? (not sure where "it should be a pure list" came from, when talking about a non-tabular format like json)
total counts and pagination information seem like helpful info to include (especially if they can choose a page size, or if there are more than 10000), and easy to ignore if they don't need it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(could understand wanting to omit relationships, but i'd argue the effort would be better spent making
ElasticsearchListView
support sparse fieldsets (so the download could reflect selected columns from the frontend, implicitly omitting relationships))