Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Harvester] parse iso8601 compliant timestamps #14

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
image: openknowledge/ckan-dev:2.10
services:
solr:
image: ckan/ckan-solr-dev:2.10
image: ckan/ckan-solr:2.10-solr9
postgres:
image: ckan/ckan-postgres-dev:2.10
env:
Expand Down
59 changes: 51 additions & 8 deletions ckanext/fairdatapoint/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,21 @@
# check for multiple-text fields in the schema
# All changes are © Stichting Health-RI and are licensed under the AGPLv3 license

from datetime import datetime
import re
import json
import logging

from ckanext.dcat.profiles import EuropeanDCATAP2Profile
from ckan.plugins import toolkit
from ckan import model
import json
from typing import Dict
import dateutil.parser as dateparser
from dateutil.parser import ParserError
from typing import Dict, List
from rdflib import URIRef

log = logging.getLogger(__name__)


def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
"""
Expand All @@ -31,19 +39,52 @@ def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
# Populate the declared schema fields, if they are present in the extras
for extra_dict in dataset_dict.get('extras', []):
field_key = extra_dict.get('key')
field_value = extra_dict.get('value')
if field_key in dataset_fields:
preset = dataset_fields[field_key]
if preset == "multiple_text" and extra_dict.get('value'):
dataset_dict[field_key] = json.loads(extra_dict.get('value'))
if preset == 'multiple_text' and field_value:
dataset_dict[field_key] = json.loads(field_value)
elif preset == 'date' and field_value:
dataset_dict[field_key] = convert_datetime_string(field_value)
else:
dataset_dict[field_key] = extra_dict.get('value')
dataset_dict[field_key] = field_value

# Remove the extras that have been populated into the declared schema fields
dataset_dict['extras'] = [d for d in dataset_dict['extras'] if d.get('key') not in dataset_fields]

return dataset_dict


def validate_tags(values_list: List[Dict]) -> List:
    """
    Validates tag strings to contain only allowed characters, replaces others with spaces

    Allowed characters are ASCII letters, digits, spaces, hyphens, underscores and dots.

    :param values_list: tag dictionaries, each holding the tag text under the 'name' key
    :return: a new list of tag dictionaries. A tag made up of allowed characters only is
        passed through as the very same dictionary; any other tag is replaced by a fresh
        ``{'name': ...}`` dictionary in which every disallowed character became a space
    """
    # Raw string: '\-' and '\.' are not valid string escapes, so a plain literal
    # triggers an "invalid escape sequence" warning on recent Python versions.
    illegal_pattern = re.compile(r'[^A-Za-z0-9\- _\.]')
    tags = []
    for item in values_list:
        tag_value = item['name']
        if illegal_pattern.search(tag_value):
            # Lazy %-style arguments: the message is only rendered if the record is emitted
            log.warning('Tag %s contains values other than alphanumeric characters, spaces, hyphens, '
                        'underscores or dots, they will be replaced with spaces', tag_value)
            # Only the cleaned name is kept for a rewritten tag
            tags.append({'name': illegal_pattern.sub(' ', tag_value)})
        else:
            tags.append(item)
    return tags


def convert_datetime_string(date_value: str) -> datetime:
    """
    Converts datestrings (e.g. '2023-10-06T10:12:55.614000+00:00') to datetime class instance

    :param date_value: the date string to parse
    :return: the parsed datetime. When the string can not be parsed, the failure is logged
        and the original string is returned unchanged, so the caller still gets a value
    """
    try:
        return dateparser.parse(date_value)
    except (ParserError, OverflowError):
        # dateutil raises ParserError for unrecognised formats and OverflowError for
        # numeric parts that are too large; both are handled the same best-effort way.
        log.error('A date field string value %r can not be parsed to a date', date_value)
        return date_value


class FAIRDataPointDCATAPProfile(EuropeanDCATAP2Profile):
"""
An RDF profile for FAIR data points
Expand All @@ -54,10 +95,12 @@ def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:

dataset_dict = _convert_extras_to_declared_schema_fields(dataset_dict)

dataset_dict['tags'] = validate_tags(dataset_dict['tags'])

# Example of adding a field
dataset_dict['extras'].append({'key': 'hello',
'value': "Hello from the FAIR data point profile. Use this function to do "
"FAIR data point specific stuff during the import stage"})
# dataset_dict['extras'].append({'key': 'hello',
# 'value': 'Hello from the FAIR data point profile. Use this function to do '
# 'FAIR data point specific stuff during the import stage'})

return dataset_dict

Expand Down
48 changes: 22 additions & 26 deletions ckanext/fairdatapoint/tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
"""

import pytest
from datetime import datetime
from dateutil.tz import tzutc
from pathlib import Path
from unittest.mock import patch
from rdflib import Graph
Expand Down Expand Up @@ -59,10 +61,7 @@ def test_fdp_record_converter_dataset_dict(self):
record=data)
expected_dataset = {"extras":
[
{"key": "uri", "value": "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"},
{"key": "hello",
"value": "Hello from the FAIR data point profile. Use this function to do FAIR data point "
"specific stuff during the import stage"}
{"key": "uri", "value": "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"}
],
"resources": [],
"title": "COVID-NL cohort MUMC+",
Expand All @@ -73,8 +72,8 @@ def test_fdp_record_converter_dataset_dict(self):
"has_version": ["https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"],
"contact_uri": "https://orcid.org/0000-0002-4348-707X",
"publisher_uri": "https://opal.health-ri.nl/pub/",
"temporal_start": "2020-01-01",
"temporal_end": "2025-12-31"}
"temporal_start": datetime(2020, 1, 1, 0, 0),
"temporal_end": datetime(2025, 12, 31, 0, 0)}
assert actual_dataset == expected_dataset

def test_fdp_record_converter_catalog_dict(self):
Expand All @@ -84,25 +83,22 @@ def test_fdp_record_converter_catalog_dict(self):
guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
record=data)
expected = {
"access_rights": "https://fair.healthinformationportal.eu/catalog/"
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
"conforms_to": ["https://fair.healthinformationportal.eu/profile/"
"a0949e72-4466-4d53-8900-9436d1049a4b"],
"extras": [{"key": "uri",
"value": "https://fair.healthinformationportal.eu/catalog/"
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"},
{"key": "hello",
"value": "Hello from the FAIR data point profile. Use this "
"function to do FAIR data point specific stuff during "
"the import stage"}],
"has_version": ["1.0"],
"issued": "2023-10-06T10:12:55.614000+00:00",
"language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
"license_id": "",
"modified": "2023-10-06T10:12:55.614000+00:00",
"publisher_name": "Automatic",
"resources": [],
"tags": [],
"title": "Slovenia National Node"
"access_rights": "https://fair.healthinformationportal.eu/catalog/"
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
"conforms_to": ["https://fair.healthinformationportal.eu/profile/"
"a0949e72-4466-4d53-8900-9436d1049a4b"],
"extras": [{"key": "uri",
"value": "https://fair.healthinformationportal.eu/catalog/"
"1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"},
],
"has_version": ["1.0"],
"issued": datetime(2023, 10, 6, 10, 12, 55, 614000, tzinfo=tzutc()),
"language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
"license_id": "",
"modified": datetime(2023, 10, 6, 10, 12, 55, 614000, tzinfo=tzutc()),
"publisher_name": "Automatic",
"resources": [],
"tags": [],
"title": "Slovenia National Node"
}
assert actual == expected
Loading