Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve harvester and fix issues; NB! Changes to CKAN and Solr scheme are required! #21

Merged
merged 4 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 32 additions & 31 deletions ckanext/fairdatapoint/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# check for multiple-text fields in the schema
# All changes are © Stichting Health-RI and are licensed under the AGPLv3 license

from datetime import datetime
from datetime import datetime, timezone
import re
import json
import logging
Expand All @@ -15,7 +15,7 @@
from dateutil.parser import ParserError
from json import JSONDecodeError
from typing import Dict, List
from rdflib import URIRef, Namespace
from rdflib import URIRef, Namespace, DCAT

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -85,9 +85,11 @@ def convert_datetime_string(date_value: str) -> datetime:
Converts date strings (e.g. '2023-10-06T10:12:55.614000+00:00') to a datetime class instance
"""
try:
date_value = dateparser.parse(date_value)
date_value = dateparser.parse(date_value, yearfirst=True)
if date_value.tzinfo is not None:
date_value = date_value.astimezone(timezone.utc)
except ParserError:
log.error('A date field string value can not be parsed to a date')
log.error(f'A date field string value {date_value} can not be parsed to a date')
return date_value


Expand All @@ -98,43 +100,42 @@ class FAIRDataPointDCATAPProfile(EuropeanDCATAP2Profile):

def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
super(FAIRDataPointDCATAPProfile, self).parse_dataset(dataset_dict, dataset_ref)
dataset_dict = self._parse_contact_point(dataset_dict, dataset_ref)

dataset_dict = _convert_extras_to_declared_schema_fields(dataset_dict)

dataset_dict['tags'] = validate_tags(dataset_dict['tags'])

return dataset_dict

def _contact_details(self, subject, predicate):
def _contact_point_details(self, subject, predicate) -> List:
"""
Overrides RDFProfile._contact_details so uri is taken from hasUID for VCard
"""
contact = {}
# todo fix for multiple
contact_list = []

for agent in self.g.objects(subject, predicate):
contact = {
'contact_uri': (str(agent) if isinstance(agent, URIRef)
else self._get_vcard_property_value(agent, VCARD.hasUID)),
'contact_name': self._get_vcard_property_value(agent, VCARD.hasFN, VCARD.fn),
'contact_email': self._without_mailto(self._get_vcard_property_value(agent, VCARD.hasEmail))}

contact['uri'] = (str(agent) if isinstance(agent, URIRef)
else self._get_vcard_property_value(agent, VCARD.hasUID))

contact['name'] = self._get_vcard_property_value(agent, VCARD.hasFN, VCARD.fn)

contact['email'] = self._without_mailto(self._get_vcard_property_value(agent, VCARD.hasEmail))

return contact

# def graph_from_dataset(self, dataset_dict, dataset_ref):
#
# g = self.g
#
# spatial_text = self._get_dataset_value(dataset_dict, 'hello')
#
# if spatial_uri:
# spatial_ref = URIRef(spatial_uri)
# else:
# spatial_ref = BNode()
#
# if spatial_text:
# g.add((dataset_ref, DCT.spatial, spatial_ref))
# g.add((spatial_ref, RDF.type, DCT.Location))
# g.add((spatial_ref, RDFS.label, Literal(spatial_text)))
contact_list.append(contact)

return contact_list

def _parse_contact_point(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
"""
The ckan-dcat extension assumes there can be just one contact point; if the source provides a list, only the
last value is taken. Besides, it never resolves the uri from a VCard object. This function parses DCAT.contactPoint
information into a list of `contact_point` dictionaries and replaces the ckan-dcat values
"""
contact_point = self._contact_point_details(subject=dataset_ref, predicate=DCAT.contactPoint)
dcat_profile_contact_fields = ['contact_name', 'contact_email', 'contact_uri']
if contact_point:
dataset_dict['extras'].append({'key': 'contact_point', 'value': contact_point})
# Remove the extras contact_ fields if they were parsed by dcat extension
dataset_dict['extras'] = \
[item for item in dataset_dict['extras'] if item.get('key') not in dcat_profile_contact_fields]
return dataset_dict
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix v: <http://www.w3.org/2006/vcard/ns#> .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
a dcat:Resource, dcat:Dataset;
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
dcterms:title "Example";
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
dcterms:description "This is an example description.";
dcat:contactPoint [ a v:VCard ;
v:fn "Marc Bonten" ;
v:hasUID <https://orcid.org/0000-0002-9095-9201> ;
v:hasEmail <mailto:[email protected]> ] ,
[ a v:VCard ;
v:fn "Frits Rosendaal" ;
v:hasUID <https://orcid.org/0000-0003-2558-7496> ;
v:hasEmail <mailto:[email protected]> ] .
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
a dcat:Resource, dcat:Dataset;
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
dcterms:title "Example";
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
dcterms:description "This is an example description.";
dcat:contactPoint <https://orcid.org/0000-0002-9095-9201>, <https://orcid.org/0000-0003-2558-7496> .
15 changes: 15 additions & 0 deletions ckanext/fairdatapoint/tests/test_data/contact_point_url.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
a dcat:Resource, dcat:Dataset;
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
dcterms:title "Example";
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
dcterms:description "This is an example description.";
dcat:contactPoint <https://orcid.org/0000-0002-9095-9201> .
19 changes: 19 additions & 0 deletions ckanext/fairdatapoint/tests/test_data/contact_point_vcard.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix ldp: <http://www.w3.org/ns/ldp#> .
@prefix v: <http://www.w3.org/2006/vcard/ns#> .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25>
a dcat:Resource, dcat:Dataset;
<http://www.w3.org/2000/01/rdf-schema#label> "Example";
dcterms:title "Example";
<https://w3id.org/fdp/fdp-o#metadataIssued> "2023-09-05T12:00:36.276171042Z"^^xsd:dateTime;
<https://w3id.org/fdp/fdp-o#metadataModified> "2024-05-02T13:01:35.716385359Z"^^xsd:dateTime;
dcterms:license <http://rdflicense.appspot.com/rdflicense/cc-by-nc-nd3.0>;
dcterms:description "This is an example description.";
dcat:contactPoint [ a v:Kind ;
v:fn "Marc Bonten" ;
v:hasUID <https://orcid.org/0000-0002-9095-9201> ;
v:hasEmail <mailto:[email protected]> ] .
Loading
Loading