diff --git a/ckanext/fairdatapoint/harvesters/civity_harvester.py b/ckanext/fairdatapoint/harvesters/civity_harvester.py index 206aec9..cc8cca3 100644 --- a/ckanext/fairdatapoint/harvesters/civity_harvester.py +++ b/ckanext/fairdatapoint/harvesters/civity_harvester.py @@ -359,7 +359,7 @@ def import_stage(self, harvest_object): # Get the last harvested object (if any) previous_object = model.Session.query(HarvestObject) \ .filter(HarvestObject.guid == harvest_object.guid) \ - .filter(HarvestObject.current is True) \ + .filter(HarvestObject.current == True) \ .first() # Flag previous object as not current anymore @@ -427,7 +427,7 @@ def _get_guids_to_package_ids_from_database(harvest_job): :return: """ query = model.Session.query(HarvestObject.guid, HarvestObject.package_id). \ - filter(HarvestObject.current is True). \ + filter(HarvestObject.current == True). \ filter(HarvestObject.harvest_source_id == harvest_job.source.id) guid_to_package_id = {} diff --git a/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py b/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py index 8850928..868925b 100644 --- a/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py +++ b/ckanext/fairdatapoint/harvesters/domain/fair_data_point_record_provider.py @@ -11,7 +11,9 @@ from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier from ckanext.fairdatapoint.harvesters.domain.fair_data_point import FairDataPoint -from rdflib import Namespace, URIRef, Literal, DCAT, DCTERMS, Graph, RDF +from requests import JSONDecodeError, HTTPError + +from rdflib import Namespace, URIRef, Literal, DCAT, DCTERMS, Graph, RDF, BNode from rdflib.term import Node from typing import Dict, Iterable, Union @@ -84,10 +86,14 @@ def get_record_by_id(self, guid: str) -> str: subject_uri = URIRef(subject_url) + self._remove_fdp_defaults(g, subject_uri) + # Add information from distribution to graph for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution): distribution_g = self.fair_data_point.get_graph(distribution_uri) + self._remove_fdp_defaults(g, distribution_uri) + for predicate in [ DCTERMS.description, DCTERMS.format, @@ -100,18 +106,32 @@ def get_record_by_id(self, guid: str) -> str: # Look-up contact information for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint): - if 'orcid' in contact_point_uri: - orcid_response = requests.get(str(contact_point_uri) + '/public-record.json') - json_orcid_response = orcid_response.json() - name = json_orcid_response['displayName'] - name_literal = Literal(name) - g.add((subject_uri, VCARD.fn, name_literal)) - # TODO add original Orcid URL in a field + if isinstance(contact_point_uri, URIRef): + self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri) result = g.serialize(format='ttl') return result + @staticmethod + def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRef): + """ + Replaces contact point URI with a VCard + """ + g.remove((subject_uri, DCAT.contactPoint, contact_point_uri)) + vcard_node = BNode() + g.add((subject_uri, DCAT.contactPoint, vcard_node)) + g.add((vcard_node, RDF.type, VCARD.Kind)) + g.add((vcard_node, VCARD.hasUID, contact_point_uri)) + if 'orcid' in str(contact_point_uri): + try: + orcid_response = requests.get(str(contact_point_uri).rstrip('/') + '/public-record.json') + json_orcid_response = orcid_response.json() + name = json_orcid_response['displayName'] + g.add((vcard_node, VCARD.fn, Literal(name))) + except (JSONDecodeError, HTTPError) as e: + log.error(f'Failed to get data from ORCID for {contact_point_uri}: {e}') + @staticmethod def get_values(graph: Graph, subject: Union[str, URIRef, Node], @@ -121,3 +141,11 @@ def get_values(graph: Graph, for value in graph.objects(subject=subject_uri, predicate=predicate_uri): yield value + + @staticmethod + def _remove_fdp_defaults(g, subject_uri): + for (s, p, o) in g.triples((subject_uri, DCTERMS.accessRights, None)): + access_rights_default = URIRef(f'{subject_uri}#accessRights') + if o == access_rights_default: + g.remove((subject_uri, DCTERMS.accessRights, o)) + g.remove((access_rights_default, None, None)) diff --git a/ckanext/fairdatapoint/profiles.py b/ckanext/fairdatapoint/profiles.py index 20387d6..61f57db 100644 --- a/ckanext/fairdatapoint/profiles.py +++ b/ckanext/fairdatapoint/profiles.py @@ -15,10 +15,12 @@ from dateutil.parser import ParserError from json import JSONDecodeError from typing import Dict, List -from rdflib import URIRef +from rdflib import URIRef, Namespace log = logging.getLogger(__name__) +VCARD = Namespace("http://www.w3.org/2006/vcard/ns#") + def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict: """ @@ -101,13 +103,26 @@ def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict: dataset_dict['tags'] = validate_tags(dataset_dict['tags']) - # Example of adding a field - # dataset_dict['extras'].append({'key': 'hello', - # 'value': 'Hello from the FAIR data point profile. Use this function to do ' - # 'FAIR data point specific stuff during the import stage'}) - return dataset_dict + def _contact_details(self, subject, predicate): + """ + Overrides RDFProfile._contact_details so uri is taken from hasUID for VCard + """ + contact = {} + # todo fix for multiple + + for agent in self.g.objects(subject, predicate): + + contact['uri'] = (str(agent) if isinstance(agent, URIRef) + else self._get_vcard_property_value(agent, VCARD.hasUID)) + + contact['name'] = self._get_vcard_property_value(agent, VCARD.hasFN, VCARD.fn) + + contact['email'] = self._without_mailto(self._get_vcard_property_value(agent, VCARD.hasEmail)) + + return contact + # def graph_from_dataset(self, dataset_dict, dataset_ref): # # g = self.g diff --git a/ckanext/fairdatapoint/tests/test_data/Project_27866022694497978_out.ttl b/ckanext/fairdatapoint/tests/test_data/Project_27866022694497978_out.ttl index 1cd614d..d2cceb1 100644 --- a/ckanext/fairdatapoint/tests/test_data/Project_27866022694497978_out.ttl +++ b/ckanext/fairdatapoint/tests/test_data/Project_27866022694497978_out.ttl @@ -16,5 +16,6 @@ dcat:endDate "2025-12-31"^^xsd:date ; dcat:startDate "2020-01-01"^^xsd:date ] ; dcterms:title "COVID-NL cohort MUMC+"@en ; - v:fn "N.K. De Vries" ; - dcat:contactPoint . \ No newline at end of file + dcat:contactPoint [ a v:Kind ; + v:fn "N.K. De Vries" ; + v:hasUID ] . \ No newline at end of file diff --git a/ckanext/fairdatapoint/tests/test_data/dataset_898ca4b8-197b-4d40-bc81-d9cd88197670.ttl b/ckanext/fairdatapoint/tests/test_data/dataset_898ca4b8-197b-4d40-bc81-d9cd88197670.ttl index 4394e58..727a2a4 100644 --- a/ckanext/fairdatapoint/tests/test_data/dataset_898ca4b8-197b-4d40-bc81-d9cd88197670.ttl +++ b/ckanext/fairdatapoint/tests/test_data/dataset_898ca4b8-197b-4d40-bc81-d9cd88197670.ttl @@ -7,6 +7,7 @@ @prefix prov: . @prefix rdfs: . @prefix xsd: . +@prefix ns3: . a ldp:DirectContainer ; dcterms:title "Distributions" ; @@ -16,7 +17,6 @@ a dcat:Dataset, dcat:Resource ; rdfs:label "Slovenian income, poverty and social exclusion indicators" ; - dcterms:accessRights ; dcterms:conformsTo ; dcterms:creator "Statistični urad Republike Slovenije"@en ; dcterms:description "The purpose of data collection is to show the quality of life in Slovenia in view of allocation of disposable income among households, relative poverty and social exclusion for different socio-economic groups of persons and households and to highlight which groups of population are relatively worse off than the rest of the population and are thus more vulnerable to poverty, material deprivation and unemployment. One of the main purposes is also collecting data on health (disability, unmet needs, etc). {\"Topics\": \"Self-perceived health/morbidity, Disability, Wellbeing\", \"Data collection period\": \"2005-01-01 2021-12-31\", \"Funding\": \"State Budget\", \"Geo coverage\": \"Nuts 3\", \"Target population\": \"General population\", \"Age range from\": \"16\", \"Age range to\": \"100\", \"Updating periodicity\": \"Annually\", \"Sample size\": \"\", \"Personal identifier\": \"National identifier\", \"Level of aggregation\": \"Individual\", \"Linkage possible\": \"Only to some\", \"Permanent identifier of the data source\": \"\", \"Regulations for data sharing\": \"\"}"@en ; @@ -36,7 +36,8 @@ dcterms:title "Slovenian income, poverty and social exclusion indicators"@en ; ns2:SIO_000628 , ; - dcat:contactPoint ; + dcat:contactPoint [ a ns3:Kind ; + ns3:hasUID ] ; dcat:keyword "Self-perceived health, poverty"@en ; dcat:landingPage ; dcat:theme , @@ -52,9 +53,6 @@ ns1:metadataIssued "2023-10-06T10:13:09.627000+00:00"^^xsd:dateTime ; ns1:metadataModified "2023-10-25T14:01:34.351000+00:00"^^xsd:dateTime . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670" . diff --git a/ckanext/fairdatapoint/tests/test_data/dataset_cbioportal.ttl b/ckanext/fairdatapoint/tests/test_data/dataset_cbioportal.ttl index 2a250fd..297ba92 100644 --- a/ckanext/fairdatapoint/tests/test_data/dataset_cbioportal.ttl +++ b/ckanext/fairdatapoint/tests/test_data/dataset_cbioportal.ttl @@ -16,7 +16,6 @@ a dcat:Dataset, dcat:Resource ; rdfs:label "[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" ; - dcterms:accessRights ; dcterms:conformsTo ; dcterms:description "Whole exome sequencing of 23 grade II glioma tumor/normal pairs." ; dcterms:identifier "lgg_ucsf_2014"^^xsd:token ; @@ -41,9 +40,6 @@ ns1:metadataIssued "2024-01-22T12:58:04.249592+00:00"^^xsd:dateTime ; ns1:metadataModified "2024-01-22T12:58:05.109355+00:00"^^xsd:dateTime . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5" . diff --git a/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25.ttl b/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25.ttl index d93250c..44f6388 100644 --- a/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25.ttl +++ b/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25.ttl @@ -31,9 +31,6 @@ ns1:metadataIssued "2023-09-05T12:00:36.276171+00:00"^^xsd:dateTime ; ns1:metadataModified "2023-09-05T12:03:28.843400+00:00"^^xsd:dateTime . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25" . diff --git a/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25_out.ttl b/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25_out.ttl index 421a6f3..bcdcd9c 100644 --- a/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25_out.ttl +++ b/ckanext/fairdatapoint/tests/test_data/dataset_d7129d28-b72a-437f-8db0-4f0258dd3c25_out.ttl @@ -17,7 +17,6 @@ a dcat:Dataset, dcat:Resource ; rdfs:label "Example" ; - dcterms:accessRights ; dcterms:conformsTo ; dcterms:description "This is an example description." ; dcterms:isPartOf ; @@ -31,9 +30,6 @@ ns1:metadataIssued "2023-09-05T12:00:36.276171+00:00"^^xsd:dateTime ; ns1:metadataModified "2023-09-05T12:03:28.843400+00:00"^^xsd:dateTime . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25" . diff --git a/ckanext/fairdatapoint/tests/test_data/distribution_f9b9dff8-a039-4ca2-be9b-da72a61e3bac.ttl b/ckanext/fairdatapoint/tests/test_data/distribution_f9b9dff8-a039-4ca2-be9b-da72a61e3bac.ttl index 7e6f7b1..ed9f720 100644 --- a/ckanext/fairdatapoint/tests/test_data/distribution_f9b9dff8-a039-4ca2-be9b-da72a61e3bac.ttl +++ b/ckanext/fairdatapoint/tests/test_data/distribution_f9b9dff8-a039-4ca2-be9b-da72a61e3bac.ttl @@ -23,9 +23,6 @@ ns2:metadataIssued "2023-09-05T12:03:28.782932+00:00"^^xsd:dateTime ; ns2:metadataModified "2023-09-05T12:03:28.782932+00:00"^^xsd:dateTime . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/distribution/f9b9dff8-a039-4ca2-be9b-da72a61e3bac" . diff --git a/ckanext/fairdatapoint/tests/test_data/fdp_catalog.ttl b/ckanext/fairdatapoint/tests/test_data/fdp_catalog.ttl index b1ef514..771de82 100644 --- a/ckanext/fairdatapoint/tests/test_data/fdp_catalog.ttl +++ b/ckanext/fairdatapoint/tests/test_data/fdp_catalog.ttl @@ -36,9 +36,6 @@ ns2:metadataIssued "2023-10-06T10:12:55.614000+00:00"^^xsd:dateTime ; ns2:metadataModified "2023-10-25T14:02:23.680000+00:00"^^xsd:dateTime . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d" . diff --git a/ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl b/ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl index 7f8e009..67fc245 100644 --- a/ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl +++ b/ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl @@ -15,9 +15,6 @@ ldp:hasMemberRelation ns2:metadataCatalog ; ldp:membershipResource . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://fair.healthinformationportal.eu" . diff --git a/ckanext/fairdatapoint/tests/test_data/root_fdp_response_no_catalogs.ttl b/ckanext/fairdatapoint/tests/test_data/root_fdp_response_no_catalogs.ttl index c84d79a..8153e25 100644 --- a/ckanext/fairdatapoint/tests/test_data/root_fdp_response_no_catalogs.ttl +++ b/ckanext/fairdatapoint/tests/test_data/root_fdp_response_no_catalogs.ttl @@ -12,9 +12,6 @@ ldp:hasMemberRelation ns2:metadataCatalog ; ldp:membershipResource . - a dcterms:RightsStatement ; - dcterms:description "This resource has no access restriction" . - a ; dcterms:identifier "https://fair.healthinformationportal.eu" . diff --git a/ckanext/fairdatapoint/tests/test_processors.py b/ckanext/fairdatapoint/tests/test_processors.py index 5b3c188..59f60bf 100644 --- a/ckanext/fairdatapoint/tests/test_processors.py +++ b/ckanext/fairdatapoint/tests/test_processors.py @@ -70,6 +70,7 @@ def test_fdp_record_converter_dataset_dict(self): "license_id": "", "identifier": "27866022694497978", "has_version": ["https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"], + "contact_name": 'N.K. De Vries', "contact_uri": "https://orcid.org/0000-0002-4348-707X", "publisher_uri": "https://opal.health-ri.nl/pub/", "temporal_start": datetime(2020, 1, 1, 0, 0), diff --git a/ckanext/fairdatapoint/tests/test_profiles.py b/ckanext/fairdatapoint/tests/test_profiles.py index 1951b64..1dc003c 100644 --- a/ckanext/fairdatapoint/tests/test_profiles.py +++ b/ckanext/fairdatapoint/tests/test_profiles.py @@ -83,8 +83,6 @@ def test_parse_dataset(): 'identifier': 'lgg_ucsf_2014', 'language': ['http://id.loc.gov/vocabulary/iso639-1/en'], 'conforms_to': ['https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604'], 'publisher_uri': 'https://www.health-ri.nl', - 'access_rights': 'https://health-ri.sandbox.semlab-leiden.nl/dataset/' - 'd9956191-1aff-4181-ac8b-16b829135ed5#accessRights', 'is_referenced_by': '["https://pubmed.ncbi.nlm.nih.gov/24336570"]'} assert actual == expected