Skip to content

Commit

Permalink
Merge pull request #20 from GenomicDataInfrastructure/improve-harvest…
Browse files Browse the repository at this point in the history
…er-and-fix-issues

fix: Improve harvester and fix issues
  • Loading branch information
a-nayden authored May 3, 2024
2 parents 45e056a + 6cf5e10 commit 887430c
Show file tree
Hide file tree
Showing 14 changed files with 66 additions and 48 deletions.
4 changes: 2 additions & 2 deletions ckanext/fairdatapoint/harvesters/civity_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ def import_stage(self, harvest_object):
# Get the last harvested object (if any)
previous_object = model.Session.query(HarvestObject) \
.filter(HarvestObject.guid == harvest_object.guid) \
.filter(HarvestObject.current is True) \
.filter(HarvestObject.current == True) \
.first()

# Flag previous object as not current anymore
Expand Down Expand Up @@ -427,7 +427,7 @@ def _get_guids_to_package_ids_from_database(harvest_job):
:return:
"""
query = model.Session.query(HarvestObject.guid, HarvestObject.package_id). \
filter(HarvestObject.current is True). \
filter(HarvestObject.current == True). \
filter(HarvestObject.harvest_source_id == harvest_job.source.id)

guid_to_package_id = {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from ckanext.fairdatapoint.harvesters.domain.identifier import Identifier
from ckanext.fairdatapoint.harvesters.domain.fair_data_point import FairDataPoint

from rdflib import Namespace, URIRef, Literal, DCAT, DCTERMS, Graph, RDF
from requests import JSONDecodeError, HTTPError

from rdflib import Namespace, URIRef, Literal, DCAT, DCTERMS, Graph, RDF, BNode
from rdflib.term import Node
from typing import Dict, Iterable, Union

Expand Down Expand Up @@ -84,10 +86,14 @@ def get_record_by_id(self, guid: str) -> str:

subject_uri = URIRef(subject_url)

self._remove_fdp_defaults(g, subject_uri)

# Add information from distribution to graph
for distribution_uri in g.objects(subject=subject_uri, predicate=DCAT.distribution):
distribution_g = self.fair_data_point.get_graph(distribution_uri)

self._remove_fdp_defaults(g, distribution_uri)

for predicate in [
DCTERMS.description,
DCTERMS.format,
Expand All @@ -100,18 +106,32 @@ def get_record_by_id(self, guid: str) -> str:

# Look-up contact information
for contact_point_uri in self.get_values(g, subject_uri, DCAT.contactPoint):
if 'orcid' in contact_point_uri:
orcid_response = requests.get(str(contact_point_uri) + '/public-record.json')
json_orcid_response = orcid_response.json()
name = json_orcid_response['displayName']
name_literal = Literal(name)
g.add((subject_uri, VCARD.fn, name_literal))
# TODO add original Orcid URL in a field
if isinstance(contact_point_uri, URIRef):
self._parse_contact_point(g=g, subject_uri=subject_uri, contact_point_uri=contact_point_uri)

result = g.serialize(format='ttl')

return result

@staticmethod
def _parse_contact_point(g: Graph, subject_uri: URIRef, contact_point_uri: URIRef,
                         timeout: float = 10.0):
    """
    Replaces contact point URI with a VCard

    The direct ``subject_uri dcat:contactPoint contact_point_uri`` triple is
    removed and replaced by a blank node typed ``vcard:Kind`` whose
    ``vcard:hasUID`` keeps the original URI. When the URI contains ``orcid``,
    the display name is looked up from the ORCID public record and stored as
    ``vcard:fn``. The lookup is best effort: on failure it is logged and the
    VCard is left without a name.

    :param g: graph that is modified in place
    :param subject_uri: resource that owns the contact point
    :param contact_point_uri: URI currently used as the contact point
    :param timeout: seconds to wait for the ORCID service before giving up
    :return: None
    """
    g.remove((subject_uri, DCAT.contactPoint, contact_point_uri))
    vcard_node = BNode()
    g.add((subject_uri, DCAT.contactPoint, vcard_node))
    g.add((vcard_node, RDF.type, VCARD.Kind))
    g.add((vcard_node, VCARD.hasUID, contact_point_uri))

    if 'orcid' not in str(contact_point_uri):
        return

    orcid_url = str(contact_point_uri).rstrip('/') + '/public-record.json'
    try:
        # Without a timeout a stalled ORCID endpoint would block the harvest indefinitely.
        orcid_response = requests.get(orcid_url, timeout=timeout)
        # requests does not raise on 4xx/5xx by itself; make HTTP errors explicit
        # instead of trying to parse an error page as JSON.
        orcid_response.raise_for_status()
        name = orcid_response.json()['displayName']
    except (requests.RequestException, KeyError, TypeError) as e:
        # RequestException covers HTTPError, JSONDecodeError, connection errors and
        # timeouts; KeyError/TypeError cover a payload without a 'displayName' entry.
        log.error('Failed to get data from ORCID for %s: %s', contact_point_uri, e)
        return

    # Skip empty or null names so the VCard does not get a meaningless vcard:fn.
    if name:
        g.add((vcard_node, VCARD.fn, Literal(name)))

@staticmethod
def get_values(graph: Graph,
subject: Union[str, URIRef, Node],
Expand All @@ -121,3 +141,11 @@ def get_values(graph: Graph,

for value in graph.objects(subject=subject_uri, predicate=predicate_uri):
yield value

@staticmethod
def _remove_fdp_defaults(g, subject_uri):
    """
    Removes the default access rights statement from a resource

    The default is recognised by its URI, ``<subject_uri>#accessRights``. When
    ``subject_uri`` points to that node via ``dcterms:accessRights``, both the
    link and every triple describing the default node are removed. Any other
    access rights value is left untouched.

    :param g: graph that is modified in place
    :param subject_uri: resource whose default access rights are removed
    :return: None
    """
    access_rights_default = URIRef(f'{subject_uri}#accessRights')
    default_link = (subject_uri, DCTERMS.accessRights, access_rights_default)

    # A membership test replaces the loop over g.triples(): the graph is no longer
    # mutated while it is being iterated, and the default URI is built only once.
    if default_link in g:
        g.remove(default_link)
        g.remove((access_rights_default, None, None))
27 changes: 21 additions & 6 deletions ckanext/fairdatapoint/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
from dateutil.parser import ParserError
from json import JSONDecodeError
from typing import Dict, List
from rdflib import URIRef
from rdflib import URIRef, Namespace

log = logging.getLogger(__name__)

VCARD = Namespace("http://www.w3.org/2006/vcard/ns#")


def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
"""
Expand Down Expand Up @@ -101,13 +103,26 @@ def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:

dataset_dict['tags'] = validate_tags(dataset_dict['tags'])

# Example of adding a field
# dataset_dict['extras'].append({'key': 'hello',
# 'value': 'Hello from the FAIR data point profile. Use this function to do '
# 'FAIR data point specific stuff during the import stage'})

return dataset_dict

def _contact_details(self, subject, predicate):
    """
    Overrides RDFProfile._contact_details so uri is taken from hasUID for VCard

    Returns a dict with the keys ``uri``, ``name`` and ``email`` for the contact
    found under ``predicate`` of ``subject``, or an empty dict when there is none.
    """
    details = {}

    # TODO: support multiple contacts; for now each agent replaces the previous
    # one, so only the last agent yielded by the graph is returned.
    for agent in self.g.objects(subject, predicate):
        if isinstance(agent, URIRef):
            # Contact point given directly as a URI
            uri = str(agent)
        else:
            # VCard node: the URI is stored in vcard:hasUID
            uri = self._get_vcard_property_value(agent, VCARD.hasUID)

        name = self._get_vcard_property_value(agent, VCARD.hasFN, VCARD.fn)
        raw_email = self._get_vcard_property_value(agent, VCARD.hasEmail)

        details = {
            'uri': uri,
            'name': name,
            'email': self._without_mailto(raw_email),
        }

    return details

# def graph_from_dataset(self, dataset_dict, dataset_ref):
#
# g = self.g
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@
dcat:endDate "2025-12-31"^^xsd:date ;
dcat:startDate "2020-01-01"^^xsd:date ] ;
dcterms:title "COVID-NL cohort MUMC+"@en ;
v:fn "N.K. De Vries" ;
dcat:contactPoint <https://orcid.org/0000-0002-4348-707X> .
dcat:contactPoint [ a v:Kind ;
v:fn "N.K. De Vries" ;
v:hasUID <https://orcid.org/0000-0002-4348-707X> ] .
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix ns3: <http://www.w3.org/2006/vcard/ns#> .

<https://fair.healthinformationportal.eu/distribution/> a ldp:DirectContainer ;
dcterms:title "Distributions" ;
Expand All @@ -16,7 +17,6 @@
<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670> a dcat:Dataset,
dcat:Resource ;
rdfs:label "Slovenian income, poverty and social exclusion indicators" ;
dcterms:accessRights <https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670#accessRights> ;
dcterms:conformsTo <https://fair.healthinformationportal.eu/profile/2f08228e-1789-40f8-84cd-28e3288c3604> ;
dcterms:creator "Statisti&#269;ni urad Republike Slovenije"@en ;
dcterms:description "The purpose of data collection is to show the quality of life in Slovenia in view of allocation of disposable income among households, relative poverty and social exclusion for different socio-economic groups of persons and households and to highlight which groups of population are relatively worse off than the rest of the population and are thus more vulnerable to poverty, material deprivation and unemployment. One of the main purposes is also collecting data on health (disability, unmet needs, etc). {\"Topics\": \"Self-perceived health/morbidity, Disability, Wellbeing\", \"Data collection period\": \"2005-01-01 2021-12-31\", \"Funding\": \"State Budget\", \"Geo coverage\": \"Nuts 3\", \"Target population\": \"General population\", \"Age range from\": \"16\", \"Age range to\": \"100\", \"Updating periodicity\": \"Annually\", \"Sample size\": \"\", \"Personal identifier\": \"National identifier\", \"Level of aggregation\": \"Individual\", \"Linkage possible\": \"Only to some\", \"Permanent identifier of the data source\": \"\", \"Regulations for data sharing\": \"\"}"@en ;
Expand All @@ -36,7 +36,8 @@
dcterms:title "Slovenian income, poverty and social exclusion indicators"@en ;
ns2:SIO_000628 <https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670/metrics/445c0a70d1e214e545b261559e2842f4>,
<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670/metrics/5d27e854a9e78eb3f663331cd47cdc13> ;
dcat:contactPoint <https://healthinformationportal.eu> ;
dcat:contactPoint [ a ns3:Kind ;
ns3:hasUID <https://healthinformationportal.eu> ] ;
dcat:keyword "Self-perceived health, poverty"@en ;
dcat:landingPage <https://www.healthinformationportal.eu/health-information-sources/slovenian-income-poverty-and-social-exclusion-indicators> ;
dcat:theme <http://publications.europa.eu/resource/authority/data-theme/HEAL>,
Expand All @@ -52,9 +53,6 @@
ns1:metadataIssued "2023-10-06T10:13:09.627000+00:00"^^xsd:dateTime ;
ns1:metadataModified "2023-10-25T14:01:34.351000+00:00"^^xsd:dateTime .

<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://fair.healthinformationportal.eu/dataset/898ca4b8-197b-4d40-bc81-d9cd88197670" .

Expand Down
4 changes: 0 additions & 4 deletions ckanext/fairdatapoint/tests/test_data/dataset_cbioportal.ttl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5> a dcat:Dataset,
dcat:Resource ;
rdfs:label "[PUBLIC] Low-Grade Gliomas (UCSF, Science 2014)" ;
dcterms:accessRights <https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#accessRights> ;
dcterms:conformsTo <https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604> ;
dcterms:description "Whole exome sequencing of 23 grade II glioma tumor/normal pairs." ;
dcterms:identifier "lgg_ucsf_2014"^^xsd:token ;
Expand All @@ -41,9 +40,6 @@
ns1:metadataIssued "2024-01-22T12:58:04.249592+00:00"^^xsd:dateTime ;
ns1:metadataModified "2024-01-22T12:58:05.109355+00:00"^^xsd:dateTime .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d9956191-1aff-4181-ac8b-16b829135ed5" .

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
ns1:metadataIssued "2023-09-05T12:00:36.276171+00:00"^^xsd:dateTime ;
ns1:metadataModified "2023-09-05T12:03:28.843400+00:00"^^xsd:dateTime .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25" .

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25> a dcat:Dataset,
dcat:Resource ;
rdfs:label "Example" ;
dcterms:accessRights <https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#accessRights> ;
dcterms:conformsTo <https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604> ;
dcterms:description "This is an example description." ;
dcterms:isPartOf <https://health-ri.sandbox.semlab-leiden.nl/catalog/e3faf7ad-050c-475f-8ce4-da7e2faa5cd0> ;
Expand All @@ -31,9 +30,6 @@
ns1:metadataIssued "2023-09-05T12:00:36.276171+00:00"^^xsd:dateTime ;
ns1:metadataModified "2023-09-05T12:03:28.843400+00:00"^^xsd:dateTime .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/dataset/d7129d28-b72a-437f-8db0-4f0258dd3c25" .

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,6 @@
ns2:metadataIssued "2023-09-05T12:03:28.782932+00:00"^^xsd:dateTime ;
ns2:metadataModified "2023-09-05T12:03:28.782932+00:00"^^xsd:dateTime .

<https://health-ri.sandbox.semlab-leiden.nl/distribution/f9b9dff8-a039-4ca2-be9b-da72a61e3bac#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://health-ri.sandbox.semlab-leiden.nl/distribution/f9b9dff8-a039-4ca2-be9b-da72a61e3bac#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://health-ri.sandbox.semlab-leiden.nl/distribution/f9b9dff8-a039-4ca2-be9b-da72a61e3bac" .

Expand Down
3 changes: 0 additions & 3 deletions ckanext/fairdatapoint/tests/test_data/fdp_catalog.ttl
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@
ns2:metadataIssued "2023-10-06T10:12:55.614000+00:00"^^xsd:dateTime ;
ns2:metadataModified "2023-10-25T14:02:23.680000+00:00"^^xsd:dateTime .

<https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d" .

Expand Down
3 changes: 0 additions & 3 deletions ckanext/fairdatapoint/tests/test_data/root_fdp_response.ttl
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
ldp:hasMemberRelation ns2:metadataCatalog ;
ldp:membershipResource <https://fair.healthinformationportal.eu> .

<https://fair.healthinformationportal.eu#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://fair.healthinformationportal.eu#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://fair.healthinformationportal.eu" .

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
ldp:hasMemberRelation ns2:metadataCatalog ;
ldp:membershipResource <https://fair.healthinformationportal.eu> .

<https://fair.healthinformationportal.eu#accessRights> a dcterms:RightsStatement ;
dcterms:description "This resource has no access restriction" .

<https://fair.healthinformationportal.eu#identifier> a <http://purl.org/spar/datacite/Identifier> ;
dcterms:identifier "https://fair.healthinformationportal.eu" .

Expand Down
1 change: 1 addition & 0 deletions ckanext/fairdatapoint/tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def test_fdp_record_converter_dataset_dict(self):
"license_id": "",
"identifier": "27866022694497978",
"has_version": ["https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"],
"contact_name": 'N.K. De Vries',
"contact_uri": "https://orcid.org/0000-0002-4348-707X",
"publisher_uri": "https://opal.health-ri.nl/pub/",
"temporal_start": datetime(2020, 1, 1, 0, 0),
Expand Down
2 changes: 0 additions & 2 deletions ckanext/fairdatapoint/tests/test_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ def test_parse_dataset():
'identifier': 'lgg_ucsf_2014', 'language': ['http://id.loc.gov/vocabulary/iso639-1/en'],
'conforms_to': ['https://health-ri.sandbox.semlab-leiden.nl/profile/2f08228e-1789-40f8-84cd-28e3288c3604'],
'publisher_uri': 'https://www.health-ri.nl',
'access_rights': 'https://health-ri.sandbox.semlab-leiden.nl/dataset/'
'd9956191-1aff-4181-ac8b-16b829135ed5#accessRights',
'is_referenced_by': '["https://pubmed.ncbi.nlm.nih.gov/24336570"]'}
assert actual == expected

Expand Down

0 comments on commit 887430c

Please sign in to comment.