From 20deca4cf3b715e921392633c0f9c219493e4d77 Mon Sep 17 00:00:00 2001 From: Abram Booth Date: Tue, 25 Apr 2023 14:02:35 -0400 Subject: [PATCH] gather more preprint metadata --- osf/metadata/osf_gathering.py | 99 ++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 8 deletions(-) diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index 5d592d8f1c89..2044c8389aaa 100644 --- a/osf/metadata/osf_gathering.py +++ b/osf/metadata/osf_gathering.py @@ -36,7 +36,7 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket: '''for when you just want a basket of rdf metadata about a thing - @osf_item: the thing (osf model instance or 5-ish character guid string) + @osf_item: the thing (an instance of osf.models.base.GuidMixin or a 5-ish character osf:id string) ''' focus = OsfFocus(osf_item) return gather.Basket(focus) @@ -53,7 +53,7 @@ def osfmap_for_type(rdftype_iri: str): ##### BEGIN osfmap ##### -# TODO: replace these dictionaries with dctap tsv +# TODO: replace these dictionaries with dctap tsv or rdf/shacl file OSF_AGENT_REFERENCE = { DCTERMS.identifier: None, @@ -153,6 +153,10 @@ def osfmap_for_type(rdftype_iri: str): OSF.Preprint: { **OSF_OBJECT, OSF.isSupplementedBy: OSF_OBJECT_REFERENCE, + OSF.hasDataResource: None, + OSF.hasPreregisteredStudyDesign: None, + OSF.hasPreregisteredAnalysisPlan: None, + OSF.statedConflictOfInterest: None, }, OSF.File: { DCTERMS.created: None, @@ -333,7 +337,11 @@ def gather_moderation_dates(focus): @gather.er(DCTERMS.dateCopyrighted, DCTERMS.rightsHolder, DCTERMS.rights) def gather_licensing(focus): - license_record = getattr(focus.dbmodel, 'node_license', None) + license_record = ( + focus.dbmodel.license + if focus.rdftype == OSF.Preprint + else getattr(focus.dbmodel, 'node_license', None) + ) if license_record is not None: yield (DCTERMS.dateCopyrighted, license_record.year) for copyright_holder in license_record.copyright_holders: @@ -359,6 +367,8 @@ def gather_title(focus): def _language_text(focus, text): if not text: return None + if getattr(text, 'language', None): + return text # already has non-empty language tag return rdflib.Literal(text, lang=_get_language(focus)) @@ -499,19 +509,84 @@ def gather_parts(focus): @gather.er( - DCTERMS.hasVersion, OSF.isSupplementedBy, focustype_iris=[OSF.Preprint], ) -def gather_preprint_related_items(focus): +def gather_preprint_supplement(focus): + supplemental_node = focus.dbmodel.node + if supplemental_node and supplemental_node.is_public: + yield (OSF.isSupplementedBy, OsfFocus(supplemental_node)) + + +@gather.er( + DCTERMS.hasVersion, + focustype_iris=[OSF.Preprint], +) +def gather_preprint_external_links(focus): published_article_doi = getattr(focus.dbmodel, 'article_doi', None) if published_article_doi: article_iri = DOI[published_article_doi] yield (DCTERMS.hasVersion, article_iri) yield (article_iri, DCTERMS.identifier, str(article_iri)) - supplemental_node = focus.dbmodel.node - if supplemental_node and supplemental_node.is_public: - yield (OSF.isSupplementedBy, OsfFocus(supplemental_node)) + + +@gather.er( + OSF.hasDataResource, + focustype_iris=[OSF.Preprint], +) +def gather_preprint_data_links(focus): + preprint = focus.dbmodel + if preprint.has_data_links == 'no': + yield from _omitted_metadata( + focus=focus, + omitted_property_set=[OSF.hasDataResource], + description=preprint.why_no_data, + ) + elif preprint.has_data_links == 'available': + for data_link in filter(None, preprint.data_links): + yield (OSF.hasDataResource, rdflib.URIRef(data_link)) + + +@gather.er( + OSF.hasPreregisteredStudyDesign, + OSF.hasPreregisteredAnalysisPlan, + focustype_iris=[OSF.Preprint], +) +def gather_preprint_prereg(focus): + preprint = focus.dbmodel + if preprint.has_prereg_links == 'no': + yield from _omitted_metadata( + focus=focus, + omitted_property_set=[ + OSF.hasPreregisteredStudyDesign, + OSF.hasPreregisteredAnalysisPlan, + ], + description=preprint.why_no_prereg, + ) + elif preprint.has_prereg_links == 'available': + try: + prereg_relations = { + 'prereg_designs': [OSF.hasPreregisteredStudyDesign], + 'prereg_analysis': [OSF.hasPreregisteredStudyDesign], + 'prereg_both': [OSF.hasPreregisteredStudyDesign, OSF.hasPreregisteredAnalysisPlan], + }[preprint.prereg_link_info] + except KeyError: + pass + else: + for prereg_link in filter(None, preprint.prereg_links): + for prereg_relation in prereg_relations: + yield (prereg_relation, rdflib.URIRef(prereg_link)) + + +@gather.er( + OSF.statedConflictOfInterest, + focustype_iris=[OSF.Preprint], +) +def gather_conflict_of_interest(focus): + if focus.dbmodel.has_coi: + yield (OSF.statedConflictOfInterest, _language_text(focus, focus.dbmodel.conflict_of_interest_statement)) + else: + yield (OSF.statedConflictOfInterest, OSF['no-conflict-of-interest']) @gather.er( @@ -703,3 +778,11 @@ def _publisher_tripleset(iri, name, url=None): yield (iri, FOAF.name, name) yield (iri, DCTERMS.identifier, str(iri)) yield (iri, DCTERMS.identifier, url) + + +def _omitted_metadata(focus, omitted_property_set, description): + bnode = rdflib.BNode() + yield (focus.iri, OSF.omits, bnode) + for property_iri in omitted_property_set: + yield (bnode, OSF.omittedMetadataProperty, property_iri) + yield (bnode, DCTERMS.description, _language_text(focus, description))