GenomicDataInfrastructure · brunopacheco1 · Mar 18, 2024 · Mar 15, 2024 · Mar 15, 2024 · Mar 18, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ jobs:
       image: openknowledge/ckan-dev:2.10
     services:
       solr:
-        image: ckan/ckan-solr-dev:2.10
+        image: ckan/ckan-solr:2.10-solr9
       postgres:
         image: ckan/ckan-postgres-dev:2.10
         env:

diff --git a/ckanext/fairdatapoint/profiles.py b/ckanext/fairdatapoint/profiles.py
@@ -3,13 +3,21 @@
 #  check for multiple-text fields in the schema
 # All changes are © Stichting Health-RI and are licensed under the AGPLv3 license
 
+from datetime import datetime
+import re
+import json
+import logging
+
 from ckanext.dcat.profiles import EuropeanDCATAP2Profile
 from ckan.plugins import toolkit
 from ckan import model
-import json
-from typing import Dict
+import dateutil.parser as dateparser
+from dateutil.parser import ParserError
+from typing import Dict, List
 from rdflib import URIRef
 
+log = logging.getLogger(__name__)
+
 
 def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
     """
@@ -31,19 +39,52 @@ def _convert_extras_to_declared_schema_fields(dataset_dict: Dict) -> Dict:
     # Populate the declared schema fields, if they are present in the extras
     for extra_dict in dataset_dict.get('extras', []):
         field_key = extra_dict.get('key')
+        field_value = extra_dict.get('value')
         if field_key in dataset_fields:
             preset = dataset_fields[field_key]
-            if preset == "multiple_text" and extra_dict.get('value'):
-                dataset_dict[field_key] = json.loads(extra_dict.get('value'))
+            if preset == 'multiple_text' and field_value:
+                dataset_dict[field_key] = json.loads(field_value)
+            elif preset == 'date' and field_value:
+                dataset_dict[field_key] = convert_datetime_string(field_value)
             else:
-                dataset_dict[field_key] = extra_dict.get('value')
+                dataset_dict[field_key] = field_value
 
     # Remove the extras that have been populated into the declared schema fields
     dataset_dict['extras'] = [d for d in dataset_dict['extras'] if d.get('key') not in dataset_fields]
 
     return dataset_dict
 
 
+def validate_tags(values_list: List[Dict]) -> List:
+    """
+    Replaces each tag name character that is not alphanumeric, a space, hyphen, underscore or dot with a space
+    :param values_list: tag dictionaries; the tag text is read from the 'name' key of each one
+    :return: new list holding clean tags unchanged and {'name': <cleaned text>} for the others
+    """
+    # Raw string: '\-' and '\.' are invalid escapes in a plain literal (SyntaxWarning since Python 3.12)
+    illegal_pattern = re.compile(r'[^A-Za-z0-9\- _\.]')
+    tags = []
+    for item in values_list:
+        tag_value = item['name']
+        if illegal_pattern.search(tag_value):
+            log.warning(f'Tag {tag_value} contains values other than alphanumeric characters, spaces, hyphens, '
+                        f'underscores or dots, they will be replaced with spaces')
+            item = {'name': illegal_pattern.sub(' ', tag_value)}
+        tags.append(item)
+    return tags
+
+
+def convert_datetime_string(date_value: str) -> datetime:
+    """Converts a date string (e.g. '2023-10-06T10:12:55.614000+00:00') to a datetime class instance.
+    If parsing fails, the error is logged and the unparsed input string is returned as a best-effort fallback.
+    """
+    try:
+        date_value = dateparser.parse(date_value)
+    except (ParserError, OverflowError):  # dateutil documents OverflowError for out-of-range dates
+        log.error('A date field string value can not be parsed to a date: %s', date_value)
+    return date_value
+
+
 class FAIRDataPointDCATAPProfile(EuropeanDCATAP2Profile):
     """
     An RDF profile for FAIR data points
@@ -54,10 +95,12 @@ def parse_dataset(self, dataset_dict: Dict, dataset_ref: URIRef) -> Dict:
 
         dataset_dict = _convert_extras_to_declared_schema_fields(dataset_dict)
 
+        dataset_dict['tags'] = validate_tags(dataset_dict['tags'])
+
         # Example of adding a field
-        dataset_dict['extras'].append({'key': 'hello',
-                                       'value': "Hello from the FAIR data point profile. Use this function to do "
-                                                "FAIR data point specific stuff during the import stage"})
+        # dataset_dict['extras'].append({'key': 'hello',
+        #                                'value': 'Hello from the FAIR data point profile. Use this function to do '
+        #                                         'FAIR data point specific stuff during the import stage'})
 
         return dataset_dict
 

diff --git a/ckanext/fairdatapoint/tests/test_processors.py b/ckanext/fairdatapoint/tests/test_processors.py
@@ -17,6 +17,8 @@
 """
 
 import pytest
+from datetime import datetime
+from dateutil.tz import tzutc
 from pathlib import Path
 from unittest.mock import patch
 from rdflib import Graph
@@ -59,10 +61,7 @@ def test_fdp_record_converter_dataset_dict(self):
             record=data)
         expected_dataset = {"extras":
             [
-                {"key": "uri", "value": "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"},
-                {"key": "hello",
-                 "value": "Hello from the FAIR data point profile. Use this function to do FAIR data point "
-                          "specific stuff during the import stage"}
+                {"key": "uri", "value": "https://covid19initiatives.health-ri.nl/p/Project/27866022694497978"}
             ],
             "resources": [],
             "title": "COVID-NL cohort MUMC+",
@@ -73,8 +72,8 @@ def test_fdp_record_converter_dataset_dict(self):
             "has_version": ["https://repo.metadatacenter.org/template-instances/2836bf1c-76e9-44e7-a65e-80e9ca63025a"],
             "contact_uri": "https://orcid.org/0000-0002-4348-707X",
             "publisher_uri": "https://opal.health-ri.nl/pub/",
-            "temporal_start": "2020-01-01",
-            "temporal_end": "2025-12-31"}
+            "temporal_start": datetime(2020, 1, 1, 0, 0),
+            "temporal_end": datetime(2025, 12, 31, 0, 0)}
         assert actual_dataset == expected_dataset
 
     def test_fdp_record_converter_catalog_dict(self):
@@ -84,25 +83,22 @@ def test_fdp_record_converter_catalog_dict(self):
             guid="catalog=https://fair.healthinformationportal.eu/catalog/1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d",
             record=data)
         expected = {
-                       "access_rights": "https://fair.healthinformationportal.eu/catalog/"
-                                        "1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
-                       "conforms_to": ["https://fair.healthinformationportal.eu/profile/"
-                                      "a0949e72-4466-4d53-8900-9436d1049a4b"],
-                       "extras": [{"key": "uri",
-                                   "value": "https://fair.healthinformationportal.eu/catalog/"
-                                            "1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"},
-                                  {"key": "hello",
-                                   "value": "Hello from the FAIR data point profile. Use this "
-                                            "function to do FAIR data point specific stuff during "
-                                            "the import stage"}],
-                       "has_version": ["1.0"],
-                       "issued": "2023-10-06T10:12:55.614000+00:00",
-                       "language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
-                       "license_id": "",
-                       "modified": "2023-10-06T10:12:55.614000+00:00",
-                       "publisher_name": "Automatic",
-                       "resources": [],
-                       "tags": [],
-                       "title": "Slovenia National Node"
+            "access_rights": "https://fair.healthinformationportal.eu/catalog/"
+                             "1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d#accessRights",
+            "conforms_to": ["https://fair.healthinformationportal.eu/profile/"
+                            "a0949e72-4466-4d53-8900-9436d1049a4b"],
+            "extras": [{"key": "uri",
+                        "value": "https://fair.healthinformationportal.eu/catalog/"
+                                 "1c75c2c9-d2cc-44cb-aaa8-cf8c11515c8d"},
+                       ],
+            "has_version": ["1.0"],
+            "issued": datetime(2023, 10, 6, 10, 12, 55, 614000, tzinfo=tzutc()),
+            "language": ["http://id.loc.gov/vocabulary/iso639-1/en"],
+            "license_id": "",
+            "modified": datetime(2023, 10, 6, 10, 12, 55, 614000, tzinfo=tzutc()),
+            "publisher_name": "Automatic",
+            "resources": [],
+            "tags": [],
+            "title": "Slovenia National Node"
         }
         assert actual == expected