From 00bb7f7885c9a2cd27d196cb8afc5d5e4a00217f Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Thu, 13 Jul 2023 11:32:13 +0100 Subject: [PATCH 1/9] Add initial B2shareImportService The B2shareImportService has been copied from the SITESImportService due to their similarity in structure, although unimplemented parts are left out for now. We leave the import statements in for future convenience. In this form the import just logs a list of DOIs at level INFO. Jira: ELTER-20 --- .../config/DevelopmentUserStoreConfig.java | 10 ++ .../catalogue/elter/B2shareImportService.java | 129 ++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/config/DevelopmentUserStoreConfig.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/config/DevelopmentUserStoreConfig.java index 08206e8e4..19acfdd8c 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/config/DevelopmentUserStoreConfig.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/config/DevelopmentUserStoreConfig.java @@ -189,6 +189,16 @@ public CatalogueUser sitesMetadataImport() throws UsernameAlreadyTakenException return user; } + @Bean + public CatalogueUser b2shareMetadataImport() throws UsernameAlreadyTakenException { + val user = new CatalogueUser() + .setUsername("B2SHARE metadata import") + .setEmail("info@eudat.eu"); + addUserToGroup(user, ELTER_EDITOR, ELTER_PUBLISHER); + userStore().addUser(user, "password"); + return user; + } + @Bean public CatalogueUser erammpEditor() throws UsernameAlreadyTakenException { val user = new CatalogueUser() diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java new file mode 100644 index 000000000..5a7d9bd29 --- /dev/null +++ 
b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java @@ -0,0 +1,129 @@ +package uk.ac.ceh.gateway.catalogue.elter; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.net.URL; +import java.io.IOException; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import lombok.SneakyThrows; +import lombok.ToString; +import lombok.extern.slf4j.Slf4j; + +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.CommonParams; +import static org.apache.solr.client.solrj.SolrRequest.METHOD.POST; + +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Profile; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Service; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; + +import uk.ac.ceh.gateway.catalogue.TimeConstants; +import uk.ac.ceh.gateway.catalogue.deims.DeimsSolrIndex; +import uk.ac.ceh.gateway.catalogue.elter.ElterDocument; +import uk.ac.ceh.gateway.catalogue.gemini.AccessLimitation; +import uk.ac.ceh.gateway.catalogue.gemini.DatasetReferenceDate; +import uk.ac.ceh.gateway.catalogue.gemini.OnlineResource; +import uk.ac.ceh.gateway.catalogue.gemini.TimePeriod; +import uk.ac.ceh.gateway.catalogue.imports.CatalogueImportService; +import uk.ac.ceh.gateway.catalogue.model.CatalogueUser; +import uk.ac.ceh.gateway.catalogue.model.MetadataDocument; +import uk.ac.ceh.gateway.catalogue.model.ResponsibleParty; +import uk.ac.ceh.gateway.catalogue.publication.PublicationService; +import 
uk.ac.ceh.gateway.catalogue.repository.DocumentRepository; + +@Profile("server:elter & imports") +@Slf4j +@Service +@ToString +public class B2shareImportService implements CatalogueImportService { + // constructor prep + private final DocumentRepository documentRepository; + private final ObjectMapper objectMapper; + private final PublicationService publicationService; + private final SolrClient solrClient; + private final String B2shareApiRoot; + + // constructor + @SneakyThrows + public B2shareImportService( + DocumentRepository documentRepository, + PublicationService publicationService, + SolrClient solrClient, + @Value("${b2share.api}") String B2shareApiRoot + ) { + log.info("Creating"); + + this.documentRepository = documentRepository; + this.objectMapper = new ObjectMapper(); + this.publicationService = publicationService; + this.solrClient = solrClient; + this.B2shareApiRoot = B2shareApiRoot; + } + + // methods start here + @SneakyThrows + private List getRemoteRecordList() { + log.debug("GET B2SHARE records at {}", B2shareApiRoot); + + // prep + List results = new ArrayList<>(); + String B2shareRecordsUrl = B2shareApiRoot + "?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=10000000"; + + JsonNode B2shareRecords = objectMapper.readTree(new URL(B2shareRecordsUrl)); + + for (JsonNode node : B2shareRecords.path("hits").path("hits")){ + String doi = node.path("metadata").path("DOI").asText(); + if (!doi.equals("")){ + results.add(doi); + } + } + + return results; + } + @Scheduled(initialDelay = TimeConstants.ONE_MINUTE, fixedDelay = TimeConstants.SEVEN_DAYS) + public void runImport(){ + // prep + log.info("Running B2SHARE metadata import..."); + CatalogueUser importUser = new CatalogueUser().setUsername("B2SHARE metadata import").setEmail("info@eudat.eu"); + //Map localRecordList = null; + int totalRecords = 0; + int newRecords = 0; + int updatedRecords = 0; + int skippedRecords = 0; + + // get remote records + List remoteRecordList = 
getRemoteRecordList(); + totalRecords = remoteRecordList.size(); + + // ready to import + for (String recordUrl : remoteRecordList){ + log.info("{}", recordUrl); + } + + // finished, log summary + log.info("Finished B2SHARE metadata import!"); + log.info("{} created + {} updated + {} skipped = {} total ({} records in B2SHARE)", + newRecords, + updatedRecords, + skippedRecords, + newRecords + updatedRecords + skippedRecords, + totalRecords + ); + } +} From ccfa0fe68193512ab7ab022cde1dbbc874ffd468 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Fri, 14 Jul 2023 10:19:29 +0100 Subject: [PATCH 2/9] Finish first draft B2shareImportService As it stands the B2SHARE import will import the first 5 (not all, any more) records in the "LTER" community. It gets the record DOIs from the B2SHARE API and imports them from datacite. Most of the new methods are just tweaked copies of those from SITESImportService. Since we are reusing the same JSON processing code as the LinkedDocumentRetrievalService, this is refactored into a method of the ElterDocument, `importDataciteJson`. Also add `b2share.api` to application.properties, which was supposed to be added before. 
Jira: ELTER-20 --- .../catalogue/elter/B2shareImportService.java | 158 ++++++++++++++++-- .../catalogue/elter/ElterDocument.java | 84 ++++++++++ .../elter/LinkedDocumentRetrievalService.java | 91 +--------- .../src/main/resources/application.properties | 1 + 4 files changed, 230 insertions(+), 104 deletions(-) diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java index 5a7d9bd29..fbe1a569f 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java @@ -3,15 +3,17 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import java.net.URL; +import java.io.FileNotFoundException; import java.io.IOException; -import java.time.LocalDate; +import java.net.URL; import java.time.ZoneId; import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import lombok.SneakyThrows; import lombok.ToString; @@ -19,7 +21,6 @@ import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.CommonParams; @@ -30,20 +31,11 @@ import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; -import org.w3c.dom.Document; -import org.w3c.dom.Node; - import uk.ac.ceh.gateway.catalogue.TimeConstants; -import uk.ac.ceh.gateway.catalogue.deims.DeimsSolrIndex; import uk.ac.ceh.gateway.catalogue.elter.ElterDocument; -import uk.ac.ceh.gateway.catalogue.gemini.AccessLimitation; -import 
uk.ac.ceh.gateway.catalogue.gemini.DatasetReferenceDate; -import uk.ac.ceh.gateway.catalogue.gemini.OnlineResource; -import uk.ac.ceh.gateway.catalogue.gemini.TimePeriod; import uk.ac.ceh.gateway.catalogue.imports.CatalogueImportService; import uk.ac.ceh.gateway.catalogue.model.CatalogueUser; import uk.ac.ceh.gateway.catalogue.model.MetadataDocument; -import uk.ac.ceh.gateway.catalogue.model.ResponsibleParty; import uk.ac.ceh.gateway.catalogue.publication.PublicationService; import uk.ac.ceh.gateway.catalogue.repository.DocumentRepository; @@ -55,9 +47,11 @@ public class B2shareImportService implements CatalogueImportService { // constructor prep private final DocumentRepository documentRepository; private final ObjectMapper objectMapper; + private final Pattern p; private final PublicationService publicationService; private final SolrClient solrClient; private final String B2shareApiRoot; + private final String dataciteApiRoot; // constructor @SneakyThrows @@ -65,15 +59,18 @@ public B2shareImportService( DocumentRepository documentRepository, PublicationService publicationService, SolrClient solrClient, - @Value("${b2share.api}") String B2shareApiRoot + @Value("${b2share.api}") String B2shareApiRoot, + @Value("${doi.api}") String dataciteApiRoot ) { log.info("Creating"); this.documentRepository = documentRepository; this.objectMapper = new ObjectMapper(); + this.p = Pattern.compile("10\\.\\S+/\\S+"); this.publicationService = publicationService; this.solrClient = solrClient; this.B2shareApiRoot = B2shareApiRoot; + this.dataciteApiRoot = dataciteApiRoot; } // methods start here @@ -83,7 +80,7 @@ private List getRemoteRecordList() { // prep List results = new ArrayList<>(); - String B2shareRecordsUrl = B2shareApiRoot + "?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=10000000"; + String B2shareRecordsUrl = B2shareApiRoot + "/?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=5"; JsonNode B2shareRecords = objectMapper.readTree(new URL(B2shareRecordsUrl)); 
@@ -96,12 +93,103 @@ private List getRemoteRecordList() { return results; } + + @SneakyThrows + private Map getLocalRecordMapping() throws IOException { + log.debug("GET locally imported records"); + + // prep + Map resultMapping = new HashMap<>(5000); + + // form and make SOLR query + // could potentially reimplement with MetadataListingService.getPublicDocumentsOfCatalogue + SolrQuery query = new SolrQuery(); + query.setParam(CommonParams.Q, "importId:10.23728/b2share.*"); + query.setParam(CommonParams.FL, "importId,identifier"); + // Ugh, there doesn't seem to be a way to return all results. To avoid + // dealing with pagination just abort if 10000000 results are returned, + // since we won't have checked all the records. + // + // At time of writing there are 881 records in the sitemap, so this + // should basically never happen before the heat death of the universe. + query.setRows(10000000); + SolrDocumentList resultList = solrClient.query("documents", query, POST).getResults(); + + // raise warning and abort, as promised above + if (resultList.getNumFound() >= 10000000L){ + log.error("10000000 results were returned: update the code in B2shareImportService.java; aborting import"); + throw new IOException(); + } + + // populate mapping + for (SolrDocument document : resultList){ + resultMapping.put( + (String) document.getFieldValue("importId"), + (String) document.getFieldValue("identifier") + ); + } + + return resultMapping; + } + + @SneakyThrows + private ElterDocument getFullRemoteRecord(String inputDoi) { + String recordUrl = dataciteApiRoot + "/" + inputDoi; + JsonNode dataciteJson = null; + + log.info("GET {}", recordUrl); + try { + dataciteJson = objectMapper.readTree(new URL(recordUrl)); + } catch (FileNotFoundException e) { + return null; + } + + ElterDocument document = new ElterDocument(); + document.importDataciteJson(dataciteJson); + + return document; + } + + @SneakyThrows + private String createRecord(String remoteRecordId, ElterDocument 
newRecord, CatalogueUser user) { + // save document + MetadataDocument savedDocument = documentRepository.saveNew( + user, + newRecord, + "elter", + "Create new record " + remoteRecordId + ); + + // publish new record + publicationService.transition(user, savedDocument.getId(), "ykhm7b"); + publicationService.transition(user, savedDocument.getId(), "re4vkb"); + + // success + log.debug("Successfully imported record {}", remoteRecordId); + return savedDocument.getId(); + } + + @SneakyThrows + private void updateRecord(String localRecordId, String remoteRecordId, ElterDocument updatedRecord, CatalogueUser user) { + // save back + updatedRecord.setMetadata(documentRepository.read(localRecordId).getMetadata()); + documentRepository.save( + user, + updatedRecord, + localRecordId, + "Updated record " + remoteRecordId + ); + + // success + log.debug("Successfully updated record {}", remoteRecordId); + } + @Scheduled(initialDelay = TimeConstants.ONE_MINUTE, fixedDelay = TimeConstants.SEVEN_DAYS) public void runImport(){ // prep log.info("Running B2SHARE metadata import..."); CatalogueUser importUser = new CatalogueUser().setUsername("B2SHARE metadata import").setEmail("info@eudat.eu"); - //Map localRecordList = null; + Map localRecordList = null; int totalRecords = 0; int newRecords = 0; int updatedRecords = 0; @@ -111,9 +199,45 @@ public void runImport(){ List remoteRecordList = getRemoteRecordList(); totalRecords = remoteRecordList.size(); + // get local records + try { + localRecordList = getLocalRecordMapping(); + } catch (IOException ex) { + log.error("Error retrieving locally imported records; aborting import"); + return; + } + // ready to import - for (String recordUrl : remoteRecordList){ - log.info("{}", recordUrl); + for (String recordDoiOrgUrl : remoteRecordList){ + // normalise input to DOI + Matcher doiCheck = p.matcher(recordDoiOrgUrl); + if (!doiCheck.find()) { + log.info("No DOI detected in record {}", recordDoiOrgUrl); + skippedRecords++; + continue; + } + 
+ // try resolving DOI with datacite + String recordDoi = doiCheck.group(0); + ElterDocument remoteRecord = getFullRemoteRecord(recordDoi); + if (remoteRecord == null){ + log.info("DOI {} does not exist", recordDoi); + skippedRecords++; + continue; + } + + // ready to import + remoteRecord.setImportId(recordDoi); + remoteRecord.setImportLastModified(ZonedDateTime.now(ZoneId.of("UTC"))); + if (localRecordList.containsKey(recordDoi)) { + updateRecord(localRecordList.get(recordDoi), recordDoi, remoteRecord, importUser); + updatedRecords++; + } + else { + String newId = createRecord(recordDoi, remoteRecord, importUser); + log.debug("New document ID is {}", newId); + newRecords++; + } } // finished, log summary diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/ElterDocument.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/ElterDocument.java index b06f4aade..e7c4a1dee 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/ElterDocument.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/ElterDocument.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.databind.JsonNode; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.ToString; @@ -16,6 +17,7 @@ import uk.ac.ceh.gateway.catalogue.indexing.solr.WellKnownText; import uk.ac.ceh.gateway.catalogue.model.*; +import java.time.LocalDate; import java.time.ZonedDateTime; import java.util.*; import java.util.stream.Collectors; @@ -71,6 +73,88 @@ public class ElterDocument extends AbstractMetadataDocument implements WellKnown private String importId; private ZonedDateTime importLastModified; + public void importDataciteJson(JsonNode inputJson){ + // create document from Datacite API JSON + + // don't currently need other parts of the response + inputJson = inputJson.get("data").get("attributes"); + + // ensure title is set to something + JsonNode jsonTitles = 
inputJson.get("titles"); + int numTitles = jsonTitles.size(); + if (numTitles == 0){ + this.setTitle("TITLE MISSING"); + } + else { + this.setTitle(jsonTitles.get(0).get("title").asText()); + ArrayList alternativeTitles = new ArrayList<>(); + for (int i = 1; i < numTitles; i++){ + alternativeTitles.add(jsonTitles.get(i).get("title").asText()); + } + this.setAlternateTitles(alternativeTitles); + } + // description + StringBuilder descriptionBuilder = new StringBuilder(); + for (Iterator iter = inputJson.get("descriptions").iterator(); iter.hasNext(); ) { + JsonNode node = iter.next(); + if (descriptionBuilder.length() > 0){ + descriptionBuilder.append("\n\n"); + } + JsonNode descriptionTypeNode = node.get("descriptionType"); + if (descriptionTypeNode != null) { + String descriptionType = descriptionTypeNode.asText(); + if (! descriptionType.equals("Other")) { + descriptionBuilder.append(descriptionType + ": "); + } + } + descriptionBuilder.append(node.get("description").asText().strip()); + } + this.setDescription(descriptionBuilder.toString()); + // authors + JsonNode jsonCreators = inputJson.get("creators").path(0); + if (! jsonCreators.isMissingNode()){ + ResponsibleParty documentCreators = ResponsibleParty.builder() + .individualName(jsonCreators.get("name").asText()) + .organisationName("Unknown") + .role("author") + .build(); + ArrayList list1 = new ArrayList<>(); + list1.add(documentCreators); + this.setResponsibleParties(list1); + } + // onlineresources + ArrayList list2 = new ArrayList<>(); + list2.add( + OnlineResource.builder() + .url(inputJson.get("url").asText()) + .name("View record") + .description("View record at this link") + .function("information") + .build() + ); + this.setOnlineResources(list2); + // reference dates + // the timestamp parsing is extremely dubious but we need working code NOW for the Frankfurt meeting. + // TBF using LocalDate is totally broken anyway so it all needs redoing at some point. 
+ this.setDatasetReferenceDate( + DatasetReferenceDate.builder() + .creationDate(LocalDate.parse(inputJson.get("created").asText().substring(0,10))) + .publicationDate(LocalDate.parse(inputJson.get("published").asText().substring(0,4) + "-01-01")) + .creationDate(LocalDate.parse(inputJson.get("created").asText().substring(0,10))) + .build() + ); + // fixed stuff + this.setAccessLimitation( + AccessLimitation.builder() + .value("no limitations to public access") + .code("Available") + .uri("http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations") + .build() + ); + this.setDataLevel("Level 0"); + this.setType("signpost"); + } + @Override public Set getRelationships() { val relations = Optional.ofNullable(super.getRelationships()) diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/LinkedDocumentRetrievalService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/LinkedDocumentRetrievalService.java index 485d928b5..08d89323f 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/LinkedDocumentRetrievalService.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/LinkedDocumentRetrievalService.java @@ -4,9 +4,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.net.URL; -import java.time.LocalDate; -import java.util.ArrayList; -import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -26,11 +23,6 @@ import org.springframework.web.client.RestClientResponseException; import org.springframework.web.client.RestTemplate; -import uk.ac.ceh.gateway.catalogue.gemini.AccessLimitation; -import uk.ac.ceh.gateway.catalogue.gemini.DatasetReferenceDate; -import uk.ac.ceh.gateway.catalogue.gemini.OnlineResource; -import uk.ac.ceh.gateway.catalogue.model.ResponsibleParty; - @Profile("server:elter") @Slf4j @Service @@ -64,87 +56,12 @@ public ElterDocument get(String url) { // call Datacite String dataciteRecordUrl = dataciteApiRoot + "/" + inputDoi; 
log.info("GET {}", dataciteRecordUrl); - JsonNode jsonRecordAttributes = objectMapper.readTree(new URL(dataciteRecordUrl)).get("data").get("attributes"); + JsonNode dataciteJson = objectMapper.readTree(new URL(dataciteRecordUrl)); - // create document from Datacite response + // create and return ElterDocument ElterDocument document = new ElterDocument(); - // ensure title is set to something - JsonNode jsonTitles = jsonRecordAttributes.get("titles"); - int numTitles = jsonTitles.size(); - if (numTitles == 0){ - document.setTitle("TITLE MISSING"); - } - else { - document.setTitle(jsonTitles.get(0).get("title").asText()); - ArrayList alternativeTitles = new ArrayList<>(); - for (int i = 1; i < numTitles; i++){ - alternativeTitles.add(jsonTitles.get(i).get("title").asText()); - } - document.setAlternateTitles(alternativeTitles); - } - // description - StringBuilder descriptionBuilder = new StringBuilder(); - for (Iterator iter = jsonRecordAttributes.get("descriptions").iterator(); iter.hasNext(); ) { - JsonNode node = iter.next(); - if (descriptionBuilder.length() > 0){ - descriptionBuilder.append("\n\n"); - } - JsonNode descriptionTypeNode = node.get("descriptionType"); - if (descriptionTypeNode != null) { - String descriptionType = descriptionTypeNode.asText(); - if (! descriptionType.equals("Other")) { - descriptionBuilder.append(descriptionType + ": "); - } - } - descriptionBuilder.append(node.get("description").asText().strip()); - } - document.setDescription(descriptionBuilder.toString()); - // authors - JsonNode jsonCreators = jsonRecordAttributes.get("creators").path(0); - if (! 
jsonCreators.isMissingNode()){ - ResponsibleParty documentCreators = ResponsibleParty.builder() - .individualName(jsonCreators.get("name").asText()) - .organisationName("Unknown") - .role("author") - .build(); - ArrayList list1 = new ArrayList<>(); - list1.add(documentCreators); - document.setResponsibleParties(list1); - } - // onlineresources - ArrayList list2 = new ArrayList<>(); - list2.add( - OnlineResource.builder() - .url(jsonRecordAttributes.get("url").asText()) - .name("View record") - .description("View record at this link") - .function("information") - .build() - ); - document.setOnlineResources(list2); - // reference dates - // the timestamp parsing is extremely dubious but we need working code NOW for the Frankfurt meeting. - // TBF using LocalDate is totally broken anyway so it all needs redoing at some point. - document.setDatasetReferenceDate( - DatasetReferenceDate.builder() - .creationDate(LocalDate.parse(jsonRecordAttributes.get("created").asText().substring(0,10))) - .publicationDate(LocalDate.parse(jsonRecordAttributes.get("published").asText().substring(0,4) + "-01-01")) - .creationDate(LocalDate.parse(jsonRecordAttributes.get("created").asText().substring(0,10))) - .build() - ); - // import ID - should equal input doi but can't be too careful - document.setImportId(jsonRecordAttributes.get("doi").asText()); - // fixed stuff - document.setAccessLimitation( - AccessLimitation.builder() - .value("no limitations to public access") - .code("Available") - .uri("http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations") - .build() - ); - document.setDataLevel("Level 0"); - document.setType("signpost"); - + document.importDataciteJson(dataciteJson); + document.setImportId(inputDoi); return document; } else { diff --git a/java/src/main/resources/application.properties b/java/src/main/resources/application.properties index acf12e719..0494c77bd 100644 --- a/java/src/main/resources/application.properties +++ 
b/java/src/main/resources/application.properties @@ -1,6 +1,7 @@ # suppress inspection "UnusedProperty" for whole file # suppress inspection "HttpUrlsUsage" for whole file # suppress inspection "SpringBootApplicationProperties" for whole file +b2share.api=https://b2share.eudat.eu/api/records crowd.address=https://crowd.ceh.ac.uk/crowd/rest/usermanagement/latest crowd.username=eip-ro data.repository.location=/var/ceh-catalogue/datastore From a176576d91a5de16f2d6423eaaaca97c39c462a9 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Fri, 14 Jul 2023 15:27:28 +0100 Subject: [PATCH 3/9] Refactor B2shareImportService to use direct URL Instead of concatenating a URL query, we just directly set the whole thing in application.properties. This makes testing easier and the code shorter. Jira: ELTER-20 --- .../catalogue/elter/B2shareImportService.java | 13 ++++++------- java/src/main/resources/application.properties | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java index fbe1a569f..ebf5d61e8 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java @@ -50,7 +50,7 @@ public class B2shareImportService implements CatalogueImportService { private final Pattern p; private final PublicationService publicationService; private final SolrClient solrClient; - private final String B2shareApiRoot; + private final String b2shareRecordsUrl; private final String dataciteApiRoot; // constructor @@ -59,7 +59,7 @@ public B2shareImportService( DocumentRepository documentRepository, PublicationService publicationService, SolrClient solrClient, - @Value("${b2share.api}") String B2shareApiRoot, + @Value("${b2share.api}") String b2shareRecordsUrl, @Value("${doi.api}") String dataciteApiRoot ) { 
log.info("Creating"); @@ -69,22 +69,21 @@ public B2shareImportService( this.p = Pattern.compile("10\\.\\S+/\\S+"); this.publicationService = publicationService; this.solrClient = solrClient; - this.B2shareApiRoot = B2shareApiRoot; + this.b2shareRecordsUrl = b2shareRecordsUrl; this.dataciteApiRoot = dataciteApiRoot; } // methods start here @SneakyThrows private List getRemoteRecordList() { - log.debug("GET B2SHARE records at {}", B2shareApiRoot); + log.debug("GET B2SHARE records at {}", b2shareRecordsUrl); // prep List results = new ArrayList<>(); - String B2shareRecordsUrl = B2shareApiRoot + "/?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=5"; - JsonNode B2shareRecords = objectMapper.readTree(new URL(B2shareRecordsUrl)); + JsonNode b2shareRecords = objectMapper.readTree(new URL(b2shareRecordsUrl)); - for (JsonNode node : B2shareRecords.path("hits").path("hits")){ + for (JsonNode node : b2shareRecords.path("hits").path("hits")){ String doi = node.path("metadata").path("DOI").asText(); if (!doi.equals("")){ results.add(doi); diff --git a/java/src/main/resources/application.properties b/java/src/main/resources/application.properties index 0494c77bd..6589bf33e 100644 --- a/java/src/main/resources/application.properties +++ b/java/src/main/resources/application.properties @@ -1,7 +1,7 @@ # suppress inspection "UnusedProperty" for whole file # suppress inspection "HttpUrlsUsage" for whole file # suppress inspection "SpringBootApplicationProperties" for whole file -b2share.api=https://b2share.eudat.eu/api/records +b2share.api=https://b2share.eudat.eu/api/records/?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=5 crowd.address=https://crowd.ceh.ac.uk/crowd/rest/usermanagement/latest crowd.username=eip-ro data.repository.location=/var/ceh-catalogue/datastore From a8e23ce21b85bd143fd239f0d305f84e80e3c035 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Mon, 24 Jul 2023 16:44:45 +0100 Subject: [PATCH 4/9] Add tests for B2shareImportService Jira: ELTER-20 --- 
.../elter/B2shareImportServiceTest.java | 225 ++++++++++++++++++ .../elter/b2share-invalid-api-response.json | 128 ++++++++++ .../elter/b2share-valid-api-response.json | 128 ++++++++++ 3 files changed, 481 insertions(+) create mode 100644 java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java create mode 100644 java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json create mode 100644 java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json diff --git a/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java b/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java new file mode 100644 index 000000000..16476e515 --- /dev/null +++ b/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java @@ -0,0 +1,225 @@ +package uk.ac.ceh.gateway.catalogue.elter; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import lombok.SneakyThrows; +import lombok.val; + +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.SolrParams; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; + +import org.mockito.ArgumentCaptor; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; + +import uk.ac.ceh.gateway.catalogue.model.CatalogueUser; +import uk.ac.ceh.gateway.catalogue.publication.PublicationService; +import uk.ac.ceh.gateway.catalogue.repository.DocumentRepository; + +import static org.apache.solr.client.solrj.SolrRequest.METHOD.POST; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import static org.mockito.ArgumentMatchers.any; +import 
static org.mockito.ArgumentMatchers.eq; +import static org.mockito.BDDMockito.given; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoInteractions; + +import static org.springframework.test.web.client.match.MockRestRequestMatchers.*; + +@ExtendWith(MockitoExtension.class) +@DisplayName("B2SHAREImportService") +public class B2shareImportServiceTest { + private B2shareImportService b2shareImportService; + private String testB2shareResponse; + private QueryResponse queryResponse; + private CatalogueUser expectedUser; + + String b2shareResponse = getClass().getResource("b2share-valid-api-response.json").toString(); + String invalidB2shareResponse = getClass().getResource("b2share-invalid-api-response.json").toString(); + String dataciteTestUrl = b2shareResponse.substring(0,b2shareResponse.lastIndexOf("/")); + + private static final String CATALOGUE = "elter"; + private static final String RECORD_ID = "00000000-0000-0000-0000-000000000000"; + + @Mock private DocumentRepository documentRepository; + @Mock private PublicationService publicationService; + @Mock private SolrClient solrClient; + + @BeforeEach + void setup() { + queryResponse = mock(QueryResponse.class); + + expectedUser = new CatalogueUser() + .setUsername("B2SHARE metadata import") + .setEmail("info@eudat.eu"); + } + + @Test + @SneakyThrows + public void importNewRecord() { + // setup + b2shareImportService = new B2shareImportService( + documentRepository, + publicationService, + solrClient, + b2shareResponse, + dataciteTestUrl + ); + + // given + given(solrClient.query(any(String.class), any(SolrParams.class), eq(POST))) + .willReturn(queryResponse); + given(queryResponse.getResults()) + .willReturn(new SolrDocumentList()); + + given(documentRepository.saveNew( + any(CatalogueUser.class), + any(ElterDocument.class), + any(String.class), + any(String.class) + )) + .willReturn(new ElterDocument().setId(RECORD_ID)); + + // when + 
b2shareImportService.runImport(); + + // then + // check interactions + ArgumentCaptor argument = ArgumentCaptor.forClass(ElterDocument.class); + verify(documentRepository).saveNew(eq(expectedUser), argument.capture(), eq(CATALOGUE), eq("Create new record 10.23728/b2share.b56cd875765a403599859177fced08ae")); + verify(publicationService).transition(expectedUser, RECORD_ID, "ykhm7b"); + verify(publicationService).transition(expectedUser, RECORD_ID, "re4vkb"); + + // check created document + ElterDocument createdDocument = argument.getValue(); + assertEquals( + "TERENO Wüstebach meteorological data", + createdDocument.getTitle() + ); + assertEquals( + "10 minute interval temperature and precipitation in °C and mm from different sensors of a meteorological station.Sensor names are temperature at 2m, Precipitation_Cum10min_OttNRTtotal, Precipitation_Cum10min_OttRTNRT, Precipitation_Cum10min_OttNRT, Precipitation_Cum10min_Ecotech, Precipitation_Cum10min_RainCap", + createdDocument.getDescription() + ); + assertEquals( + "10.23728/b2share.b56cd875765a403599859177fced08ae", + createdDocument.getImportId() + ); + assertEquals( + "signpost", + createdDocument.getType() + ); + assertEquals( + "Level 0", + createdDocument.getDataLevel() + ); + } + + @Test + @SneakyThrows + public void updateExistingRecord() { + // setup + b2shareImportService = new B2shareImportService( + documentRepository, + publicationService, + solrClient, + b2shareResponse, + dataciteTestUrl + ); + + Map solrFieldMapping = new HashMap<>(); + solrFieldMapping.put("importId", "10.23728/b2share.b56cd875765a403599859177fced08ae"); + solrFieldMapping.put("identifier", RECORD_ID); + + SolrDocumentList mockResults = new SolrDocumentList(); + mockResults.add(new SolrDocument(solrFieldMapping)); + + // given + given(solrClient.query(any(String.class), any(SolrParams.class), eq(POST))) + .willReturn(queryResponse); + given(queryResponse.getResults()) + .willReturn(mockResults); + + given(documentRepository.save( + 
any(CatalogueUser.class), + any(ElterDocument.class), + any(String.class), + any(String.class) + )) + .willReturn(new ElterDocument().setId(RECORD_ID)); + + given(documentRepository.read(RECORD_ID)) + .willReturn(new ElterDocument().setId(RECORD_ID)); + + // when + b2shareImportService.runImport(); + + // then + // check interactions + verify(documentRepository).read(RECORD_ID); + ArgumentCaptor argument = ArgumentCaptor.forClass(ElterDocument.class); + verify(documentRepository).save(eq(expectedUser), argument.capture(), eq(RECORD_ID), eq("Updated record 10.23728/b2share.b56cd875765a403599859177fced08ae")); + verifyNoInteractions(publicationService); + + // check created document + ElterDocument updatedDocument = argument.getValue(); + assertEquals( + "TERENO Wüstebach meteorological data", + updatedDocument.getTitle() + ); + assertEquals( + "10 minute interval temperature and precipitation in °C and mm from different sensors of a meteorological station.Sensor names are temperature at 2m, Precipitation_Cum10min_OttNRTtotal, Precipitation_Cum10min_OttRTNRT, Precipitation_Cum10min_OttNRT, Precipitation_Cum10min_Ecotech, Precipitation_Cum10min_RainCap", + updatedDocument.getDescription() + ); + assertEquals( + "10.23728/b2share.b56cd875765a403599859177fced08ae", + updatedDocument.getImportId() + ); + assertEquals( + "signpost", + updatedDocument.getType() + ); + assertEquals( + "Level 0", + updatedDocument.getDataLevel() + ); + } + + @Test + @SneakyThrows + public void skipInvalidRecord() { + // setup + b2shareImportService = new B2shareImportService( + documentRepository, + publicationService, + solrClient, + invalidB2shareResponse, + dataciteTestUrl + ); + + // given + given(solrClient.query(any(String.class), any(SolrParams.class), eq(POST))) + .willReturn(queryResponse); + given(queryResponse.getResults()) + .willReturn(new SolrDocumentList()); + + // when + b2shareImportService.runImport(); + + // then + verifyNoInteractions(documentRepository); + 
verifyNoInteractions(publicationService); + } +} diff --git a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json new file mode 100644 index 000000000..52366721b --- /dev/null +++ b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json @@ -0,0 +1,128 @@ +{ + "aggregations": { + "type": { + "buckets": [], + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0 + } + }, + "hits": { + "hits": [ + { + "created": "2017-03-16T11:35:23.358952+00:00", + "files": [ + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:16f2cc7e68d7851bd68c2b9f6fab6ced", + "ePIC_PID": "http://hdl.handle.net/11304/be3f5ea7-4c9c-4e98-85a0-e498523a1df7", + "key": "D_AirTemperature_2m_LTER_EU_DE_013_15-03-2017.csv", + "size": 25782444, + "version_id": "62b4afdd-efee-4cd3-95af-48db2aa8165f" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:e221eec47dc4ed0b68ef508ff3435e98", + "ePIC_PID": "http://hdl.handle.net/11304/7f204427-d634-436f-a175-065cdeae1d11", + "key": "D_Precip11_LTER_EU_DE_013_15-03-2017.csv", + "size": 20423694, + "version_id": "53318bf5-9fa2-45d0-8cbd-48cb3273f5ee" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:1bfcda02ec3a819b9334e7915fca05ff", + "ePIC_PID": "http://hdl.handle.net/11304/c17618b8-7901-4633-8f1f-8adf9b570c1f", + "key": "D_Precip13_LTER_EU_DE_013_15-03-2017.csv", + "size": 20472456, + "version_id": "0a0e5090-947c-4100-a794-ea59640395c7" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:3d4164f4ffc838bf9d2dd2cf39860bb0", + "ePIC_PID": "http://hdl.handle.net/11304/dd0412f2-f45d-4f1d-808e-1a1145a8ee26", + "key": "D_Precip5_LTER_EU_DE_013_15-03-2017.csv", + "size": 10351327, + "version_id": "c132ccbf-f411-4ad3-ba72-fbdc6871b5ce" + }, + { + "bucket": 
"91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:be3344fa62dd6c6c7f33e1684964dfad", + "ePIC_PID": "http://hdl.handle.net/11304/2d808cf1-d629-497a-9999-a9ce212deb06", + "key": "D_Precip7_LTER_EU_DE_013_15-03-2017.csv", + "size": 9544325, + "version_id": "8a0ae46b-eed3-4a60-82c6-6151761e17d2" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:6831fc601ecf5ba40053fea5515992d1", + "ePIC_PID": "http://hdl.handle.net/11304/9f3998ae-f922-4406-89b0-86906a4478db", + "key": "D_Precip9_LTER_EU_DE_013_15-03-2017.csv", + "size": 9543481, + "version_id": "41f59617-2578-4a4c-ac92-c913fb5e2a06" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:b2950a788edf39847d7175fea33520a2", + "ePIC_PID": "http://hdl.handle.net/11304/ee24cf51-6997-4978-815b-f4aadf022d66", + "key": "eLTER_T3.4_VA_DataReporting_v1.2_eifel.xls", + "size": 189952, + "version_id": "85091b50-3033-4c66-af0f-5321ca6f4f64" + } + ], + "id": "b56cd875765a403599859177fced08ae", + "links": { + "files": "https://b2share.eudat.eu/api/files/91b00ebf-6f03-4ace-8498-f70e52492aee", + "self": "https://b2share.eudat.eu/api/records/b56cd875765a403599859177fced08ae", + "versions": "https://b2share.eudat.eu/api/records/1af0379d227744b1bdc2bc89b477b781/versions" + }, + "metadata": { + "$schema": "https://b2share.eudat.eu/api/communities/d952913c-451e-4b5c-817e-d578dc8a4469/schemas/0#/json_schema", + "DOI": "https://doi.org/10.23728/notARealDoiNorAFile", + "community": "d952913c-451e-4b5c-817e-d578dc8a4469", + "community_specific": { + "27193e5b-97e6-4f6f-8e87-3694589bcebe": {} + }, + "contact_email": "h.bogena@fz-juelich.de", + "creators": [ + { + "creator_name": "Heye Bogena" + } + ], + "descriptions": [ + { + "description": "10 minute interval temperature and precipitation in \u00b0C and mm from different sensors of a meteorological station.Sensor names are temperature at 2m, Precipitation_Cum10min_OttNRTtotal, Precipitation_Cum10min_OttRTNRT, 
Precipitation_Cum10min_OttNRT, Precipitation_Cum10min_Ecotech, Precipitation_Cum10min_RainCap", + "description_type": "Other" + } + ], + "disciplines": [ + { + "discipline_name": "3.3.2 \u2192 Earth sciences \u2192 Environmental science" + } + ], + "ePIC_PID": "http://hdl.handle.net/11304/545c3088-61be-4def-bdb3-d3e0f7330268", + "keywords": [ + { + "keyword": "Meteorology" + } + ], + "open_access": true, + "owners": [ + 636 + ], + "publication_date": "2017-03-16", + "publication_state": "published", + "titles": [ + { + "title": "TERENO W\u00fcstebach meteorological data" + } + ] + }, + "updated": "2018-01-11T13:50:07.974669+00:00" + } + ], + "total": 1004 + }, + "links": { + "next": "https://b2share.eudat.eu/api/records/?page=2&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1", + "self": "https://b2share.eudat.eu/api/records/?page=1&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1" + } +} diff --git a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json new file mode 100644 index 000000000..7f085744c --- /dev/null +++ b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json @@ -0,0 +1,128 @@ +{ + "aggregations": { + "type": { + "buckets": [], + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0 + } + }, + "hits": { + "hits": [ + { + "created": "2017-03-16T11:35:23.358952+00:00", + "files": [ + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:16f2cc7e68d7851bd68c2b9f6fab6ced", + "ePIC_PID": "http://hdl.handle.net/11304/be3f5ea7-4c9c-4e98-85a0-e498523a1df7", + "key": "D_AirTemperature_2m_LTER_EU_DE_013_15-03-2017.csv", + "size": 25782444, + "version_id": "62b4afdd-efee-4cd3-95af-48db2aa8165f" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:e221eec47dc4ed0b68ef508ff3435e98", + "ePIC_PID": 
"http://hdl.handle.net/11304/7f204427-d634-436f-a175-065cdeae1d11", + "key": "D_Precip11_LTER_EU_DE_013_15-03-2017.csv", + "size": 20423694, + "version_id": "53318bf5-9fa2-45d0-8cbd-48cb3273f5ee" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:1bfcda02ec3a819b9334e7915fca05ff", + "ePIC_PID": "http://hdl.handle.net/11304/c17618b8-7901-4633-8f1f-8adf9b570c1f", + "key": "D_Precip13_LTER_EU_DE_013_15-03-2017.csv", + "size": 20472456, + "version_id": "0a0e5090-947c-4100-a794-ea59640395c7" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:3d4164f4ffc838bf9d2dd2cf39860bb0", + "ePIC_PID": "http://hdl.handle.net/11304/dd0412f2-f45d-4f1d-808e-1a1145a8ee26", + "key": "D_Precip5_LTER_EU_DE_013_15-03-2017.csv", + "size": 10351327, + "version_id": "c132ccbf-f411-4ad3-ba72-fbdc6871b5ce" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:be3344fa62dd6c6c7f33e1684964dfad", + "ePIC_PID": "http://hdl.handle.net/11304/2d808cf1-d629-497a-9999-a9ce212deb06", + "key": "D_Precip7_LTER_EU_DE_013_15-03-2017.csv", + "size": 9544325, + "version_id": "8a0ae46b-eed3-4a60-82c6-6151761e17d2" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:6831fc601ecf5ba40053fea5515992d1", + "ePIC_PID": "http://hdl.handle.net/11304/9f3998ae-f922-4406-89b0-86906a4478db", + "key": "D_Precip9_LTER_EU_DE_013_15-03-2017.csv", + "size": 9543481, + "version_id": "41f59617-2578-4a4c-ac92-c913fb5e2a06" + }, + { + "bucket": "91b00ebf-6f03-4ace-8498-f70e52492aee", + "checksum": "md5:b2950a788edf39847d7175fea33520a2", + "ePIC_PID": "http://hdl.handle.net/11304/ee24cf51-6997-4978-815b-f4aadf022d66", + "key": "eLTER_T3.4_VA_DataReporting_v1.2_eifel.xls", + "size": 189952, + "version_id": "85091b50-3033-4c66-af0f-5321ca6f4f64" + } + ], + "id": "b56cd875765a403599859177fced08ae", + "links": { + "files": "https://b2share.eudat.eu/api/files/91b00ebf-6f03-4ace-8498-f70e52492aee", + "self": 
"https://b2share.eudat.eu/api/records/b56cd875765a403599859177fced08ae", + "versions": "https://b2share.eudat.eu/api/records/1af0379d227744b1bdc2bc89b477b781/versions" + }, + "metadata": { + "$schema": "https://b2share.eudat.eu/api/communities/d952913c-451e-4b5c-817e-d578dc8a4469/schemas/0#/json_schema", + "DOI": "https://doi.org/10.23728/b2share.b56cd875765a403599859177fced08ae", + "community": "d952913c-451e-4b5c-817e-d578dc8a4469", + "community_specific": { + "27193e5b-97e6-4f6f-8e87-3694589bcebe": {} + }, + "contact_email": "h.bogena@fz-juelich.de", + "creators": [ + { + "creator_name": "Heye Bogena" + } + ], + "descriptions": [ + { + "description": "10 minute interval temperature and precipitation in \u00b0C and mm from different sensors of a meteorological station.Sensor names are temperature at 2m, Precipitation_Cum10min_OttNRTtotal, Precipitation_Cum10min_OttRTNRT, Precipitation_Cum10min_OttNRT, Precipitation_Cum10min_Ecotech, Precipitation_Cum10min_RainCap", + "description_type": "Other" + } + ], + "disciplines": [ + { + "discipline_name": "3.3.2 \u2192 Earth sciences \u2192 Environmental science" + } + ], + "ePIC_PID": "http://hdl.handle.net/11304/545c3088-61be-4def-bdb3-d3e0f7330268", + "keywords": [ + { + "keyword": "Meteorology" + } + ], + "open_access": true, + "owners": [ + 636 + ], + "publication_date": "2017-03-16", + "publication_state": "published", + "titles": [ + { + "title": "TERENO W\u00fcstebach meteorological data" + } + ] + }, + "updated": "2018-01-11T13:50:07.974669+00:00" + } + ], + "total": 1004 + }, + "links": { + "next": "https://b2share.eudat.eu/api/records/?page=2&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1", + "self": "https://b2share.eudat.eu/api/records/?page=1&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1" + } +} From 72e80224c23bb19928654565efc49d75566bc012 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Wed, 2 Aug 2023 16:27:41 +0100 Subject: [PATCH 5/9] Refactor to use 
B2SHARE API instead of Datacite Datacite metadata is incomplete compared to B2SHARE data and B2SHARE record listings contain the complete records anyway, so we just use the B2SHARE records directly. We also implement a blacklist of DOIs which have already been imported separately to the production catalogue - these records are skipped. The new createDocumentFromJson() method handles the parsing of the B2SHARE record JSON. The exception is the importId, since we need to check the DOI of a record before importing it in runImport(). Jira: ELTER-20 --- .../catalogue/elter/B2shareImportService.java | 262 +++++++++++++----- .../src/main/resources/application.properties | 2 +- .../elter/B2shareImportServiceTest.java | 14 +- .../elter/b2share-invalid-api-response.json | 5 +- .../elter/b2share-valid-api-response.json | 3 +- 5 files changed, 198 insertions(+), 88 deletions(-) diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java index ebf5d61e8..2c386450c 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java @@ -3,7 +3,6 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import java.io.FileNotFoundException; import java.io.IOException; import java.net.URL; import java.time.ZoneId; @@ -33,9 +32,12 @@ import uk.ac.ceh.gateway.catalogue.TimeConstants; import uk.ac.ceh.gateway.catalogue.elter.ElterDocument; +import uk.ac.ceh.gateway.catalogue.gemini.AccessLimitation; +import uk.ac.ceh.gateway.catalogue.gemini.OnlineResource; import uk.ac.ceh.gateway.catalogue.imports.CatalogueImportService; import uk.ac.ceh.gateway.catalogue.model.CatalogueUser; import uk.ac.ceh.gateway.catalogue.model.MetadataDocument; +import uk.ac.ceh.gateway.catalogue.model.ResponsibleParty; import 
uk.ac.ceh.gateway.catalogue.publication.PublicationService; import uk.ac.ceh.gateway.catalogue.repository.DocumentRepository; @@ -50,8 +52,64 @@ public class B2shareImportService implements CatalogueImportService { private final Pattern p; private final PublicationService publicationService; private final SolrClient solrClient; - private final String b2shareRecordsUrl; - private final String dataciteApiRoot; + private String b2shareRecordsUrl; + + private static final List blacklistedDois = List.of( + "10.23728/b2share.09454896da99494f931be25e279658ef", + "10.23728/b2share.16b4760bb98642fc97730e32bac39e63", + "10.23728/b2share.192a897c521b4c6babfc1b9db65163b2", + "10.23728/b2share.1bd8c763c4524168aecafad6d941c5c0", + "10.23728/b2share.227bede4ca97433bb86405ad30c8b0f4", + "10.23728/b2share.2854520d9e4947e68ddb6a0a270a4b5f", + "10.23728/b2share.2f3df5f7baf04288853f24121d0810cd", + "10.23728/b2share.34a1f1b96ab6404983a3da0f3e13713f", + "10.23728/b2share.362d6bae28094f788463f6ada0a5203b", + "10.23728/b2share.388b113860ae4b4da48214c18f30b91f", + "10.23728/b2share.399c0653601a46488082e0f5edafc0df", + "10.23728/b2share.426cc15f73dc47b98b6b2af9d740ea98", + "10.23728/b2share.44219e37ca9045779ede02ac4feb418a", + "10.23728/b2share.58dde320216e41a5a97b8f4b287efb4b", + "10.23728/b2share.58eb7b9a8b3c466783762cb15dcd3898", + "10.23728/b2share.5919b9ca10044e64aef836f448d69a3e", + "10.23728/b2share.5b175810bd504f2f8607a9ed9f078809", + "10.23728/b2share.5ed401dfe7274f268fcfed2aa595df70", + "10.23728/b2share.6175e4fe23194eed8f63325cae7b1131", + "10.23728/b2share.61c4fc815b044ed2a80321fef27c7d32", + "10.23728/b2share.62e30863d25542f5a9e6e7ce00d08b9f", + "10.23728/b2share.63ff6e748ff34edd8b314805c73b7ffe", + "10.23728/b2share.6436f257b9e44c3c81e614e6d68c5083", + "10.23728/b2share.6754e1d72a75478090a84216c0321d4d", + "10.23728/b2share.68d0d103ee904b3886ac19d455f4089a", + "10.23728/b2share.69b11c4e7a3b4762b80951728b030ba5", + "10.23728/b2share.6de1e903ae0445a487cb219142bbcfb2", + 
"10.23728/b2share.79905f4696814887a7d186643c30f962", + "10.23728/b2share.81f34962de634f5e9effcacba3b40f97", + "10.23728/b2share.8c4b87b485f5493bbfbcd1ce1ee6dd6e", + "10.23728/b2share.8d628873b8c147f09a3b74090ff65b08", + "10.23728/b2share.913069fbb66740a7a80d5de7ce32edb7", + "10.23728/b2share.914c38d9675149c8bc18e0be48cf6ef3", + "10.23728/b2share.9752dbeeab904279a422d7fae4b31ae4", + "10.23728/b2share.9a664abd7ebb4db6b26d9d955147490e", + "10.23728/b2share.9a8aa6c218e54a3d8c08e3912e6364de", + "10.23728/b2share.a6ef9b030de240f198581261be9de6c1", + "10.23728/b2share.ab59b8caa1aa484bbdcaefad9a9a7437", + "10.23728/b2share.b31d67e6a3a6468db8a3f2939ecbbe4c", + "10.23728/b2share.b36f78e35b4a42439f0e3730323cf8c3", + "10.23728/b2share.b9a0acc25ca74ec9ace2bdc9a6799e97", + "10.23728/b2share.c4124d84f54b4d09826236514102003a", + "10.23728/b2share.c7ffe7052f2f4f73bc2f1770e6716d7d", + "10.23728/b2share.cafe3eb1d4ef4e648bd4f8f058ab5dfb", + "10.23728/b2share.d84a0855ef7241839da4b0f4644e8553", + "10.23728/b2share.dde42ccc994b4702b396225d58b0049a", + "10.23728/b2share.df83c69cc955416f9fb847ba86b3141b", + "10.23728/b2share.ebd200fa5dac41dc93a3db74b19087be", + "10.23728/b2share.ec4fa718a1a1482cae299268615668f0", + "10.23728/b2share.f42f4fec3729435eb371bb67825715d0", + "10.23728/b2share.f68dd6f4a9384b449a8d6ebb18ae02bb", + "10.23728/b2share.f97c8289132b4ba19a3e21c5032c910b", + "10.23728/b2share.fa82d241391c4b9e867731b3b9e82f7d", + "10.23728/b2share.fadbfb30fdd543f4a3af31130d6dc685" + ); // constructor @SneakyThrows @@ -59,8 +117,7 @@ public B2shareImportService( DocumentRepository documentRepository, PublicationService publicationService, SolrClient solrClient, - @Value("${b2share.api}") String b2shareRecordsUrl, - @Value("${doi.api}") String dataciteApiRoot + @Value("${b2share.api}") String b2shareRecordsUrl ) { log.info("Creating"); @@ -70,29 +127,9 @@ public B2shareImportService( this.publicationService = publicationService; this.solrClient = solrClient; this.b2shareRecordsUrl = 
b2shareRecordsUrl; - this.dataciteApiRoot = dataciteApiRoot; } // methods start here - @SneakyThrows - private List getRemoteRecordList() { - log.debug("GET B2SHARE records at {}", b2shareRecordsUrl); - - // prep - List results = new ArrayList<>(); - - JsonNode b2shareRecords = objectMapper.readTree(new URL(b2shareRecordsUrl)); - - for (JsonNode node : b2shareRecords.path("hits").path("hits")){ - String doi = node.path("metadata").path("DOI").asText(); - if (!doi.equals("")){ - results.add(doi); - } - } - - return results; - } - @SneakyThrows private Map getLocalRecordMapping() throws IOException { log.debug("GET locally imported records"); @@ -132,21 +169,91 @@ private Map getLocalRecordMapping() throws IOException { } @SneakyThrows - private ElterDocument getFullRemoteRecord(String inputDoi) { - String recordUrl = dataciteApiRoot + "/" + inputDoi; - JsonNode dataciteJson = null; - - log.info("GET {}", recordUrl); - try { - dataciteJson = objectMapper.readTree(new URL(recordUrl)); - } catch (FileNotFoundException e) { - return null; + private ElterDocument createDocumentFromJson(JsonNode inputJson) { + // create document from B2SHARE API JSON + ElterDocument newDocument = new ElterDocument(); + JsonNode metadataJson = inputJson.get("metadata"); + + // fields from JSON / import metadata + // ensure title is set to something + JsonNode jsonTitles = metadataJson.path("titles"); + int numTitles = jsonTitles.size(); + if (numTitles == 0){ + newDocument.setTitle("TITLE MISSING"); } + else { + newDocument.setTitle(jsonTitles.get(0).get("title").asText()); + ArrayList alternativeTitles = new ArrayList<>(); + for (int i = 1; i < numTitles; i++){ + alternativeTitles.add(jsonTitles.get(i).get("title").asText()); + } + newDocument.setAlternateTitles(alternativeTitles); + } + // description + StringBuilder descriptionBuilder = new StringBuilder(); + for (JsonNode node : metadataJson.path("descriptions")) { + if (descriptionBuilder.length() > 0){ + 
descriptionBuilder.append("\n\n"); + } + JsonNode descriptionTypeNode = node.get("description_type"); + if (descriptionTypeNode != null) { + String descriptionType = descriptionTypeNode.asText(); + if (! descriptionType.equals("Other")) { + descriptionBuilder.append(descriptionType + ": "); + } + } + descriptionBuilder.append(node.get("description").asText().strip()); + } + newDocument.setDescription(descriptionBuilder.toString()); + // authors and contact_email (separate field) + ArrayList contactList = new ArrayList<>(); + // creators + JsonNode creators = metadataJson.path("creators"); + if (! creators.isMissingNode()) { + for (JsonNode creatorNode : creators) { + ResponsibleParty creator = ResponsibleParty.builder() + .individualName(creatorNode.get("creator_name").asText()) + .role("author") + .build(); + contactList.add(creator); + } + } + // email + JsonNode contactEmail = metadataJson.path("contact_email"); + if (! contactEmail.isMissingNode()) { + ResponsibleParty contact = ResponsibleParty.builder() + .email(contactEmail.asText()) + .role("pointOfContact") + .build(); + contactList.add(contact); + } + newDocument.setResponsibleParties(contactList); + // onlineresources + ArrayList resources = new ArrayList<>(); + resources.add( + OnlineResource.builder() + .url(inputJson.get("links").get("self").asText()) + .name("View record") + .description("View record at this link") + .function("information") + .build() + ); + newDocument.setOnlineResources(resources); + // metadata + newDocument.setImportLastModified(ZonedDateTime.now(ZoneId.of("UTC"))); + + // fixed fields + newDocument.setAccessLimitation( + AccessLimitation.builder() + .value("no limitations to public access") + .code("Available") + .uri("http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations") + .build() + ); + newDocument.setType("signpost"); + newDocument.setDataLevel("Level 0"); - ElterDocument document = new ElterDocument(); - 
document.importDataciteJson(dataciteJson); - - return document; + return newDocument; } @SneakyThrows @@ -183,20 +290,18 @@ private void updateRecord(String localRecordId, String remoteRecordId, ElterDocu log.debug("Successfully updated record {}", remoteRecordId); } + @SneakyThrows @Scheduled(initialDelay = TimeConstants.ONE_MINUTE, fixedDelay = TimeConstants.SEVEN_DAYS) public void runImport(){ // prep log.info("Running B2SHARE metadata import..."); CatalogueUser importUser = new CatalogueUser().setUsername("B2SHARE metadata import").setEmail("info@eudat.eu"); Map localRecordList = null; - int totalRecords = 0; + int blacklistedRecords = 0; int newRecords = 0; - int updatedRecords = 0; int skippedRecords = 0; - - // get remote records - List remoteRecordList = getRemoteRecordList(); - totalRecords = remoteRecordList.size(); + int totalRecords = 0; + int updatedRecords = 0; // get local records try { @@ -207,37 +312,47 @@ public void runImport(){ } // ready to import - for (String recordDoiOrgUrl : remoteRecordList){ - // normalise input to DOI - Matcher doiCheck = p.matcher(recordDoiOrgUrl); - if (!doiCheck.find()) { - log.info("No DOI detected in record {}", recordDoiOrgUrl); - skippedRecords++; - continue; - } - - // try resolving DOI with datacite - String recordDoi = doiCheck.group(0); - ElterDocument remoteRecord = getFullRemoteRecord(recordDoi); - if (remoteRecord == null){ - log.info("DOI {} does not exist", recordDoi); - skippedRecords++; - continue; - } - - // ready to import - remoteRecord.setImportId(recordDoi); - remoteRecord.setImportLastModified(ZonedDateTime.now(ZoneId.of("UTC"))); - if (localRecordList.containsKey(recordDoi)) { - updateRecord(localRecordList.get(recordDoi), recordDoi, remoteRecord, importUser); - updatedRecords++; - } - else { - String newId = createRecord(recordDoi, remoteRecord, importUser); - log.debug("New document ID is {}", newId); - newRecords++; + JsonNode b2shareRecords = null; + while (! 
b2shareRecordsUrl.equals("")) { + // get next page of records + b2shareRecords = objectMapper.readTree(new URL(b2shareRecordsUrl)); + + for (JsonNode record : b2shareRecords.path("hits").path("hits")){ + // process each record on page + // normalise DOI to actual DOI, i.e. "10.xxx.../xxxx" + String originalDoi = record.get("metadata").get("DOI").asText(); + Matcher doiCheck = p.matcher(originalDoi); + if (!doiCheck.find()) { + log.debug("No DOI detected in record {}", originalDoi); + skippedRecords++; + continue; + } + String normalisedDoi = doiCheck.group(0); + if (blacklistedDois.contains(normalisedDoi)) { + log.debug("Skipping blacklisted doi {}", normalisedDoi); + skippedRecords++; + blacklistedRecords++; + continue; + } + + // ready to import, as we have a record and a correctly-structured DOI + // to use as the importId + ElterDocument recordDocument = createDocumentFromJson(record); + recordDocument.setImportId(normalisedDoi); + + if (localRecordList.containsKey(normalisedDoi)) { + updateRecord(localRecordList.get(normalisedDoi), normalisedDoi, recordDocument, importUser); + updatedRecords++; + } + else { + String newId = createRecord(normalisedDoi, recordDocument, importUser); + log.debug("New document ID is {}", newId); + newRecords++; + } } + b2shareRecordsUrl = b2shareRecords.path("links").path("next").asText(); } + totalRecords = b2shareRecords.get("hits").get("total").asInt(); // finished, log summary log.info("Finished B2SHARE metadata import!"); @@ -248,5 +363,6 @@ public void runImport(){ newRecords + updatedRecords + skippedRecords, totalRecords ); + log.info("{}/{} blacklisted DOIs were skipped", blacklistedRecords, blacklistedDois.size()); } } diff --git a/java/src/main/resources/application.properties b/java/src/main/resources/application.properties index 6589bf33e..cce6edb05 100644 --- a/java/src/main/resources/application.properties +++ b/java/src/main/resources/application.properties @@ -1,7 +1,7 @@ # suppress inspection "UnusedProperty" for 
whole file # suppress inspection "HttpUrlsUsage" for whole file # suppress inspection "SpringBootApplicationProperties" for whole file -b2share.api=https://b2share.eudat.eu/api/records/?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=5 +b2share.api=https://b2share.eudat.eu/api/records/?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=100 crowd.address=https://crowd.ceh.ac.uk/crowd/rest/usermanagement/latest crowd.username=eip-ro data.repository.location=/var/ceh-catalogue/datastore diff --git a/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java b/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java index 16476e515..d906845f8 100644 --- a/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java +++ b/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java @@ -47,9 +47,8 @@ public class B2shareImportServiceTest { private QueryResponse queryResponse; private CatalogueUser expectedUser; - String b2shareResponse = getClass().getResource("b2share-valid-api-response.json").toString(); - String invalidB2shareResponse = getClass().getResource("b2share-invalid-api-response.json").toString(); - String dataciteTestUrl = b2shareResponse.substring(0,b2shareResponse.lastIndexOf("/")); + String b2shareResponseUrl = getClass().getResource("b2share-valid-api-response.json").toString(); + String invalidB2shareResponseUrl = getClass().getResource("b2share-invalid-api-response.json").toString(); private static final String CATALOGUE = "elter"; private static final String RECORD_ID = "00000000-0000-0000-0000-000000000000"; @@ -75,8 +74,7 @@ public void importNewRecord() { documentRepository, publicationService, solrClient, - b2shareResponse, - dataciteTestUrl + b2shareResponseUrl ); // given @@ -135,8 +133,7 @@ public void updateExistingRecord() { documentRepository, publicationService, solrClient, - b2shareResponse, - dataciteTestUrl + b2shareResponseUrl ); Map 
solrFieldMapping = new HashMap<>(); @@ -205,8 +202,7 @@ public void skipInvalidRecord() { documentRepository, publicationService, solrClient, - invalidB2shareResponse, - dataciteTestUrl + invalidB2shareResponseUrl ); // given diff --git a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json index 52366721b..1b21f7a44 100644 --- a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json +++ b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-invalid-api-response.json @@ -76,7 +76,7 @@ }, "metadata": { "$schema": "https://b2share.eudat.eu/api/communities/d952913c-451e-4b5c-817e-d578dc8a4469/schemas/0#/json_schema", - "DOI": "https://doi.org/10.23728/notARealDoiNorAFile", + "DOI": "This is not a DOI", "community": "d952913c-451e-4b5c-817e-d578dc8a4469", "community_specific": { "27193e5b-97e6-4f6f-8e87-3694589bcebe": {} @@ -119,10 +119,9 @@ "updated": "2018-01-11T13:50:07.974669+00:00" } ], - "total": 1004 + "total": 1 }, "links": { - "next": "https://b2share.eudat.eu/api/records/?page=2&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1", "self": "https://b2share.eudat.eu/api/records/?page=1&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1" } } diff --git a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json index 7f085744c..51e75e11a 100644 --- a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json +++ b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json @@ -119,10 +119,9 @@ "updated": "2018-01-11T13:50:07.974669+00:00" } ], - "total": 1004 + "total": 1 }, "links": { - "next": 
"https://b2share.eudat.eu/api/records/?page=2&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1", "self": "https://b2share.eudat.eu/api/records/?page=1&sort=bestmatch&q=community%3Ad952913c-451e-4b5c-817e-d578dc8a4469&size=1" } } From c4c8c39aff5f8584c5dc42fe0692f289910690a2 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Mon, 7 Aug 2023 14:26:27 +0100 Subject: [PATCH 6/9] Add extra B2SHARE API field mappings We add some extra fields that either weren't available in Datacite or just hadn't been mapped yet. These are: - created + publication_date - open_access - proper link as supplemental - community_specific...metadata_url (deims site) - disciplines + keywords Of note is the extra code to query for deims sites and to parse all the different timestamp formats found in the wild at the time of development. We also change the main link to a direct b2share url instead of going through doi.org (which matches the other imports) and move the doi.org link to the supplemental section. Tweak some declarations of Lists to List<T> bar = new ArrayList<>(); format. Add "Unknown" as an organisationName to the author so that the author displays. At the time of writing no authors have an institute in the API.
Jira: ELTER-20 --- .../catalogue/elter/B2shareImportService.java | 133 +++++++++++++++--- 1 file changed, 117 insertions(+), 16 deletions(-) diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java index 2c386450c..68c27e239 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java @@ -5,8 +5,12 @@ import java.io.IOException; import java.net.URL; +import java.time.LocalDate; import java.time.ZoneId; import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -20,6 +24,7 @@ import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.CommonParams; @@ -31,13 +36,17 @@ import org.springframework.stereotype.Service; import uk.ac.ceh.gateway.catalogue.TimeConstants; +import uk.ac.ceh.gateway.catalogue.deims.DeimsSolrIndex; import uk.ac.ceh.gateway.catalogue.elter.ElterDocument; import uk.ac.ceh.gateway.catalogue.gemini.AccessLimitation; +import uk.ac.ceh.gateway.catalogue.gemini.DatasetReferenceDate; +import uk.ac.ceh.gateway.catalogue.gemini.Keyword; import uk.ac.ceh.gateway.catalogue.gemini.OnlineResource; import uk.ac.ceh.gateway.catalogue.imports.CatalogueImportService; import uk.ac.ceh.gateway.catalogue.model.CatalogueUser; import uk.ac.ceh.gateway.catalogue.model.MetadataDocument; import uk.ac.ceh.gateway.catalogue.model.ResponsibleParty; +import uk.ac.ceh.gateway.catalogue.model.Supplemental; import 
uk.ac.ceh.gateway.catalogue.publication.PublicationService; import uk.ac.ceh.gateway.catalogue.repository.DocumentRepository; @@ -47,13 +56,25 @@ @ToString public class B2shareImportService implements CatalogueImportService { // constructor prep + private final DateTimeFormatter dateParser; private final DocumentRepository documentRepository; private final ObjectMapper objectMapper; - private final Pattern p; + private final Pattern deimsIdNormalise; + private final Pattern doiNormalise; private final PublicationService publicationService; private final SolrClient solrClient; private String b2shareRecordsUrl; + // fixed fields + private static final AccessLimitation openAccess = AccessLimitation.builder() + .value("no limitations to public access") + .code("Available") + .uri("http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations") + .build(); + private static final AccessLimitation controlledAccess = AccessLimitation.builder() + .value("To access this data, a licence needs to be negotiated with the provider and there may be a cost") + .code("Controlled") + .build(); private static final List blacklistedDois = List.of( "10.23728/b2share.09454896da99494f931be25e279658ef", "10.23728/b2share.16b4760bb98642fc97730e32bac39e63", @@ -121,12 +142,21 @@ public B2shareImportService( ) { log.info("Creating"); + this.b2shareRecordsUrl = b2shareRecordsUrl; + this.dateParser = new DateTimeFormatterBuilder() + // order matters! 
put patterns containing others first + .appendPattern("[y-M-d'T'H:m:s.nXXX]") + .appendPattern("[y-M-d'T'H:m:s.nxxx]") + .appendPattern("[y-M-d]") + .appendPattern("[d.M.yyyy]") + .appendPattern("[d/M/yyyy]") + .toFormatter(); this.documentRepository = documentRepository; this.objectMapper = new ObjectMapper(); - this.p = Pattern.compile("10\\.\\S+/\\S+"); + this.deimsIdNormalise = Pattern.compile("\\p{XDigit}{8}-\\p{XDigit}{4}-\\p{XDigit}{4}-\\p{XDigit}{4}-\\p{XDigit}{12}"); + this.doiNormalise = Pattern.compile("10\\.\\S+/\\S+"); this.publicationService = publicationService; this.solrClient = solrClient; - this.b2shareRecordsUrl = b2shareRecordsUrl; } // methods start here @@ -168,6 +198,15 @@ private Map getLocalRecordMapping() throws IOException { return resultMapping; } + @SneakyThrows + private List getDeimsSite(String id){ + SolrQuery query = new SolrQuery(); + query.setQuery(id); + query.setParam(CommonParams.DF, "id"); + QueryResponse response = solrClient.query("deims", query, POST); + return response.getBeans(DeimsSolrIndex.class); + } + @SneakyThrows private ElterDocument createDocumentFromJson(JsonNode inputJson) { // create document from B2SHARE API JSON @@ -183,7 +222,7 @@ private ElterDocument createDocumentFromJson(JsonNode inputJson) { } else { newDocument.setTitle(jsonTitles.get(0).get("title").asText()); - ArrayList alternativeTitles = new ArrayList<>(); + List alternativeTitles = new ArrayList<>(); for (int i = 1; i < numTitles; i++){ alternativeTitles.add(jsonTitles.get(i).get("title").asText()); } @@ -206,7 +245,7 @@ private ElterDocument createDocumentFromJson(JsonNode inputJson) { } newDocument.setDescription(descriptionBuilder.toString()); // authors and contact_email (separate field) - ArrayList contactList = new ArrayList<>(); + List contactList = new ArrayList<>(); // creators JsonNode creators = metadataJson.path("creators"); if (! 
creators.isMissingNode()) { @@ -214,6 +253,7 @@ private ElterDocument createDocumentFromJson(JsonNode inputJson) { ResponsibleParty creator = ResponsibleParty.builder() .individualName(creatorNode.get("creator_name").asText()) .role("author") + .organisationName("Unknown") .build(); contactList.add(creator); } @@ -229,27 +269,88 @@ private ElterDocument createDocumentFromJson(JsonNode inputJson) { } newDocument.setResponsibleParties(contactList); // onlineresources - ArrayList resources = new ArrayList<>(); + List resources = new ArrayList<>(); resources.add( OnlineResource.builder() - .url(inputJson.get("links").get("self").asText()) + .url("https://b2share.eudat.eu/records/" + inputJson.get("id").asText()) .name("View record") .description("View record at this link") .function("information") .build() ); newDocument.setOnlineResources(resources); - // metadata + // reference dates + LocalDate created = null; + LocalDate published = null; + String createdTimestamp = inputJson.get("created").asText(); + String publishedTimestamp = metadataJson.path("publication_date").asText(); + try { + created = LocalDate.parse(createdTimestamp, dateParser); + } catch (DateTimeParseException e) { + log.debug("invalid created date {}", createdTimestamp); + } + try { + if (!publishedTimestamp.equals("")) { + published = LocalDate.parse(publishedTimestamp, dateParser); + } + } catch (DateTimeParseException e) { + log.debug("invalid publication_date date {}", publishedTimestamp); + } + newDocument.setDatasetReferenceDate( + DatasetReferenceDate.builder() + .creationDate(created) + .publicationDate(published) + .build() + ); + // supplemental / DOI link + String recordDoi = metadataJson.get("DOI").asText(); + List supplemental = new ArrayList<>(); + supplemental.add( + Supplemental.builder() + .name(recordDoi) + .description("Resolve record DOI at doi.org") + .url(recordDoi) + .build() + ); + newDocument.setSupplemental(supplemental); + // access + boolean isOpenAccess = 
metadataJson.get("open_access").booleanValue(); + if (isOpenAccess) { + newDocument.setAccessLimitation(openAccess); + } else { + newDocument.setAccessLimitation(controlledAccess); + } + // deims site + String metadataUrl = metadataJson + .path("community_specific") + .path("27193e5b-97e6-4f6f-8e87-3694589bcebe") + .path("metadata_url") + .asText() + .toLowerCase(); + Matcher deimsCheck = deimsIdNormalise.matcher(metadataUrl); + if (deimsCheck.find()) { + String normalisedDeimsId = deimsCheck.group(0); + newDocument.setDeimsSites(getDeimsSite(normalisedDeimsId)); + } + // disciplines and keywords + List keywords = new ArrayList<>(); + for (JsonNode node : metadataJson.path("keywords")) { + Keyword keyword = Keyword.builder() + .value(node.get("keyword").asText()) + .build(); + keywords.add(keyword); + } + for (JsonNode node : metadataJson.path("disciplines")) { + Keyword keyword = Keyword.builder() + .value(node.get("discipline_name").asText()) + .build(); + keywords.add(keyword); + } + newDocument.setKeywords(keywords); + // import metadata newDocument.setImportLastModified(ZonedDateTime.now(ZoneId.of("UTC"))); // fixed fields - newDocument.setAccessLimitation( - AccessLimitation.builder() - .value("no limitations to public access") - .code("Available") - .uri("http://inspire.ec.europa.eu/metadata-codelist/LimitationsOnPublicAccess/noLimitations") - .build() - ); newDocument.setType("signpost"); newDocument.setDataLevel("Level 0"); @@ -321,7 +422,7 @@ public void runImport(){ // process each record on page // normalise DOI to actual DOI, i.e. 
"10.xxx.../xxxx" String originalDoi = record.get("metadata").get("DOI").asText(); - Matcher doiCheck = p.matcher(originalDoi); + Matcher doiCheck = doiNormalise.matcher(originalDoi); if (!doiCheck.find()) { log.debug("No DOI detected in record {}", originalDoi); skippedRecords++; From 849632966b061d19e32d29fe52e81277f6ed00b5 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Mon, 7 Aug 2023 16:46:34 +0100 Subject: [PATCH 7/9] Only publish new B2SHARE documents with DEIMS IDs Jira: ELTER-20 --- .../catalogue/elter/B2shareImportService.java | 17 ++++++++++++----- .../elter/B2shareImportServiceTest.java | 13 +++++++++++++ .../elter/b2share-valid-api-response.json | 4 +++- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java index 68c27e239..c7ccc608a 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java @@ -367,10 +367,6 @@ private String createRecord(String remoteRecordId, ElterDocument newRecord, Cata "Create new record " + remoteRecordId ); - // publish new record - publicationService.transition(user, savedDocument.getId(), "ykhm7b"); - publicationService.transition(user, savedDocument.getId(), "re4vkb"); - // success log.debug("Successfully imported record {}", remoteRecordId); return savedDocument.getId(); @@ -400,6 +396,7 @@ public void runImport(){ Map localRecordList = null; int blacklistedRecords = 0; int newRecords = 0; + int publishedRecords = 0; int skippedRecords = 0; int totalRecords = 0; int updatedRecords = 0; @@ -449,6 +446,15 @@ public void runImport(){ String newId = createRecord(normalisedDoi, recordDocument, importUser); log.debug("New document ID is {}", newId); newRecords++; + + // publish new record IF it has a DEIMS site linked to it + List deimsSites = 
recordDocument.getDeimsSites(); + if (deimsSites != null && deimsSites.size() > 0) { + publicationService.transition(importUser, newId, "ykhm7b"); + publicationService.transition(importUser, newId, "re4vkb"); + log.debug("Successfully detected DEIMS ID and published record {}", recordDocument.getId()); + publishedRecords++; + } } } b2shareRecordsUrl = b2shareRecords.path("links").path("next").asText(); @@ -457,8 +463,9 @@ public void runImport(){ // finished, log summary log.info("Finished B2SHARE metadata import!"); - log.info("{} created + {} updated + {} skipped = {} total ({} records in B2SHARE)", + log.info("{} created ({} published) + {} updated + {} skipped = {} total ({} records in B2SHARE)", newRecords, + publishedRecords, updatedRecords, skippedRecords, newRecords + updatedRecords + skippedRecords, diff --git a/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java b/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java index d906845f8..6256247ee 100644 --- a/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java +++ b/java/src/test/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportServiceTest.java @@ -1,5 +1,6 @@ package uk.ac.ceh.gateway.catalogue.elter; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -22,6 +23,7 @@ import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import uk.ac.ceh.gateway.catalogue.deims.DeimsSolrIndex; import uk.ac.ceh.gateway.catalogue.model.CatalogueUser; import uk.ac.ceh.gateway.catalogue.publication.PublicationService; import uk.ac.ceh.gateway.catalogue.repository.DocumentRepository; @@ -46,6 +48,8 @@ public class B2shareImportServiceTest { private String testB2shareResponse; private QueryResponse queryResponse; private CatalogueUser expectedUser; + private List dummyDeimsSiteList; + private DeimsSolrIndex dummyDeimsSite; String b2shareResponseUrl = 
getClass().getResource("b2share-valid-api-response.json").toString(); String invalidB2shareResponseUrl = getClass().getResource("b2share-invalid-api-response.json").toString(); @@ -59,6 +63,13 @@ public class B2shareImportServiceTest { @BeforeEach void setup() { + dummyDeimsSiteList = new ArrayList<>(); + dummyDeimsSite = new DeimsSolrIndex(); + dummyDeimsSite.setTitle("Fake title"); + dummyDeimsSite.setId("Fake id"); + dummyDeimsSite.setUrl("Fake url"); + dummyDeimsSiteList.add(dummyDeimsSite); + queryResponse = mock(QueryResponse.class); expectedUser = new CatalogueUser() @@ -82,6 +93,8 @@ public void importNewRecord() { .willReturn(queryResponse); given(queryResponse.getResults()) .willReturn(new SolrDocumentList()); + given(queryResponse.getBeans(DeimsSolrIndex.class)) + .willReturn(dummyDeimsSiteList); given(documentRepository.saveNew( any(CatalogueUser.class), diff --git a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json index 51e75e11a..f80979784 100644 --- a/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json +++ b/java/src/test/resources/uk/ac/ceh/gateway/catalogue/elter/b2share-valid-api-response.json @@ -79,7 +79,9 @@ "DOI": "https://doi.org/10.23728/b2share.b56cd875765a403599859177fced08ae", "community": "d952913c-451e-4b5c-817e-d578dc8a4469", "community_specific": { - "27193e5b-97e6-4f6f-8e87-3694589bcebe": {} + "27193e5b-97e6-4f6f-8e87-3694589bcebe": { + "metadata_url": "https://deims.org/3e97f1ff-c3ad-4d19-91f9-5ee3485878cd" + } }, "contact_email": "h.bogena@fz-juelich.de", "creators": [ From 5adc3cde432eb4a83429df62b91372381cf711a9 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Mon, 7 Aug 2023 17:41:24 +0100 Subject: [PATCH 8/9] Allow for missing DOI field In the training API the DOI field is sometimes null, so maybe it is in the live too. 
Either way the record will be skipped so this prevents NPEs. Jira: ELTER-20 --- .../uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java index c7ccc608a..dfc1d6153 100644 --- a/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java +++ b/java/src/main/java/uk/ac/ceh/gateway/catalogue/elter/B2shareImportService.java @@ -418,7 +418,7 @@ public void runImport(){ for (JsonNode record : b2shareRecords.path("hits").path("hits")){ // process each record on page // normalise DOI to actual DOI, i.e. "10.xxx.../xxxx" - String originalDoi = record.get("metadata").get("DOI").asText(); + String originalDoi = record.path("metadata").path("DOI").asText(); Matcher doiCheck = doiNormalise.matcher(originalDoi); if (!doiCheck.find()) { log.debug("No DOI detected in record {}", originalDoi); From 6382a34c34b79530ee30ed32c99bfa7d18e82810 Mon Sep 17 00:00:00 2001 From: Will Bolton Date: Wed, 9 Aug 2023 16:17:57 +0100 Subject: [PATCH 9/9] Fetch B2SHARE records 500 at a time Processing individual records is quick, but getting them from the remote server is slow, so fetching more records at a time will generally speed up the import. We could probably just set the size as 9999999999 since there are ~1000 records now (and growing slowly) but let's prioritise a low memory footprint (500 records is roughly 27M). 
Jira: ELTER-20 --- java/src/main/resources/application.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/resources/application.properties b/java/src/main/resources/application.properties index cce6edb05..dbad79ab4 100644 --- a/java/src/main/resources/application.properties +++ b/java/src/main/resources/application.properties @@ -1,7 +1,7 @@ # suppress inspection "UnusedProperty" for whole file # suppress inspection "HttpUrlsUsage" for whole file # suppress inspection "SpringBootApplicationProperties" for whole file -b2share.api=https://b2share.eudat.eu/api/records/?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=100 +b2share.api=https://b2share.eudat.eu/api/records/?q=community:d952913c-451e-4b5c-817e-d578dc8a4469&size=500 crowd.address=https://crowd.ceh.ac.uk/crowd/rest/usermanagement/latest crowd.username=eip-ro data.repository.location=/var/ceh-catalogue/datastore