From a15a93c6c03c62838109b6b3d6bcca1c007028f4 Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 7 Apr 2020 18:54:34 -0700 Subject: [PATCH 01/47] Remove obsolete method --- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index 27df0a7e..b135bf08 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -1041,36 +1041,6 @@ private SubjectInfo getSubjectInfo(Subject rightsHolder, String serviceUrl, Stri return subjectInfo; } -// /** -// * Get a DataONE authenticated session -// *
-//     * If no subject or authentication token are provided, a public session is returned
-//     *
-// * @param authToken the authentication token -// * @return the DataONE session -// */ -// Session getSession(String subjectId, String authToken) { -// -// Session session; -// -// // query Solr - either the member node or cn, for the project 'solrquery' field -// if (authToken == null || authToken.isEmpty()) { -// log.debug("Creating public session"); -// session = new Session(); -// } else { -// log.debug("Creating authentication session"); -// session = new AuthTokenSession(authToken); -// } -// -// if (subjectId != null && !subjectId.isEmpty()) { -// Subject subject = new Subject(); -// subject.setValue(subjectId); -// session.setSubject(subject); -// } -// -// return session; -// } - /** * Get a DataONE MultipartCNode object, which will be used to communication with a CN * From 2874d892d60450cd08bb3662151bf92cd0fe6c4c Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 7 Apr 2020 18:55:24 -0700 Subject: [PATCH 02/47] Get rightsholder from solr, not getSystemetadata --- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index b135bf08..49a65415 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -427,6 +427,19 @@ which will be used to query DataONE Solr for all the pids associated with that p node = xpathResult.item(0); label = node.getTextContent(); } + + // Extract the portal 'rightsHolder' + fieldXpath = xpath.compile("//result/doc/str[@name='rightsHolder']/text()"); + xpathResult = (org.w3c.dom.NodeList) fieldXpath.evaluate(xmldoc, XPathConstants.NODESET); + if(xpathResult.getLength() == 0) { + log.debug("RightsHolder not found for collection id: " + collectionId); + ScorerResult result = new ScorerResult(); + result.setResult(pids); + return result; + } else { + node = xpathResult.item(0); + rightsHolder = node.getTextContent(); + } } catch (XPathExpressionException xpe) { log.error("Error extracting collectinQuery from solr result doc: " + xpe.getMessage()); metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage()); @@ -451,13 +464,6 @@ which will be used to query DataONE Solr for all the pids associated with that p // from the CN. Then add those groups into the query. 
Each group will be included in the filter query in this format: // "(readPermission:"http://orcid.org/0000-0002-2192-403X") // OR (rightsHolder:"http://orcid.org/0000-0002-2192-403X")" - SystemMetadata sysmeta = null; - try { - sysmeta = getSystemMetadata(collectionId, serviceUrl, subjectId, authToken); - } catch (MetadigProcessException mpe) { - log.error("Unable to get system metadata for collection: " + collectionId); - throw(mpe); - } Subject rightsHolder = sysmeta.getRightsHolder(); // The subject info can only be obtained from a CN, so use the CN auth info for the current DataONE environment, From f7e88840bf8044ff76c98ccd9936aa41cd085ed2 Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 7 Apr 2020 18:55:58 -0700 Subject: [PATCH 03/47] Addl changes for metadig properties file cleanup --- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 132 ++++++++++-------- 1 file changed, 77 insertions(+), 55 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index 49a65415..ac8cfbd1 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -20,7 +20,6 @@ import org.apache.solr.client.solrj.beans.BindingException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; -import org.dataone.client.auth.AuthTokenSession; import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; @@ -35,7 +34,6 @@ import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.ISODateTimeFormat; -import org.quartz.JobExecutionException; import org.w3c.dom.Document; import org.xml.sax.InputSource; @@ -78,9 +76,9 @@ public class Scorer { private static String CNauthToken = null; private static String CNsubjectId = null; private static String CNserviceUrl = null; + private static String CNnodeId="urn:node:CN"; private static SolrClient client = null; private static String solrLocation = null; - private static String filestoreBase = null; private static final String SOLR_COLLECTION = "quality"; private static long startTimeProcessing; @@ -127,7 +125,6 @@ public static void main(String[] argv) throws Exception { RabbitMQhost = cfg.getString("RabbitMQ.host"); RabbitMQport = cfg.getInt("RabbitMQ.port"); solrLocation = cfg.getString("solr.location"); - filestoreBase = cfg.getString("metadig.store.directory"); CNauthToken = cfg.getString("CN.authToken"); CNserviceUrl = cfg.getString("CN.serviceUrl"); CNsubjectId = cfg.getString("CN.subjectId"); @@ -161,6 +158,7 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp MetadigException metadigException = null; String subjectId = null; String authToken = null; + String nodeServiceUrl = null; String label = null; String title = null; @@ -178,21 +176,13 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp } // The components of the graph queue request - String collectionId = qEntry.getProjectId(); - //String projectName = qEntry.getProjectName(); - //String authTokenName = qEntry.getAuthTokenName(); - //String subjectIdName = qEntry.getSubjectIdName(); + String collectionId = qEntry.getCollectionId(); // Select quality scores based on the nodeId String nodeId = qEntry.getNodeId(); - //String serviceUrl = qEntry.getServiceUrl(); String formatFamily = 
qEntry.getFormatFamily(); String suiteId = qEntry.getQualitySuiteId(); - String serviceUrl = null; - Scorer scorer = new Scorer(); long difference; - log.debug("read score query entry"); - if(formatFamily == null) { formatFamily = ""; } @@ -208,16 +198,26 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp } log.debug("collectionId: " + collectionId); + // A nodeId is not specified, then the CN will be used if(nodeId == null) { - nodeId = ""; + nodeId=CNnodeId; } + log.debug("nodeId: " + nodeId); label: try { MDQconfig cfg = new MDQconfig(); // Pids associated with a collection, based on query results using 'collectionQuery' field in solr. ArrayList collectionPids = null; - //String title = "Project " + projectName; + // The harvesting and evaluation of the collectionQuery is based on the nodeId that is passed in, i.e. + // If an MN is specified, then the collection (portal) Solr entry will be obtained from the MN, and the + // collectionQuery string will also be evaluated on that node. + String nodeAbbr = nodeId.replace("urn:node:", ""); + authToken = cfg.getString(nodeAbbr + ".authToken"); + subjectId = cfg.getString(nodeAbbr + ".subjectId"); + // TODO: Cache the node values from the CN listNode service + nodeServiceUrl = cfg.getString(nodeAbbr + ".serviceUrl"); + HashMap variables = new HashMap<>(); // Create the graph. // Two types of graphs are currently supported: @@ -228,16 +228,19 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp //Scorer gfr = new Scorer(); // If creating a graph for a collection, get the set of pids associated with the collection. // Only scores for these pids will be included in the graph. + if (collectionId != null && !collectionId.isEmpty()) { - // The collection query is evaluated on the CN - authToken = CNauthToken; - subjectId = CNsubjectId; - serviceUrl = CNserviceUrl; - log.info("* Getting pids for collection " + collectionId); + // If the nodeId is specified, use if to determine the values for authTokenName and subjectIdName, + // if those values are not defined + log.debug("collectionId is not null: " + collectionId); + String id = nodeId.replace("urn:node:", "").toUpperCase().trim(); + + // The collection query is obtained from the MN and evaluated on the CN + log.info("Getting pids for collection " + collectionId); // Always use the CN subject id and authentication token from the configuration file, as // requests that this method uses need CN subject privs ScorerResult result = null; - result = gfr.getCollectionPids(collectionId, nodeId, serviceUrl, subjectId, authToken); + result = gfr.getCollectionPids(collectionId, nodeServiceUrl, subjectId, authToken); collectionPids = result.getResult(); label = result.getLabel(); // Don't continue if no pids (and thus scores) were found for this collection @@ -250,9 +253,10 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp } } + log.debug("Getting quality scores..."); // Quality scores will now be obtained from the MetaDIG quality Solr index, using the list of pids obtained // for the collection. 
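Aside: getQualityScores() reads these scores back from the engine's own Solr index (the "quality" collection named by SOLR_COLLECTION, at the configured solr.location). A minimal, self-contained sketch of that style of lookup with SolrJ follows, before the call below; the Solr URL and suite id are placeholder assumptions, and the filter query stands in for the suiteId term that the method folds into its query string.

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;

public class QualityScoreQueryExample {
    public static void main(String[] args) throws Exception {
        // Placeholder location; the engine reads this from the 'solr.location' property
        SolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr").build();
        SolrQuery query = new SolrQuery("metadataId:*");
        // Placeholder suite id; equivalent to the 'AND suiteId:"..."' term built by getQualityScores()
        query.addFilterQuery("suiteId:\"FAIR.suite.1\"");
        query.setStart(0);
        query.setRows(1000);
        QueryResponse response = client.query("quality", query);
        for (SolrDocument doc : response.getResults()) {
            System.out.println(doc.getFieldValue("metadataId"));
        }
        client.close();
    }
}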
- List scores = gfr.getQualityScores(collectionId, suiteId, nodeId, formatFamily, collectionPids); + List scores = gfr.getQualityScores(collectionId, suiteId, formatFamily, collectionPids); // Don't continue if no quality scores were found for this collection if(scores.size() == 0) { @@ -279,13 +283,12 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp //String filePath = graph.create(GraphType.CUMULATIVE, title, scoreFile.getPath()); String filePath = graph.create(GraphType.MONTHLY, title, scoreFile.getPath()); // Now save the graphics file to permanent storage - //String outfile = projectName + "-" + suiteId + ".png"; String outfile; DateTime createDateTime = DateTime.now(); mdFile.setCreationDatetime(createDateTime); - mdFile.setCollectionId(collectionId); + mdFile.setPid(collectionId); mdFile.setSuiteId(suiteId); mdFile.setNodeId(nodeId); mdFile.setStorageType(StorageType.GRAPH.toString()); @@ -301,7 +304,7 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp // for fileid, storagetype, extension mdFile = new MetadigFile(); mdFile.setCreationDatetime(createDateTime); - mdFile.setCollectionId(collectionId); + mdFile.setPid(collectionId); mdFile.setSuiteId(suiteId); mdFile.setNodeId(nodeId); mdFile.setStorageType(StorageType.DATA.toString()); @@ -344,12 +347,16 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp *
 * First the 'collectionQuery' field is retrieved from DataONE Solr for the collection
 * <p>
 * Next, a query is issued with the query from collectionQuery field, to retrieve all Solr docs for the collection ids.</p>
 *
+ * <p>
+ * Note that in the current design, the collection query is always obtained by querying the node specified in the taskList.csv file,
+ * which is usually an MN, but the collectionQuery is always evaluated on the CN</p>
+ * * @param collectionId a DataONE project id to fetch scores for, e.g. urn:uuid:f137095e-4266-4474-aa5f-1e1fcaa5e2dc - * @param nodeId a DataONE node identifier, e.g. "urn:node:KNB" - * @param + * @param serviceUrl the DataONE service URL to obtain the collectionQuery string from + * @param subjectId the DataONE subjectId to use for the query, associated with the authentication token + * @param authToken the DataONE authentication token * @return a List of quality scores fetched from Solr */ - private ScorerResult getCollectionPids(String collectionId, String nodeId, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { + private ScorerResult getCollectionPids(String collectionId, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { Document xmldoc = null; String queryStr = null; @@ -362,12 +369,14 @@ private ScorerResult getCollectionPids(String collectionId, String nodeId, Strin which will be used to query DataONE Solr for all the pids associated with that project (that's 2 queries!) */ ArrayList pids = new ArrayList<>(); - queryStr = "?q=id:" + escapeSpecialChars(collectionId) + "&fl=collectionQuery,label&q.op=AND"; + queryStr = "?q=id:" + escapeSpecialChars(collectionId) + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; startPos = 0; countRequested = 10000; + // Get the collectionQuery from Solr try { + log.debug("Getting collectionQuery with query: " + queryStr); xmldoc = queryD1Solr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken); } catch (MetadigProcessException mpe) { log.error("Unable to query Solr for collectionQuery field for collection id: " + collectionId); @@ -385,13 +394,13 @@ which will be used to query DataONE Solr for all the pids associated with that p XPath xpath = null; org.w3c.dom.Node node = null; String label = null; + String rightsHolder = null; try { log.debug("Getting collectionQuery for id: " + collectionId); // Extract the collection query from the Solr result XML XPathFactory xPathfactory = XPathFactory.newInstance(); xpath = xPathfactory.newXPath(); - // TODO: replace this test query with the live one fieldXpath = xpath.compile("//result/doc/str[@name='collectionQuery']/text()"); // extract the 'collectionQuery' field from the Solr result @@ -400,7 +409,6 @@ which will be used to query DataONE Solr for all the pids associated with that p log.debug("collectionQuery not found for collection id: " + collectionId); ScorerResult result = new ScorerResult(); result.setResult(pids); - result.setLabel(""); return result; } else { node = xpathResult.item(0); @@ -414,14 +422,13 @@ which will be used to query DataONE Solr for all the pids associated with that p log.debug("got collectionQuery: " + collectionQuery); } - // Extract the portal 'label' (title) + // Extract the portal 'label' fieldXpath = xpath.compile("//result/doc/str[@name='label']/text()"); xpathResult = (org.w3c.dom.NodeList) fieldXpath.evaluate(xmldoc, XPathConstants.NODESET); if(xpathResult.getLength() == 0) { - log.debug("label not found for collection id: " + collectionId); + log.debug("Title (label) not found for collection id: " + collectionId); ScorerResult result = new ScorerResult(); result.setResult(pids); - result.setLabel(""); return result; } else { node = xpathResult.item(0); @@ -457,7 +464,7 @@ which will be used to query DataONE Solr for all the pids associated with that p collectionQuery = collectionQuery.replaceAll("\\s*AND\\s*\\(-obsoletedBy:\\*\\s*AND\\s*formatType:METADATA\\)", ""); 
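Aside: the replaceAll() above is easiest to see on a concrete value. A standalone illustration (the sample collectionQuery string is invented; the pattern is the one used above, which drops the clause that would otherwise exclude obsoleted revisions from the score harvest):

public class CollectionQueryEditExample {
    public static void main(String[] args) {
        String collectionQuery =
                "(isPartOf:\"urn:uuid:f137095e-4266-4474-aa5f-1e1fcaa5e2dc\") AND (-obsoletedBy:* AND formatType:METADATA)";
        // Same pattern as above: remove the obsoletedBy/formatType restriction
        String edited = collectionQuery.replaceAll(
                "\\s*AND\\s*\\(-obsoletedBy:\\*\\s*AND\\s*formatType:METADATA\\)", "");
        System.out.println(edited);
        // Prints: (isPartOf:"urn:uuid:f137095e-4266-4474-aa5f-1e1fcaa5e2dc")
    }
}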
log.debug("Edited collectionQuery: " + collectionQuery); - // Get account information for the collection owner. The account info will be used when the 'collectionQuery' + // Get account information for the collection rightsHolder (owner). The account info will be used when the 'collectionQuery' // query is made, which will use the owner's identity and group memberships, so that the pids that are returned // from the query are the ones that the user would see when viewing their portal page. // First get the sysmeta from the collection pid, in order to determine the owner. Next, get the account info @@ -465,19 +472,21 @@ which will be used to query DataONE Solr for all the pids associated with that p // "(readPermission:"http://orcid.org/0000-0002-2192-403X") // OR (rightsHolder:"http://orcid.org/0000-0002-2192-403X")" - Subject rightsHolder = sysmeta.getRightsHolder(); + // Use the rightsHolder obtained from the Solr query + Subject subject = new Subject(); + subject.setValue(rightsHolder); // The subject info can only be obtained from a CN, so use the CN auth info for the current DataONE environment, // which should be configured in the metadig.properties file - SubjectInfo subjectInfo = getSubjectInfo(rightsHolder, CNserviceUrl, CNsubjectId, CNauthToken); + SubjectInfo subjectInfo = getSubjectInfo(subject, CNserviceUrl, CNsubjectId, CNauthToken); String groupStr = null; - groupStr = "(readPermission:" + "\"" + rightsHolder.getValue() - + "\")" + " OR (rightsHolder:\"" + rightsHolder.getValue() + "\"" + ")" + groupStr = "(readPermission:" + "\"" + rightsHolder + + "\")" + " OR (rightsHolder:\"" + rightsHolder + "\"" + ")" + " OR (readPermission:\"public\")"; - // Assemble the + // Assemble the query string that selects pids based on permissions from the rightsHolder for(Group group : subjectInfo.getGroupList()) { - log.debug("Adding group to query: " + group.getSubject().getValue()); + log.trace("Adding group to query: " + group.getSubject().getValue()); if(groupStr == null) { groupStr = "(readPermission:" + "\"" + group.getSubject().getValue() + "\")" + " OR (rightsHolder:\"" + group.getSubject().getValue() + "\"" + ")"; @@ -490,6 +499,9 @@ which will be used to query DataONE Solr for all the pids associated with that p //groupStr = "+AND+" + "(" + groupStr + ")"; //groupStr = "&fq=" + encodeValue("rightsHolder:\"CN=PASTA-GMN,O=LTER,ST=New Mexico,C=US\""); groupStr = "&fq=" + encodeValue(groupStr); + log.trace("groupStr: " + groupStr); + + // Now evaluate the collectionQuery // Send the collectionQuery string to Solr to get the pids associated with the collection // The 'collectionQuery' Solr field may have backslashes that are used to escape special characters (i.e. ":") that are not @@ -500,7 +512,8 @@ which will be used to query DataONE Solr for all the pids associated with that p int resultCount = 0; startPos = 0; countRequested = 1000; - // Now get the pids associated with the collection + // Now get the pids associated with the collection by sending the collectionQuery to the DataONE CN + // The collectionQuery is always evaluated on the CN, as portals should have all DataONE data available to them. // One query can return many documents, so use the paging mechanism to make sure we retrieve them all. // Keep paging through query results until all pids have been fetched. The last 'page' of query // results is indicated by the number of items returned being less than the number requested. 
@@ -517,11 +530,23 @@ which will be used to query DataONE Solr for all the pids associated with that p
         // Loop through the Solr result. As the result may be large, page through the results, accumulating
         // the pids returned
-        log.debug("query string: " + queryStr);
-        log.debug("Sending collectionQuery to Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl);
+        // Determine where the collectionQuery should be evaluated. When the DataONE quota service is ready, query it
+        // for this collection to determine if the collectionQuery should be sent to the CN. Since this service is
+        // not ready, send the query to the same serviceUrl, subjectId, authToken which was used to harvest the
+        // collection document and obtain the collectionQuery string
+
+        // When the service is available, use the DataONE quota service to set these variables conditionally
+        String evalServiceUrl = serviceUrl;
+        String evalSubjectId = subjectId;
+        String evalAuthToken = authToken;
+
+        log.debug("Sending collectionQuery to Solr using subjectId: " + evalSubjectId + ", serviceUrl: " + evalServiceUrl);
+        log.trace("query string: " + queryStr);
+
         do {
             //TODO: check that a result was returned
-            xmldoc = queryD1Solr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken);
+            // Note: the collectionQuery is always evaluated on the CN, so that the entire DataONE network is queried.
+            xmldoc = queryD1Solr(queryStr, evalServiceUrl, startPos, countRequested, evalSubjectId, evalAuthToken);
             if(xmldoc == null) {
                 log.info("no values returned from query");
                 break;
             }
@@ -560,12 +585,11 @@ which will be used to query DataONE Solr for all the pids associated with that p
      *
      * @param collectionId a DataONE project id to fetch scores for, e.g. urn:uuid:f137095e-4266-4474-aa5f-1e1fcaa5e2dc
      * @param suiteId a MetaDIG quality suite id, e.g. "FAIR.suite.1"
-     * @param nodeId a DataONE node identifier, e.g. "urn:node:KNB"
      * @param formatFamily list of MetaDIG metadata format "families", e.g. "iso19115,eml"
-     * @param
+     * @param collectionPids the list of pids to get scores for
      * @return a List of quality scores fetched from Solr
      */
-    private List<QualityScore> getQualityScores(String collectionId, String suiteId, String nodeId, String formatFamily, ArrayList<String> collectionPids) throws Exception {
+    private List<QualityScore> getQualityScores(String collectionId, String suiteId, String formatFamily, ArrayList<String> collectionPids) throws Exception {
        // Now that we have all the pids, query the Quality Solr server for the scores for each pid associate with the project.
        // These scores will be written out to a file that will be used by the graphing routine to create a plot of the aggregated statistics.
// If a project wasn't specified, then we are not building a special query for a list of pids, so try to get the max amount @@ -596,7 +620,7 @@ private List getQualityScores(String collectionId, String suiteId, } formatFamilySearchTerm = "(" + formatFamilySearchTerm + ")"; } - log.debug("FormatFamily query term: " + formatFamilySearchTerm); + log.trace("FormatFamily query term: " + formatFamilySearchTerm); } int startPosInResult = 0; @@ -646,16 +670,14 @@ private List getQualityScores(String collectionId, String suiteId, pidsLeft -= pidCntToRequest; } while (pidsLeft > 0); } else { - log.info("Getting quality scores for suiteId: " + suiteId + ", datasource: " + nodeId + " formats: " + formatFamily); + log.info("Getting quality scores for suiteId: " + suiteId + ", datasource: " + " formats: " + formatFamily); countRequested = 1000; formatFamilySearchTerm = null; queryStr = "metadataId:*"; if(suiteId != null) { queryStr += " AND suiteId:" + "\"" + suiteId + "\""; } - if(nodeId != null) { - queryStr += " AND datasource:" + "\"" + nodeId + "\""; - } + if (formatFamilySearchTerm != null) { queryStr += " AND metadataFormatId:" + "\"" + formatFamilySearchTerm + "\""; } @@ -756,7 +778,7 @@ private void returnGraphStatus(String metadataPid, String suiteId, ScorerQueueEn log.info(" [x] Done"); this.writeCompletedQueue(message); - log.info(" [x] Sent completed report for project id: '" + qEntry.getProjectId() + "'"); + log.info(" [x] Sent completed report for project id: '" + qEntry.getCollectionId() + "'"); } catch (Exception e) { // If we couldn't prepare the message, then there is nothing left to do log.error(" Unable to return report to controller"); @@ -1004,7 +1026,7 @@ protected SystemMetadata getSystemMetadata(String pid, String serviceUrl, String */ private SubjectInfo getSubjectInfo(Subject rightsHolder, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { - log.debug("Getting subject info for: " + rightsHolder); + log.debug("Getting subject info for: " + rightsHolder.getValue()); MultipartCNode cnNode = null; MetadigProcessException metadigException = null; From edf706851728c335aa0de406c7e7686fdef614e3 Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 7 Apr 2020 18:56:46 -0700 Subject: [PATCH 04/47] Add add'l debug statements to show graph creation options --- src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java index 9150df8c..3a95fd2d 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java @@ -102,6 +102,7 @@ public String create(GraphType type, String title, String inputFile) throws Exce case CUMULATIVE: mdFile.setMediaType("text/x-rsrc"); mdFile.setAltFilename("graph_" + GraphType.CUMULATIVE.toString().toLowerCase() + "_quality_scores.R"); + log.debug("Creating a " + GraphType.CUMULATIVE.toString().toLowerCase() + " graph with " + mdFile.getAltFilename()); codeFile = fileStore.getFile(mdFile); dispatcherType = "r"; @@ -109,12 +110,15 @@ public String create(GraphType type, String title, String inputFile) throws Exce case MONTHLY: mdFile.setMediaType("text/x-rsrc"); mdFile.setAltFilename("graph_" + GraphType.MONTHLY.toString().toLowerCase() + "_quality_scores.R"); + log.debug("Creating a " + GraphType.MONTHLY.toString().toLowerCase() + " graph with " + mdFile.getAltFilename()); codeFile = 
fileStore.getFile(mdFile); dispatcherType = "r"; break; } + log.debug("Graph program length: " + codeFile.length()); + // The the graph program the title of the graph // Currently we aren't putting titles on the graphs //variables.put("title", title); @@ -130,7 +134,7 @@ public String create(GraphType type, String title, String inputFile) throws Exce Result result = null; try { - log.debug("dispatching graph program "); + log.debug("dispatching graph program " + codeFile.toPath()); result = dispatcher.dispatch(variables, code); } catch (ScriptException e) { log.error("Error executing script"); From e391d9e30c4fee47f95c070dc3d4dcdd1fdd8bf7 Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 7 Apr 2020 18:59:52 -0700 Subject: [PATCH 05/47] Update assessment graph R script to use most recent scores per month --- .../resources/code/graph_monthly_quality_scores.R | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/main/resources/code/graph_monthly_quality_scores.R b/src/main/resources/code/graph_monthly_quality_scores.R index 9d31250b..61406d93 100644 --- a/src/main/resources/code/graph_monthly_quality_scores.R +++ b/src/main/resources/code/graph_monthly_quality_scores.R @@ -37,17 +37,12 @@ scores <- mutate(fsr, ym = as.Date(sprintf("%4s-%02d-01", year(dateUploaded), mo mutate(scoreI = scoreInteroperable * 100.0) %>% mutate(scoreR = scoreReusable * 100.0) -# Use this when sequenceId problem has been resolved (github metadig-engine #232) -#most_recent <- scores %>% -# arrange(ym, sequenceId, dateUploaded) %>% -# group_by(ym, sequenceId) %>% -# top_n(1, dateUploaded) -#head(most_recent) - most_recent <- scores %>% - arrange(ym) %>% - group_by(ym) -head(most_recent) + arrange(ym, sequenceId, dateUploaded) %>% + group_by(ym, sequenceId) %>% + top_n(1, dateUploaded) + +#head(most_recent) # calculate cummulative overall score_cumulative <- most_recent %>% From 235f86609184fe9f61d1bf02494af5e3b18cff55 Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 21 Apr 2020 13:33:48 -0700 Subject: [PATCH 06/47] Upgrade RabbitMQ --- Kubernetes/Admin/metadig-engine/rabbitmq.yaml | 14 +++++++------- pom.xml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Kubernetes/Admin/metadig-engine/rabbitmq.yaml b/Kubernetes/Admin/metadig-engine/rabbitmq.yaml index a0a609d5..050699b0 100644 --- a/Kubernetes/Admin/metadig-engine/rabbitmq.yaml +++ b/Kubernetes/Admin/metadig-engine/rabbitmq.yaml @@ -1,4 +1,4 @@ -apiVersion: apps/v1beta1 +apiVersion: apps/v1 kind: Deployment metadata: name: rabbitmq @@ -13,16 +13,17 @@ spec: labels: app: rabbitmq spec: - serviceAccountName: metadig-serviceaccount containers: - name: rabbitmq - image: rabbitmq:3.7 + # 3-management loads the RabbitMQ Administrative plugin with the + # most recent 3.x release. 
+ image: rabbitmq:3-management ports: - - name: rabbitmq + - name: rabbitmq containerPort: 5672 - name: rabbitmqadmin containerPort: 15672 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always restartPolicy: Always #volumes: #- name: metadig-ctl-claim0 @@ -47,5 +48,4 @@ spec: - name: rabbitmqadmin port: 15672 protocol: TCP - targetPort: rabbitmqadmin - + targetPort: rabbitmqadmin \ No newline at end of file diff --git a/pom.xml b/pom.xml index 8259da96..02139762 100644 --- a/pom.xml +++ b/pom.xml @@ -120,7 +120,7 @@ com.rabbitmq amqp-client - 5.7.3 + 5.9.0 From c41421b387bf82ca804d5934637c516540515078 Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 21 Apr 2020 13:51:36 -0700 Subject: [PATCH 07/47] Add'l simplification of metadig config file --- .changed.txt.swp | Bin 0 -> 12288 bytes .../edu/ucsb/nceas/mdqengine/Controller.java | 49 +++++------------- 2 files changed, 14 insertions(+), 35 deletions(-) create mode 100644 .changed.txt.swp diff --git a/.changed.txt.swp b/.changed.txt.swp new file mode 100644 index 0000000000000000000000000000000000000000..acddfdb187b81286f488f6f674b55eafaa058797 GIT binary patch literal 12288 zcmeI&Jx>BL7zgmez~)PIazmZ3g=1kgguww8f}=^ftH+spfwo6TO!TAp6@o zIQjv6PEHU9B*vI%{!N}VZJ)mVw##);J2~!Do4n4bY%>f}%^2 z=Ts;jHW|C8K!E@RAOHafKmY;|fB*y_009X6qJYt!h-H`vtxTF}PZ?pxk;q1YmcDXq z>P%ej@*1aJHcvjMSL;IgnaDgdSag4X4Uu?1w2YEQ#Qu2pB3>9T)k=%rTgCHZFSc2_ b1RY>`W-CpsPNPWa*|bp_^JyHhk4wD*cr2gS literal 0 HcmV?d00001 diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java index 53ebf9d5..038fb74a 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java @@ -133,25 +133,20 @@ read from the port number (argv[0]) which will are the metadata and } String delims = "[,]"; String[] tokens = request.split(delims); + String nodeId = null; switch(requestType) { case "score": log.debug("Processing score request"); String collectionId = tokens[0]; - String projectName = tokens[1]; - String authTokenName = tokens[2]; - String subjectIdName = tokens[3]; - String memberNode = tokens[4]; - String serviceUrl = tokens[5]; - String formatFamily = tokens[6]; - String qualitySuiteId = tokens[7]; + nodeId = tokens[1]; + String formatFamily = tokens[2]; + String qualitySuiteId = tokens[3]; requestDateTime = new DateTime(); - log.info("Request queuing of: " + tokens[0] + ", " + tokens[1] + ", " + tokens[2] + ", " + tokens[3] + ", " + tokens[4] - + ", " + tokens[5] + "," + tokens[6]); + log.info("Request queuing of: " + tokens[0] + ", " + tokens[1] + ", " + tokens[2] + ", " + tokens[3]); - metadigCtrl.processScorerRequest(collectionId, projectName, authTokenName, subjectIdName, memberNode, serviceUrl, - formatFamily, qualitySuiteId, requestDateTime); + metadigCtrl.processScorerRequest(collectionId, nodeId, formatFamily, qualitySuiteId, requestDateTime); break; case "quality": log.debug("Processing quality request"); @@ -165,7 +160,7 @@ read from the port number (argv[0]) which will are the metadata and String suiteId = tokens[3]; requestDateTime = new DateTime(); - String nodeId = tokens[4]; + nodeId = tokens[4]; log.info("Request queuing of: " + tokens[0] + ", " + tokens[3] + ", " + tokens[4]); metadigCtrl.processQualityRequest(nodeId, metadataPid, metadata, suiteId, "/tmp", requestDateTime, sysmeta); break; @@ -375,8 +370,7 @@ public void processQualityRequest(String memberNode, *

* * @param collectionId - * @param projectName - * @param memberNode + * @param nodeId * @param formatFamily * @param qualitySuiteId * @param requestDateTime @@ -385,11 +379,7 @@ public void processQualityRequest(String memberNode, * @throws java.io.IOException */ public void processScorerRequest(String collectionId, - String projectName, - String authTokenName, - String subjectIdName, - String memberNode, - String serviceUrl, + String nodeId, String formatFamily, String qualitySuiteId, DateTime requestDateTime) throws java.io.IOException, MetadigException { @@ -399,18 +389,7 @@ public void processScorerRequest(String collectionId, byte[] message = null; String authToken = null; - if(authTokenName != null) { - try { - authToken = readConfigParam(authTokenName); - } catch (ConfigurationException ce) { - log.error("Error reading configuration for param " + "\"" + authTokenName + "\"" + ": " + ce.getMessage()); - MetadigException metadigException = new MetadigProcessException("Error reading configuration for param " + authTokenName + ": " + ce.getMessage()); - metadigException.initCause(ce); - throw metadigException; - } - } - - qEntry = new ScorerQueueEntry(collectionId, projectName, authTokenName, subjectIdName, qualitySuiteId, memberNode, serviceUrl, formatFamily, requestDateTime); + qEntry = new ScorerQueueEntry(collectionId, qualitySuiteId, nodeId, formatFamily, requestDateTime); ByteArrayOutputStream bos = new ByteArrayOutputStream(); ObjectOutput out = new ObjectOutputStream(bos); @@ -418,7 +397,7 @@ public void processScorerRequest(String collectionId, message = bos.toByteArray(); this.writeInProcessChannel(message, SCORER_ROUTING_KEY); - log.info(" [x] Queued Scorer request for collectionld: '" + qEntry.getProjectId() + "'" + " quality suite " + qualitySuiteId); + log.info(" [x] Queued Scorer request for collectionld: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); } /** @@ -536,16 +515,16 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp completedChannel.basicAck(envelope.getDeliveryTag(), false); } - log.info(" [x] Controller received notification of completed score for: '" + qEntry.getProjectId() + "'" + ", " + + log.info(" [x] Controller received notification of completed score for: '" + qEntry.getCollectionId() + "'" + ", " + "hostsname: " + qEntry.getHostname()); - log.info("Total processing time for worker " + qEntry.getHostname() + " for PID " + qEntry.getProjectId() + ": " + qEntry.getProcessingElapsedTimeSeconds()); + log.info("Total processing time for worker " + qEntry.getHostname() + " for PID " + qEntry.getCollectionId() + ": " + qEntry.getProcessingElapsedTimeSeconds()); /* An exception caught by the worker will be passed back to the controller via the queue entry * 'exception' field. Check this now and take the appropriate action. 
*/ Exception me = qEntry.getException(); if (me instanceof MetadigException) { - log.error("Error running suite: " + qEntry.getQualitySuiteId() + ", pid: " + qEntry.getProjectId() + ", error msg: "); + log.error("Error running suite: " + qEntry.getQualitySuiteId() + ", pid: " + qEntry.getCollectionId() + ", error msg: "); log.error("\t" + me.getMessage()); Throwable thisCause = me.getCause(); if (thisCause != null) { From 654072ca9bf5ddee9a1c95b8a4cf532f466abc97 Mon Sep 17 00:00:00 2001 From: gothub Date: Wed, 13 May 2020 09:24:20 -0700 Subject: [PATCH 08/47] Update RequestReportJob.java --- .../mdqengine/scheduler/RequestReportJob.java | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index c46fb9d5..9f5d8a6d 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -56,7 +56,13 @@ public class RequestReportJob implements Job { private Log log = LogFactory.getLog(RequestReportJob.class); class ListResult { - Integer resultCount; + // The total result count returned from DataONE + Integer totalResultCount; + // The filtered result count returned from DataONE. + // The DataONE listObjects service returns all new pids for all formatIds + // but we are typically only interested in a subset of those, i.e. EML metadata pids, + // so this is the count of pids from the result that we are actually interested in. + Integer filteredResultCount; ArrayList result = new ArrayList<>(); void setResult(ArrayList result) { @@ -67,12 +73,19 @@ ArrayList getResult() { return this.result; } - void setResultCount(Integer count) { - this.resultCount = count; + void setTotalResultCount(Integer count) { + this.totalResultCount = count; + } + void setFilteredResultCount(Integer count) { + this.filteredResultCount = count; + } + + Integer getTotalResultCount() { + return this.totalResultCount; } - Integer getResultCount() { - return this.resultCount; + Integer getFilteredResultCount() { + return this.filteredResultCount; } } @@ -247,7 +260,8 @@ public void execute(JobExecutionContext context) Integer startCount = new Integer(0); ListResult result = null; - Integer resultCount = null; + Integer totalResultCount = null; + Integer filteredResultCount = null; boolean morePids = true; while(morePids) { @@ -257,14 +271,15 @@ public void execute(JobExecutionContext context) try { result = getPidsToProcess(cnNode, mnNode, isCN, session, suiteId, nodeId, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); pidsToProcess = result.getResult(); - resultCount = result.getResultCount(); + totalResultCount = result.getTotalResultCount(); + filteredResultCount = result.getFilteredResultCount(); } catch (Exception e) { JobExecutionException jee = new JobExecutionException("Unable to get pids to process", e); jee.setRefireImmediately(false); throw jee; } - log.info("Found " + resultCount + " pids" + " for node: " + nodeId); + log.info("Found " + filteredResultCount + " pids" + " for node: " + nodeId); for (String pidStr : pidsToProcess) { try { log.info("submitting pid: " + pidStr); @@ -295,10 +310,10 @@ public void execute(JobExecutionContext context) } // Check if DataONE returned the max number of results. If so, we have to request more by paging through - // the results. 
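Aside: the offset arithmetic changed in the hunk below is subtle: listObjects() returns objects of every formatId, the job keeps only the pids matching pidFilter, but the page offset must advance by the total count or later pages would re-read objects already seen. A toy, self-contained illustration (all names invented):

import java.util.ArrayList;
import java.util.List;

public class ListObjectsPagingExample {
    // Stand-in for one listObjects() page over ids start..start+count-1;
    // every third object is "metadata" and the rest get dropped by the filter.
    static List<String> page(int start, int count, int total) {
        List<String> batch = new ArrayList<>();
        for (int i = start; i < Math.min(start + count, total); i++) {
            batch.add((i % 3 == 0 ? "eml-" : "data-") + i);
        }
        return batch;
    }

    public static void main(String[] args) {
        int start = 0, requested = 1000, total = 2500;
        List<String> wanted = new ArrayList<>();
        boolean more = true;
        while (more) {
            List<String> batch = page(start, requested, total);
            int totalResultCount = batch.size();          // everything the node returned
            batch.removeIf(id -> !id.startsWith("eml-")); // keep only the filtered pids
            wanted.addAll(batch);                         // filteredResultCount = batch.size()
            if (totalResultCount >= requested) {
                start += totalResultCount;                // advance by the total, not the filtered count
            } else {
                more = false;                             // a short page is the last page
            }
        }
        System.out.println(wanted.size()); // 834 of the 2500 objects pass the filter
    }
}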
- if(resultCount >= countRequested) { + // the results returned pidsToProcess (i.e. DataONE listObjects service). + if(totalResultCount >= countRequested) { morePids = true; - startCount = startCount + resultCount; + startCount = startCount + totalResultCount; log.info("Paging through more results, current start is " + startCount); } else { morePids = false; @@ -355,6 +370,7 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, for(ObjectInfo oi: objList.getObjectInfoList()) { thisFormatId = oi.getFormatId().getValue(); thisPid = oi.getIdentifier().getValue(); + log.debug("Checking pid: " + thisPid + ", format: " + thisFormatId); // Check all pid filters. There could be multiple wildcard filters, which are separated // by ','. @@ -381,7 +397,11 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, } ListResult result = new ListResult(); - result.setResultCount(pidCount); + // Set the count for the number of desired pids filtered from the total result set + result.setFilteredResultCount(pidCount); + // Set the count for the total number of pids returned from DataONE (all formatIds) for this query + // Set the count for the total number of pids returned from DataONE (all formatIds) for this query + result.setTotalResultCount(objList.getCount()); result.setResult(pids); return result; From c8ff28791cc57568ac25ba1e1b3786fca51339f3 Mon Sep 17 00:00:00 2001 From: gothub Date: Mon, 13 Jul 2020 15:36:48 -0700 Subject: [PATCH 09/47] Add DataONE bookkeeper call to check portal status --- .../edu/ucsb/nceas/mdqengine/Controller.java | 79 ++++++-- .../authentication/BookkeeperClient.java | 171 ++++++++++++++++++ .../configuration/metadig.properties | 3 + .../bookkeeper/BookkeeperClientTest.java | 35 ++++ 4 files changed, 275 insertions(+), 13 deletions(-) create mode 100644 src/main/java/edu/ucsb/nceas/mdqengine/authentication/BookkeeperClient.java create mode 100644 src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java index 038fb74a..4784fb37 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java @@ -1,12 +1,13 @@ package edu.ucsb.nceas.mdqengine; import com.rabbitmq.client.*; -import edu.ucsb.nceas.mdqengine.exception.MetadigProcessException; +import edu.ucsb.nceas.mdqengine.authentication.BookkeeperClient; import edu.ucsb.nceas.mdqengine.scorer.ScorerQueueEntry; import edu.ucsb.nceas.mdqengine.exception.MetadigException; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.dataone.bookkeeper.api.Usage; import org.dataone.exceptions.MarshallingException; import org.dataone.service.types.v2.SystemMetadata; import org.dataone.service.types.v2.TypeFactory; @@ -17,6 +18,8 @@ import java.lang.reflect.InvocationTargetException; import java.net.ServerSocket; import java.net.Socket; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -271,6 +274,52 @@ public void disableTestMode() { this.totalElapsedSeconds = 0; } + /** + * Query DataONE bookkeeper service to determine if a portal is active + * + *
+     * Before generating a metadata assessment graph for a portal, check
+     * if the portal is active. A portal can be marked as inactive by
+     * the portal owner, or by the bookkeeper admin if usage fees are
+     * delinquent.
+     *
+ * @param collectionId The DataONE collection identifier + * @return + * @throws MetadigException + */ + // Check the portal quota with DataONE bookkeaper + public Boolean isPortalActive(String collectionId) throws MetadigException { + // Check the portal quota with DataONE bookkeeper + log.debug("Checking bookkeeper portal Usage for collection: " + collectionId); + String msg = null; + BookkeeperClient bkClient = BookkeeperClient.getInstance(); + List usages = null; + Usage usage = null; + List subjects = new ArrayList(); + try { + if(bkClient.getBookkeeperEnabled()) { + // Set status = null so that any usage will be returned. + String status = null; + usages = bkClient.listUsages(0, collectionId, "portal", status , subjects); + usage = usages.get(0); + log.debug("Usage for portal " + collectionId + " is " + usage.getStatus()); + if(usage.getStatus().compareToIgnoreCase("active") == 0) { + return true; + } else { + return false; + } + } else { + msg = "Metadig config param 'bookkeeper.enabled is blank or missing"; + log.error(msg); + throw(new MetadigException(msg)); + } + } catch (Exception e) { + msg = "Unable to get usage from bookkeeper for collection id: " + collectionId; + log.error(msg); + throw(new MetadigException(msg)); + } + }; + /** * Forward a request to the "InProcess" queue. *

@@ -369,11 +418,11 @@ public void processQualityRequest(String memberNode, * create the graph from them. *

* - * @param collectionId - * @param nodeId - * @param formatFamily - * @param qualitySuiteId - * @param requestDateTime + * @param collectionId the DataONE collection identifier + * @param nodeId the node identifier the collection resides on + * @param formatFamily a string representing the DataONE formats to create score for + * @param qualitySuiteId the quality suite used to create the score graph + * @param requestDateTime the datetime that the request was made * * @return * @throws java.io.IOException @@ -389,15 +438,19 @@ public void processScorerRequest(String collectionId, byte[] message = null; String authToken = null; - qEntry = new ScorerQueueEntry(collectionId, qualitySuiteId, nodeId, formatFamily, requestDateTime); + if(!isPortalActive(collectionId)) { + log.info("[x} Skipping Scorer request for inactive portal, collectionld: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); + } else { + qEntry = new ScorerQueueEntry(collectionId, qualitySuiteId, nodeId, formatFamily, requestDateTime); - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - ObjectOutput out = new ObjectOutputStream(bos); - out.writeObject(qEntry); - message = bos.toByteArray(); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ObjectOutput out = new ObjectOutputStream(bos); + out.writeObject(qEntry); + message = bos.toByteArray(); - this.writeInProcessChannel(message, SCORER_ROUTING_KEY); - log.info(" [x] Queued Scorer request for collectionld: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); + this.writeInProcessChannel(message, SCORER_ROUTING_KEY); + log.info(" [x] Queued Scorer request for collectionld: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); + } } /** diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/authentication/BookkeeperClient.java b/src/main/java/edu/ucsb/nceas/mdqengine/authentication/BookkeeperClient.java new file mode 100644 index 00000000..c055f807 --- /dev/null +++ b/src/main/java/edu/ucsb/nceas/mdqengine/authentication/BookkeeperClient.java @@ -0,0 +1,171 @@ +package edu.ucsb.nceas.mdqengine.authentication; + +import edu.ucsb.nceas.mdqengine.MDQconfig; +import edu.ucsb.nceas.mdqengine.exception.MetadigException; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.http.HttpStatus; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.dataone.bookkeeper.api.Usage; +import org.dataone.bookkeeper.api.UsageList; + +import java.io.*; +import java.io.IOException; +import java.util.List; + +public class BookkeeperClient { + + private static BookkeeperClient instance; + public static Log log = LogFactory.getLog(DataONE.class); + private String bookkeeperURL = null; + private Boolean bookkeeperEnabled = true; + private String bookkeeperAuthToken = null; + + private BookkeeperClient () { + } + + /** + * Get the singleton instance of the BookKeeplerClient class + * @return the instance of the class + */ + public static BookkeeperClient getInstance() throws MetadigException { + if 
(instance == null) { + synchronized (BookkeeperClient.class) { + if (instance == null) { + instance = new BookkeeperClient(); + instance.init(); + } + } + } + return instance; + } + + /** + * Initialize a bookkeeper client and get metadig config parameters needed for interacting with + * DataONE bookkeeper service + * + * @throws MetadigException + */ + protected void init () throws MetadigException { + // Get metadig config parameter for the bookkeeper URL + + try { + bookkeeperURL = MDQconfig.readConfigParam("bookkeeper.url"); + bookkeeperEnabled = new Boolean(MDQconfig.readConfigParam("bookkeeper.enabled")); + bookkeeperAuthToken = MDQconfig.readConfigParam("bookkeeper.authToken"); + } catch (ConfigurationException | IOException e) { + throw new MetadigException("Unable to initialize DataONE bookkeeper client: " + e.getMessage()); + } + } + + /** + * Get the value that indicates whether bookkeeper quota/usage checking is enabled. + */ + public Boolean getBookkeeperEnabled() { + return(this.bookkeeperEnabled); + } + + /** + * Retrieve a bookkeeper quota usage usage + * @param id the usage database sequence identifier + * @param instanceId the usage instance identifier + * @param quotaType the usage quota type ("portal" | "storage" | ...) + * @param status the usage status ("active" | "inactive") + * @return + * @throws MetadigException + */ + public List listUsages(int id, String instanceId, String quotaType, String status, List subjects) throws MetadigException { + // Check the portal quota with DataONE bookkeeper + String serviceURL = this.bookkeeperURL; + ObjectMapper objectMapper = new ObjectMapper(); + CloseableHttpClient httpClient = HttpClients.createDefault(); + String idStr = String.valueOf(id); + + if (id > 0) { + log.debug("Getting bookkeeper portal Usage for id: " + idStr); + serviceURL += "/usages?id=" + idStr; + } else { + log.debug("Getting bookkeeper portal Usage for quotaType, instanceId, status: " + + quotaType + ", " + + instanceId + ", " + + status); + if(status != null) { + serviceURL += "/usages?quotaType=" + quotaType + "&instanceId=" + String.valueOf(instanceId) + "&status=" + status; + } else { + serviceURL += "/usages?quotaType=" + quotaType + "&instanceId=" + String.valueOf(instanceId); + } + } + + // Is bookkeeper authentication/checking enabled? + log.debug("bookkeeper checking is enabled."); + log.debug("Using serviceURL: " + serviceURL); + HttpGet httpGet = new HttpGet(serviceURL); + + String msg = null; + // Send a request to the bookkeeper service for the quota related to this portal + try { + httpGet.addHeader("Authorization", "Bearer " + bookkeeperAuthToken); + // Ask for JSON reponse + httpGet.addHeader("Accept", "application/json"); + + log.debug("Submitting request to DataONE bookkeeper: " + serviceURL); + // send the request to bookkeeper + CloseableHttpResponse httpResponse = httpClient.execute(httpGet); + // Delete the token + + // Read the response from bookkeeper + StringBuffer response = new StringBuffer(); + int statusCode = httpResponse.getStatusLine().getStatusCode(); + + // If the HTTP request returned without an error, convert the result to a JSON string, + // then deserialize to a Java object so that we can easily inspect it. 
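Aside: a standalone sketch of the deserialization step described above. The JSON shape here is an assumption for illustration (only the two fields this client actually reads), not bookkeeper's documented schema, and the nested POJOs stand in for the bookkeeper-client Usage/UsageList beans:

import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.List;

public class UsageParseExample {
    public static class Usage {
        public String instanceId;
        public String status;
    }
    public static class UsageList {
        public List<Usage> usages;
    }

    public static void main(String[] args) throws Exception {
        // Hypothetical response body
        String json = "{\"usages\":[{\"instanceId\":\"urn:uuid:f137095e-4266-4474-aa5f-1e1fcaa5e2dc\","
                + "\"status\":\"active\"}]}";
        UsageList list = new ObjectMapper().readValue(json, UsageList.class);
        System.out.println(list.usages.get(0).status); // "active"
    }
}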
+ if(statusCode == HttpStatus.SC_OK) { + BufferedReader reader = new BufferedReader(new InputStreamReader(httpResponse.getEntity().getContent())); + String inputLine; + response = new StringBuffer(); + + while ((inputLine = reader.readLine()) != null) { + response.append(inputLine); + } + + UsageList usageList = objectMapper.readValue(response.toString(), UsageList.class); + List usages = usageList.getUsages(); + if (usages.size() == 0) { + msg = "No usages returned."; + log.error(msg); + throw(new MetadigException(msg)); + } + log.debug("Bookkeeper Usage status found for portal " + idStr + ": " + usages.get(0).getStatus()); + return(usages); + } else { + log.debug("Getting bookkeeper portal Usage for quotaType, instanceId, status: " + + quotaType + ", " + + instanceId + ", " + + status); + msg = "HTTP error status getting bookkeeper usage for id, quotaType, instanceId, status: " + idStr + ": " + + "," + quotaType + + "," + instanceId + + "," + status + + httpResponse.getStatusLine().getReasonPhrase(); + log.error(msg); + throw(new MetadigException(msg)); + } + } catch (IOException ioe) { + msg = "Error getting bookkeeper usage: " + ioe.getMessage(); + log.error(msg); + throw(new MetadigException(msg)); + } finally { + try { + httpClient.close(); + } catch (IOException e) { + log.warn("Error closing connection to bookkeeper client: " + e.getMessage()); + } + } + } +} diff --git a/src/main/resources/configuration/metadig.properties b/src/main/resources/configuration/metadig.properties index e5199e89..7a05c3e7 100644 --- a/src/main/resources/configuration/metadig.properties +++ b/src/main/resources/configuration/metadig.properties @@ -14,3 +14,6 @@ metadig.base.directory = /opt/local/metadig metadig.store.directory = /opt/local/metadig/store index.latest = false metadig.data.dir = /opt/local/metadig/data +bookkeeper.enabled = true +bookkeeper.url = https://api.dataone.org:30443/bookkeeper/v1 +bookkeeper.authToken = diff --git a/src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java b/src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java new file mode 100644 index 00000000..0c727c49 --- /dev/null +++ b/src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java @@ -0,0 +1,35 @@ +package edu.ucsb.nceas.mdqengine.bookkeeper; + +import edu.ucsb.nceas.mdqengine.authentication.BookkeeperClient; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dataone.bookkeeper.api.Usage; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.fail; + +public class BookkeeperClientTest { + private String instanceId = "urn:uuid3b6827b9-4641-40c5-bae8-ccb23159b300"; + protected Log log = LogFactory.getLog(this.getClass()); + + @Test + @Ignore + public void testGetUsage() { + log.debug("Checking bookkeeper portal Usage for collection: " + instanceId); + String msg = null; + try { + BookkeeperClient bkClient = BookkeeperClient.getInstance(); + List usages = null; + List subjects = new ArrayList<>(); + usages = bkClient.listUsages(0, instanceId, "portal", null, subjects); + assert(usages.get(0).getStatus().compareToIgnoreCase("active") == 0); + } catch (Exception e) { + msg = "Bookkeeper client test failed: " + e.getMessage(); + fail(msg); + } + } +} From e3221e6c7b904d971711090d7a50c263b0f11e8f Mon Sep 17 00:00:00 2001 From: gothub Date: Mon, 13 Jul 2020 15:44:14 -0700 Subject: [PATCH 10/47] Close db connections when no longer 
needed (#251) --- .../nceas/mdqengine/filestore/MetadigFileStore.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/MetadigFileStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/MetadigFileStore.java index a1b90abd..9dcb739a 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/MetadigFileStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/MetadigFileStore.java @@ -48,13 +48,16 @@ public File getFile(MetadigFile mdFile) throws MetadigFilestoreException { // First query the database to find a match based on the data in the MetadigFile entry. In this version of // the filestore, only one file should match. MetadigFile resultFile = null; - FilestoreDB fsdb = new FilestoreDB(); + FilestoreDB fsdb = null; try { + fsdb = new FilestoreDB(); resultFile = fsdb.getFileEntry(mdFile); } catch (MetadigFilestoreException mse) { log.error("Unable to get file: " + mse.getMessage()); throw mse; + } finally { + fsdb.shutdown(); } path = this.getFilePath(resultFile); @@ -70,7 +73,6 @@ public File getFile(MetadigFile mdFile) throws MetadigFilestoreException { throw metadigFilestoreException; } - fsdb.shutdown(); return storeFile; } @@ -139,16 +141,17 @@ public String saveFile(MetadigFile mdFile, FileInputStream fis, Boolean replace) log.debug("Wrote file to path: " + path); } catch (IOException ioe) { log.error("Error writing to path: " + path); + } finally { + fsdb.shutdown(); } - fsdb.shutdown(); return path; } public boolean deleteFile(MetadigFile mdFile) throws MetadigFilestoreException { String path = null; - FilestoreDB fsdb; + FilestoreDB fsdb = null; try { fsdb = new FilestoreDB(); @@ -156,6 +159,8 @@ public boolean deleteFile(MetadigFile mdFile) throws MetadigFilestoreException { } catch (MetadigFilestoreException mse) { log.error("Unable to connect to filestore database"); throw (mse); + } finally { + fsdb.shutdown(); } File fileToDelete = FileUtils.getFile(getFilePath(mdFile)); From 231022eaa0d9c7647b10b0183aacf0a5c59f41e4 Mon Sep 17 00:00:00 2001 From: gothub Date: Mon, 13 Jul 2020 16:05:16 -0700 Subject: [PATCH 11/47] Use bookkeeper-client.jar, not bookkeeper.jar (#247) --- pom.xml | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pom.xml b/pom.xml index 02139762..28cab7ca 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ edu.ucsb.nceas metadig-engine - 2.2.0 + 2.3.0 jar metadig-engine @@ -12,6 +12,8 @@ https://github.com/NCEAS/metadig-engine + 0.1.0-SNAPSHOT + 2.9.8 UTF-8 2.4.0-SNAPSHOT 0.8.2242 @@ -26,8 +28,8 @@ 3.1.4.RELEASE metadig - 2.2.0 - + 2.3.0dev + **/*Test.java **/LTERSuiteTest.java @@ -39,6 +41,16 @@ http://nceas.ucsb.edu + + org.dataone + bookkeeper-client + ${bookkeeper.version} + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + org.renjin renjin-script-engine @@ -351,17 +363,15 @@ org.apache.maven.plugins maven-surefire-plugin - 2.22.1 + 3.0.0-M3 ${modules.test.excludes} **/*IT.java - From da04b951d00a12efb1b83498349dfa419d4fe20b Mon Sep 17 00:00:00 2001 From: gothub Date: Mon, 13 Jul 2020 19:22:33 -0700 Subject: [PATCH 12/47] Add convenience function to read config parameters --- .../java/edu/ucsb/nceas/mdqengine/MDQconfig.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java b/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java index d3606828..c2840e01 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java +++ 
b/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java @@ -66,4 +66,16 @@ public String getString (String paramName) throws ConfigurationException { public int getInt(String paramName) throws ConfigurationException { return(config.getInt(paramName)); } + + public static String readConfigParam (String paramName) throws ConfigurationException, IOException { + String paramValue = null; + try { + MDQconfig cfg = new MDQconfig(); + paramValue = cfg.getString(paramName); + } catch (Exception e) { + log.error("Could not read configuration for param: " + paramName + ": " + e.getMessage()); + throw e; + } + return paramValue; + } } From 4a129b78b6c2d57ff3dc342a9b014882e3194c7c Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 17 Jul 2020 14:21:10 -0700 Subject: [PATCH 13/47] Complete DataONE bookkeeper checks --- .../edu/ucsb/nceas/mdqengine/Controller.java | 54 +++++++++++-------- .../mdqengine/authentication/DataONE.java | 1 + .../BookkeeperClient.java | 14 +---- 3 files changed, 36 insertions(+), 33 deletions(-) rename src/main/java/edu/ucsb/nceas/mdqengine/{authentication => authorization}/BookkeeperClient.java (92%) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java index 4784fb37..d38ec9bf 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java @@ -1,7 +1,7 @@ package edu.ucsb.nceas.mdqengine; import com.rabbitmq.client.*; -import edu.ucsb.nceas.mdqengine.authentication.BookkeeperClient; +import edu.ucsb.nceas.mdqengine.authorization.BookkeeperClient; import edu.ucsb.nceas.mdqengine.scorer.ScorerQueueEntry; import edu.ucsb.nceas.mdqengine.exception.MetadigException; import org.apache.commons.configuration2.ex.ConfigurationException; @@ -56,6 +56,7 @@ public class Controller { // where metadig-controller and the RabbitMQ server are running in containers that belong // to the same Pod. These defaults will be used if the properties file cannot be read. // These values are read from a config file, see class 'MDQconfig' + private static Boolean bookkeeperEnabled = false; private static String RabbitMQhost = null; private static int RabbitMQport = 0; private static String RabbitMQpassword = null; @@ -246,6 +247,7 @@ public void readConfig () throws ConfigurationException, IOException { RabbitMQusername = cfg.getString("RabbitMQ.username"); RabbitMQhost = cfg.getString("RabbitMQ.host"); RabbitMQport = cfg.getInt("RabbitMQ.port"); + bookkeeperEnabled = new Boolean(cfg.getString("bookkeeper.enabled")); } @@ -297,25 +299,18 @@ public Boolean isPortalActive(String collectionId) throws MetadigException { Usage usage = null; List subjects = new ArrayList(); try { - if(bkClient.getBookkeeperEnabled()) { - // Set status = null so that any usage will be returned. - String status = null; - usages = bkClient.listUsages(0, collectionId, "portal", status , subjects); - usage = usages.get(0); - log.debug("Usage for portal " + collectionId + " is " + usage.getStatus()); - if(usage.getStatus().compareToIgnoreCase("active") == 0) { - return true; - } else { - return false; - } + // Set status = null so that any usage will be returned. 
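Aside: a usage sketch for the readConfigParam() helper added in PATCH 12 above; the property name is one that appears in metadig.properties, and the error handling mirrors the checked exceptions the method declares:

import edu.ucsb.nceas.mdqengine.MDQconfig;
import org.apache.commons.configuration2.ex.ConfigurationException;
import java.io.IOException;

public class ConfigParamExample {
    public static void main(String[] args) {
        try {
            // One-line lookup instead of constructing an MDQconfig instance by hand
            String bookkeeperUrl = MDQconfig.readConfigParam("bookkeeper.url");
            System.out.println("bookkeeper.url = " + bookkeeperUrl);
        } catch (ConfigurationException | IOException e) {
            // readConfigParam logs and rethrows, so callers still decide how to fail
            System.err.println("Could not read config: " + e.getMessage());
        }
    }
}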
+ String status = null; + usages = bkClient.listUsages(0, collectionId, "portal", status , subjects); + usage = usages.get(0); + log.debug("Usage for portal " + collectionId + " is " + usage.getStatus()); + if(usage.getStatus().compareToIgnoreCase("active") == 0) { + return true; } else { - msg = "Metadig config param 'bookkeeper.enabled is blank or missing"; - log.error(msg); - throw(new MetadigException(msg)); + return false; } } catch (Exception e) { msg = "Unable to get usage from bookkeeper for collection id: " + collectionId; - log.error(msg); throw(new MetadigException(msg)); } }; @@ -431,16 +426,29 @@ public void processScorerRequest(String collectionId, String nodeId, String formatFamily, String qualitySuiteId, - DateTime requestDateTime) throws java.io.IOException, MetadigException { + DateTime requestDateTime) throws java.io.IOException { log.info("Processing scorer request, collection: " + collectionId + ", suite: " + qualitySuiteId); ScorerQueueEntry qEntry = null; byte[] message = null; - String authToken = null; - if(!isPortalActive(collectionId)) { - log.info("[x} Skipping Scorer request for inactive portal, collectionld: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); - } else { + /** + * Bookkeeper checking can be disabled via a metadig-engine configuration parameter. The primary use case for + * doing this is for testing purposes, otherwise checking should always be enabled. + */ + if (bookkeeperEnabled) { + try { + if (!isPortalActive(collectionId)) { + log.info("Skipping Scorer request for inactive portal with pid: '" + collectionId + "'" + ", quality suite " + qualitySuiteId); + return; + } + } catch (MetadigException me) { + log.error("Unable to contact DataONE bookkeeper: " + me.getMessage() + + "\nSkipping Scorer request for portal with pid: '" + collectionId + + "'" + ", quality suite " + qualitySuiteId); + return; + } + qEntry = new ScorerQueueEntry(collectionId, qualitySuiteId, nodeId, formatFamily, requestDateTime); ByteArrayOutputStream bos = new ByteArrayOutputStream(); @@ -450,6 +458,10 @@ public void processScorerRequest(String collectionId, this.writeInProcessChannel(message, SCORER_ROUTING_KEY); log.info(" [x] Queued Scorer request for collectionld: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); + } else { + log.info("Skipping Scorer request for portal, collectionld: '" + collectionId + + "'" + ", quality suite " + qualitySuiteId + + "\n as DataONE bookkeeper service is disabled via metadig-engine configuration."); } } diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java b/src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java index 2cd1af4d..d5e8b73a 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java @@ -35,6 +35,7 @@ public static Session getSession(String subjectId, String authToken) { Subject subject = new Subject(); subject.setValue(subjectId); session.setSubject(subject); + log.debug("Set session subjectId to: " + session.getSubject().getValue()); } return session; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/authentication/BookkeeperClient.java b/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java similarity index 92% rename from src/main/java/edu/ucsb/nceas/mdqengine/authentication/BookkeeperClient.java 
rename to src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java index c055f807..d0475163 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/authentication/BookkeeperClient.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java @@ -1,6 +1,7 @@ -package edu.ucsb.nceas.mdqengine.authentication; +package edu.ucsb.nceas.mdqengine.authorization; import edu.ucsb.nceas.mdqengine.MDQconfig; +import edu.ucsb.nceas.mdqengine.authentication.DataONE; import edu.ucsb.nceas.mdqengine.exception.MetadigException; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.commons.logging.Log; @@ -24,7 +25,6 @@ public class BookkeeperClient { private static BookkeeperClient instance; public static Log log = LogFactory.getLog(DataONE.class); private String bookkeeperURL = null; - private Boolean bookkeeperEnabled = true; private String bookkeeperAuthToken = null; private BookkeeperClient () { @@ -57,20 +57,12 @@ protected void init () throws MetadigException { try { bookkeeperURL = MDQconfig.readConfigParam("bookkeeper.url"); - bookkeeperEnabled = new Boolean(MDQconfig.readConfigParam("bookkeeper.enabled")); bookkeeperAuthToken = MDQconfig.readConfigParam("bookkeeper.authToken"); } catch (ConfigurationException | IOException e) { throw new MetadigException("Unable to initialize DataONE bookkeeper client: " + e.getMessage()); } } - /** - * Get the value that indicates whether bookkeeper quota/usage checking is enabled. - */ - public Boolean getBookkeeperEnabled() { - return(this.bookkeeperEnabled); - } - /** * Retrieve a bookkeeper quota usage usage * @param id the usage database sequence identifier @@ -102,8 +94,6 @@ public List listUsages(int id, String instanceId, String quotaType, Strin } } - // Is bookkeeper authentication/checking enabled? - log.debug("bookkeeper checking is enabled."); log.debug("Using serviceURL: " + serviceURL); HttpGet httpGet = new HttpGet(serviceURL); From d13bce52b9344ba07f254fbdc99a68925c701a36 Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 17 Jul 2020 14:23:34 -0700 Subject: [PATCH 14/47] Retrieve graphs, csv based on pid, storage type, media only --- .../java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java index 5a5d17cd..1ea8d7ed 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java @@ -106,8 +106,7 @@ public MetadigFile getFileEntry(MetadigFile mdFile) throws MetadigFilestoreExcep stmt.setString(1, storageType); stmt.setString(2, altFilename); } else { - sql = "select * from filestore where pid = ? and suite_id = ?" + - " and node_id = ? and format_filter = ? and storage_type = ? and media_type = ?"; + sql = "select * from filestore where pid = ? and storage_type = ? 
and media_type = ?"; stmt = conn.prepareStatement(sql); stmt.setString(1, pid); stmt.setString(2, suiteId); From 73b008cb6311f07b60bd68f820e329c3ec2c560b Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 17 Jul 2020 14:27:08 -0700 Subject: [PATCH 15/47] Code cleanup --- .../java/edu/ucsb/nceas/mdqengine/Worker.java | 1 - .../ucsb/nceas/mdqengine/scorer/Scorer.java | 33 +++++-------------- .../nceas/mdqengine/store/InMemoryStore.java | 1 - .../ucsb/nceas/mdqengine/store/MDQStore.java | 1 - .../ucsb/nceas/mdqengine/store/MNStore.java | 1 - .../bookkeeper/BookkeeperClientTest.java | 2 +- .../mdqengine/filestore/FilestoreTestIT.java | 2 +- 7 files changed, 11 insertions(+), 30 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java b/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java index 5e267dce..7cd516bb 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java @@ -242,7 +242,6 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp /* Once the quality report has been created and saved to persistent storage, it can be added to the Solr index */ if(!failFast) { - MDQStore dbstore = null; log.debug("Indexing report"); try { startTimeIndexing = System.currentTimeMillis(); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index ac8cfbd1..e160635f 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -512,6 +512,7 @@ which will be used to query DataONE Solr for all the pids associated with that p int resultCount = 0; startPos = 0; countRequested = 1000; + // Now get the pids associated with the collection by sending the collectionQuery to the DataONE CN // The collectionQuery is always evaluated on the CN, as portals should have all DataONE data available to them. // One query can return many documents, so use the paging mechanism to make sure we retrieve them all. @@ -530,23 +531,17 @@ which will be used to query DataONE Solr for all the pids associated with that p // Loop through the Solr result. As the result may be large, page through the results, accumulating // the pids returned - // Determine where the collectionQuery should be evaluated. When the DataONE quata service is ready, query it - // for this collection to determine if the collectionQuery should be sent to the CN. Since this service is - // not ready, send the query to the same serviceUrl, subjectId, authToken which was used to harvest the - // collection document and obtain the collectionQuery string - - // When the service is available, use the DataONE quota service to set these variable conditionally - String evalServiceUrl = serviceUrl; - String evalSubjectId = subjectId; - String evalAuthToken = authToken; + /** The collectionQuery is evaluated on the same node that the portal document was harvested from (via the + * DataONE listObjects service. This node could either be an MN or CN. + */ - log.debug("Sending collectionQuery to Solr using subjectId: " + evalSubjectId + ", servicerUrl: " + evalServiceUrl); - log.trace("query string: " + queryStr); + log.debug("Sending collectionQuery to Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl); + log.debug("query string: " + queryStr); do { //TODO: check that a result was returned // Note: the collectionQuery is always evaluated on the CN, so that the entire DataONE network is queried. 
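Stepping back, this do-loop condenses to the following paging pattern (a sketch, not the verbatim code; the identifiers come from the surrounding method, the page size of 1000 is the countRequested value set earlier, and the real loop exits when a page comes back empty):

    // Condensed sketch of the collectionQuery paging loop.
    int startPos = 0;
    final int countRequested = 1000;          // page size set earlier in this method
    while (true) {
        Document xmldoc = queryD1Solr(queryStr, serviceUrl, startPos,
                countRequested, subjectId, authToken);
        if (xmldoc == null) {
            break;                            // no (more) results returned
        }
        // ... extract the pids from xmldoc and accumulate them ...
        startPos += countRequested;           // advance to the next page
    }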
- xmldoc = queryD1Solr(queryStr, evalServiceUrl, startPos, countRequested, evalSubjectId, evalAuthToken); + xmldoc = queryD1Solr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken); if(xmldoc == null) { log.info("no values returned from query"); break; @@ -802,12 +797,6 @@ private Document queryD1Solr(String queryStr, String serviceUrl, int startPos, i MultipartRestClient mrc = null; // Polymorphism doesn't work with D1 node classes, so have to use the derived classes MultipartD1Node d1Node = null; -// -// Subject subject = new Subject(); -// if(subjectId != null && !subjectId.isEmpty()) { -// subject.setValue(subjectId); -// } - Session session = DataONE.getSession(subjectId, authToken); // Add the start and count, if pagination is being used @@ -818,6 +807,7 @@ private Document queryD1Solr(String queryStr, String serviceUrl, int startPos, i try { d1Node = getMultipartD1Node(session, serviceUrl); + log.debug("Created MultipartD1Node: " + d1Node.toString()); } catch (Exception ex) { log.error("Unable to create MultipartD1Node for Solr query"); metadigException = new MetadigProcessException("Unable to create multipart node client to query DataONE solr: " + ex.getMessage()); @@ -1031,11 +1021,6 @@ private SubjectInfo getSubjectInfo(Subject rightsHolder, String serviceUrl, Stri MetadigProcessException metadigException = null; SubjectInfo subjectInfo = null; - //Subject requestingSubject = new Subject(); -// if(subjectId != null && ! subjectId.isEmpty()) { -// requestingSubject.setValue(subjectId); -// } - Session session = DataONE.getSession(subjectId, authToken); // Identity node as either a CN or MN based on the serviceUrl @@ -1100,7 +1085,7 @@ MultipartD1Node getMultipartD1Node(Session session, String serviceUrl) throws Me log.debug("creating cn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); d1Node = new MultipartCNode(mrc, serviceUrl, session); } else { - log.debug("creating mn MultipartMNode" + " , subjectId: " + session.getSubject().getValue()); + log.debug("creating mn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); d1Node = new MultipartMNode(mrc, serviceUrl, session); } return d1Node; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java index cf3c0ffa..44bb386c 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java @@ -1,6 +1,5 @@ package edu.ucsb.nceas.mdqengine.store; -import com.sun.javafx.scene.control.skin.TableCellSkin; import edu.ucsb.nceas.mdqengine.MDQconfig; import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; import edu.ucsb.nceas.mdqengine.model.*; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java index 013b1a77..fbef0bc3 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java @@ -1,6 +1,5 @@ package edu.ucsb.nceas.mdqengine.store; -import com.sun.javafx.scene.control.skin.TableCellSkin; import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; import edu.ucsb.nceas.mdqengine.model.*; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java index 34c0efef..ec7a2772 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java +++ 
b/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java @@ -1,6 +1,5 @@ package edu.ucsb.nceas.mdqengine.store; -import com.sun.javafx.scene.control.skin.TableCellSkin; import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; import edu.ucsb.nceas.mdqengine.model.*; import edu.ucsb.nceas.mdqengine.serialize.XmlMarshaller; diff --git a/src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java b/src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java index 0c727c49..7e6c4a88 100644 --- a/src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java +++ b/src/test/java/edu/ucsb/nceas/mdqengine/bookkeeper/BookkeeperClientTest.java @@ -1,6 +1,6 @@ package edu.ucsb.nceas.mdqengine.bookkeeper; -import edu.ucsb.nceas.mdqengine.authentication.BookkeeperClient; +import edu.ucsb.nceas.mdqengine.authorization.BookkeeperClient; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dataone.bookkeeper.api.Usage; diff --git a/src/test/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreTestIT.java b/src/test/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreTestIT.java index bac92f10..0d4c8b05 100644 --- a/src/test/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreTestIT.java +++ b/src/test/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreTestIT.java @@ -42,7 +42,7 @@ public void saveFile() throws IOException, MetadigFilestoreException { MetadigFile mdFile = new MetadigFile(); mdFile.setCreationDatetime(DateTime.now()); - mdFile.setCollectionId("1234"); + mdFile.setPid("1234"); mdFile.setSuiteId("FAIR.suite.1"); mdFile.setNodeId("urn:node:KNB"); mdFile.setStorageType(StorageType.TMP.toString()); From b961bdf906c8eb898c1ae8f6eb29dfb7ce8b26ef Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 17 Jul 2020 14:27:41 -0700 Subject: [PATCH 16/47] Update sql init scripts --- src/main/resources/sql/quality-v2.3.0.sql | 3 ++- src/main/resources/sql/update_to_v2.3.0.sql | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/resources/sql/quality-v2.3.0.sql b/src/main/resources/sql/quality-v2.3.0.sql index 964af51d..3c4e7dfb 100644 --- a/src/main/resources/sql/quality-v2.3.0.sql +++ b/src/main/resources/sql/quality-v2.3.0.sql @@ -56,7 +56,8 @@ create TABLE filestore ( media_type TEXT not NULL, alt_filename TEXT not NULL, CONSTRAINT file_id_pk PRIMARY KEY (file_id), - CONSTRAINT all_properties_fk UNIQUE (pid, suite_id, node_id, format_filter, storage_type, media_type, alt_filename) + -- CONSTRAINT all_properties_fk UNIQUE (pid, suite_id, node_id, format_filter, storage_type, media_type, alt_filename) + CONSTRAINT all_properties_fk UNIQUE (pid, storage_type, media_type, alt_filename) ); alter table filestore owner to metadig; diff --git a/src/main/resources/sql/update_to_v2.3.0.sql b/src/main/resources/sql/update_to_v2.3.0.sql index 5f6c4de1..90fe8da3 100644 --- a/src/main/resources/sql/update_to_v2.3.0.sql +++ b/src/main/resources/sql/update_to_v2.3.0.sql @@ -1,5 +1,6 @@ -ALTER TABLE filestore DROP CONSTRAINT all_properties_fk; +ALTER TABLE filestore DROP CONSTRAINT IF EXISTS all_properties_fk; ALTER TABLE filestore RENAME COLUMN collection_id to pid; ALTER TABLE filestore DROP column metadata_id; -ALTER TABLE filestore ADD CONSTRAINT all_properties_fk UNIQUE (pid, suite_id, node_id, format_filter, storage_type, media_type, alt_filename); +-- ALTER TABLE filestore ADD CONSTRAINT all_properties_fk UNIQUE (pid, suite_id, node_id, format_filter, storage_type, media_type, alt_filename); +ALTER TABLE 
filestore ADD CONSTRAINT all_properties_fk UNIQUE (pid, storage_type, media_type, alt_filename); From 0720f498ffbef6c8f203cf4900f2e193371126fb Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 17 Jul 2020 14:29:01 -0700 Subject: [PATCH 17/47] Update debug logging conf --- pom.xml | 25 ++++----- src/main/resources/commons-logging.properties | 5 ++ src/main/resources/log4j.properties | 52 +++++++++---------- 3 files changed, 41 insertions(+), 41 deletions(-) create mode 100644 src/main/resources/commons-logging.properties diff --git a/pom.xml b/pom.xml index 28cab7ca..53f787f0 100644 --- a/pom.xml +++ b/pom.xml @@ -117,17 +117,6 @@ ${d1_libclient_java.version} jar - - org.slf4j - slf4j-simple - 1.7.25 - - - - org.slf4j - slf4j-api - 1.7.25 - com.rabbitmq @@ -144,6 +133,16 @@ org.apache.solr solr-solrj 7.3.0 + + + org.apache.logging.log4j + log4j-api + + + org.apache.logging.log4j + log4j-core + + org.apache.solr @@ -170,10 +169,6 @@ ${d1_cn_index_processor_version} jar - - org.slf4j - slf4j-log4j12 - com.hp.hpl.jena jena diff --git a/src/main/resources/commons-logging.properties b/src/main/resources/commons-logging.properties new file mode 100644 index 00000000..a13fe14c --- /dev/null +++ b/src/main/resources/commons-logging.properties @@ -0,0 +1,5 @@ +org.apache.commons.logging.LogFactory=org.apache.commons.logging.impl.LogFactoryImpl +org.apache.commons.logging.Log=org.apache.commons.logging.impl.Log4JLogger +log4j.configuration=log4j.properties + +#org.apache.commons.logging.Log=org.apache.commons.logging.impl.SimpleLog diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties index b9d51737..4050b437 100755 --- a/src/main/resources/log4j.properties +++ b/src/main/resources/log4j.properties @@ -1,31 +1,31 @@ +#log4j.rootLogger=ALL, stdout, warnStdout +log4j.rootLogger=ALL, stdout -# set the log level to WARN and the log should be printed to stdout. -log4j.rootLogger=WARN, stderr -#log4j.threshold=FATAL, ERROR, WARN, INFO +# configure stdout +# set the conversion pattern of stdout +# Print the date in ISO 8601 format -### LOGGING TO CONSOLE ######################################################### -log4j.appender.stderr=org.apache.log4j.ConsoleAppender -log4j.appender.stderr.layout=org.apache.log4j.PatternLayout +#This will be used to print WARN level or higher messages to console +#log4j.appender.warnStdout=org.apache.log4j.ConsoleAppender +#log4j.appender.warnStdout.layout=org.apache.log4j.PatternLayout +#log4j.appender.warnStdout.Threshold=WARN -# define the pattern to be used in the logs... -log4j.appender.stderr.layout.ConversionPattern=%d{yyyyMMdd-HH:mm:ss}: [%p]: %m [%c]%n +log4j.appender.stdout = org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Threshold = DEBUG +#log4j.appender.stdout.Target = System.out +log4j.appender.stdout.layout = org.apache.log4j.PatternLayout +#log4j.appender.stdout.layout.ConversionPattern = %-5p %d [%t][%F:%L] : %m%n +log4j.appender.stdout.layout.ConversionPattern = %-5p %d [%F:%L] : %m%n +#log4j.appender.stdout.filter.filter1=org.apache.log4j.varia.LevelRangeFilter +#log4j.appender.stdout.filter.filter1.levelMin=INFO +#log4j.appender.stdout.filter.filter1.levelMax=WARN -# %p -> priority level of the event - (e.g. WARN) -# %m -> message to be printed -# %c -> category name ... in this case name of the class -# %d -> Used to output the date of the logging event. example, %d{HH:mm:ss,SSS} or %d{dd MMM yyyy HH:mm:ss,SSS}. Default format is ISO8601 format -# %M -> print the method name where the event was generated ... 
can be extremely slow. -# %L -> print the line number of the event generated ... can be extremely slow. -# %t -> Used to output the name of the thread that generated the log event -# %n -> carriage return - -################################################################################ -# EXAMPLE: Print only messages of level WARN or above in the package com.foo: -log4j.logger.edu.ucsb.nceas=INFO -#log4j.logger.edu.ucsb.nceas=DEBUG -#log4j.logger.com.hp.hpl.jena=WARN -log4j.logger.org.dataone.ore=ERROR -log4j.logger.org.dataone.client=ERROR -#log4j.logger.org.apache.http=DEBUG -org.dataone.client.auth=ERROR +# Classes in the my.project package will accept messages of INFO level or higher +# and send those messages to the console and to the log file +log4j.logger.org.edu.ucsb.nceas=DEBUG, stdout +log4j.logger.org.apache=WARN, stdout +log4j.logger.org.dataone=WARN, stdout +# Need to set additivity to false or else both the my.project and root loggers +# will accept messages from classes in package my.project +#log4j.additivity.org.edu.ucsb.nceas \ No newline at end of file From 370f4762efb0e03b4b65bbe54da90a872636c373 Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 17 Jul 2020 14:29:38 -0700 Subject: [PATCH 18/47] Exclude unneeded jars from build --- pom.xml | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 53f787f0..c71130a1 100644 --- a/pom.xml +++ b/pom.xml @@ -22,8 +22,7 @@ 2.6.3 2.4.0-SNAPSHOT 2.4.0-SNAPSHOT - + 3.1.4.RELEASE @@ -41,10 +40,84 @@ http://nceas.ucsb.edu + + + commons-logging + commons-logging + 1.2 + + + + org.apache.logging.log4j + log4j-core + 2.13.3 + + + + org.apache.logging.log4j + log4j-api + 2.13.3 + + + + + + + + + + + org.dataone bookkeeper-client ${bookkeeper.version} + + + io.dropwizard + dropwizard-core + + + io.dropwizard + dropwizard-json-logging + + + io.dropwizard + dropwizard-testing + + + io.dropwizard + dropwizard-jdbi3 + + + io.dropwizard + dropwizard-auth + + + org.postgresql + postgresql + + + com.opentable.components + otj-pg-embedded + + + com.opentable.components + otj-pg-embedded + + + org.flywaydb + flyway-maven-plugin + + + org.mockito + mockito-core + + + org.dataone + d1_libclient_java + + com.fasterxml.jackson.core From ff6524a85e0496c1aa0a4a5e0251a725a1f24831 Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 17 Jul 2020 14:30:38 -0700 Subject: [PATCH 19/47] remove unneeded, obsolete files --- src/test/resources/log4j.properties | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100755 src/test/resources/log4j.properties diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties deleted file mode 100755 index 61ead9c3..00000000 --- a/src/test/resources/log4j.properties +++ /dev/null @@ -1,28 +0,0 @@ - -# set the log level to WARN and the log should be printed to stdout. -log4j.rootLogger=WARN, stderr -#log4j.threshold=FATAL, ERROR, WARN, INFO - - -### LOGGING TO CONSOLE ######################################################### -log4j.appender.stderr=org.apache.log4j.ConsoleAppender -log4j.appender.stderr.layout=org.apache.log4j.PatternLayout - -# define the pattern to be used in the logs... -log4j.appender.stderr.layout.ConversionPattern=%d{yyyyMMdd-HH:mm:ss}: [%p]: %m [%c]%n - -# %p -> priority level of the event - (e.g. WARN) -# %m -> message to be printed -# %c -> category name ... in this case name of the class -# %d -> Used to output the date of the logging event. 
example, %d{HH:mm:ss,SSS} or %d{dd MMM yyyy HH:mm:ss,SSS}. Default format is ISO8601 format -# %M -> print the method name where the event was generated ... can be extremely slow. -# %L -> print the line number of the event generated ... can be extremely slow. -# %t -> Used to output the name of the thread that generated the log event -# %n -> carriage return - -################################################################################ -# EXAMPLE: Print only messages of level WARN or above in the package com.foo: -log4j.logger.edu.ucsb.nceas=DEBUG -#log4j.logger.com.hp.hpl.jena=WARN -#log4j.logger.org.dataone.ore=INFO -#log4j.logger.org.apache.http=DEBUG From 01a01546d1e44c19b771267e260a1b2bad542973 Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 24 Jul 2020 10:06:09 -0700 Subject: [PATCH 20/47] Use bookkeeper `/usages/status` vs `/usages` (getStatus() vs listUsages()) (#247) --- .../edu/ucsb/nceas/mdqengine/Controller.java | 19 ++--- .../authorization/BookkeeperClient.java | 75 +++++++++++++++++++ 2 files changed, 85 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java index d38ec9bf..aaad57f6 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java @@ -8,6 +8,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dataone.bookkeeper.api.Usage; +import org.dataone.bookkeeper.api.UsageStatus; import org.dataone.exceptions.MarshallingException; import org.dataone.service.types.v2.SystemMetadata; import org.dataone.service.types.v2.TypeFactory; @@ -18,7 +19,6 @@ import java.lang.reflect.InvocationTargetException; import java.net.ServerSocket; import java.net.Socket; -import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -72,6 +72,8 @@ public class Controller { public static void main(String[] argv) throws Exception { + //System.setProperty("lo4j2.debug", "true"); + //System.setProperty("log4j.configurationFile", "log4j2.xml"); Controller metadigCtrl = Controller.getInstance(); metadigCtrl.start(); if (metadigCtrl.getIsStarted()) { @@ -292,25 +294,24 @@ public void disableTestMode() { // Check the portal quota with DataONE bookkeaper public Boolean isPortalActive(String collectionId) throws MetadigException { // Check the portal quota with DataONE bookkeeper - log.debug("Checking bookkeeper portal Usage for collection: " + collectionId); + log.info("Checking bookkeeper portal Usage for collection: " + collectionId); String msg = null; BookkeeperClient bkClient = BookkeeperClient.getInstance(); List usages = null; - Usage usage = null; - List subjects = new ArrayList(); + UsageStatus usageStatus = null; try { // Set status = null so that any usage will be returned. 
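Taken together, the new flow in isPortalActive condenses to the sketch below; the seriesId value is illustrative, and getUsageStatus is the wrapper added to BookkeeperClient later in this patch:

    import edu.ucsb.nceas.mdqengine.authorization.BookkeeperClient;
    import edu.ucsb.nceas.mdqengine.exception.MetadigException;
    import org.dataone.bookkeeper.api.UsageStatus;

    public class PortalStatusExample {
        public static void main(String[] args) throws MetadigException {
            // Illustrative portal seriesId; any instanceId known to bookkeeper works.
            String seriesId = "urn:uuid:00000000-0000-0000-0000-000000000000";
            UsageStatus status = BookkeeperClient.getInstance()
                    .getUsageStatus(seriesId, "portal");
            boolean active = status.getStatus().compareToIgnoreCase("active") == 0;
            System.out.println("portal active: " + active);
        }
    }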
String status = null; - usages = bkClient.listUsages(0, collectionId, "portal", status , subjects); - usage = usages.get(0); - log.debug("Usage for portal " + collectionId + " is " + usage.getStatus()); - if(usage.getStatus().compareToIgnoreCase("active") == 0) { + //usages = bkClient.listUsages(0, collectionId, "portal", status , subjects); + usageStatus = bkClient.getUsageStatus(collectionId, "portal"); + log.info("Usage status for portal " + collectionId + " is " + usageStatus.getStatus()); + if(usageStatus.getStatus().compareToIgnoreCase("active") == 0) { return true; } else { return false; } } catch (Exception e) { - msg = "Unable to get usage from bookkeeper for collection id: " + collectionId; + msg = "Unable to get usage status from bookkeeper for collection id: " + collectionId; throw(new MetadigException(msg)); } }; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java b/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java index d0475163..9dd246a8 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java @@ -15,6 +15,7 @@ import org.apache.http.impl.client.HttpClients; import org.dataone.bookkeeper.api.Usage; import org.dataone.bookkeeper.api.UsageList; +import org.dataone.bookkeeper.api.UsageStatus; import java.io.*; import java.io.IOException; @@ -158,4 +159,78 @@ public List listUsages(int id, String instanceId, String quotaType, Strin } } } + /** + * Retrieve a bookkeeper quota usage usage + * @param instanceId the usage instance identifier + * @param quotaType the usage quota type ("portal" | "storage" | ...) + * @return + * @throws MetadigException + */ + public UsageStatus getUsageStatus(String instanceId, String quotaType) throws MetadigException { + // Check the portal quota with DataONE bookkeeper + String serviceURL = this.bookkeeperURL; + ObjectMapper objectMapper = new ObjectMapper(); + CloseableHttpClient httpClient = HttpClients.createDefault(); + + log.debug("Getting bookkeeper portal Usage for quotaType, instanceId: " + + quotaType + ", " + instanceId); + serviceURL += "/usages/status?quotaType=" + quotaType + "&instanceId=" + String.valueOf(instanceId); + + log.debug("Using serviceURL: " + serviceURL); + HttpGet httpGet = new HttpGet(serviceURL); + + String msg = null; + // Send a request to the bookkeeper service for the quota related to this portal + try { + httpGet.addHeader("Accept", "application/json"); + + log.debug("Submitting request to DataONE bookkeeper: " + serviceURL); + // send the request to bookkeeper + CloseableHttpResponse httpResponse = httpClient.execute(httpGet); + // Delete the token + + // Read the response from bookkeeper + StringBuffer response = new StringBuffer(); + int statusCode = httpResponse.getStatusLine().getStatusCode(); + + // If the HTTP request returned without an error, convert the result to a JSON string, + // then deserialize to a Java object so that we can easily inspect it. 
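The comment above describes a plain Jackson bind. In isolation, and assuming a response body shaped the way the surrounding code expects (only the "status" field is inspected; the body shown is illustrative), the step is just:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import org.dataone.bookkeeper.api.UsageStatus;

    public class UsageStatusBindExample {
        public static void main(String[] args) throws Exception {
            // Assumed/illustrative response body from /usages/status.
            String json = "{\"status\":\"active\"}";
            UsageStatus status = new ObjectMapper().readValue(json, UsageStatus.class);
            System.out.println(status.getStatus());  // -> active
        }
    }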
+ if(statusCode == HttpStatus.SC_OK) { + BufferedReader reader = new BufferedReader(new InputStreamReader(httpResponse.getEntity().getContent())); + String inputLine; + response = new StringBuffer(); + + while ((inputLine = reader.readLine()) != null) { + response.append(inputLine); + } + + UsageStatus usageStatus = objectMapper.readValue(response.toString(), UsageStatus.class); + if (usageStatus == null) { + msg = "No usage status returned."; + log.error(msg); + throw(new MetadigException(msg)); + } + log.debug("Bookkeeper Usage status found for portal " + instanceId + usageStatus.getStatus()); + return(usageStatus); + } else { + log.debug("Getting bookkeeper portal usage status for quotaType, instanceId, status: " + + quotaType + ", " + instanceId); + msg = "HTTP error status getting bookkeeper usage status for quotaType, instanceId:" + + quotaType + "," + instanceId; + httpResponse.getStatusLine().getReasonPhrase(); + log.error(msg); + throw(new MetadigException(msg)); + } + } catch (IOException ioe) { + msg = "Error getting bookkeeper usage status: " + ioe.getMessage(); + log.error(msg); + throw(new MetadigException(msg)); + } finally { + try { + httpClient.close(); + } catch (IOException e) { + log.warn("Error closing connection to bookkeeper client: " + e.getMessage()); + } + } + } } From 7adb869b64020a5c171760361365791d5273798c Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 24 Jul 2020 10:07:06 -0700 Subject: [PATCH 21/47] Update debug logging --- src/main/resources/commons-logging.properties | 6 +-- src/main/resources/log4j.properties | 52 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) mode change 100755 => 100644 src/main/resources/log4j.properties diff --git a/src/main/resources/commons-logging.properties b/src/main/resources/commons-logging.properties index a13fe14c..b2b80118 100644 --- a/src/main/resources/commons-logging.properties +++ b/src/main/resources/commons-logging.properties @@ -1,5 +1,5 @@ -org.apache.commons.logging.LogFactory=org.apache.commons.logging.impl.LogFactoryImpl +# Explicitly set the Apache Commons Logging (JCL) implementation to log4j. +# JCL will attempt to discover an implementation if one is not specified, so +# make sure log4j is used. org.apache.commons.logging.Log=org.apache.commons.logging.impl.Log4JLogger log4j.configuration=log4j.properties - -#org.apache.commons.logging.Log=org.apache.commons.logging.impl.SimpleLog diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties old mode 100755 new mode 100644 index 4050b437..1485317d --- a/src/main/resources/log4j.properties +++ b/src/main/resources/log4j.properties @@ -1,31 +1,31 @@ -#log4j.rootLogger=ALL, stdout, warnStdout -log4j.rootLogger=ALL, stdout +# set the log level to WARN and the log should be printed to stdout. +log4j.rootLogger=DEBUG, stderr +#log4j.threshold=FATAL, ERROR, WARN, INFO -# configure stdout -# set the conversion pattern of stdout -# Print the date in ISO 8601 format +### LOGGING TO CONSOLE ######################################################### +log4j.appender.stderr=org.apache.log4j.ConsoleAppender +log4j.appender.stderr.layout=org.apache.log4j.PatternLayout -#This will be used to print WARN level or higher messages to console -#log4j.appender.warnStdout=org.apache.log4j.ConsoleAppender -#log4j.appender.warnStdout.layout=org.apache.log4j.PatternLayout -#log4j.appender.warnStdout.Threshold=WARN +# define the pattern to be used in the logs... 
+log4j.appender.stderr.layout.ConversionPattern=%d{yyyyMMdd-HH:mm:ss}: [%p]: %m [%c]%n -log4j.appender.stdout = org.apache.log4j.ConsoleAppender -log4j.appender.stdout.Threshold = DEBUG -#log4j.appender.stdout.Target = System.out -log4j.appender.stdout.layout = org.apache.log4j.PatternLayout -#log4j.appender.stdout.layout.ConversionPattern = %-5p %d [%t][%F:%L] : %m%n -log4j.appender.stdout.layout.ConversionPattern = %-5p %d [%F:%L] : %m%n -#log4j.appender.stdout.filter.filter1=org.apache.log4j.varia.LevelRangeFilter -#log4j.appender.stdout.filter.filter1.levelMin=INFO -#log4j.appender.stdout.filter.filter1.levelMax=WARN +# %p -> priority level of the event - (e.g. WARN) +# %m -> message to be printed +# %c -> category name ... in this case name of the class +# %d -> Used to output the date of the logging event. example, %d{HH:mm:ss,SSS} or %d{dd MMM yyyy HH:mm:ss,SSS}. Default format is ISO8601 format +# %M -> print the method name where the event was generated ... can be extremely slow. +# %L -> print the line number of the event generated ... can be extremely slow. +# %t -> Used to output the name of the thread that generated the log event +# %n -> carriage return -# Classes in the my.project package will accept messages of INFO level or higher -# and send those messages to the console and to the log file -log4j.logger.org.edu.ucsb.nceas=DEBUG, stdout -log4j.logger.org.apache=WARN, stdout -log4j.logger.org.dataone=WARN, stdout -# Need to set additivity to false or else both the my.project and root loggers -# will accept messages from classes in package my.project -#log4j.additivity.org.edu.ucsb.nceas \ No newline at end of file +################################################################################ +# EXAMPLE: Print only messages of level WARN or above in the package com.foo: +log4j.logger.edu.ucsb.nceas.mdqengine=DEBUG +#log4j.logger.com.hp.hpl.jena=WARN +log4j.logger.org.dataone.ore=ERROR +log4j.logger.org.dataone.client=ERROR +#log4j.logger.org.apache.http=DEBUG +log4j.logger.org.dataone.client.auth=ERROR +log4j.logger.org.apache.commons.beanutils=WARN +log4j.logger.org.apache.http=WARN From 5b9282462860cd95c0fa31f3e9c53c543ce34e46 Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 24 Jul 2020 10:18:42 -0700 Subject: [PATCH 22/47] Enable TLS client authentication (#258) --- .../Admin/Authentication/update-LE-cert.sh | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/Kubernetes/Admin/Authentication/update-LE-cert.sh b/Kubernetes/Admin/Authentication/update-LE-cert.sh index 18f00010..6196572f 100644 --- a/Kubernetes/Admin/Authentication/update-LE-cert.sh +++ b/Kubernetes/Admin/Authentication/update-LE-cert.sh @@ -8,13 +8,19 @@ debug=1 # The user managing k8s user=metadig # k8s namespace that we are managing -ns=metadig +#k8sns=metadig +k8sns=nginx-ingress # Save current LE cert modified time so we can see if certbot delivers # new certs -host=`hostname -f` -CA_DIR=/etc/letsencrypt/live/${host} -certFilename=${CA_DIR}/cert.pem +domain=`hostname -f` +damainDir=$domain +domain=api.test.dataone.org,${domain} +CA_DIR=/etc/letsencrypt/live/${domainDir} +# Use fullchain.pem, which includes the intermediate certificate, that will allow TLS +# client authentication, for those clients that don't know about LE certs +#certFilename=${CA_DIR}/cert.pem +certFilename=${CA_DIR}/fullchain.pem privkeyFilename=${CA_DIR}/privkey.pem certModTime=`stat -c %Y ${certFilename}` @@ -28,7 +34,8 @@ certModTime=`stat -c %Y ${certFilename}` # the IP that the certbot 
request will come from. ufw allow 80 #sudo ufw allow from ${certbotIP} to any port 80 -/usr/bin/certbot renew > /var/log/letsencrypt/letsencrypt-renew.log 2>&1 +#/usr/bin/certbot renew -d ${domain} > /var/log/letsencrypt/letsencrypt-renew.log 2>&1 +/usr/bin/certbot renew -d ${domain} > /var/log/letsencrypt/letsencrypt-renew.log 2>&1 # Close the port as soon as certbot is done ufw delete allow 80 #sudo ufw delete allow from ${certbotIP} to any port 80 @@ -55,7 +62,8 @@ if (( $certModTimeNew > $certModTime )); then su ${user} -c "kubectl get secret ${k8sns}-tls-cert --namespace ${k8sns}" su ${user} -c "kubectl delete secret ${k8sns}-tls-cert --namespace ${k8sns}" #sudo kubectl create secret tls ${k8sns}-tls-cert --key ${CA_DIR}/privkey.pem --cert ${CA_DIR}/cert.pem --namespace ${k8sns} - su ${user} -c "kubectl create secret tls ${k8sns}-tls-cert --key ~${user}/tmp/privkey.pem --cert ~${user}/tmp/cert.pem --namespace ${k8sns}" + #su ${user} -c "kubectl create secret tls ${k8sns}-tls-cert --key ~${user}/tmp/privkey.pem --cert ~${user}/tmp/cert.pem --namespace ${k8sns}" + su ${user} -c "kubectl create secret tls ${k8sns}-tls-cert --key ~${user}/tmp/privkey.pem --cert ~${user}/tmp/chain.pem --namespace ${k8sns}" #su metadig -c "kubectl get secret metadig-tls-cert --namespace metadig" rm -f ~${user}/tmp/privkey.pem ~${user}/tmp/cert.pem @@ -65,4 +73,4 @@ else if (( $debug )); then echo "Let's Encrypt cert not updated by certbot, Not updating k8s with new certfile " fi -fi +fi \ No newline at end of file From e95601df90aa9ab7babf47c44ba48ae79d9b0bd7 Mon Sep 17 00:00:00 2001 From: gothub Date: Fri, 24 Jul 2020 10:21:17 -0700 Subject: [PATCH 23/47] Add Solr LE cert update script --- .../Admin/Solr/renew-LE-cert-for-solr.sh | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 Kubernetes/Admin/Solr/renew-LE-cert-for-solr.sh diff --git a/Kubernetes/Admin/Solr/renew-LE-cert-for-solr.sh b/Kubernetes/Admin/Solr/renew-LE-cert-for-solr.sh new file mode 100644 index 00000000..8304f22d --- /dev/null +++ b/Kubernetes/Admin/Solr/renew-LE-cert-for-solr.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Follow these steps in order to enable SSL for solr standalone server. +# From SO: https://stackoverflow.com/questions/41592427/letsencypt-solr-ssl-jvm +# As i have a key for the Domain already, and Solr responds on mydomain.com:8983 all that is needed is to create a Java Key Store (jks) from the existing keys on the system + +# Note: Use the password "metadig" when prompted by openssl +sudo openssl pkcs12 -export -in /etc/letsencrypt/live/docker-ucsb-4.dataone.org/fullchain.pem -inkey /etc/letsencrypt/live/docker-ucsb-4.dataone.org/privkey.pem -out pkcs.p12 -name metadig + +# specifing the location of the Lets-Encrypt Cert (on my system /etc/letsencrypt/live/mydomain.com/) +# Then convert the PKCS12 key to a jks, replacing password where needed. 
+ +# keytool -importkeystore -deststorepass PASSWORD_STORE -destkeypass PASSWORD_KEYPASS -destkeystore keystore.jks -srckeystore pkcs.p12 -srcstoretype PKCS12 -srcstorepass STORE_PASS -alias NAME + +sudo keytool -importkeystore -deststorepass metadig -destkeypass metadig -destkeystore keystore.jks -srckeystore pkcs.p12 -srcstoretype PKCS12 -srcstorepass metadig -alias metadig +sudo cp keystore.jks /opt/solr/server/etc/solr-ssl-letsencrypt.keystore.jks +sudo chown solr /opt/solr/server/etc/solr-ssl-letsencrypt.keystore.jks +sudo chgrp solr /opt/solr/server/etc/solr-ssl-letsencrypt.keystore.jks + +rm -f keystore.jks + +# Now that the keystore has been created, Solr must be told where it is: + +#* on docker-ucsb-4, the ’service solr start’ (/etc/init.d/solr) reads from /etc/default/solr.in.sh +# * these values are currently used +# * SOLR_SSL_ENABLED=true +# * # Uncomment to set SSL-related system properties +# * # Be sure to update the paths to the correct keystore for your environment +# * SOLR_SSL_KEY_STORE=/opt/solr/server/etc/solr-ssl-letsencrypt.keystore.jks +# * SOLR_SSL_KEY_STORE_PASSWORD=metadig +# * SOLR_SSL_KEY_STORE_TYPE=JKS +# * SOLR_SSL_TRUST_STORE=/opt/solr/server/etc/solr-ssl-letsencrypt.keystore.jks +# * SOLR_SSL_TRUST_STORE_PASSWORD=metadig +# * SOLR_SSL_TRUST_STORE_TYPE=JKS +# * #SOLR_SSL_NEED_CLIENT_AUTH=false +# * SOLR_SSL_WANT_CLIENT_AUTH=false + + +# Now restart Solr +sudo service solr restart From e0a7aab674f79e6b006a5d53d538083452611839 Mon Sep 17 00:00:00 2001 From: gothub Date: Mon, 27 Jul 2020 06:00:13 -0700 Subject: [PATCH 24/47] Enable/disable bookkeeper check with config parameter (#247) --- .../edu/ucsb/nceas/mdqengine/Controller.java | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java index aaad57f6..be83cd6e 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java @@ -414,7 +414,7 @@ public void processQualityRequest(String memberNode, * create the graph from them. *
<p>
* - * @param collectionId the DataONE collection identifier + * @param collectionId the DataONE collection identifier (the portal seriesId) * @param nodeId the node identifier the collection resides on * @param formatFamily a string representing the DataONE formats to create score for * @param qualitySuiteId the quality suite used to create the score graph @@ -429,7 +429,8 @@ public void processScorerRequest(String collectionId, String qualitySuiteId, DateTime requestDateTime) throws java.io.IOException { - log.info("Processing scorer request, collection: " + collectionId + ", suite: " + qualitySuiteId); + log.info("Processing scorer request, collection: " + collectionId + ", suite: " + qualitySuiteId + + "nodeId: " + nodeId + ", formatFamily: " + formatFamily); ScorerQueueEntry qEntry = null; byte[] message = null; @@ -439,31 +440,34 @@ public void processScorerRequest(String collectionId, */ if (bookkeeperEnabled) { try { + // Bookkeeper creates a portal usage with the portal sid as the 'instanceId', however if (!isPortalActive(collectionId)) { log.info("Skipping Scorer request for inactive portal with pid: '" + collectionId + "'" + ", quality suite " + qualitySuiteId); return; + } else { + log.info("Bookkeeper check indicates portal for pid: " + collectionId + " is active."); + log.info("Processing with Scorer request for inactive portal with pid: '" + collectionId + "'" + ", quality suite " + qualitySuiteId); } } catch (MetadigException me) { - log.error("Unable to contact DataONE bookkeeper: " + me.getMessage() + log.error("Unable to contact DataONE bookkeeper: " + me.getMessage() + "\nSkipping Scorer request for portal with pid: '" + collectionId + "'" + ", quality suite " + qualitySuiteId); return; } + } else { + log.info("Bookkeeper quota checking is disabled, proceeding with Scorer request for portal, collectionld: '" + collectionId + + "'" + ", quality suite " + qualitySuiteId); + } - qEntry = new ScorerQueueEntry(collectionId, qualitySuiteId, nodeId, formatFamily, requestDateTime); + qEntry = new ScorerQueueEntry(collectionId, qualitySuiteId, nodeId, formatFamily, requestDateTime); - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - ObjectOutput out = new ObjectOutputStream(bos); - out.writeObject(qEntry); - message = bos.toByteArray(); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ObjectOutput out = new ObjectOutputStream(bos); + out.writeObject(qEntry); + message = bos.toByteArray(); - this.writeInProcessChannel(message, SCORER_ROUTING_KEY); - log.info(" [x] Queued Scorer request for collectionld: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); - } else { - log.info("Skipping Scorer request for portal, collectionld: '" + collectionId - + "'" + ", quality suite " + qualitySuiteId - + "\n as DataONE bookkeeper service is disabled via metadig-engine configuration."); - } + this.writeInProcessChannel(message, SCORER_ROUTING_KEY); + log.info(" [x] Queued Scorer request for pid: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily); } /** From 097b7f477c28d15f341b752437a59c99395b4524 Mon Sep 17 00:00:00 2001 From: gothub Date: Mon, 27 Jul 2020 06:01:42 -0700 Subject: [PATCH 25/47] Associate portal series id with portal assessment graphs --- .../mdqengine/scheduler/RequestScorerJob.java | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git 
a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java index 9be3d2cc..daeaee34 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java @@ -2,7 +2,7 @@ import edu.ucsb.nceas.mdqengine.Controller; import edu.ucsb.nceas.mdqengine.MDQconfig; -import edu.ucsb.nceas.mdqengine.authentication.DataONE; +import edu.ucsb.nceas.mdqengine.DataONE; import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; import edu.ucsb.nceas.mdqengine.model.Task; import edu.ucsb.nceas.mdqengine.store.DatabaseStore; @@ -15,11 +15,11 @@ import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; -import org.dataone.client.auth.AuthTokenSession; import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; import org.dataone.client.v2.impl.MultipartMNode; +import org.dataone.service.types.v2.SystemMetadata; import org.dataone.service.types.v1.*; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; @@ -346,18 +346,18 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, try { // Even though MultipartMNode and MultipartCNode have the same parent class, their interfaces are differnt, so polymorphism // isn't happening here. + log.debug("session: " + session.getSubject().getValue()); + log.debug("startDate: " + startDate); + log.debug("endDate: " + endDate); + log.debug("formatId: " + formatId); + log.debug("Identifier: " + identifier); + log.debug("startCount: " + startCount); + log.debug("countRequested: " + countRequested); if(isCN) { log.debug("cnNode: " + cnNode); - log.debug("Listing objects for CN"); - log.debug("session: " + session.getSubject().getValue()); - log.debug("startDate: " + startDate); - log.debug("endDate: " + endDate); - log.debug("formatId: " + formatId); - log.debug("Identifier: " + identifier); - log.debug("startCount: " + startCount); - log.debug("countRequested: " + countRequested); objList = cnNode.listObjects(session, startDate, endDate, formatId, nodeRef, identifier, startCount, countRequested); } else { + log.debug("mnNode: " + mnNode); objList = mnNode.listObjects(session, startDate, endDate, formatId, identifier, replicaStatus, startCount, countRequested); } log.debug("Retrieved " + objList.getCount() + " pids"); @@ -391,10 +391,27 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, // been updated (i.e. obsoletedBy, access) and the quality report and index contain // sysmeta fields. if(found) { + // The DataONE listObjects service retuns the pid for each object, but does not return the seriesId, + // so this has to be retrieved now, as Bookkeeper service and MetacatUI (when the graph is requested for + // this portal) uses the sid, not the pid, so create and store the graph based on the sid. 
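The comment above is the crux of this commit: listObjects only hands back pids, while bookkeeper and MetacatUI key off the sid, so each pid has to be upgraded via its system metadata. Condensed into one helper (a sketch; the types and getSystemMetadata calls are those used in this hunk, with a null guard added since not every object carries a seriesId):

    // Sketch: resolve a pid to its seriesId via getSystemMetadata.
    static String resolveSeriesId(MultipartCNode cnNode, MultipartMNode mnNode,
            boolean isCN, Session session, String pid) throws Exception {
        Identifier id = new Identifier();
        id.setValue(pid);
        org.dataone.service.types.v2.SystemMetadata sysmeta =
                isCN ? cnNode.getSystemMetadata(session, id)
                     : mnNode.getSystemMetadata(session, id);
        // Added guard: fall back to the pid when no seriesId is assigned.
        return (sysmeta.getSeriesId() != null) ? sysmeta.getSeriesId().getValue() : pid;
    }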
// if (!runExists(thisPid, suiteId, store)) { + + Identifier thisId = new Identifier(); + thisId.setValue(thisPid); + + org.dataone.service.types.v2.SystemMetadata sysmeta = null; + + if(isCN) { + sysmeta = cnNode.getSystemMetadata(session, thisId); + } else { + sysmeta = mnNode.getSystemMetadata(session, thisId); + } + + String thisSeriesId = sysmeta.getSeriesId().getValue(); + pidCount = pidCount++; - pids.add(thisPid); - log.info("adding pid to process: " + thisPid + ", formatId: " + thisFormatId); + pids.add(thisSeriesId); + log.info("adding seriesId to process: " + thisSeriesId + ", formatId: " + thisFormatId); // } } } From 6a656950fcd69601d37289d61abb664b9470b9ed Mon Sep 17 00:00:00 2001 From: gothub Date: Mon, 27 Jul 2020 06:03:09 -0700 Subject: [PATCH 26/47] Refactor DataONE related methods into a new package --- .../edu/ucsb/nceas/mdqengine/DataONE.java | 234 +++++++++++++++++ .../mdqengine/authentication/DataONE.java | 43 ---- .../authorization/BookkeeperClient.java | 3 +- .../mdqengine/filestore/FilestoreDB.java | 1 - .../mdqengine/scheduler/RequestReportJob.java | 3 +- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 236 +----------------- 6 files changed, 243 insertions(+), 277 deletions(-) create mode 100644 src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java delete mode 100644 src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java new file mode 100644 index 00000000..7a2781a1 --- /dev/null +++ b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java @@ -0,0 +1,234 @@ +package edu.ucsb.nceas.mdqengine; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import edu.ucsb.nceas.mdqengine.exception.MetadigProcessException; +import org.dataone.client.auth.AuthTokenSession; +import org.dataone.client.rest.MultipartRestClient; +import org.dataone.client.v2.impl.MultipartD1Node; +import org.dataone.service.types.v1.Identifier; +import org.dataone.service.types.v1.Session; +import org.dataone.service.types.v1.SystemMetadata; +import edu.ucsb.nceas.mdqengine.exception.MetadigException; +import org.dataone.client.rest.DefaultHttpMultipartRestClient; +import org.dataone.client.v2.impl.MultipartCNode; +import org.dataone.client.v2.impl.MultipartMNode; +import org.dataone.service.types.v1.Subject; +import org.dataone.service.types.v1.SubjectInfo; +import org.w3c.dom.Document; +import org.xml.sax.InputSource; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DataONE { + + + private static Log log = LogFactory.getLog(DataONE.class); + + /** + * Get a DataONE subject information object + * @param serviceUrl the service URL of the DataONE node to request the subject info from + * @param authToken the authorization token to use for the request + * @return a DataONE subject information object + * @throws MetadigProcessException + */ + public static SubjectInfo getSubjectInfo(Subject rightsHolder, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { + + log.debug("Getting subject info for: " + rightsHolder.getValue()); + MultipartCNode cnNode = null; + MetadigProcessException metadigException = null; + + SubjectInfo subjectInfo = null; + Session session = DataONE.getSession(subjectId, authToken); + + // Identity node as either a CN or MN based on the 
serviceUrl + String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org"; + Pattern r = Pattern.compile(pattern); + Matcher m = r.matcher(serviceUrl); + if (!m.find()) { + log.error("Must call a CN to get subject information"); + metadigException = new MetadigProcessException("Must call a CN to get subject information."); + throw metadigException; + } + + // Only CNs can call the 'subjectInfo' service (aka accounts), so we have to use + // a MultipartCNode instance here. + try { + cnNode = (MultipartCNode) getMultipartD1Node(session, serviceUrl); + } catch (Exception ex) { + metadigException = new MetadigProcessException("Unable to create multipart D1 node: " + subjectId + ": " + ex.getMessage()); + metadigException.initCause(ex); + throw metadigException; + } + + try { + subjectInfo = cnNode.getSubjectInfo(session, rightsHolder); + } catch (Exception ex) { + metadigException = new MetadigProcessException("Unable to get subject information." + ex.getMessage()); + metadigException.initCause(ex); + throw metadigException; + } + + return subjectInfo; + } + + /** + * Get a DataONE MultipartCNode object, which will be used to communication with a CN + * + * @param session a DataONE authentication session + * @param serviceUrl the service URL for the node we are connecting to + * @return a DataONE MultipartCNode object + * @throws MetadigException + */ + public static MultipartD1Node getMultipartD1Node(Session session, String serviceUrl) throws MetadigException { + + MultipartRestClient mrc = null; + MultipartD1Node d1Node = null; + MetadigProcessException metadigException = null; + + // First create an HTTP client + try { + mrc = new DefaultHttpMultipartRestClient(); + } catch (Exception ex) { + log.error("Error creating rest client: " + ex.getMessage()); + metadigException = new MetadigProcessException("Unable to get collection pids"); + metadigException.initCause(ex); + throw metadigException; + } + + Boolean isCN = isCN(serviceUrl); + + // Now create a DataONE object that uses the rest client + if (isCN) { + log.debug("creating cn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); + d1Node = new MultipartCNode(mrc, serviceUrl, session); + } else { + log.debug("creating mn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); + d1Node = new MultipartMNode(mrc, serviceUrl, session); + } + return d1Node; + } + + /** + * Send a query to the DataONE Query Service , using the DataONE CN or MN API + * + * @param queryStr the query string to pass to the Solr server + * @param serviceUrl the service URL for the DataONE CN or MN + * @param startPos the start of the query result to return, if query pagination is being used + * @param countRequested the number of results to return + * @return an XML document containing the query result + * @throws Exception + */ + public static Document querySolr(String queryStr, String serviceUrl, int startPos, int countRequested, String subjectId, String authToken) throws MetadigProcessException { + + MultipartRestClient mrc = null; + // Polymorphism doesn't work with D1 node classes, so have to use the derived classes + MultipartD1Node d1Node = null; + Session session = DataONE.getSession(subjectId, authToken); + + // Add the start and count, if pagination is being used + queryStr = queryStr + "&start=" + startPos + "&rows=" + countRequested; + // Query the MN or CN Solr engine to get the query associated with this project that will return all project related pids. 
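(Once this helper is in place, a minimal hypothetical call would look like the following; the query string, service URL, and public session via null credentials are all illustrative, Document is org.w3c.dom.Document, and the method appends the &start= and &rows= paging parameters itself.)

    Document page = DataONE.querySolr(
            "q=formatType:METADATA&fl=id,seriesId",  // Solr query to evaluate
            "https://cn.dataone.org/cn/v2",          // CN (or MN) query endpoint
            0, 1000,                                 // startPos, countRequested
            null, null);                             // subjectId, authToken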
+ InputStream qis = null; + MetadigProcessException metadigException = null; + + try { + d1Node = getMultipartD1Node(session, serviceUrl); + log.debug("Created MultipartD1Node: " + d1Node.toString()); + } catch (Exception ex) { + log.error("Unable to create MultipartD1Node for Solr query"); + metadigException = new MetadigProcessException("Unable to create multipart node client to query DataONE solr: " + ex.getMessage()); + metadigException.initCause(ex); + throw metadigException; + } + + // Send a query to a CN or MN + try { + qis = d1Node.query(session, "solr", queryStr); + } catch (Exception e) { + log.error("Error retrieving pids: " + e.getMessage()); + metadigException = new MetadigProcessException("Unable to query dataone node: " + e.getMessage()); + metadigException.initCause(e); + throw metadigException; + } + + Document xmldoc = null; + DocumentBuilder builder = null; + + try { + // If results were returned, create an XML document from them + if (qis.available() == 1) { + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + builder = factory.newDocumentBuilder(); + xmldoc = builder.parse(new InputSource(qis)); + } catch (Exception e) { + log.error("Unable to create w3c Document from input stream", e); + e.printStackTrace(); + } finally { + qis.close(); + } + } else { + log.info("No results returned from D1 Solr query"); + qis.close(); + } + } catch (IOException ioe) { + metadigException = new MetadigProcessException("Unable prepare query result xml document: " + ioe.getMessage()); + metadigException.initCause(ioe); + throw metadigException; + } + + return xmldoc; + } + /** + * Get a DataONE authenticated session + *

+ * If no subject or authentication token are provided, a public session is returned + *

+     * @param subjectId the DataONE subject to associate with the session
+     * @param authToken the authentication token
+     * @return the DataONE session
+     */
+    public static Session getSession(String subjectId, String authToken) {
+
+        Session session;
+
+        // If no authentication token was provided, create a public (unauthenticated) session
+        if (authToken == null || authToken.isEmpty()) {
+            log.debug("Creating public session");
+            session = new Session();
+        } else {
+            log.debug("Creating authentication session for subjectId: " + subjectId + ", token: " + authToken.substring(0, 5) + "...");
+            session = new AuthTokenSession(authToken);
+        }
+
+        if (subjectId != null && !subjectId.isEmpty()) {
+            Subject subject = new Subject();
+            subject.setValue(subjectId);
+            session.setSubject(subject);
+            log.debug("Set session subjectId to: " + session.getSubject().getValue());
+        }
+
+        return session;
+    }
+
+    protected static Boolean isCN(String serviceUrl) {
+
+        Boolean isCN = false;
+        // Identify the node as either a CN or MN based on the serviceUrl
+        String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org";
+        Pattern r = Pattern.compile(pattern);
+        Matcher m = r.matcher(serviceUrl);
+        if (m.find()) {
+            isCN = true;
+            log.debug("service URL is for a CN: " + serviceUrl);
+        } else {
+            log.debug("service URL is not for a CN: " + serviceUrl);
+            isCN = false;
+        }
+        return isCN;
+    }
+}
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java b/src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java
deleted file mode 100644
index d5e8b73a..00000000
--- a/src/main/java/edu/ucsb/nceas/mdqengine/authentication/DataONE.java
+++ /dev/null
@@ -1,43 +0,0 @@
-package edu.ucsb.nceas.mdqengine.authentication;
-
-import org.dataone.client.auth.AuthTokenSession;
-import org.dataone.service.types.v1.Session;
-import org.dataone.service.types.v1.Subject;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-public class DataONE {
-
-public static Log log = LogFactory.getLog(DataONE.class);
-
-    /**
-     * Get a DataONE authenticated session
-     *

- * If no subject or authentication token are provided, a public session is returned - *

- * @param authToken the authentication token - * @return the DataONE session - */ - public static Session getSession(String subjectId, String authToken) { - - Session session; - - // query Solr - either the member node or cn, for the project 'solrquery' field - if (authToken == null || authToken.isEmpty()) { - log.debug("Creating public sessioni"); - session = new Session(); - } else { - log.debug("Creating authentication session for subjectId: " + subjectId + ", token: " + authToken.substring(0, 5) + "..."); - session = new AuthTokenSession(authToken); - } - - if (subjectId != null && !subjectId.isEmpty()) { - Subject subject = new Subject(); - subject.setValue(subjectId); - session.setSubject(subject); - log.debug("Set session subjectId to: " + session.getSubject().getValue()); - } - - return session; - } -} diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java b/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java index 9dd246a8..f9eac335 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/authorization/BookkeeperClient.java @@ -1,7 +1,6 @@ package edu.ucsb.nceas.mdqengine.authorization; import edu.ucsb.nceas.mdqengine.MDQconfig; -import edu.ucsb.nceas.mdqengine.authentication.DataONE; import edu.ucsb.nceas.mdqengine.exception.MetadigException; import org.apache.commons.configuration2.ex.ConfigurationException; import org.apache.commons.logging.Log; @@ -24,7 +23,7 @@ public class BookkeeperClient { private static BookkeeperClient instance; - public static Log log = LogFactory.getLog(DataONE.class); + public static Log log = LogFactory.getLog(BookkeeperClient.class); private String bookkeeperURL = null; private String bookkeeperAuthToken = null; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java index 1ea8d7ed..ff67bac6 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java @@ -9,7 +9,6 @@ import edu.ucsb.nceas.mdqengine.model.*; import org.joda.time.DateTime; -import sun.tools.tree.NewArrayExpression; import java.io.IOException; import java.sql.*; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index 9f5d8a6d..b59b0224 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -1,7 +1,7 @@ package edu.ucsb.nceas.mdqengine.scheduler; import edu.ucsb.nceas.mdqengine.MDQconfig; -import edu.ucsb.nceas.mdqengine.authentication.DataONE; +import edu.ucsb.nceas.mdqengine.DataONE; import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; import edu.ucsb.nceas.mdqengine.model.Run; import edu.ucsb.nceas.mdqengine.model.Task; @@ -400,7 +400,6 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, // Set the count for the number of desired pids filtered from the total result set result.setFilteredResultCount(pidCount); // Set the count for the total number of pids returned from DataONE (all formatIds) for this query - // Set the count for the total number of pids returned from DataONE (all formatIds) for this query result.setTotalResultCount(objList.getCount()); result.setResult(pids); diff --git 
a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index e160635f..6414f852 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -2,7 +2,7 @@ import com.rabbitmq.client.*; import edu.ucsb.nceas.mdqengine.MDQconfig; -import edu.ucsb.nceas.mdqengine.authentication.DataONE; +import edu.ucsb.nceas.mdqengine.DataONE; import edu.ucsb.nceas.mdqengine.exception.MetadigException; import edu.ucsb.nceas.mdqengine.exception.MetadigProcessException; import edu.ucsb.nceas.mdqengine.filestore.MetadigFile; @@ -20,17 +20,12 @@ import org.apache.solr.client.solrj.beans.BindingException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; -import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; -import org.dataone.client.v2.impl.MultipartCNode; -import org.dataone.client.v2.impl.MultipartMNode; import org.dataone.client.v2.impl.MultipartD1Node; // Don't include org.dataone.client.rest.MultipartD1Node (this is what IDEA selects) import org.dataone.service.types.v1.Session; import org.dataone.service.types.v1.Subject; import org.dataone.service.types.v1.Group; -import org.dataone.service.types.v1.Identifier; import org.dataone.service.types.v1.SubjectInfo; -import org.dataone.service.types.v1.SystemMetadata; import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.ISODateTimeFormat; @@ -369,7 +364,7 @@ private ScorerResult getCollectionPids(String collectionId, String serviceUrl, S which will be used to query DataONE Solr for all the pids associated with that project (that's 2 queries!) 
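       Schematically (the identifier is hypothetical):
         query 1: ?q=seriesId:urn:uuid:1234&fl=collectionQuery,label,rightsHolder  -> returns the portal's stored collectionQuery
         query 2: the returned collectionQuery itself                              -> returns the pids that belong to the portal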
*/ ArrayList pids = new ArrayList<>(); - queryStr = "?q=id:" + escapeSpecialChars(collectionId) + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; + queryStr = "?q=seriesId:" + escapeSpecialChars(collectionId) + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; startPos = 0; countRequested = 10000; @@ -377,7 +372,7 @@ which will be used to query DataONE Solr for all the pids associated with that p // Get the collectionQuery from Solr try { log.debug("Getting collectionQuery with query: " + queryStr); - xmldoc = queryD1Solr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken); + xmldoc = DataONE.querySolr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken); } catch (MetadigProcessException mpe) { log.error("Unable to query Solr for collectionQuery field for collection id: " + collectionId); throw new MetadigProcessException("Unable to query Solr for collectionQuery field for collection id: " + collectionId); @@ -477,7 +472,7 @@ which will be used to query DataONE Solr for all the pids associated with that p subject.setValue(rightsHolder); // The subject info can only be obtained from a CN, so use the CN auth info for the current DataONE environment, // which should be configured in the metadig.properties file - SubjectInfo subjectInfo = getSubjectInfo(subject, CNserviceUrl, CNsubjectId, CNauthToken); + SubjectInfo subjectInfo = DataONE.getSubjectInfo(subject, CNserviceUrl, CNsubjectId, CNauthToken); String groupStr = null; groupStr = "(readPermission:" + "\"" + rightsHolder @@ -541,7 +536,7 @@ which will be used to query DataONE Solr for all the pids associated with that p do { //TODO: check that a result was returned // Note: the collectionQuery is always evaluated on the CN, so that the entire DataONE network is queried. - xmldoc = queryD1Solr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken); + xmldoc = DataONE.querySolr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken); if(xmldoc == null) { log.info("no values returned from query"); break; @@ -621,6 +616,7 @@ private List getQualityScores(String collectionId, String suiteId, int startPosInResult = 0; int startPosInQuery = 0; // this will always be zero - we are listing the pids to retrieve, so will always want to start at the first result + log.trace("Getting scores from Solr for " + collectionPids.size() + " pids."); // Now accumulate the Quality Solr document results for the list of pids for the project. if (collectionId != null && ! 
collectionId.isEmpty()) { log.info("Getting quality scores for collection: " + collectionId); @@ -652,7 +648,7 @@ private List getQualityScores(String collectionId, String suiteId, if (suiteId != null) { queryStr += " AND suiteId:" + suiteId; } - log.trace("query to quality Solr server: " + queryStr); + log.debug("query to quality Solr server: " + queryStr); // Send query to Quality Solr Server // Get all the pids in this pid string resultList = queryQualitySolr(queryStr, startPosInQuery, pidCntToRequest); @@ -782,78 +778,6 @@ private void returnGraphStatus(String metadataPid, String suiteId, ScorerQueueEn } } - /** - * Send a query to the DataONE Query Service , using the DataONE CN or MN API - * - * @param queryStr the query string to pass to the Solr server - * @param serviceUrl the service URL for the DataONE CN or MN - * @param startPos the start of the query result to return, if query pagination is being used - * @param countRequested the number of results to return - * @return an XML document containing the query result - * @throws Exception - */ - private Document queryD1Solr(String queryStr, String serviceUrl, int startPos, int countRequested, String subjectId, String authToken) throws MetadigProcessException { - - MultipartRestClient mrc = null; - // Polymorphism doesn't work with D1 node classes, so have to use the derived classes - MultipartD1Node d1Node = null; - Session session = DataONE.getSession(subjectId, authToken); - - // Add the start and count, if pagination is being used - queryStr = queryStr + "&start=" + startPos + "&rows=" + countRequested; - // Query the MN or CN Solr engine to get the query associated with this project that will return all project related pids. - InputStream qis = null; - MetadigProcessException metadigException = null; - - try { - d1Node = getMultipartD1Node(session, serviceUrl); - log.debug("Created MultipartD1Node: " + d1Node.toString()); - } catch (Exception ex) { - log.error("Unable to create MultipartD1Node for Solr query"); - metadigException = new MetadigProcessException("Unable to create multipart node client to query DataONE solr: " + ex.getMessage()); - metadigException.initCause(ex); - throw metadigException; - } - - // Send a query to a CN or MN - try { - qis = d1Node.query(session, "solr", queryStr); - } catch (Exception e) { - log.error("Error retrieving pids: " + e.getMessage()); - metadigException = new MetadigProcessException("Unable to query dataone node: " + e.getMessage()); - metadigException.initCause(e); - throw metadigException; - } - - Document xmldoc = null; - DocumentBuilder builder = null; - - try { - // If results were returned, create an XML document from them - if (qis.available() == 1) { - try { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - builder = factory.newDocumentBuilder(); - xmldoc = builder.parse(new InputSource(qis)); - } catch (Exception e) { - log.error("Unable to create w3c Document from input stream", e); - e.printStackTrace(); - } finally { - qis.close(); - } - } else { - log.info("No results returned from D1 Solr query"); - qis.close(); - } - } catch (IOException ioe) { - metadigException = new MetadigProcessException("Unable prepare query result xml document: " + ioe.getMessage()); - metadigException.initCause(ioe); - throw metadigException; - } - - return xmldoc; - } - /** * Send a query to the Quality Solr Server. 
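     * For example (pids and field names are illustrative), a query of the form
     *   metadataId:("pid1" OR "pid2") AND suiteId:FAIR.suite.1
     * returns one quality score document per pid for the requested suite.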
* @param queryStr the query to send to Solr @@ -959,137 +883,6 @@ public void writeCompletedQueue (byte[] message) throws IOException { completedChannel.basicPublish(EXCHANGE_NAME, COMPLETED_ROUTING_KEY, basicProperties, message); } - /** - * Get a DataONE system metadata object - * @param pid the pid to get the system metadata for - * @param serviceUrl the service URL of the DataONE node to request the sysmeta - * @param authToken the authorization token to use for the request - * @return a DataONE system metadata object - * @throws MetadigProcessException - */ - protected SystemMetadata getSystemMetadata(String pid, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { - - SystemMetadata sysmeta = null; - MultipartRestClient mrc = null; - MultipartD1Node d1Node = null; - MetadigProcessException metadigException = null; - - log.debug("serviceUrl: " + serviceUrl); - log.debug("subjectId: " + subjectId); - -// Subject subject = new Subject(); -// if(subjectId != null && ! subjectId.isEmpty()) { -// subject.setValue(subjectId); -// } - - Session session = DataONE.getSession(subjectId, authToken); - Identifier identifier = new Identifier(); - identifier.setValue(pid); - - try { - d1Node = getMultipartD1Node(session, serviceUrl); - } catch (Exception ex) { - metadigException = new MetadigProcessException("Unable to get multipartD1Node for serviceUrl: " + serviceUrl); - metadigException.initCause(ex); - throw metadigException; - } - - try { - sysmeta = d1Node.getSystemMetadata(session, identifier); - log.debug("retrieved sysmeta for pid: " + sysmeta.getIdentifier().getValue()); - } catch (Exception ex) { - log.error("Unable to retrieve sysmeta for pid: " + pid); - metadigException = new MetadigProcessException("Unable to get sysmeta for pid: " + pid); - metadigException.initCause(ex); - throw metadigException; - } - - return sysmeta; - } - - /** - * Get a DataONE subject information object - * @param serviceUrl the service URL of the DataONE node to request the subject info from - * @param authToken the authorization token to use for the request - * @return a DataONE subject information object - * @throws MetadigProcessException - */ - private SubjectInfo getSubjectInfo(Subject rightsHolder, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { - - log.debug("Getting subject info for: " + rightsHolder.getValue()); - MultipartCNode cnNode = null; - MetadigProcessException metadigException = null; - - SubjectInfo subjectInfo = null; - Session session = DataONE.getSession(subjectId, authToken); - - // Identity node as either a CN or MN based on the serviceUrl - String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org"; - Pattern r = Pattern.compile(pattern); - Matcher m = r.matcher(serviceUrl); - if (!m.find()) { - log.error("Must call a CN to get subject information"); - metadigException = new MetadigProcessException("Must call a CN to get subject information."); - throw metadigException; - } - - // Only CNs can call the 'subjectInfo' service (aka accounts), so we have to use - // a MultipartCNode instance here. 
- try { - cnNode = (MultipartCNode) getMultipartD1Node(session, serviceUrl); - } catch (Exception ex) { - metadigException = new MetadigProcessException("Unable to create multipart D1 node: " + subjectId + ": " + ex.getMessage()); - metadigException.initCause(ex); - throw metadigException; - } - - try { - subjectInfo = cnNode.getSubjectInfo(session, rightsHolder); - } catch (Exception ex) { - metadigException = new MetadigProcessException("Unable to get subject information." + ex.getMessage()); - metadigException.initCause(ex); - throw metadigException; - } - - return subjectInfo; - } - - /** - * Get a DataONE MultipartCNode object, which will be used to communication with a CN - * - * @param session a DataONE authentication session - * @param serviceUrl the service URL for the node we are connecting to - * @return a DataONE MultipartCNode object - * @throws MetadigException - */ - MultipartD1Node getMultipartD1Node(Session session, String serviceUrl) throws MetadigException { - - MultipartRestClient mrc = null; - MultipartD1Node d1Node = null; - MetadigProcessException metadigException = null; - - // First create an HTTP client - try { - mrc = new DefaultHttpMultipartRestClient(); - } catch (Exception ex) { - log.error("Error creating rest client: " + ex.getMessage()); - metadigException = new MetadigProcessException("Unable to get collection pids"); - metadigException.initCause(ex); - throw metadigException; - } - - Boolean isCN = isCN(serviceUrl); - - // Now create a DataONE object that uses the rest client - if (isCN) { - log.debug("creating cn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); - d1Node = new MultipartCNode(mrc, serviceUrl, session); - } else { - log.debug("creating mn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); - d1Node = new MultipartMNode(mrc, serviceUrl, session); - } - return d1Node; - } /** * Read a file from a Java resources folder. 
* @@ -1163,21 +956,6 @@ private static String encodeValue(String value) { } } - private Boolean isCN(String serviceUrl) { - Boolean isCN = false; - // Identity node as either a CN or MN based on the serviceUrl - String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org"; - Pattern r = Pattern.compile(pattern); - Matcher m = r.matcher(serviceUrl); - if (m.find()) { - isCN = true; - log.debug("service URL is for a CN: " + serviceUrl); - } else { - log.debug("service URL is not for a CN: " + serviceUrl); - isCN = false; - } - return isCN; - } } From 581ad284e6fc272d98bd31bad08cc72fabd0cad9 Mon Sep 17 00:00:00 2001 From: gothub Date: Sun, 2 Aug 2020 12:49:47 -0700 Subject: [PATCH 27/47] Get list of new portal ids from Solr (#110) --- .../edu/ucsb/nceas/mdqengine/DataONE.java | 61 +++-- .../mdqengine/scheduler/RequestScorerJob.java | 258 ++++++++---------- 2 files changed, 150 insertions(+), 169 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java index 7a2781a1..82e4552d 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java @@ -6,9 +6,7 @@ import org.dataone.client.auth.AuthTokenSession; import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartD1Node; -import org.dataone.service.types.v1.Identifier; import org.dataone.service.types.v1.Session; -import org.dataone.service.types.v1.SystemMetadata; import edu.ucsb.nceas.mdqengine.exception.MetadigException; import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; @@ -117,18 +115,20 @@ public static MultipartD1Node getMultipartD1Node(Session session, String service * Send a query to the DataONE Query Service , using the DataONE CN or MN API * * @param queryStr the query string to pass to the Solr server - * @param serviceUrl the service URL for the DataONE CN or MN * @param startPos the start of the query result to return, if query pagination is being used * @param countRequested the number of results to return * @return an XML document containing the query result * @throws Exception */ - public static Document querySolr(String queryStr, String serviceUrl, int startPos, int countRequested, String subjectId, String authToken) throws MetadigProcessException { + //public static Document querySolr(String queryStr, int startPos, int countRequested, MultipartCNode cnNode, + // MultipartMNode mnNode, Boolean isCN, + // Session session) throws MetadigProcessException { + public static Document querySolr(String queryStr, int startPos, int countRequested, MultipartD1Node d1Node, + Session session) throws MetadigProcessException { - MultipartRestClient mrc = null; - // Polymorphism doesn't work with D1 node classes, so have to use the derived classes - MultipartD1Node d1Node = null; - Session session = DataONE.getSession(subjectId, authToken); +// // Polymorphism doesn't work with D1 node classes, so have to use the derived classes +// MultipartD1Node d1Node = null; +// Session session = DataONE.getSession(subjectId, authToken); // Add the start and count, if pagination is being used queryStr = queryStr + "&start=" + startPos + "&rows=" + countRequested; @@ -136,19 +136,34 @@ public static Document querySolr(String queryStr, String serviceUrl, int startPo InputStream qis = null; MetadigProcessException metadigException = null; - try { - d1Node = getMultipartD1Node(session, serviceUrl); - 
log.debug("Created MultipartD1Node: " + d1Node.toString()); - } catch (Exception ex) { - log.error("Unable to create MultipartD1Node for Solr query"); - metadigException = new MetadigProcessException("Unable to create multipart node client to query DataONE solr: " + ex.getMessage()); - metadigException.initCause(ex); - throw metadigException; - } - +// try { +// d1Node = getMultipartD1Node(session, serviceUrl); +// log.debug("Created MultipartD1Node, nodeId: " + d1Node.getNodeId().getValue()); +// } catch (Exception ex) { +// log.error("Unable to create MultipartD1Node for Solr query"); +// metadigException = new MetadigProcessException("Unable to create multipart node client to query DataONE solr: " + ex.getMessage()); +// metadigException.initCause(ex); +// throw metadigException; +// } + + log.debug("Sending query: " + queryStr); // Send a query to a CN or MN +// try { +// if(isCN) { +// qis = cnNode.query(session, "solr", queryStr); +// } else { +// qis = mnNode.query(session, "solr", queryStr); +// } +// log.debug("Sent query"); +// } catch (Exception e) { +// log.error("Error retrieving pids: " + e.getMessage()); +// metadigException = new MetadigProcessException("Unable to query dataone node: " + e.getMessage()); +// metadigException.initCause(e); +// throw metadigException; +// } try { qis = d1Node.query(session, "solr", queryStr); + log.debug("Sent query"); } catch (Exception e) { log.error("Error retrieving pids: " + e.getMessage()); metadigException = new MetadigProcessException("Unable to query dataone node: " + e.getMessage()); @@ -156,16 +171,19 @@ public static Document querySolr(String queryStr, String serviceUrl, int startPo throw metadigException; } + log.debug("Creating xml doc with results"); Document xmldoc = null; DocumentBuilder builder = null; try { // If results were returned, create an XML document from them - if (qis.available() == 1) { + log.debug("qis available: " + qis.available()); + if (qis.available() > 0) { try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); builder = factory.newDocumentBuilder(); xmldoc = builder.parse(new InputSource(qis)); + log.debug("Created xml doc: " + xmldoc.toString()); } catch (Exception e) { log.error("Unable to create w3c Document from input stream", e); e.printStackTrace(); @@ -177,11 +195,14 @@ public static Document querySolr(String queryStr, String serviceUrl, int startPo qis.close(); } } catch (IOException ioe) { + log.debug("IO exception: " + ioe.getMessage()); metadigException = new MetadigProcessException("Unable prepare query result xml document: " + ioe.getMessage()); metadigException.initCause(ioe); throw metadigException; } + log.debug("Created results xml doc"); + return xmldoc; } /** @@ -215,7 +236,7 @@ public static Session getSession(String subjectId, String authToken) { return session; } - protected static Boolean isCN(String serviceUrl) { + public static Boolean isCN(String serviceUrl) { Boolean isCN = false; // Identity node as either a CN or MN based on the serviceUrl diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java index daeaee34..29352235 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java @@ -3,6 +3,7 @@ import edu.ucsb.nceas.mdqengine.Controller; import edu.ucsb.nceas.mdqengine.MDQconfig; import edu.ucsb.nceas.mdqengine.DataONE; +import 
edu.ucsb.nceas.mdqengine.exception.MetadigProcessException; import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; import edu.ucsb.nceas.mdqengine.model.Task; import edu.ucsb.nceas.mdqengine.store.DatabaseStore; @@ -18,20 +19,20 @@ import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; +import org.dataone.client.v2.impl.MultipartD1Node; import org.dataone.client.v2.impl.MultipartMNode; -import org.dataone.service.types.v2.SystemMetadata; import org.dataone.service.types.v1.*; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.quartz.*; +import org.w3c.dom.Document; +import javax.xml.xpath.*; import java.io.IOException; import java.io.InputStream; -import java.time.ZonedDateTime; import java.util.ArrayList; -import java.util.Date; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -124,6 +125,7 @@ public void execute(JobExecutionContext context) String nodeId = dataMap.getString("nodeId"); String startHarvestDatetimeStr = dataMap.getString("startHarvestDatetime"); int harvestDatetimeInc = dataMap.getInt("harvestDatetimeInc"); + // Number of pids to get each query (this number of pids will be fetched each query until all pids are obtained) int countRequested = dataMap.getInt("countRequested"); // TODO: add formatFamily to scheduler request String formatFamily = null; @@ -135,6 +137,8 @@ public void execute(JobExecutionContext context) String subjectId = null; String nodeServiceUrl = null; + log.info("Executing task: " + taskName + ", taskType: " + taskType); + try { cfg = new MDQconfig(); qualityServiceUrl = cfg.getString("quality.serviceUrl"); @@ -151,8 +155,6 @@ public void execute(JobExecutionContext context) throw jee; } - log.info("Executing task: " + taskName + ", taskType: " + taskType); - try { mrc = new DefaultHttpMultipartRestClient(); } catch (Exception e) { @@ -165,12 +167,16 @@ public void execute(JobExecutionContext context) Session session = DataONE.getSession(subjectId, authToken); // Don't know node type yet from the id, so have to manually check if it's a CN - Boolean isCN = isCN(nodeServiceUrl); + Boolean isCN = DataONE.isCN(nodeServiceUrl); + + MultipartD1Node d1Node = null; if(isCN) { - cnNode = new MultipartCNode(mrc, nodeServiceUrl, session); + //cnNode = new MultipartCNode(mrc, nodeServiceUrl, session); + d1Node = new MultipartCNode(mrc, nodeServiceUrl, session); log.debug("Created cnNode for serviceUrl: " + nodeServiceUrl); } else { - mnNode = new MultipartMNode(mrc, nodeServiceUrl, session); + //mnNode = new MultipartMNode(mrc, nodeServiceUrl, session); + d1Node = new MultipartMNode(mrc, nodeServiceUrl, session); log.debug("Created mnNode for serviceUrl: " + nodeServiceUrl); } @@ -203,7 +209,7 @@ public void execute(JobExecutionContext context) String lastHarvestDateStr = null; Task task; - task = store.getTask(taskName); + task = store.getTask(taskName, taskType); // If a 'task' entry has not been saved for this task name yet, then a 'lastHarvested' // DataTime will not be available, in which case the 'startHarvestDataTime' from the @@ -245,16 +251,20 @@ public void execute(JobExecutionContext context) String startDTRstr = dtfOut.print(startDTR); String endDTRstr = dtfOut.print(endDTR); - Integer startCount = new Integer(0); + int startCount = 0; RequestScorerJob.ListResult result = null; Integer resultCount = null; 
+ log.debug("Getting portal pids to process..."); boolean morePids = true; while(morePids) { ArrayList pidsToProcess = null; + log.debug("startCount: " + startCount); + log.debug("countRequested:" + countRequested); try { - result = getPidsToProcess(cnNode, mnNode, isCN, session, nodeId, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); + //result = getPidsToProcess(cnNode, mnNode, isCN, session, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); + result = getPidsToProcess(d1Node, session, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); pidsToProcess = result.getResult(); resultCount = result.getResultCount(); } catch (Exception e) { @@ -263,32 +273,17 @@ public void execute(JobExecutionContext context) throw jee; } - log.info("Found " + resultCount + " pids" + " for servierUrl: " + nodeServiceUrl); + log.info("Found " + resultCount + " seriesIds" + " for date: " + startDTRstr + " at servierUrl: " + nodeServiceUrl); for (String pidStr : pidsToProcess) { try { - log.info("submitting pid: " + pidStr); submitScorerRequest(qualityServiceUrl, pidStr, suiteId, nodeId, formatFamily); - } catch (Exception e) { - JobExecutionException jee = new JobExecutionException("Unable to submit request to create new quality reports", e); + JobExecutionException jee = new JobExecutionException("Unable to submit request to create new score graph/data file", e); jee.setRefireImmediately(false); throw jee; } } - task.setLastHarvestDatetime(endDTRstr); - log.debug("taskName: " + task.getTaskName()); - log.debug("taskType: " + task.getTaskType()); - log.debug("lastharvestdate: " + task.getLastHarvestDatetime()); - - try { - store.saveTask(task); - } catch (MetadigStoreException mse) { - log.error("Error saving task: " + task.getTaskName()); - JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); - jee.setRefireImmediately(false); - throw jee; - } // Check if DataONE returned the max number of results. If so, we have to request more by paging through // the results. if(resultCount >= countRequested) { @@ -297,6 +292,21 @@ public void execute(JobExecutionContext context) log.info("Paging through more results, current start is " + startCount); } else { morePids = false; + + // Record the new "last harvested" date + task.setLastHarvestDatetime(endDTRstr); + log.debug("taskName: " + task.getTaskName()); + log.debug("taskType: " + task.getTaskType()); + log.debug("lastharvestdate: " + task.getLastHarvestDatetime()); + + try { + store.saveTask(task); + } catch (MetadigStoreException mse) { + log.error("Error saving task: " + task.getTaskName()); + JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); + jee.setRefireImmediately(false); + throw jee; + } } } store.shutdown(); @@ -305,11 +315,10 @@ public void execute(JobExecutionContext context) /** * Query a DataONE CN or MN object store for a list of object that match the time range and formatId filters provided. 
* - * @param cnNode the CN to query - * @param mnNode the MN to query - * @param isCN was a CN or MN specified - * @param session the authentication session to use - * @param nodeId the DataONE nodeId of the node to query + * //@param cnNode + * //@param mnNode + * //@param isCN + * @param session * @param pidFilter * @param startHarvestDatetimeStr * @param endHarvestDatetimeStr @@ -318,113 +327,85 @@ public void execute(JobExecutionContext context) * @return a ListResult object containing the matching pids * @throws Exception */ - public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, Boolean isCN, Session session, String nodeId, + //public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, Boolean isCN, Session session, + public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, String pidFilter, String startHarvestDatetimeStr, String endHarvestDatetimeStr, int startCount, int countRequested) throws Exception { - ArrayList pids = new ArrayList(); - InputStream qis = null; - ObjectList objList = null; - - ObjectFormatIdentifier formatId = null; - NodeReference nodeRef = null; - //nodeRef.setValue(nodeId); - Identifier identifier = null; - Boolean replicaStatus = false; - - // Do some back-flips to convert the start and end date to the ancient Java 'Date' type that is - // used by DataONE 'listObjects()'. - ZonedDateTime zdt = ZonedDateTime.parse(startHarvestDatetimeStr); - // start date milliseconds since the epoch date "midnight, January 1, 1970 UTC" - long msSinceEpoch = zdt.toInstant().toEpochMilli(); - Date startDate = new Date(msSinceEpoch); - - zdt = ZonedDateTime.parse(endHarvestDatetimeStr); - msSinceEpoch = zdt.toInstant().toEpochMilli(); - Date endDate = new Date(msSinceEpoch); - - try { - // Even though MultipartMNode and MultipartCNode have the same parent class, their interfaces are differnt, so polymorphism - // isn't happening here. - log.debug("session: " + session.getSubject().getValue()); - log.debug("startDate: " + startDate); - log.debug("endDate: " + endDate); - log.debug("formatId: " + formatId); - log.debug("Identifier: " + identifier); - log.debug("startCount: " + startCount); - log.debug("countRequested: " + countRequested); - if(isCN) { - log.debug("cnNode: " + cnNode); - objList = cnNode.listObjects(session, startDate, endDate, formatId, nodeRef, identifier, startCount, countRequested); - } else { - log.debug("mnNode: " + mnNode); - objList = mnNode.listObjects(session, startDate, endDate, formatId, identifier, replicaStatus, startCount, countRequested); - } - log.debug("Retrieved " + objList.getCount() + " pids"); - } catch (Exception e) { - log.error("Error retrieving pids for node: " + e.getMessage()); - throw e; - } + MetadigProcessException metadigException = null; - String thisFormatId = null; - String thisPid = null; - int pidCount = 0; - - log.info("Checking retrieved pids for matches with pid filter"); - if (objList.getCount() > 0) { - for(ObjectInfo oi: objList.getObjectInfoList()) { - thisFormatId = oi.getFormatId().getValue(); - thisPid = oi.getIdentifier().getValue(); - - // Check all pid filters. There could be multiple wildcard filters, which are separated - // by ','. 
- String [] filters = pidFilter.split("\\|"); - Boolean found = false; - for(String thisFilter:filters) { - if(thisFormatId.matches(thisFilter)) { - found = true; - continue; - } - } + org.w3c.dom.NodeList xpathResult = null; + XPathExpression fieldXpath = null; + XPath xpath = null; + org.w3c.dom.Node node = null; + ArrayList pids = new ArrayList(); + Document xmldoc = null; - // Always re-create a report, even if it exists for a pid, as the sysmeta could have - // been updated (i.e. obsoletedBy, access) and the quality report and index contain - // sysmeta fields. - if(found) { - // The DataONE listObjects service retuns the pid for each object, but does not return the seriesId, - // so this has to be retrieved now, as Bookkeeper service and MetacatUI (when the graph is requested for - // this portal) uses the sid, not the pid, so create and store the graph based on the sid. - // if (!runExists(thisPid, suiteId, store)) { + String queryStr = "?q=formatId:" + pidFilter + "+-obsoletedBy:*" + "+dateUploaded:[" + startHarvestDatetimeStr + "%20TO%20" + + endHarvestDatetimeStr + "]" + + "&fl=seriesId&q.op=AND"; + log.debug("query: " + queryStr); - Identifier thisId = new Identifier(); - thisId.setValue(thisPid); + // Send the query to DataONE Solr to retrieve portal seriesIds for a given time frame - org.dataone.service.types.v2.SystemMetadata sysmeta = null; + // One query can return many documents, so use the paging mechanism to make sure we retrieve them all. + // Keep paging through query results until all pids have been fetched. The last 'page' of query + // results is indicated by the number of items returned being less than the number requested. + int thisResultLength; + // Now setup the xpath to retrieve the ids returned from the collection query. + try { + log.debug("Compiling xpath for seriesId"); + // Extract the collection query from the Solr result XML + XPathFactory xPathfactory = XPathFactory.newInstance(); + xpath = xPathfactory.newXPath(); + fieldXpath = xpath.compile("//result/doc/str[@name='seriesId']/text()"); + } catch (XPathExpressionException xpe) { + log.error("Error extracting id from solr result doc: " + xpe.getMessage()); + metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage()); + metadigException.initCause(xpe); + throw metadigException; + } - if(isCN) { - sysmeta = cnNode.getSystemMetadata(session, thisId); - } else { - sysmeta = mnNode.getSystemMetadata(session, thisId); - } + // Loop through the Solr result. As the result may be large, page through the results, accumulating + // the pids returned into a ListResult object. 
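+        // Each pass of the loop below requests up to 'countRequested' seriesIds starting at 'startPos';
+        // paging stops when a query returns no further results.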
- String thisSeriesId = sysmeta.getSeriesId().getValue(); + //log.debug("Getting portal seriesIds from Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl); + log.debug("Getting portal seriesIds from Solr " ); + int startPos = startCount; - pidCount = pidCount++; - pids.add(thisSeriesId); - log.info("adding seriesId to process: " + thisSeriesId + ", formatId: " + thisFormatId); - // } - } + do { + //xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, cnNode, mnNode, isCN, session); + xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, d1Node, session); + if(xmldoc == null) { + log.info("no values returned from query"); + break; + } + try { + log.debug("processing xpathresult..."); + xpathResult = (org.w3c.dom.NodeList) fieldXpath.evaluate(xmldoc, XPathConstants.NODESET); + log.debug("processed xpathResult"); + } catch (XPathExpressionException xpe) { + log.error("Error extracting seriesId from solr result doc: " + xpe.getMessage()); + metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage()); + metadigException.initCause(xpe); + throw metadigException; + } + String currentPid = null; + thisResultLength = xpathResult.getLength(); + log.debug("Got " + thisResultLength + " pids this query"); + if(thisResultLength == 0) break; + for (int index = 0; index < xpathResult.getLength(); index++) { + node = xpathResult.item(index); + currentPid = node.getTextContent(); + pids.add(currentPid); + log.debug("adding pid: " + currentPid); } - } - if(pids.size() == 0) { - log.info("No matching pids found"); - } else { - log.info(pids.size() + " matching pids found."); - } + startPos += thisResultLength; + } while (thisResultLength > 0); RequestScorerJob.ListResult result = new RequestScorerJob.ListResult(); - result.setResultCount(pidCount); + result.setResultCount(pids.size()); result.setResult(pids); return result; @@ -437,7 +418,7 @@ public void submitScorerRequest(String qualityServiceUrl, String collectionId, S String scorerServiceUrl = qualityServiceUrl + "/scores" + "?suite=" + suiteId; if(collectionId != null && ! collectionId.isEmpty()) { - scorerServiceUrl += "&collection=" + collectionId; + scorerServiceUrl += "&id=" + collectionId; } if(nodeId != null && ! 
nodeId.isEmpty()) {
@@ -455,7 +436,7 @@ public void submitScorerRequest(String qualityServiceUrl, String collectionId, S
            post.addHeader("Accept", "application/xml");

            // send to service
-            log.trace("submitting scores request : " + scorerServiceUrl);
+            log.debug("submitting scores request: " + scorerServiceUrl);
            //post.setEntity((HttpEntity) entity);
            CloseableHttpClient client = HttpClients.createDefault();
            CloseableHttpResponse response = client.execute(post);
@@ -469,26 +450,5 @@ public void submitScorerRequest(String qualityServiceUrl, String collectionId, S
            throw(e);
        }
    }
-
-    private Boolean isCN(String serviceUrl) {
-
-        Boolean isCN = false;
-        // Identity node as either a CN or MN based on the serviceUrl
-        String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org";
-        Pattern r = Pattern.compile(pattern);
-        Matcher m = r.matcher(serviceUrl);
-        if (m.find()) {
-            isCN = true;
-            log.debug("service URL is for a CN: " + serviceUrl);
-        } else {
-            log.debug("service URL is not for a CN: " + serviceUrl);
-            isCN = false;
-        }
-
-        return isCN;
-    }
-
-
}

From 66d0f9b195cfec991e6de51385cc3c8da8fd51fd Mon Sep 17 00:00:00 2001
From: gothub
Date: Sun, 2 Aug 2020 12:50:54 -0700
Subject: [PATCH 28/47] Evaluate portal 'collectionQuery' on CN (#110)

---
 .../edu/ucsb/nceas/mdqengine/Controller.java  |   8 +-
 .../ucsb/nceas/mdqengine/scorer/Scorer.java   | 121 ++++++++++++++----
 2 files changed, 103 insertions(+), 26 deletions(-)

diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java
index be83cd6e..f0958ff4 100644
--- a/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java
+++ b/src/main/java/edu/ucsb/nceas/mdqengine/Controller.java
@@ -416,7 +416,7 @@ public void processQualityRequest(String memberNode,
     *
     * @param collectionId the DataONE collection identifier (the portal seriesId)
     * @param nodeId the node identifier the collection resides on
-     * @param formatFamily a string representing the DataONE formats to create score for
+     * @param formatFamily a string representing the DataONE formats to create scores for ("eml", "iso"), optional
     * @param qualitySuiteId the quality suite used to create the score graph
     * @param requestDateTime the datetime that the request was made
     *
@@ -425,12 +425,12 @@ public void processQualityRequest(String memberNode,
    public void processScorerRequest(String collectionId,
                                     String nodeId,
-                                     String formatFamily,
+                                     String formatFamily, // Optional format filter, if creating a graph for a subset of metadata formats ("eml", "iso")
                                     String qualitySuiteId,
                                     DateTime requestDateTime) throws java.io.IOException {

        log.info("Processing scorer request, collection: " + collectionId + ", suite: " + qualitySuiteId
-                + "nodeId: " + nodeId + ", formatFamily: " + formatFamily);
+                + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily);

        ScorerQueueEntry qEntry = null;
        byte[] message = null;
@@ -467,7 +467,7 @@ public void processScorerRequest(String collectionId,
        message = bos.toByteArray();

        this.writeInProcessChannel(message, SCORER_ROUTING_KEY);
-        log.info(" [x] Queued Scorer request for pid: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily);
+        log.info(" [x] Queued Scorer request for id: '" + qEntry.getCollectionId() + "'" + ", quality suite " + qualitySuiteId + ", nodeId: " + nodeId + ", formatFamily: " + formatFamily);
    }

    /**
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java
b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index 6414f852..b8cfb205 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -20,8 +20,11 @@ import org.apache.solr.client.solrj.beans.BindingException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; +import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; +import org.dataone.client.v2.impl.MultipartCNode; import org.dataone.client.v2.impl.MultipartD1Node; // Don't include org.dataone.client.rest.MultipartD1Node (this is what IDEA selects) +import org.dataone.client.v2.impl.MultipartMNode; import org.dataone.service.types.v1.Session; import org.dataone.service.types.v1.Subject; import org.dataone.service.types.v1.Group; @@ -29,6 +32,7 @@ import org.joda.time.DateTime; import org.joda.time.format.DateTimeFormatter; import org.joda.time.format.ISODateTimeFormat; +import org.quartz.JobExecutionException; import org.w3c.dom.Document; import org.xml.sax.InputSource; @@ -156,6 +160,9 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp String nodeServiceUrl = null; String label = null; String title = null; + MultipartRestClient mrc = null; + MultipartMNode mnNode = null; + MultipartCNode cnNode = null; //long startTime = System.nanoTime(); startTimeProcessing = System.currentTimeMillis(); @@ -199,6 +206,7 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp } log.debug("nodeId: " + nodeId); + label: try { MDQconfig cfg = new MDQconfig(); // Pids associated with a collection, based on query results using 'collectionQuery' field in solr. @@ -224,6 +232,43 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp // If creating a graph for a collection, get the set of pids associated with the collection. // Only scores for these pids will be included in the graph. 
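+                // The client type must match the node type: a MultipartCNode for a coordinating node
+                // (e.g., hypothetically, https://cn.dataone.org/cn), a MultipartMNode for a member node.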
+ try { + mrc = new DefaultHttpMultipartRestClient(); + } catch (Exception e) { + log.error("Error creating rest client: " + e.getMessage()); + JobExecutionException jee = new JobExecutionException(e); + jee.setRefireImmediately(false); + throw jee; + } + + Session session = DataONE.getSession(subjectId, authToken); + + // Don't know node type yet from the id, so have to manually check if it's a CN + Boolean isCN = DataONE.isCN(nodeServiceUrl); + + MultipartD1Node d1Node = null; + if(isCN) { + //cnNode = new MultipartCNode(mrc, nodeServiceUrl, session); + d1Node = new MultipartCNode(mrc, nodeServiceUrl, session); + log.debug("Created cnNode for serviceUrl: " + nodeServiceUrl); + } else { + //mnNode = new MultipartMNode(mrc, nodeServiceUrl, session); + d1Node = new MultipartMNode(mrc, nodeServiceUrl, session); + log.debug("Created mnNode for serviceUrl: " + nodeServiceUrl); + } +// +// Session session = DataONE.getSession(subjectId, authToken); +// +// // Don't know node type yet from the id, so have to manually check if it's a CN +// Boolean isCN = DataONE.isCN(nodeServiceUrl); +// if(isCN) { +// cnNode = new MultipartCNode(mrc, nodeServiceUrl, session); +// log.debug("Created cnNode for serviceUrl: " + nodeServiceUrl); +// } else { +// mnNode = new MultipartMNode(mrc, nodeServiceUrl, session); +// log.debug("Created mnNode for serviceUrl: " + nodeServiceUrl); +// } + if (collectionId != null && !collectionId.isEmpty()) { // If the nodeId is specified, use if to determine the values for authTokenName and subjectIdName, // if those values are not defined @@ -235,7 +280,8 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp // Always use the CN subject id and authentication token from the configuration file, as // requests that this method uses need CN subject privs ScorerResult result = null; - result = gfr.getCollectionPids(collectionId, nodeServiceUrl, subjectId, authToken); + //result = gfr.getCollectionPids(collectionId, cnNode, mnNode, isCN, session); + result = gfr.getCollectionPids(collectionId, d1Node, session); collectionPids = result.getResult(); label = result.getLabel(); // Don't continue if no pids (and thus scores) were found for this collection @@ -346,12 +392,13 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp * which is usually an MN, but the collectionQuery is always evaluated on the CN

* * @param collectionId a DataONE project id to fetch scores for, e.g. urn:uuid:f137095e-4266-4474-aa5f-1e1fcaa5e2dc - * @param serviceUrl the DataONE service URL to obtain the collectionQuery string from - * @param subjectId the DataONE subjectId to use for the query, associated with the authentication token - * @param authToken the DataONE authentication token + * @param d1Node + * @param session * @return a List of quality scores fetched from Solr */ - private ScorerResult getCollectionPids(String collectionId, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { + //private ScorerResult getCollectionPids(String collectionId, MultipartCNode cnNode, MultipartMNode mnNode, + // Boolean isCN, Session session) throws MetadigProcessException { + private ScorerResult getCollectionPids(String collectionId, MultipartD1Node d1Node, Session session) throws MetadigProcessException { Document xmldoc = null; String queryStr = null; @@ -364,7 +411,9 @@ private ScorerResult getCollectionPids(String collectionId, String serviceUrl, S which will be used to query DataONE Solr for all the pids associated with that project (that's 2 queries!) */ ArrayList pids = new ArrayList<>(); - queryStr = "?q=seriesId:" + escapeSpecialChars(collectionId) + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; + queryStr = "?q=seriesId:" + escapeSpecialChars(collectionId) + "+-obsoletedBy:*" + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; + //queryStr = "?q=seriesId:" + encodeValue(collectionId) + "+-obsoletedBy:*" + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; + //queryStr = "?q=seriesId:" + collectionId + "+-obsoletedBy:*&fl=collectionQuery,label,rightsHolder&q.op=AND"; startPos = 0; countRequested = 10000; @@ -372,7 +421,7 @@ which will be used to query DataONE Solr for all the pids associated with that p // Get the collectionQuery from Solr try { log.debug("Getting collectionQuery with query: " + queryStr); - xmldoc = DataONE.querySolr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken); + xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, d1Node, session); } catch (MetadigProcessException mpe) { log.error("Unable to query Solr for collectionQuery field for collection id: " + collectionId); throw new MetadigProcessException("Unable to query Solr for collectionQuery field for collection id: " + collectionId); @@ -530,13 +579,32 @@ which will be used to query DataONE Solr for all the pids associated with that p * DataONE listObjects service. This node could either be an MN or CN. 
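     * Note that the collectionQuery itself is evaluated on the CN, whose Solr index covers the
     * entire DataONE federation; an MN index only contains that node's own holdings.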
     */
-        log.debug("Sending collectionQuery to Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl);
+        //log.debug("Sending collectionQuery to Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl);
+        MultipartRestClient mrc = null;
+        MultipartCNode cnNode = null;
+        log.debug("query string: " + queryStr);
+        try {
+            mrc = new DefaultHttpMultipartRestClient();
+        } catch (Exception e) {
+            log.error("Error creating rest client: " + e.getMessage());
+            JobExecutionException jee = new JobExecutionException(e);
+            jee.setRefireImmediately(false);
+            throw new MetadigProcessException("Unable to create connection to CN");
+        }
+
+        Session CNsession = DataONE.getSession(CNsubjectId, CNauthToken);
+
+        // Don't know node type yet from the id, so have to manually check if it's a CN
+        Boolean isCN = DataONE.isCN(CNserviceUrl);
+
+        cnNode = new MultipartCNode(mrc, CNserviceUrl, CNsession);
+
        do {
            //TODO: check that a result was returned
            // Note: the collectionQuery is always evaluated on the CN, so that the entire DataONE network is queried.
-            xmldoc = DataONE.querySolr(queryStr, serviceUrl, startPos, countRequested, subjectId, authToken);
+            xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, cnNode, CNsession);
            if(xmldoc == null) {
                log.info("no values returned from query");
                break;
            }
@@ -930,20 +998,31 @@ private String URLencodeChars(String value, String target) {
     * @return the escaped value
     */
    private String escapeSpecialChars(String value) {
-        // {
+
+        // These are reserved characters in Solr
+        // + - && || ! ( ) { } [ ] ^ " ~ * ? : \
        value = value.replace("%7B", "\\%7B");
-        // }
        value = value.replace("%7D", "\\%7D");
-        // :
-        //value = value.replace("%3A", "\\%3A");
        value = value.replace(":", "%5C:");
-
-        //value = value.replace("(", "\\(");
-        //value = value.replace(")", "\\)");
-        //value = value.replace("?", "\\?");
-        //value = value.replace("%3F", "\\%3F");
-        //value = value.replace("\"", "\\\"");
-        //value = value.replace("'", "\\'");
+        value = value.replace(",", "%5C,");
+        value = value.replace("+", "%5C+");
+        value = value.replace("-", "%5C-");
+        value = value.replace("&", "%5C&");
+        value = value.replace("|", "%5C|");
+        value = value.replace("!", "%5C!");
+        value = value.replace("(", "%5C(");
+        value = value.replace(")", "%5C)");
+        value = value.replace("{", "%5C{");
+        value = value.replace("}", "%5C}");
+        value = value.replace("[", "%5C[");
+        value = value.replace("]", "%5C]");
+        value = value.replace("^", "%5C^");
+        value = value.replace("\"", "%5C\"");
+        value = value.replace("~", "%5C~");
+        value = value.replace("*", "%5C*");
+        value = value.replace("?", "%5C?");
+        value = value.replace("\\", "%5C\\");
        return value;
    }
@@ -955,7 +1034,5 @@ private static String encodeValue(String value) {
            throw new RuntimeException(ex.getCause());
        }
    }
-
-
}

From 35b77b4fb34066630cf010403da4030d9e6cb949 Mon Sep 17 00:00:00 2001
From: gothub
Date: Sun, 2 Aug 2020 12:52:04 -0700
Subject: [PATCH 29/47] Store task type (used by scheduler)

---
 .../mdqengine/scheduler/RequestReportJob.java |  2 +-
 .../nceas/mdqengine/store/DatabaseStore.java  | 74 +------------------
 .../nceas/mdqengine/store/InMemoryStore.java  |  2 +-
 .../ucsb/nceas/mdqengine/store/MDQStore.java  |  2 +-
 .../ucsb/nceas/mdqengine/store/MNStore.java   |  2 +-
 5 files changed, 7 insertions(+), 75 deletions(-)

diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java
index
b59b0224..6a11c68c 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -217,7 +217,7 @@ public void execute(JobExecutionContext context) //node = store.getNode(nodeId, jobName); Task task; - task = store.getTask(taskName); + task = store.getTask(taskName, taskType); // If a 'task' entry has not been saved for this task name yet, then a 'lastHarvested' // DataTime will not be available, in which case the 'startHarvestDataTime' from the // config file will be used. diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java index ac340bdd..3fcca606 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java @@ -325,75 +325,6 @@ public void shutdown() { } } -// public Node getNode(String nodeId, String jobName) { -// -// //return runs.get(id); -// Result result = new Result(); -// PreparedStatement stmt = null; -// String lastDT = null; -// Node node = new Node(); -// -// // Select records from the 'nodes' table -// try { -// log.debug("preparing statement for query"); -// String sql = "select * from nodes where node_id = ? and job_name = ?"; -// stmt = conn.prepareStatement(sql); -// stmt.setString(1, nodeId); -// stmt.setString(2, jobName); -// -// log.debug("issuing query: " + sql); -// ResultSet rs = stmt.executeQuery(); -// if(rs.next()) { -// node.setNodeId(rs.getString("node_id")); -// node.setJobName(rs.getString("job_name")); -// node.setLastHarvestDatetime(rs.getString("last_harvest_datetime")); -// rs.close(); -// stmt.close(); -// } else { -// log.debug("No results returned from query"); -// } -// } catch ( Exception e ) { -// log.error( e.getClass().getName()+": "+ e.getMessage()); -// } -// -// return(node); -// } - - -// public void saveNode(Node node) throws MetadigStoreException { -// -// PreparedStatement stmt = null; -// -// // Perform an 'upsert' on the 'nodes' table - if a record exists for the 'metadata_id, suite_id' already, -// // then update the record with the incoming data. 
-// try { -// String sql = "INSERT INTO nodes (node_id, job_name, last_harvest_datetime) VALUES (?, ?, ?)" -// + " ON CONFLICT ON CONSTRAINT nodes_id_job_name_pk" -// + " DO UPDATE SET (node_id, job_name, last_harvest_datetime) = (?, ?, ?);"; -// -// stmt = conn.prepareStatement(sql); -// stmt.setString(1, node.getNodeId()); -// stmt.setString(2, node.getJobName()); -// stmt.setString(3, node.getLastHarvestDatetime()); -// stmt.setString(4, node.getNodeId()); -// stmt.setString(5, node.getJobName()); -// stmt.setString(6, node.getLastHarvestDatetime()); -// stmt.executeUpdate(); -// stmt.close(); -// conn.commit(); -// //conn.close(); -// } catch (SQLException e) { -// log.error( e.getClass().getName()+": "+ e.getMessage()); -// MetadigStoreException me = new MetadigStoreException("Unable save last harvest date to the datdabase."); -// me.initCause(e); -// throw(me); -// } -// -// // Next, insert a record into the child table ('runs') -// log.debug("Records created successfully"); -// } - - public void saveTask(Task task) throws MetadigStoreException { PreparedStatement stmt = null; @@ -427,7 +358,7 @@ public void saveTask(Task task) throws MetadigStoreException { log.debug("Records created successfully"); } - public Task getTask(String taskName) { + public Task getTask(String taskName, String taskType) { //return runs.get(id); Result result = new Result(); @@ -438,9 +369,10 @@ public Task getTask(String taskName) { // Select records from the 'nodes' table try { log.debug("preparing statement for query"); - String sql = "select * from tasks where task_name = ?"; + String sql = "select * from tasks where task_name = ? and task_type = ?"; stmt = conn.prepareStatement(sql); stmt.setString(1, taskName); + stmt.setString(2, taskType); log.debug("issuing query: " + sql); ResultSet rs = stmt.executeQuery(); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java index 44bb386c..af7637a0 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java @@ -210,7 +210,7 @@ public void deleteRun(Run run) { // public void saveNode(Node node) throws MetadigStoreException { } @Override - public Task getTask(String taskName) { return new Task(); } + public Task getTask(String taskName, String taskType) { return new Task(); } @Override public void saveTask(Task task) throws MetadigStoreException { } diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java index fbef0bc3..c573803d 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java @@ -33,7 +33,7 @@ public interface MDQStore { // public Node getNode(String nodeId, String jobName); // public void saveNode(Node node) throws MetadigStoreException; - public Task getTask(String taskName); + public Task getTask(String taskName, String taskType); public void saveTask(Task task) throws MetadigStoreException; } diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java index ec7a2772..4613577e 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java @@ -335,7 +335,7 @@ public void renew() {} // public void saveNode(Node node) throws MetadigStoreException { } @Override - public Task 
getTask(String taskName) { return new Task(); } + public Task getTask(String taskName, String taskType) { return new Task(); } @Override public void saveTask(Task task) throws MetadigStoreException { } From 2c6531123819b732ac7794b2e96af55668c4c86d Mon Sep 17 00:00:00 2001 From: gothub Date: Sun, 2 Aug 2020 12:53:11 -0700 Subject: [PATCH 30/47] Minor fix to assessment graph retrieval --- .../edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java index ff67bac6..3f7cc497 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/filestore/FilestoreDB.java @@ -105,14 +105,12 @@ public MetadigFile getFileEntry(MetadigFile mdFile) throws MetadigFilestoreExcep stmt.setString(1, storageType); stmt.setString(2, altFilename); } else { - sql = "select * from filestore where pid = ? and storage_type = ? and media_type = ?"; + sql = "select * from filestore where pid = ? and suite_id = ? and storage_type = ? and media_type = ?"; stmt = conn.prepareStatement(sql); stmt.setString(1, pid); stmt.setString(2, suiteId); - stmt.setString(3, nodeId); - stmt.setString(4, mdFormatFilter); - stmt.setString(5, storageType); - stmt.setString(6, mediaType); + stmt.setString(3, storageType); + stmt.setString(4, mediaType); } log.debug("issuing query: " + sql); From 0709ff21137c09f04258e6a2e40c8cdb767117f7 Mon Sep 17 00:00:00 2001 From: gothub Date: Sun, 2 Aug 2020 12:55:39 -0700 Subject: [PATCH 31/47] Read log4j.properties dynamically on container startup --- Kubernetes/metadig-scheduler/Dockerfile | 8 +++++--- Kubernetes/metadig-scorer/Dockerfile | 6 ++++-- Kubernetes/metadig-worker/Dockerfile | 6 ++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Kubernetes/metadig-scheduler/Dockerfile b/Kubernetes/metadig-scheduler/Dockerfile index 93fb94f8..b193bb70 100644 --- a/Kubernetes/metadig-scheduler/Dockerfile +++ b/Kubernetes/metadig-scheduler/Dockerfile @@ -6,14 +6,14 @@ MAINTAINER slaughter@nceas.ucsb.edu # Set the working directory WORKDIR /var/lib/metadig -COPY log4j.properties . +#COPY log4j.properties . # The most recently built jar file is copied from the maven build directory to this dir by maven, so that # it can be copied to the image. COPY metadig-engine.jar metadig-engine.jar #COPY metadig.properties /etc/metadig/metadig.properties #COPY taskList.csv /etc/metadig/taskList.csv -COPY log4j.properties . +#COPY log4j.properties . #COPY run.sh run.sh # The 'run.sh' script copies config files that should be available from persistent volume to the standard location where the software @@ -23,4 +23,6 @@ COPY log4j.properties . #CMD java -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap -XX:+UseSerialGC -cp ./metadig-engine.jar:./solr edu.ucsb.nceas.mdqengine.scheduler.JobScheduler #CMD [ "./run.sh" ] -CMD java -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap -XX:+UseSerialGC -cp ./metadig-engine.jar: edu.ucsb.nceas.mdqengine.scheduler.JobScheduler +# Set classpath to include /opt/local/metadig/log4j.properties, if it exists, so that logging can be changed without +# having to rebuild the container. 
Note that on k8s, this dir is mapped to the persistent volume, so will be /data/metadig/log4j.properties +CMD java -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap -XX:+UseSerialGC -cp /opt/local/metadig/config:./metadig-engine.jar: edu.ucsb.nceas.mdqengine.scheduler.JobScheduler diff --git a/Kubernetes/metadig-scorer/Dockerfile b/Kubernetes/metadig-scorer/Dockerfile index 63e47408..d539ee7b 100644 --- a/Kubernetes/metadig-scorer/Dockerfile +++ b/Kubernetes/metadig-scorer/Dockerfile @@ -9,7 +9,7 @@ WORKDIR /var/lib/metadig # This file was created from the https://github.com/NCEAS/metadig-r repo # and contains R functions that assist in writing R based quality checks. COPY metadig_0.2.0.tar.gz metadig.tar.gz -COPY log4j.properties . +#COPY log4j.properties . # The most recently built jar file is copied from the maven build directory to this dir by maven, so that # it can be copyied to the image. COPY metadig-engine.jar metadig-engine.jar @@ -40,5 +40,7 @@ RUN Rscript --vanilla -e 'install.packages("metadig.tar.gz", repos=NULL)' # Run the Scorer process # Note: docker --build-arg only allows one argument (one token only, multiple tokens inside quotes doesn't work, so have # to specify java options directly on command line. -CMD java -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap -XX:+UseSerialGC -cp ./metadig-engine.jar:./solr edu.ucsb.nceas.mdqengine.scorer.Scorer +# Set classpath to include /opt/local/metadig/log4j.properties, if it exists, so that logging can be changed without +# having to rebuild the container. Note that on k8s, this dir is mapped to the persistent volume, so will be /data/metadig/log4j.properties +CMD java -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap -XX:+UseSerialGC -cp /opt/local/metadig/config:./metadig-engine.jar:./solr edu.ucsb.nceas.mdqengine.scorer.Scorer diff --git a/Kubernetes/metadig-worker/Dockerfile b/Kubernetes/metadig-worker/Dockerfile index b662b5f6..fff34db1 100644 --- a/Kubernetes/metadig-worker/Dockerfile +++ b/Kubernetes/metadig-worker/Dockerfile @@ -11,7 +11,7 @@ WORKDIR /var/lib/metadig # This file was created from the https://github.com/NCEAS/metadig-r repo # and contains R functions that assist in writing R based quality checks. COPY metadig_0.2.0.tar.gz metadig.tar.gz -COPY log4j.properties . +#COPY log4j.properties . # The most recently built jar file is copied from the maven build directory to this dir by maven, so that # it can be copyied to the image. COPY metadig-engine.jar metadig-engine.jar @@ -34,6 +34,8 @@ RUN Rscript --vanilla r-cmds.txt # Run the Worker process # Note: docker --buile-arg only allows one argument (one token only, multiple tokens inside quotes doesn't work, so have # to specify java options directly on command line. -CMD java -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap -XX:+UseSerialGC -cp ./metadig-engine.jar:./solr edu.ucsb.nceas.mdqengine.Worker +# Set classpath to include /opt/local/metadig/log4j.properties, if it exists, so that logging can be changed without +# having to rebuild the container. 
Note that on k8s, this dir is mapped to the persistent volume, so will be /data/metadig/log4j.properties +CMD java -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap -XX:+UseSerialGC -cp /opt/local/metadig/config:./metadig-engine.jar:./solr edu.ucsb.nceas.mdqengine.Worker #CMD java -Xms128m -Xmx256m -Dlog4j.configuration=log4j.properties -cp ./metadig-engine.jar:./solr edu.ucsb.nceas.mdqengine.Worker From 639786477bfddbc6515137c8f2364a5ed43c487b Mon Sep 17 00:00:00 2001 From: gothub Date: Sun, 2 Aug 2020 12:56:44 -0700 Subject: [PATCH 32/47] Load R packages from cran.rstudio.com (#259) --- Kubernetes/metadig-scorer/Dockerfile | 13 +++---------- Kubernetes/metadig-worker/Dockerfile | 4 +++- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/Kubernetes/metadig-scorer/Dockerfile b/Kubernetes/metadig-scorer/Dockerfile index d539ee7b..e7146b8a 100644 --- a/Kubernetes/metadig-scorer/Dockerfile +++ b/Kubernetes/metadig-scorer/Dockerfile @@ -24,17 +24,10 @@ RUN mkdir -p /etc/dataone/index && touch /etc/dataone/index/d1client.properties # Add R runtime and install packges required by the quality suites RUN apt update -RUN apt -y install vim -RUN apt -y install r-base -RUN apt -y install r-cran-httr -RUN apt -y install r-cran-xml2 -RUN apt -y install r-cran-tidyr -RUN apt -y install r-cran-scales -RUN apt -y install r-cran-lubridate -RUN apt -y install r-cran-ggplot2 -RUN apt -y install r-cran-magrittr +RUN apt -y install vim bash +RUN apt -y install r-base r-cran-httr r-cran-xml2 r-cran-tidyr r-cran-scales r-cran-lubridate r-cran-ggplot2 r-cran-magrittr # Debian stretch doesn't have a pre-cooked package for readr, so install now. -RUN Rscript --vanilla -e 'install.packages("readr", repos="https://cran.mtu.edu/")' +RUN Rscript --vanilla -e 'install.packages("readr", repos=c(CRAN = "http://cran.rstudio.com"))' RUN Rscript --vanilla -e 'install.packages("metadig.tar.gz", repos=NULL)' # Run the Scorer process diff --git a/Kubernetes/metadig-worker/Dockerfile b/Kubernetes/metadig-worker/Dockerfile index fff34db1..09d55309 100644 --- a/Kubernetes/metadig-worker/Dockerfile +++ b/Kubernetes/metadig-worker/Dockerfile @@ -27,8 +27,10 @@ RUN mkdir -p /etc/dataone/index && touch /etc/dataone/index/d1client.properties # Add R runtime and install packges required by the quality suites COPY r-cmds.txt r-cmds.txt RUN apk update +# bash is needed by the openssl install +RUN apk add bash RUN apk add g++ R R-dev R-doc libc-dev openssl-dev libxml2 libxml2-dev -RUN echo 'options(repos = c(CRAN = "https://cran.cnr.berkeley.edu/"))' >> /usr/lib/R/etc/Rprofile.site +RUN echo 'options(repos = c(CRAN = "http://cran.rstudio.com"))' >> /usr/lib/R/etc/Rprofile.site RUN Rscript --vanilla r-cmds.txt # Run the Worker process From 4607036cfa539d7cdc59ed37037a5e3d29d106b1 Mon Sep 17 00:00:00 2001 From: gothub Date: Tue, 4 Aug 2020 18:39:43 -0700 Subject: [PATCH 33/47] minor formatting changes --- .../code/graph_cumulative_quality_scores.R | 57 +++++++++---------- .../code/graph_monthly_quality_scores.R | 37 ++++++------ 2 files changed, 44 insertions(+), 50 deletions(-) diff --git a/src/main/resources/code/graph_cumulative_quality_scores.R b/src/main/resources/code/graph_cumulative_quality_scores.R index 2344bf3b..78b97804 100644 --- a/src/main/resources/code/graph_cumulative_quality_scores.R +++ b/src/main/resources/code/graph_cumulative_quality_scores.R @@ -7,27 +7,22 @@ library(readr) library(magrittr) # Plot cummulative quality scores by month -# This program is dispatched (called) by the MetaDIG 
Grapher class. Several +# This program is dispatched (called) by the MetaDIG Scorer class. Several # variables are injected by metadig-engine Dispatcher # - title: the graph title -# - title: the graph title # - inFile: the CSV file containing quality scores, which has been prepared by Grapher # - outFile: the graphics output file to create # Variables read by metadig-engine Dispatcher after execution -# mdq_result, output, status +# mdq_result, output, status + +# Define these variable ("infile", "outFile" for local testing only +#inFile <- "toolik.csv" +#outFile <- "toolik-cumulative.png" -# Define these variable for local testing only -#inFile <- "dbo.csv" -#outFile <- "dbo.png" -#inFile <- "sasap.csv" -#outFile <- "sasap.png" -#inFile <- "FAIR-scores-eml.csv" -#outFile <- "FAIR-scores-eml.png" axisTextFontSize <- 7 -legendTextFontSize <- 7 +legendTextFontSize <- 8 axisTitleFontSize <- 9 legendTitleFontSize <- 9 - # Load data fsr <- read_csv(inFile) @@ -37,11 +32,10 @@ scores <- mutate(fsr, ym = as.Date(sprintf("%4s-%02d-01", year(dateUploaded), mo mutate(scoreI = scoreInteroperable * 100.0) %>% mutate(scoreR = scoreReusable * 100.0) -# Use this when sequenceId problem has been resolved (github metadig-engine #232) most_recent <- scores %>% - arrange(ym, sequenceId, dateUploaded) %>% - group_by(ym, sequenceId) %>% - top_n(1, dateUploaded) + arrange(ym, sequenceId, dateUploaded) %>% + group_by(ym, sequenceId) %>% + top_n(1, dateUploaded) head(most_recent) # calculate cummulative overall @@ -56,12 +50,16 @@ score_cumulative$metric <- factor(score_cumulative$metric, levels=c("f", "a", "i", "r", "fc", "ac", "ic", "rc"), labels=c("Findable", "Accessible", "Interoperable", "Reusable", "Cum. Findable", "Cum. Accessible", "Cum. Interoperable", "Cum. Reusable")) -score_monthly <- score_cumulative %>% filter(metric %in% c("Findable", "Accessible", "Interoperable", "Reusable")) -# Calculate the overall mean for each FAIR category -mf <- score_cumulative %>% filter(metric %in% c("Findable")) %>% extract2("mean") %>% mean(., na.rm = TRUE) -ma <- score_cumulative %>% filter(metric %in% c("Accessible")) %>% extract2("mean") %>% mean(., na.rm = TRUE) -mi <- score_cumulative %>% filter(metric %in% c("Interoperable")) %>% extract2("mean") %>% mean(., na.rm = TRUE) -mr <- score_cumulative %>% filter(metric %in% c("Reusable")) %>% extract2("mean") %>% mean(., na.rm = TRUE) +score_cumulative_alone <- score_cumulative %>% filter(metric %in% c("Cum. Findable", "Cum. Accessible", "Cum. Interoperable", "Cum. Reusable")) + +# Fetch the last year in the cumulative scores +ymLatest <- with(score_cumulative, max(ym)) +# Fetch last means - these will be used for the legend to show the mean of the latest and hopefully +# best scores for the latest time slot (month) +mfLatest <- score_cumulative_alone %>% filter(ym == ymLatest) %>% filter(metric %in% "Cum. Findable") %>% extract2("mean") +maLatest <- score_cumulative_alone %>% filter(ym == ymLatest) %>% filter(metric %in% "Cum. Accessible") %>% extract2("mean") +miLatest <- score_cumulative_alone %>% filter(ym == ymLatest) %>% filter(metric %in% "Cum. Interoperable") %>% extract2("mean") +mrLatest <- score_cumulative_alone %>% filter(ym == ymLatest) %>% filter(metric %in% "Cum. Reusable") %>% extract2("mean") # See if the 'dateUploaded' dates span multiple years and if not, the x-axis needs to be configured for ggplot so that # it will display. If it is configured for years and only a single year exists, the x-axis will not display. 
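The edits above replace the all-series means with the cumulative means at the latest month — the running average across every record uploaded to date — and the hunk below wires those values into the legend labels. The selection logic can be sketched in Java as follows; ScoreRow is a hypothetical stand-in for the R 'score_cumulative' data frame, illustrative only and not part of the patch.

    import java.time.YearMonth;
    import java.util.*;

    // Hypothetical row type standing in for the R 'score_cumulative' data frame.
    record ScoreRow(YearMonth month, String metric, double mean) {}

    class LatestCumulativeMeans {
        // For each FAIR metric, return the cumulative mean at the most recent month.
        static Map<String, Double> latestMeans(List<ScoreRow> cumulative) {
            YearMonth latest = cumulative.stream()
                    .map(ScoreRow::month)
                    .max(YearMonth::compareTo)
                    .orElseThrow();
            Map<String, Double> byMetric = new HashMap<>();
            for (ScoreRow row : cumulative) {
                if (row.month().equals(latest)) {
                    byMetric.put(row.metric(), row.mean());
                }
            }
            return byMetric;
        }
    }

Because the curves are running means, their value at the last month is also the best single summary of the whole collection, which is why the legend labels are built from mfLatest, maLatest, miLatest and mrLatest rather than from an average of the monthly means.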
@@ -81,7 +79,7 @@ if(minYear == maxYear) { # Plot cummulative overall d1_colors <- c("#ff582d", "#c70a61", "#1a6379", "#60c5e4", "#ff582d", "#c70a61", "#1a6379", "#60c5e4") -p <- ggplot(data=score_monthly, mapping=aes(x=ym, y=mean, color=metric)) + +p <- ggplot(data=score_cumulative_alone, mapping=aes(x=ym, y=mean, color=metric)) + geom_line() + geom_point(size=1) + theme_bw() + @@ -93,19 +91,18 @@ p <- ggplot(data=score_monthly, mapping=aes(x=ym, y=mean, color=metric)) + legend.text = element_text(size = legendTextFontSize), panel.grid.minor = element_blank(), panel.background = element_blank()) + - #scale_color_manual(name = "Metric", labels = c("Findable", "Accessible", "Interoperable", "Reusable"), - # values=d1_colors) + - scale_color_manual(name = "Metric", labels = c(sprintf("Findable (%.0f%%)", mf), - sprintf("Accessible (%.0f%%)", ma), - sprintf("Interoperable (%.0f%%)", mi), - sprintf("Reusable (%.0f%%)", mr)), values=d1_colors) + + + scale_color_manual(name = "Metric", labels = c(sprintf("Findable (%.0f%%)", mfLatest), + sprintf("Accessible (%.0f%%)", maLatest), + sprintf("Interoperable (%.0f%%)", miLatest), + sprintf("Reusable (%.0f%%)", mrLatest)), values=d1_colors) + scale_x_date(date_breaks=dateBreaks, date_minor_breaks=dateMinorBreaks, labels=date_format(dateFormat)) + xlab(xLabel) + scale_y_continuous(limits=c(0,100)) + ylab("Average FAIR Score") + #ggtitle(paste0("DataONE: FAIR scores for ", format(sum(standards$n), big.mark=","), " EML and ISO metadata records")) #scale_fill_discrete(name = "metric", labels = c("Finabl", "Accessibl", "Interoperabl", "Reusabl")) + - ggsave(outFile, width = 8, height = 3) + ggsave(outFile, width = 8.0, height = 3.0) output <- sprintf("Created graphics file %s", outFile) status <- "SUCCESS" diff --git a/src/main/resources/code/graph_monthly_quality_scores.R b/src/main/resources/code/graph_monthly_quality_scores.R index 61406d93..7c06b560 100644 --- a/src/main/resources/code/graph_monthly_quality_scores.R +++ b/src/main/resources/code/graph_monthly_quality_scores.R @@ -6,30 +6,27 @@ library(lubridate) library(readr) library(magrittr) -# Plot cummulative quality scores by month +# Plot mean quality scores by month # This program is dispatched (called) by the MetaDIG Grapher class. 
Several # variables are injected by metadig-engine Dispatcher # - title: the graph title -# - title: the graph title # - inFile: the CSV file containing quality scores, which has been prepared by Grapher # - outFile: the graphics output file to create # Variables read by metadig-engine Dispatcher after execution -# mdq_result, output, status +# - mdq_result, output, status + +# Define these variable ("infile", "outFile" for local testing only +#inFile <- "toolik.csv" +#outFile <- "toolik-monthly.png" -# Define these variable for local testing only -#inFile <- "dbo.csv" -#outFile <- "dbo.png" -#inFile <- "sasap.csv" -#outFile <- "sasap.png" -#inFile <- "FAIR-scores-eml.csv" -#outFile <- "FAIR-scores-eml.png" -axisTextFontSize <- 6 -legendTextFontSize <- 6 -axisTitleFontSize <- 8 -legendTitleFontSize <- 8 +axisTextFontSize <- 7 +legendTextFontSize <- 8 +axisTitleFontSize <- 9 +legendTitleFontSize <- 9 # Load data fsr <- read_csv(inFile) +#fsr <- read_csv(inFile) %>% filter(grepl("*eml*", formatId)) scores <- mutate(fsr, ym = as.Date(sprintf("%4s-%02d-01", year(dateUploaded), month(dateUploaded)))) %>% mutate(scoreF = scoreFindable * 100.0) %>% @@ -65,8 +62,8 @@ mr <- score_cumulative %>% filter(metric %in% c("Reusable")) %>% extract2("mean" # See if the 'dateUploaded' dates span multiple years and if not, the x-axis needs to be configured for ggplot so that # it will display. If it is configured for years and only a single year exists, the x-axis will not display. -minYear <- format(with(scores, min(dateUploaded)), "%Y") -maxYear <- format(with(scores, max(dateUploaded)), "%Y") +minYear <- format(with(score_monthly, min(ym)), "%Y") +maxYear <- format(with(score_monthly, max(ym)), "%Y") if(minYear == maxYear) { xLabel <- "Month" dateBreaks <- "months" @@ -96,16 +93,16 @@ p <- ggplot(data=score_monthly, mapping=aes(x=ym, y=mean, color=metric)) + #scale_color_manual(name = "Metric", labels = c("Findable", "Accessible", "Interoperable", "Reusable"), # values=d1_colors) + scale_color_manual(name = "Metric", labels = c(sprintf("Findable (%.0f%%)", mf), - sprintf("Accessible (%.0f%%)", ma), - sprintf("Interoperable (%.0f%%)", mi), - sprintf("Reusable (%.0f%%)", mr)), values=d1_colors) + + sprintf("Accessible (%.0f%%)", ma), + sprintf("Interoperable (%.0f%%)", mi), + sprintf("Reusable (%.0f%%)", mr)), values=d1_colors) + scale_x_date(date_breaks=dateBreaks, date_minor_breaks=dateMinorBreaks, labels=date_format(dateFormat)) + xlab(xLabel) + scale_y_continuous(limits=c(0,100)) + ylab("Average FAIR Score") + #ggtitle(paste0("DataONE: FAIR scores for ", format(sum(standards$n), big.mark=","), " EML and ISO metadata records")) #scale_fill_discrete(name = "metric", labels = c("Finabl", "Accessibl", "Interoperabl", "Reusabl")) + - ggsave(outFile, width = 7.5, height = 2.5) + ggsave(outFile, width = 8.0, height = 3.0) output <- sprintf("Created graphics file %s", outFile) status <- "SUCCESS" From 7849f7d70995299d2375a3cf274546fe25a7f46a Mon Sep 17 00:00:00 2001 From: gothub Date: Thu, 6 Aug 2020 10:22:25 -0700 Subject: [PATCH 34/47] provide access for MN assessment graphs (#262) --- .../edu/ucsb/nceas/mdqengine/DataONE.java | 28 ----- .../mdqengine/scheduler/JobScheduler.java | 5 + .../mdqengine/scheduler/RequestScorerJob.java | 92 +++++++++------ .../ucsb/nceas/mdqengine/scorer/Graph.java | 1 - .../ucsb/nceas/mdqengine/scorer/Scorer.java | 108 +++++++++--------- 5 files changed, 114 insertions(+), 120 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java 
b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java index 82e4552d..a0e935c8 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java @@ -126,41 +126,13 @@ public static MultipartD1Node getMultipartD1Node(Session session, String service public static Document querySolr(String queryStr, int startPos, int countRequested, MultipartD1Node d1Node, Session session) throws MetadigProcessException { -// // Polymorphism doesn't work with D1 node classes, so have to use the derived classes -// MultipartD1Node d1Node = null; -// Session session = DataONE.getSession(subjectId, authToken); - // Add the start and count, if pagination is being used queryStr = queryStr + "&start=" + startPos + "&rows=" + countRequested; // Query the MN or CN Solr engine to get the query associated with this project that will return all project related pids. InputStream qis = null; MetadigProcessException metadigException = null; -// try { -// d1Node = getMultipartD1Node(session, serviceUrl); -// log.debug("Created MultipartD1Node, nodeId: " + d1Node.getNodeId().getValue()); -// } catch (Exception ex) { -// log.error("Unable to create MultipartD1Node for Solr query"); -// metadigException = new MetadigProcessException("Unable to create multipart node client to query DataONE solr: " + ex.getMessage()); -// metadigException.initCause(ex); -// throw metadigException; -// } - log.debug("Sending query: " + queryStr); - // Send a query to a CN or MN -// try { -// if(isCN) { -// qis = cnNode.query(session, "solr", queryStr); -// } else { -// qis = mnNode.query(session, "solr", queryStr); -// } -// log.debug("Sent query"); -// } catch (Exception e) { -// log.error("Error retrieving pids: " + e.getMessage()); -// metadigException = new MetadigProcessException("Unable to query dataone node: " + e.getMessage()); -// metadigException.initCause(e); -// throw metadigException; -// } try { qis = d1Node.query(session, "solr", queryStr); log.debug("Sent query"); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java index 3f237f03..90efc13f 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java @@ -47,6 +47,7 @@ public static void main(String[] argv) throws Exception { String startHarvestDatetime = null; int countRequested = 1000; int harvestDatetimeInc = 1; + String requestType = null; // Filestore variables String dirIncludeMatch = null; @@ -144,6 +145,8 @@ public static void main(String[] argv) throws Exception { harvestDatetimeInc = Integer.parseInt(splitted[++icnt].trim()); // The number of results to return from the DataONE 'listObjects' service countRequested = Integer.parseInt(splitted[++icnt].trim()); + // Is this scores request for a portal or an entire member node? 
+ requestType = splitted[++icnt].trim(); log.debug("pidFilter: " + pidFilter); log.debug("suiteId: " + suiteId); @@ -151,6 +154,7 @@ public static void main(String[] argv) throws Exception { log.debug("startHarvestDatetime: " + startHarvestDatetime); log.debug("harvestDatetimeInc: " + harvestDatetimeInc); log.debug("countRequested: " + countRequested); + log.debug("requestType: " + requestType); } else if(taskType.equals("filestore")) { // Example taskList.csv entry: // filestore,ingest,metadig,,,0 0/30 * * * ?,"stage;;*.*;README.txt;filestore-ingest.log" @@ -204,6 +208,7 @@ public static void main(String[] argv) throws Exception { .usingJobData("startHarvestDatetime", startHarvestDatetime) .usingJobData("harvestDatetimeInc", harvestDatetimeInc) .usingJobData("countRequested", countRequested) + .usingJobData("requestType", requestType) .build(); } else if (taskType.equalsIgnoreCase("filestore")) { job = newJob(FilestoreIngestJob.class) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java index 29352235..11d965fe 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java @@ -127,6 +127,10 @@ public void execute(JobExecutionContext context) int harvestDatetimeInc = dataMap.getInt("harvestDatetimeInc"); // Number of pids to get each query (this number of pids will be fetched each query until all pids are obtained) int countRequested = dataMap.getInt("countRequested"); + String requestType = null; + if (taskType.equalsIgnoreCase("score")) { + requestType = dataMap.getString("requestType"); + } // TODO: add formatFamily to scheduler request String formatFamily = null; MultipartRestClient mrc = null; @@ -255,57 +259,69 @@ public void execute(JobExecutionContext context) RequestScorerJob.ListResult result = null; Integer resultCount = null; - log.debug("Getting portal pids to process..."); - boolean morePids = true; - while(morePids) { - ArrayList pidsToProcess = null; - log.debug("startCount: " + startCount); - log.debug("countRequested:" + countRequested); - + if(requestType != null && requestType.equalsIgnoreCase("node")) { try { - //result = getPidsToProcess(cnNode, mnNode, isCN, session, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); - result = getPidsToProcess(d1Node, session, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); - pidsToProcess = result.getResult(); - resultCount = result.getResultCount(); + // For a 'node' scores request, the 'collection' is the entire node, so specify + // the nodeId as the collectionid. 
+ submitScorerRequest(qualityServiceUrl, nodeId, suiteId, nodeId, formatFamily); } catch (Exception e) { - JobExecutionException jee = new JobExecutionException("Unable to get pids to process", e); + JobExecutionException jee = new JobExecutionException("Unable to submit request to create new node (" + + nodeId + ")" + " score graph/data file ", e); jee.setRefireImmediately(false); throw jee; } + } else { + log.debug("Getting portal pids to process..."); + boolean morePids = true; + while (morePids) { + ArrayList pidsToProcess = null; + log.debug("startCount: " + startCount); + log.debug("countRequested:" + countRequested); - log.info("Found " + resultCount + " seriesIds" + " for date: " + startDTRstr + " at servierUrl: " + nodeServiceUrl); - for (String pidStr : pidsToProcess) { try { - submitScorerRequest(qualityServiceUrl, pidStr, suiteId, nodeId, formatFamily); + result = getPidsToProcess(d1Node, session, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); + pidsToProcess = result.getResult(); + resultCount = result.getResultCount(); } catch (Exception e) { - JobExecutionException jee = new JobExecutionException("Unable to submit request to create new score graph/data file", e); + JobExecutionException jee = new JobExecutionException("Unable to get pids to process", e); jee.setRefireImmediately(false); throw jee; } - } - // Check if DataONE returned the max number of results. If so, we have to request more by paging through - // the results. - if(resultCount >= countRequested) { - morePids = true; - startCount = startCount + resultCount; - log.info("Paging through more results, current start is " + startCount); - } else { - morePids = false; - - // Record the new "last harvested" date - task.setLastHarvestDatetime(endDTRstr); - log.debug("taskName: " + task.getTaskName()); - log.debug("taskType: " + task.getTaskType()); - log.debug("lastharvestdate: " + task.getLastHarvestDatetime()); + log.info("Found " + resultCount + " seriesIds" + " for date: " + startDTRstr + " at servierUrl: " + nodeServiceUrl); + for (String pidStr : pidsToProcess) { + try { + submitScorerRequest(qualityServiceUrl, pidStr, suiteId, nodeId, formatFamily); + } catch (Exception e) { + JobExecutionException jee = new JobExecutionException("Unable to submit request to create new score graph/data file", e); + jee.setRefireImmediately(false); + throw jee; + } + } - try { - store.saveTask(task); - } catch (MetadigStoreException mse) { - log.error("Error saving task: " + task.getTaskName()); - JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); - jee.setRefireImmediately(false); - throw jee; + // Check if DataONE returned the max number of results. If so, we have to request more by paging through + // the results. 
+ if (resultCount >= countRequested) { + morePids = true; + startCount = startCount + resultCount; + log.info("Paging through more results, current start is " + startCount); + } else { + morePids = false; + + // Record the new "last harvested" date + task.setLastHarvestDatetime(endDTRstr); + log.debug("taskName: " + task.getTaskName()); + log.debug("taskType: " + task.getTaskType()); + log.debug("lastharvestdate: " + task.getLastHarvestDatetime()); + + try { + store.saveTask(task); + } catch (MetadigStoreException mse) { + log.error("Error saving task: " + task.getTaskName()); + JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); + jee.setRefireImmediately(false); + throw jee; + } } } } diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java index 3a95fd2d..1f477f21 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Graph.java @@ -93,7 +93,6 @@ public String create(GraphType type, String title, String inputFile) throws Exce File codeFile = null; String dispatcherType = null; - MetadigFile mdFile = new MetadigFile(); mdFile.setCreationDatetime(DateTime.now()); mdFile.setStorageType(StorageType.CODE.toString()); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index b8cfb205..d31affb3 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -20,6 +20,7 @@ import org.apache.solr.client.solrj.beans.BindingException; import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.client.solrj.util.ClientUtils; import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; @@ -163,6 +164,7 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp MultipartRestClient mrc = null; MultipartMNode mnNode = null; MultipartCNode cnNode = null; + GraphType graphType = null; //long startTime = System.nanoTime(); startTimeProcessing = System.currentTimeMillis(); @@ -228,7 +230,6 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp // - a graph for specified filters: member node, suite id, metadata format MetadigFile mdFile = new MetadigFile(); Graph graph = new Graph(); - //Scorer gfr = new Scorer(); // If creating a graph for a collection, get the set of pids associated with the collection. // Only scores for these pids will be included in the graph. 
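The RequestScorerJob refactor above splits score requests into two paths: a 'node' request submits a single Scorer job covering the whole member node, while a portal request pages through the portal's pids and submits one job per pid. A condensed sketch of that dispatch, assuming the job's own helpers (submitScorerRequest, getPidsToProcess, ListResult) with abbreviated signatures:

    // Illustrative only; fields such as qualityServiceUrl, d1Node, session,
    // pidFilter and the harvest date range strings are the job's own state.
    void dispatchScoreRequests(String requestType, String nodeId, String suiteId,
                               String formatFamily, int countRequested) throws Exception {
        if ("node".equalsIgnoreCase(requestType)) {
            // The whole member node is the collection, so the nodeId
            // doubles as the collection id and no pid paging is needed.
            submitScorerRequest(qualityServiceUrl, nodeId, suiteId, nodeId, formatFamily);
            return;
        }
        int startCount = 0;
        boolean morePids = true;
        while (morePids) {
            ListResult result = getPidsToProcess(d1Node, session, pidFilter,
                    startDTRstr, endDTRstr, startCount, countRequested);
            for (String pid : result.getResult()) {
                submitScorerRequest(qualityServiceUrl, pid, suiteId, nodeId, formatFamily);
            }
            if (result.getResultCount() >= countRequested) {
                startCount += result.getResultCount();  // full page: fetch the next one
            } else {
                morePids = false;  // last page: record the new harvest date and stop
            }
        }
    }

The Scorer.java hunk below handles the same distinction on the consumer side, by treating a collectionId that matches urn:node:* as a member-node request and skipping the collectionQuery evaluation that portals require.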
@@ -256,37 +257,32 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp d1Node = new MultipartMNode(mrc, nodeServiceUrl, session); log.debug("Created mnNode for serviceUrl: " + nodeServiceUrl); } -// -// Session session = DataONE.getSession(subjectId, authToken); -// -// // Don't know node type yet from the id, so have to manually check if it's a CN -// Boolean isCN = DataONE.isCN(nodeServiceUrl); -// if(isCN) { -// cnNode = new MultipartCNode(mrc, nodeServiceUrl, session); -// log.debug("Created cnNode for serviceUrl: " + nodeServiceUrl); -// } else { -// mnNode = new MultipartMNode(mrc, nodeServiceUrl, session); -// log.debug("Created mnNode for serviceUrl: " + nodeServiceUrl); -// } - - if (collectionId != null && !collectionId.isEmpty()) { + + // Check if this is a "node" collection. For "node" collections, all scores for a member node + // are used to create the assessment graph, so we don't need to get the collection pids as is + // done for portals (by evaluating the Solr collectionQuery). Therefor, getCollectionPids doesn't + // need to be called and we can proceed directly to getting the quality scores from the quality + // Solr server. + if (collectionId.matches("^\\s*urn:node:.*")) { + graphType = GraphType.CUMULATIVE; + log.debug("Processing a member node request, skipping step of getting collection pids (not required)."); + } else { + graphType = GraphType.MONTHLY; // If the nodeId is specified, use if to determine the values for authTokenName and subjectIdName, // if those values are not defined - log.debug("collectionId is not null: " + collectionId); - String id = nodeId.replace("urn:node:", "").toUpperCase().trim(); + String id = nodeId.replace("urn:node:", "").toUpperCase().trim(); // The collection query is obtained from the MN and evaluated on the CN log.info("Getting pids for collection " + collectionId); // Always use the CN subject id and authentication token from the configuration file, as // requests that this method uses need CN subject privs ScorerResult result = null; - //result = gfr.getCollectionPids(collectionId, cnNode, mnNode, isCN, session); result = gfr.getCollectionPids(collectionId, d1Node, session); collectionPids = result.getResult(); label = result.getLabel(); // Don't continue if no pids (and thus scores) were found for this collection // TODO: Save a blank image and csv if no collection pids returned - if(collectionPids.size() == 0) { + if (collectionPids.size() == 0) { log.info("No pids returned for this collection."); break label; } else { @@ -322,7 +318,7 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp // Generate a temporary graph file based on the quality scores log.debug("Creating graph for collection id: " + collectionId); //String filePath = graph.create(GraphType.CUMULATIVE, title, scoreFile.getPath()); - String filePath = graph.create(GraphType.MONTHLY, title, scoreFile.getPath()); + String filePath = graph.create(graphType, title, scoreFile.getPath()); // Now save the graphics file to permanent storage String outfile; @@ -416,7 +412,8 @@ which will be used to query DataONE Solr for all the pids associated with that p //queryStr = "?q=seriesId:" + collectionId + "+-obsoletedBy:*&fl=collectionQuery,label,rightsHolder&q.op=AND"; startPos = 0; - countRequested = 10000; + // Just getting 1 row + countRequested = 10; // Get the collectionQuery from Solr try { @@ -430,6 +427,8 @@ which will be used to query DataONE Solr for all the pids associated with that p if(xmldoc == null) 
{ log.error("No document returned from solr with queryStr: " + queryStr); throw new MetadigProcessException("No result returned from Solr query: " + queryStr); + } else { + log.trace("xml: " + xmldoc); } String collectionQuery = null; @@ -441,7 +440,7 @@ which will be used to query DataONE Solr for all the pids associated with that p String rightsHolder = null; try { - log.debug("Getting collectionQuery for id: " + collectionId); + log.debug("Parsing collectionQuery from resultdoc for id: " + collectionId); // Extract the collection query from the Solr result XML XPathFactory xPathfactory = XPathFactory.newInstance(); xpath = xPathfactory.newXPath(); @@ -505,6 +504,7 @@ which will be used to query DataONE Solr for all the pids associated with that p // Here is an example collectionQuery: (((project:"State of Alaska\'s Salmon and People") AND (-obsoletedBy:* AND formatType:METADATA))) // We have to remove the 'AND (-obsoletedBy:* AND formatType:METADATA)' portion + log.debug("Pre-edited collectionQuery: " + collectionQuery); collectionQuery = collectionQuery.replaceAll("\\s*AND\\s*\\(-obsoletedBy:\\*\\s*AND\\s*formatType:METADATA\\)", ""); log.debug("Edited collectionQuery: " + collectionQuery); @@ -575,15 +575,11 @@ which will be used to query DataONE Solr for all the pids associated with that p // Loop through the Solr result. As the result may be large, page through the results, accumulating // the pids returned - /** The collectionQuery is evaluated on the same node that the portal document was harvested from (via the - * DataONE listObjects service. This node could either be an MN or CN. - */ - //log.debug("Sending collectionQuery to Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl); MultipartRestClient mrc = null; MultipartCNode cnNode = null; - log.debug("query string: " + queryStr); + log.debug("collectionQuery query string: " + queryStr); try { mrc = new DefaultHttpMultipartRestClient(); @@ -684,10 +680,38 @@ private List getQualityScores(String collectionId, String suiteId, int startPosInResult = 0; int startPosInQuery = 0; // this will always be zero - we are listing the pids to retrieve, so will always want to start at the first result - log.trace("Getting scores from Solr for " + collectionPids.size() + " pids."); - // Now accumulate the Quality Solr document results for the list of pids for the project. - if (collectionId != null && ! 
collectionId.isEmpty()) { - log.info("Getting quality scores for collection: " + collectionId); + // Now accumulate the Quality Solr document results for all scores for the node + if (collectionId.matches("^\\s*urn:node:.*")) { + log.info("Getting quality scores for member node with suiteId: " + suiteId + ", datasource: " + collectionId + " formats: " + formatFamily); + countRequested = 1000; + formatFamilySearchTerm = null; + queryStr = "metadataId:*"; + if(suiteId != null) { + //queryStr += " AND suiteId:" + "\"" + suiteId + "\""; + queryStr += " AND suiteId:" + ClientUtils.escapeQueryChars(suiteId); + } + + // Add this member nodeId as the datasource + //queryStr += " AND datasource:" + "\"" + collectionId + "\""; + queryStr += " AND datasource:" + ClientUtils.escapeQueryChars(collectionId); + + if (formatFamilySearchTerm != null) { + //queryStr += " AND metadataFormatId:" + "\"" + formatFamilySearchTerm + "\""; + queryStr += " AND metadataFormatId:" + ClientUtils.escapeQueryChars(formatFamilySearchTerm); + } + log.trace("query to quality Solr server: " + queryStr); + do { + resultList = queryQualitySolr(queryStr, startPosInQuery, countRequested); + // If no more results, break + if(resultList.size() == 0) break; + // Add results from this pid range to the accumulator of all results. + allResults.addAll(resultList); + startPosInQuery += resultList.size(); + //startPosInQuery += countRequested; + } while (resultList.size() > 0); + } else { + // Now accumulate the Quality Solr document results for the list of pids for the project. + log.info("Getting quality scores for collection: " + collectionId + ", for " + collectionPids.size() + " pids." ); int pidCntToRequest = 25; int totalPidCnt = collectionPids.size(); int pidsLeft = totalPidCnt; @@ -728,28 +752,6 @@ private List getQualityScores(String collectionId, String suiteId, } pidsLeft -= pidCntToRequest; } while (pidsLeft > 0); - } else { - log.info("Getting quality scores for suiteId: " + suiteId + ", datasource: " + " formats: " + formatFamily); - countRequested = 1000; - formatFamilySearchTerm = null; - queryStr = "metadataId:*"; - if(suiteId != null) { - queryStr += " AND suiteId:" + "\"" + suiteId + "\""; - } - - if (formatFamilySearchTerm != null) { - queryStr += " AND metadataFormatId:" + "\"" + formatFamilySearchTerm + "\""; - } - log.trace("query to quality Solr server: " + queryStr); - do { - resultList = queryQualitySolr(queryStr, startPosInQuery, countRequested); - // If no more results, break - if(resultList.size() == 0) break; - // Add results from this pid range to the accumulator of all results. 
- allResults.addAll(resultList); - //startPosInQuery += resultList.size(); - startPosInQuery += countRequested; - } while (resultList.size() > 0); } log.debug("Got " + allResults.size() + " scores from Quality Solr server"); return allResults; From 4ec7f4c0b9e52979eacbf4455ca73146cb75e8f4 Mon Sep 17 00:00:00 2001 From: gothub Date: Thu, 6 Aug 2020 10:39:16 -0700 Subject: [PATCH 35/47] Update docker tag (2.3.0); update maven dependencies --- pom.xml | 189 ++++++++++++++++++++++++++------------------------------ 1 file changed, 87 insertions(+), 102 deletions(-) diff --git a/pom.xml b/pom.xml index c71130a1..c653250e 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ jar metadig-engine - MetaDig library for running metadata quality tests + MetaDIG library for running metadata quality tests https://github.com/NCEAS/metadig-engine @@ -27,7 +27,7 @@ 3.1.4.RELEASE metadig - 2.3.0dev + 2.3.0 **/*Test.java **/LTERSuiteTest.java @@ -40,86 +40,13 @@ http://nceas.ucsb.edu - - - commons-logging - commons-logging - 1.2 - - - - org.apache.logging.log4j - log4j-core - 2.13.3 - - - - org.apache.logging.log4j - log4j-api - 2.13.3 - - - - - - - - - - - - - org.dataone - bookkeeper-client - ${bookkeeper.version} - - - io.dropwizard - dropwizard-core - - - io.dropwizard - dropwizard-json-logging - - - io.dropwizard - dropwizard-testing - - - io.dropwizard - dropwizard-jdbi3 - - - io.dropwizard - dropwizard-auth - - - org.postgresql - postgresql - - - com.opentable.components - otj-pg-embedded - - - com.opentable.components - otj-pg-embedded - - - org.flywaydb - flyway-maven-plugin - - - org.mockito - mockito-core - - - org.dataone - d1_libclient_java - - - - + + + + + + + com.fasterxml.jackson.core jackson-databind ${jackson.version} @@ -128,6 +55,12 @@ org.renjin renjin-script-engine ${renjin.version} + + + org.apache.httpcomponents + httpclient + + @@ -169,9 +102,15 @@ org.apache.commons commons-configuration2 - 2.3 + 2.7 - + + + log4j + log4j + 1.2.17 + + commons-beanutils commons-beanutils @@ -189,6 +128,12 @@ d1_libclient_java ${d1_libclient_java.version} jar + + + org.apache.httpcomponents + httpclient-cache + + @@ -205,22 +150,12 @@ org.apache.solr solr-solrj - 7.3.0 - - - org.apache.logging.log4j - log4j-api - - - org.apache.logging.log4j - log4j-core - - + 7.5.0 org.apache.solr solr-core - 7.3.0 + 7.5.0 bedatadriven bedatadriven public repo From 8f68c91a027ac3f9001ed3d845fb517986fbfa2e Mon Sep 17 00:00:00 2001 From: gothub Date: Sun, 9 Aug 2020 12:06:24 -0700 Subject: [PATCH 36/47] Reuse CN clients when possible (#264) --- .../edu/ucsb/nceas/mdqengine/DataONE.java | 34 ++++------------- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 38 +++++++++---------- 2 files changed, 24 insertions(+), 48 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java index a0e935c8..6973efab 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java @@ -29,42 +29,22 @@ public class DataONE { /** * Get a DataONE subject information object - * @param serviceUrl the service URL of the DataONE node to request the subject info from - * @param authToken the authorization token to use for the request + * @param rightsHolder the DataONE subject to get info for + * @param CNnode the DataONE CN to send the request to + * @param session the DataONE authenticated session * @return a DataONE subject information object * @throws MetadigProcessException */ - public static SubjectInfo 
getSubjectInfo(Subject rightsHolder, String serviceUrl, String subjectId, String authToken) throws MetadigProcessException { + public static SubjectInfo getSubjectInfo(Subject rightsHolder, MultipartCNode CNnode, + Session session) throws MetadigProcessException { log.debug("Getting subject info for: " + rightsHolder.getValue()); - MultipartCNode cnNode = null; + //MultipartCNode cnNode = null; MetadigProcessException metadigException = null; - SubjectInfo subjectInfo = null; - Session session = DataONE.getSession(subjectId, authToken); - - // Identity node as either a CN or MN based on the serviceUrl - String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org"; - Pattern r = Pattern.compile(pattern); - Matcher m = r.matcher(serviceUrl); - if (!m.find()) { - log.error("Must call a CN to get subject information"); - metadigException = new MetadigProcessException("Must call a CN to get subject information."); - throw metadigException; - } - - // Only CNs can call the 'subjectInfo' service (aka accounts), so we have to use - // a MultipartCNode instance here. - try { - cnNode = (MultipartCNode) getMultipartD1Node(session, serviceUrl); - } catch (Exception ex) { - metadigException = new MetadigProcessException("Unable to create multipart D1 node: " + subjectId + ": " + ex.getMessage()); - metadigException.initCause(ex); - throw metadigException; - } try { - subjectInfo = cnNode.getSubjectInfo(session, rightsHolder); + subjectInfo = CNnode.getSubjectInfo(session, rightsHolder); } catch (Exception ex) { metadigException = new MetadigProcessException("Unable to get subject information." + ex.getMessage()); metadigException.initCause(ex); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index d31affb3..30bee0b8 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -35,10 +35,7 @@ import org.joda.time.format.ISODateTimeFormat; import org.quartz.JobExecutionException; import org.w3c.dom.Document; -import org.xml.sax.InputSource; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.xpath.*; import java.io.*; import java.net.URLEncoder; @@ -438,8 +435,23 @@ which will be used to query DataONE Solr for all the pids associated with that p org.w3c.dom.Node node = null; String label = null; String rightsHolder = null; + MultipartRestClient mrc = null; + MultipartCNode CNnode = null; + Session CNsession = null; try { + + CNsession = DataONE.getSession(CNsubjectId, CNauthToken); + // // Only CNs can call the 'subjectInfo' service (aka accounts), so we have to use + // a MultipartCNode instance here. 
+ try { + CNnode = (MultipartCNode) DataONE.getMultipartD1Node(CNsession, CNserviceUrl); + } catch (Exception ex) { + metadigException = new MetadigProcessException("Unable to create multipart D1 node: " + ex.getMessage()); + metadigException.initCause(ex); + throw metadigException; + } + log.debug("Parsing collectionQuery from resultdoc for id: " + collectionId); // Extract the collection query from the Solr result XML XPathFactory xPathfactory = XPathFactory.newInstance(); @@ -521,7 +533,7 @@ which will be used to query DataONE Solr for all the pids associated with that p subject.setValue(rightsHolder); // The subject info can only be obtained from a CN, so use the CN auth info for the current DataONE environment, // which should be configured in the metadig.properties file - SubjectInfo subjectInfo = DataONE.getSubjectInfo(subject, CNserviceUrl, CNsubjectId, CNauthToken); + SubjectInfo subjectInfo = DataONE.getSubjectInfo(subject, CNnode, CNsession); String groupStr = null; groupStr = "(readPermission:" + "\"" + rightsHolder @@ -581,26 +593,10 @@ which will be used to query DataONE Solr for all the pids associated with that p log.debug("collectionQuery query string: " + queryStr); - try { - mrc = new DefaultHttpMultipartRestClient(); - } catch (Exception e) { - log.error("Error creating rest client: " + e.getMessage()); - JobExecutionException jee = new JobExecutionException(e); - jee.setRefireImmediately(false); - throw new MetadigProcessException("Unable to create connection to CN "); - } - - Session CNsession = DataONE.getSession(CNsubjectId, CNauthToken); - - // Don't know node type yet from the id, so have to manually check if it's a CN - Boolean isCN = DataONE.isCN(CNserviceUrl); - - cnNode = new MultipartCNode(mrc, CNserviceUrl, CNsession); - do { //TODO: check that a result was returned // Note: the collectionQuery is always evaluated on the CN, so that the entire DataONE network is queried. - xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, cnNode, CNsession); + xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, CNnode, CNsession); if(xmldoc == null) { log.info("no values returned from query"); break; From 088352fb4632ebd9fd961248de481edf25e24165 Mon Sep 17 00:00:00 2001 From: gothub Date: Sun, 9 Aug 2020 12:20:05 -0700 Subject: [PATCH 37/47] Detect CN or MN based on service URL (#265) --- .../edu/ucsb/nceas/mdqengine/DataONE.java | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java index 6973efab..561ff9ee 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java @@ -19,8 +19,6 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import java.io.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class DataONE { @@ -188,19 +186,33 @@ public static Session getSession(String subjectId, String authToken) { return session; } - public static Boolean isCN(String serviceUrl) { + /* + * Determine if the string represents a DataONE CN. + * @param nodeStr either a DataONE node serviceURL (e.g. https://knb.ecoinformatics.org/knb/d1/mn) + * or a DataONE node identifier (e.g. 
urn:node:CN) + */ + public static Boolean isCN(String nodeStr) { Boolean isCN = false; - // Identity node as either a CN or MN based on the serviceUrl - String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org"; - Pattern r = Pattern.compile(pattern); - Matcher m = r.matcher(serviceUrl); - if (m.find()) { - isCN = true; - log.debug("service URL is for a CN: " + serviceUrl); + + // match node urn, e.g. "https://cn.dataone.org/cn" + if (nodeStr.matches("^\\s*urn:node:.*")) { + if (nodeStr.matches("^\\s*urn:node:CN.*$|^\\s*urn:node:cn.*$")) { + isCN = true; + log.debug("The nodeId is for a CN: " + nodeStr); + } else { + log.debug("The nodeId is not for a CN: " + nodeStr); + isCN = false; + } } else { - log.debug("service URL is not for a CN: " + serviceUrl); - isCN = false; + // match cn service url e.g. "https://cn.dataone.org/cn" + if (nodeStr.matches("^\\s*https*://cn.*?\\.dataone\\.org.*$|https*://cn.*?\\.test\\.dataone\\.org.*$")) { + isCN = true; + log.debug("The service URL is for a CN: " + nodeStr); + } else { + log.debug("The service URL is not for a CN: " + nodeStr); + isCN = false; + } } return isCN; } From 1ce5d3c92830459a172e665f5f6c1e27e27881fc Mon Sep 17 00:00:00 2001 From: gothub Date: Sun, 9 Aug 2020 12:20:56 -0700 Subject: [PATCH 38/47] Properly escape values for solr queries --- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index 30bee0b8..e61bbfcd 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -43,8 +43,6 @@ import java.util.*; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** * The Scorer class contains methods that create graphs of aggregated quality scores. 
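The escaping rule this patch applies when assembling quality Solr queries: every configured or user-supplied value goes through solrj's ClientUtils.escapeQueryChars, except the deliberate '*' wildcard used for CN-wide requests, which must stay unescaped to remain active. A small self-contained sketch; the suite and node ids below are placeholders, not values from the diff:

    import org.apache.solr.client.solrj.util.ClientUtils;
    import edu.ucsb.nceas.mdqengine.DataONE;

    class QualityQueryExample {
        public static void main(String[] args) {
            String suiteId = "FAIR-suite-0.3.1";   // placeholder suite id
            String collectionId = "urn:node:KNB";  // placeholder member node id

            // CN-wide request: keep the wildcard active; node request: escape ':' etc.
            String datasource = DataONE.isCN(collectionId)
                    ? "*"
                    : ClientUtils.escapeQueryChars(collectionId);

            String queryStr = "metadataId:*"
                    + " AND suiteId:" + ClientUtils.escapeQueryChars(suiteId)
                    + " AND datasource:" + datasource;

            // -> metadataId:* AND suiteId:FAIR\-suite\-0.3.1 AND datasource:urn\:node\:KNB
            System.out.println(queryStr);
        }
    }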
@@ -588,8 +586,6 @@ which will be used to query DataONE Solr for all the pids associated with that p // the pids returned //log.debug("Sending collectionQuery to Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl); - MultipartRestClient mrc = null; - MultipartCNode cnNode = null; log.debug("collectionQuery query string: " + queryStr); @@ -653,6 +649,7 @@ private List getQualityScores(String collectionId, String suiteId, String listString; ArrayList tmpList; String formatFamilySearchTerm = null; + String datasource = null; // The metadata format family can be specified to filter the quality scores that will be included // in the graph./ @@ -678,32 +675,36 @@ private List getQualityScores(String collectionId, String suiteId, // Now accumulate the Quality Solr document results for all scores for the node if (collectionId.matches("^\\s*urn:node:.*")) { - log.info("Getting quality scores for member node with suiteId: " + suiteId + ", datasource: " + collectionId + " formats: " + formatFamily); - countRequested = 1000; + countRequested = 10000; + if(DataONE.isCN(collectionId)) { + // Don't encode the wildcard, otherwise it will be deactivated in Solr + datasource = "*"; + log.info("Getting quality scores for CN node with suiteId: " + suiteId + ", datasource: " + datasource + " formats: " + formatFamily); + } else { + datasource = ClientUtils.escapeQueryChars(collectionId); + log.info("Getting quality scores for member node with (encoded) suiteId: " + suiteId + ", datasource: " + datasource + " formats: " + formatFamily); + } formatFamilySearchTerm = null; queryStr = "metadataId:*"; if(suiteId != null) { - //queryStr += " AND suiteId:" + "\"" + suiteId + "\""; queryStr += " AND suiteId:" + ClientUtils.escapeQueryChars(suiteId); } // Add this member nodeId as the datasource - //queryStr += " AND datasource:" + "\"" + collectionId + "\""; - queryStr += " AND datasource:" + ClientUtils.escapeQueryChars(collectionId); + queryStr += " AND datasource:" + datasource; if (formatFamilySearchTerm != null) { //queryStr += " AND metadataFormatId:" + "\"" + formatFamilySearchTerm + "\""; queryStr += " AND metadataFormatId:" + ClientUtils.escapeQueryChars(formatFamilySearchTerm); } - log.trace("query to quality Solr server: " + queryStr); do { + log.trace("query to quality Solr server: " + queryStr + ", startPos: " + startPosInQuery + ", countRequested: " + countRequested); resultList = queryQualitySolr(queryStr, startPosInQuery, countRequested); // If no more results, break if(resultList.size() == 0) break; // Add results from this pid range to the accumulator of all results. allResults.addAll(resultList); startPosInQuery += resultList.size(); - //startPosInQuery += countRequested; } while (resultList.size() > 0); } else { // Now accumulate the Quality Solr document results for the list of pids for the project. @@ -736,9 +737,9 @@ private List getQualityScores(String collectionId, String suiteId, if (suiteId != null) { queryStr += " AND suiteId:" + suiteId; } - log.debug("query to quality Solr server: " + queryStr); // Send query to Quality Solr Server // Get all the pids in this pid string + log.trace("query to quality Solr server: " + queryStr + ", startPos: " + startPosInQuery + ", countRequested: " + pidCntToRequest); resultList = queryQualitySolr(queryStr, startPosInQuery, pidCntToRequest); // It's possible that none of the pids from the collection have quality scores // This should not happen but check just in case. 
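One detail of the preceding patch worth calling out: when paging through the quality Solr index, the cursor is advanced by the number of rows actually returned rather than by the requested page size, so a short page cannot cause later documents to be skipped. A minimal sketch of the pattern, with queryQualitySolr standing in for the engine's own query helper and QualityScore as a hypothetical result type:

    // Illustrative only; queryQualitySolr and QualityScore are stand-ins.
    List<QualityScore> fetchAllScores(String queryStr, int countRequested) throws Exception {
        List<QualityScore> allResults = new ArrayList<>();
        int start = 0;
        List<QualityScore> page;
        do {
            page = queryQualitySolr(queryStr, start, countRequested);
            if (page.isEmpty()) break;   // no more matching documents
            allResults.addAll(page);
            start += page.size();        // advance by what actually came back,
                                         // not by the requested page size
        } while (page.size() > 0);
        return allResults;
    }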
From ad04aff57638a675b4f0bc64f1764208e594ff16 Mon Sep 17 00:00:00 2001 From: gothub Date: Wed, 19 Aug 2020 13:46:40 -0700 Subject: [PATCH 39/47] Code cleanup, adjust logging levels --- .../edu/ucsb/nceas/mdqengine/DataONE.java | 42 ++++++++------- .../edu/ucsb/nceas/mdqengine/MDQconfig.java | 18 +------ .../java/edu/ucsb/nceas/mdqengine/Worker.java | 13 ----- .../mdqengine/scheduler/JobScheduler.java | 4 +- .../mdqengine/scheduler/RequestReportJob.java | 20 ++------ .../mdqengine/scheduler/RequestScorerJob.java | 39 +++++--------- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 8 --- .../nceas/mdqengine/store/DatabaseStore.java | 35 ++++++------- .../ucsb/nceas/mdqengine/store/MDQStore.java | 51 +++++++++---------- 9 files changed, 80 insertions(+), 150 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java index 561ff9ee..17fea9e1 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/DataONE.java @@ -4,11 +4,12 @@ import org.apache.commons.logging.LogFactory; import edu.ucsb.nceas.mdqengine.exception.MetadigProcessException; import org.dataone.client.auth.AuthTokenSession; +import org.dataone.client.rest.DefaultHttpMultipartRestClient; +import org.dataone.client.rest.HttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartD1Node; import org.dataone.service.types.v1.Session; import edu.ucsb.nceas.mdqengine.exception.MetadigException; -import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; import org.dataone.client.v2.impl.MultipartMNode; import org.dataone.service.types.v1.Subject; @@ -36,7 +37,7 @@ public class DataONE { public static SubjectInfo getSubjectInfo(Subject rightsHolder, MultipartCNode CNnode, Session session) throws MetadigProcessException { - log.debug("Getting subject info for: " + rightsHolder.getValue()); + log.trace("Getting subject info for: " + rightsHolder.getValue()); //MultipartCNode cnNode = null; MetadigProcessException metadigException = null; SubjectInfo subjectInfo = null; @@ -68,7 +69,7 @@ public static MultipartD1Node getMultipartD1Node(Session session, String service // First create an HTTP client try { - mrc = new DefaultHttpMultipartRestClient(); + mrc = new HttpMultipartRestClient(); } catch (Exception ex) { log.error("Error creating rest client: " + ex.getMessage()); metadigException = new MetadigProcessException("Unable to get collection pids"); @@ -80,10 +81,10 @@ public static MultipartD1Node getMultipartD1Node(Session session, String service // Now create a DataONE object that uses the rest client if (isCN) { - log.debug("creating cn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); + log.debug("creating cn MultipartMNode"); d1Node = new MultipartCNode(mrc, serviceUrl, session); } else { - log.debug("creating mn MultipartMNode" + ", subjectId: " + session.getSubject().getValue()); + log.debug("creating mn MultipartMNode"); d1Node = new MultipartMNode(mrc, serviceUrl, session); } return d1Node; @@ -98,9 +99,6 @@ public static MultipartD1Node getMultipartD1Node(Session session, String service * @return an XML document containing the query result * @throws Exception */ - //public static Document querySolr(String queryStr, int startPos, int countRequested, MultipartCNode cnNode, - // MultipartMNode mnNode, Boolean isCN, - // Session session) throws MetadigProcessException { public 
static Document querySolr(String queryStr, int startPos, int countRequested, MultipartD1Node d1Node, Session session) throws MetadigProcessException {
@@ -110,10 +108,10 @@ public static Document querySolr(String queryStr, int startPos, int countRequest
InputStream qis = null;
MetadigProcessException metadigException = null;
- log.debug("Sending query: " + queryStr);
+ log.trace("Sending query: " + queryStr);
try {
qis = d1Node.query(session, "solr", queryStr);
- log.debug("Sent query");
+ log.trace("Sent query");
} catch (Exception e) {
log.error("Error retrieving pids: " + e.getMessage());
metadigException = new MetadigProcessException("Unable to query dataone node: " + e.getMessage());
@@ -121,19 +119,19 @@ public static Document querySolr(String queryStr, int startPos, int countRequest
throw metadigException;
}
- log.debug("Creating xml doc with results");
+ log.trace("Creating xml doc with results");
Document xmldoc = null;
DocumentBuilder builder = null;
try {
// If results were returned, create an XML document from them
- log.debug("qis available: " + qis.available());
+ log.trace("qis available: " + qis.available());
if (qis.available() > 0) {
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
builder = factory.newDocumentBuilder();
xmldoc = builder.parse(new InputSource(qis));
- log.debug("Created xml doc: " + xmldoc.toString());
+ log.trace("Created xml doc: " + xmldoc.toString());
} catch (Exception e) {
log.error("Unable to create w3c Document from input stream", e);
e.printStackTrace();
@@ -145,13 +143,13 @@ public static Document querySolr(String queryStr, int startPos, int countRequest
qis.close();
}
} catch (IOException ioe) {
- log.debug("IO exception: " + ioe.getMessage());
+ log.trace("IO exception: " + ioe.getMessage());
metadigException = new MetadigProcessException("Unable to prepare query result xml document: " + ioe.getMessage());
metadigException.initCause(ioe);
throw metadigException;
}
- log.debug("Created results xml doc");
+ log.trace("Created results xml doc");
return xmldoc;
}
@@ -169,10 +167,10 @@ public static Session getSession(String subjectId, String authToken) {
// query Solr - either the member node or cn, for the project 'solrquery' field
if (authToken == null || authToken.isEmpty()) {
- log.debug("Creating public sessioni");
+ log.trace("Creating public session");
session = new Session();
} else {
- log.debug("Creating authentication session for subjectId: " + subjectId + ", token: " + authToken.substring(0, 5) + "...");
+ log.trace("Creating authentication session for subjectId: " + subjectId + ", token: " + authToken.substring(0, 5) + "...");
session = new AuthTokenSession(authToken);
}
@@ -180,7 +178,7 @@ public static Session getSession(String subjectId, String authToken) {
Subject subject = new Subject();
subject.setValue(subjectId);
session.setSubject(subject);
- log.debug("Set session subjectId to: " + session.getSubject().getValue());
+ log.trace("Set session subjectId to: " + session.getSubject().getValue());
}
return session;
@@ -199,18 +197,18 @@ public static Boolean isCN(String nodeStr) {
if (nodeStr.matches("^\\s*urn:node:.*")) {
if (nodeStr.matches("^\\s*urn:node:CN.*$|^\\s*urn:node:cn.*$")) {
isCN = true;
- log.debug("The nodeId is for a CN: " + nodeStr);
+ log.trace("The nodeId is for a CN: " + nodeStr);
} else {
- log.debug("The nodeId is not for a CN: " + nodeStr);
+ log.trace("The nodeId is not for a CN: " + nodeStr);
isCN = false;
}
} else {
// match cn service url e.g.
"https://cn.dataone.org/cn" if (nodeStr.matches("^\\s*https*://cn.*?\\.dataone\\.org.*$|https*://cn.*?\\.test\\.dataone\\.org.*$")) { isCN = true; - log.debug("The service URL is for a CN: " + nodeStr); + log.trace("The service URL is for a CN: " + nodeStr); } else { - log.debug("The service URL is not for a CN: " + nodeStr); + log.trace("The service URL is not for a CN: " + nodeStr); isCN = false; } } diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java b/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java index c2840e01..b3e7de4a 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/MDQconfig.java @@ -20,19 +20,7 @@ public class MDQconfig { public static Configuration config; public MDQconfig () throws ConfigurationException, IOException { - // Check if we are running in a servlet boolean inServlet = false; - /* - try { - Class servletClass = Class.forName("javax.servlet.http.HttpServlet"); - inServlet = true; - log.debug("Loaded javax.servlet.http.HttpServlet - running in servlet environment."); - //} catch (ClassNotFoundException ex) { - } catch (Exception e) { - log.debug("Unable to load javax.servlet.http.HttpServlet - not running in servlet environment."); - inServlet = false; - } - */ // If running in a servlet, have to get the config info from the webapp context, as we can't // read from external dirs on disk. @@ -41,15 +29,13 @@ public MDQconfig () throws ConfigurationException, IOException { InputStream inputStream = this.getClass().getClassLoader().getResourceAsStream("/metadig.properties"); String TMP_DIR = System.getProperty("java.io.tmpdir"); File tempFile = new File(TMP_DIR + "/metadig.properties"); - log.debug("Reading config properties in servlet from: " + tempFile); + log.trace("Reading config properties in servlet from: " + tempFile); FileOutputStream out = new FileOutputStream(tempFile); IOUtils.copy(inputStream, out); config = configs.properties(tempFile); - log.debug("Successfully read properties from: " + tempFile); } else { - log.debug("Reading config properties from: " + configFilePath); + log.trace("Reading config properties from: " + configFilePath); config = configs.properties(new File(configFilePath)); - log.debug("Successfully read properties from: " + configFilePath); } } diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java b/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java index 7cd516bb..ecbbb554 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/Worker.java @@ -477,19 +477,6 @@ public Run processReport(QueueEntry message) throws InterruptedException, Except } catch (Throwable thrown) { log.error("Error while waiting for group lookup thread completion"); } - // Wait for a few seconds for the 'accounts' -// for (int i = 0; i < 5; i++) { -// try { -// groups = future.get(); -// } catch (Throwable thrown) { -// log.error("Error while waiting for thread completion"); -// } -// // Sleep for 1 second -// -// if (groups.size() > 0 ) break; -// log.debug("Waiting 1 second for DataONE group lookup"); -// Thread.sleep(1000); -// } if (groups != null) { smm.setGroups(groups); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java index 90efc13f..3f9612a3 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java @@ -85,6 +85,7 @@ public 
static void main(String[] argv) throws Exception { cronSchedule = record.get("cron-schedule").trim(); params = record.get("params").trim(); log.debug("Task type: " + taskType); + log.debug("Task name: " + taskName); log.debug("cronSchedule: " + cronSchedule); params = params.startsWith("\"") ? params.substring(1) : params; params = params.endsWith("\"") ? params.substring(0, params.length()-1) : params; @@ -182,7 +183,6 @@ public static void main(String[] argv) throws Exception { } try { - log.debug("Setting task"); // Currently there is only taskType="quality", but there could be more in the future! JobDetail job = null; if(taskType.equals("quality")) { @@ -223,13 +223,11 @@ public static void main(String[] argv) throws Exception { .build(); } - log.debug("Setting trigger"); CronTrigger trigger = newTrigger() .withIdentity(taskName + "-trigger", taskGroup) .withSchedule(cronSchedule(cronSchedule)) .build(); - log.debug("Scheduling task"); scheduler.scheduleJob(job, trigger); } catch (SchedulerException se) { diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index 6a11c68c..19fdc7ea 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -124,21 +124,13 @@ public void execute(JobExecutionContext context) JobDataMap dataMap = context.getJobDetail().getJobDataMap(); String taskName = dataMap.getString("taskName"); - log.debug("taskName: " + taskName); String taskType = dataMap.getString("taskType"); - log.debug("taskType: " + taskType); String pidFilter = dataMap.getString("pidFilter"); - log.debug("pidFilter: " + pidFilter); String suiteId = dataMap.getString("suiteId"); - log.debug("suiteId: " + suiteId); String nodeId = dataMap.getString("nodeId"); - log.debug("nodeId: " + nodeId); String startHarvestDatetimeStr = dataMap.getString("startHarvestDatetime"); - log.debug("startHavestDatetimeStr: " + startHarvestDatetimeStr); int harvestDatetimeInc = dataMap.getInt("harvestDatetimeInc"); - log.debug("harvestDatetimeInc: " + harvestDatetimeInc); int countRequested = dataMap.getInt("countRequested"); - log.debug("countRequested: " + countRequested); MultipartRestClient mrc = null; MultipartMNode mnNode = null; MultipartCNode cnNode = null; @@ -162,7 +154,7 @@ public void execute(JobExecutionContext context) throw jee; } - log.debug("Executing task for node: " + nodeId + ", suiteId: " + suiteId); + log.info("Executing task " + taskType + ", " + taskName + " for node: " + nodeId + ", suiteId: " + suiteId); try { mrc = new HttpMultipartRestClient(); @@ -183,7 +175,7 @@ public void execute(JobExecutionContext context) mnNode = new MultipartMNode(mrc, nodeServiceUrl, session); } - // Don't know node type yet from the id, so have to manually check if it's a CN + // Get a connection to the database MDQStore store = null; try { @@ -208,13 +200,9 @@ public void execute(JobExecutionContext context) DateTime currentDT = new DateTime(DateTimeZone.UTC); DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SS'Z'"); String currentDatetimeStr = dtfOut.print(currentDT); - DateTime startDateTimeRange = null; DateTime endDateTimeRange = null; - String lastHarvestDateStr = null; - //edu.ucsb.nceas.mdqengine.model.Node node; - //node = store.getNode(nodeId, jobName); Task task; task = store.getTask(taskName, taskType); @@ -349,8 +337,8 @@ public ListResult 
getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, Date endDate = new Date(msSinceEpoch); try { - // Even though MultipartMNode and MultipartCNode have the same parent class, their interfaces are differnt, so polymorphism - // isn't happening here. + // Even though MultipartMNode and MultipartCNode have the same parent class D1Node, the interface for D1Node doesn't + // include listObjects (it should), so we have to maintain a cnNode and mnNode. if(isCN) { objList = cnNode.listObjects(session, startDate, endDate, formatId, nodeRef, identifier, startCount, countRequested); } else { diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java index 11d965fe..fe908c2d 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java @@ -3,6 +3,7 @@ import edu.ucsb.nceas.mdqengine.Controller; import edu.ucsb.nceas.mdqengine.MDQconfig; import edu.ucsb.nceas.mdqengine.DataONE; +import edu.ucsb.nceas.mdqengine.exception.MetadigException; import edu.ucsb.nceas.mdqengine.exception.MetadigProcessException; import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; import edu.ucsb.nceas.mdqengine.model.Task; @@ -16,11 +17,7 @@ import org.apache.http.client.methods.HttpPost; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; -import org.dataone.client.rest.DefaultHttpMultipartRestClient; -import org.dataone.client.rest.MultipartRestClient; -import org.dataone.client.v2.impl.MultipartCNode; import org.dataone.client.v2.impl.MultipartD1Node; -import org.dataone.client.v2.impl.MultipartMNode; import org.dataone.service.types.v1.*; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; @@ -33,8 +30,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** *

@@ -74,7 +69,7 @@ Integer getResultCount() {
}
// Since Quartz will re-instantiate a class every time it
- // gets executed, members non-static member variables can
+ // gets executed, non-static member variables can
// not be used to maintain state!
/**
@@ -146,13 +141,12 @@ public void execute(JobExecutionContext context)
try {
cfg = new MDQconfig();
qualityServiceUrl = cfg.getString("quality.serviceUrl");
- log.debug("nodeId from request: " + nodeId);
+ log.trace("nodeId from request: " + nodeId);
String nodeAbbr = nodeId.replace("urn:node:", "");
authToken = cfg.getString(nodeAbbr + ".authToken");
subjectId = cfg.getString(nodeAbbr + ".subjectId");
- // TODO: Cache the node values from the CN listNode service
nodeServiceUrl = cfg.getString(nodeAbbr + ".serviceUrl");
- log.debug("nodeServiceUrl: " + nodeServiceUrl);
+ log.trace("nodeServiceUrl: " + nodeServiceUrl);
} catch (ConfigurationException | IOException ce) {
JobExecutionException jee = new JobExecutionException("Error executing task.");
jee.initCause(ce);
@@ -271,12 +265,11 @@ public void execute(JobExecutionContext context)
throw jee;
}
} else {
- log.debug("Getting portal pids to process...");
+
Integer allIds = 0;
boolean morePids = true;
while (morePids) {
ArrayList pidsToProcess = null;
- log.debug("startCount: " + startCount);
- log.debug("countRequested:" + countRequested);
+ log.trace("Getting portal pids to process, startCount: " + startCount + ", countRequested: " + countRequested);
try {
result = getPidsToProcess(d1Node, session, pidFilter, startDTRstr, endDTRstr, startCount, countRequested);
@@ -288,7 +281,7 @@ public void execute(JobExecutionContext context)
throw jee;
}
- log.info("Found " + resultCount + " seriesIds" + " for date: " + startDTRstr + " at servierUrl: " + nodeServiceUrl);
+ log.trace(taskName + ": found " + resultCount + " seriesIds" + " for date: " + startDTRstr + " at serviceUrl: " + nodeServiceUrl);
for (String pidStr : pidsToProcess) {
try {
submitScorerRequest(qualityServiceUrl, pidStr, suiteId, nodeId, formatFamily);
@@ -304,15 +297,12 @@ public void execute(JobExecutionContext context)
if (resultCount >= countRequested) {
morePids = true;
startCount = startCount + resultCount;
- log.info("Paging through more results, current start is " + startCount);
+ log.trace("Paging through more results, current start is " + startCount);
} else {
morePids = false;
// Record the new "last harvested" date
task.setLastHarvestDatetime(endDTRstr);
- log.debug("taskName: " + task.getTaskName());
- log.debug("taskType: " + task.getTaskType());
- log.debug("lastharvestdate: " + task.getLastHarvestDatetime());
try {
store.saveTask(task);
@@ -360,7 +350,7 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session,
String queryStr = "?q=formatId:" + pidFilter + "+-obsoletedBy:*" + "+dateUploaded:[" + startHarvestDatetimeStr + "%20TO%20" + endHarvestDatetimeStr + "]" + "&fl=seriesId&q.op=AND";
- log.debug("query: " + queryStr);
+ log.trace("query: " + queryStr);
// Send the query to DataONE Solr to retrieve portal seriesIds for a given time frame
@@ -370,7 +360,7 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session,
int thisResultLength;
// Now setup the xpath to retrieve the ids returned from the collection query.
try { - log.debug("Compiling xpath for seriesId"); + log.trace("Compiling xpath for seriesId"); // Extract the collection query from the Solr result XML XPathFactory xPathfactory = XPathFactory.newInstance(); xpath = xPathfactory.newXPath(); @@ -384,9 +374,7 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, // Loop through the Solr result. As the result may be large, page through the results, accumulating // the pids returned into a ListResult object. - - //log.debug("Getting portal seriesIds from Solr using subjectId: " + subjectId + ", servicerUrl: " + serviceUrl); - log.debug("Getting portal seriesIds from Solr " ); + log.trace("Getting portal seriesIds from Solr " ); int startPos = startCount; do { @@ -408,13 +396,13 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, } String currentPid = null; thisResultLength = xpathResult.getLength(); - log.debug("Got " + thisResultLength + " pids this query"); + log.trace("Got " + thisResultLength + " pids this query"); if(thisResultLength == 0) break; for (int index = 0; index < xpathResult.getLength(); index++) { node = xpathResult.item(index); currentPid = node.getTextContent(); pids.add(currentPid); - log.debug("adding pid: " + currentPid); + log.trace("adding pid: " + currentPid); } startPos += thisResultLength; @@ -453,7 +441,6 @@ public void submitScorerRequest(String qualityServiceUrl, String collectionId, S // send to service log.debug("submitting scores request : " + scorerServiceUrl); - //post.setEntity((HttpEntity) entity); CloseableHttpClient client = HttpClients.createDefault(); CloseableHttpResponse response = client.execute(post); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index e61bbfcd..23ea5697 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -99,14 +99,6 @@ void setResult(ArrayList result) { ArrayList getResult() { return this.result; } - -// void setResultCount(Integer count) { -// this.resultCount = count; -// } -// -// Integer getResultCount() { -// return this.resultCount; -// } } public static void main(String[] argv) throws Exception { diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java index 3fcca606..9958136c 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java @@ -48,7 +48,7 @@ public class DatabaseStore implements MDQStore { private DataSource dataSource = null; public DatabaseStore () throws MetadigStoreException { - log.debug("Initializing a new DatabaseStore to " + dbUrl + "."); + log.trace("Initializing a new DatabaseStore to " + dbUrl + "."); this.init(); } @@ -57,7 +57,7 @@ public DatabaseStore () throws MetadigStoreException { */ private void init() throws MetadigStoreException { - log.debug("initializing connection"); + log.trace("initializing connection"); String additionalDir = null; try { MDQconfig cfg = new MDQconfig(); @@ -90,7 +90,7 @@ private void init() throws MetadigStoreException { throw(mse); } - log.debug("Connection initialized"); + log.trace("Connection initialized"); PathMatchingResourcePatternResolver resolver = new PathMatchingResourcePatternResolver(); @@ -111,7 +111,6 @@ private void init() throws MetadigStoreException { Suite suite = null; try { URL url = resource.getURL(); - 
//log.debug("Loading suite found at: " + url.toString()); String xml = IOUtils.toString(url.openStream(), "UTF-8"); suite = (Suite) XmlMarshaller.fromXml(xml, Suite.class); } catch (JAXBException | IOException | SAXException e) { @@ -123,7 +122,7 @@ private void init() throws MetadigStoreException { } } if(this.isAvailable()) { - log.debug("Initialized database store: opened database successfully"); + log.trace("Initialized database store: opened database successfully"); } else { throw new MetadigStoreException("Error initializing database, connection not available"); } @@ -153,13 +152,13 @@ public Run getRun(String metadataId, String suiteId) throws MetadigStoreExceptio MetadigStoreException me = new MetadigStoreException("Unable get quality report to the datdabase."); // Select records from the 'runs' table try { - log.debug("preparing statement for query"); + log.trace("preparing statement for query"); String sql = "select * from runs where metadata_id = ? and suite_id = ?"; stmt = conn.prepareStatement(sql); stmt.setString(1, metadataId); stmt.setString(2, suiteId); - log.debug("issuing query: " + sql); + log.trace("issuing query: " + sql); ResultSet rs = stmt.executeQuery(); if(rs.next()) { mId = rs.getString("metadata_id"); @@ -176,9 +175,9 @@ public Run getRun(String metadataId, String suiteId) throws MetadigStoreExceptio // have to be manually added after the JAXB marshalling has created the run object. run.setSequenceId(seqId); run.setIsLatest(isLatest); - log.debug("Retrieved run successfully for metadata id: " + run.getObjectIdentifier()); + log.trace("Retrieved run successfully for metadata id: " + run.getObjectIdentifier()); } else { - log.debug("Run not found for metadata id: " + metadataId + ", suiteId: " + suiteId); + log.trace("Run not found for metadata id: " + metadataId + ", suiteId: " + suiteId); } } catch ( Exception e ) { log.error( e.getClass().getName()+": "+ e.getMessage()); @@ -210,8 +209,6 @@ public void saveRun(Run run) throws MetadigStoreException { String sequenceId = run.getSequenceId(); Boolean isLatest = run.getIsLatest(); String resultStr = null; - //DateTime now = new DateTime(); - //OffsetDateTime dateTime = OffsetDateTime.now(); Timestamp dateTime = Timestamp.from(Instant.now()); run.setTimestamp(dateTime); @@ -288,7 +285,7 @@ public void saveRun(Run run) throws MetadigStoreException { } // Next, insert a record into the child table ('runs') - log.debug("Records created successfully"); + log.trace("Records created successfully"); } /* @@ -296,7 +293,7 @@ public void saveRun(Run run) throws MetadigStoreException { */ public boolean isAvailable() { boolean reachable = false; - log.debug("Checking if store (i.e. sql connection) is available."); + log.trace("Checking if store (i.e. 
sql connection) is available."); try { reachable = conn.isValid(10); } catch (Exception e ) { @@ -310,7 +307,7 @@ public boolean isAvailable() { */ public void renew() throws MetadigStoreException { if(!this.isAvailable()) { - log.debug("Renewing connection to database"); + log.trace("Renewing connection to database"); this.init(); } } @@ -319,7 +316,7 @@ public void shutdown() { try { conn.close(); - log.debug("Successfully closed database"); + log.trace("Successfully closed database"); } catch ( java.sql.SQLException e) { log.error("Error closing database: " + e.getMessage()); } @@ -355,7 +352,7 @@ public void saveTask(Task task) throws MetadigStoreException { } // Next, insert a record into the child table ('runs') - log.debug("Records created successfully"); + log.trace("Records created successfully"); } public Task getTask(String taskName, String taskType) { @@ -368,13 +365,13 @@ public Task getTask(String taskName, String taskType) { // Select records from the 'nodes' table try { - log.debug("preparing statement for query"); + log.trace("preparing statement for query"); String sql = "select * from tasks where task_name = ? and task_type = ?"; stmt = conn.prepareStatement(sql); stmt.setString(1, taskName); stmt.setString(2, taskType); - log.debug("issuing query: " + sql); + log.trace("issuing query: " + sql); ResultSet rs = stmt.executeQuery(); if(rs.next()) { task.setTaskName(rs.getString("task_name")); @@ -383,7 +380,7 @@ public Task getTask(String taskName, String taskType) { rs.close(); stmt.close(); } else { - log.debug("No results returned from query"); + log.trace("No results returned from query"); } } catch ( Exception e ) { log.error( e.getClass().getName()+": "+ e.getMessage()); diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java index c573803d..b9796c29 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java @@ -7,33 +7,30 @@ public interface MDQStore { - public Collection listSuites(); - public Suite getSuite(String id); - public void createSuite(Suite suite); - public void updateSuite(Suite suite); - public void deleteSuite(Suite suite); - - public Collection listChecks(); - public Check getCheck(String id); - public void createCheck(Check check); - public void updateCheck(Check check); - public void deleteCheck(Check check); + Collection listSuites(); + Suite getSuite(String id); + void createSuite(Suite suite); + void updateSuite(Suite suite); + void deleteSuite(Suite suite); + + Collection listChecks(); + Check getCheck(String id); + void createCheck(Check check); + void updateCheck(Check check); + void deleteCheck(Check check); - public Collection listRuns(); - public Run getRun(String suite, String id ) throws MetadigStoreException; - public void saveRun(Run run) throws MetadigStoreException; - public void createRun(Run run); - public void deleteRun(Run run); - - public void shutdown(); - - public boolean isAvailable(); - public void renew() throws MetadigStoreException; -// -// public Node getNode(String nodeId, String jobName); -// public void saveNode(Node node) throws MetadigStoreException; - - public Task getTask(String taskName, String taskType); - public void saveTask(Task task) throws MetadigStoreException; + Collection listRuns(); + Run getRun(String suite, String id ) throws MetadigStoreException; + void saveRun(Run run) throws MetadigStoreException; + void createRun(Run run); + void deleteRun(Run 
run); + + void shutdown(); + + boolean isAvailable(); + void renew() throws MetadigStoreException; + + Task getTask(String taskName, String taskType); + void saveTask(Task task) throws MetadigStoreException; } From ef01e2639accf142080b7cb8559b667ec21d1c76 Mon Sep 17 00:00:00 2001 From: gothub Date: Wed, 19 Aug 2020 13:50:54 -0700 Subject: [PATCH 40/47] CN harvesting is missing some pids bug (#267) --- .../mdqengine/scheduler/RequestReportJob.java | 177 +++++++++--------- 1 file changed, 86 insertions(+), 91 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index 19fdc7ea..43ebc9e0 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -37,8 +37,6 @@ import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.Date; -import java.util.regex.Matcher; -import java.util.regex.Pattern; /** *

@@ -56,37 +54,44 @@ public class RequestReportJob implements Job {
private Log log = LogFactory.getLog(RequestReportJob.class);
class ListResult {
- // The total result count returned from DataONE
- Integer totalResultCount;
+ // The total result count for all object types returned from DataONE. This is the count of all object types
+ // that were retrieved for a given request. The DataONE 'listObjects' service does not provide
+ // parameters to filter by formatId wildcard, so we have to retrieve all pids for a time range
+ // and filter the result list.
+ private Integer totalResultCount = 0;
// The filtered result count returned from DataONE.
// The DataONE listObjects service returns all new pids for all formatIds
// but we are typically only interested in a subset of those, i.e. EML metadata pids,
// so this is the count of pids from the result that we are actually interested in.
- Integer filteredResultCount;
- ArrayList result = new ArrayList<>();
+ private Integer filteredResultCount = 0;
+ private ArrayList result = new ArrayList<>();
+
+ // The scheduler keeps track of the sysmeta 'dateSystemMetadataModified' of the last pid harvested,
+ // which will be used as the starting time of the next harvest.
+ private DateTime lastDateModifiedDT = null;
void setResult(ArrayList result) {
this.result = result;
}
- ArrayList getResult() {
+ public ArrayList getResult() {
return this.result;
}
void setTotalResultCount(Integer count) {
this.totalResultCount = count;
}
- void setFilteredResultCount(Integer count) {
- this.filteredResultCount = count;
+ void setFilteredResultCount(Integer count) { this.filteredResultCount = count; }
+ void setLastDateModified(DateTime date) {
+ log.debug("Setter last modified date, date: " + date.toString());
+ this.lastDateModifiedDT = date;
}
- Integer getTotalResultCount() {
- return this.totalResultCount;
- }
+ public Integer getTotalResultCount() { return this.totalResultCount; }
- Integer getFilteredResultCount() {
- return this.filteredResultCount;
- }
+ public Integer getFilteredResultCount() { return this.filteredResultCount; }
+
+ public DateTime getLastDateModified() { return this.lastDateModifiedDT; }
}
// Since Quartz will re-instantiate a class every time it
- // gets executed, members non-static member variables can
+ // gets executed, non-static member variables can
// not be used to maintain state!
/**
@@ -198,7 +203,7 @@ public void execute(JobExecutionContext context)
// Get current datetime, which may be used for start time range.
DateTimeZone.setDefault(DateTimeZone.UTC);
DateTime currentDT = new DateTime(DateTimeZone.UTC);
- DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SS'Z'");
+ DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
String currentDatetimeStr = dtfOut.print(currentDT);
DateTime startDateTimeRange = null;
DateTime endDateTimeRange = null;
@@ -219,58 +224,63 @@ public void execute(JobExecutionContext context)
lastHarvestDateStr = task.getLastHarvestDatetime();
}
- DateTime lastHarvestDate = new DateTime(lastHarvestDateStr);
+ DateTime lastHarvestDateDT = new DateTime(lastHarvestDateStr);
// Set the search start datetime to the last harvest datetime, unless it is in the
// future. (This can happen when the previous time range end was for the current day,
// as the end datetime range for the previous task run will have been stored as the
// new lastharvestDateTime.)
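The comment above describes clamping the start of the next harvest window when the stored last-harvest datetime has drifted into the future. A minimal standalone Joda-Time sketch of that clamping, under the same assumptions as the patch (method and variable names are illustrative):

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;

public class HarvestWindow {
    // Clamp the start of the next harvest window to 'now' when the stored
    // last-harvest datetime is in the future (the previous run saved its
    // end-of-range, which may have been the current instant).
    static DateTime clampStart(DateTime lastHarvest, DateTime now) {
        return lastHarvest.isAfter(now.toInstant()) ? now : new DateTime(lastHarvest);
    }

    public static void main(String[] args) {
        DateTime now = new DateTime(DateTimeZone.UTC);
        System.out.println(clampStart(now.plusMinutes(5), now)); // clamped to 'now'
        System.out.println(clampStart(now.minusDays(1), now));   // unchanged
    }
}

The diff lines that follow show the same decision being made inline in execute().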
- DateTime startDTR = null; - if(lastHarvestDate.isAfter(currentDT.toInstant())) { - startDTR = currentDT; + DateTime startDT = null; + if(lastHarvestDateDT.isAfter(currentDT.toInstant())) { + startDT = currentDT; } else { - startDTR = new DateTime(lastHarvestDate); + startDT = new DateTime(lastHarvestDateDT); } - DateTime endDTR = new DateTime(startDTR); - endDTR = endDTR.plusDays(harvestDatetimeInc); - if(endDTR.isAfter(currentDT.toInstant())) { - endDTR = currentDT; + DateTime endDT = new DateTime(startDT); + endDT = endDT.plusDays(harvestDatetimeInc); + if(endDT.isAfter(currentDT.toInstant())) { + endDT = currentDT; } - // If the start and end harvest dates are the same (happends for a new node), then - // tweek the start so that DataONE listObjects doesn't complain. - if(startDTR == endDTR ) { - startDTR = startDTR.minusMinutes(1); + // If the start and end harvest dates are the same (happens for a new node), then + // tweak the start so that DataONE listObjects doesn't complain. + if(startDT == endDT ) { + startDT = startDT.minusMinutes(1); + log.debug("Reset start back 1 minute to: " + startDT); } - String startDTRstr = dtfOut.print(startDTR); - String endDTRstr = dtfOut.print(endDTR); + // Track the sysmeta dateUploaded of the latest harvested pid. This will become the starting time of + // the next harvest. + DateTime lastDateModifiedDT = startDT; + + String startDTstr = dtfOut.print(startDT); + String endDTstr = dtfOut.print(endDT); Integer startCount = new Integer(0); ListResult result = null; - Integer totalResultCount = null; - Integer filteredResultCount = null; + Integer totalResultCount = 0; + Integer filteredResultCount = 0; + Integer allPidsCnt = 0; boolean morePids = true; while(morePids) { ArrayList pidsToProcess = null; - log.info("Getting pids for node: " + nodeId + ", suiteId: " + suiteId + ", harvest start: " + startDTRstr); - try { - result = getPidsToProcess(cnNode, mnNode, isCN, session, suiteId, nodeId, pidFilter, startDTRstr, endDTRstr, startCount, countRequested); + result = getPidsToProcess(cnNode, mnNode, isCN, session, suiteId, nodeId, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT); pidsToProcess = result.getResult(); totalResultCount = result.getTotalResultCount(); filteredResultCount = result.getFilteredResultCount(); + lastDateModifiedDT = result.getLastDateModified(); } catch (Exception e) { JobExecutionException jee = new JobExecutionException("Unable to get pids to process", e); jee.setRefireImmediately(false); throw jee; } - log.info("Found " + filteredResultCount + " pids" + " for node: " + nodeId); + allPidsCnt = pidsToProcess.size(); for (String pidStr : pidsToProcess) { try { - log.info("submitting pid: " + pidStr); + log.debug("submitting pid: " + pidStr); submitReportRequest(cnNode, mnNode, isCN, session, qualityServiceUrl, pidStr, suiteId); } catch (org.dataone.service.exceptions.NotFound nfe) { log.error("Unable to process pid: " + pidStr + nfe.getMessage()); @@ -278,16 +288,24 @@ public void execute(JobExecutionContext context) } catch (Exception e) { log.error("Unable to process pid: " + pidStr + " - " + e.getMessage()); continue; - //JobExecutionException jee = new JobExecutionException("Unable to submit request to create new quality reports", e); - //jee.setRefireImmediately(false); - //throw jee; } } - task.setLastHarvestDatetime(endDTRstr); - log.debug("taskName: " + task.getTaskName()); - log.debug("taskType: " + task.getTaskType()); - log.debug("lastharvestdate: " + task.getLastHarvestDatetime()); + 
// Check if DataONE returned the max number of results. If so, we have to request more by paging through
+ // the results returned pidsToProcess (i.e. DataONE listObjects service). If the returned result is
+ // less than the requested result, then all pids have been retrieved.
+ if(totalResultCount >= countRequested) {
+ morePids = true;
+ startCount = startCount + totalResultCount;
+ log.trace("Paging through more results, current start is " + startCount);
+ } else {
+ morePids = false;
+ }
+ }
+ // Don't update the lastHarvestDateDT if no pids were found.
+ if (allPidsCnt > 0) {
+ task.setLastHarvestDatetime(dtfOut.print(lastDateModifiedDT));
+ log.debug("Saving lastHarvestDate: " + dtfOut.print(lastDateModifiedDT));
try {
store.saveTask(task);
} catch (MetadigStoreException mse) {
@@ -296,24 +314,15 @@ public void execute(JobExecutionContext context)
jee.setRefireImmediately(false);
throw jee;
}
-
- // Check if DataONE returned the max number of results. If so, we have to request more by paging through
- // the results returned pidsToProcess (i.e. DataONE listObjects service).
- if(totalResultCount >= countRequested) {
- morePids = true;
- startCount = startCount + totalResultCount;
- log.info("Paging through more results, current start is " + startCount);
- } else {
- morePids = false;
- }
}
+ log.info(taskName + ": Found " + allPidsCnt + " pids for start: " + startDTstr + ", end: " + endDTstr + " at serviceUrl: " + nodeServiceUrl);
store.shutdown();
}
public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, Boolean isCN, Session session, String suiteId, String nodeId, String pidFilter, String startHarvestDatetimeStr, String endHarvestDatetimeStr, int startCount,
- int countRequested) throws Exception {
+ int countRequested, DateTime lastDateModifiedDT) throws Exception {
ArrayList pids = new ArrayList();
InputStream qis = null;
@@ -353,15 +362,16 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
String thisFormatId = null;
String thisPid = null;
int pidCount = 0;
+ Date thisDateModified;
if (objList.getCount() > 0) {
for(ObjectInfo oi: objList.getObjectInfoList()) {
thisFormatId = oi.getFormatId().getValue();
thisPid = oi.getIdentifier().getValue();
- log.debug("Checking pid: " + thisPid + ", format: " + thisFormatId);
+ log.trace("Checking pid: " + thisPid + ", format: " + thisFormatId);
- // Check all pid filters. There could be multiple wildcard filters, which are separated
- // by ','.
+ // Check all pid filters to see if this pid's format was found in the list of desired formats.
+ // There could be multiple wildcard filters, which are separated by '|'.
String [] filters = pidFilter.split("\\|");
Boolean found = false;
for(String thisFilter:filters) {
@@ -378,7 +388,16 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
// if (!runExists(thisPid, suiteId, store)) {
pidCount++;
pids.add(thisPid);
- log.info("adding pid " + thisPid + ", formatId: " + thisFormatId);
+ log.trace("adding pid " + thisPid + ", formatId: " + thisFormatId);
+ // If this pid's modified date is after the stored latest encountered modified date, then update
+ // the lastModified date
+ DateTime thisDateModifiedDT = new DateTime(oi.getDateSysMetadataModified());
+ // Add a millisecond to lastDateModifiedDT so that this pid won't be harvested again (in the event
+ // that this is the last pid to be harvested in this round).
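The filter check in the hunk above splits the pidFilter on '|' and tests each pattern against the object's formatId. A standalone sketch of that matching; the filter strings shown are examples only, since the patch does not include the production values:

import java.util.Arrays;

public class FormatIdFilter {
    // Return true if 'formatId' matches any of the '|'-separated regex
    // patterns in 'pidFilter'.
    static boolean matchesAny(String formatId, String pidFilter) {
        return Arrays.stream(pidFilter.split("\\|"))
                     .anyMatch(formatId::matches);
    }

    public static void main(String[] args) {
        String filter = "^eml.*|^http.*eml.*"; // illustrative patterns
        System.out.println(matchesAny("eml://ecoinformatics.org/eml-2.1.1", filter)); // true
        System.out.println(matchesAny("http://www.isotc211.org/2005/gmd", filter));  // false
    }
}

The millisecond bump described just above, and implemented in the diff lines that follow, is what keeps the watermark strictly ahead of the last harvested pid.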
+ if (thisDateModifiedDT.isAfter(lastDateModifiedDT)) {
+ lastDateModifiedDT = thisDateModifiedDT.plusMillis(1) ;
+ log.debug("Updated lastDateModified: " + lastDateModifiedDT.toString());
+ }
// }
}
}
@@ -390,6 +409,8 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
// Set the count for the total number of pids returned from DataONE (all formatIds) for this query
result.setTotalResultCount(objList.getCount());
result.setResult(pids);
+ // Return the sysmeta 'dateSystemMetadataModified' of the last pid harvested.
+ result.setLastDateModified(lastDateModifiedDT);
return result;
}
@@ -445,45 +466,19 @@ public void submitReportRequest(MultipartCNode cnNode, MultipartMNode mnNode, Bo
} else {
objectIS = mnNode.get(session, pid);
}
- log.debug("Retrieved metadata object for pid: " + pidStr);
+ log.trace("Retrieved metadata object for pid: " + pidStr);
} catch (NotAuthorized na) {
- log.error("Not authorized to read pid: " + pid + ", continuing with next pid...");
+ log.error("Not authorized to read pid: " + pid + ", unable to retrieve metadata, continuing with next pid...");
return;
- } catch (Exception e) {
- throw(e);
}
// quality suite service url, i.e. "http://docke-ucsb-1.dataone.org:30433/quality/suites/knb.suite.1/run
qualityServiceUrl = qualityServiceUrl + "/suites/" + suiteId + "/run";
HttpPost post = new HttpPost(qualityServiceUrl);
- try {
- // add document
- SimpleMultipartEntity entity = new SimpleMultipartEntity();
- entity.addFilePart("document", objectIS);
-
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- TypeMarshaller.marshalTypeToOutputStream(sysmeta, baos);
- entity.addFilePart("systemMetadata", new ByteArrayInputStream(baos.toByteArray()));
-
- // make sure we get XML back
- post.addHeader("Accept", "application/xml");
-
- // send to service
- log.trace("submitting: " + qualityServiceUrl);
- post.setEntity((HttpEntity) entity);
- CloseableHttpClient client = HttpClients.createDefault();
- CloseableHttpResponse response = client.execute(post);
-
- // retrieve results
- HttpEntity reponseEntity = response.getEntity();
- if (reponseEntity != null) {
- runResultIS = reponseEntity.getContent();
- }
- } catch (Exception e) {
- throw(e);
- }
- }
+ // add document
+ SimpleMultipartEntity entity = new SimpleMultipartEntity();
+ entity.addFilePart("document", objectIS);
private Boolean isCN(String serviceUrl) {

From 7e557579f60961b82224b27480bd97eb90593b88 Mon Sep 17 00:00:00 2001
From: gothub
Date: Wed, 19 Aug 2020 13:52:20 -0700
Subject: [PATCH 41/47] Detect D1 client connection type (CN or MN) (#265)
--- .../mdqengine/scheduler/RequestReportJob.java | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index 43ebc9e0..3900ac12 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java
@@ -173,7 +173,7 @@ public void execute(JobExecutionContext context)
Session session = DataONE.getSession(subjectId, authToken);
// Don't know node type yet from the id, so have to manually check if it's a CN
- Boolean isCN = isCN(nodeServiceUrl);
+ Boolean isCN = DataONE.isCN(nodeServiceUrl);
if(isCN) {
cnNode = new MultipartCNode(mrc, nodeServiceUrl, session);
} else {
mnNode = new MultipartMNode(mrc, nodeServiceUrl, session);
}
@@ -418,6 +418,7 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
public
boolean runExists(String pid, String suiteId, MDQStore store) throws MetadigStoreException {
boolean found = false;
+ Date runDateSystemMetadataModified = null;
if(!store.isAvailable()) {
try {
@@ -480,21 +481,23 @@ public void submitReportRequest(MultipartCNode cnNode, MultipartMNode mnNode, Bo
SimpleMultipartEntity entity = new SimpleMultipartEntity();
entity.addFilePart("document", objectIS);
- private Boolean isCN(String serviceUrl) {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ TypeMarshaller.marshalTypeToOutputStream(sysmeta, baos);
+ entity.addFilePart("systemMetadata", new ByteArrayInputStream(baos.toByteArray()));
- Boolean isCN = false;
- // Identity node as either a CN or MN based on the serviceUrl
- String pattern = "https*://cn.*?\\.dataone\\.org|https*://cn.*?\\.test\\.dataone\\.org";
- Pattern r = Pattern.compile(pattern);
- Matcher m = r.matcher(serviceUrl);
- if (m.find()) {
- isCN = true;
- log.debug("service URL is for a CN: " + serviceUrl);
- } else {
- log.debug("service URL is not for a CN: " + serviceUrl);
- isCN = false;
- }
+ // make sure we get XML back
+ post.addHeader("Accept", "application/xml");
- return isCN;
+ // send to service
+ log.trace("submitting: " + qualityServiceUrl);
+ post.setEntity((HttpEntity) entity);
+ CloseableHttpClient client = HttpClients.createDefault();
+ CloseableHttpResponse response = client.execute(post);
+
+ // retrieve results
+ HttpEntity responseEntity = response.getEntity();
+ if (responseEntity != null) {
+ runResultIS = responseEntity.getContent();
+ }
}
}

From 594f4b8709a276e55325a7b34c1a532d29e98fc6 Mon Sep 17 00:00:00 2001
From: gothub
Date: Wed, 19 Aug 2020 13:53:33 -0700
Subject: [PATCH 42/47] Reuse CN clients when possible (#264)
--- .../mdqengine/scheduler/RequestScorerJob.java | 46 ++++++------------- 1 file changed, 13 insertions(+), 33 deletions(-)

diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java index fe908c2d..7c099f31 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java
@@ -103,9 +103,6 @@ public void execute(JobExecutionContext context)
throws JobExecutionException {
String qualityServiceUrl = null;
- String CNsubjectId = null;
- String CNauthToken = null;
- String CNserviceUrl = null;
MDQconfig cfg = null;
JobKey key = context.getJobDetail().getKey();
@@ -123,20 +120,17 @@ public void execute(JobExecutionContext context)
// Number of pids to get each query (this number of pids will be fetched each query until all pids are obtained)
int countRequested = dataMap.getInt("countRequested");
String requestType = null;
- if (taskType.equalsIgnoreCase("score")) {
- requestType = dataMap.getString("requestType");
- }
- // TODO: add formatFamily to scheduler request
String formatFamily = null;
- MultipartRestClient mrc = null;
- MultipartMNode mnNode = null;
- MultipartCNode cnNode = null;
-
+ MultipartD1Node d1Node = null;
String authToken = null;
String subjectId = null;
String nodeServiceUrl = null;
- log.info("Executing task: " + taskName + ", taskType: " + taskType);
+ if (taskType.equalsIgnoreCase("score")) {
+ requestType = dataMap.getString("requestType");
+ }
+
+ log.info("Executing task " + taskType + ", " + taskName + " for node: " + nodeId + ", suiteId: " + suiteId);
try {
cfg = new MDQconfig();
qualityServiceUrl = cfg.getString("quality.serviceUrl");
@@ -153,33 +147,19 @@ public void execute(JobExecutionContext context)
throw jee;
}
try
{ - mrc = new DefaultHttpMultipartRestClient(); - } catch (Exception e) { - log.error("Error creating rest client: " + e.getMessage()); - JobExecutionException jee = new JobExecutionException(e); - jee.setRefireImmediately(false); - throw jee; - } - Session session = DataONE.getSession(subjectId, authToken); - // Don't know node type yet from the id, so have to manually check if it's a CN - Boolean isCN = DataONE.isCN(nodeServiceUrl); - - MultipartD1Node d1Node = null; - if(isCN) { - //cnNode = new MultipartCNode(mrc, nodeServiceUrl, session); - d1Node = new MultipartCNode(mrc, nodeServiceUrl, session); - log.debug("Created cnNode for serviceUrl: " + nodeServiceUrl); - } else { - //mnNode = new MultipartMNode(mrc, nodeServiceUrl, session); - d1Node = new MultipartMNode(mrc, nodeServiceUrl, session); - log.debug("Created mnNode for serviceUrl: " + nodeServiceUrl); + // Get a connection to the DataONE node (CN or MN) + try { + d1Node = DataONE.getMultipartD1Node(session, nodeServiceUrl); + } catch (MetadigException mpe) { + mpe.printStackTrace(); + throw new JobExecutionException(taskName + ": unable to create connection to service URL " + nodeServiceUrl , mpe); } MDQStore store = null; + // Get stored task info from the last task execution try { store = new DatabaseStore(); } catch (Exception e) { From bc176ed3cb7933942cafd3c987d9b55fe2509b27 Mon Sep 17 00:00:00 2001 From: gothub Date: Wed, 19 Aug 2020 13:55:51 -0700 Subject: [PATCH 43/47] Reuse CN clients when possible (#264) --- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 72 ++++++------------- 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java index 23ea5697..fede5a0f 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java @@ -21,11 +21,9 @@ import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.util.ClientUtils; -import org.dataone.client.rest.DefaultHttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; -import org.dataone.client.v2.impl.MultipartD1Node; // Don't include org.dataone.client.rest.MultipartD1Node (this is what IDEA selects) -import org.dataone.client.v2.impl.MultipartMNode; +import org.dataone.client.v2.impl.MultipartD1Node; import org.dataone.service.types.v1.Session; import org.dataone.service.types.v1.Subject; import org.dataone.service.types.v1.Group; @@ -148,9 +146,8 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp String nodeServiceUrl = null; String label = null; String title = null; - MultipartRestClient mrc = null; - MultipartMNode mnNode = null; - MultipartCNode cnNode = null; + //MultipartRestClient mrc = null; + MultipartD1Node d1Node = null; GraphType graphType = null; //long startTime = System.nanoTime(); @@ -201,9 +198,6 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp // Pids associated with a collection, based on query results using 'collectionQuery' field in solr. ArrayList collectionPids = null; - // The harvesting and evaluation of the collectionQuery is based on the nodeId that is passed in, i.e. 
- // If an MN is specified, then the collection (portal) Solr entry will be obtained from the MN, and the
- // collectionQuery string will also be evaluated on that node.
String nodeAbbr = nodeId.replace("urn:node:", "");
authToken = cfg.getString(nodeAbbr + ".authToken");
subjectId = cfg.getString(nodeAbbr + ".subjectId");
@@ -211,45 +205,20 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp
nodeServiceUrl = cfg.getString(nodeAbbr + ".serviceUrl");
HashMap variables = new HashMap<>();
- // Create the graph.
- // Two types of graphs are currently supported:
- // - a graph for all pids included in a DataONE collection (portal), and a specified suite id
- // - a graph for specified filters: member node, suite id, metadata format
+
MetadigFile mdFile = new MetadigFile();
Graph graph = new Graph();
- // If creating a graph for a collection, get the set of pids associated with the collection.
- // Only scores for these pids will be included in the graph.
-
- try {
- mrc = new DefaultHttpMultipartRestClient();
- } catch (Exception e) {
- log.error("Error creating rest client: " + e.getMessage());
- JobExecutionException jee = new JobExecutionException(e);
- jee.setRefireImmediately(false);
- throw jee;
- }
-
Session session = DataONE.getSession(subjectId, authToken);
- // Don't know node type yet from the id, so have to manually check if it's a CN
- Boolean isCN = DataONE.isCN(nodeServiceUrl);
+ d1Node = DataONE.getMultipartD1Node(session, nodeServiceUrl);
- MultipartD1Node d1Node = null;
- if(isCN) {
- //cnNode = new MultipartCNode(mrc, nodeServiceUrl, session);
- d1Node = new MultipartCNode(mrc, nodeServiceUrl, session);
- log.debug("Created cnNode for serviceUrl: " + nodeServiceUrl);
- } else {
- //mnNode = new MultipartMNode(mrc, nodeServiceUrl, session);
- d1Node = new MultipartMNode(mrc, nodeServiceUrl, session);
- log.debug("Created mnNode for serviceUrl: " + nodeServiceUrl);
- }
-
- // Check if this is a "node" collection. For "node" collections, all scores for a member node
- // are used to create the assessment graph, so we don't need to get the collection pids as is
- // done for portals (by evaluating the Solr collectionQuery). Therefor, getCollectionPids doesn't
- // need to be called and we can proceed directly to getting the quality scores from the quality
- // Solr server.
+ // Quality scores must be retrieved from the quality Solr server from which a graph is created.
+ // There are two types of collections: "node" collections and portal collections.
+ // Check if this is a "node" collection. For "node" collections, all scores from the quality
+ // Solr server with 'datasource' = nodeId are used to create the assessment graph, so we don't need
+ // to get the collection pids, as is done for portals (by evaluating the DataONE Solr collectionQuery).
+ // Therefore, for a "node" collection, getCollectionPids doesn't need to be called and we can proceed directly
+ // to getting the quality scores from the quality Solr server, as sketched below.
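The node-versus-portal branching this comment block describes keys off the collection id itself. A minimal standalone sketch of that check (class and method names are illustrative, not part of the patch):

import java.util.regex.Pattern;

public class CollectionKind {
    private static final Pattern NODE_ID = Pattern.compile("^\\s*urn:node:.*");

    // A 'node' collection is addressed by a DataONE node identifier such as
    // "urn:node:ARCTIC"; anything else is treated as a portal (collection)
    // pid whose member pids come from evaluating its Solr collectionQuery.
    static boolean isNodeCollection(String collectionId) {
        return NODE_ID.matcher(collectionId).matches();
    }

    public static void main(String[] args) {
        System.out.println(isNodeCollection("urn:node:ARCTIC")); // true -> skip getCollectionPids
        System.out.println(isNodeCollection("urn:uuid:1234"));   // false -> evaluate collectionQuery
    }
}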
if (collectionId.matches("^\\s*urn:node:.*")) { graphType = GraphType.CUMULATIVE; log.debug("Processing a member node request, skipping step of getting collection pids (not required)."); @@ -290,6 +259,7 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp log.info("# of quality scores returned: " + scores.size()); } + // Create the data file used by the graphing method File scoreFile = gfr.createScoreFile(scores); log.debug("Created score file: " + scoreFile.getPath()); @@ -304,13 +274,11 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp // Generate a temporary graph file based on the quality scores log.debug("Creating graph for collection id: " + collectionId); - //String filePath = graph.create(GraphType.CUMULATIVE, title, scoreFile.getPath()); String filePath = graph.create(graphType, title, scoreFile.getPath()); + // Now save the graphics file to permanent storage String outfile; - DateTime createDateTime = DateTime.now(); - mdFile.setCreationDatetime(createDateTime); mdFile.setPid(collectionId); mdFile.setSuiteId(suiteId); @@ -425,17 +393,17 @@ which will be used to query DataONE Solr for all the pids associated with that p org.w3c.dom.Node node = null; String label = null; String rightsHolder = null; - MultipartRestClient mrc = null; - MultipartCNode CNnode = null; + //MultipartRestClient mrc = null; + MultipartCNode cnNode = null; Session CNsession = null; try { CNsession = DataONE.getSession(CNsubjectId, CNauthToken); - // // Only CNs can call the 'subjectInfo' service (aka accounts), so we have to use + // Only CNs can call the 'subjectInfo' service (aka accounts), so we have to use // a MultipartCNode instance here. try { - CNnode = (MultipartCNode) DataONE.getMultipartD1Node(CNsession, CNserviceUrl); + cnNode = (MultipartCNode) DataONE.getMultipartD1Node(CNsession, CNserviceUrl); } catch (Exception ex) { metadigException = new MetadigProcessException("Unable to create multipart D1 node: " + ex.getMessage()); metadigException.initCause(ex); @@ -523,7 +491,7 @@ which will be used to query DataONE Solr for all the pids associated with that p subject.setValue(rightsHolder); // The subject info can only be obtained from a CN, so use the CN auth info for the current DataONE environment, // which should be configured in the metadig.properties file - SubjectInfo subjectInfo = DataONE.getSubjectInfo(subject, CNnode, CNsession); + SubjectInfo subjectInfo = DataONE.getSubjectInfo(subject, cnNode, CNsession); String groupStr = null; groupStr = "(readPermission:" + "\"" + rightsHolder @@ -584,7 +552,7 @@ which will be used to query DataONE Solr for all the pids associated with that p do { //TODO: check that a result was returned // Note: the collectionQuery is always evaluated on the CN, so that the entire DataONE network is queried. 
- xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, CNnode, CNsession); + xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, cnNode, CNsession); if(xmldoc == null) { log.info("no values returned from query"); break; From d1f5a97d271ee2c0a666f69dfc8af81030679d78 Mon Sep 17 00:00:00 2001 From: gothub Date: Thu, 20 Aug 2020 11:16:46 -0700 Subject: [PATCH 44/47] CN harvesting is missing some pids (#267) --- .../mdqengine/scheduler/RequestReportJob.java | 29 ++-- .../mdqengine/scheduler/RequestScorerJob.java | 129 +++++++++++++----- 2 files changed, 107 insertions(+), 51 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index 3900ac12..22540674 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -83,7 +83,6 @@ void setTotalResultCount(Integer count) { } void setFilteredResultCount(Integer count) { this.filteredResultCount = count; } void setLastDateModified(DateTime date) { - log.debug("Setter last modified date, date: " + date.toString()); this.lastDateModifiedDT = date; } @@ -91,11 +90,13 @@ void setLastDateModified(DateTime date) { public Integer getFilteredResultCount() { return this.filteredResultCount; } - public DateTime getLastDateModified() { return this.lastDateModifiedDT; } + public DateTime getLastDateModified() { + return this.lastDateModifiedDT; + } } // Since Quartz will re-instantiate a class every time it - // gets executed, members non-static member variables can + // gets executed, non-static member variables can // not be used to maintain state! /** @@ -236,17 +237,17 @@ public void execute(JobExecutionContext context) startDT = new DateTime(lastHarvestDateDT); } - DateTime endDT = new DateTime(startDT); - endDT = endDT.plusDays(harvestDatetimeInc); - if(endDT.isAfter(currentDT.toInstant())) { - endDT = currentDT; - } +// DateTime endDT = new DateTime(startDT); +// endDT = endDT.plusDays(harvestDatetimeInc); +// if(endDT.isAfter(currentDT.toInstant())) { +// endDT = currentDT; +// } + DateTime endDT = new DateTime(currentDT); // If the start and end harvest dates are the same (happens for a new node), then // tweak the start so that DataONE listObjects doesn't complain. if(startDT == endDT ) { startDT = startDT.minusMinutes(1); - log.debug("Reset start back 1 minute to: " + startDT); } // Track the sysmeta dateUploaded of the latest harvested pid. 
This will become the starting time of
@@ -266,7 +267,7 @@ public void execute(JobExecutionContext context)
while(morePids) {
ArrayList pidsToProcess = null;
try {
- result = getPidsToProcess(cnNode, mnNode, isCN, session, suiteId, nodeId, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT);
+ result = getPidsToProcess(cnNode, mnNode, isCN, session, suiteId, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT);
pidsToProcess = result.getResult();
totalResultCount = result.getTotalResultCount();
filteredResultCount = result.getFilteredResultCount();
@@ -280,7 +281,7 @@ public void execute(JobExecutionContext context)
allPidsCnt = pidsToProcess.size();
for (String pidStr : pidsToProcess) {
try {
- log.debug("submitting pid: " + pidStr);
+ log.debug(taskName + ": submitting pid: " + pidStr);
submitReportRequest(cnNode, mnNode, isCN, session, qualityServiceUrl, pidStr, suiteId);
} catch (org.dataone.service.exceptions.NotFound nfe) {
log.error("Unable to process pid: " + pidStr + nfe.getMessage());
@@ -362,7 +363,7 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
String thisFormatId = null;
String thisPid = null;
int pidCount = 0;
- Date thisDateModified;
+ DateTime thisDateModifiedDT;
if (objList.getCount() > 0) {
@@ -391,11 +392,11 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
log.trace("adding pid " + thisPid + ", formatId: " + thisFormatId);
// If this pid's modified date is after the stored latest encountered modified date, then update
// the lastModified date
- DateTime thisDateModifiedDT = new DateTime(oi.getDateSysMetadataModified());
+ thisDateModifiedDT = new DateTime(oi.getDateSysMetadataModified());
// Add a millisecond to lastDateModifiedDT so that this pid won't be harvested again (in the event
// that this is the last pid to be harvested in this round).
if (thisDateModifiedDT.isAfter(lastDateModifiedDT)) {
- lastDateModifiedDT = thisDateModifiedDT.plusMillis(1) ;
+ lastDateModifiedDT = thisDateModifiedDT.plusMillis(1);
log.debug("Updated lastDateModified: " + lastDateModifiedDT.toString());
}
// }
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java index 7c099f31..31dcea61 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java
@@ -51,6 +51,10 @@ class ListResult {
Integer resultCount;
ArrayList result = new ArrayList<>();
+ // The scheduler keeps track of Solr 'dateModified' of the last pid harvested,
+ // which will be used as the starting time of the next harvest.
+ private DateTime lastDateModifiedDT = null;
+
void setResult(ArrayList result) {
this.result = result;
}
@@ -66,6 +70,12 @@ void setResultCount(Integer count) {
Integer getResultCount() {
return this.resultCount;
}
+
+ void setLastDateModified(DateTime date) {
+ this.lastDateModifiedDT = date;
+ }
+
+ public DateTime getLastDateModified() { return this.lastDateModifiedDT; }
}
// Since Quartz will re-instantiate a class every time it
@@ -180,10 +190,7 @@ public void execute(JobExecutionContext context)
// Get current datetime, which may be used for start time range.
DateTimeZone.setDefault(DateTimeZone.UTC);
         DateTime currentDT = new DateTime(DateTimeZone.UTC);
-        DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SS'Z'");
-        String currentDatetimeStr = dtfOut.print(currentDT);
-        DateTime startDateTimeRange = null;
-        DateTime endDateTimeRange = null;
+        DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
         String lastHarvestDateStr = null;
 
         Task task;
@@ -202,41 +209,54 @@ public void execute(JobExecutionContext context)
             lastHarvestDateStr = task.getLastHarvestDatetime();
         }
 
-        DateTime lastHarvestDate = new DateTime(lastHarvestDateStr);
+        DateTime lastHarvestDateDT = new DateTime(lastHarvestDateStr);
         // Set the search start datetime to the last harvest datetime, unless it is in the
         // future. (This can happen when the previous time range end was for the current day,
         // as the end datetime range for the previous task run will have been stored as the
         // new lastharvestDateTime.
-        DateTime startDTR = null;
-        if(lastHarvestDate.isAfter(currentDT.toInstant())) {
-            startDTR = currentDT;
+        DateTime startDT = null;
+        if(lastHarvestDateDT.isAfter(currentDT.toInstant())) {
+            startDT = currentDT;
         } else {
-            startDTR = new DateTime(lastHarvestDate);
+            startDT = new DateTime(lastHarvestDateDT);
         }
 
-        DateTime endDTR = new DateTime(startDTR);
-        endDTR = endDTR.plusDays(harvestDatetimeInc);
-        if(endDTR.isAfter(currentDT.toInstant())) {
-            endDTR = currentDT;
-        }
+//        DateTime endDT = new DateTime(startDT);
+//        endDT = endDT.plusDays(harvestDatetimeInc);
+//        if(endDT.isAfter(currentDT.toInstant())) {
+//            endDT = currentDT;
+//        }
+
+        DateTime endDT = new DateTime(currentDT);
 
         // If the start and end harvest dates are the same (happens for a new node), then
         // tweak the start so that DataONE listObjects doesn't complain.
-        if(startDTR == endDTR ) {
-            startDTR = startDTR.minusMinutes(1);
+        if(startDT == endDT ) {
+            startDT = startDT.minusMinutes(1);
         }
 
-        String startDTRstr = dtfOut.print(startDTR);
-        String endDTRstr = dtfOut.print(endDTR);
+        // Track the sysmeta dateUploaded of the latest harvested pid. This will become the starting time of
+        // the next harvest.
+        DateTime lastDateModifiedDT = startDT;
+
+        String startDTstr = dtfOut.print(startDT);
+        String endDTstr = dtfOut.print(endDT);
 
         int startCount = 0;
         RequestScorerJob.ListResult result = null;
-        Integer resultCount = null;
+        Integer resultCount = 0;
 
+        // Two types of score requests can be processed - a "node" request that will get score info for an
+        // entire repository (e.g. urn:node:ARCTIC) or a "portal" request that will get scores for a
+        // specific portal (from the Solr portal entry collectionQuery).
        if(requestType != null && requestType.equalsIgnoreCase("node")) {
            try {
                // For a 'node' scores request, the 'collection' is the entire node, so specify
-                // the nodeId as the collectionid.
+                // the nodeId as the collectionId. It is not necessary to retrieve a collectionQuery for this
+                // 'node' portal, as there is no Solr entry for this type of collection. All quality scores available
+                // in the quality Solr server will be directly retrieved, filtering on the 'nodeId' (datasource)
+                log.info("TaskName: " + taskName + ", taskType: " + taskType + " submitting node request for nodeId: "
+                        + nodeId + ", suiteId: " + suiteId + ", formatFamily: " + formatFamily);
                 submitScorerRequest(qualityServiceUrl, nodeId, suiteId, nodeId, formatFamily);
             } catch (Exception e) {
                 JobExecutionException jee = new JobExecutionException("Unable to submit request to create new node ("
@@ -248,22 +268,26 @@ public void execute(JobExecutionContext context)
             Integer allIds = 0;
             boolean morePids = true;
             while (morePids) {
+                // Get a list of pids selected by a collection (portal) search filter (collectionQuery) and get
+                // the quality scores (from the quality Solr server) for that list of pids.
                 ArrayList pidsToProcess = null;
                 log.trace("Getting portal pids to process, startCount: " + startCount + ", countRequested: " + countRequested);
 
                 try {
-                    result = getPidsToProcess(d1Node, session, pidFilter, startDTRstr, endDTRstr, startCount, countRequested);
+                    result = getPidsToProcess(d1Node, session, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT);
                     pidsToProcess = result.getResult();
                     resultCount = result.getResultCount();
+                    lastDateModifiedDT = result.getLastDateModified();
                 } catch (Exception e) {
                     JobExecutionException jee = new JobExecutionException("Unable to get pids to process", e);
                     jee.setRefireImmediately(false);
                     throw jee;
                 }
 
                 log.trace(taskName + ": found " + resultCount + " seriesIds" + " for date: " + startDTstr + " at serviceUrl: " + nodeServiceUrl);
 
                 for (String pidStr : pidsToProcess) {
                     try {
+                        log.debug(taskName + ": submitting seriesId: " + pidStr);
                         submitScorerRequest(qualityServiceUrl, pidStr, suiteId, nodeId, formatFamily);
                     } catch (Exception e) {
                         JobExecutionException jee = new JobExecutionException("Unable to submit request to create new score graph/data file", e);
@@ -274,6 +298,7 @@ public void execute(JobExecutionContext context)
 
                 // Check if DataONE returned the max number of results. If so, we have to request more by paging through
                 // the results.
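The paging contract used in this loop is easy to get wrong, so here is a self-contained sketch of just its arithmetic. The class, the stubbed fetchPage() method, and the page sizes are illustrative only and are not the engine's API; the real loop calls getPidsToProcess() against DataONE.

    import java.util.ArrayList;
    import java.util.List;

    // Minimal paging sketch: keep requesting pages until a page comes back
    // smaller than the page size ('countRequested'), mirroring the loop above.
    public class PagingSketch {
        static final int TOTAL = 95; // pretend the server holds 95 matching pids

        // Stand-in for the real service call: returns at most 'countRequested'
        // items starting at offset 'startCount'.
        static List<String> fetchPage(int startCount, int countRequested) {
            List<String> page = new ArrayList<>();
            for (int i = startCount; i < Math.min(startCount + countRequested, TOTAL); i++) {
                page.add("pid-" + i);
            }
            return page;
        }

        public static void main(String[] args) {
            int startCount = 0;
            int countRequested = 25;
            int allIds = 0;
            boolean morePids = true;
            while (morePids) {
                List<String> page = fetchPage(startCount, countRequested);
                allIds += page.size();
                if (page.size() >= countRequested) {
                    // A full page came back, so there may be more results.
                    startCount = startCount + page.size();
                } else {
                    morePids = false;
                }
            }
            System.out.println("harvested " + allIds + " pids"); // prints: harvested 95 pids
        }
    }

The stop condition is "short page", not "empty page", which is why the code above updates the start offset only when a full page is returned.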
+ allIds += pidsToProcess.size(); if (resultCount >= countRequested) { morePids = true; startCount = startCount + resultCount; @@ -281,19 +306,23 @@ public void execute(JobExecutionContext context) } else { morePids = false; - // Record the new "last harvested" date - task.setLastHarvestDatetime(endDTRstr); + } + } - try { - store.saveTask(task); - } catch (MetadigStoreException mse) { - log.error("Error saving task: " + task.getTaskName()); - JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); - jee.setRefireImmediately(false); - throw jee; - } + if (allIds > 0) { + // Record the new "last harvested" date + task.setLastHarvestDatetime(dtfOut.print(lastDateModifiedDT)); + log.debug("Saving lastHarvestDate: " + dtfOut.print(lastDateModifiedDT)); + try { + store.saveTask(task); + } catch (MetadigStoreException mse) { + log.error("Error saving task: " + task.getTaskName()); + JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); + jee.setRefireImmediately(false); + throw jee; } } + log.info(taskName + ": found " + allIds + " seriesIds" + " for start: " + startDTstr + ", end: " + endDTstr + " at servierUrl: " + nodeServiceUrl); } store.shutdown(); } @@ -322,14 +351,15 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, org.w3c.dom.NodeList xpathResult = null; XPathExpression fieldXpath = null; + XPathExpression dateModifiedXpath = null; XPath xpath = null; org.w3c.dom.Node node = null; ArrayList pids = new ArrayList(); Document xmldoc = null; - String queryStr = "?q=formatId:" + pidFilter + "+-obsoletedBy:*" + "+dateUploaded:[" + startHarvestDatetimeStr + "%20TO%20" + String queryStr = "?q=formatId:" + pidFilter + "+-obsoletedBy:*" + "+dateModified:[" + startHarvestDatetimeStr + "%20TO%20" + endHarvestDatetimeStr + "]" - + "&fl=seriesId&q.op=AND"; + + "&fl=seriesId,dateModified&q.op=AND"; log.trace("query: " + queryStr); // Send the query to DataONE Solr to retrieve portal seriesIds for a given time frame @@ -345,6 +375,7 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, XPathFactory xPathfactory = XPathFactory.newInstance(); xpath = xPathfactory.newXPath(); fieldXpath = xpath.compile("//result/doc/str[@name='seriesId']/text()"); + dateModifiedXpath = xpath.compile("//result/doc/date[@name='dateModified']/text()"); } catch (XPathExpressionException xpe) { log.error("Error extracting id from solr result doc: " + xpe.getMessage()); metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage()); @@ -358,16 +389,13 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, int startPos = startCount; do { - //xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, cnNode, mnNode, isCN, session); xmldoc = DataONE.querySolr(queryStr, startPos, countRequested, d1Node, session); if(xmldoc == null) { log.info("no values returned from query"); break; } try { - log.debug("processing xpathresult..."); xpathResult = (org.w3c.dom.NodeList) fieldXpath.evaluate(xmldoc, XPathConstants.NODESET); - log.debug("processed xpathResult"); } catch (XPathExpressionException xpe) { log.error("Error extracting seriesId from solr result doc: " + xpe.getMessage()); metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage()); @@ -385,12 +413,39 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, log.trace("adding pid: " + currentPid); } + // Get 
dateModified for the returned seriesIds + try { + xpathResult = (org.w3c.dom.NodeList) dateModifiedXpath.evaluate(xmldoc, XPathConstants.NODESET); + } catch (XPathExpressionException xpe) { + log.error("Error extracting dateModified from solr result doc: " + xpe.getMessage()); + metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage()); + metadigException.initCause(xpe); + throw metadigException; + } + + DateTime thisDateModified; + thisResultLength = xpathResult.getLength(); + if(thisResultLength == 0) break; + for (int index = 0; index < xpathResult.getLength(); index++) { + node = xpathResult.item(index); + String dateStr = node.getTextContent(); + log.debug("Checking date str: " + dateStr); + thisDateModified = DateTime.parse(dateStr, + DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")); + if(thisDateModified.isAfter(lastDateModifiedDT)) { + lastDateModifiedDT = thisDateModified.plusMillis(1); + log.debug("Updated lastDateModified to " + lastDateModifiedDT); + } + } + startPos += thisResultLength; } while (thisResultLength > 0); RequestScorerJob.ListResult result = new RequestScorerJob.ListResult(); result.setResultCount(pids.size()); result.setResult(pids); + // Return the sysmeta 'dateSystemMetadataModified' of the last pid harvested. + result.setLastDateModified(lastDateModifiedDT); return result; } From 5bfd7a780b380b3feb6ddac493448e2f4aa64d29 Mon Sep 17 00:00:00 2001 From: gothub Date: Thu, 20 Aug 2020 11:17:40 -0700 Subject: [PATCH 45/47] Improve javadocs; code cleanup --- .../mdqengine/scheduler/JobScheduler.java | 8 ++- .../mdqengine/scheduler/RequestReportJob.java | 57 +++++++++++++++++-- .../mdqengine/scheduler/RequestScorerJob.java | 35 ++++++++---- .../ucsb/nceas/mdqengine/scorer/Scorer.java | 14 ++--- 4 files changed, 89 insertions(+), 25 deletions(-) diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java index 3f9612a3..dd72f43b 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java @@ -241,12 +241,18 @@ public static void main(String[] argv) throws Exception { public JobScheduler () { } + /** + * Read a single parameter from the quality engine parameter file + * @param paramName the parameter to read from the config file + * @throws ConfigurationException if there is an exception while reading the config file + * @throws IOException if there is an exception while reading the config file + */ public String readConfig (String paramName) throws ConfigurationException, IOException { String paramValue = null; try { MDQconfig cfg = new MDQconfig(); paramValue = cfg.getString(paramName); - } catch (Exception e) { + } catch (ConfigurationException | IOException e) { log.error("Could not read configuration for param: " + paramName + ": " + e.getMessage()); throw e; } diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index 22540674..27a7458b 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -320,8 +320,25 @@ public void execute(JobExecutionContext context) store.shutdown(); } + /** + * Query a DataONE CN or MN to obtain a list of persistent identifiers (pids) for metadata objects have been + * added to the system during a 
specific time period.
+     * @param cnNode a DataONE CN connection client object
+     * @param mnNode a DataONE MN connection client object
+     * @param isCN a logical indicating whether a CN or an MN client object is being used
+     * @param session a DataONE authentication session
+     * @param suiteId the quality suite to check (used to see if this pid has already been processed)
+     * @param pidFilter the DataONE format identifiers to filter for
+     * @param startHarvestDatetimeStr the starting date to harvest pids from
+     * @param endHarvestDatetimeStr the ending date to harvest pids from
+     * @param startCount the start count for paging results from DataONE, for large results
+     * @param countRequested the number of items to get from DataONE on each request
+     * @param lastDateModifiedDT the sysmeta 'dateSystemMetadataModified' value of the last harvested pid
+     * @throws Exception if there is an exception while executing the job.
+     * @return a ListResult object containing the matching pids
+     */
     public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, Boolean isCN, Session session,
-                                       String suiteId, String nodeId, String pidFilter, String startHarvestDatetimeStr,
+                                       String suiteId, String pidFilter, String startHarvestDatetimeStr,
                                        String endHarvestDatetimeStr, int startCount,
                                        int countRequested, DateTime lastDateModifiedDT) throws Exception {
 
@@ -331,7 +348,6 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
 
         ObjectFormatIdentifier formatId = null;
         NodeReference nodeRef = null;
-        //nodeRef.setValue(nodeId);
         Identifier identifier = null;
         Boolean replicaStatus = false;
 
@@ -356,7 +372,7 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
             }
             //log.info("Got " + objList.getCount() + " pids for format: " + formatId.getValue() + " pids.");
         } catch (Exception e) {
-            log.error("Error retrieving pids for node " + nodeId + ": " + e.getMessage());
+            log.error("Error retrieving pids: " + e.getMessage());
             throw e;
         }
 
@@ -416,7 +432,24 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode,
         return result;
     }
 
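The 'pidFilter' parameter documented above is applied against each returned object's formatId. Assuming, as the javadoc wording suggests, a regex-style match over formatIds, the filtering step might look like the sketch below; the filter expression and example formatIds are chosen here for illustration and are not taken from the engine's configuration.

    // Illustration only: match candidate formatIds against a regex-style filter.
    public class PidFilterSketch {
        public static void main(String[] args) {
            String pidFilter = "^eml.*|^http://www.isotc211.org.*";
            String[] formatIds = {
                "eml://ecoinformatics.org/eml-2.1.1",
                "http://www.isotc211.org/2005/gmd",
                "image/png"
            };
            for (String formatId : formatIds) {
                // String.matches() requires the whole formatId to match the pattern.
                boolean harvest = formatId.matches(pidFilter);
                System.out.println(formatId + " -> " + (harvest ? "harvest" : "skip"));
            }
        }
    }

Only metadata formats pass the filter; data objects such as "image/png" are skipped before any report request is submitted.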

+ * An additional check is made to see if the system metadata in the + * run is older than the passed in date. Because the quality engine + * uses fields from sysmeta (obsoletes, obsoletedBy), a run may need + * to be performed on an existing run in order to update the sysmeta, as + * the system is stored in the run object, and this run object is + * parsed when the run is inserted into the Solr index. + *

+ * @param pid the pid to check + * @param suiteId the suite identifier to check (e.g. "FAIR-suite-0.3.1") + * @param store the DataStore object to send the check request to. + * @throws MetadigStoreException + * + */ + public boolean runExists(String pid, String suiteId, MDQStore store, Date dateSystemMetadataModified) throws MetadigStoreException { boolean found = false; Date runDateSystemMetadataModified = null; @@ -440,6 +473,22 @@ public boolean runExists(String pid, String suiteId, MDQStore store) throws Meta return found; } + /** + * Submit a request to the metadig controller to run a quality suite for the specified pid. + *
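The freshness rule the javadoc describes can be stated compactly. The sketch below is not the method's actual body (which also queries the store); it only demonstrates the date comparison that decides whether an existing run still counts, with names chosen for illustration.

    import java.util.Date;

    // Sketch of the re-run rule: a stored run only counts as current if the
    // sysmeta date saved with it is not older than the date now reported for
    // the object; otherwise obsoletes/obsoletedBy changes would be missed.
    public class RunFreshnessSketch {
        static boolean runIsCurrent(Date runDateSystemMetadataModified, Date dateSystemMetadataModified) {
            if (runDateSystemMetadataModified == null) {
                return false; // no sysmeta recorded with the run; re-run to capture it
            }
            return !runDateSystemMetadataModified.before(dateSystemMetadataModified);
        }

        public static void main(String[] args) {
            Date stored = new Date(1_000_000L);
            Date current = new Date(2_000_000L);
            System.out.println(runIsCurrent(stored, current));  // false: sysmeta changed, re-run
            System.out.println(runIsCurrent(current, current)); // true: run is up to date
        }
    }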

+ * The system metadata for a pid is also obtained and sent with the request + *

+ * + * @param cnNode a DataONE CN connection client object + * @param mnNode a DataONE MN connection client object + * @param isCN a logical indicating whether a CN of MN object + * @param session a DataONE authentication session + * @param qualityServiceUrl the URL of the MetaDIG quality service + * @param pidStr the pid to submit the request for + * @param suiteId the suite identifier to submit the request for + * + * @throws Exception + */ public void submitReportRequest(MultipartCNode cnNode, MultipartMNode mnNode, Boolean isCN, Session session, String qualityServiceUrl, String pidStr, String suiteId) throws Exception { SystemMetadata sysmeta = null; diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java index 31dcea61..1abb1dce 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java @@ -330,22 +330,21 @@ public void execute(JobExecutionContext context) /** * Query a DataONE CN or MN object store for a list of object that match the time range and formatId filters provided. * - * //@param cnNode - * //@param mnNode - * //@param isCN - * @param session - * @param pidFilter - * @param startHarvestDatetimeStr - * @param endHarvestDatetimeStr - * @param startCount - * @param countRequested + * @param d1Node a DataONE CN or MN connection client object + * @param session a DataONE authentication session + * @param pidFilter the DataONE format identifies to filter for + * @param startHarvestDatetimeStr the starting date to harvest pids from + * @param endHarvestDatetimeStr the ending data to harvest pids from + * @param startCount the start count for paging results from DataONE, for large results + * @param countRequested the number of items to get from DataONE on each request + * @param lastDateModifiedDT the sysmeta 'dateSystemMetadataModified' value of the last harvested pid + * @throws Exception if there is an exception while executing the job. * @return a ListResult object containing the matching pids * @throws Exception */ - //public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, Boolean isCN, Session session, public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, String pidFilter, String startHarvestDatetimeStr, String endHarvestDatetimeStr, - int startCount, int countRequested) throws Exception { + int startCount, int countRequested, DateTime lastDateModifiedDT) throws Exception { MetadigProcessException metadigException = null; @@ -450,6 +449,18 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session, return result; } + /** + * Submit a requst to the metadig controller to get qualiry score info and create a graph for the specified collection. 
+     *
+     * @param qualityServiceUrl
+     * @param collectionId
+     * @param suiteId
+     * @param nodeId
+     * @param formatFamily
+     *
+     * @throws Exception
+     *
+     */
     public void submitScorerRequest(String qualityServiceUrl, String collectionId, String suiteId, String nodeId, String formatFamily) throws Exception {
 
         InputStream runResultIS = null;
@@ -475,7 +486,7 @@ public void submitScorerRequest(String qualityServiceUrl, String collectionId, S
         post.addHeader("Accept", "application/xml");
 
         // send to service
-        log.debug("submitting scores request : " + scorerServiceUrl);
+        log.trace("submitting scores request : " + scorerServiceUrl);
 
         CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(post);
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java
index fede5a0f..df56654d 100644
--- a/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java
+++ b/src/main/java/edu/ucsb/nceas/mdqengine/scorer/Scorer.java
@@ -129,6 +129,7 @@ public static void main(String[] argv) throws Exception {
      * A set of quality scores are retrieved from the Quality Solr Server and a quality graph and csv file are created from
      * them. For DataONE collections, the 'collectionQuery' is retrieved from Solr to determine the set of pids to be
      * included.
+     * <p></p>
      *
      */
     final Consumer consumer = new DefaultConsumer(inProcessChannel) {
@@ -330,6 +331,7 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp
         }
     };
 
+        // Initialize the RabbitMQ queue for scorer requests sent by the controller
     inProcessChannel.basicConsume(SCORER_QUEUE_NAME, false, consumer);
 }
 
@@ -337,18 +339,16 @@ public void handleDelivery(String consumerTag, Envelope envelope, AMQP.BasicProp
      * Retrieve pids associated with a DataONE collection.
      *

First the 'collectionQuery' field is retrieved from DataONE Solr for the collection

- *

Next, a query is issued with the query from collectionQuery field, to retrieve all Solr docs for the collection ids./p> + *

Next, a query is issued with the query from the collectionQuery field, to retrieve all Solr docs for the collection ids./p> * *

Note that in the current design, the collection query is always obtained by querying the node specified in the taskList.csv file, * which is usually an MN, but the collectionQuery is always evaluated on the CN

* * @param collectionId a DataONE project id to fetch scores for, e.g. urn:uuid:f137095e-4266-4474-aa5f-1e1fcaa5e2dc - * @param d1Node - * @param session + * @param d1Node the DataONE connection object for a node + * @param session the DataONE authentication session * @return a List of quality scores fetched from Solr */ - //private ScorerResult getCollectionPids(String collectionId, MultipartCNode cnNode, MultipartMNode mnNode, - // Boolean isCN, Session session) throws MetadigProcessException { private ScorerResult getCollectionPids(String collectionId, MultipartD1Node d1Node, Session session) throws MetadigProcessException { Document xmldoc = null; @@ -363,11 +363,9 @@ which will be used to query DataONE Solr for all the pids associated with that p */ ArrayList pids = new ArrayList<>(); queryStr = "?q=seriesId:" + escapeSpecialChars(collectionId) + "+-obsoletedBy:*" + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; - //queryStr = "?q=seriesId:" + encodeValue(collectionId) + "+-obsoletedBy:*" + "&fl=collectionQuery,label,rightsHolder&q.op=AND"; - //queryStr = "?q=seriesId:" + collectionId + "+-obsoletedBy:*&fl=collectionQuery,label,rightsHolder&q.op=AND"; startPos = 0; - // Just getting 1 row + // Just getting 1 row (for the collectionQuery field) countRequested = 10; // Get the collectionQuery from Solr From bc9b37ad577e78479797b42837283c7df2634f86 Mon Sep 17 00:00:00 2001 From: gothub Date: Wed, 2 Sep 2020 15:45:35 -0700 Subject: [PATCH 46/47] CN harvesting is missing some pids #267 --- .../edu/ucsb/nceas/mdqengine/model/Task.java | 16 +- .../mdqengine/scheduler/JobScheduler.java | 17 ++ .../nceas/mdqengine/scheduler/NodeList.java | 168 +++++++++++ .../mdqengine/scheduler/RequestReportJob.java | 267 +++++++++++------- .../mdqengine/scheduler/RequestScorerJob.java | 51 ++-- .../nceas/mdqengine/store/DatabaseStore.java | 252 ++++++++++++++++- .../nceas/mdqengine/store/InMemoryStore.java | 15 +- .../ucsb/nceas/mdqengine/store/MDQStore.java | 12 +- .../ucsb/nceas/mdqengine/store/MNStore.java | 21 +- src/main/resources/sql/quality-v2.3.0.sql | 26 +- 10 files changed, 682 insertions(+), 163 deletions(-) create mode 100644 src/main/java/edu/ucsb/nceas/mdqengine/scheduler/NodeList.java diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/model/Task.java b/src/main/java/edu/ucsb/nceas/mdqengine/model/Task.java index 5e174d42..f2290b28 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/model/Task.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/model/Task.java @@ -1,10 +1,12 @@ package edu.ucsb.nceas.mdqengine.model; +import java.util.HashMap; + public class Task { private String taskName; private String taskType; - private String lastHarvestDatetime; + private HashMap lastHarvestDatetimes = new HashMap<>(); public void setTaskName(String name) { this.taskName = name; @@ -18,10 +20,16 @@ public String getTaskName() { public String getTaskType() { return taskType; } - public void setLastHarvestDatetime(String lastHarvestDatetime) { - this.lastHarvestDatetime = lastHarvestDatetime; + public void setLastHarvestDatetimes(HashMap lastHarvestDatetimes) { + this.lastHarvestDatetimes = lastHarvestDatetimes; + } + + public void setLastHarvestDatetime(String lastHarvestDatetime, String nodeId) { + this.lastHarvestDatetimes.put(nodeId, lastHarvestDatetime); } - public String getLastHarvestDatetime() { return lastHarvestDatetime; } + public String getLastHarvestDatetime(String nodeId) { + return this.lastHarvestDatetimes.get(nodeId); + } } diff --git 
a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java index dd72f43b..c38e8d1f 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/JobScheduler.java @@ -180,6 +180,16 @@ public static void main(String[] argv) throws Exception { log.debug("fileExcludeMatch: " + fileExcludeMatch); logFile = splitted[++icnt].trim(); log.debug("log file: " + logFile); + } else if (taskType.equals("nodelist")) { + log.debug("Scheduling nodelist update from DataONE, task name: " + taskName + ", task group: " + taskGroup); + String[] splitted = Arrays.stream(params.split(";")) + .map(String::trim) + .toArray(String[]::new); + + int icnt = -1; + log.debug("Split length: " + splitted.length); + nodeId = splitted[++icnt].trim(); + log.debug("nodeId: " + nodeId); } try { @@ -221,6 +231,13 @@ public static void main(String[] argv) throws Exception { .usingJobData("fileExcludeMatch", fileExcludeMatch) .usingJobData("logFile", logFile) .build(); + } else if (taskType.equalsIgnoreCase("nodelist")) { + job = newJob(NodeList.class) + .withIdentity(taskName, taskGroup) + .usingJobData("taskName", taskName) + .usingJobData("taskType", taskType) + .usingJobData("nodeId", nodeId) + .build(); } CronTrigger trigger = newTrigger() diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/NodeList.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/NodeList.java new file mode 100644 index 00000000..5eecc2cd --- /dev/null +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/NodeList.java @@ -0,0 +1,168 @@ +package edu.ucsb.nceas.mdqengine.scheduler; + +import edu.ucsb.nceas.mdqengine.DataONE; +import edu.ucsb.nceas.mdqengine.MDQconfig; +import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException; +import edu.ucsb.nceas.mdqengine.store.DatabaseStore; +import edu.ucsb.nceas.mdqengine.store.MDQStore; +import org.apache.commons.configuration2.ex.ConfigurationException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dataone.client.rest.HttpMultipartRestClient; +import org.dataone.client.rest.MultipartRestClient; +import org.dataone.client.v2.impl.MultipartCNode; +import org.dataone.service.exceptions.NotImplemented; +import org.dataone.service.exceptions.ServiceFailure; +import org.dataone.service.types.v1.*; +import org.dataone.service.types.v2.Node; +import org.dataone.service.types.v2.Property; +import org.quartz.*; + +import java.io.IOException; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.TimeZone; + +/** + *

+ * Run a MetaDIG Quality Engine Scheduler task, for example, + * query a member node for new pids and request that a quality + * report is created for each one. + *

+ * + * @author Peter Slaughter + */ +@PersistJobDataAfterExecution +@DisallowConcurrentExecution +public class NodeList implements Job { + + private Log log = LogFactory.getLog(NodeList.class); + + // Since Quartz will re-instantiate a class every time it + // gets executed, non-static member variables can + // not be used to maintain state! + + /** + *

+ * Called by the {@link org.quartz.Scheduler} when a + * {@link org.quartz.Trigger} fires that is associated with + * the Job. + *

+     *
+     * @throws JobExecutionException if there is an exception while executing the job.
+     */
+    public void execute(JobExecutionContext context)
+            throws JobExecutionException {
+
+        Log log = LogFactory.getLog(NodeList.class);
+        JobKey key = context.getJobDetail().getKey();
+        JobDataMap dataMap = context.getJobDetail().getJobDataMap();
+
+        String taskName = dataMap.getString("taskName");
+        String taskType = dataMap.getString("taskType");
+        String nodeId = dataMap.getString("nodeId");
+        MultipartRestClient mrc = null;
+        MultipartCNode cnNode = null;
+
+        String nodeServiceUrl = null;
+
+        try {
+            MDQconfig cfg = new MDQconfig();
+            String nodeAbbr = nodeId.replace("urn:node:", "");
+            // TODO: Cache the node values from the CN listNode service
+            nodeServiceUrl = cfg.getString(nodeAbbr + ".serviceUrl");
+        } catch (ConfigurationException | IOException ce) {
+            JobExecutionException jee = new JobExecutionException(taskName + ": error executing task.");
+            jee.initCause(ce);
+            throw jee;
+        }
+
+        log.debug("Executing task " + taskType + ", " + taskName + " for node: " + nodeId);
+
+        Session session = DataONE.getSession(null, null);
+
+        try {
+            mrc = new HttpMultipartRestClient();
+        } catch (Exception e) {
+            log.error(taskName + ": error creating rest client: " + e.getMessage());
+            JobExecutionException jee = new JobExecutionException(e);
+            jee.setRefireImmediately(false);
+            throw jee;
+        }
+
+        cnNode = new MultipartCNode(mrc, nodeServiceUrl, session);
+        org.dataone.service.types.v2.NodeList nodeList = null;
+
+        try {
+            nodeList = cnNode.listNodes();
+        } catch (NotImplemented | ServiceFailure e) {
+            e.printStackTrace();
+            throw new JobExecutionException(taskName + ": cannot get the node list from the CN, unable to schedule job", e);
+        }
+
+        // Get a connection to the database
+        MDQStore store = null;
+
+        try {
+            store = new DatabaseStore();
+        } catch (Exception e) {
+            e.printStackTrace();
+            throw new JobExecutionException(taskName + ": cannot create store, unable to schedule job", e);
+        }
+
+        if (!store.isAvailable()) {
+            try {
+                store.renew();
+            } catch (MetadigStoreException e) {
+                e.printStackTrace();
+                throw new JobExecutionException(taskName + ": cannot renew store, unable to schedule job", e);
+            }
+        }
+
+        Property property = null;
+        ArrayList plist = null;
+        for (Node node : nodeList.getNodeList()) {
+            log.debug("node: " + node.getName());
+            log.debug("type: " + node.getType().toString());
+            log.debug("id: " + node.getIdentifier().getValue());
+            log.debug("state: " + node.getState().toString());
+            log.debug("is synchronized: " + node.isSynchronize());
+
+            if (! node.isSynchronize()) {
+                log.debug(taskName + ": Skipping unsynchronized node " + node.getIdentifier().getValue());
+                continue;
+            } else if (node.getType().toString().equalsIgnoreCase("MN")) {
+                log.debug(taskName + ": saving node " + node.getIdentifier().getValue());
+                try {
+                    store.saveNode(node);
+                } catch (MetadigStoreException mse) {
+                    mse.printStackTrace();
+                    throw new JobExecutionException("Cannot save node " + node.getIdentifier().getValue() + " to store", mse);
+                }
+            } else {
+                log.debug(taskName + ": skipping CN node: " + node.getIdentifier().getValue());
+            }
+        }
+
+        // For debugging purposes: retrieve and print out all node entries if trace logging is enabled.
+ if (log.isTraceEnabled()) { + log.trace("Retrieving and printing out all saved node harvest dates..."); + + ArrayList nodes = store.getNodes(); + for (Node node : nodes) { + log.trace("identifier: " + node.getIdentifier().getValue()); + + DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); + dateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); + String lastHarvestDatetimeStr = dateFormat.format(node.getSynchronization().getLastHarvested()); + + log.trace("harvest: " + lastHarvestDatetimeStr); + log.trace("synchronize: " + node.isSynchronize()); + log.trace("state: " + node.getState().toString()); + log.trace("baseURL: " + node.getBaseURL()); + } + } + } +} + diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java index 27a7458b..acbecf1c 100644 --- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java +++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestReportJob.java @@ -19,6 +19,7 @@ import org.dataone.client.rest.MultipartRestClient; import org.dataone.client.v2.impl.MultipartCNode; import org.dataone.client.v2.impl.MultipartMNode; +import org.dataone.service.types.v2.Node; import org.dataone.mimemultipart.SimpleMultipartEntity; import org.dataone.service.exceptions.NotAuthorized; import org.dataone.service.types.v1.*; @@ -155,17 +156,17 @@ public void execute(JobExecutionContext context) // TODO: Cache the node values from the CN listNode service nodeServiceUrl = cfg.getString(nodeAbbr + ".serviceUrl"); } catch (ConfigurationException | IOException ce) { - JobExecutionException jee = new JobExecutionException("Error executing task."); + JobExecutionException jee = new JobExecutionException(taskName + ": error executing task."); jee.initCause(ce); throw jee; } - log.info("Executing task " + taskType + ", " + taskName + " for node: " + nodeId + ", suiteId: " + suiteId); + log.debug("Executing task " + taskType + ", " + taskName + " for node: " + nodeId + ", suiteId: " + suiteId); try { mrc = new HttpMultipartRestClient(); } catch (Exception e) { - log.error("Error creating rest client: " + e.getMessage()); + log.error(taskName + ": error creating rest client: " + e.getMessage()); JobExecutionException jee = new JobExecutionException(e); jee.setRefireImmediately(false); throw jee; @@ -200,123 +201,170 @@ public void execute(JobExecutionContext context) } } - // Set UTC as the default time zone for all DateTime operations. - // Get current datetime, which may be used for start time range. - DateTimeZone.setDefault(DateTimeZone.UTC); - DateTime currentDT = new DateTime(DateTimeZone.UTC); - DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); - String currentDatetimeStr = dtfOut.print(currentDT); - DateTime startDateTimeRange = null; - DateTime endDateTimeRange = null; - String lastHarvestDateStr = null; - - Task task; - task = store.getTask(taskName, taskType); - // If a 'task' entry has not been saved for this task name yet, then a 'lastHarvested' - // DataTime will not be available, in which case the 'startHarvestDataTime' from the - // config file will be used. 
- if(task.getLastHarvestDatetime() == null) { - task = new Task(); - task.setTaskName(taskName); - task.setTaskType(taskType); - lastHarvestDateStr = startHarvestDatetimeStr; - task.setLastHarvestDatetime(lastHarvestDateStr); - } else { - lastHarvestDateStr = task.getLastHarvestDatetime(); - } + ArrayList nodes = new ArrayList<>(); - DateTime lastHarvestDateDT = new DateTime(lastHarvestDateStr); - // Set the search start datetime to the last harvest datetime, unless it is in the - // future. (This can happen when the previous time range end was for the current day, - // as the end datetime range for the previous task run will have been stored as the - // new lastharvestDateTime. - DateTime startDT = null; - if(lastHarvestDateDT.isAfter(currentDT.toInstant())) { - startDT = currentDT; + if (isCN) { + nodes = store.getNodes(); } else { - startDT = new DateTime(lastHarvestDateDT); + Node node = store.getNode(nodeId); + if (node.getIdentifier().getValue() == null) { + String msg = ("Node entry not found for node: " + nodeId); + log.error(msg); + JobExecutionException jee = new JobExecutionException(msg); + jee.setRefireImmediately(false); + throw jee; + } else { + log.trace("Got node " + node.getIdentifier().getValue()); + nodes.add(node); + } } -// DateTime endDT = new DateTime(startDT); -// endDT = endDT.plusDays(harvestDatetimeInc); -// if(endDT.isAfter(currentDT.toInstant())) { -// endDT = currentDT; -// } - DateTime endDT = new DateTime(currentDT); - - // If the start and end harvest dates are the same (happens for a new node), then - // tweak the start so that DataONE listObjects doesn't complain. - if(startDT == endDT ) { - startDT = startDT.minusMinutes(1); - } + String harvestNodeId = null; + for (Node node : nodes) { - // Track the sysmeta dateUploaded of the latest harvested pid. This will become the starting time of - // the next harvest. - DateTime lastDateModifiedDT = startDT; + harvestNodeId = node.getIdentifier().getValue(); + // If processing a CN, check each MN to see if it is being synchronized and if it + // is up. + if (isCN) { - String startDTstr = dtfOut.print(startDT); - String endDTstr = dtfOut.print(endDT); + // The NodeList task doesn't save CN entries from the DataONE 'listNodes()' service, but check + // just in case. + if (node.getType().equals(NodeType.CN)) { + log.debug("Harvesting from CN, skipping CN entry from node list for " + node.getIdentifier().getValue()); + continue; + } - Integer startCount = new Integer(0); - ListResult result = null; - Integer totalResultCount = 0; - Integer filteredResultCount = 0; - Integer allPidsCnt = 0; + if (! node.isSynchronize() || ! 
node.getState().equals(NodeState.UP)) { + log.trace("Skipping disabled node: " + node.getIdentifier().getValue() + ", sync: " + node.isSynchronize() + + ", status: " + node.getState().toString()); + continue; + } - boolean morePids = true; - while(morePids) { - ArrayList pidsToProcess = null; - try { - result = getPidsToProcess(cnNode, mnNode, isCN, session, suiteId, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT); - pidsToProcess = result.getResult(); - totalResultCount = result.getTotalResultCount(); - filteredResultCount = result.getFilteredResultCount(); - lastDateModifiedDT = result.getLastDateModified(); - } catch (Exception e) { - JobExecutionException jee = new JobExecutionException("Unable to get pids to process", e); - jee.setRefireImmediately(false); - throw jee; - } + DateTime mnLastHarvestDT = new DateTime(node.getSynchronization().getLastHarvested(), DateTimeZone.UTC); + DateTime oneMonthAgoDT = new DateTime(DateTimeZone.UTC).minusMonths(1); - allPidsCnt = pidsToProcess.size(); - for (String pidStr : pidsToProcess) { - try { - log.debug(taskName + ": submitting pid: " + pidStr); - submitReportRequest(cnNode, mnNode, isCN, session, qualityServiceUrl, pidStr, suiteId); - } catch (org.dataone.service.exceptions.NotFound nfe) { - log.error("Unable to process pid: " + pidStr + nfe.getMessage()); - continue; - } catch (Exception e) { - log.error("Unable to process pid: " + pidStr + " - " + e.getMessage()); + if (mnLastHarvestDT.isBefore(oneMonthAgoDT.toInstant())) { + DateTimeZone.setDefault(DateTimeZone.UTC); + DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); + log.trace("Skipping node " + node.getIdentifier().getValue() + " that hasn't been sync'd since " + dtfOut.print(mnLastHarvestDT)); continue; } } - // Check if DataONE returned the max number of results. If so, we have to request more by paging through - // the results returned pidsToProcess (i.e. DataONE listObjects service). If the returned result is - // less than the requested result, then all pids have been retrieved. - if(totalResultCount >= countRequested) { - morePids = true; - startCount = startCount + totalResultCount; - log.trace("Paging through more results, current start is " + startCount); + log.trace("Harvesting node: " + node.getIdentifier().getValue()); + + // Set UTC as the default time zone for all DateTime operations. + // Get current datetime, which may be used for start time range. + DateTimeZone.setDefault(DateTimeZone.UTC); + DateTime currentDT = new DateTime(DateTimeZone.UTC); + DateTimeFormatter dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); + String lastHarvestDateStr = null; + + Task task; + task = store.getTask(taskName, taskType, harvestNodeId); + // If a 'task' entry has not been saved for this task name yet, then a 'lastHarvested' + // DataTime will not be available, in which case the 'startHarvestDataTime' from the + // config file will be used. + if (task.getLastHarvestDatetime(harvestNodeId) == null) { + task.setTaskName(taskName); + task.setTaskType(taskType); + lastHarvestDateStr = startHarvestDatetimeStr; + task.setLastHarvestDatetime(lastHarvestDateStr, harvestNodeId); } else { - morePids = false; + lastHarvestDateStr = task.getLastHarvestDatetime(harvestNodeId); } - } - // Don't update the lastHarvestDateDT if no pids were found. 
- if (allPidsCnt > 0) { - task.setLastHarvestDatetime(dtfOut.print(lastDateModifiedDT)); - log.debug("Saving lastHarvestDate: " + dtfOut.print(lastDateModifiedDT)); - try { - store.saveTask(task); - } catch (MetadigStoreException mse) { - log.error("Error saving task: " + task.getTaskName()); - JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); - jee.setRefireImmediately(false); - throw jee; + + DateTime lastHarvestDateDT = new DateTime(lastHarvestDateStr); + // Set the search start datetime to the last harvest datetime, unless it is in the + // future. (This can happen when the previous time range end was for the current day, + // as the end datetime range for the previous task run will have been stored as the + // new lastharvestDateTime. + DateTime startDT = null; + if (lastHarvestDateDT.isAfter(currentDT.toInstant())) { + startDT = currentDT; + } else { + startDT = new DateTime(lastHarvestDateDT); + } + + DateTime endDT = new DateTime(currentDT); + + // If the start and end harvest dates are the same (happens for a new node), then + // tweak the start so that DataONE listObjects doesn't complain. + if (startDT == endDT) { + startDT = startDT.minusMinutes(1); + } + + // Track the sysmeta dateUploaded of the latest harvested pid. This will become the starting time of + // the next harvest. + DateTime lastDateModifiedDT = startDT; + + String startDTstr = dtfOut.print(startDT); + String endDTstr = dtfOut.print(endDT); + + log.trace("start time: " + startDTstr); + + Integer startCount = new Integer(0); + ListResult result = null; + Integer totalResultCount = 0; + Integer filteredResultCount = 0; + Integer allPidsCnt = 0; + + log.trace("Getting pids for nodeId: " + harvestNodeId); + boolean morePids = true; + while (morePids) { + ArrayList pidsToProcess = null; + try { + result = getPidsToProcess(cnNode, mnNode, isCN, session, suiteId, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT, harvestNodeId, taskName); + pidsToProcess = result.getResult(); + totalResultCount = result.getTotalResultCount(); + filteredResultCount = result.getFilteredResultCount(); + lastDateModifiedDT = result.getLastDateModified(); + } catch (Exception e) { + JobExecutionException jee = new JobExecutionException("Unable to get pids to process", e); + jee.setRefireImmediately(false); + throw jee; + } + + allPidsCnt = pidsToProcess.size(); + for (String pidStr : pidsToProcess) { + try { + log.debug(taskName + ": submitting pid: " + pidStr); + submitReportRequest(cnNode, mnNode, isCN, session, qualityServiceUrl, pidStr, suiteId); + } catch (org.dataone.service.exceptions.NotFound nfe) { + log.error("Unable to process pid: " + pidStr + nfe.getMessage()); + continue; + } catch (Exception e) { + log.error("Unable to process pid: " + pidStr + " - " + e.getMessage()); + continue; + } + } + + // Check if DataONE returned the max number of results. If so, we have to request more by paging through + // the results returned pidsToProcess (i.e. DataONE listObjects service). If the returned result is + // less than the requested result, then all pids have been retrieved. + if (totalResultCount >= countRequested) { + morePids = true; + startCount = startCount + totalResultCount; + log.trace("Paging through more results, current start is " + startCount); + } else { + morePids = false; + } + } + // Don't update the lastHarvestDateDT if no pids were found. 
+ if (allPidsCnt > 0) { + task.setLastHarvestDatetime(dtfOut.print(lastDateModifiedDT), harvestNodeId); + log.trace("Saving lastHarvestDate: " + dtfOut.print(lastDateModifiedDT) + " for node: " + harvestNodeId); + try { + store.saveTask(task, harvestNodeId); + } catch (MetadigStoreException mse) { + log.error("Error saving task: " + task.getTaskName()); + JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse); + jee.setRefireImmediately(false); + throw jee; + } + log.info(taskName + ": found " + allPidsCnt + " pids for nodeId: " + harvestNodeId + ", start: " + startDTstr + ", end: " + endDTstr + ", servierUrl: " + nodeServiceUrl); } } - log.info(taskName + ": Found " + allPidsCnt + " pids for start: " + startDTstr + ", end: " + endDTstr + " at servierUrl: " + nodeServiceUrl); store.shutdown(); } @@ -334,13 +382,14 @@ public void execute(JobExecutionContext context) * @param startCount the start count for paging results from DataONE, for large results * @param countRequested the number of items to get from DataONE on each request * @param lastDateModifiedDT the sysmeta 'dateSystemMetadataModified' value of the last harvested pid + * @param nodeIdFilter filter results for this nodeId (applies only to CN) * @throws Exception if there is an exception while executing the job. * @return a ListResult object containing the matching pids */ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, Boolean isCN, Session session, String suiteId, String pidFilter, String startHarvestDatetimeStr, String endHarvestDatetimeStr, int startCount, - int countRequested, DateTime lastDateModifiedDT) throws Exception { + int countRequested, DateTime lastDateModifiedDT, String nodeIdFilter, String taskName) throws Exception { ArrayList pids = new ArrayList(); InputStream qis = null; @@ -364,15 +413,19 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, try { // Even though MultipartMNode and MultipartCNode have the same parent class D1Node, the interface for D1Node doesn't - // include listObjects (it should), so we have to maintain a cnNode and mnNode. + // include listObjects, as the parameters differ from CN to MN, so we have to use a different object for each. if(isCN) { + log.trace("Getting pids for cn, for nodeid: " + nodeIdFilter); + nodeRef = new NodeReference(); + nodeRef.setValue(nodeIdFilter); objList = cnNode.listObjects(session, startDate, endDate, formatId, nodeRef, identifier, startCount, countRequested); } else { + log.trace("Getting pids for mn"); objList = mnNode.listObjects(session, startDate, endDate, formatId, identifier, replicaStatus, startCount, countRequested); } //log.info("Got " + objList.getCount() + " pids for format: " + formatId.getValue() + " pids."); } catch (Exception e) { - log.error("Error retrieving pids: " + e.getMessage()); + log.error(taskName + ": error retrieving pids: " + e.getMessage()); throw e; } @@ -413,7 +466,7 @@ public ListResult getPidsToProcess(MultipartCNode cnNode, MultipartMNode mnNode, // that this is the last pid to be harvested in this round. 
if (thisDateModifiedDT.isAfter(lastDateModifiedDT)) {
                         lastDateModifiedDT = thisDateModifiedDT.plusMillis(1);
-                        log.debug("Updated lastDateMoidifed: " + lastDateModifiedDT.toString());
+                        log.debug("New value for lastDateModified: " + lastDateModifiedDT.toString());
                     }
 //                }
             }
 
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java
index 31dcea61..b98fbd4c 100644
--- a/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java
+++ b/src/main/java/edu/ucsb/nceas/mdqengine/scheduler/RequestScorerJob.java
@@ -140,7 +140,7 @@ public void execute(JobExecutionContext context)
             requestType = dataMap.getString("requestType");
         }
 
-        log.info("Executing task " + taskType + ", " + taskName + " for node: " + nodeId + ", suiteId: " + suiteId);
+        log.debug("Executing task " + taskType + ", " + taskName + " for node: " + nodeId + ", suiteId: " + suiteId);
 
         try {
             cfg = new MDQconfig();
             nodeServiceUrl = cfg.getString(nodeAbbr + ".serviceUrl");
             log.trace("nodeServiceUrl: " + nodeServiceUrl);
         } catch (ConfigurationException | IOException ce) {
-            JobExecutionException jee = new JobExecutionException("Error executing task.");
+            JobExecutionException jee = new JobExecutionException(taskName + ": Error executing task: " + ce.getMessage());
             jee.initCause(ce);
             throw jee;
         }
 
+        if(nodeServiceUrl == null) {
+            String msg = taskName + ": Unable to read serviceUrl from config file for: " + nodeId;
+            throw new JobExecutionException(msg);
+        }
+
         Session session = DataONE.getSession(subjectId, authToken);
 
         // Get a connection to the DataONE node (CN or MN)
@@ -194,19 +199,19 @@ public void execute(JobExecutionContext context)
         String lastHarvestDateStr = null;
 
         Task task;
-        task = store.getTask(taskName, taskType);
+        task = store.getTask(taskName, taskType, nodeId);
         // If a 'task' entry has not been saved for this task name yet, then a 'lastHarvested'
         // DataTime will not be available, in which case the 'startHarvestDataTime' from the
         // config file will be used.
-        if(task.getLastHarvestDatetime() == null) {
+        if(task.getLastHarvestDatetime(nodeId) == null) {
             task = new Task();
             task.setTaskName(taskName);
             task.setTaskType(taskType);
             lastHarvestDateStr = startHarvestDatetimeStr;
-            task.setLastHarvestDatetime(lastHarvestDateStr);
+            task.setLastHarvestDatetime(lastHarvestDateStr, nodeId);
         } else {
-            lastHarvestDateStr = task.getLastHarvestDatetime();
+            lastHarvestDateStr = task.getLastHarvestDatetime(nodeId);
         }
 
         DateTime lastHarvestDateDT = new DateTime(lastHarvestDateStr);
@@ -221,12 +226,6 @@ public void execute(JobExecutionContext context)
             startDT = new DateTime(lastHarvestDateDT);
         }
 
-//        DateTime endDT = new DateTime(startDT);
-//        endDT = endDT.plusDays(harvestDatetimeInc);
-//        if(endDT.isAfter(currentDT.toInstant())) {
-//            endDT = currentDT;
-//        }
-
         DateTime endDT = new DateTime(currentDT);
 
         // If the start and end harvest dates are the same (happens for a new node), then
@@ -274,7 +273,7 @@ public void execute(JobExecutionContext context)
                 log.trace("Getting portal pids to process, startCount: " + startCount + ", countRequested: " + countRequested);
 
                 try {
-                    result = getPidsToProcess(d1Node, session, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT);
+                    result = getPidsToProcess(d1Node, session, pidFilter, startDTstr, endDTstr, startCount, countRequested, lastDateModifiedDT, taskName);
                     pidsToProcess = result.getResult();
                     resultCount = result.getResultCount();
                     lastDateModifiedDT = result.getLastDateModified();
@@ -311,18 +310,18 @@ public void execute(JobExecutionContext context)
 
             if (allIds > 0) {
                 // Record the new "last harvested" date
-                task.setLastHarvestDatetime(dtfOut.print(lastDateModifiedDT));
+                task.setLastHarvestDatetime(dtfOut.print(lastDateModifiedDT), nodeId);
                 log.debug("Saving lastHarvestDate: " + dtfOut.print(lastDateModifiedDT));
                 try {
-                    store.saveTask(task);
+                    store.saveTask(task, nodeId);
                 } catch (MetadigStoreException mse) {
                     log.error("Error saving task: " + task.getTaskName());
                     JobExecutionException jee = new JobExecutionException("Unable to save new harvest date", mse);
                     jee.setRefireImmediately(false);
                     throw jee;
                 }
+                log.info(taskName + ": found " + allIds + " seriesIds" + " for start: " + startDTstr + ", end: " + endDTstr + " at serviceUrl: " + nodeServiceUrl);
             }
-            log.info(taskName + ": found " + allIds + " seriesIds" + " for start: " + startDTstr + ", end: " + endDTstr + " at servierUrl: " + nodeServiceUrl);
         }
         store.shutdown();
     }
@@ -344,7 +343,7 @@ public void execute(JobExecutionContext context)
      */
     public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session,
             String pidFilter, String startHarvestDatetimeStr, String endHarvestDatetimeStr,
-            int startCount, int countRequested, DateTime lastDateModifiedDT) throws Exception {
+            int startCount, int countRequested, DateTime lastDateModifiedDT, String taskName) throws Exception {
 
@@ -376,7 +375,7 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session,
             fieldXpath = xpath.compile("//result/doc/str[@name='seriesId']/text()");
             dateModifiedXpath = xpath.compile("//result/doc/date[@name='dateModified']/text()");
         } catch (XPathExpressionException xpe) {
-            log.error("Error extracting id from solr result doc: " + xpe.getMessage());
+            log.error(taskName + ": error extracting id from solr result doc: " + xpe.getMessage());
             metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage());
             metadigException.initCause(xpe);
             throw metadigException;
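The start and end window computed above can be restated as a short runnable sketch using Joda-Time, which the engine already uses. The stored "last harvest" value is faked here; note also that the sketch compares with equals() where the diff compares object references with ==, which appears to be the intended check.

    import org.joda.time.DateTime;
    import org.joda.time.DateTimeZone;

    // Condensed sketch of the harvest window: start from the stored per-node
    // 'last harvest' datetime (clamped to 'now' if it ran ahead), end at now,
    // and nudge the start back a minute when the two coincide so that the
    // DataONE listObjects call accepts the range.
    public class HarvestWindowSketch {
        public static void main(String[] args) {
            DateTime currentDT = new DateTime(DateTimeZone.UTC);
            DateTime lastHarvestDateDT = new DateTime("2020-08-01T00:00:00.000Z", DateTimeZone.UTC);

            DateTime startDT = lastHarvestDateDT.isAfter(currentDT.toInstant())
                    ? currentDT : lastHarvestDateDT;
            DateTime endDT = currentDT;
            if (startDT.equals(endDT)) {
                startDT = startDT.minusMinutes(1);
            }
            System.out.println("harvest window: " + startDT + " .. " + endDT);
        }
    }

After a successful harvest, the saved "last harvest" value is the newest dateModified seen plus one millisecond, so the next window starts just past the last pid already processed.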
@@ -396,7 +395,7 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session,
         try {
             xpathResult = (org.w3c.dom.NodeList) fieldXpath.evaluate(xmldoc, XPathConstants.NODESET);
         } catch (XPathExpressionException xpe) {
-            log.error("Error extracting seriesId from solr result doc: " + xpe.getMessage());
+            log.error(taskName + ": error extracting seriesId from solr result doc: " + xpe.getMessage());
             metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage());
             metadigException.initCause(xpe);
             throw metadigException;
@@ -416,7 +415,7 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session,
             try {
                 xpathResult = (org.w3c.dom.NodeList) dateModifiedXpath.evaluate(xmldoc, XPathConstants.NODESET);
             } catch (XPathExpressionException xpe) {
-                log.error("Error extracting dateModified from solr result doc: " + xpe.getMessage());
+                log.error(taskName + ": error extracting dateModified from solr result doc: " + xpe.getMessage());
                 metadigException = new MetadigProcessException("Unable to get collection pids: " + xpe.getMessage());
                 metadigException.initCause(xpe);
                 throw metadigException;
@@ -450,13 +449,13 @@ public ListResult getPidsToProcess(MultipartD1Node d1Node, Session session,
     }
 
     /**
-     * Submit a requst to the metadig controller to get qualiry score info and create a graph for the specified collection.
+     * Submit a request to the metadig controller to get quality score info and create a graph for the specified collection.
      *
-     * @param qualityServiceUrl
-     * @param collectionId
-     * @param suiteId
-     * @param nodeId
-     * @param formatFamily
+     * @param qualityServiceUrl the URL of the MetaDIG quality service
+     * @param collectionId the DataONE collection (portal) seriesId
+     * @param suiteId the quality suite to run for the collection
+     * @param nodeId the DataONE node identifier that the collection is hosted on
+     * @param formatFamily the format identifier family (e.g. "eml" for all EML format identifier versions)
      *
      * @throws Exception
     
     */
     public void submitScorerRequest(String qualityServiceUrl, String collectionId, String suiteId, String nodeId, String formatFamily) throws Exception {
 
         InputStream runResultIS = null;
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java
index 9958136c..8a83abce 100644
--- a/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java
+++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/DatabaseStore.java
@@ -9,6 +9,8 @@ import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.dataone.service.types.v1.*;
+import org.dataone.service.types.v2.Node;
 import org.dataone.service.util.TypeMarshaller;
 import org.springframework.core.io.Resource;
 import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
@@ -22,11 +24,11 @@ import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.sql.*;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
 import java.time.Instant;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Properties;
+import java.util.*;
+import java.util.Date;
 
 /**
  * Persistent storage for quality runs.
@@ -322,27 +324,24 @@ public void shutdown() {
     }
 
-    public void saveTask(Task task) throws MetadigStoreException {
+    public void saveTask(Task task, String nodeId) throws MetadigStoreException {
 
         PreparedStatement stmt = null;
 
         // Perform an 'upsert' on the 'nodes' table - if a record exists for the 'metadata_id, suite_id' already,
         // then update the record with the incoming data.
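Before the upsert itself, it may help to see how the per-node harvest dates are meant to round-trip through this store after the change. The sketch below uses only methods that appear in this patch series (Task.setLastHarvestDatetime(String, String), DatabaseStore.saveTask(Task, String), getTask(String, String, String)); the task name, node ids, dates, and the already-constructed 'store' instance are assumed for illustration.

    // Sketch (not engine code): a Task now carries one lastHarvestDatetime per
    // nodeId, and DatabaseStore persists one (task_name, task_type, node_id) row
    // per node in the node_harvest table.
    Task task = new Task();
    task.setTaskName("quality-knb");   // hypothetical task name
    task.setTaskType("quality");
    task.setLastHarvestDatetime("2020-08-20T00:00:00.000Z", "urn:node:KNB");
    task.setLastHarvestDatetime("2020-08-19T12:00:00.000Z", "urn:node:ARCTIC");

    store.saveTask(task, "urn:node:KNB");     // upserts the KNB row
    store.saveTask(task, "urn:node:ARCTIC");  // upserts the ARCTIC row

    Task restored = store.getTask("quality-knb", "quality", "urn:node:KNB");
    String last = restored.getLastHarvestDatetime("urn:node:KNB");

Keeping one row per node is what lets a single CN task resume each member node's harvest from its own last-seen modification date.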
try { - String sql = "INSERT INTO tasks (task_name, task_type, last_harvest_datetime) VALUES (?, ?, ?)" + String sql = "INSERT INTO tasks (task_name, task_type) VALUES (?, ?)" + " ON CONFLICT ON CONSTRAINT task_name_task_type" - + " DO UPDATE SET (task_name, task_type, last_harvest_datetime) = (?, ?, ?);"; + + " DO NOTHING"; stmt = conn.prepareStatement(sql); stmt.setString(1, task.getTaskName()); stmt.setString(2, task.getTaskType()); - stmt.setString(3, task.getLastHarvestDatetime()); - stmt.setString(4, task.getTaskName()); - stmt.setString(5, task.getTaskType()); - stmt.setString(6, task.getLastHarvestDatetime()); stmt.executeUpdate(); stmt.close(); conn.commit(); + saveNodeHarvest(task, nodeId); //conn.close(); } catch (SQLException e) { log.error( e.getClass().getName()+": "+ e.getMessage()); @@ -355,7 +354,7 @@ public void saveTask(Task task) throws MetadigStoreException { log.trace("Records created successfully"); } - public Task getTask(String taskName, String taskType) { + public Task getTask(String taskName, String taskType, String nodeId) { //return runs.get(id); Result result = new Result(); @@ -376,12 +375,13 @@ public Task getTask(String taskName, String taskType) { if(rs.next()) { task.setTaskName(rs.getString("task_name")); task.setTaskType(rs.getString("task_type")); - task.setLastHarvestDatetime(rs.getString("last_harvest_datetime")); rs.close(); stmt.close(); } else { log.trace("No results returned from query"); } + + task.setLastHarvestDatetimes(getNodeHarvestDatetimes(task.getTaskName(), task.getTaskType(), nodeId)); } catch ( Exception e ) { log.error( e.getClass().getName()+": "+ e.getMessage()); } @@ -389,6 +389,232 @@ public Task getTask(String taskName, String taskType) { return(task); } + public HashMap getNodeHarvestDatetimes(String taskName, String taskType, String nodeId) { + + //return runs.get(id); + Result result = new Result(); + PreparedStatement stmt = null; + String lastDT = null; + Task task = new Task(); + + HashMap nodeHarvestDates = new HashMap<>(); + // Select records from the 'nodes' table + try { + String sql = "select * from node_harvest where task_name = ? and task_type = ? and node_id = ?"; + stmt = conn.prepareStatement(sql); + stmt.setString(1, taskName); + stmt.setString(2, taskType); + stmt.setString(3, nodeId); + + log.trace("issuing query: " + sql); + ResultSet rs = stmt.executeQuery(); + while (rs.next()) { + nodeHarvestDates.put(nodeId, rs.getString("last_harvest_datetime")); + } + rs.close(); + stmt.close(); + } catch ( Exception e ) { + log.error( e.getClass().getName()+": "+ e.getMessage()); + } + + return(nodeHarvestDates); + } + + + public void saveNodeHarvest(Task task, String nodeId) throws MetadigStoreException { + + PreparedStatement stmt = null; + + // Perform an 'upsert' on the 'nodes' table - if a record exists for the 'metadata_id, suite_id' already, + // then update the record with the incoming data. 
+    public void saveNodeHarvest(Task task, String nodeId) throws MetadigStoreException {
+
+        PreparedStatement stmt = null;
+
+        // Perform an 'upsert' on the 'node_harvest' table - if a record already exists for this
+        // 'task_name, task_type, node_id' key, then update it with the incoming harvest datetime.
+        try {
+            String sql = "INSERT INTO node_harvest (task_name, task_type, node_id, last_harvest_datetime) VALUES (?, ?, ?, ?)"
+                    + " ON CONFLICT ON CONSTRAINT node_harvest_task_name_task_type_node_id_uc"
+                    + " DO UPDATE SET (task_name, task_type, node_id, last_harvest_datetime) = (?, ?, ?, ?);";
+
+            stmt = conn.prepareStatement(sql);
+            stmt.setString(1, task.getTaskName());
+            stmt.setString(2, task.getTaskType());
+            stmt.setString(3, nodeId);
+            stmt.setString(4, task.getLastHarvestDatetime(nodeId));
+            stmt.setString(5, task.getTaskName());
+            stmt.setString(6, task.getTaskType());
+            stmt.setString(7, nodeId);
+            stmt.setString(8, task.getLastHarvestDatetime(nodeId));
+            stmt.executeUpdate();
+            stmt.close();
+            conn.commit();
+            //conn.close();
+        } catch (SQLException e) {
+            log.error( e.getClass().getName()+": "+ e.getMessage());
+            MetadigStoreException me = new MetadigStoreException("Unable to save the last harvest date to the database.");
+            me.initCause(e);
+            throw(me);
+        }
+
+        log.trace("Records created successfully");
+    }
+
+    /**
+     * Save a DataONE node entry to the 'nodes' table, updating the existing record for this
+     * node 'identifier' if one is already present.
+     */
+    public void saveNode(Node node) throws MetadigStoreException {
+
+        PreparedStatement stmt = null;
+
+        // Perform an 'upsert' on the 'nodes' table - if a record already exists for this node
+        // 'identifier', then update it with the incoming data.
+        try {
+            String sql = "INSERT INTO nodes " +
+                    " (identifier, name, type, state, synchronize, last_harvest, baseURL) VALUES (?, ?, ?, ?, ?, ?, ?) " +
+                    " ON CONFLICT ON CONSTRAINT node_id_pk DO UPDATE SET " +
+                    " (identifier, name, type, state, synchronize, last_harvest, baseURL) = (?, ?, ?, ?, ?, ?, ?);";
+
+            DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+            dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
+            String lastHarvestDatetimeStr = dateFormat.format(node.getSynchronization().getLastHarvested());
+
+            stmt = conn.prepareStatement(sql);
+            stmt.setString(1, node.getIdentifier().getValue());
+            stmt.setString(2, node.getName());
+            stmt.setString(3, node.getType().toString());
+            stmt.setString(4, node.getState().toString());
+            stmt.setBoolean(5, node.isSynchronize());
+            stmt.setString(6, lastHarvestDatetimeStr);
+            stmt.setString(7, node.getBaseURL());
+            stmt.setString(8, node.getIdentifier().getValue());
+            stmt.setString(9, node.getName());
+            stmt.setString(10, node.getType().toString());
+            stmt.setString(11, node.getState().toString());
+            stmt.setBoolean(12, node.isSynchronize());
+            stmt.setString(13, lastHarvestDatetimeStr);
+            stmt.setString(14, node.getBaseURL());
+            stmt.executeUpdate();
+            stmt.close();
+            conn.commit();
+        } catch (SQLException e) {
+            log.error( e.getClass().getName()+": "+ e.getMessage());
+            MetadigStoreException me = new MetadigStoreException("Unable to save node " + node.getIdentifier().getValue() + " to database.");
+            me.initCause(e);
+            throw(me);
+        }
+
+        log.trace("Records created successfully");
+    }
+
+    /**
+     * Get the cached entry for one DataONE node from the 'nodes' table.
+     */
+    public Node getNode(String nodeId) {
+
+        PreparedStatement stmt = null;
+        Node node = new Node();
+
+        // Select the record for this node from the 'nodes' table
+        try {
+            log.trace("preparing statement for query");
+            String sql = "select * from nodes where identifier = ?";
+            stmt = conn.prepareStatement(sql);
+            stmt.setString(1, nodeId);
+
+            log.trace("issuing query: " + sql);
+            ResultSet rs = stmt.executeQuery();
+            if(rs.next()) {
+                node = extractNodeFields(rs);
+                rs.close();
+                stmt.close();
+            } else {
+                log.trace("No results returned for nodeId: " + nodeId);
+            }
+        } catch ( Exception e ) {
+            log.error( e.getClass().getName()+": "+ e.getMessage());
+        }
+
+        return(node);
+    }
+
+    /**
+     * Get all DataONE node entries cached in the 'nodes' table.
+     */
+    public ArrayList<Node> getNodes() {
+
+        PreparedStatement stmt = null;
+        ArrayList<Node> nodes = new ArrayList<>();
+        ResultSet rs = null;
+        Node node;
+
+        // Select all records from the 'nodes' table
+        try {
+            log.trace("preparing statement for query");
+            String sql = "select * from nodes;";
+            stmt = conn.prepareStatement(sql);
+
+            log.trace("issuing query: " + sql);
+            rs = stmt.executeQuery();
+            while(rs.next()) {
+                node = extractNodeFields(rs);
+                nodes.add(node);
+            }
+        } catch ( Exception e ) {
+            log.error(e.getClass().getName() + ": " + e.getMessage());
+        }
+
+        try {
+            rs.close();
+            stmt.close();
+        } catch (Exception e) {
+            log.error("Error closing statement or result set: " + e.getMessage());
+        }
+
+        log.trace(nodes.size() + " nodes found in node table.");
+
+        return(nodes);
+    }
+
+    // Copy the fields of one 'nodes' table row into a new DataONE Node object.
+    public Node extractNodeFields(ResultSet resultSet) {
+
+        Node node = new Node();
+        try {
+            NodeReference nodeReference = new NodeReference();
+            nodeReference.setValue(resultSet.getString("identifier"));
+            node.setIdentifier(nodeReference);
+            node.setName(resultSet.getString("name"));
+
+            switch (resultSet.getString("type")) {
+                case "CN":
+                    node.setType(NodeType.CN);
+                    break;
+                case "MN":
+                    node.setType(NodeType.MN);
+                    break;
+                case "MONITOR":
+                    node.setType(NodeType.MONITOR);
+                    break;
+            }
+
+            switch (resultSet.getString("state")) {
+                case "UP":
+                    node.setState(NodeState.UP);
+                    break;
+                case "DOWN":
+                    node.setState(NodeState.DOWN);
+                    break;
+                default:
+                    node.setState(NodeState.UNKNOWN);
+                    break;
+            }
+
+            node.setSynchronize(resultSet.getBoolean("synchronize"));
+
+            Synchronization synchronization = new Synchronization();
+            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+            formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
+            Date lastHarvestDate = formatter.parse(resultSet.getString("last_harvest"));
+            synchronization.setLastHarvested(lastHarvestDate);
+            node.setSynchronization(synchronization);
+
+            node.setBaseURL(resultSet.getString("baseURL"));
+        } catch (java.sql.SQLException | java.text.ParseException e) {
+            log.error("Error retrieving node from database: " + e);
+        }
+
+        return node;
+    }
+
     @Override
     public void createRun(Run run) {
         runs.put(run.getId(), run);
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java
index af7637a0..e3f47e7a 100644
--- a/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java
+++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/InMemoryStore.java
@@ -9,6 +9,7 @@
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.dataone.service.types.v2.Node;
 import org.springframework.core.io.Resource;
 import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
 import org.xml.sax.SAXException;
@@ -16,6 +17,7 @@
 import javax.xml.bind.JAXBException;
 import java.io.IOException;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Map;
@@ -210,13 +212,22 @@ public void deleteRun(Run run) {
 
 //    public void saveNode(Node node) throws MetadigStoreException { }
 
     @Override
-    public Task getTask(String taskName, String taskType) { return new Task(); }
+    public Task getTask(String taskName, String taskType, String nodeId) { return new Task(); }
 
     @Override
-    public void saveTask(Task task) throws MetadigStoreException { }
+    public void saveTask(Task task, String nodeId) throws MetadigStoreException { }
 
     @Override
     public void shutdown() {};
 
+    @Override
+    public Node getNode(String nodeId) { return new Node(); };
+
+    @Override
+    public void saveNode(Node node) throws MetadigStoreException {};
+
+    @Override
+    public ArrayList<Node> getNodes() { return new ArrayList<>(); };
+
 }
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java
index b9796c29..ad64d726 100644
--- a/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java
+++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/MDQStore.java
@@ -2,8 +2,11 @@
 import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException;
 import edu.ucsb.nceas.mdqengine.model.*;
+import org.dataone.service.types.v2.Node;
 
+import java.util.ArrayList;
 import java.util.Collection;
 
 public interface MDQStore {
@@ -30,7 +33,12 @@ public interface MDQStore {
     boolean isAvailable();
     void renew() throws MetadigStoreException;
 
-    Task getTask(String taskName, String taskType);
-    void saveTask(Task task) throws MetadigStoreException;
+    Task getTask(String taskName, String taskType, String nodeId);
+    void saveTask(Task task, String nodeId) throws MetadigStoreException;
+
+    Node getNode(String nodeId);
+    void saveNode(Node node) throws MetadigStoreException;
+
+    ArrayList<Node> getNodes();
 }
diff --git a/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java b/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java
index 4613577e..593e3e36 100644
--- a/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java
+++ b/src/main/java/edu/ucsb/nceas/mdqengine/store/MNStore.java
@@ -19,6 +19,7 @@
 import org.dataone.service.types.v1.Session;
 import org.dataone.service.types.v1.Subject;
 import org.dataone.service.types.v1.util.ChecksumUtil;
+import org.dataone.service.types.v2.Node;
 import org.dataone.service.types.v2.SystemMetadata;
 
 import javax.xml.bind.JAXBException;
@@ -328,17 +329,11 @@ public void deleteRun(Run run) {
     @Override
     public void renew() {}
 
-//    @Override
-//    public Node getNode(String nodeId, String jobName) { return new Node(); }
-//
-//    @Override
-//    public void saveNode(Node node) throws MetadigStoreException { }
-
     @Override
-    public Task getTask(String taskName, String taskType) { return new Task(); }
+    public Task getTask(String taskName, String taskType, String nodeId) { return new Task(); }
 
     @Override
-    public void saveTask(Task task) throws MetadigStoreException { }
+    public void saveTask(Task task, String nodeId) throws MetadigStoreException { }
 
     @Override
     public void shutdown() {};
@@ -346,4 +341,14 @@ public void saveTask(Task task) throws MetadigStoreException { }
 
     @Override
     public void saveRun(Run run) {}
 
+    @Override
+    public Node getNode(String nodeId) { return new Node(); };
+
+    @Override
+    public void saveNode(Node node) throws MetadigStoreException {};
+
+    @Override
+    public ArrayList<Node> getNodes() { return new ArrayList<>(); };
+
 }
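Taken together, the three stores above implement the new per-node harvest bookkeeping. A minimal sketch of how a harvest task might drive it end to end: getTask(), saveTask() and Task.getLastHarvestDatetime() appear in the diffs above, while the no-argument DatabaseStore constructor and the single-node Task setter used here are hypothetical, named only for illustration.

    import edu.ucsb.nceas.mdqengine.exception.MetadigStoreException;
    import edu.ucsb.nceas.mdqengine.model.Task;
    import edu.ucsb.nceas.mdqengine.store.DatabaseStore;
    import edu.ucsb.nceas.mdqengine.store.MDQStore;

    public class PerNodeHarvestSketch {
        public static void main(String[] args) throws MetadigStoreException {
            String nodeId = "urn:node:KNB";
            MDQStore store = new DatabaseStore(); // hypothetical no-arg constructor

            // getTask() now also loads the per-node harvest dates from 'node_harvest'.
            Task task = store.getTask("quality-knb", "quality", nodeId);
            String lastHarvest = task.getLastHarvestDatetime(nodeId);
            System.out.println("Harvesting pids modified since " + lastHarvest);

            // ... query the DataONE 'listObjects' service from lastHarvest forward ...

            // Record the new harvest position for this node only; saveTask() upserts
            // the 'tasks' row and delegates the datetime to saveNodeHarvest().
            task.setLastHarvestDatetime(nodeId, "2020-08-28T00:00:00.000Z"); // hypothetical setter
            store.saveTask(task, nodeId);
            store.shutdown();
        }
    }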
diff --git a/src/main/resources/sql/quality-v2.3.0.sql b/src/main/resources/sql/quality-v2.3.0.sql
index 3c4e7dfb..45a26865 100644
--- a/src/main/resources/sql/quality-v2.3.0.sql
+++ b/src/main/resources/sql/quality-v2.3.0.sql
@@ -24,12 +24,22 @@ alter table identifiers owner to metadig;
 create table tasks (
    task_name TEXT not null,
    task_type TEXT not null,
-   last_harvest_datetime TEXT not null,
    CONSTRAINT task_name_task_type PRIMARY KEY (task_name, task_type)
 );
 
 alter table tasks owner to metadig;
 
+create table node_harvest (
+   task_name TEXT not null,
+   task_type TEXT not null,
+   node_id TEXT not null,
+   last_harvest_datetime TEXT not null,
+   CONSTRAINT node_harvest_task_name_task_type_fk FOREIGN KEY (task_name, task_type) REFERENCES tasks (task_name, task_type),
+   CONSTRAINT node_harvest_task_name_task_type_node_id_uc UNIQUE (task_name, task_type, node_id)
+);
+
+alter table node_harvest owner to metadig;
+
 create TABLE runs (
    metadata_id TEXT not null,
    suite_id TEXT not null,
@@ -62,3 +72,17 @@ create TABLE filestore (
 
 alter table filestore owner to metadig;
 
+create TABLE nodes (
+   identifier TEXT not null,
+   name TEXT not null,
+   type TEXT not null,
+   state TEXT not null,
+   synchronize boolean not null,
+   last_harvest TEXT not null,
+   baseURL TEXT not null,
+   CONSTRAINT node_id_pk PRIMARY KEY (identifier)
+);
+
+alter table nodes owner to metadig;
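The 'nodes' table caches the DataONE node registry, and the 'nodelist' task in taskList.csv (next patch) refreshes it hourly from a CN. A minimal sketch of such a refresh, again assuming the hypothetical no-argument DatabaseStore constructor; the DataONE client classes are the same ones Scorer.java already imports:

    import edu.ucsb.nceas.mdqengine.store.DatabaseStore;
    import org.dataone.client.rest.DefaultHttpMultipartRestClient;
    import org.dataone.client.v2.impl.MultipartCNode;
    import org.dataone.service.types.v2.Node;

    public class NodeListRefreshSketch {
        public static void main(String[] args) throws Exception {
            // Read the current node registry from a CN; listNodes() needs no authentication.
            MultipartCNode cn = new MultipartCNode(new DefaultHttpMultipartRestClient(),
                    "https://cn.dataone.org/cn");

            // Upsert each node into the 'nodes' table via the new saveNode() method.
            // Note: saveNode() expects synchronization info to be present on each node.
            DatabaseStore store = new DatabaseStore(); // hypothetical no-arg constructor
            for (Node node : cn.listNodes().getNodeList()) {
                store.saveNode(node);
            }
            store.shutdown();
        }
    }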
From 54a1c4efa885c4435787f1185540bd5df8618352 Mon Sep 17 00:00:00 2001
From: gothub
Date: Wed, 2 Sep 2020 16:02:18 -0700
Subject: [PATCH 47/47] Add portal harvest task for mn-ucsb-1 (#256)

This is the current taskList.csv, which includes additional entries for mn-ucsb-1
---
 src/main/resources/configuration/taskList.csv | 51 +++++++++++++++----
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/src/main/resources/configuration/taskList.csv b/src/main/resources/configuration/taskList.csv
index e1351e9a..80976d46 100644
--- a/src/main/resources/configuration/taskList.csv
+++ b/src/main/resources/configuration/taskList.csv
@@ -1,9 +1,8 @@
 task-type,task-name,task-group,cron-schedule,params
-# task type, task name, task group, cron schedule, "formatId filter (regex); suite id; node id; D1 node base url; harvest begin date; harvest increment (days);requestCount"
-# - task type: currently 'quality' and 'score' task are supported.
-# - task name: any unique string, i.e. 'quality-knb'
-# - task group: currently only 'metadig' is used
-# - nodeId
+# task type, job name, job group, cron schedule, "formatId filter (regex); suite id; node id; D1 node base url; harvest begin date; harvest increment (days);requestCount"
+# - task type: the type of task: 'quality', 'score', 'filestore' or 'nodelist'
+# - job name: any unique string, e.g. 'quality-knb'
+# - job group: currently only 'metadig' is used
 # - cron schedule:
 #   - seconds, minutes, hours, day of month, month, day of week, year
 # - params
 #   - formatId filter: the formatIds of pids to process, specified as a regular expression
 #   - suite id: the metadig suite id
 #   - node id: a DataONE node URN - data will be filtered using this (DataONE sysmeta "datasource")
 #   - D1 node base url: the base service URL for an MN or CN that will be used to query for pids to be processed
 #   - harvest begin date: the first date to use for the DataONE 'listObjects' service
 #   - harvest increment (days): the time span for each search
 #   - requestCount: the number of items to request from DataONE listObjects
-score,score-DataONE-fair,metadig,35 0/1 * * * ?,".*portal.*;FAIR.suite.1;urn:node:CN;2019-12-01T00:00:00.00Z;1;100;refresh"
-quality,quality-arctic,metadig,20 0/1 * * * ?,"^eml.*|^http.*eml.*;arctic.data.center.suite.1;urn:node:ARCTIC;1;100"
-filestore,ingest,metadig,0 0/1 * * * ?,"stage;;*.*;README.txt;filestore-ingest.log"
-
+# - requestType: for score tasks, determines the type of portal processing ("portal" or "node")
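+#
+# Example (hypothetical entry for illustration, not an active task): score portal documents
+# on a fictitious node 'urn:node:EXAMPLE' with the FAIR suite, firing at second 25 of every
+# minute and harvesting forward from 2020-08-28 in 1-day windows of up to 100 pids per request:
+#   score,portal-EXAMPLE-FAIR,metadig,25 0/1 * * * ?,"*portals*;FAIR-suite-0.3.1;urn:node:EXAMPLE;2020-08-28T00:00:00.00Z;1;100;portal"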
+#
+# Dataset quality scoring tasks
+quality,quality-knb,metadig,0 0/1 * * * ?,"^eml.*|^http.*eml.*;knb.suite.1;urn:node:KNB;2020-08-28T14:05:48.764Z;1;1000"
+quality,quality-arctic,metadig,5 0/1 * * * ?,"^eml.*|^http.*eml.*;arctic.data.center.suite.1;urn:node:ARCTIC;2020-08-27T00:00:00.000Z;1;1000"
+quality,quality-dataone-fair,metadig,10 0/1 * * * ?,"^eml.*|^http.*eml.*|.*www.isotc211.org.*;FAIR-suite-0.3.1;urn:node:CN;2020-08-28T00:00:00.000Z;1;1000"
+quality,quality-ess-dive,metadig,15 0/1 * * * ?,"^eml.*|^http.*eml.*;ess-dive.data.center.suite.1;urn:node:ESS_DIVE;2020-08-27T20:38:19.953Z;1;1000;"
+#
+# Portal scoring tasks
+score,portal-KNB-FAIR,metadig,5 0/1 * * * ?,"*portals*;FAIR-suite-0.3.1;urn:node:KNB;2020-08-28T00:00:00.00Z;1;100;portal"
+score,portal-ARCTIC-FAIR,metadig,10 0/1 * * * ?,"*portals*;FAIR-suite-0.3.1;urn:node:ARCTIC;2020-08-28T00:00:00.00Z;1;100;portal"
+score,portal-mnUCSB1-FAIR,metadig,15 0/1 * * * ?,"*portals*;FAIR-suite-0.3.1;urn:node:mnUCSB1;2020-08-28T00:00:00.00Z;1;100;portal"
+#
+# Note: Portal harvesting for DataONE portals created on search.dataone.org will be performed on mnUCSB1, as MetacatUI sends create and
+# update requests made on search.dataone.org to this host. We want to harvest portals as soon as they are created, without waiting for
+# mnUCSB1 to sync to the CN and for the CN to index them, so the following entry is obsolete and no longer used:
+# score,portal-CN-FAIR,metadig,35 0/1 * * * ?,"*portals*;FAIR.suite-0.3.1;urn:node:CN;2020-08-24T00:00:00.00Z;1;100;portal"
+#
+# Tasks for creating member node metadata assessment graphs
+score,mn-portal-ARCTIC-FAIR,metadig,0 0 2 * * ?,";FAIR-suite-0.3.1;urn:node:ARCTIC;2020-08-28T00:00:00.00Z;1;1000;node"
+score,mn-portal-KNB-FAIR,metadig,0 1 2 * * ?,";FAIR-suite-0.3.1;urn:node:KNB;2020-08-28T00:00:00.00Z;1;1000;node"
+score,mn-portal-ESS-DIVE-FAIR,metadig,0 2 2 * * ?,";FAIR-suite-0.3.1;urn:node:ESS_DIVE;2020-08-28T00:00:00.00Z;1;1000;node"
+score,mn-portal-CA_OPC-FAIR,metadig,0 3 2 * * ?,";FAIR-suite-0.3.1;urn:node:CA_OPC;2020-08-28T00:00:00.00Z;1;1000;node"
+score,mn-portal-DataONE-FAIR,metadig,0 4 2 * * ?,";FAIR-suite-0.3.1;urn:node:CN;2020-08-28T00:00:00.00Z;1;1000;node"
+#
+# Task for ingesting files into the file store from /data/metadig/store/stage/{code,data,graph,metadata}
+# filestore,ingest,metadig,0 0/1 * * * ?,"stage;;*.*;README.txt;filestore-ingest.log"
+#
+# Admin NOTE: it appears that the DataONE HttpMultipartRestClient can't handle two clients being created at the same time, even when
+# they are created by different threads. This needs to be investigated further, and potentially a bug should be logged in Redmine.
+# Until then, an easy workaround is to ensure that no two tasks start at the same time, so adjust the cron schedules accordingly.
+#
+# Node list from DataONE
+nodelist,MN-NODE-LIST,metadig,0 0 0/1 * * ?,"urn:node:CN"
\ No newline at end of file