diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index d6fdf6e4..ea17fce0 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,5 +1,27 @@ # dataone-indexer Release Notes +## dataone-indexer version 3.1.0 & helm chart version 1.1.0 + +* Release date: 2024-11-21 +* dataone-indexer version 3.1.0 + * Integrate with the new DataONE hash-based storage library + [`hashstore-java`](https://github.com/DataONEorg/hashstore-java). + * Indexer no longer needs access to an auth token in order to index private datasets. + * Update Docker base image to eclipse-temurin:17.0.12_7-jre-jammy + * Upgrade log4j-core to 2.24.0 to fix "method can't be found" issue + * Bump dependencies: + * org.apache.commons:commons-lang3 from 3.4 to 3.17.0 + * org.slf4j:slf4j-api from 1.7.36 to 2.0.16 + * org.springframework.data:spring-data-commons from 1.6.5.RELEASE to 3.3.4 + * org.apache.maven.plugins:maven-compiler-plugin from 2.0.1 to 3.13.0 + * com.coderplus.maven.plugins:copy-rename-maven-plugin from 1.0 to 1.0.1 + * org.apache.logging.log4j:log4j-jcl from 2.17.1 to 2.24.0 + * org.apache.maven.plugins:maven-clean-plugin from 3.2.0 to 3.4.0 + * com.fasterxml.jackson.core:jackson-annotations from 2.13.3 to 2.17.2 +* helm chart version 1.1.0 + * Bump Application version to 3.1.0 + * Add `storage` to values.yaml for new hashstore integration + ## dataone-indexer version 3.0.2 & helm chart version 1.0.2 * Release date: 2024-07-29 diff --git a/docker/Dockerfile b/docker/Dockerfile index 354a7937..3e012d47 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,7 +2,7 @@ # `nerdctl build -t dataone-index-worker:2.4.0 -f docker/Dockerfile --build-arg TAG=2.4.0 .` # Use an OpenJDK runtime as a parent image # Note: the prior alpine-based openjdk image had network DNS issues, so replacing with Eclipse Temurin -FROM eclipse-temurin:17.0.8.1_1-jre-jammy +FROM eclipse-temurin:17.0.12_7-jre-jammy ARG TAG=3.0.0-SNAPSHOT ENV TAG=${TAG} @@ -23,7 +23,7 @@ RUN groupadd -g 1000 d1indexer && useradd -u 
1000 -g 1000 d1indexer \ # The most recently built jar file is copied from the maven build directory to this dir by maven, so that # it can be copied to the image. -COPY ../target/dataone-index-worker-${TAG}-shaded.jar . +COPY ./target/dataone-index-worker-${TAG}-shaded.jar . COPY ./docker/entrypoint.sh . # Change the ownership of the jar and sh files diff --git a/helm/Chart.yaml b/helm/Chart.yaml index 2f92eb41..44f72768 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -21,13 +21,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.0.2 +version: "1.1.0" # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
-appVersion: "3.0.2" +appVersion: "3.1.0" # Chart dependencies dependencies: diff --git a/helm/config/dataone-indexer.properties b/helm/config/dataone-indexer.properties index c51ce0aa..5c72e7c3 100644 --- a/helm/config/dataone-indexer.properties +++ b/helm/config/dataone-indexer.properties @@ -40,3 +40,12 @@ index.resourcemap.waitingComponent.time={{ default 800 .Values.idxworker.resourc index.resourcemap.waitingComponent.max.attempts={{ default 25 .Values.idxworker.resourcemapMaxTries }} index.solr.versionConflict.waiting.time={{ default 1000 .Values.idxworker.solrVerConflictWaitMs }} index.solr.versionConflict.max.attempts={{ default 50 .Values.idxworker.solrVerConflictMaxTries }} + +# Storage properties +storage.className={{ default "org.dataone.hashstore.filehashstore.FileHashStore" .Values.idxworker.storage.hashStoreClassName }} +storage.hashstore.rootDirectory={{ default "/var/metacat/hashstore" .Values.idxworker.storage.hashStoreRootDir }} +storage.hashstore.defaultNamespace={{ default "https://ns.dataone.org/service/types/v2.0#SystemMetadata" .Values.idxworker.storage.hashStoreDefaultNamespace }} +# The following three properties must NOT be modified after the hash store is initialized +storage.hashstore.fileNameAlgorithm={{ default "SHA-256" .Values.idxworker.storage.hashStoreAlgorithm }} +storage.hashstore.directory.width={{ default 2 .Values.idxworker.storage.hashStoreDirWidth }} +storage.hashstore.directory.depth={{ default 3 .Values.idxworker.storage.hashStoreDirDepth }} diff --git a/helm/values.yaml b/helm/values.yaml index fe5f099b..983dff71 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -32,10 +32,23 @@ global: ## @section Dataone-Indexer Application-Specific Properties image: + ## @param image.repository repository that the image will be pulled from + ## repository: ghcr.io/dataoneorg/dataone-index-worker + + ## @param image.pullPolicy image pull policy - Always, Never, or IfNotPresent + ## pullPolicy: IfNotPresent - # Overrides the image 
tag whose default is the chart appVersion. - #tag: "" + + ## @param image.tag Overrides the image tag. Will default to the chart appVersion if set to "" + ## + tag: "" + + ## @param image.debug Specify if container debugging should be enabled (sets log level to "DEBUG") + ## Set to true if you would like to see extra information in metacat/tomcat logs. + ## * * WARNING - FOR TESTING ONLY! * * May result in secrets being printed to logs in plain text. + ## + debug: false imagePullSecrets: [] @@ -181,6 +194,15 @@ idxworker: ## tripleDbDirectory: /etc/dataone/tdb-cache + storage: + hashStoreClassName: "org.dataone.hashstore.filehashstore.FileHashStore" + hashStoreRootDir: "/var/metacat/hashstore" + hashStoreDefaultNamespace: "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + # The following three properties must NOT be modified after the hashstore is initialized + hashStoreAlgorithm: "SHA-256" + hashStoreDirWidth: 2 + hashStoreDirDepth: 3 + ## @section RabbitMQ Bitnami Sub-Chart Configuration ## rabbitmq: @@ -200,7 +222,7 @@ rabbitmq: ## @param rabbitmq.existingPasswordSecret the k8s secret holding the rabbitmq password ## (must be associated with key: 'rabbitmq-password') ## - existingPasswordSecret: "" + existingPasswordSecret: "" ## @section Solr Bitnami Sub-Chart Configuration diff --git a/pom.xml b/pom.xml index b78f79c4..e635f78f 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.dataone dataone-index-worker - 3.0.2 + 3.1.0 jar dataone-index-worker http://maven.apache.org @@ -12,7 +12,7 @@ 17 17 UTF-8 - 5.3.33 + 5.3.39 2.3.1 8.11.2 solr8home @@ -67,7 +67,7 @@ junit junit - 4.12 + 4.13.2 test @@ -79,6 +79,18 @@ log4j log4j + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-api + + + org.apache.logging.log4j + log4j-1.2-api + test @@ -91,18 +103,30 @@ log4j log4j + + org.apache.logging.log4j + log4j-core + + + org.apache.logging.log4j + log4j-api + + + org.apache.logging.log4j + log4j-1.2-api + test 
commons-beanutils commons-beanutils - 1.8.3 + 1.9.4 com.h2database h2 - 1.3.163 + 2.3.232 test @@ -128,12 +152,12 @@ org.springframework.data spring-data-jpa - 1.4.5.RELEASE + 3.3.4 org.springframework.data spring-data-commons - 1.6.5.RELEASE + 3.3.4 org.springframework @@ -188,12 +212,12 @@ org.apache.commons commons-lang3 - 3.4 + 3.17.0 org.slf4j slf4j-api - 1.7.36 + 2.0.16 org.slf4j @@ -209,7 +233,7 @@ net.minidev json-smart - 1.0.9 + 2.5.1 org.apache.jena @@ -229,7 +253,7 @@ commons-collections commons-collections - 3.2.1 + 3.2.2 ch.hsr @@ -254,17 +278,17 @@ org.apache.logging.log4j log4j-1.2-api - 2.17.1 + 2.24.0 org.apache.logging.log4j log4j-core - 2.17.1 + 2.24.0 org.apache.logging.log4j log4j-jcl - 2.17.1 + 2.24.0 org.apache.solr @@ -274,7 +298,7 @@ com.fasterxml.jackson.core jackson-annotations - 2.13.3 + 2.17.2 @@ -288,12 +312,17 @@ jaxb-runtime 2.3.2 + + org.dataone + hashstore + 1.1.0 + maven-clean-plugin - 3.2.0 + 3.4.0 @@ -318,7 +347,7 @@ org.apache.maven.plugins maven-compiler-plugin - 2.0.1 + 3.13.0 com.mycila.maven-license-plugin @@ -398,7 +427,7 @@ com.coderplus.maven.plugins copy-rename-maven-plugin - 1.0 + 1.0.1 copy-file @@ -430,6 +459,15 @@ + + + + + org.apache.maven.wagon + wagon-ssh-external + 3.5.3 + + https://repository.dataone.org/software/cicore @@ -443,4 +481,15 @@ LICENSE.txt + + + + + + + dataone.org + DataONE Repository + scpexe://maven.dataone.org/var/www/maven + + diff --git a/src/main/java/org/dataone/cn/indexer/IndexWorker.java b/src/main/java/org/dataone/cn/indexer/IndexWorker.java index 754040b6..3f5c6094 100644 --- a/src/main/java/org/dataone/cn/indexer/IndexWorker.java +++ b/src/main/java/org/dataone/cn/indexer/IndexWorker.java @@ -427,23 +427,21 @@ private void indexObject(IndexQueueMessageParser parser, boolean multipleThread) Identifier pid = parser.getIdentifier(); String indexType = parser.getIndexType(); int priority = parser.getPriority(); - String finalFilePath = parser.getObjectPath(); try { long threadId = 
Thread.currentThread().getId(); logger.info("IndexWorker.consumer.indexObject by multiple thread? " + multipleThread + ", with the thread id " + threadId + " - Received the index task from the index queue with the identifier: " + pid.getValue() + " , the index type: " + indexType - + ", the file path (null means not to have): " + finalFilePath + ", the priority: " + priority); switch (indexType) { case CREATE_INDEXT_TYPE -> { boolean sysmetaOnly = false; - solrIndex.update(pid, finalFilePath, sysmetaOnly); + solrIndex.update(pid, sysmetaOnly); } case SYSMETA_CHANGE_TYPE -> { boolean sysmetaOnly = true; - solrIndex.update(pid, finalFilePath, sysmetaOnly); + solrIndex.update(pid, sysmetaOnly); } case DELETE_INDEX_TYPE -> solrIndex.remove(pid); default -> throw new InvalidRequest( @@ -455,7 +453,6 @@ private void indexObject(IndexQueueMessageParser parser, boolean multipleThread) logger.info("IndexWorker.indexOjbect with the thread id " + threadId + " - Completed the index task from the index queue with the identifier: " + pid.getValue() + " , the index type: " + indexType - + ", the file path (null means not to have): " + finalFilePath + ", the priority: " + priority + " and the time taking is " + (end - start) + " milliseconds"); diff --git a/src/main/java/org/dataone/cn/indexer/SolrIndex.java b/src/main/java/org/dataone/cn/indexer/SolrIndex.java index 104a158a..b07211e0 100644 --- a/src/main/java/org/dataone/cn/indexer/SolrIndex.java +++ b/src/main/java/org/dataone/cn/indexer/SolrIndex.java @@ -24,6 +24,7 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collection; @@ -109,8 +110,10 @@ public class SolrIndex { * @throws SAXException * @throws IOException */ - public SolrIndex(XMLNamespaceConfig xmlNamespaceConfig, BaseXPathDocumentSubprocessor systemMetadataProcessor, HTTPService httpService) - throws 
XPathExpressionException, ParserConfigurationException, IOException, SAXException { + public SolrIndex(XMLNamespaceConfig xmlNamespaceConfig, + BaseXPathDocumentSubprocessor systemMetadataProcessor, HTTPService httpService) + throws XPathExpressionException, ParserConfigurationException, + IOException, SAXException { this.xmlNamespaceConfig = xmlNamespaceConfig; this.systemMetadataProcessor = systemMetadataProcessor; this.httpService = httpService; @@ -121,7 +124,8 @@ private void init() throws ParserConfigurationException, XPathExpressionExceptio sysmetaSolrFields = systemMetadataProcessor.getFieldList(); copyFields = httpService.getSolrCopyFields(); if (copyFields != null) { - log.info("SolrIndex.init - the size of the copy fields from the solr schema is : " + copyFields.size()); + log.info("SolrIndex.init - the size of the copy fields from the solr schema is : " + + copyFields.size()); for(String copyField : copyFields) { log.debug("SolrIndex.init - the copy field from the solr schema: " + copyField); } @@ -166,10 +170,9 @@ public void setDeleteSubprocessors( /** * Generate the index for the given information - * @param id - * @param systemMetadata - * @param dataStream - * @return + * @param id the id which will be indexed + * @param isSystemetaChange if this is a change on the system metadata only + * @return a map of solr doc with ids * @throws IOException * @throws SAXException * @throws ParserConfigurationException @@ -181,24 +184,21 @@ public void setDeleteSubprocessors( * @throws NotFound * @throws NotImplemented */ - private Map process(String id, SystemMetadata systemMetadata, String objectPath, boolean isSysmetaChangeOnly) - throws IOException, SAXException, ParserConfigurationException, - XPathExpressionException, MarshallingException, EncoderException, SolrServerException, NotImplemented, NotFound, UnsupportedType{ + private Map process(String id, boolean isSysmetaChangeOnly) + throws IOException, SAXException, ParserConfigurationException, + 
XPathExpressionException, MarshallingException, EncoderException, + SolrServerException, NotImplemented, NotFound, UnsupportedType{ log.debug("SolrIndex.process - trying to generate the solr doc object for the pid "+id); long start = System.currentTimeMillis(); Map docs = new HashMap(); // Load the System Metadata document - ByteArrayOutputStream systemMetadataOutputStream = new ByteArrayOutputStream(); - TypeMarshaller.marshalTypeToOutputStream(systemMetadata, systemMetadataOutputStream); - ByteArrayInputStream systemMetadataStream = new ByteArrayInputStream(systemMetadataOutputStream.toByteArray()); - try { + try (InputStream systemMetadataStream = ObjectManager.getInstance().getSystemMetadataStream(id)){ docs = systemMetadataProcessor.processDocument(id, docs, systemMetadataStream); } catch (Exception e) { log.error(e.getMessage(), e); throw new SolrServerException(e.getMessage()); } long end = System.currentTimeMillis(); - //log.info("SolrIndex.process - the time for processing the system metadata for the pid " + id + " is " + (end-start) + "milliseconds."); // get the format id for this object String formatId = docs.get(id).getFirstFieldValue(SolrElementField.FIELD_OBJECTFORMAT); boolean skipOtherProcessor = false; @@ -207,19 +207,19 @@ private Map process(String id, SystemMetadata systemMetadata, S //we need to make the solr doc exists (means the resource map was processed SolrDoc existingResourceMapSolrDoc = httpService.getSolrDocumentById(solrQueryUri, id); if (existingResourceMapSolrDoc != null ) { - log.info("SolrIndex.process - This is a systemmetadata-change-only event for the resource map " + id + - ". So we only use the system metadata subprocessor"); + log.info("SolrIndex.process - This is a systemmetadata-change-only event for the " + + "resource map " + id + ". So we only use the system metadata subprocessor"); skipOtherProcessor = true; } else { - log.info("SolrIndex.process - There is no solr doc for the resource map " + id + - ". 
Even though this is a systemmetadata-change-only event, we can NOT just reindex the systemmeta only."); + log.info("SolrIndex.process - There is no solr doc for the resource map " + id + + ". Even though this is a systemmetadata-change-only event, we can NOT " + + "just reindex the systemmeta only."); } - } - log.debug("SolrIndex.process - the value of skipOtherProcessors is " + skipOtherProcessor + - " and the object path is " + objectPath + " for the id " + id); + log.debug("SolrIndex.process - the value of skipOtherProcessors is " + skipOtherProcessor + + " for the id " + id); //if the objectPath is null, we should skip the other processes - if (!skipOtherProcessor && objectPath != null) { + if (!skipOtherProcessor) { log.debug("SolrIndex.process - Start to use subprocessor list to process " + id); // Determine if subprocessors are available for this ID if (subprocessors != null) { @@ -229,28 +229,21 @@ private Map process(String id, SystemMetadata systemMetadata, S if (subprocessor.canProcess(formatId)) { // if so, then extract the additional information from the // document. - try { + try (InputStream dataStream = ObjectManager.getInstance().getObject(id)) { // docObject = the resource map document or science // metadata document. // note that resource map processing touches all objects // referenced by the resource map. 
- //start = System.currentTimeMillis(); - FileInputStream dataStream = new FileInputStream(objectPath); - //end = System.currentTimeMillis(); - //log.info("SolrIndex.process - the time for reading the file input stream " + " for the pid " + id + " is " + (end-start) + "milliseconds."); - if (!dataStream.getFD().valid()) { - log.error("SolrIndex.process - subprocessor "+ subprocessor.getClass().getName() +" couldn't process since it could not load OBJECT file for ID,Path=" + id + ", " - + objectPath); - //throw new Exception("Could not load OBJECT for ID " + id ); - } else { - start = System.currentTimeMillis(); - docs = subprocessor.processDocument(id, docs, dataStream); - end = System.currentTimeMillis(); - log.info("SolrIndex.process - the time for calling processDocument for the subprocessor " + subprocessor.getClass().getName() +" for the pid " + id + " is " + (end-start) + "milliseconds."); - log.debug("SolrIndex.process - subprocessor "+ subprocessor.getClass().getName() +" generated solr doc for id "+id); - } + start = System.currentTimeMillis(); + docs = subprocessor.processDocument(id, docs, dataStream); + end = System.currentTimeMillis(); + log.info("SolrIndex.process - the time for calling processDocument " + + "for the subprocessor " + subprocessor.getClass().getName() + +" for the pid " + id + " is " + (end-start) + "milliseconds."); + log.debug("SolrIndex.process - subprocessor " + + subprocessor.getClass().getName() + +" generated solr doc for id "+id); } catch (Exception e) { - e.printStackTrace(); log.error(e.getMessage(), e); throw new SolrServerException(e.getMessage()); } @@ -258,14 +251,6 @@ private Map process(String id, SystemMetadata systemMetadata, S } } } - - /*if(docs != null) { - SolrDoc solrDoc = docs.get(id); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - solrDoc.serialize(baos, "UTF-8"); - log.warn("after process the science metadata, the solr doc is \n"+baos.toString()); - }*/ - // TODO: in the XPathDocumentParser 
class in d1_cn_index_process module, // merge is only for resource map. We need more work here. for (SolrDoc mergeDoc : docs.values()) { @@ -273,15 +258,6 @@ private Map process(String id, SystemMetadata systemMetadata, S mergeWithIndexedDocument(mergeDoc); } } - - /*if(docs != null) { - SolrDoc solrDoc = docs.get(id); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - solrDoc.serialize(baos, "UTF-8"); - log.warn("after merge, the solr doc is \n"+baos.toString()); - }*/ - //SolrElementAdd addCommand = getAddCommand(new ArrayList(docs.values())); - return docs; } @@ -310,16 +286,11 @@ private Map process(String id, SystemMetadata systemMetadata, S // TODO:combine merge function with resourcemap merge function private SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument) throws IOException, - EncoderException, XPathExpressionException, SolrServerException, ParserConfigurationException, SAXException, NotImplemented, NotFound, UnsupportedType { - //Retrieve the existing solr document from the solr server for the id. If it doesn't exist, null or empty solr doc will be returned. + EncoderException, XPathExpressionException, SolrServerException, + ParserConfigurationException, SAXException, NotImplemented, NotFound, UnsupportedType { + //Retrieve the existing solr document from the solr server for the id. If it doesn't exist, + //null or empty solr doc will be returned. 
SolrDoc indexedDocument = httpService.getSolrDocumentById(solrQueryUri, indexDocument.getIdentifier()); - /*int wait = new Double(Math.random() * 10000).intValue(); - System.out.println("++++++++++++++++++++++++++++ the wait time is " + wait); - try { - Thread.sleep(wait); - } catch (Exception e) { - - }*/ if (indexedDocument == null || indexedDocument.getFieldList().size() <= 0) { return indexDocument; } else { @@ -330,17 +301,25 @@ private SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument) throws IOExcepti .getName().equals(SolrElementField.FIELD_RESOURCEMAP)) && !indexDocument.hasFieldWithValue(field.getName(), field.getValue())) { indexDocument.addField(field); - } else if (!copyFields.contains(field.getName()) && !indexDocument.hasField(field.getName()) && !isSystemMetadataField(field.getName())) { + } else if (!copyFields.contains(field.getName()) + && !indexDocument.hasField(field.getName()) + && !isSystemMetadataField(field.getName())) { // we don't merge the system metadata field since they can be removed. // we don't merge the copyFields as well - log.debug("SolrIndex.mergeWithIndexedDocument - put the merge-needed existing solr field "+field.getName()+" with value "+field.getValue()+" from the solr server to a vector. We will merge it later."); - //indexDocument.addField(field); - mergeNeededFields.add(field);//record this name since we can have mutiple name/value for the same name. See https://projects.ecoinformatics.org/ecoinfo/issues/7168 + log.debug("SolrIndex.mergeWithIndexedDocument - put the merge-needed existing solr field " + + field.getName() + " with value " + field.getValue() + + " from the solr server to a vector. We will merge it later."); + //record this name since we can have mutiple name/value for the same name. 
+ //See https://projects.ecoinformatics.org/ecoinfo/issues/7168 + mergeNeededFields.add(field); } } if(mergeNeededFields != null) { for(SolrElementField field: mergeNeededFields) { - log.debug("SolrIndex.mergeWithIndexedDocument - merge the existing solr field "+field.getName()+" with value "+field.getValue()+" from the solr server to the currently processing document of "+indexDocument.getIdentifier()); + log.debug("SolrIndex.mergeWithIndexedDocument - merge the existing solr field " + + field.getName() + " with value " + field.getValue() + +" from the solr server to the currently processing document of " + + indexDocument.getIdentifier()); indexDocument.addField(field); } } @@ -348,7 +327,7 @@ private SolrDoc mergeWithIndexedDocument(SolrDoc indexDocument) throws IOExcepti return indexDocument; } } - + /* * If the given field name is a system metadata field. */ @@ -357,7 +336,9 @@ private boolean isSystemMetadataField(String fieldName) { if (fieldName != null && !fieldName.trim().equals("") && sysmetaSolrFields != null) { for(ISolrField field : sysmetaSolrFields) { if(field != null && field.getName() != null && field.getName().equals(fieldName)) { - log.debug("SolrIndex.isSystemMetadataField - the field name "+fieldName+" matches one record of system metadata field list. It is a system metadata field."); + log.debug("SolrIndex.isSystemMetadataField - the field name " + fieldName + + " matches one record of system metadata field list. It is a " + + "system metadata field."); is = true; break; } @@ -369,21 +350,13 @@ private boolean isSystemMetadataField(String fieldName) { /** * Check the parameters of the insert or update methods. 
- * @param pid - * @param systemMetadata - * @param data + * @param pid the pid which will be indexed * @throws SolrServerException */ - private void checkParams(Identifier pid, SystemMetadata systemMetadata, String objectPath) throws InvalidRequest { + private void checkParams(Identifier pid) throws InvalidRequest { if(pid == null || pid.getValue() == null || pid.getValue().trim().equals("")) { throw new InvalidRequest("0000", "The identifier of the indexed document should not be null or blank."); } - if(systemMetadata == null) { - throw new InvalidRequest("0000", "The system metadata of the indexed document "+pid.getValue()+ " should not be null."); - } - /*if(objectPath == null) { - throw new SolrServerException("The indexed document itself for pid "+pid.getValue()+" should not be null."); - }*/ } /** @@ -391,23 +364,25 @@ private void checkParams(Identifier pid, SystemMetadata systemMetadata, String o * @param pid the id of this document * @param systemMetadata the system metadata associated with the data object * @param data the path to the object file itself - * @throws SolrServerException - * @throws MarshallingException - * @throws EncoderException - * @throws UnsupportedType - * @throws NotFound - * @throws NotImplemented + * @throws SolrServerException + * @throws MarshallingException + * @throws EncoderException + * @throws UnsupportedType + * @throws NotFound + * @throws NotImplemented * @throws InvalidRequest */ - private void insert(Identifier pid, SystemMetadata systemMetadata, String objectPath, boolean isSysmetaChangeOnly) + private void insert(Identifier pid, boolean isSysmetaChangeOnly) throws IOException, SAXException, ParserConfigurationException, InvalidRequest, - XPathExpressionException, SolrServerException, MarshallingException, EncoderException, NotImplemented, NotFound, UnsupportedType { - checkParams(pid, systemMetadata, objectPath); + XPathExpressionException, SolrServerException, MarshallingException, + EncoderException, 
NotImplemented, NotFound, UnsupportedType { + checkParams(pid); log.debug("SolrIndex.insert - trying to insert the solrDoc for object "+pid.getValue()); long start = System.currentTimeMillis(); - Map docs = process(pid.getValue(), systemMetadata, objectPath, isSysmetaChangeOnly); + Map docs = process(pid.getValue(), isSysmetaChangeOnly); long end = System.currentTimeMillis(); - log.info("SolrIndex.insert - the subprocessor processing time of " + pid.getValue() + " is " + (end-start) + " milliseconds."); + log.info("SolrIndex.insert - the subprocessor processing time of " + pid.getValue() + " is " + + (end-start) + " milliseconds."); //transform the Map to the SolrInputDocument which can be used by the solr server if(docs != null) { start = System.currentTimeMillis(); @@ -416,18 +391,22 @@ private void insert(Identifier pid, SystemMetadata systemMetadata, String object if(id != null) { SolrDoc doc = docs.get(id); insertToIndex(doc); - log.debug("SolrIndex.insert - inserted the solr-doc object of pid "+id+", which relates to object "+pid.getValue()+", into the solr server."); + log.debug("SolrIndex.insert - inserted the solr-doc object of pid " + id + + ", which relates to object " + pid.getValue() + + ", into the solr server."); } } end = System.currentTimeMillis(); - log.info("SolrIndex.insert - finished to insert the solrDoc to the solr server for object " + pid.getValue() + - " and it took " + (end-start) + " milliseconds."); + log.info("SolrIndex.insert - finished to insert the solrDoc to the solr server for " + + " object " + pid.getValue() + " and it took " + (end-start) + + " milliseconds."); } else { - log.debug("SolrIndex.insert - the genered solrDoc is null. So we will not index the object "+pid.getValue()); + log.debug("SolrIndex.insert - the genered solrDoc is null. So we will not index the " + + "object "+pid.getValue()); } } - + /* * Insert a SolrDoc to the solr server. 
*/ @@ -438,44 +417,14 @@ private void insertToIndex(SolrDoc doc) throws SolrServerException, IOException httpService.sendUpdate(solrIndexUri, addCommand, "UTF-8"); } - - /*private void insertToIndex(SolrDoc doc) throws SolrServerException, IOException { - if(doc != null ) { - SolrInputDocument solrDoc = new SolrInputDocument(); - List list = doc.getFieldList(); - if(list != null) { - //solrDoc.addField(METACATPIDFIELD, pid); - Iterator iterator = list.iterator(); - while (iterator.hasNext()) { - SolrElementField field = iterator.next(); - if(field != null) { - String value = field.getValue(); - String name = field.getName(); - log.trace("SolrIndex.insertToIndex - add name/value pair - "+name+"/"+value); - solrDoc.addField(name, value); - } - } - } - if(!solrDoc.isEmpty()) { - try { - UpdateResponse response = solrServer.add(solrDoc); - solrServer.commit(); - } catch (SolrServerException e) { - throw e; - } catch (IOException e) { - throw e; - } - //System.out.println("=================the response is:\n"+response.toString()); - } - } - }*/ - /** * Update the solr index. This method handles the three scenarios: * 1. Remove an existing doc - if the the system metadata shows the value of the archive is true, * remove the index for the previous version(s) and generate new index for the doc. * 2. Add a new doc - if the system metadata shows the value of the archive is false, generate the * index for the doc. 
+ * @param pid the identifier of object which will be indexed + * @param isSysmetaChangeOnly the flag indicating if the change is system metadata only * @throws NotFound * @throws ServiceFailure * @throws NotImplemented @@ -494,34 +443,38 @@ private void insertToIndex(SolrDoc doc) throws SolrServerException, IOException * @throws IllegalAccessException * @throws InstantiationException */ - public void update(Identifier pid, String relativePath, boolean isSysmetaChangeOnly) throws InvalidToken, NotAuthorized, - NotImplemented, ServiceFailure, NotFound, XPathExpressionException, UnsupportedType, - SAXException, ParserConfigurationException, SolrServerException, MarshallingException, - EncoderException, InterruptedException, IOException, InvalidRequest, InstantiationException, IllegalAccessException { - log.debug("SolrIndex.update - trying to update(insert or remove) solr index of object "+pid.getValue()); - String objectPath = null; - SystemMetadata systemMetadata = ObjectManager.getInstance().getSystemMetadata(pid.getValue(), relativePath); - objectPath = ObjectManager.getInstance().getFilePath(relativePath, systemMetadata.getFormatId().getValue()); + public void update(Identifier pid, boolean isSysmetaChangeOnly) + throws InvalidToken, NotAuthorized, NotImplemented, ServiceFailure, NotFound, + XPathExpressionException, UnsupportedType, SAXException, + ParserConfigurationException, SolrServerException, MarshallingException, + EncoderException, InterruptedException, IOException, InvalidRequest, + InstantiationException, IllegalAccessException { + log.debug("SolrIndex.update - trying to update(insert or remove) solr index of object " + + pid.getValue()); try { - insert(pid, systemMetadata, objectPath, isSysmetaChangeOnly); + insert(pid, isSysmetaChangeOnly); } catch (SolrServerException e) { if (e.getMessage().contains(VERSION_CONFLICT) && VERSION_CONFLICT_MAX_ATTEMPTS > 0) { - log.info("SolrIndex.update - Indexer grabbed an older verion (version conflict) of the solr 
doc for object " + - pid.getValue() + ". It will try " + VERSION_CONFLICT_MAX_ATTEMPTS + " to fix the issues"); + log.info("SolrIndex.update - Indexer grabbed an older verion (version conflict) of " + + "the solr doc for object " + pid.getValue() + + ". It will try " + VERSION_CONFLICT_MAX_ATTEMPTS + " to fix the issues"); for (int i=0; i docsToUpdate = getUpdatedSolrDocsByRemovingResourceMap(pid); if (docsToUpdate != null && !docsToUpdate.isEmpty()) { - //SolrElementAdd addCommand = new SolrElementAdd(docsToUpdate); - //httpService.sendUpdate(solrIndexUri, addCommand); for(SolrDoc doc : docsToUpdate) { - //deleteDocFromIndex(doc.getIdentifier()); insertToIndex(doc); } } break; } catch (SolrServerException e) { if (e.getMessage().contains(VERSION_CONFLICT) && VERSION_CONFLICT_MAX_ATTEMPTS > 0) { - log.info("SolrIndex.removeDataPackage - Indexer grabbed an older verion (version conflict) of the solr doc for object" + - ". It will try " + (VERSION_CONFLICT_MAX_ATTEMPTS - i )+ " to fix the issues"); + log.info("SolrIndex.removeDataPackage - Indexer grabbed an older verion " + + "(version conflict) of the solr doc for object" + + ". 
It will try " + (VERSION_CONFLICT_MAX_ATTEMPTS - i ) + + " to fix the issues"); } else { throw e; } @@ -650,12 +607,13 @@ private void removeDataPackage(String pid) throws IOException, UnsupportedType, * Get the list of the solr doc which need to be updated because the removal of the resource map */ private List getUpdatedSolrDocsByRemovingResourceMap(String resourceMapId) - throws UnsupportedType, NotFound, SolrServerException, ParserConfigurationException, SAXException, MalformedURLException, IOException, XPathExpressionException, EncoderException { + throws UnsupportedType, NotFound, SolrServerException, ParserConfigurationException, + SAXException, MalformedURLException, IOException, XPathExpressionException, + EncoderException { List updatedSolrDocs = null; if (resourceMapId != null && !resourceMapId.trim().equals("")) { - /*List docsContainResourceMap = httpService.getDocumentsByResourceMap( - solrQueryUri, resourceMapId);*/ - List docsContainResourceMap = httpService.getDocumentsByResourceMap(solrQueryUri, resourceMapId); + List docsContainResourceMap = httpService + .getDocumentsByResourceMap(solrQueryUri, resourceMapId); updatedSolrDocs = removeResourceMapRelationship(docsContainResourceMap, resourceMapId); } @@ -740,15 +698,6 @@ private List removeAggregatedItems(String targetResourceMapId, SolrDoc doc.removeFieldsWithValue(SolrElementField.FIELD_RESOURCEMAP, targetResourceMapId); updatedSolrDocs.add(doc); - /*if (aggregatedItemsInDoc.size() > 1) { - - - } else { - //multiple resource map aggregate same metadata and data. Just remove the resource map - doc.removeFieldsWithValue(SolrElementField.FIELD_RESOURCEMAP, - targetResourceMapId); - updatedSolrDocs.add(doc); - }*/ } } return updatedSolrDocs; @@ -835,32 +784,43 @@ private List mergeUpdatedSolrDocs(ListremovedDocumentBy, List< SolrDoc docInRemovedDocs = removedDocuments.get(j); if(docInRemovedDocBy.getIdentifier().equals(docInRemovedDocs.getIdentifier())) { //find the same doc in both list. 
let's merge them. - //first get all the documents element from the docWithDocs(it has the correct information about the documents element) - List idsInDocuments = docInRemovedDocs.getAllFieldValues(SolrElementField.FIELD_DOCUMENTS); - docInRemovedDocBy.removeAllFields(SolrElementField.FIELD_DOCUMENTS);//clear out any documents element in docInRemovedDocBy + //first get all the documents element from the docWithDocs + //(it has the correct information about the documents element) + List idsInDocuments = docInRemovedDocs + .getAllFieldValues(SolrElementField.FIELD_DOCUMENTS); + //clear out any documents element in docInRemovedDocBy + docInRemovedDocBy.removeAllFields(SolrElementField.FIELD_DOCUMENTS); //add the Documents element from the docInRemovedDocs if it has any. - // The docInRemovedDocs has the correct information about the documentBy. Now it copied the correct information of the documents element. - // So docInRemovedDocs has both correct information about the documentBy and documents elements. + // The docInRemovedDocs has the correct information about the documentBy. + // Now it copied the correct information of the documents element. + // So docInRemovedDocs has both correct information about the documentBy + //and documents elements. if(idsInDocuments != null) { for(String id : idsInDocuments) { if(id != null && !id.trim().equals("")) { - docInRemovedDocBy.addField(new SolrElementField(SolrElementField.FIELD_DOCUMENTS, id)); + docInRemovedDocBy.addField( + new SolrElementField(SolrElementField.FIELD_DOCUMENTS, id)); } } } //intersect the resource map ids. 
- List resourceMapIdsInWithDocs = docInRemovedDocs.getAllFieldValues(SolrElementField.FIELD_RESOURCEMAP); - List resourceMapIdsInWithDocBy = docInRemovedDocBy.getAllFieldValues(SolrElementField.FIELD_RESOURCEMAP); + List resourceMapIdsInWithDocs = docInRemovedDocs + .getAllFieldValues(SolrElementField.FIELD_RESOURCEMAP); + List resourceMapIdsInWithDocBy = docInRemovedDocBy + .getAllFieldValues(SolrElementField.FIELD_RESOURCEMAP); docInRemovedDocBy.removeAllFields(SolrElementField.FIELD_RESOURCEMAP); - Collection resourceMapIds = CollectionUtils.union(resourceMapIdsInWithDocs, resourceMapIdsInWithDocBy); + Collection resourceMapIds = CollectionUtils.union(resourceMapIdsInWithDocs, + resourceMapIdsInWithDocBy); if(resourceMapIds != null) { for(Object idObj : resourceMapIds) { String id = (String)idObj; - docInRemovedDocBy.addField(new SolrElementField(SolrElementField.FIELD_RESOURCEMAP, id)); + docInRemovedDocBy.addField(new SolrElementField( + SolrElementField.FIELD_RESOURCEMAP, id)); } } - //we don't need do anything about the documentBy elements since the docInRemovedDocBy has the correct information. + //we don't need do anything about the documentBy elements since the + //docInRemovedDocBy has the correct information. mergedDocuments.add(docInRemovedDocBy); //delete the two documents from the list removedDocumentBy.remove(i); @@ -870,8 +830,8 @@ private List mergeUpdatedSolrDocs(ListremovedDocumentBy, List< } } - // when we get there, if the two lists are empty, this will be a perfect merge. However, if something are left. we - //just put them in. + // when we get there, if the two lists are empty, this will be a perfect merge. + // However, if something are left. we just put them in. for(SolrDoc doc: removedDocumentBy) { mergedDocuments.add(doc); } @@ -886,7 +846,8 @@ private List mergeUpdatedSolrDocs(ListremovedDocumentBy, List< /* * Remove a pid which is part of resource map. 
*/ - private void removeFromDataPackage(String pid) throws XPathExpressionException, IOException, EncoderException, SolrServerException { + private void removeFromDataPackage(String pid) throws XPathExpressionException, IOException, + EncoderException, SolrServerException { SolrDoc indexedDoc = httpService.getSolrDocumentById(solrQueryUri, pid); deleteDocFromIndex(pid); List documents = indexedDoc.getAllFieldValues(SolrElementField.FIELD_DOCUMENTS); @@ -903,8 +864,10 @@ private void removeFromDataPackage(String pid) throws XPathExpressionException, break; } catch (SolrServerException e) { if (e.getMessage().contains(VERSION_CONFLICT) && VERSION_CONFLICT_MAX_ATTEMPTS > 0) { - log.info("SolrIndex.removeFromDataPackage - Indexer grabbed an older verion (version conflict) of the solr doc for object " + - documentsValue + ". It will try " + (VERSION_CONFLICT_MAX_ATTEMPTS - i )+ " to fix the issues"); + log.info("SolrIndex.removeFromDataPackage - Indexer grabbed an older " + + "verion (version conflict) of the solr doc for object " + + documentsValue + ". It will try " + + (VERSION_CONFLICT_MAX_ATTEMPTS - i )+ " to fix the issues"); } else { throw e; } @@ -927,8 +890,10 @@ private void removeFromDataPackage(String pid) throws XPathExpressionException, break; } catch (SolrServerException e) { if (e.getMessage().contains(VERSION_CONFLICT) && VERSION_CONFLICT_MAX_ATTEMPTS > 0) { - log.info("SolrIndex.removeFromDataPackage - Indexer grabbed an older verion (version conflict) of the solr doc for object " + - documentedByValue + ". It will try " + (VERSION_CONFLICT_MAX_ATTEMPTS - i )+ " to fix the issues"); + log.info("SolrIndex.removeFromDataPackage - Indexer grabbed an older " + + "verion (version conflict) of the solr doc for object " + + documentedByValue + ". 
It will try " + + (VERSION_CONFLICT_MAX_ATTEMPTS - i )+ " to fix the issues"); } else { throw e; } @@ -938,68 +903,18 @@ private void removeFromDataPackage(String pid) throws XPathExpressionException, } } - /* - * Remove a pid from the solr index - */ - /*private synchronized void removeFromIndex(String identifier) throws Exception { - - - Map docs = new HashMap(); - for (IDocumentDeleteSubprocessor deleteSubprocessor : deleteSubprocessors) { - docs.putAll(deleteSubprocessor.processDocForDelete(identifier, docs)); - } - List docsToUpdate = new ArrayList(); - List idsToIndex = new ArrayList(); - for (String idToUpdate : docs.keySet()) { - if (docs.get(idToUpdate) != null) { - docsToUpdate.add(docs.get(idToUpdate)); - } else { - idsToIndex.add(idToUpdate); - } - } - - // update the docs we have - for (SolrDoc docToUpdate : docsToUpdate) { - insertToIndex(docToUpdate); - } - - // delete this one - deleteDocFromIndex(identifier); - - // index the rest - //TODO: we need to figure out how to get the file path - for (String idToIndex : idsToIndex) { - Identifier pid = new Identifier(); - pid.setValue(idToIndex); - SystemMetadata sysMeta = DistributedMapsFactory.getSystemMetadata(idToIndex); - if (SolrDoc.visibleInIndex(sysMeta)) { - String objectPath = DistributedMapsFactory.getObjectPathMap().get(pid); - boolean isSysmetaChangeOnlyEvent = false; - insert(pid, sysMeta, objectPath, isSysmetaChangeOnlyEvent); - } - } - - }*/ - private void deleteDocFromIndex(String pid) throws IOException { if (pid != null && !pid.trim().equals("")) { try { - //solrServer.deleteById(pid); - //solrServer.commit(); httpService.sendSolrDelete(pid, solrIndexUri); - //} catch (SolrServerException e) { - //throw e; - } catch (IOException e) { throw e; } } - } - /** * Set the http service * @param service @@ -1015,5 +930,5 @@ public void setHttpService(HTTPService service) { public HTTPService getHttpService() { return httpService; } - + } diff --git 
a/src/main/java/org/dataone/cn/indexer/object/ObjectManager.java b/src/main/java/org/dataone/cn/indexer/object/ObjectManager.java index f897903f..df67f0f7 100644 --- a/src/main/java/org/dataone/cn/indexer/object/ObjectManager.java +++ b/src/main/java/org/dataone/cn/indexer/object/ObjectManager.java @@ -1,42 +1,25 @@ -/** - * This work was created by participants in the DataONE project, and is - * jointly copyrighted by participating institutions in DataONE. For - * more information on DataONE, see our web site at http://dataone.org. - * - * Copyright 2022 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ package org.dataone.cn.indexer.object; +import java.io.ByteArrayInputStream; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; -import java.nio.file.FileSystems; -import java.nio.file.Files; +import java.io.InputStream; +import java.security.NoSuchAlgorithmException; import org.apache.commons.io.FileUtils; +import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.log4j.Logger; import org.dataone.client.auth.AuthTokenSession; import org.dataone.client.exception.ClientSideException; import org.dataone.client.rest.HttpMultipartRestClient; import org.dataone.client.rest.MultipartRestClient; -import org.dataone.client.v2.formats.ObjectFormatCache; import org.dataone.client.v2.impl.MultipartCNode; import org.dataone.client.v2.impl.MultipartD1Node; import org.dataone.client.v2.impl.MultipartMNode; import org.dataone.configuration.Settings; import org.dataone.exceptions.MarshallingException; +import org.dataone.indexer.storage.Storage; import org.dataone.service.exceptions.InvalidToken; import org.dataone.service.exceptions.NotAuthorized; import org.dataone.service.exceptions.NotFound; @@ -44,7 +27,6 @@ import org.dataone.service.exceptions.ServiceFailure; import org.dataone.service.types.v1.Identifier; import org.dataone.service.types.v1.Session; -import org.dataone.service.types.v2.ObjectFormat; import org.dataone.service.types.v2.SystemMetadata; import org.dataone.service.util.TypeMarshaller; @@ -56,164 +38,143 @@ */ public class ObjectManager { private static ObjectManager manager = null; - private static String dataRootDir = Settings.getConfiguration().getString("index.data.root.directory"); - private static String documentRootDir = Settings.getConfiguration().getString("index.document.root.directory"); private static String nodeBaseURL = Settings.getConfiguration().getString("dataone.mn.baseURL"); private static String DataONEauthToken = null; private static Logger logger = 
Logger.getLogger(ObjectManager.class); + private static Storage storage = null; private static final String TOKEN_VARIABLE_NAME = "DATAONE_AUTH_TOKEN"; private static final String TOKEN_FILE_PATH_PROP_NAME = "dataone.nodeToken.file"; - private static final String SYSTEMMETA_FILE_NAME = "systemmetadata.xml"; private static MultipartD1Node d1Node = null; private static Session session = null; - private static boolean ifDataAndDocRootSame = false; - + + static { + try { + refreshD1Node(); + } catch (ServiceFailure e) { + logger.warn("Metacat cannot initialize the d1Node since " + e.getMessage()); + } + storage = Storage.getInstance(); + manager = new ObjectManager(); + } + + /** * Private constructor - * @throws ServiceFailure */ - private ObjectManager() throws ServiceFailure { - if (dataRootDir == null || dataRootDir.trim().equals("")) { - throw new ServiceFailure("0000", "The data root directory specified by the property index.data.root.directory is blank in the properties file"); - } - if (documentRootDir == null || documentRootDir.trim().equals("")) { - throw new ServiceFailure("0000", "The metadata root directory specified by the property index.document.root.directory is blank in the properties file"); - } - if (!Files.exists(FileSystems.getDefault().getPath(dataRootDir))) { - throw new ServiceFailure("0000", "The data root directory " + dataRootDir + - " specified in the properties file doesn't exist"); - } - if (!Files.exists(FileSystems.getDefault().getPath(documentRootDir))) { - throw new ServiceFailure("0000", "The document root directory " + documentRootDir + - " specified in the properties file doesn't exist"); - } - if (!dataRootDir.endsWith("/")) { - dataRootDir = dataRootDir + "/"; - } - if (!documentRootDir.endsWith("/")) { - documentRootDir = documentRootDir + "/"; - } - - if (documentRootDir.equals(dataRootDir)) { - ifDataAndDocRootSame = true; - } - logger.info("ObjectManager.constructor - the root document directory is " + - documentRootDir + " 
and the root data directory is " + dataRootDir + - " Are they same?" + ifDataAndDocRootSame); - if (d1Node == null) { - refreshD1Node(); - } else { - logger.info("ObjectManager ---NOT going to create the d1node with the url " + nodeBaseURL + - " since the ObjectManager already was assigned a d1node with the url " + d1Node.getNodeBaseServiceUrl()); - } + private ObjectManager() { } - + /** * Get an ObjectManager instance through the singleton pattern. * @return the instance of ObjectManager - * @throws ServiceFailure */ - public static ObjectManager getInstance() throws ServiceFailure { - if (manager == null) { - synchronized (ObjectManager.class) { - if (manager == null) { - manager = new ObjectManager(); - } - } - } + public static ObjectManager getInstance() { return manager; } - + /** - * Get the absolute file path for a given relative path. If the relativePath is null or blank, - * null will be returned - * @param relativePath - * @param objectFormat - * @return the absolute file path + * Get the system metadata for the given id + * @param id the id to identify the system metadata + * @return the input stream of the system metadata associated with the id. It may be null. 
+ * @throws InvalidToken + * @throws NotAuthorized + * @throws NotImplemented + * @throws ServiceFailure * @throws NotFound + * @throws MarshallingException + * @throws IOException + * @throws NoSuchAlgorithmException */ - public String getFilePath(String relativePath, String objectFormat) throws NotFound { - String absolutePath = null; - if (relativePath != null && !relativePath.trim().equals("")) { - if (ifDataAndDocRootSame) { - absolutePath = documentRootDir + relativePath; - } else if (objectFormat != null && !objectFormat.trim().equals("")) { - ObjectFormat format =ObjectFormatCache.getInstance().getFormat(objectFormat); - if (format.getFormatType().equals("METADATA")) { - absolutePath = documentRootDir + relativePath; - } else { - absolutePath = dataRootDir + relativePath; + public InputStream getSystemMetadataStream(String id) throws InvalidToken, NotAuthorized, + NotImplemented, ServiceFailure, NotFound, + NoSuchAlgorithmException, IOException, MarshallingException { + long start = System.currentTimeMillis(); + //try to get the system metadata from the storage system first + InputStream sysmetaInputStream = null; + try { + sysmetaInputStream = storage.retrieveSystemMetadata(id); + long end = System.currentTimeMillis(); + logger.info("Finish getting the system metadata via the file system for the pid " + id + + " and it took " + (end - start) + "milliseconds"); + } catch (FileNotFoundException exception ) { + if (d1Node != null) { + // Metacat can't find the system metadata from the storage system. 
+ // So try to get it from the dataone api + SystemMetadata sysmeta = null; + Identifier identifier = new Identifier(); + identifier.setValue(id); + sysmeta = d1Node.getSystemMetadata(session, identifier); + logger.debug("Finish getting the system metadata via the DataONE API call for the pid " + + id); + if (sysmeta != null) { + ByteArrayOutputStream systemMetadataOutputStream = new ByteArrayOutputStream(); + TypeMarshaller.marshalTypeToOutputStream(sysmeta, systemMetadataOutputStream); + sysmetaInputStream = + new ByteArrayInputStream(systemMetadataOutputStream.toByteArray()); } + long end = System.currentTimeMillis(); + logger.info("Finish getting the system metadata via DataONE API for the pid " + id + + " and it took " + (end - start) + "milliseconds"); } } - logger.debug("ObjectManager.getFilePath - the absolute file path for the relative file path " + - relativePath + " is " + absolutePath); - return absolutePath; + return sysmetaInputStream; } - + /** - * Get the system metadata for the given id + * Get the system metadata object for the given identifier * @param id the id to identify the system metadata - * @param objectRelativePath the object path for this id. It can help to determine - * the system metadata file if the system metadata file exists. - * @return the system metadata associated with the id + * @return the system metadata object associated with the id. It may be null. 
* @throws InvalidToken * @throws NotAuthorized * @throws NotImplemented * @throws ServiceFailure * @throws NotFound - * @throws MarshallingException - * @throws IOException - * @throws IllegalAccessException - * @throws InstantiationException + * @throws InstantiationException + * @throws IllegalAccessException + * @throws IOException + * @throws MarshallingException + * @throws NoSuchAlgorithmException */ - public SystemMetadata getSystemMetadata(String id, String relativeObjPath) throws InvalidToken, NotAuthorized, NotImplemented, - ServiceFailure, NotFound, InstantiationException, IllegalAccessException, IOException, MarshallingException { - SystemMetadata sysmeta = null; - long start = System.currentTimeMillis(); - //try to get the system metadata from the file system first - File sysmetaFile = getSysmetaFile(relativeObjPath); - if (sysmetaFile != null) { - sysmeta = TypeMarshaller.unmarshalTypeFromFile(SystemMetadata.class, sysmetaFile); - long end = System.currentTimeMillis(); - logger.info("ObjectManager.getSystemMetadata - finish getting the system metadata via the file system for the pid " + id + - " and it took " + (end - start) + "milliseconds"); - } else { - //if we can't get it from the file system, get it from dataone API - Identifier identifier = new Identifier(); - identifier.setValue(id); - try { - for (int i=0; i<5; i++) { - try { - sysmeta = d1Node.getSystemMetadata(session, identifier); - break; - } catch (ServiceFailure ee) { - logger.warn("The DataONE api call doesn't get the system metadata since " - + ee.getMessage() + ". 
This is " + i - + " try and Indexer will try again."); - try { - Thread.sleep(300); - } catch (InterruptedException ie) { - logger.info("The sleep of the thread was interrupted."); + public org.dataone.service.types.v1.SystemMetadata getSystemMetadata(String id) + throws InvalidToken, NotAuthorized, NoSuchAlgorithmException, + NotImplemented, ServiceFailure, NotFound, + InstantiationException, IllegalAccessException, + IOException, MarshallingException { + org.dataone.service.types.v1.SystemMetadata sysmeta = null; + try (InputStream input = getSystemMetadataStream(id)) { + if (input != null) { + try { + SystemMetadata sysmeta2 = TypeMarshaller + .unmarshalTypeFromStream(SystemMetadata.class, input); + sysmeta = sysmeta2; + } catch (Exception e) { + try (InputStream input2 = getSystemMetadataStream(id)) { + if (input2 != null) { + sysmeta = TypeMarshaller.unmarshalTypeFromStream( + org.dataone.service.types.v1.SystemMetadata.class, input2); } - continue; } } - logger.debug("ObjectManager.getSystemMetadata - finish getting the system metadata via the DataONE API call for the pid " + id); - } catch (NotAuthorized e) { - logger.info("ObjectManager.getSystemMetadata - failed to get the system metadata via the DataONE API call for the pid " + id + - " since it is not authorized. 
We will refresh the token and try again"); - refreshD1Node(); - sysmeta = d1Node.getSystemMetadata(session, identifier); } - long end = System.currentTimeMillis(); - logger.info("ObjectManager.getSystemMetadata - finish getting the system metadata via DataONE API for the pid " + id + - " and it took " + (end - start) + "milliseconds"); } return sysmeta; } - + + /** + * Get the input stream of the content of the given pid + * @param pid the identifier of the content + * @return the input stream of the content + * @throws IllegalArgumentException + * @throws FileNotFoundException + * @throws NoSuchAlgorithmException + * @throws IOException + */ + public InputStream getObject(String pid) throws IllegalArgumentException, FileNotFoundException, + NoSuchAlgorithmException, IOException { + return storage.retrieveObject(pid); + } + /** * Set the d1 node for this object manager. * We only use it for testing @@ -222,63 +183,12 @@ public SystemMetadata getSystemMetadata(String id, String relativeObjPath) throw public static void setD1Node(MultipartD1Node node) { d1Node = node; } - - /** - * Get the system metadata file path from the objectPath. - * We assume the object and system metadata file are in the same directory. - * The system metadata file has a fixed name - systemmetadata.xml - * @param relativeObjPath the relative path of the object - * @return the file of system metadata. If it is null, this means the system metadata file does not exist. 
- */ - protected static File getSysmetaFile(String relativeObjPath) { - File sysmetaFile = null; - String sysmetaPath = null; - String relativeSysmetaPath = null; - if (relativeObjPath != null) { - if (relativeObjPath.contains(File.separator)) { - logger.debug("ObjectManager.getSysmetaFile - the object file path " + relativeObjPath + " has at least one path separator " + File.pathSeparator); - relativeSysmetaPath = relativeObjPath.substring(0, relativeObjPath.lastIndexOf(File.separator) + 1) + SYSTEMMETA_FILE_NAME; - } else { - logger.debug("ObjectManager.getSysmetaFile - the object file path " + relativeObjPath + " doesnot have any path separator " + File.pathSeparator); - //There is not path information in the object path ( it only has the file name). So we just simply return systemmetadata.xml - relativeSysmetaPath = SYSTEMMETA_FILE_NAME; - } - logger.debug("ObjectManager.getSysmetaFile - the relative system metadata file path for the object path " + - relativeObjPath + " is " + relativeSysmetaPath); - if (ifDataAndDocRootSame) { - sysmetaPath = documentRootDir + relativeSysmetaPath; - sysmetaFile = new File(sysmetaPath); - if (!sysmetaFile.exists()) { - //the system metadata file doesn't exist and we set it to null - sysmetaPath = null; - sysmetaFile = null; - } - } else { - //try if this object is a document first since we have no idea if the object is metadata or data. - sysmetaPath = documentRootDir + relativeSysmetaPath; - sysmetaFile = new File(sysmetaPath); - if (!sysmetaFile.exists()) { - // try data - sysmetaPath = dataRootDir + relativeSysmetaPath; - sysmetaFile = new File(sysmetaPath); - if (!sysmetaFile.exists()) { - //the system metadata file doesn't exist and we set it to null - sysmetaPath = null; - sysmetaFile = null; - } - } - } - } - logger.debug("ObjectManager.getSysmetaFile - the final system metadata file path for the object path " + - relativeObjPath + " is " + sysmetaPath + ". 
Null means that not system metadata file exists."); - return sysmetaFile; - } - + /** * In case the token expired, the method will retrieve the token and create a new d1 node * @throws ServiceFailure */ - private void refreshD1Node() throws ServiceFailure { + private static void refreshD1Node() throws ServiceFailure { //get the token DataONEauthToken = System.getenv(TOKEN_VARIABLE_NAME); if (DataONEauthToken == null || DataONEauthToken.trim().equals("")) { @@ -321,7 +231,7 @@ private void refreshD1Node() throws ServiceFailure { * @param authToken the authentication token * @return the DataONE session */ - private Session createSession(String authToken) { + private static Session createSession(String authToken) { Session session = null; if (authToken == null || authToken.trim().equals("")) { logger.info("ObjectManager.createSession - Creating the public session"); @@ -340,10 +250,9 @@ private Session createSession(String authToken) { * @param serviceUrl the service URL for the node we are connecting to * @return a DataONE MultipartCNode object * @throws ClientSideException - * @throws IOException - * @throws MetadigException + * @throws IOException */ - private MultipartD1Node getMultipartD1Node(Session session, String serviceUrl) throws IOException, ClientSideException { + private static MultipartD1Node getMultipartD1Node(Session session, String serviceUrl) throws IOException, ClientSideException { MultipartRestClient mrc = null; MultipartD1Node d1Node = null; // First create a default HTTP client @@ -366,7 +275,7 @@ private MultipartD1Node getMultipartD1Node(Session session, String serviceUrl) t * @param nodeStr either a DataONE node serviceURL (e.g. https://knb.ecoinformatics.org/knb/d1/mn) * or a DataONE node identifier (e.g. urn:node:CN) */ - private Boolean isCN(String nodeStr) { + private static Boolean isCN(String nodeStr) { Boolean isCN = false; // match node urn, e.g. 
"https://cn.dataone.org/cn" if (nodeStr.matches("^\\s*urn:node:.*")) { diff --git a/src/main/java/org/dataone/cn/indexer/parser/BaseXPathDocumentSubprocessor.java b/src/main/java/org/dataone/cn/indexer/parser/BaseXPathDocumentSubprocessor.java index 1be93575..e15cc90f 100644 --- a/src/main/java/org/dataone/cn/indexer/parser/BaseXPathDocumentSubprocessor.java +++ b/src/main/java/org/dataone/cn/indexer/parser/BaseXPathDocumentSubprocessor.java @@ -166,7 +166,7 @@ public void setMatchDocuments(List matchDocuments) { this.matchDocuments = matchDocuments; } - public static void setXmlNamespaceConfig(XMLNamespaceConfig xmlNamespaceConfig) { + public void setXmlNamespaceConfig(XMLNamespaceConfig xmlNamespaceConfig) { xpath.setNamespaceContext(xmlNamespaceConfig); } diff --git a/src/main/java/org/dataone/cn/indexer/parser/utility/SeriesIdResolver.java b/src/main/java/org/dataone/cn/indexer/parser/utility/SeriesIdResolver.java index e921a055..ca181ed6 100644 --- a/src/main/java/org/dataone/cn/indexer/parser/utility/SeriesIdResolver.java +++ b/src/main/java/org/dataone/cn/indexer/parser/utility/SeriesIdResolver.java @@ -80,42 +80,6 @@ public static Identifier getPid(Identifier identifier) return pid; } - /** - * Check if the given identifier is a PID or a SID - * - * @param identifier - * @return true if the identifier is a SID, false if a PID - * @throws NotFound - * @throws ServiceFailure - * @throws NotImplemented - * @throws NotAuthorized - * @throws InvalidToken - * @throws MarshallingException - * @throws IOException - * @throws IllegalAccessException - * @throws InstantiationException - */ - public static boolean isSeriesId(Identifier identifier) - throws InvalidToken, NotAuthorized, NotImplemented, ServiceFailure, NotFound, - InstantiationException, IllegalAccessException, IOException, MarshallingException { - - // if we have system metadata available via HZ map, then it's a PID - String relativeObjPath = null;//we don't know the path - SystemMetadata systemMetadata 
= - ObjectManager.getInstance().getSystemMetadata(identifier.getValue(), relativeObjPath); - if (systemMetadata != null) { - return false; - } - //TODO: check that it's not just bogus value by looking up the pid? -// Identifier pid = getPid(identifier); -// if (pid.equals(identifier)) { -// return false; -// } - - // okay, it's a SID - return true; - - } } diff --git a/src/main/java/org/dataone/cn/indexer/resourcemap/ForesiteResourceMap.java b/src/main/java/org/dataone/cn/indexer/resourcemap/ForesiteResourceMap.java index 55aa8a4f..c37c8f1d 100644 --- a/src/main/java/org/dataone/cn/indexer/resourcemap/ForesiteResourceMap.java +++ b/src/main/java/org/dataone/cn/indexer/resourcemap/ForesiteResourceMap.java @@ -1,25 +1,3 @@ -/** - * This work was created by participants in the DataONE project, and is - * jointly copyrighted by participating institutions in DataONE. For - * more information on DataONE, see our web site at http://dataone.org. - * - * Copyright ${year} - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * $Id$ - */ - package org.dataone.cn.indexer.resourcemap; import java.io.ByteArrayOutputStream; @@ -29,6 +7,7 @@ import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; +import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -218,60 +197,65 @@ private void _init(InputStream is) throws OREException, URISyntaxException, public static boolean representsResourceMap(String formatId) { return RESOURCE_MAP_FORMAT.equals(formatId); } - + private boolean isHeadVersion(Identifier pid, Identifier sid) { boolean isHead = true; if(pid != null && sid != null) { - /*Identifier newId = new Identifier(); - newId.setValue("peggym.130.5"); - if(pid.getValue().equals("peggym.130.4") && HazelcastClientFactory.getSystemMetadataMap().get(newId) != null) { - isHead =false; - } else if (pid.getValue().equals("peggym.130.4") && HazelcastClientFactory.getSystemMetadataMap().get(newId) == null) { - isHead = true; - }*/ Identifier head = null; try { - head = SeriesIdResolver.getPid(sid);//if the passed sid actually is a pid, the method will return the pid. + //if the passed sid actually is a pid, the method will return the pid. 
+ head = SeriesIdResolver.getPid(sid); } catch (Exception e) { - System.out.println(""+e.getStackTrace()); isHead = true; } if(head != null ) { - //System.out.println("||||||||||||||||||| the head version is "+ head.getValue()+" for sid "+sid.getValue()); - logger.info("||||||||||||||||||| the head version is "+ head.getValue()+" for sid "+sid.getValue()); + + logger.info("||||||||||||||||||| the head version is " + head.getValue() + + " for sid " + sid.getValue()); if(head.equals(pid)) { - logger.info("||||||||||||||||||| the pid "+ pid.getValue()+" is the head version for sid "+sid.getValue()); + logger.info("||||||||||||||||||| the pid " + pid.getValue() + + " is the head version for sid " + sid.getValue()); isHead=true; } else { - logger.info("||||||||||||||||||| the pid "+ pid.getValue()+" is NOT the head version for sid "+sid.getValue()); + logger.info("||||||||||||||||||| the pid " + pid.getValue() + + " is NOT the head version for sid " + sid.getValue()); isHead=false; } } else { - //System.out.println("||||||||||||||||||| can't find the head version for sid "+sid.getValue()); - logger.info("||||||||||||||||||| can't find the head version for sid "+sid.getValue() + " and we think the given pid "+pid.getValue()+" is the head version."); + logger.info("||||||||||||||||||| can't find the head version for sid " + + sid.getValue() + " and we think the given pid " + pid.getValue() + + " is the head version."); } } return isHead; } - private SolrDoc _mergeMappedReference(ResourceEntry resourceEntry, SolrDoc mergeDocument) throws InvalidToken, NotAuthorized, NotImplemented, - ServiceFailure, NotFound, InstantiationException, IllegalAccessException, IOException, MarshallingException { - - Identifier identifier = new Identifier(); - identifier.setValue(mergeDocument.getIdentifier()); - //SystemMetadata sysMeta = HazelcastClientFactory.getSystemMetadataMap().get(identifier); - String relativeObjPath = null; //we don't know the path - SystemMetadata sysMeta = 
ObjectManager.getInstance().getSystemMetadata(identifier.getValue(), relativeObjPath); - if (sysMeta.getSeriesId() != null && sysMeta.getSeriesId().getValue() != null && !sysMeta.getSeriesId().getValue().trim().equals("")) { - // skip this one - if(!isHeadVersion(identifier, sysMeta.getSeriesId())) { - //System.out.println("The id "+identifier+" is not the head of the serial id "+sysMeta.getSeriesId().getValue()+" So, skip merge this one!!!!!!!!!!!!!!!!!!!!!!"+mergeDocument.getIdentifier()); - logger.info("The id "+identifier+" is not the head of the serial id "+sysMeta.getSeriesId().getValue()+" So, skip merge this one!!!!!!!!!!!!!!!!!!!!!!"+mergeDocument.getIdentifier()); - return mergeDocument; - } - - } - + private SolrDoc _mergeMappedReference(ResourceEntry resourceEntry, SolrDoc mergeDocument) + throws InvalidToken, NotAuthorized, NotImplemented, + NoSuchAlgorithmException, ServiceFailure, NotFound, InstantiationException, + IllegalAccessException, IOException, MarshallingException { + + Identifier identifier = new Identifier(); + identifier.setValue(mergeDocument.getIdentifier()); + try { + SystemMetadata sysMeta = (SystemMetadata) ObjectManager.getInstance() + .getSystemMetadata(identifier.getValue()); + if (sysMeta.getSeriesId() != null && sysMeta.getSeriesId().getValue() != null + && !sysMeta.getSeriesId().getValue().trim().equals("")) { + // skip this one + if(!isHeadVersion(identifier, sysMeta.getSeriesId())) { + logger.info("The id " + identifier + " is not the head of the serial id " + + sysMeta.getSeriesId().getValue() + + " So, skip merge this one!!!!!!!!!!!!!!!!!!!!!!" 
+ + mergeDocument.getIdentifier()); + return mergeDocument; + } + } + } catch (ClassCastException e) { + logger.warn("The systemmetadata is a v1 object and we need to do nothing"); + } + + if (mergeDocument.hasField(SolrElementField.FIELD_ID) == false) { mergeDocument.addField(new SolrElementField(SolrElementField.FIELD_ID, resourceEntry .getIdentifier())); @@ -362,19 +346,22 @@ public List mergeIndexedDocuments(List docs) { List mergedDocuments = new ArrayList(); for (ResourceEntry resourceEntry : this.resourceMap.values()) { for (SolrDoc doc : docs) { - //System.out.println(">>>>>>>>in mergeIndexedDocuments of ForesiteResourceMap, the doc id is "+doc.getIdentifier() +" in the thread "+Thread.currentThread().getId()); - //System.out.println(">>>>>>>>in mergeIndexedDocuments of ForesiteResourceMap, the doc series id is "+doc.getSeriesId()+" in the thread "+Thread.currentThread().getId()); - //System.out.println(">>>>>>>>in mergeIndexedDocuments of ForesiteResourceMap, the resource entry id is "+resourceEntry.getIdentifier()+" in the thread "+Thread.currentThread().getId()); - logger.debug(">>>>>>>>in mergeIndexedDocuments of ForesiteResourceMap, the doc id is "+doc.getIdentifier() +" in the thread "+Thread.currentThread().getId()); - logger.debug(">>>>>>>>in mergeIndexedDocuments of ForesiteResourceMap, the doc series id is "+doc.getSeriesId()+" in the thread "+Thread.currentThread().getId()); - logger.debug(">>>>>>>>in mergeIndexedDocuments of ForesiteResourceMap, the resource entry id is "+resourceEntry.getIdentifier()+" in the thread "+Thread.currentThread().getId()); + + logger.debug("in mergeIndexedDocuments of ForesiteResourceMap, the doc id is " + + doc.getIdentifier() + " in the thread "+Thread.currentThread().getId()); + logger.debug("in mergeIndexedDocuments of ForesiteResourceMap, the doc series id is " + + doc.getSeriesId() + " in the thread "+Thread.currentThread().getId()); + logger.debug("in mergeIndexedDocuments of ForesiteResourceMap, the resource 
entry id is " + + resourceEntry.getIdentifier() + " in the thread " + + Thread.currentThread().getId()); if (doc.getIdentifier().equals(resourceEntry.getIdentifier()) || resourceEntry.getIdentifier().equals(doc.getSeriesId())) { try { mergedDocuments.add(_mergeMappedReference(resourceEntry, doc)); } catch (Exception e) { - logger.error("ForestieResourceMap.mergeIndexedDocuments - cannot merge the document since " + e.getMessage()); + logger.error("ForestieResourceMap.mergeIndexedDocuments - cannot merge the document since " + + e.getMessage()); } } diff --git a/src/main/java/org/dataone/cn/indexer/resourcemap/IndexVisibilityDelegateImpl.java b/src/main/java/org/dataone/cn/indexer/resourcemap/IndexVisibilityDelegateImpl.java index 1f375ba6..044550b8 100644 --- a/src/main/java/org/dataone/cn/indexer/resourcemap/IndexVisibilityDelegateImpl.java +++ b/src/main/java/org/dataone/cn/indexer/resourcemap/IndexVisibilityDelegateImpl.java @@ -1,6 +1,7 @@ package org.dataone.cn.indexer.resourcemap; import java.io.IOException; +import java.security.NoSuchAlgorithmException; import org.apache.log4j.Logger; import org.dataone.cn.indexer.object.ObjectManager; @@ -12,7 +13,7 @@ import org.dataone.service.exceptions.NotImplemented; import org.dataone.service.exceptions.ServiceFailure; import org.dataone.service.types.v1.Identifier; -import org.dataone.service.types.v2.SystemMetadata; +import org.dataone.service.types.v1.SystemMetadata; public class IndexVisibilityDelegateImpl implements IndexVisibilityDelegate { @@ -25,10 +26,8 @@ public IndexVisibilityDelegateImpl() { public boolean isDocumentVisible(Identifier pid) { boolean visible = false; try { - - //SystemMetadata systemMetadata = HazelcastClientFactory.getSystemMetadataMap().get(pid); - String relativeObjPath = null; //we don't know the path - SystemMetadata systemMetadata = ObjectManager.getInstance().getSystemMetadata(pid.getValue(), relativeObjPath); + SystemMetadata systemMetadata = ObjectManager.getInstance() + 
.getSystemMetadata(pid.getValue()); // TODO: Is pid Identifier a SID? if (systemMetadata == null) { return true; @@ -56,6 +55,8 @@ public boolean isDocumentVisible(Identifier pid) { logger.warn("Could not get visible value for pid: " + pid.getValue() + " since " +e.getMessage()); } catch (MarshallingException e) { logger.warn("Could not get visible value for pid: " + pid.getValue() + " since " +e.getMessage()); + } catch (NoSuchAlgorithmException e) { + logger.warn("Could not get visible value for pid: " + pid.getValue() + " since " +e.getMessage()); } return visible; } @@ -63,9 +64,7 @@ public boolean isDocumentVisible(Identifier pid) { public boolean documentExists(Identifier pid) { boolean exists = false; try { - //SystemMetadata systemMetadata = HazelcastClientFactory.getSystemMetadataMap().get(pid); - String relativeObjPath = null; //we don't know the path - SystemMetadata systemMetadata = ObjectManager.getInstance().getSystemMetadata(pid.getValue(), relativeObjPath); + SystemMetadata systemMetadata = ObjectManager.getInstance().getSystemMetadata(pid.getValue()); if (systemMetadata != null) { exists = true; } else { @@ -92,6 +91,8 @@ public boolean documentExists(Identifier pid) { logger.warn("Could not get visible value for pid: " + pid.getValue() + " since " +e.getMessage()); } catch (MarshallingException e) { logger.warn("Could not get visible value for pid: " + pid.getValue() + " since " +e.getMessage()); + } catch (NoSuchAlgorithmException e) { + logger.warn("Could not get visible value for pid: " + pid.getValue() + " since " +e.getMessage()); } return exists; } diff --git a/src/main/java/org/dataone/cn/indexer/solrhttp/SolrDoc.java b/src/main/java/org/dataone/cn/indexer/solrhttp/SolrDoc.java index 01271732..20d3ba06 100644 --- a/src/main/java/org/dataone/cn/indexer/solrhttp/SolrDoc.java +++ b/src/main/java/org/dataone/cn/indexer/solrhttp/SolrDoc.java @@ -28,7 +28,7 @@ import java.util.List; import org.apache.commons.io.IOUtils; -import 
org.dataone.service.types.v2.SystemMetadata; +import org.dataone.service.types.v1.SystemMetadata; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; diff --git a/src/main/java/org/dataone/indexer/queue/IndexQueueMessageParser.java b/src/main/java/org/dataone/indexer/queue/IndexQueueMessageParser.java index 81a725da..a3a18831 100644 --- a/src/main/java/org/dataone/indexer/queue/IndexQueueMessageParser.java +++ b/src/main/java/org/dataone/indexer/queue/IndexQueueMessageParser.java @@ -1,23 +1,3 @@ -/** - * This work was created by participants in the DataONE project, and is - * jointly copyrighted by participating institutions in DataONE. For - * more information on DataONE, see our web site at http://dataone.org. - * - * Copyright 2022 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ package org.dataone.indexer.queue; import java.util.Map; @@ -36,12 +16,12 @@ * */ public class IndexQueueMessageParser { - private final static String HEADER_ID = "id"; //The header name in the message to store the identifier - private final static String HEADER_PATH = "path"; //The header name in the message to store the path of the object - private final static String HEADER_INDEX_TYPE = "index_type"; //The header name in the message to store the index type - + //The header name in the message to store the identifier + private final static String HEADER_ID = "id"; + //The header name in the message to store the index type + private final static String HEADER_INDEX_TYPE = "index_type"; + private Identifier identifier = null; - private String objectPath = null; private String indexType = null; private int priority = 1; @@ -55,11 +35,13 @@ public class IndexQueueMessageParser { */ public void parse(AMQP.BasicProperties properties, byte[] body) throws InvalidRequest { if(properties == null) { - throw new InvalidRequest("0000", "The properties, which contains the index task info, cannot be null in the index queue message."); + throw new InvalidRequest("0000", "The properties, which contains the index task info, " + + "cannot be null in the index queue message."); } Map headers = properties.getHeaders(); if(headers == null) { - throw new InvalidRequest("0000", "The header of the properties, which contains the index task info, cannot be null in the index queue message."); + throw new InvalidRequest("0000", "The header of the properties, which contains the " + + "index task info, cannot be null in the index queue message."); } Object pidObj = headers.get(HEADER_ID); if (pidObj == null) { @@ -72,7 +54,7 @@ public void parse(AMQP.BasicProperties properties, byte[] body) throws InvalidRe logger.debug("IndexQueueMessageParser.parse - the identifier in the message is " + pid); identifier = new Identifier(); identifier.setValue(pid); - + Object typeObj = 
headers.get(HEADER_INDEX_TYPE); if (typeObj == null) { throw new InvalidRequest("0000", "The index type cannot be null in the index queue message."); @@ -82,12 +64,7 @@ public void parse(AMQP.BasicProperties properties, byte[] body) throws InvalidRe throw new InvalidRequest("0000", "The index type cannot be null or blank in the index queue message."); } logger.debug("IndexQueueMessageParser.parse - the index type in the message is " + indexType); - - Object pathObject = headers.get(HEADER_PATH); - if (pathObject != null) { - objectPath = ((LongString)pathObject).toString(); - } - logger.debug("IndexQueueMessageParser.parse - the file path of the object which be indexed in the message is " + objectPath); + try { priority = properties.getPriority(); } catch (NullPointerException e) { @@ -105,16 +82,6 @@ public Identifier getIdentifier() { return identifier; } - /** - * Get the file path of the object, which will be indexed, - * after calling the parse method to parse the index queue message. - * @return the file path of the object. It can be null or blank, which - * means we don't have the object in the system. - */ - public String getObjectPath() { - return objectPath; - } - /** * Get the type of the index task after calling the parse method to parse the index queue message. * @return the type of the index task. It can be create, delete or sysmeta. 
diff --git a/src/main/java/org/dataone/indexer/storage/Storage.java b/src/main/java/org/dataone/indexer/storage/Storage.java new file mode 100644 index 00000000..a0ea2176 --- /dev/null +++ b/src/main/java/org/dataone/indexer/storage/Storage.java @@ -0,0 +1,136 @@ +package org.dataone.indexer.storage; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.security.NoSuchAlgorithmException; +import java.util.Properties; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.dataone.configuration.Settings; +import org.dataone.hashstore.HashStore; +import org.dataone.hashstore.HashStoreFactory; +import org.dataone.hashstore.exceptions.HashStoreFactoryException; + +/** + * The HashFileStore implementation of the Storage interface + */ +public class Storage { + + private static Log log = LogFactory.getLog(Storage.class); + private static Storage instance; + private static HashStore hashStore; + static { + try { + instance = new Storage(); + } catch (IOException e) { + log.error( + "Dataone-indexer cannot initialize the Storage class since " + e.getMessage(), e); + System.exit(1); + } + } + + /** + * Private constructor + * @throws IOException + * @throws HashStoreFactoryException + */ + private Storage() throws HashStoreFactoryException, IOException { + String className = Settings.getConfiguration().getString("storage.className"); + String rootPath = Settings.getConfiguration().getString("storage.hashstore.rootDirectory"); + if (rootPath == null) { + throw new HashStoreFactoryException("HashStorage.constructor - The HashStore root path " + + " is null or blank from the property of storage.hashstore.rootDirectory"); + } + String directoryDepth = Settings.getConfiguration() + .getString("storage.hashstore.directory.depth", "3"); + String directoryNameWidth = Settings.getConfiguration() + .getString("storage.hashstore.directory.width", "2"); + String fileNameAlgorithm = 
Settings.getConfiguration() + .getString("storage.hashstore.fileNameAlgorithm", "SHA-256"); + String defaultNamespace = Settings.getConfiguration() + .getString("storage.hashstore.defaultNamespace", + "https://ns.dataone.org/service/types/v2.0#SystemMetadata"); + Properties storeProperties = new Properties(); + storeProperties.setProperty("storePath", rootPath); + storeProperties.setProperty("storeDepth", directoryDepth); + storeProperties.setProperty("storeWidth", directoryNameWidth); + storeProperties.setProperty("storeAlgorithm", fileNameAlgorithm); + storeProperties.setProperty("storeMetadataNamespace", defaultNamespace); + hashStore = HashStoreFactory.getHashStore(className, storeProperties); + } + + /** + * Get the instance of the class through the singleton pattern + * @return the instance of the class + */ + public static Storage getInstance() { + return instance; + } + + /** + * Returns an InputStream to an object from HashStore using a given persistent identifier. + * + * @param pid Authority-based identifier + * @return Object InputStream + * @throws IllegalArgumentException When pid is null or empty + * @throws FileNotFoundException When requested pid has no associated object + * @throws IOException I/O error when creating InputStream to object + * @throws NoSuchAlgorithmException When algorithm used to calculate object address is not + * supported + */ + public InputStream retrieveObject(String pid) + throws IllegalArgumentException, FileNotFoundException, IOException, + NoSuchAlgorithmException { + return hashStore.retrieveObject(pid); + } + + /** + * Returns an InputStream to the system metadata content of a given pid + * + * @param pid Authority-based identifier + * @return Metadata InputStream + * @throws IllegalArgumentException When pid/formatId is null or empty + * @throws FileNotFoundException When requested pid+formatId has no associated object + * @throws IOException I/O error when creating InputStream to metadata + * @throws 
NoSuchAlgorithmException When algorithm used to calculate metadata address is not + * supported + */ + public InputStream retrieveSystemMetadata(String pid) + throws IllegalArgumentException, FileNotFoundException, IOException, + NoSuchAlgorithmException { + return hashStore.retrieveMetadata(pid); + } + + /** + * Store the input stream object into hash store. This method is only for the test classes. + * @param object the input stream of the object + * @param pid the identifier of the object which will be stored + * @throws NoSuchAlgorithmException + * @throws IOException + * @throws RuntimeException + * @throws InterruptedException + */ + public void storeObject(InputStream object, String pid) throws NoSuchAlgorithmException, + IOException,RuntimeException, InterruptedException { + hashStore.storeObject(object, pid, null, null, null, -1); + } + + /** + * Store the system metadata into hash store. This method is only for the test classes. + * @param metadata the input stream of the system metadata + * @param pid the identifier of the system metadata + * @throws IOException + * @throws IllegalArgumentException + * @throws FileNotFoundException + * @throws InterruptedException + * @throws NoSuchAlgorithmException + */ + public void storeMetadata(InputStream metadata, String pid) throws IOException, + IllegalArgumentException, FileNotFoundException, + InterruptedException, NoSuchAlgorithmException { + hashStore.storeMetadata(metadata, pid); + } + +} diff --git a/src/main/resources/log4j2.properties b/src/main/resources/log4j2.properties index 75a89bc2..67b1596f 100644 --- a/src/main/resources/log4j2.properties +++ b/src/main/resources/log4j2.properties @@ -34,7 +34,7 @@ appender.consoleAppender.layout.pattern=dataone-indexer %d{yyyyMMdd-HH:mm:ss}: [ ################################## # the root logger configuration # ################################## -rootLogger.level=INFO +rootLogger.level={{ ternary "DEBUG" "ERROR" .Values.image.debug }} 
rootLogger.appenderRef.console.ref=consoleAppender ################################################################################ diff --git a/src/test/java/org/dataone/cn/index/DataONESolrJettyTestBase.java b/src/test/java/org/dataone/cn/index/DataONESolrJettyTestBase.java index 57ad01ae..e0addfb3 100644 --- a/src/test/java/org/dataone/cn/index/DataONESolrJettyTestBase.java +++ b/src/test/java/org/dataone/cn/index/DataONESolrJettyTestBase.java @@ -1,29 +1,10 @@ -/** - * This work was created by participants in the DataONE project, and is - * jointly copyrighted by participating institutions in DataONE. For - * more information on DataONE, see our web site at http://dataone.org. - * - * Copyright ${year} - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * $Id$ - */ - package org.dataone.cn.index; import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -38,7 +19,6 @@ import org.apache.commons.codec.EncoderException; import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.solr.SolrJettyTestBase; import org.apache.solr.SolrTestCaseJ4.SuppressSSL; import org.apache.solr.client.solrj.SolrServerException; @@ -55,14 +35,13 @@ import org.dataone.cn.indexer.parser.ISolrField; import org.dataone.cn.indexer.solrhttp.SolrElementField; import org.dataone.configuration.Settings; +import org.dataone.indexer.storage.Storage; import org.dataone.service.exceptions.NotFound; import org.dataone.service.exceptions.NotImplemented; import org.dataone.service.exceptions.ServiceFailure; import org.dataone.service.exceptions.UnsupportedType; import org.dataone.service.types.v1.Identifier; -import org.dataone.service.types.v2.SystemMetadata; import org.dataone.service.util.DateTimeMarshaller; -import org.dataone.service.util.TypeMarshaller; import org.joda.time.DateTime; import org.joda.time.DateTimeZone; import org.joda.time.format.DateTimeFormat; @@ -88,7 +67,8 @@ public abstract class DataONESolrJettyTestBase extends SolrJettyTestBase { private SolrIndex solrIndexService; private int solrPort = Settings.getConfiguration().getInt("test.solr.port", 8985); private static final String DEFAULT_SOL_RHOME = "solr8home"; - + private static final String SYSTEMMETA_FILE_NAME = "systemmetadata.xml"; + /** * Index the given object into solr * @param identifier the identifier of the object which needs to be indexed @@ -98,11 +78,52 @@ public abstract class DataONESolrJettyTestBase extends SolrJettyTestBase { protected void indexObjectToSolr(String identifier, Resource objectFile) throws Exception 
{ boolean isSysmetaChangeOnly = false; String relativePath = objectFile.getFile().getPath(); + try { + Storage.getInstance().retrieveObject(identifier); + } catch (FileNotFoundException e) { + // The pid is not in the hash store and we need to save the object into hashstore + try (InputStream object = objectFile.getInputStream()) { + Storage.getInstance().storeObject(object, identifier); + } + File sysmetaFile = getSysmetaFile(relativePath); + if (sysmetaFile != null) { + try (InputStream sysmeta = new FileInputStream(sysmetaFile)) { + Storage.getInstance().storeMetadata(sysmeta, identifier); + } + } + } Identifier pid = new Identifier(); pid.setValue(identifier); - solrIndexService.update(pid, relativePath, isSysmetaChangeOnly); + solrIndexService.update(pid, isSysmetaChangeOnly); } - + + /** + * The convention method to get the system metadata file path from the objectPath. + * We assume the object and system metadata file are in the same directory. + * The system metadata file has a fixed name - systemmetadata.xml + * @param relativeObjPath the relative path of the object + * @return the file of system metadata. If it is null, this means the system metadata file does not exist. + */ + private static File getSysmetaFile(String relativeObjPath) { + File sysmetaFile = null; + String sysmetaPath = null; + String relativeSysmetaPath = null; + if (relativeObjPath != null) { + if (relativeObjPath.contains(File.separator)) { + relativeSysmetaPath = relativeObjPath.substring(0, + relativeObjPath.lastIndexOf(File.separator) + 1) + SYSTEMMETA_FILE_NAME; + } else { + // There is not path information in the object path ( it only has the file name). 
+ // So we just simply return systemmetadata.xml + relativeSysmetaPath = SYSTEMMETA_FILE_NAME; + } + } + if (relativeSysmetaPath != null) { + sysmetaFile = new File(relativeSysmetaPath); + } + return sysmetaFile; + } + /** * Delete the given identifier from the solr server * @param identifier @@ -125,25 +146,6 @@ protected void deleteSolrDoc(String identifier) throws XPathExpressionException, solrIndexService.remove(pid); } - protected void addEmlToSolrIndex(Resource sysMetaFile) throws Exception { - SolrIndex indexService = solrIndexService; - SystemMetadata smd = TypeMarshaller.unmarshalTypeFromStream(SystemMetadata.class, - sysMetaFile.getInputStream()); - // path to actual science metadata document - String path = StringUtils.remove(sysMetaFile.getFile().getPath(), File.separator + "SystemMetadata"); - boolean isSysmetaChangeOnly = false; - indexService.update(smd.getIdentifier(), path, isSysmetaChangeOnly); - - } - - protected void addSysAndSciMetaToSolrIndex(Resource sysMeta, Resource sciMeta) throws Exception { - SolrIndex indexService = solrIndexService; - SystemMetadata smd = TypeMarshaller.unmarshalTypeFromStream(SystemMetadata.class, - sysMeta.getInputStream()); - String path = sciMeta.getFile().getAbsolutePath(); - boolean isSysmetaChangeOnly = false; - indexService.update(smd.getIdentifier(), path, isSysmetaChangeOnly); - } protected SolrDocument assertPresentInSolrIndex(String pid) throws SolrServerException, IOException { diff --git a/src/test/java/org/dataone/cn/index/SolrFieldXPathFgdcTest.java b/src/test/java/org/dataone/cn/index/SolrFieldXPathFgdcTest.java index 4e317dd1..3ab0d6fc 100644 --- a/src/test/java/org/dataone/cn/index/SolrFieldXPathFgdcTest.java +++ b/src/test/java/org/dataone/cn/index/SolrFieldXPathFgdcTest.java @@ -296,8 +296,8 @@ public static void setUp() throws Exception { fgdcNasaExpected.put("mediaTypeProperty", ""); fgdcNasaExpected.put("formatId", "FGDC-STD-001.1-1999"); fgdcNasaExpected.put("formatType", "METADATA"); - 
fgdcNasaExpected.put("size", "14880"); - fgdcNasaExpected.put("checksum", "c72ff66bbe7fa99e5fb399bab8cb6f85"); + fgdcNasaExpected.put("size", "14828"); + fgdcNasaExpected.put("checksum", "1755a557c13be7af44d676bb09274b0e"); fgdcNasaExpected.put("checksumAlgorithm", "MD5"); fgdcNasaExpected.put("submitter", "CN=Dave Vieglais T799,O=Google,C=US,DC=cilogon,DC=org"); fgdcNasaExpected.put("rightsHolder", diff --git a/src/test/java/org/dataone/cn/indexer/IndexWorkerTest.java b/src/test/java/org/dataone/cn/indexer/IndexWorkerTest.java index e6d10f85..67f92e1f 100644 --- a/src/test/java/org/dataone/cn/indexer/IndexWorkerTest.java +++ b/src/test/java/org/dataone/cn/indexer/IndexWorkerTest.java @@ -69,7 +69,7 @@ public void testInitExecutorService() throws Exception { } String propertyName = "index.thread.number"; String numberStr = "5"; - int number = (new Integer(numberStr)).intValue(); + int number = Integer.parseInt(numberStr); // only test setting multiple threads if enough processors are available if (finalThreads > number) { Settings.getConfiguration().setProperty(propertyName, numberStr); @@ -79,7 +79,7 @@ public void testInitExecutorService() throws Exception { assertTrue(worker.multipleThread); } numberStr = "1"; - number = (new Integer(numberStr)).intValue(); + number = Integer.parseInt(numberStr); Settings.getConfiguration().setProperty(propertyName, numberStr); worker.initExecutorService(); System.out.println("worker.nThreads(1): " + worker.nThreads); diff --git a/src/test/java/org/dataone/cn/indexer/object/ObjectManagerTest.java b/src/test/java/org/dataone/cn/indexer/object/ObjectManagerTest.java index b50c6e5c..120286b9 100644 --- a/src/test/java/org/dataone/cn/indexer/object/ObjectManagerTest.java +++ b/src/test/java/org/dataone/cn/indexer/object/ObjectManagerTest.java @@ -1,33 +1,25 @@ -/** - * This work was created by participants in the DataONE project, and is - * jointly copyrighted by participating institutions in DataONE. 
For - * more information on DataONE, see our web site at http://dataone.org. - * - * Copyright 2022 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ package org.dataone.cn.indexer.object; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; + +import java.io.InputStream; +import java.io.OutputStream; +import java.security.MessageDigest; -import java.nio.file.Paths; +import javax.xml.bind.DatatypeConverter; -import org.dataone.cn.index.DataONESolrJettyTestBase; -import org.dataone.service.exceptions.NotFound; + +import org.dataone.indexer.storage.Storage; +import org.dataone.service.types.v1.Identifier; import org.dataone.service.types.v2.SystemMetadata; +import org.dataone.service.util.TypeMarshaller; +import org.junit.Before; import org.junit.Test; /** @@ -36,73 +28,61 @@ * */ public class ObjectManagerTest { - - /** - * Test the getFilePath method - * @throws Exception - */ - @Test - public void testgetFilePath() throws Exception { - ObjectManager manager = ObjectManager.getInstance(); - String path = null; - String format = "eml://ecoinformatics.org/eml-2.0.1"; - String resultPath = manager.getFilePath(path, format); - assertTrue(resultPath == null); - format = "image/bmp"; - resultPath = 
manager.getFilePath(path, format); - assertTrue(resultPath == null); - - path = ""; - format = "eml://ecoinformatics.org/eml-2.0.1"; - resultPath = manager.getFilePath(path, format); - assertTrue(resultPath == null); - format = "image/bmp"; - resultPath = manager.getFilePath(path, format); - assertTrue(resultPath == null); - - path = "/var/metacat/documents/foo.1.1"; - format = "eml://ecoinformatics.org/eml-2.0.1"; - resultPath = manager.getFilePath(path, format); - assertTrue(resultPath.equals("//var/metacat/documents/foo.1.1")); - - path = "/var/metacat/documents/foo.2.1"; - format = "image/bmp";; - resultPath = manager.getFilePath(path, format); - assertTrue(resultPath.equals("//var/metacat/documents/foo.2.1")); + + private String identifier; + + @Before + public void setUp() throws Exception { + identifier = "ObjectManagerTest-" + System.currentTimeMillis(); + File objectFile = new File("src/test/resources/org/dataone/cn/index/resources/d1_testdocs/" + + "fgdc/nasa_d_FEDGPS1293.xml"); + try (InputStream object = new FileInputStream(objectFile)) { + Storage.getInstance().storeObject(object, identifier); + } + File sysmetaFile = new File("src/test/resources/org/dataone/cn/index/resources/" + + "d1_testdocs/fgdc/nasa_d_FEDGPS1293Sysmeta.xml"); + try (InputStream sysmetaStream = new FileInputStream(sysmetaFile)) { + SystemMetadata sysmeta = TypeMarshaller + .unmarshalTypeFromStream(SystemMetadata.class, sysmetaStream); + Identifier pid = new Identifier(); + pid.setValue(identifier); + sysmeta.setIdentifier(pid); + try (ByteArrayOutputStream output = new ByteArrayOutputStream()) { + TypeMarshaller.marshalTypeToOutputStream(sysmeta, output); + try (ByteArrayInputStream input = new ByteArrayInputStream(output.toByteArray())) { + Storage.getInstance().storeMetadata(input, identifier); + } + } + } } - + /** - * Test the getSystemMetadata method + * Test the getObject and getSystemMetadata method * @throws Exception */ - @Test - public void testGetSystemMetadata() throws 
Exception { - //Test to get system metadata from a file - String currentDir = Paths.get(".").toAbsolutePath().normalize().toString(); - System.out.println("current dir " + currentDir); - String path = currentDir + "/src/test/resources/org/dataone/cn/index/resources/d1_testdocs/json-ld/hakai-deep-schema/hakai-deep-schema.jsonld"; - String id = "hakai-deep-schema.jsonld"; - SystemMetadata sysmeta = ObjectManager.getInstance().getSystemMetadata(id, path); - assertTrue(sysmeta.getIdentifier().getValue().equals(id)); - - //Test to get system metadata from the Mock dataone cn server. - id = "ala-wai-canal-ns02-matlab-processing.eml.1.xml"; - path = null; - MockMNode mockMNode = new MockMNode("http://mnode.foo"); - mockMNode.setContext(DataONESolrJettyTestBase.getContext()); - ObjectManager.setD1Node(mockMNode); - sysmeta = ObjectManager.getInstance().getSystemMetadata(id, path); - assertTrue(sysmeta.getIdentifier().getValue().equals(id)); - - //Test the system metadata not found - id = "foo.1.1"; - path = "foo1"; - try { - sysmeta = ObjectManager.getInstance().getSystemMetadata(id, path); - fail("We should reach here"); - } catch (NotFound e) { - assert(true); + @Test + public void testGetObjectAndSystemMetadata() throws Exception { + try (InputStream input = ObjectManager.getInstance().getObject(identifier)) { + assertNotNull(input); + try (OutputStream os = new ByteArrayOutputStream()) { + MessageDigest md5 = MessageDigest.getInstance("MD5"); + // Calculate hex digests + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = input.read(buffer)) != -1) { + os.write(buffer, 0, bytesRead); + md5.update(buffer, 0, bytesRead); + } + String md5Digest = DatatypeConverter.printHexBinary(md5.digest()).toLowerCase(); + assertEquals("1755a557c13be7af44d676bb09274b0e", md5Digest); + } } + org.dataone.service.types.v1.SystemMetadata sysmeta = ObjectManager.getInstance() + .getSystemMetadata(identifier); + assertEquals(identifier, sysmeta.getIdentifier().getValue()); 
+ assertEquals("1755a557c13be7af44d676bb09274b0e", sysmeta.getChecksum().getValue()); + assertEquals(14828, sysmeta.getSize().intValue()); } + } diff --git a/src/test/java/org/dataone/indexer/queue/IndexQueueMessageParserTest.java b/src/test/java/org/dataone/indexer/queue/IndexQueueMessageParserTest.java index b6a5c1cc..201a55f7 100644 --- a/src/test/java/org/dataone/indexer/queue/IndexQueueMessageParserTest.java +++ b/src/test/java/org/dataone/indexer/queue/IndexQueueMessageParserTest.java @@ -1,5 +1,6 @@ package org.dataone.indexer.queue; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -19,10 +20,11 @@ * */ public class IndexQueueMessageParserTest { - private final static String HEADER_ID = "id"; //The header name in the message to store the identifier - private final static String HEADER_PATH = "path"; //The header name in the message to store the path of the object - private final static String HEADER_INDEX_TYPE = "index_type"; //The header name in the message to store the index type - + //The header name in the message to store the identifier + private final static String HEADER_ID = "id"; + //The header name in the message to store the index type + private final static String HEADER_INDEX_TYPE = "index_type"; + /** * Test the invalid messages * @throws Exception @@ -32,54 +34,50 @@ public void testInvalidRequest() throws Exception { LongString id = null; LongString index_type = LongStringHelper.asLongString("create"); int priority = 1; - LongString filePath = LongStringHelper.asLongString("foo"); - AMQP.BasicProperties properties = generateProperties(id, index_type, priority, filePath); + AMQP.BasicProperties properties = generateProperties(id, index_type, priority); byte[] body = null; IndexQueueMessageParser parser = new IndexQueueMessageParser(); try { parser.parse(properties, body); fail("Since the idenitifer is null, we shoulder get here"); } catch (InvalidRequest e) { - + } 
- + id = LongStringHelper.asLongString(" "); index_type = LongStringHelper.asLongString("create"); priority = 1; - filePath = LongStringHelper.asLongString("foo"); - properties = generateProperties(id, index_type, priority, filePath); + properties = generateProperties(id, index_type, priority); try { parser.parse(properties, body); fail("Since the idenitifer is null, we shouldn't get here"); } catch (InvalidRequest e) { - + } - + id = LongStringHelper.asLongString("foo"); index_type = null; priority = 1; - filePath = LongStringHelper.asLongString("foo"); - properties = generateProperties(id, index_type, priority, filePath); + properties = generateProperties(id, index_type, priority); try { parser.parse(properties, body); fail("Since the index type is null, we shouldn't get here"); } catch (InvalidRequest e) { - + } - + id = LongStringHelper.asLongString("foo"); index_type = LongStringHelper.asLongString(""); priority = 1; - filePath = LongStringHelper.asLongString("foo"); - properties = generateProperties(id, index_type, priority, filePath); + properties = generateProperties(id, index_type, priority); try { parser.parse(properties, body); fail("Since the index type is null, we shouldn't get here"); } catch (InvalidRequest e) { - + } } - + /** * Test valid messages * @throws Exception @@ -89,62 +87,51 @@ public void testParse() throws Exception { String id = "doi:10.5063/F1HX1B4Q"; String indexType = "create"; int priority = 1; - String filePath = "/var/metacat/12dfad"; LongString longId = LongStringHelper.asLongString(id); LongString longIndexType = LongStringHelper.asLongString(indexType); - LongString longFilePath = LongStringHelper.asLongString(filePath); - AMQP.BasicProperties properties = generateProperties(longId, longIndexType, priority, longFilePath); + AMQP.BasicProperties properties = generateProperties(longId, longIndexType, priority); byte[] body = null; IndexQueueMessageParser parser = new IndexQueueMessageParser(); parser.parse(properties, body); - 
assertTrue(parser.getIdentifier().getValue().equals(id)); - assertTrue(parser.getIndexType().equals(indexType)); - assertTrue(parser.getPriority() == priority); - assertTrue(parser.getObjectPath().equals(filePath)); - + assertEquals(id, parser.getIdentifier().getValue()); + assertEquals(indexType, parser.getIndexType()); + assertEquals(priority, parser.getPriority()); + id = "urn:uuid:45298965-f867-440c-841f-91d3abd729b7"; indexType = "delete"; priority = 2; - filePath = ""; longId = LongStringHelper.asLongString(id); longIndexType = LongStringHelper.asLongString(indexType); - longFilePath = LongStringHelper.asLongString(filePath); - properties = generateProperties(longId, longIndexType, priority, longFilePath); + properties = generateProperties(longId, longIndexType, priority); parser = new IndexQueueMessageParser(); parser.parse(properties, body); - assertTrue(parser.getIdentifier().getValue().equals(id)); - assertTrue(parser.getIndexType().equals(indexType)); - assertTrue(parser.getPriority() == priority); - assertTrue(parser.getObjectPath().equals(filePath)); + assertEquals(id, parser.getIdentifier().getValue()); + assertEquals(indexType, parser.getIndexType()); + assertEquals(priority, parser.getPriority()); id = "test-foo"; indexType = "sysmeta"; priority = 10; - filePath = "c:\\foo\\abc"; longId = LongStringHelper.asLongString(id); longIndexType = LongStringHelper.asLongString(indexType); - longFilePath = LongStringHelper.asLongString(filePath); - properties = generateProperties(longId, longIndexType, priority, longFilePath); + properties = generateProperties(longId, longIndexType, priority); parser = new IndexQueueMessageParser(); parser.parse(properties, body); - assertTrue(parser.getIdentifier().getValue().equals(id)); - assertTrue(parser.getIndexType().equals(indexType)); - assertTrue(parser.getPriority() == priority); - assertTrue(parser.getObjectPath().equals(filePath)); - + assertEquals(id, parser.getIdentifier().getValue()); + assertEquals(indexType, 
parser.getIndexType()); + assertEquals(priority, parser.getPriority()); + id = "test-foo2"; indexType = "sysmeta2"; priority = 10; longId = LongStringHelper.asLongString(id); longIndexType = LongStringHelper.asLongString(indexType); - longFilePath = null; - properties = generateProperties(longId, longIndexType, priority, longFilePath); + properties = generateProperties(longId, longIndexType, priority); parser = new IndexQueueMessageParser(); parser.parse(properties, body); - assertTrue(parser.getIdentifier().getValue().equals(id)); - assertTrue(parser.getIndexType().equals(indexType)); - assertTrue(parser.getPriority() == priority); - assertTrue(parser.getObjectPath() == null); + assertEquals(id, parser.getIdentifier().getValue()); + assertEquals(indexType, parser.getIndexType()); + assertEquals(priority, parser.getPriority()); } /** @@ -155,13 +142,10 @@ public void testParse() throws Exception { * @param filePath * @return */ - private AMQP.BasicProperties generateProperties(LongString id, LongString index_type, int priority, LongString filePath) { + private AMQP.BasicProperties generateProperties(LongString id, LongString index_type, int priority) { Map headers = new HashMap(); headers.put(HEADER_ID, id); headers.put(HEADER_INDEX_TYPE, index_type); - if (filePath != null) { - headers.put(HEADER_PATH, filePath); - } AMQP.BasicProperties basicProperties = new AMQP.BasicProperties.Builder() .contentType("text/plain") .deliveryMode(2) // set this message to persistent diff --git a/src/test/resources/org/dataone/cn/index/resources/d1_testdocs/fgdc/nasa_d_FEDGPS1293Sysmeta.xml b/src/test/resources/org/dataone/cn/index/resources/d1_testdocs/fgdc/nasa_d_FEDGPS1293Sysmeta.xml index 42998531..9b0dfbd6 100644 --- a/src/test/resources/org/dataone/cn/index/resources/d1_testdocs/fgdc/nasa_d_FEDGPS1293Sysmeta.xml +++ b/src/test/resources/org/dataone/cn/index/resources/d1_testdocs/fgdc/nasa_d_FEDGPS1293Sysmeta.xml @@ -9,8 +9,8 @@ 22 
www.nbii.gov_metadata_mdata_NASA_nasa_d_FEDGPS1293 FGDC-STD-001.1-1999 - 14880 - c72ff66bbe7fa99e5fb399bab8cb6f85 + 14828 + 1755a557c13be7af44d676bb09274b0e CN=Dave Vieglais T799,O=Google,C=US,DC=cilogon,DC=org CN=Dave Vieglais T799,O=Google,C=US,DC=cilogon,DC=org diff --git a/src/test/resources/org/dataone/configuration/index-processor.properties b/src/test/resources/org/dataone/configuration/index-processor.properties index 05cb1b1e..256dd8d8 100644 --- a/src/test/resources/org/dataone/configuration/index-processor.properties +++ b/src/test/resources/org/dataone/configuration/index-processor.properties @@ -42,3 +42,12 @@ index.resourcemap.namespace=http://www.w3.org/TR/rdf-syntax-grammar;http://www.o dataone.mn.registration.serviceType.url=https://cn-sandbox-ucsb-1.test.dataone.org/mnServiceTypes.xml cn.router.hostname=cn.dataone.org + +# Storage properties +storage.className=org.dataone.hashstore.filehashstore.FileHashStore +storage.hashstore.rootDirectory=./target/hashstore +storage.hashstore.defaultNamespace=https://ns.dataone.org/service/types/v2.0#SystemMetadata +# The following three properties must NOT be modified after the hash store is initialized +storage.hashstore.fileNameAlgorithm=SHA-256 +storage.hashstore.directory.width=2 +storage.hashstore.directory.depth=3