diff --git a/Classes/Command/BaseCommand.php b/Classes/Command/BaseCommand.php index a974be94f..71f822781 100644 --- a/Classes/Command/BaseCommand.php +++ b/Classes/Command/BaseCommand.php @@ -22,6 +22,7 @@ use Kitodo\Dlf\Domain\Model\Collection; use Kitodo\Dlf\Domain\Model\Document; use Kitodo\Dlf\Domain\Model\Library; +use Kitodo\Dlf\Validation\DocumentValidator; use Symfony\Component\Console\Command\Command; use TYPO3\CMS\Core\Configuration\ExtensionConfiguration; use TYPO3\CMS\Core\Utility\GeneralUtility; @@ -216,71 +217,70 @@ protected function saveToDatabase(Document $document): bool $doc->cPid = $this->storagePid; $metadata = $doc->getToplevelMetadata($this->storagePid); + $validator = new DocumentValidator($metadata, explode(',', $this->extConf['general']['requiredMetadataFields'])); + + if ($validator->hasAllMandatoryMetadataFields()) { + // set title data + $document->setTitle($metadata['title'][0] ? : ''); + $document->setTitleSorting($metadata['title_sorting'][0] ? : ''); + $document->setPlace(implode('; ', $metadata['place'])); + $document->setYear(implode('; ', $metadata['year'])); + $document->setAuthor($this->getAuthors($metadata['author'])); + $document->setThumbnail($doc->thumbnail ? : ''); + $document->setMetsLabel($metadata['mets_label'][0] ? : ''); + $document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : ''); + + $structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]); + $document->setStructure($structure); + + if (is_array($metadata['collection'])) { + $this->addCollections($document, $metadata['collection']); + } - // set title data - $document->setTitle($metadata['title'][0] ? : ''); - $document->setTitleSorting($metadata['title_sorting'][0] ? : ''); - $document->setPlace(implode('; ', $metadata['place'])); - $document->setYear(implode('; ', $metadata['year'])); - - // Remove appended "valueURI" from authors' names for storing in database. 
- foreach ($metadata['author'] as $i => $author) { - $splitName = explode(pack('C', 31), $author); - $metadata['author'][$i] = $splitName[0]; - } - $document->setAuthor($this->getAuthors($metadata['author'])); - $document->setThumbnail($doc->thumbnail ? : ''); - $document->setMetsLabel($metadata['mets_label'][0] ? : ''); - $document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : ''); + // set identifiers + $document->setProdId($metadata['prod_id'][0] ? : ''); + $document->setOpacId($metadata['opac_id'][0] ? : ''); + $document->setUnionId($metadata['union_id'][0] ? : ''); + + $document->setRecordId($metadata['record_id'][0]); + $document->setUrn($metadata['urn'][0] ? : ''); + $document->setPurl($metadata['purl'][0] ? : ''); + $document->setDocumentFormat($metadata['document_format'][0] ? : ''); + + // set access + $document->setLicense($metadata['license'][0] ? : ''); + $document->setTerms($metadata['terms'][0] ? : ''); + $document->setRestrictions($metadata['restrictions'][0] ? : ''); + $document->setOutOfPrint($metadata['out_of_print'][0] ? : ''); + $document->setRightsInfo($metadata['rights_info'][0] ? : ''); + $document->setStatus(0); + + $this->setOwner($metadata['owner'][0]); + $document->setOwner($this->owner); + + // set volume data + $document->setVolume($metadata['volume'][0] ? : ''); + $document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : ''); + + // Get UID of parent document. 
+ if ($document->getDocumentFormat() === 'METS') { + $document->setPartof($this->getParentDocumentUidForSaving($document)); + } - $structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]); - $document->setStructure($structure); + if ($document->getUid() === null) { + // new document + $this->documentRepository->add($document); + } else { + // update of existing document + $this->documentRepository->update($document); + } - if (is_array($metadata['collection'])) { - $this->addCollections($document, $metadata['collection']); - } + $this->persistenceManager->persistAll(); - // set identifiers - $document->setProdId($metadata['prod_id'][0] ? : ''); - $document->setOpacId($metadata['opac_id'][0] ? : ''); - $document->setUnionId($metadata['union_id'][0] ? : ''); - - $document->setRecordId($metadata['record_id'][0] ? : ''); // (?) $doc->recordId - $document->setUrn($metadata['urn'][0] ? : ''); - $document->setPurl($metadata['purl'][0] ? : ''); - $document->setDocumentFormat($metadata['document_format'][0] ? : ''); - - // set access - $document->setLicense($metadata['license'][0] ? : ''); - $document->setTerms($metadata['terms'][0] ? : ''); - $document->setRestrictions($metadata['restrictions'][0] ? : ''); - $document->setOutOfPrint($metadata['out_of_print'][0] ? : ''); - $document->setRightsInfo($metadata['rights_info'][0] ? : ''); - $document->setStatus(0); - - $this->setOwner($metadata['owner'][0]); - $document->setOwner($this->owner); - - // set volume data - $document->setVolume($metadata['volume'][0] ? : ''); - $document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : ''); - - // Get UID of parent document. 
- if ($document->getDocumentFormat() === 'METS') { - $document->setPartof($this->getParentDocumentUidForSaving($document)); + return true; } - if ($document->getUid() === null) { - // new document - $this->documentRepository->add($document); - } else { - // update of existing document - $this->documentRepository->update($document); - } - - $this->persistenceManager->persistAll(); - - return true; + return false; } /** @@ -371,6 +371,12 @@ private function addCollections(Document &$document, array $collections): void */ private function getAuthors(array $metadataAuthor): string { + // Remove appended "valueURI" from authors' names for storing in database. + foreach ($metadataAuthor as $i => $author) { + $splitName = explode(pack('C', 31), $author); + $metadataAuthor[$i] = $splitName[0]; + } + $authors = ''; $delimiter = '; '; $ellipsis = 'et al.'; diff --git a/Classes/Command/IndexCommand.php b/Classes/Command/IndexCommand.php index c8dbd4ce6..746c3b258 100644 --- a/Classes/Command/IndexCommand.php +++ b/Classes/Command/IndexCommand.php @@ -180,20 +180,29 @@ protected function execute(InputInterface $input, OutputInterface $output): int if ($dryRun) { $io->section('DRY RUN: Would index ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.'); + $io->success('All done!'); + return BaseCommand::SUCCESS; } else { + $document->setCurrentDocument($doc); + if ($io->isVerbose()) { - $io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.'); + $io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . 
namespace Kitodo\Dlf\Validation;

use TYPO3\CMS\Core\Log\Logger;
use TYPO3\CMS\Core\Log\LogManager;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
 * Class for document validation. Currently it checks that a metadata array
 * contains a configurable set of mandatory fields and that a METS document
 * contains the expected logical / physical structure.
 *
 * @package TYPO3
 * @subpackage dlf
 *
 * @access public
 */
class DocumentValidator
{
    /**
     * @access protected
     * @var Logger This holds the logger
     */
    protected Logger $logger;

    /**
     * @access private
     * @var array Metadata to validate, read as $metadata[<field name>][0]
     */
    private array $metadata;

    /**
     * @access private
     * @var array Names of the mandatory metadata fields (trimmed, non-empty, unique)
     */
    private array $requiredMetadataFields;

    /**
     * @access private
     * @var ?\SimpleXMLElement XML element the structure checks are evaluated against
     */
    private ?\SimpleXMLElement $xml;

    /**
     * Constructs DocumentValidator instance.
     *
     * @access public
     *
     * @param array $metadata
     * @param array $requiredMetadataFields names of the mandatory fields; entries
     *                                      are trimmed and blank entries dropped
     * @param ?\SimpleXMLElement $xml only needed for the structure checks
     *
     * @return void
     */
    public function __construct(array $metadata = [], array $requiredMetadataFields = [], ?\SimpleXMLElement $xml = null)
    {
        $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(static::class);
        $this->metadata = $metadata;
        $this->requiredMetadataFields = $this->normalizeFieldNames($requiredMetadataFields);
        $this->xml = $xml;
    }

    /**
     * Check if metadata array contains all mandatory fields before save.
     *
     * A field counts as missing when its first value is unset or empty in the
     * sense of PHP's empty() (so '' and '0' are both treated as missing).
     * Only the first missing field is logged.
     *
     * @access public
     *
     * @return bool
     */
    public function hasAllMandatoryMetadataFields(): bool
    {
        foreach ($this->requiredMetadataFields as $requiredMetadataField) {
            if (empty($this->metadata[$requiredMetadataField][0])) {
                $this->logger->error('Missing required metadata field "' . $requiredMetadataField . '".');
                return false;
            }
        }
        return true;
    }

    /**
     * Check if xml contains at least one logical structure with given type.
     *
     * @access public
     *
     * @param string $type e.g. documentary, newspaper or object
     *
     * @return bool
     */
    public function hasCorrectLogicalStructure(string $type): bool
    {
        return $this->hasStructMapDivOfType('LOGICAL', 'logical', $type);
    }

    /**
     * Check if xml contains at least one physical structure with type 'physSequence'.
     *
     * @access public
     *
     * @return bool
     */
    public function hasCorrectPhysicalStructure(): bool
    {
        return $this->hasStructMapDivOfType('PHYSICAL', 'physical', 'physSequence');
    }

    /**
     * Trim field names and drop blank or duplicate entries.
     *
     * The list is typically built with explode(',', $setting): an empty
     * setting yields [''] and "a, b" yields a name with a leading space.
     * Without this clean-up either case makes validation fail for every
     * document.
     *
     * @access private
     *
     * @param array $fieldNames
     *
     * @return array
     */
    private function normalizeFieldNames(array $fieldNames): array
    {
        $normalized = [];
        foreach ($fieldNames as $fieldName) {
            $fieldName = is_scalar($fieldName) ? trim((string) $fieldName) : '';
            if ($fieldName !== '') {
                $normalized[] = $fieldName;
            }
        }
        return array_values(array_unique($normalized));
    }

    /**
     * Check if the structMap of the given type has a direct div child of the given type.
     *
     * NOTE(review): the queries use the "mets:" prefix; presumably the caller
     * has registered that namespace on the element - confirm.
     *
     * @access private
     *
     * @param string $structMapType value of mets:structMap/@TYPE, e.g. LOGICAL
     * @param string $label word used for this structure in log messages
     * @param string $divType expected value of mets:div/@TYPE
     *
     * @return bool
     */
    private function hasStructMapDivOfType(string $structMapType, string $label, string $divType): bool
    {
        // The constructor allows a null XML element; fail validation instead of
        // raising a fatal "member function on null" error.
        if ($this->xml === null) {
            $this->logger->error('Cannot validate ' . $label . ' structure: no XML document was provided.');
            return false;
        }

        $divPath = './mets:structMap[@TYPE="' . $structMapType . '"]/mets:div';

        if ($this->xml->xpath($divPath . '[@TYPE="' . $divType . '"]')) {
            return true;
        }

        // Distinguish "wrong type" from "no structure at all" for the log.
        if ($this->xml->xpath($divPath)) {
            $this->logger->error('Document contains ' . $label . ' structure but @TYPE="' . $divType . '" is missing.');
            return false;
        }

        $this->logger->error('Document does not contain ' . $label . ' structure.');
        return false;
    }
}
(Standard ist "FALSE") - - - DLF User-Agent: (Standard ist "Kitodo.Presentation") - DLF User-Agent: (default is "Kitodo.Presentation") - - - Verwende nur absolute Links für Seiten und Ressourcen?: Wird nur in speziellen Multi-Domain-Umgebungen benötigt; erfordert einen voll qualifizierten Einstiegspunkt in der Seitenkonfiguration (Standard ist "FALSE") - Force all links to pages and resources to be absolute?: Only needed for some multi-domain environments; requires a fully qualified Entry Point in Site Configuration (default is "FALSE") - - - Verwende HTTPS for absolute Links?: erfordert einen Einstiegspunkt mit "https://..." in der Seitenkonfiguration (Standard ist "FALSE") - Use HTTPS for absolute links?: requires a valid Entry Point with "https://..." in Site Configuration (default is "FALSE") - - - Eingelesene METS Dateien / IIIF-Manifeste zwischenspeichern: Dies kann die Geschwindigkeit geringfügig verbessern, führt aber zu einer sehr großen "fe_session_data" Tabelle (Standard ist "FALSE") - Cache parsed METS files / IIIF manifests: Caching improves performance a little bit but can result in a very large "fe_session_data" table (default is "FALSE") - - - Neue Kollektionen publizieren?: Sollen neue Kollektionen automatisch in der OAI-PMH-Schnittstelle veröffentlicht werden? (Standard ist "TRUE") - Publish new collections?: Should new collections automatically be published in the OAI-PMH interface? (default is "TRUE") - - - Unhide ided documents?: Should hidden documents be unhidden when re-iding them? (default is "FALSE") - Indexierte Dokumente einblenden?: Sollen ausgeblendete Dokumente bei der erneuten Indexierung wieder eingeblendet werden? 
(Standard ist "FALSE") - - - Verwende externe APIs zum Abrufen von Metadaten?: (Standard ist "FALSE") - Use external APIs for getting metadata?: (default is "FALSE") - - - Seiten fileGrps: Komma-getrennte Liste der @USE Attributwerte der Seitenansichten nach aufsteigender Größe sortiert (Standard ist "DEFAULT,MAX") - Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX") - - - Vorschau fileGrp: Komma-getrennte Liste der @USE Attributwerte der Vorschaubilder nach absteigender Priorität sortiert (Standard ist "THUMBS") - Thumbnail fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "THUMBS") - - - Download fileGrp: Komma-getrennte Liste der @USE Attributwerte der Downloads nach absteigender Priorität sortiert (Standard ist "DOWNLOAD") - Download fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "DOWNLOAD") - - - Volltext fileGrp: Komma-getrennte Liste der @USE Attributwerte der Volltexte nach absteigender Priorität sortiert (Standard ist "FULLTEXT") - Fulltext fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "FULLTEXT") - - - Audio fileGrp: Komma-getrennte Liste der @USE Attributwerte der Audiodateien nach absteigender Priorität sortiert (Standard ist "AUDIO") - Audio fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "AUDIO") - - - IIIF-Annotationen mit Motivation "painting" als Volltext behandeln?: Als Volltext behandelte Annotationen werden im Suchid idiert (Standard ist "FALSE") - Handle IIIF annotations with motivation "painting" as fulltext?: Handling annotations as fulltexts means they are ided (default is "FALSE") - - - Maximale Thumbnail-Breite für IIIF-Images: Gilt nur für Bilder ohne Thumbnail-Angaben (Standard ist "150") - Maximum thumbnail width for IIIF images: Only for images without a thumbnail declaration 
(default is "150") - - - Maximale Thumbnail-Höhe für IIIF-Images: Gilt nur für Bilder ohne Thumbnail-Angaben (Standard ist "150") - Maximum thumbnail height for IIIF images: Only for images without a thumbnail declaration (default is "150") - - - Solr Connection - Solr Verbindung - - - HTTPS verwenden: (Standard ist "FALSE") - Use HTTPS: (default is "FALSE") - - - Solr Server Host: (Standard ist "localhost") - Solr Server Host: (default is "localhost") - - - Solr Server Port: (Standard ist "8983") - Solr Server Port: (default is "8983") - - - Solr Server Pfad: ohne API-Endpunkt "/solr" (Standard ist "/") - Solr Server Path: without API endpoint "/solr" (default is "/") - - - Solr Server Benutzername: (Standard ist "") - Solr Server User: (default is "") - - - Solr Server Kennwort: (Standard ist "") - Solr Server Password: (default is "") - - - Solr Server Timeout: (Standard ist "10") - Solr Server Timeout: (default is "10") - - - Löschen von Solr Kern zulassen?: Soll beim Löschen eines Solr Kerns im TYPO3 Backend auch der entsprechende Index in Apache Solr gelöscht werden? (Standard ist "FALSE") - Allow Solr Core Deletion?: If a Solr Core is deleted in the TYPO3 Backend, should it be deleted in Apache Solr as well? 
(default is "FALSE") - - - Solr-Schema-Feld "id" : Unique identifier for the document in the id (Standard ist "id") - Solr Schema Field "id" : Unique identifier for the document in the id (default is "id") - - - Solr-Schema-Feld "uid" : Unique identifier for the document (or its top-level parent) in the TYPO3 database (Standard ist "uid") - Solr Schema Field "uid" : Unique identifier for the document (or its top-level parent) in the TYPO3 database (default is "uid") - - - Solr-Schema-Feld "pid" : PageID for the document (or its top-level parent) in the TYPO3 database (Standard ist "pid") - Solr Schema Field "pid" : PageID for the document (or its top-level parent) in the TYPO3 database (default is "pid") - - - Solr-Schema-Feld "page" : Image number where this document starts (Standard ist "page") - Solr Schema Field "page" : Image number where this document starts (default is "page") - - - Solr-Schema-Feld "partof" : Unique identifier for the parent document in the TYPO3 database. Only if this is a multi-volume work! (Standard ist "partof") - Solr Schema Field "partof" : Unique identifier for the parent document in the TYPO3 database. Only if this is a multi-volume work! (default is "partof") - - - Solr-Schema-Feld "root" : Unique identifier for the root document in the TYPO3 database. Only if this is a multi-volume work! (Standard ist "root") - Solr Schema Field "root" : Unique identifier for the root document in the TYPO3 database. Only if this is a multi-volume work! (default is "root") - - - Solr-Schema-Feld "sid" : XML ID of this document in the METS file. This is only unique within the METS file! (Standard ist "sid") - Solr Schema Field "sid" : XML ID of this document in the METS file. This is only unique within the METS file! 
(default is "sid") - - - Solr-Schema-Feld "toplevel" : Information if it is a top-level document (Standard ist "toplevel") - Solr Schema Field "toplevel" : Information if it is a top-level document (default is "toplevel") - - - Solr-Schema-Feld "type" : Type of document (eg. monograph, chapter, etc.) (Standard ist "type") - Solr Schema Field "type" : Type of document (eg. monograph, chapter, etc.) (default is "type") - - - Solr-Schema-Feld "title" : Title field is mandatory for identifying documents (Standard ist "title") - Solr Schema Field "title" : Title field is mandatory for identifying documents (default is "title") - - - Solr-Schema-Feld "volume" : Volume field is mandatory for identifying documents (Standard ist "volume") - Solr Schema Field "volume" : Volume field is mandatory for identifying documents (default is "volume") - - - Solr Schema Field "date" : The date a resource was issued or created. Used for datesearch (Standard ist "date") - Solr Schema Field "date" : The date a resource was issued or created. 
Used for datesearch (default is "date") - - - Solr-Schema-Feld "thumbnail" : URL of thumbnail image for the document (Standard ist "thumbnail") - Solr Schema Field "thumbnail" : URL of thumbnail image for the document (default is "thumbnail") - - - Solr-Schema-Feld "default" : CatchAll field (Standard ist "default") - Solr Schema Field default" : CatchAll field (default is "default") - - - Solr-Schema-Feld "timestamp" : (Standard ist "timestamp") - Solr Schema Field "timestamp" : (default is "timestamp") - - - Solr-Schema-Feld "autocomplete" : Autocomplete field for search form (Standard ist "autocomplete") - Solr Schema Field "autocomplete" : Autocomplete field for search form (default is "autocomplete") - - - Solr-Schema-Feld "fulltext" : Fulltext field for OCR results (Standard ist "fulltext") - Solr Schema Field "fulltext" : Fulltext field for OCR results (default is "fulltext") - - - Solr-Schema-Feld "record_id" : Record ID of the document (required for OAI_DC output) (Standard ist "record_id") - Solr Schema Field "record_id" : Record ID of the document (required for OAI_DC output) (default is "record_id") - - - Solr-Schema-Feld "purl" : Permanent URL of the document (required for EPICUR output) (Standard ist "purl") - Solr Schema Field "purl" : Permanent URL of the document (required for EPICUR output) (default is "purl") - - - Solr-Schema-Feld "urn" : URN of the Document (required for EPICUR output) (Standard ist "urn") - Solr Schema Field "urn" : URN of the Document (required for EPICUR output) (default is "urn") - - - Solr-Schema-Feld "location" : Location of METS XML (required for METS output) (Standard ist "location") - Solr Schema Field "location" : Location of METS XML (required for METS output) (default is "location") - - - Solr-Schema-Feld "collection" : Associated collection(s) of the document (Standard ist "collection") - Solr Schema Field "collection" : Associated collection(s) of the document (default is "collection") - - - Solr-Schema-Feld 
"license" : License (should be URI) (Standard ist "license") - Solr Schema Field "license" : License (should be URI) (default is "license") - - - Solr-Schema-Feld "terms" : Term of Use (should be URI) (Standard ist "terms") - Solr Schema Field "terms" : Term of Use (should be URI) (default is "terms") - - - Solr-Schema-Feld "restrictions" : Access Restrictions (should be URI) (Standard ist "restrictions") - Solr Schema Field "restrictions" : Access Restrictions (should be URI) (default is "restrictions") - - - Solr-Schema-Feld "geom" : GeoJSON geometry for spatial search (Standard ist "geom") - Solr Schema Field "geom" : GeoJSON geometry for spatial search (default is "geom") - + + Enable internal page view proxy?: (default is "FALSE") + Internen Proxy für Werkansicht aktivieren? (Standard ist "FALSE") + + + DLF User-Agent: (Standard ist "Kitodo.Presentation") + DLF User-Agent: (default is "Kitodo.Presentation") + + + Verwende nur absolute Links für Seiten und Ressourcen?: Wird nur in speziellen Multi-Domain-Umgebungen benötigt; erfordert einen voll qualifizierten Einstiegspunkt in der Seitenkonfiguration (Standard ist "FALSE") + Force all links to pages and resources to be absolute?: Only needed for some multi-domain environments; requires a fully qualified Entry Point in Site Configuration (default is "FALSE") + + + Verwende HTTPS for absolute Links?: erfordert einen Einstiegspunkt mit "https://..." in der Seitenkonfiguration (Standard ist "FALSE") + Use HTTPS for absolute links?: requires a valid Entry Point with "https://..." 
in Site Configuration (default is "FALSE") + + + Eingelesene METS Dateien / IIIF-Manifeste zwischenspeichern: Dies kann die Geschwindigkeit geringfügig verbessern, führt aber zu einer sehr großen "fe_session_data" Tabelle (Standard ist "FALSE") + Cache parsed METS files / IIIF manifests: Caching improves performance a little bit but can result in a very large "fe_session_data" table (default is "FALSE") + + + Neue Kollektionen publizieren?: Sollen neue Kollektionen automatisch in der OAI-PMH-Schnittstelle veröffentlicht werden? (Standard ist "TRUE") + Publish new collections?: Should new collections automatically be published in the OAI-PMH interface? (default is "TRUE") + + + Unhide indexed documents?: Should hidden documents be unhidden when re-indexing them? (default is "FALSE") + Indexierte Dokumente einblenden?: Sollen ausgeblendete Dokumente bei der erneuten Indexierung wieder eingeblendet werden? (Standard ist "FALSE") + + + Verwende externe APIs zum Abrufen von Metadaten?: (Standard ist "FALSE") + Use external APIs for getting metadata?: (default is "FALSE") + + + Für die Indexierung von Dokumenten erforderliche Metadatenfelder + Metadata fields required for indexing documents + + + Seiten fileGrps: Komma-getrennte Liste der @USE Attributwerte der Seitenansichten nach aufsteigender Größe sortiert (Standard ist "DEFAULT,MAX") + Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX") + + + Vorschau fileGrp: Komma-getrennte Liste der @USE Attributwerte der Vorschaubilder nach absteigender Priorität sortiert (Standard ist "THUMBS") + Thumbnail fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "THUMBS") + + + Download fileGrp: Komma-getrennte Liste der @USE Attributwerte der Downloads nach absteigender Priorität sortiert (Standard ist "DOWNLOAD") + Download fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is 
"DOWNLOAD") + + + Volltext fileGrp: Komma-getrennte Liste der @USE Attributwerte der Volltexte nach absteigender Priorität sortiert (Standard ist "FULLTEXT") + Fulltext fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "FULLTEXT") + + + Audio fileGrp: Komma-getrennte Liste der @USE Attributwerte der Audiodateien nach absteigender Priorität sortiert (Standard ist "AUDIO") + Audio fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "AUDIO") + + + IIIF-Annotationen mit Motivation "painting" als Volltext behandeln?: Als Volltext behandelte Annotationen werden im Suchindex indexiert (Standard ist "FALSE") + Handle IIIF annotations with motivation "painting" as fulltext?: Handling annotations as fulltexts means they are indexed (default is "FALSE") + + + Maximale Thumbnail-Breite für IIIF-Images: Gilt nur für Bilder ohne Thumbnail-Angaben (Standard ist "150") + Maximum thumbnail width for IIIF images: Only for images without a thumbnail declaration (default is "150") + + + Maximale Thumbnail-Höhe für IIIF-Images: Gilt nur für Bilder ohne Thumbnail-Angaben (Standard ist "150") + Maximum thumbnail height for IIIF images: Only for images without a thumbnail declaration (default is "150") + + + Solr Connection + Solr Verbindung + + + HTTPS verwenden: (Standard ist "FALSE") + Use HTTPS: (default is "FALSE") + + + Solr Server Host: (Standard ist "localhost") + Solr Server Host: (default is "localhost") + + + Solr Server Port: (Standard ist "8983") + Solr Server Port: (default is "8983") + + + Solr Server Pfad: ohne API-Endpunkt "/solr" (Standard ist "/") + Solr Server Path: without API endpoint "/solr" (default is "/") + + + Solr Server Benutzername: (Standard ist "") + Solr Server User: (default is "") + + + Solr Server Kennwort: (Standard ist "") + Solr Server Password: (default is "") + + + Solr Server Timeout: (Standard ist "10") + Solr Server Timeout: (default is "10") + + + Löschen von 
Solr Kern zulassen?: Soll beim Löschen eines Solr Kerns im TYPO3 Backend auch der entsprechende Index in Apache Solr gelöscht werden? (Standard ist "FALSE") + Allow Solr Core Deletion?: If a Solr Core is deleted in the TYPO3 Backend, should it be deleted in Apache Solr as well? (default is "FALSE") + + + Solr-Schema-Feld "id" : Unique identifier for the document in the index (Standard ist "id") + Solr Schema Field "id" : Unique identifier for the document in the index (default is "id") + + + Solr-Schema-Feld "uid" : Unique identifier for the document (or its top-level parent) in the TYPO3 database (Standard ist "uid") + Solr Schema Field "uid" : Unique identifier for the document (or its top-level parent) in the TYPO3 database (default is "uid") + + + Solr-Schema-Feld "pid" : PageID for the document (or its top-level parent) in the TYPO3 database (Standard ist "pid") + Solr Schema Field "pid" : PageID for the document (or its top-level parent) in the TYPO3 database (default is "pid") + + + Solr-Schema-Feld "page" : Image number where this document starts (Standard ist "page") + Solr Schema Field "page" : Image number where this document starts (default is "page") + + + Solr-Schema-Feld "partof" : Unique identifier for the parent document in the TYPO3 database. Only if this is a multi-volume work! (Standard ist "partof") + Solr Schema Field "partof" : Unique identifier for the parent document in the TYPO3 database. Only if this is a multi-volume work! (default is "partof") + + + Solr-Schema-Feld "root" : Unique identifier for the root document in the TYPO3 database. Only if this is a multi-volume work! (Standard ist "root") + Solr Schema Field "root" : Unique identifier for the root document in the TYPO3 database. Only if this is a multi-volume work! (default is "root") + + + Solr-Schema-Feld "sid" : XML ID of this document in the METS file. This is only unique within the METS file! (Standard ist "sid") + Solr Schema Field "sid" : XML ID of this document in the METS file. 
This is only unique within the METS file! (default is "sid") + + + Solr-Schema-Feld "toplevel" : Information if it is a top-level document (Standard ist "toplevel") + Solr Schema Field "toplevel" : Information if it is a top-level document (default is "toplevel") + + + Solr-Schema-Feld "type" : Type of document (eg. monograph, chapter, etc.) (Standard ist "type") + Solr Schema Field "type" : Type of document (eg. monograph, chapter, etc.) (default is "type") + + + Solr-Schema-Feld "title" : Title field is mandatory for identifying documents (Standard ist "title") + Solr Schema Field "title" : Title field is mandatory for identifying documents (default is "title") + + + Solr-Schema-Feld "volume" : Volume field is mandatory for identifying documents (Standard ist "volume") + Solr Schema Field "volume" : Volume field is mandatory for identifying documents (default is "volume") + + + Solr Schema Field "date" : The date a resource was issued or created. Used for datesearch (Standard ist "date") + Solr Schema Field "date" : The date a resource was issued or created. 
Used for datesearch (default is "date") + + + Solr-Schema-Feld "thumbnail" : URL of thumbnail image for the document (Standard ist "thumbnail") + Solr Schema Field "thumbnail" : URL of thumbnail image for the document (default is "thumbnail") + + + Solr-Schema-Feld "default" : CatchAll field (Standard ist "default") + Solr Schema Field "default" : CatchAll field (default is "default") + + + Solr-Schema-Feld "timestamp" : (Standard ist "timestamp") + Solr Schema Field "timestamp" : (default is "timestamp") + + + Solr-Schema-Feld "autocomplete" : Autocomplete field for search form (Standard ist "autocomplete") + Solr Schema Field "autocomplete" : Autocomplete field for search form (default is "autocomplete") + + + Solr-Schema-Feld "fulltext" : Fulltext field for OCR results (Standard ist "fulltext") + Solr Schema Field "fulltext" : Fulltext field for OCR results (default is "fulltext") + + + Solr-Schema-Feld "record_id" : Record ID of the document (required for OAI_DC output) (Standard ist "record_id") + Solr Schema Field "record_id" : Record ID of the document (required for OAI_DC output) (default is "record_id") + + + Solr-Schema-Feld "purl" : Permanent URL of the document (required for EPICUR output) (Standard ist "purl") + Solr Schema Field "purl" : Permanent URL of the document (required for EPICUR output) (default is "purl") + + + Solr-Schema-Feld "urn" : URN of the Document (required for EPICUR output) (Standard ist "urn") + Solr Schema Field "urn" : URN of the Document (required for EPICUR output) (default is "urn") + + + Solr-Schema-Feld "location" : Location of METS XML (required for METS output) (Standard ist "location") + Solr Schema Field "location" : Location of METS XML (required for METS output) (default is "location") + + + Solr-Schema-Feld "collection" : Associated collection(s) of the document (Standard ist "collection") + Solr Schema Field "collection" : Associated collection(s) of the document (default is "collection") + + + Solr-Schema-Feld 
"license" : License (should be URI) (Standard ist "license") + Solr Schema Field "license" : License (should be URI) (default is "license") + + + Solr-Schema-Feld "terms" : Term of Use (should be URI) (Standard ist "terms") + Solr Schema Field "terms" : Term of Use (should be URI) (default is "terms") + + + Solr-Schema-Feld "restrictions" : Access Restrictions (should be URI) (Standard ist "restrictions") + Solr Schema Field "restrictions" : Access Restrictions (should be URI) (default is "restrictions") + + + Solr-Schema-Feld "geom" : GeoJSON geometry for spatial search (Standard ist "geom") + Solr Schema Field "geom" : GeoJSON geometry for spatial search (default is "geom") + diff --git a/Resources/Private/Language/locallang_labels.xlf b/Resources/Private/Language/locallang_labels.xlf index e3ceeb7f6..abacff0e9 100644 --- a/Resources/Private/Language/locallang_labels.xlf +++ b/Resources/Private/Language/locallang_labels.xlf @@ -476,159 +476,162 @@ Default metadata namespaces - - Enable internal page view proxy?: (default is "FALSE") - - - DLF User-Agent: (default is "Kitodo.Presentation") - - - Force all links to pages and resources to be absolute?: Only needed for some multi-domain environments; requires a fully qualified Entry Point in Site Configuration (default is "FALSE") - - - Use HTTPS for absolute links?: requires a valid Entry Point with "https://..." in Site Configuration (default is "FALSE") - - - Cache parsed METS files / IIIF manifests: Caching improves performance a little bit but can result in a very large "fe_session_data" table (default is "FALSE") - - - Publish new collections?: Should new collections automatically be published in the OAI-PMH interface? (default is "TRUE") - - - Unhide ided documents?: Should hidden documents be unhidden when re-iding them? 
(default is "FALSE") - - - Use external APIs for getting metadata?: (default is "FALSE") - - - Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX") - - - Thumbnail fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "THUMBS") - - - Download fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "DOWNLOAD") - - - Fulltext fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "FULLTEXT") - - - Audio fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "AUDIO") - - - Handle IIIF annotations with motivation "painting" as fulltext?: Handling annotations as fulltexts means they are ided (default is "FALSE") - - - Maximum thumbnail width for IIIF images: Only for images without a thumbnail declaration (default is "150") - - - Maximum thumbnail height for IIIF images: Only for images without a thumbnail declaration (default is "150") - - - Solr Connection - - - Use HTTPS: (default is "FALSE") - - - Solr Server Host: (default is "localhost") - - - Solr Server Port: (default is "8983") - - - Solr Server Path: without API endpoint "/solr" (default is "/") - - - Solr Server User: (default is "") - - - Solr Server Password: (default is "") - - - Solr Server Timeout: (default is "10") - - - Allow Solr Core Deletion?: If a Solr Core is deleted in the TYPO3 Backend, should it be deleted in Apache Solr as well? 
(default is "FALSE") - - - Solr Schema Field "id" : Unique identifier for the document in the id (default is "id") - - - Solr Schema Field "uid" : Unique identifier for the document (or its top-level parent) in the TYPO3 database (default is "uid") - - - Solr Schema Field "pid" : PageID for the document (or its top-level parent) in the TYPO3 database (default is "pid") - - - Solr Schema Field "page" : Image number where this document starts (default is "page") - - - Solr Schema Field "partof" : Unique identifier for the parent document in the TYPO3 database. Only if this is a multi-volume work! (default is "partof") - - - Solr Schema Field "root" : Unique identifier for the root document in the TYPO3 database. Only if this is a multi-volume work! (default is "root") - - - Solr Schema Field "sid" : XML ID of this document in the METS file. This is only unique within the METS file! (default is "sid") - - - Solr Schema Field "toplevel" : Information if it is a top-level document (default is "toplevel") - - - Solr Schema Field "type" : Type of document (eg. monograph, chapter, etc.) (default is "type") - - - Solr Schema Field "title" : Title field is mandatory for identifying documents (default is "title") - - - Solr Schema Field "volume" : Volume field is mandatory for identifying documents (default is "volume") - - - Solr Schema Field "date" : The date a resource was issued or created. 
Used for datesearch (default is "date") - - - Solr Schema Field "thumbnail" : URL of thumbnail image for the document (default is "thumbnail") - - - Solr Schema Field default" : CatchAll field (default is "default") - - - Solr Schema Field "timestamp" : (default is "timestamp") - - - Solr Schema Field "autocomplete" : Autocomplete field for search form (default is "autocomplete") - - - Solr Schema Field "fulltext" : Fulltext field for OCR results (default is "fulltext") - - - Solr Schema Field "record_id" : Record ID of the document (required for OAI_DC output) (default is "record_id") - - - Solr Schema Field "purl" : Permanent URL of the document (required for EPICUR output) (default is "purl") - - - Solr Schema Field "urn" : URN of the Document (required for EPICUR output) (default is "urn") - - - Solr Schema Field "location" : Location of METS XML (required for METS output) (default is "location") - - - Solr Schema Field "collection" : Associated collection(s) of the document (default is "collection") - - - Solr Schema Field "license" : License (should be URI) (default is "license") - - - Solr Schema Field "terms" : Term of Use (should be URI) (default is "terms") - - - Solr Schema Field "restrictions" : Access Restrictions (should be URI) (default is "restrictions") - - - Solr Schema Field "geom" : GeoJSON geometry for spatial search (default is "geom") - + + Enable internal page view proxy?: (default is "FALSE") + + + DLF User-Agent: (default is "Kitodo.Presentation") + + + Force all links to pages and resources to be absolute?: Only needed for some multi-domain environments; requires a fully qualified Entry Point in Site Configuration (default is "FALSE") + + + Use HTTPS for absolute links?: requires a valid Entry Point with "https://..." 
in Site Configuration (default is "FALSE") + + + Cache parsed METS files / IIIF manifests: Caching improves performance a little bit but can result in a very large "fe_session_data" table (default is "FALSE") + + + Publish new collections?: Should new collections automatically be published in the OAI-PMH interface? (default is "TRUE") + + + Unhide indexed documents?: Should hidden documents be unhidden when re-indexing them? (default is "FALSE") + + + Use external APIs for getting metadata?: (default is "FALSE") + + + Metadata fields required for indexing documents: comma-separated list of metadata field names (default is "document_format,record_id") + + + Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX") + + + Thumbnail fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "THUMBS") + + + Download fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "DOWNLOAD") + + + Fulltext fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "FULLTEXT") + + + Audio fileGrp: comma-separated list of @USE attribute values ordered by decreasing priority (default is "AUDIO") + + + Handle IIIF annotations with motivation "painting" as fulltext?: Handling annotations as fulltexts means they are indexed (default is "FALSE") + + + Maximum thumbnail width for IIIF images: Only for images without a thumbnail declaration (default is "150") + + + Maximum thumbnail height for IIIF images: Only for images without a thumbnail declaration (default is "150") + + + Solr Connection + + + Use HTTPS: (default is "FALSE") + + + Solr Server Host: (default is "localhost") + + + Solr Server Port: (default is "8983") + + + Solr Server Path: without API endpoint "/solr" (default is "/") + + + Solr Server User: (default is "") + + + Solr Server Password: (default is "") + + + Solr Server Timeout: (default is "10") + + + Allow Solr Core Deletion?: If a Solr Core is deleted in the TYPO3 Backend, should it
be deleted in Apache Solr as well? (default is "FALSE") + + + Solr Schema Field "id" : Unique identifier for the document in the index (default is "id") + + + Solr Schema Field "uid" : Unique identifier for the document (or its top-level parent) in the TYPO3 database (default is "uid") + + + Solr Schema Field "pid" : PageID for the document (or its top-level parent) in the TYPO3 database (default is "pid") + + + Solr Schema Field "page" : Image number where this document starts (default is "page") + + + Solr Schema Field "partof" : Unique identifier for the parent document in the TYPO3 database. Only if this is a multi-volume work! (default is "partof") + + + Solr Schema Field "root" : Unique identifier for the root document in the TYPO3 database. Only if this is a multi-volume work! (default is "root") + + + Solr Schema Field "sid" : XML ID of this document in the METS file. This is only unique within the METS file! (default is "sid") + + + Solr Schema Field "toplevel" : Information if it is a top-level document (default is "toplevel") + + + Solr Schema Field "type" : Type of document (e.g. monograph, chapter, etc.) (default is "type") + + + Solr Schema Field "title" : Title field is mandatory for identifying documents (default is "title") + + + Solr Schema Field "volume" : Volume field is mandatory for identifying documents (default is "volume") + + + Solr Schema Field "date" : The date a resource was issued or created.
Used for datesearch (default is "date") + + + Solr Schema Field "thumbnail" : URL of thumbnail image for the document (default is "thumbnail") + + + Solr Schema Field "default" : CatchAll field (default is "default") + + + Solr Schema Field "timestamp" : (default is "timestamp") + + + Solr Schema Field "autocomplete" : Autocomplete field for search form (default is "autocomplete") + + + Solr Schema Field "fulltext" : Fulltext field for OCR results (default is "fulltext") + + + Solr Schema Field "record_id" : Record ID of the document (required for OAI_DC output) (default is "record_id") + + + Solr Schema Field "purl" : Permanent URL of the document (required for EPICUR output) (default is "purl") + + + Solr Schema Field "urn" : URN of the Document (required for EPICUR output) (default is "urn") + + + Solr Schema Field "location" : Location of METS XML (required for METS output) (default is "location") + + + Solr Schema Field "collection" : Associated collection(s) of the document (default is "collection") + + + Solr Schema Field "license" : License (should be URI) (default is "license") + + + Solr Schema Field "terms" : Terms of Use (should be URI) (default is "terms") + + + Solr Schema Field "restrictions" : Access Restrictions (should be URI) (default is "restrictions") + + + Solr Schema Field "geom" : GeoJSON geometry for spatial search (default is "geom") + - + diff --git a/Tests/Unit/Validation/DocumentValidatorTest.php b/Tests/Unit/Validation/DocumentValidatorTest.php new file mode 100644 index 000000000..8e77d2925 --- /dev/null +++ b/Tests/Unit/Validation/DocumentValidatorTest.php @@ -0,0 +1,112 @@ + + * + * This file is part of the Kitodo and TYPO3 projects. + * + * @license GNU General Public License version 3 or later. + * For the full copyright and license information, please read the + * LICENSE.txt file that was distributed with this source code.
+ */ + +namespace Kitodo\Dlf\Tests\Unit\Common; + +use Kitodo\Dlf\Validation\DocumentValidator; +use SimpleXMLElement; +use TYPO3\TestingFramework\Core\Unit\UnitTestCase; + +class DocumentValidatorTest extends UnitTestCase +{ + public function setUp(): void + { + parent::setUp(); + + $this->resetSingletonInstances = true; + } + + /** + * @test + */ + public function passesHasAllMandatoryMetadataFields() + { + $metadata = [ + 'record_id' => [ + 'xyz' + ] + ]; + $documentValidator = new DocumentValidator($metadata, $this->getRequiredMetadataFields()); + self::assertTrue($documentValidator->hasAllMandatoryMetadataFields()); + } + + /** + * @test + */ + public function notPassesHasAllMandatoryMetadataFields() + { + $metadata = [ + 'document_format' => [ + 'METS' + ] + ]; + $documentValidator = new DocumentValidator($metadata, $this->getRequiredMetadataFields()); + self::assertFalse($documentValidator->hasAllMandatoryMetadataFields()); + } + + /** + * @test + */ + public function passesHasCorrectLogicalStructure() + { + $xml = $this->getXml('av_beispiel.xml'); + + $documentValidator = new DocumentValidator([], [], $xml); + self::assertTrue($documentValidator->hasCorrectLogicalStructure('advertisement')); + } + + /** + * @test + */ + public function notPassesHasCorrectLogicalStructure() + { + $xml = $this->getXml('av_beispiel.xml'); + + $documentValidator = new DocumentValidator([], [], $xml); + self::assertFalse($documentValidator->hasCorrectLogicalStructure('newspaper')); + } + + /** + * @test + */ + public function passesHasCorrectPhysicalStructure() + { + $xml = $this->getXml('av_beispiel.xml'); + + $documentValidator = new DocumentValidator([], [], $xml); + self::assertTrue($documentValidator->hasCorrectPhysicalStructure()); + } + + /** + * @test + */ + public function notPassesHasCorrectPhysicalStructure() + { + $xml = $this->getXml('two_dmdsec.xml'); + + $documentValidator = new DocumentValidator([], [], $xml); + 
self::assertFalse($documentValidator->hasCorrectPhysicalStructure()); + } + + private function getRequiredMetadataFields(): array + { + return [ + 'record_id' + ]; + } + + private function getXml(string $file): SimpleXMLElement + { + $xml = simplexml_load_file(__DIR__ . '/../../Fixtures/MetsDocument/' . $file); + self::assertNotFalse($xml); + return $xml; + } +} diff --git a/ext_conf_template.txt b/ext_conf_template.txt index b2e51147a..484353b12 100644 --- a/ext_conf_template.txt +++ b/ext_conf_template.txt @@ -14,6 +14,8 @@ general.publishNewCollections = 1 general.unhideOnIndex = 0 # cat=General; type=boolean; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.general.useExternalApisForMetadata general.useExternalApisForMetadata = 0 +# cat=General; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.general.requiredMetadataFields +general.requiredMetadataFields = document_format,record_id # cat=Files; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.files.fileGrpImages files.fileGrpImages = DEFAULT,MAX # cat=Files; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.files.fileGrpThumbs @@ -25,7 +27,7 @@ files.fileGrpFulltext = FULLTEXT # cat=Files; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.files.fileGrpAudio files.fileGrpAudio = AUDIO # cat=IIIF; type=boolean; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.iiif.indexAnnotations -indexAnnotations = 0 +iiif.indexAnnotations = 0 # cat=IIIF; type=int[1-2000]; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.iiif.thumbnailWidth iiif.thumbnailWidth = 150 # cat=IIIF; type=int[1-2000]; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.iiif.thumbnailHeight