Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Implement document validation before document is saved to database #1149

Merged
merged 7 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 65 additions & 59 deletions Classes/Command/BaseCommand.php
sebastian-meyer marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
use Kitodo\Dlf\Domain\Model\Collection;
use Kitodo\Dlf\Domain\Model\Document;
use Kitodo\Dlf\Domain\Model\Library;
use Kitodo\Dlf\Validation\DocumentValidator;
use Symfony\Component\Console\Command\Command;
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Utility\GeneralUtility;
Expand Down Expand Up @@ -216,71 +217,70 @@ protected function saveToDatabase(Document $document): bool
$doc->cPid = $this->storagePid;

$metadata = $doc->getToplevelMetadata($this->storagePid);
$validator = new DocumentValidator($metadata, explode(',', $this->extConf['requiredMetadataFields']));

if ($validator->hasAllMandatoryMetadataFields()) {
// set title data
$document->setTitle($metadata['title'][0] ? : '');
$document->setTitleSorting($metadata['title_sorting'][0] ? : '');
$document->setPlace(implode('; ', $metadata['place']));
$document->setYear(implode('; ', $metadata['year']));
$document->setAuthor($this->getAuthors($metadata['author']));
$document->setThumbnail($doc->thumbnail ? : '');
$document->setMetsLabel($metadata['mets_label'][0] ? : '');
$document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : '');

$structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]);
$document->setStructure($structure);

if (is_array($metadata['collection'])) {
$this->addCollections($document, $metadata['collection']);
}

// set title data
$document->setTitle($metadata['title'][0] ? : '');
$document->setTitleSorting($metadata['title_sorting'][0] ? : '');
$document->setPlace(implode('; ', $metadata['place']));
$document->setYear(implode('; ', $metadata['year']));

// Remove appended "valueURI" from authors' names for storing in database.
foreach ($metadata['author'] as $i => $author) {
$splitName = explode(pack('C', 31), $author);
$metadata['author'][$i] = $splitName[0];
}
$document->setAuthor($this->getAuthors($metadata['author']));
$document->setThumbnail($doc->thumbnail ? : '');
$document->setMetsLabel($metadata['mets_label'][0] ? : '');
$document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : '');
// set identifiers
$document->setProdId($metadata['prod_id'][0] ? : '');
$document->setOpacId($metadata['opac_id'][0] ? : '');
$document->setUnionId($metadata['union_id'][0] ? : '');

$document->setRecordId($metadata['record_id'][0]);
$document->setUrn($metadata['urn'][0] ? : '');
$document->setPurl($metadata['purl'][0] ? : '');
$document->setDocumentFormat($metadata['document_format'][0] ? : '');

// set access
$document->setLicense($metadata['license'][0] ? : '');
$document->setTerms($metadata['terms'][0] ? : '');
$document->setRestrictions($metadata['restrictions'][0] ? : '');
$document->setOutOfPrint($metadata['out_of_print'][0] ? : '');
$document->setRightsInfo($metadata['rights_info'][0] ? : '');
$document->setStatus(0);

$this->setOwner($metadata['owner'][0]);
$document->setOwner($this->owner);

// set volume data
$document->setVolume($metadata['volume'][0] ? : '');
$document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : '');

// Get UID of parent document.
if ($document->getDocumentFormat() === 'METS') {
$document->setPartof($this->getParentDocumentUidForSaving($document));
}

$structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]);
$document->setStructure($structure);
if ($document->getUid() === null) {
// new document
$this->documentRepository->add($document);
} else {
// update of existing document
$this->documentRepository->update($document);
}

if (is_array($metadata['collection'])) {
$this->addCollections($document, $metadata['collection']);
}
$this->persistenceManager->persistAll();

// set identifiers
$document->setProdId($metadata['prod_id'][0] ? : '');
$document->setOpacId($metadata['opac_id'][0] ? : '');
$document->setUnionId($metadata['union_id'][0] ? : '');

$document->setRecordId($metadata['record_id'][0] ? : ''); // (?) $doc->recordId
$document->setUrn($metadata['urn'][0] ? : '');
$document->setPurl($metadata['purl'][0] ? : '');
$document->setDocumentFormat($metadata['document_format'][0] ? : '');

// set access
$document->setLicense($metadata['license'][0] ? : '');
$document->setTerms($metadata['terms'][0] ? : '');
$document->setRestrictions($metadata['restrictions'][0] ? : '');
$document->setOutOfPrint($metadata['out_of_print'][0] ? : '');
$document->setRightsInfo($metadata['rights_info'][0] ? : '');
$document->setStatus(0);

$this->setOwner($metadata['owner'][0]);
$document->setOwner($this->owner);

// set volume data
$document->setVolume($metadata['volume'][0] ? : '');
$document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : '');

// Get UID of parent document.
if ($document->getDocumentFormat() === 'METS') {
$document->setPartof($this->getParentDocumentUidForSaving($document));
return true;
}

if ($document->getUid() === null) {
// new document
$this->documentRepository->add($document);
} else {
// update of existing document
$this->documentRepository->update($document);
}

$this->persistenceManager->persistAll();

return true;
return false;
}

/**
Expand Down Expand Up @@ -371,6 +371,12 @@ private function addCollections(Document &$document, array $collections): void
*/
private function getAuthors(array $metadataAuthor): string
{
// Remove appended "valueURI" from authors' names for storing in database.
foreach ($metadataAuthor as $i => $author) {
$splitName = explode(pack('C', 31), $author);
$metadataAuthor[$i] = $splitName[0];
}

$authors = '';
$delimiter = '; ';
$ellipsis = 'et al.';
Expand Down
27 changes: 18 additions & 9 deletions Classes/Command/IndexCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -180,20 +180,29 @@ protected function execute(InputInterface $input, OutputInterface $output): int

if ($dryRun) {
$io->section('DRY RUN: Would index ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.');
$io->success('All done!');
return BaseCommand::SUCCESS;
} else {
$document->setCurrentDocument($doc);

if ($io->isVerbose()) {
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.');
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . '.');
}
$document->setCurrentDocument($doc);
// save to database
$this->saveToDatabase($document);
// add to index
Indexer::add($document, $this->documentRepository);
}
$isSaved = $this->saveToDatabase($document);

$io->success('All done!');
if ($isSaved) {
if ($io->isVerbose()) {
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on Solr core ' . $solrCoreUid . '.');
}
Indexer::add($document, $this->documentRepository);

return BaseCommand::SUCCESS;
$io->success('All done!');
return BaseCommand::SUCCESS;
}

$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on PID ' . $this->storagePid . ' . There are missing mandatory fields (document format or record identifier) in this document.');
return BaseCommand::FAILURE;
}
}

/**
Expand Down
141 changes: 141 additions & 0 deletions Classes/Validation/DocumentValidator.php
sebastian-meyer marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
<?php

declare(strict_types=1);

/*
* (c) Kitodo. Key to digital objects e.V. <[email protected]>
*
* This file is part of the Kitodo and TYPO3 projects.
*
* @license GNU General Public License version 3 or later.
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*/

namespace Kitodo\Dlf\Validation;

use TYPO3\CMS\Core\Log\Logger;
use TYPO3\CMS\Core\Log\LogManager;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
* Class for document validation. Currently used for validating metadata
* fields but in the future should be extended also for other fields.
*
* @package TYPO3
* @subpackage dlf
*
* @access public
*/
class DocumentValidator
{
/**
* @access protected

Check notice on line 33 in Classes/Validation/DocumentValidator.php

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

Classes/Validation/DocumentValidator.php#L33

Expected 6 space(s) before asterisk; 5 found
* @var Logger This holds the logger

Check notice on line 34 in Classes/Validation/DocumentValidator.php

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

Classes/Validation/DocumentValidator.php#L34

Expected 6 space(s) before asterisk; 5 found
*/

Check notice on line 35 in Classes/Validation/DocumentValidator.php

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

Classes/Validation/DocumentValidator.php#L35

Expected 6 space(s) before asterisk; 5 found
protected Logger $logger;

/**
* @access private
* @var array
*/
private array $metadata;

/**
* @access private
* @var array
*/
private array $requiredMetadataFields;

/**
* @access private
* @var ?\SimpleXMLElement
*/
private ?\SimpleXMLElement $xml;

/**
 * Creates a validator for the given metadata and, optionally, an XML element.
 *
 * @access public
 *
 * @param array $metadata metadata array; entries are read as $metadata[<field>][0]
 * @param array $requiredMetadataFields names of the metadata fields which must not be empty
 * @param ?\SimpleXMLElement $xml XML element queried by the structure checks (may be null)
 *
 * @return void
 */
public function __construct(array $metadata = [], array $requiredMetadataFields = [], ?\SimpleXMLElement $xml = null)
{
    // The logger is named after the concrete (possibly derived) class.
    $logManager = GeneralUtility::makeInstance(LogManager::class);
    $this->logger = $logManager->getLogger(static::class);

    $this->xml = $xml;
    $this->requiredMetadataFields = $requiredMetadataFields;
    $this->metadata = $metadata;
}

/**
 * Check if metadata array contains all mandatory fields before save.
 *
 * A field counts as present when $metadata[<field>][0] is not empty().
 * Field names are trimmed and blank names are skipped, so a list built
 * with explode(',', ...) from an empty or space-padded configuration
 * value does not produce a spurious "missing field" result.
 *
 * @access public
 *
 * @return bool true if every required field is present; false (after
 *              logging the first missing field as an error) otherwise
 */
public function hasAllMandatoryMetadataFields(): bool
{
    foreach ($this->requiredMetadataFields as $requiredMetadataField) {
        $fieldName = trim((string) $requiredMetadataField);
        if ($fieldName === '') {
            // explode(',', '') yields [''] - a blank name is not a real requirement.
            continue;
        }
        if (empty($this->metadata[$fieldName][0])) {
            $this->logger->error('Missing required metadata field "' . $fieldName . '".');
            return false;
        }
    }
    return true;
}

/**
 * Check if xml contains at least one logical structure with given type.
 *
 * Looks for a mets:div with the given @TYPE directly below a
 * mets:structMap[@TYPE="LOGICAL"], relative to the XML element passed
 * to the constructor. Every failure reason is logged as an error.
 *
 * @access public
 *
 * @param string $type e.g. documentary, newspaper or object
 *
 * @return bool true if a matching logical structure exists; false if it
 *              is missing or if no XML was passed to the constructor
 */
public function hasCorrectLogicalStructure(string $type): bool
{
    // The constructor allows $xml to be null; calling xpath() on null would be a fatal error.
    if ($this->xml === null) {
        $this->logger->error('No XML available to validate the logical structure.');
        return false;
    }

    $expectedNodes = $this->xml->xpath('./mets:structMap[@TYPE="LOGICAL"]/mets:div[@TYPE="' . $type . '"]');
    if ($expectedNodes) {
        return true;
    }

    // Distinguish "wrong type" from "no logical structure at all" for the log message.
    $existingNodes = $this->xml->xpath('./mets:structMap[@TYPE="LOGICAL"]/mets:div');
    if ($existingNodes) {
        $this->logger->error('Document contains logical structure but @TYPE="' . $type . '" is missing.');
        return false;
    }

    $this->logger->error('Document does not contain logical structure.');
    return false;
}

/**
 * Check if xml contains at least one physical structure with type 'physSequence'.
 *
 * Looks for a mets:div[@TYPE="physSequence"] directly below a
 * mets:structMap[@TYPE="PHYSICAL"], relative to the XML element passed
 * to the constructor. Every failure reason is logged as an error.
 *
 * @access public
 *
 * @return bool true if such a physical structure exists; false if it is
 *              missing or if no XML was passed to the constructor
 */
public function hasCorrectPhysicalStructure(): bool
{
    // The constructor allows $xml to be null; calling xpath() on null would be a fatal error.
    if ($this->xml === null) {
        $this->logger->error('No XML available to validate the physical structure.');
        return false;
    }

    $physSequenceNodes = $this->xml->xpath('./mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]');
    if ($physSequenceNodes) {
        return true;
    }

    // Distinguish "wrong type" from "no physical structure at all" for the log message.
    $physicalStructureNodes = $this->xml->xpath('./mets:structMap[@TYPE="PHYSICAL"]/mets:div');
    if ($physicalStructureNodes) {
        $this->logger->error('Document contains physical structure but @TYPE="physSequence" is missing.');
        return false;
    }

    $this->logger->error('Document does not contain physical structure.');
    return false;
}
}
4 changes: 4 additions & 0 deletions Resources/Private/Language/de.locallang_labels.xlf
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,10 @@
<target>Verwende externe APIs zum Abrufen von Metadaten?: (Standard ist "FALSE")</target>
<source>Use external APIs for getting metadata?: (default is "FALSE")</source>
</trans-unit>
<trans-unit id="config.requiredMetadataFields">
<target>Für die Indizierung von Dokumenten erforderliche Metadatenfelder</target>
<source>Metadata fields required for indexing documents</source>
</trans-unit>
<trans-unit id="config.fileGrpImages">
<target>Seiten fileGrps: Komma-getrennte Liste der @USE Attributwerte der Seitenansichten nach aufsteigender Größe sortiert (Standard ist "DEFAULT,MAX")</target>
<source>Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX")</source>
Expand Down
3 changes: 3 additions & 0 deletions Resources/Private/Language/locallang_labels.xlf
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,9 @@
<trans-unit id="config.useExternalApisForMetadata">
<source>Use external APIs for getting metadata?: (default is "FALSE")</source>
</trans-unit>
<trans-unit id="config.requiredMetadataFields">
<source>Metadata fields required for indexing documents</source>
</trans-unit>
<trans-unit id="config.fileGrpImages">
<source>Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX")</source>
</trans-unit>
Expand Down
Loading