Skip to content

Commit

Permalink
Implement document validation before document is saved to database
Browse files Browse the repository at this point in the history
  • Loading branch information
beatrycze-volk committed May 21, 2024
1 parent 6f570be commit c5a54c7
Show file tree
Hide file tree
Showing 6 changed files with 231 additions and 66 deletions.
120 changes: 63 additions & 57 deletions Classes/Command/BaseCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
use Kitodo\Dlf\Domain\Model\Collection;
use Kitodo\Dlf\Domain\Model\Document;
use Kitodo\Dlf\Domain\Model\Library;
use Kitodo\Dlf\Validation\DocumentValidator;
use Symfony\Component\Console\Command\Command;
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
use TYPO3\CMS\Core\Utility\GeneralUtility;
Expand Down Expand Up @@ -213,71 +214,76 @@ protected function saveToDatabase(Document $document): bool
$doc->cPid = $this->storagePid;

$metadata = $doc->getToplevelMetadata($this->storagePid);
$validator = new DocumentValidator($metadata, explode(',', $this->extConf['requiredMetadataFields']));

if ($validator->hasAllMandatoryMetadataFields()) {
// set title data
$document->setTitle($metadata['title'][0] ? : '');
$document->setTitleSorting($metadata['title_sorting'][0] ? : '');
$document->setPlace(implode('; ', $metadata['place']));
$document->setYear(implode('; ', $metadata['year']));

// Remove appended "valueURI" from authors' names for storing in database.
foreach ($metadata['author'] as $i => $author) {
$splitName = explode(pack('C', 31), $author);
$metadata['author'][$i] = $splitName[0];
}
$document->setAuthor(implode('; ', $metadata['author']));
$document->setThumbnail($doc->thumbnail ? : '');
$document->setMetsLabel($metadata['mets_label'][0] ? : '');
$document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : '');

// set title data
$document->setTitle($metadata['title'][0] ? : '');
$document->setTitleSorting($metadata['title_sorting'][0] ? : '');
$document->setPlace(implode('; ', $metadata['place']));
$document->setYear(implode('; ', $metadata['year']));
$structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]);
$document->setStructure($structure);

// Remove appended "valueURI" from authors' names for storing in database.
foreach ($metadata['author'] as $i => $author) {
$splitName = explode(chr(31), $author);
$metadata['author'][$i] = $splitName[0];
}
$document->setAuthor($this->getAuthors($metadata['author']));
$document->setThumbnail($doc->thumbnail ? : '');
$document->setMetsLabel($metadata['mets_label'][0] ? : '');
$document->setMetsOrderlabel($metadata['mets_orderlabel'][0] ? : '');
if (is_array($metadata['collection'])) {
$this->addCollections($document, $metadata['collection']);
}

$structure = $this->structureRepository->findOneByIndexName($metadata['type'][0]);
$document->setStructure($structure);
// set identifiers
$document->setProdId($metadata['prod_id'][0] ? : '');
$document->setOpacId($metadata['opac_id'][0] ? : '');
$document->setUnionId($metadata['union_id'][0] ? : '');

$document->setRecordId($metadata['record_id'][0]);
$document->setUrn($metadata['urn'][0] ? : '');
$document->setPurl($metadata['purl'][0] ? : '');
$document->setDocumentFormat($metadata['document_format'][0] ? : '');

// set access
$document->setLicense($metadata['license'][0] ? : '');
$document->setTerms($metadata['terms'][0] ? : '');
$document->setRestrictions($metadata['restrictions'][0] ? : '');
$document->setOutOfPrint($metadata['out_of_print'][0] ? : '');
$document->setRightsInfo($metadata['rights_info'][0] ? : '');
$document->setStatus(0);

$this->setOwner($metadata['owner'][0]);
$document->setOwner($this->owner);

// set volume data
$document->setVolume($metadata['volume'][0] ? : '');
$document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : '');

// Get UID of parent document.
if ($document->getDocumentFormat() === 'METS') {
$document->setPartof($this->getParentDocumentUidForSaving($document));
}

if (is_array($metadata['collection'])) {
$this->addCollections($document, $metadata['collection']);
}
if ($document->getUid() === null) {
// new document
$this->documentRepository->add($document);
} else {
// update of existing document
$this->documentRepository->update($document);
}

// set identifiers
$document->setProdId($metadata['prod_id'][0] ? : '');
$document->setOpacId($metadata['opac_id'][0] ? : '');
$document->setUnionId($metadata['union_id'][0] ? : '');

$document->setRecordId($metadata['record_id'][0] ? : ''); // (?) $doc->recordId
$document->setUrn($metadata['urn'][0] ? : '');
$document->setPurl($metadata['purl'][0] ? : '');
$document->setDocumentFormat($metadata['document_format'][0] ? : '');

// set access
$document->setLicense($metadata['license'][0] ? : '');
$document->setTerms($metadata['terms'][0] ? : '');
$document->setRestrictions($metadata['restrictions'][0] ? : '');
$document->setOutOfPrint($metadata['out_of_print'][0] ? : '');
$document->setRightsInfo($metadata['rights_info'][0] ? : '');
$document->setStatus(0);

$this->setOwner($metadata['owner'][0]);
$document->setOwner($this->owner);

// set volume data
$document->setVolume($metadata['volume'][0] ? : '');
$document->setVolumeSorting($metadata['volume_sorting'][0] ? : $metadata['mets_order'][0] ? : '');

// Get UID of parent document.
if ($document->getDocumentFormat() === 'METS') {
$document->setPartof($this->getParentDocumentUidForSaving($document));
}
$this->persistenceManager->persistAll();

if ($document->getUid() === null) {
// new document
$this->documentRepository->add($document);
} else {
// update of existing document
$this->documentRepository->update($document);
return true;
}

$this->persistenceManager->persistAll();

return true;
return false;
}

/**
Expand Down
27 changes: 18 additions & 9 deletions Classes/Command/IndexCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -180,20 +180,29 @@ protected function execute(InputInterface $input, OutputInterface $output): int

if ($dryRun) {
$io->section('DRY RUN: Would index ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.');
$io->success('All done!');
return BaseCommand::SUCCESS;
} else {
$document->setCurrentDocument($doc);

if ($io->isVerbose()) {
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . ' and Solr core ' . $solrCoreUid . '.');
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on PID ' . $this->storagePid . '.');
}
$document->setCurrentDocument($doc);
// save to database
$this->saveToDatabase($document);
// add to index
Indexer::add($document, $this->documentRepository);
}
$isSaved = $this->saveToDatabase($document);

$io->success('All done!');
if ($isSaved) {
if ($io->isVerbose()) {
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on Solr core ' . $solrCoreUid . '.');
}
Indexer::add($document, $this->documentRepository);

return BaseCommand::SUCCESS;
$io->success('All done!');
return BaseCommand::SUCCESS;
}

$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on PID ' . $this->storagePid . ' . There are missing mandatory fields (document format or record identifier) in this document.');
return BaseCommand::FAILURE;
}
}

/**
Expand Down
141 changes: 141 additions & 0 deletions Classes/Validation/DocumentValidator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
<?php

declare(strict_types=1);

/*
* (c) Kitodo. Key to digital objects e.V. <[email protected]>
*
* This file is part of the Kitodo and TYPO3 projects.
*
* @license GNU General Public License version 3 or later.
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*/

namespace Kitodo\Dlf\Validation;

use TYPO3\CMS\Core\Log\Logger;
use TYPO3\CMS\Core\Log\LogManager;
use TYPO3\CMS\Core\Utility\GeneralUtility;

/**
* Class for document validation. Currently used for validating metadata
* fields but in the future should be extended also for other fields.
*
* @package TYPO3
* @subpackage dlf
*
* @access public
*/
class DocumentValidator
{
    /**
     * @access protected
     * @var Logger This holds the logger
     */
    protected Logger $logger;

    /**
     * @access private
     * @var array Metadata to validate; each field is read as $metadata[<field>][0]
     */
    private array $metadata;

    /**
     * @access private
     * @var array Names of the metadata fields which must not be empty
     */
    private array $requiredMetadataFields;

    /**
     * @access private
     * @var ?\SimpleXMLElement XML used for the structure checks, may be null
     */
    private ?\SimpleXMLElement $xml;

    /**
     * Constructs DocumentValidator instance.
     *
     * @access public
     *
     * @param array $metadata metadata array to check for mandatory fields
     * @param array $requiredMetadataFields names of the mandatory metadata fields
     * @param ?\SimpleXMLElement $xml XML element used by the structure checks
     *
     * @return void
     */
    public function __construct(array $metadata = [], array $requiredMetadataFields = [], ?\SimpleXMLElement $xml = null)
    {
        $this->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(static::class);
        $this->metadata = $metadata;
        $this->requiredMetadataFields = $requiredMetadataFields;
        $this->xml = $xml;
    }

    /**
     * Check if metadata array contains all mandatory fields before save.
     *
     * Field names are trimmed and blank names are skipped, so a list built
     * from a comma-separated string such as "document_format, record_id"
     * (or from an empty string) does not reject every document.
     *
     * @access public
     *
     * @return bool false (and an error log entry) on the first missing field
     */
    public function hasAllMandatoryMetadataFields(): bool
    {
        foreach ($this->requiredMetadataFields as $requiredMetadataField) {
            $fieldName = trim((string) $requiredMetadataField);
            if ($fieldName === '') {
                // Artefact of splitting an empty or trailing-comma list; nothing to check.
                continue;
            }
            if (empty($this->metadata[$fieldName][0])) {
                $this->logger->error('Missing required metadata field "' . $fieldName . '".');
                return false;
            }
        }
        return true;
    }

    /**
     * Check if xml contains at least one logical structure with given type.
     *
     * @access public
     *
     * @param string $type e.g. documentary, newspaper or object
     *
     * @return bool
     */
    public function hasCorrectLogicalStructure(string $type): bool
    {
        if ($this->xml === null) {
            // The constructor allows a missing XML; avoid calling xpath() on null.
            $this->logger->error('No XML available, logical structure can not be validated.');
            return false;
        }

        $expectedNodes = $this->xml->xpath(
            './mets:structMap[@TYPE="LOGICAL"]/mets:div[@TYPE=' . $this->quoteXPathLiteral($type) . ']'
        );
        if ($expectedNodes) {
            return true;
        }

        $existingNodes = $this->xml->xpath('./mets:structMap[@TYPE="LOGICAL"]/mets:div');
        if ($existingNodes) {
            $this->logger->error('Document contains logical structure but @TYPE="' . $type . '" is missing.');
            return false;
        }

        $this->logger->error('Document does not contain logical structure.');
        return false;
    }

    /**
     * Check if xml contains at least one physical structure with type 'physSequence'.
     *
     * @access public
     *
     * @return bool
     */
    public function hasCorrectPhysicalStructure(): bool
    {
        if ($this->xml === null) {
            // The constructor allows a missing XML; avoid calling xpath() on null.
            $this->logger->error('No XML available, physical structure can not be validated.');
            return false;
        }

        $physSequenceNodes = $this->xml->xpath('./mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]');
        if ($physSequenceNodes) {
            return true;
        }

        $physicalStructureNodes = $this->xml->xpath('./mets:structMap[@TYPE="PHYSICAL"]/mets:div');
        if ($physicalStructureNodes) {
            $this->logger->error('Document contains physical structure but @TYPE="physSequence" is missing.');
            return false;
        }

        $this->logger->error('Document does not contain physical structure.');
        return false;
    }

    /**
     * Turn a string into an XPath 1.0 string literal.
     *
     * XPath 1.0 has no escape character, so the delimiter is chosen to be the
     * quote type the value does not contain; a value containing both quote
     * types is assembled with concat().
     *
     * @access private
     *
     * @param string $value raw value to compare against in an XPath expression
     *
     * @return string the quoted literal (or concat() expression)
     */
    private function quoteXPathLiteral(string $value): string
    {
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }
        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }
        // Both quote types present: join the double-quote-free parts with '"'.
        return 'concat("' . implode('", \'"\', "', explode('"', $value)) . '")';
    }
}
4 changes: 4 additions & 0 deletions Resources/Private/Language/de.locallang_labels.xlf
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,10 @@
<target>Verwende externe APIs zum Abrufen von Metadaten?: (Standard ist "FALSE")</target>
<source>Use external APIs for getting metadata?: (default is "FALSE")</source>
</trans-unit>
<trans-unit id="config.requiredMetadataFields">
<target>Für die Indizierung von Dokumenten erforderliche Metadatenfelder</target>
<source>Metadata fields required for indexing documents</source>
</trans-unit>
<trans-unit id="config.fileGrpImages">
<target>Seiten fileGrps: Komma-getrennte Liste der @USE Attributwerte der Seitenansichten nach aufsteigender Größe sortiert (Standard ist "DEFAULT,MAX")</target>
<source>Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX")</source>
Expand Down
3 changes: 3 additions & 0 deletions Resources/Private/Language/locallang_labels.xlf
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,9 @@
<trans-unit id="config.useExternalApisForMetadata">
<source>Use external APIs for getting metadata?: (default is "FALSE")</source>
</trans-unit>
<trans-unit id="config.requiredMetadataFields">
<source>Metadata fields required for indexing documents</source>
</trans-unit>
<trans-unit id="config.fileGrpImages">
<source>Page fileGrps: comma-separated list of @USE attribute values ordered by increasing size (default is "DEFAULT,MAX")</source>
</trans-unit>
Expand Down
2 changes: 2 additions & 0 deletions ext_conf_template.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ publishNewCollections = 1
unhideOnIndex = 0
# cat=Basic; type=boolean; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.useExternalApisForMetadata
useExternalApisForMetadata = 0
# cat=Document; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.requiredMetadataFields
requiredMetadataFields = document_format,record_id
# cat=Files; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.fileGrpImages
fileGrpImages = DEFAULT,MAX
# cat=Files; type=string; label=LLL:EXT:dlf/Resources/Private/Language/locallang_labels.xlf:config.fileGrpThumbs
Expand Down

0 comments on commit c5a54c7

Please sign in to comment.