Skip to content

Commit

Permalink
[FEATURE] Implement document validation before document is saved to SOLR (#1158)
Browse files (browse the repository at this point in the history)

Co-authored-by: Sebastian Meyer <[email protected]>
  • Loading branch information
beatrycze-volk and sebastian-meyer authored May 29, 2024
1 parent 35c7a31 commit 86c0697
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 56 deletions.
10 changes: 8 additions & 2 deletions Classes/Command/IndexCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -194,13 +194,19 @@ protected function execute(InputInterface $input, OutputInterface $output): int
if ($io->isVerbose()) {
$io->section('Indexing ' . $document->getUid() . ' ("' . $document->getLocation() . '") on Solr core ' . $solrCoreUid . '.');
}
Indexer::add($document, $this->documentRepository);
$isSaved = Indexer::add($document, $this->documentRepository);
} else {
$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on PID ' . $this->storagePid . ' . There are missing mandatory fields (at least one of those: ' . $this->extConf['requiredMetadataFields'] . ') in this document.');
return BaseCommand::FAILURE;
}

if ($isSaved) {
$io->success('All done!');
return BaseCommand::SUCCESS;
}

$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on PID ' . $this->storagePid . ' . There are missing mandatory fields (document format or record identifier) in this document.');
$io->error('ERROR: Document with UID "' . $document->getUid() . '" could not be indexed on Solr core ' . $solrCoreUid . ' . There are missing mandatory fields (at least one of those: ' . $this->extConf['requiredMetadataFields'] . ') in this document.');
$io->info('INFO: Document with UID "' . $document->getUid() . '" is already in database. If you want to keep the database and index consistent you need to remove it.');
return BaseCommand::FAILURE;
}
}
Expand Down
115 changes: 62 additions & 53 deletions Classes/Common/Indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use Kitodo\Dlf\Common\Solr\Solr;
use Kitodo\Dlf\Domain\Repository\DocumentRepository;
use Kitodo\Dlf\Domain\Model\Document;
use Kitodo\Dlf\Validation\DocumentValidator;
use Solarium\Core\Query\DocumentInterface;
use Solarium\QueryType\Update\Query\Query;
use Symfony\Component\Console\Input\InputInterface;
Expand Down Expand Up @@ -334,59 +335,67 @@ protected static function processLogical(Document $document, array $logicalUnit)
// Get metadata for logical unit.
$metadata = $doc->metadataArray[$logicalUnit['id']];
if (!empty($metadata)) {
$metadata['author'] = self::removeAppendsFromAuthor($metadata['author']);
// set Owner if available
if ($document->getOwner()) {
$metadata['owner'][0] = $document->getOwner()->getIndexName();
}
// Create new Solr document.
$updateQuery = self::$solr->service->createUpdate();
$solrDoc = self::getSolrDocument($updateQuery, $document, $logicalUnit);
if (MathUtility::canBeInterpretedAsInteger($logicalUnit['points'])) {
$solrDoc->setField('page', $logicalUnit['points']);
}
if ($logicalUnit['id'] == $doc->toplevelId) {
$solrDoc->setField('thumbnail', $doc->thumbnail);
} elseif (!empty($logicalUnit['thumbnailId'])) {
$solrDoc->setField('thumbnail', $doc->getFileLocation($logicalUnit['thumbnailId']));
}
// There can be only one toplevel unit per UID, independently of backend configuration
$solrDoc->setField('toplevel', $logicalUnit['id'] == $doc->toplevelId ? true : false);
$solrDoc->setField('title', $metadata['title'][0], self::$fields['fieldboost']['title']);
$solrDoc->setField('volume', $metadata['volume'][0], self::$fields['fieldboost']['volume']);
// verify date formatting
if(strtotime($metadata['date'][0])) {
$solrDoc->setField('date', self::getFormattedDate($metadata['date'][0]));
}
$solrDoc->setField('record_id', $metadata['record_id'][0]);
$solrDoc->setField('purl', $metadata['purl'][0]);
$solrDoc->setField('location', $document->getLocation());
$solrDoc->setField('urn', $metadata['urn']);
$solrDoc->setField('license', $metadata['license']);
$solrDoc->setField('terms', $metadata['terms']);
$solrDoc->setField('restrictions', $metadata['restrictions']);
$coordinates = json_decode($metadata['coordinates'][0]);
if (is_object($coordinates)) {
$solrDoc->setField('geom', json_encode($coordinates->features[0]));
}
$autocomplete = self::processMetadata($document, $metadata, $solrDoc);
// Add autocomplete values to index.
if (!empty($autocomplete)) {
$solrDoc->setField('autocomplete', $autocomplete);
}
// Add collection information to logical sub-elements if applicable.
if (
in_array('collection', self::$fields['facets'])
&& empty($metadata['collection'])
&& !empty($doc->metadataArray[$doc->toplevelId]['collection'])
) {
$solrDoc->setField('collection_faceting', $doc->metadataArray[$doc->toplevelId]['collection']);
}
try {
$updateQuery->addDocument($solrDoc);
self::$solr->service->update($updateQuery);
} catch (\Exception $e) {
self::handleException($e->getMessage());
$extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey, 'general');
$validator = new DocumentValidator($metadata, explode(',', $extConf['requiredMetadataFields']));

if ($validator->hasAllMandatoryMetadataFields()) {
$metadata['author'] = self::removeAppendsFromAuthor($metadata['author']);
// set Owner if available
if ($document->getOwner()) {
$metadata['owner'][0] = $document->getOwner()->getIndexName();
}
// Create new Solr document.
$updateQuery = self::$solr->service->createUpdate();
$solrDoc = self::getSolrDocument($updateQuery, $document, $logicalUnit);
if (MathUtility::canBeInterpretedAsInteger($logicalUnit['points'])) {
$solrDoc->setField('page', $logicalUnit['points']);
}
if ($logicalUnit['id'] == $doc->toplevelId) {
$solrDoc->setField('thumbnail', $doc->thumbnail);
} elseif (!empty($logicalUnit['thumbnailId'])) {
$solrDoc->setField('thumbnail', $doc->getFileLocation($logicalUnit['thumbnailId']));
}
// There can be only one toplevel unit per UID, independently of backend configuration
$solrDoc->setField('toplevel', $logicalUnit['id'] == $doc->toplevelId ? true : false);
$solrDoc->setField('title', $metadata['title'][0], self::$fields['fieldboost']['title']);
$solrDoc->setField('volume', $metadata['volume'][0], self::$fields['fieldboost']['volume']);
// verify date formatting
if(strtotime($metadata['date'][0])) {
$solrDoc->setField('date', self::getFormattedDate($metadata['date'][0]));
}
$solrDoc->setField('record_id', $metadata['record_id'][0]);
$solrDoc->setField('purl', $metadata['purl'][0]);
$solrDoc->setField('location', $document->getLocation());
$solrDoc->setField('urn', $metadata['urn']);
$solrDoc->setField('license', $metadata['license']);
$solrDoc->setField('terms', $metadata['terms']);
$solrDoc->setField('restrictions', $metadata['restrictions']);
$coordinates = json_decode($metadata['coordinates'][0]);
if (is_object($coordinates)) {
$solrDoc->setField('geom', json_encode($coordinates->features[0]));
}
$autocomplete = self::processMetadata($document, $metadata, $solrDoc);
// Add autocomplete values to index.
if (!empty($autocomplete)) {
$solrDoc->setField('autocomplete', $autocomplete);
}
// Add collection information to logical sub-elements if applicable.
if (
in_array('collection', self::$fields['facets'])
&& empty($metadata['collection'])
&& !empty($doc->metadataArray[$doc->toplevelId]['collection'])
) {
$solrDoc->setField('collection_faceting', $doc->metadataArray[$doc->toplevelId]['collection']);
}
try {
$updateQuery->addDocument($solrDoc);
self::$solr->service->update($updateQuery);
} catch (\Exception $e) {
self::handleException($e->getMessage());
return false;
}
} else {
Helper::log('Tip: If "record_id" field is missing then there is possibility that METS file still contains it but with the wrong source type attribute in "recordIdentifier" element', LOG_SEVERITY_NOTICE);
return false;
}
}
Expand Down
3 changes: 2 additions & 1 deletion Tests/Functional/FunctionalTestCase.php
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ protected function getDlfConfiguration()

return [
'general' => [
'useExternalApisForMetadata' => 0
'useExternalApisForMetadata' => 0,
'requiredMetadataFields' => 'document_format'
],
'files' => [
'fileGrpImages' => 'DEFAULT,MAX',
Expand Down

0 comments on commit 86c0697

Please sign in to comment.