From db6af2783a0187320e957427a789d1b1edfa9713 Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Wed, 26 Jun 2024 04:03:19 -0400 Subject: [PATCH] Merge XMP Metadata if dc:format tag not found (#722) * If no `dc:format` XMP tag, merge metadata Previously `extractXMPMetadata()` would check for the existence of a `dc:format` tag with an `application/pdf` MIME-type value before allowing found XMP metadata to be merged with the other document details. If the tag doesn't exist, merge the metadata anyway. If it _does_ exist, _then_ check to see if it has the `application/pdf` MIME-type. * DocumentTest.php: small code niceup --------- Co-authored-by: Konrad Abicht --- src/Smalot/PdfParser/Document.php | 2 +- tests/PHPUnit/Integration/DocumentTest.php | 32 ++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php index 016787af..df0a6402 100644 --- a/src/Smalot/PdfParser/Document.php +++ b/src/Smalot/PdfParser/Document.php @@ -287,7 +287,7 @@ public function extractXMPMetadata(string $content): void } // Only use this metadata if it's referring to a PDF - if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) { + if (!isset($metadata['dc:format']) || 'application/pdf' == $metadata['dc:format']) { // According to the XMP specifications: 'Conflict resolution // for separate packets that describe the same resource is // beyond the scope of this document.' 
- Section 6.1
diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php
index 5f19b696..346ba633 100644
--- a/tests/PHPUnit/Integration/DocumentTest.php
+++ b/tests/PHPUnit/Integration/DocumentTest.php
@@ -232,4 +232,36 @@ public function testGetPagesMissingCatalog(): void
         $document = $this->getDocumentInstance();
         $document->getPages();
     }
+
+    /**
+     * XMP metadata must still be merged into the document details when the packet has no dc:format tag.
+     *
+     * @see https://github.com/smalot/pdfparser/issues/721
+     */
+    public function testExtractXMPMetadataIssue721(): void
+    {
+        $document = $this->getDocumentInstance();
+        $content = '<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
+<x:xmpmeta xmlns:x="adobe:ns:meta/">
+  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+    <rdf:Description rdf:about="" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xmp="http://ns.adobe.com/xap/1.0/">
+      <dc:creator>
+        <rdf:Seq>
+          <rdf:li>PdfParser</rdf:li>
+        </rdf:Seq>
+      </dc:creator>
+      <xmp:CreateDate>2018-02-07T11:51:44-05:00</xmp:CreateDate>
+      <xmp:ModifyDate>2019-10-23T09:56:01-04:00</xmp:ModifyDate>
+    </rdf:Description>
+  </rdf:RDF>
+</x:xmpmeta>';
+
+        $document->extractXMPMetadata($content);
+        $document->init();
+        $details = $document->getDetails();
+
+        $this->assertCount(4, $details);
+        $this->assertSame('PdfParser', $details['dc:creator']);
+        $this->assertSame('2019-10-23T09:56:01-04:00', $details['xmp:modifydate']);
+    }
 }