diff --git a/samples/ImproperFontFallback.pdf b/samples/ImproperFontFallback.pdf new file mode 100644 index 00000000..2f6669d8 Binary files /dev/null and b/samples/ImproperFontFallback.pdf differ diff --git a/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php b/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php index 1c3b2687..70bc48cb 100644 --- a/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php +++ b/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php @@ -178,7 +178,7 @@ public static function getCodePage(): array "\xfc" => "\u{00fc}", // udieresis "\xfd" => "\u{00fd}", // yacute "\xfe" => "\u{00fe}", // thorn - "\xff" => "\u{00ff}", // ydieresis + "\xff" => "\u{00ff}", // ydieresis ]; } diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index c879176b..798b84d0 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -246,6 +246,39 @@ private function getDefaultFont(Page $page = null): Font return new Font($this->document, null, null, $this->config); } + /** + * @param array<int, array<string, mixed>> $command list of command arrays handed to Font::decodeText() (generic type reconstructed, TODO confirm) + */ + private function getTJUsingFontFallback(Font $font, array $command, Page $page = null): string + { + $orig_text = $font->decodeText($command); + $text = $orig_text; + + // If we make this a Config option, we can add a check if it's + // enabled here. + if (null !== $page) { + $font_ids = array_keys($page->getFonts()); + + // If the decoded text contains control characters (U+0000- + // U+001F or U+007F, which includes tab, LF and CR) the font + // used is probably the wrong one. Retry with each font of + // the page in turn until one gives a clean decode. 
+ while (preg_match('/[\x00-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) { + // If we're out of font IDs, then give up and return the + // text as decoded by the font that was passed in + if (0 == \count($font_ids)) { + return $orig_text; + } + + // Decode again using the next font of the page + $font = $page->getFont(array_shift($font_ids)); + $text = $font->decodeText($command); + } + } + + return $text; + } + /** * @throws \Exception */ @@ -339,8 +372,11 @@ public function getText(Page $page = null): string $command[self::COMMAND] = [$command]; // no break case 'TJ': - $sub_text = $current_font->decodeText($command[self::COMMAND]); - $text .= $sub_text; + $text .= $this->getTJUsingFontFallback( + $current_font, + $command[self::COMMAND], + $page + ); break; // set leading @@ -492,8 +528,11 @@ public function getTextArray(Page $page = null): array $command[self::COMMAND] = [$command]; // no break case 'TJ': - $sub_text = $current_font->decodeText($command[self::COMMAND]); - $text[] = $sub_text; + $text[] = $this->getTJUsingFontFallback( + $current_font, + $command[self::COMMAND], + $page + ); break; // set leading @@ -592,7 +631,7 @@ public function getCommandsText(string $text_part, int &$offset = 0): array case '/': $type = $char; if (preg_match( - '/\G\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si', + '/\G\/([A-Z0-9\._,\+-]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si', $text_part, $matches, 0, @@ -603,7 +642,7 @@ public function getCommandsText(string $text_part, int &$offset = 0): array $command = $matches[1]; $offset += \strlen($matches[0]); } elseif (preg_match( - '/\G\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si', + '/\G\/([A-Z0-9\._,\+-]+)\s+([A-Z]+)\s*/si', $text_part, $matches, 0, diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php index 97741675..ab7b229b 100644 --- a/tests/PHPUnit/Integration/PDFObjectTest.php +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -256,4 +256,38 @@ public function testReversedChars(): void 
$this->assertStringContainsString('שלומי טסט', $pages[0]->getText()); } + + /** + * Tests that a text stream with an improperly selected font code + * page falls back to one that maps all characters. + * + * @see: https://github.com/smalot/pdfparser/issues/586 + */ + public function testImproperFontFallback(): void + { + $filename = $this->rootDir.'/samples/ImproperFontFallback.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $this->assertStringContainsString('сделал', $pages[0]->getText()); + } + + /** + * Tests that a font ID containing a hyphen / dash character is + * correctly parsed by getCommandsText() + * + * @see: https://github.com/smalot/pdfparser/issues/145 + */ + public function testFontIDWithHyphen(): void + { + $pdfObject = $this->getPdfObjectInstance(new Document()); + + $fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf'); + + $this->assertEquals('/', $fontCommandHyphen[0]['t']); + $this->assertEquals('Tf', $fontCommandHyphen[0]['o']); + $this->assertEquals('FID-01 15.00', $fontCommandHyphen[0]['c']); + } }