From 52de298ece5a7f5b4a6ebbeb513ed2d714cc75e7 Mon Sep 17 00:00:00 2001 From: oleibman <10341515+oleibman@users.noreply.github.com> Date: Thu, 21 Nov 2024 22:17:24 -0800 Subject: [PATCH] Ignore Settings::libXmlLoaderOptions Backport of PR #4233. --- CHANGELOG.md | 10 +++++ docs/topics/reading-and-writing-to-file.md | 1 - src/PhpSpreadsheet/Reader/Gnumeric.php | 7 ++-- src/PhpSpreadsheet/Reader/Html.php | 2 +- src/PhpSpreadsheet/Reader/Ods.php | 37 +++++++++---------- src/PhpSpreadsheet/Reader/Xlsx.php | 22 +++++------ src/PhpSpreadsheet/Reader/Xlsx/Properties.php | 5 +-- src/PhpSpreadsheet/Reader/Xml.php | 6 +-- src/PhpSpreadsheet/Settings.php | 10 ++--- .../Reader/Html/HtmlCharsetTest.php | 1 + .../Reader/HTML/charset.ISO-8859-1.html4.html | 2 +- tests/data/Reader/HTML/xhtml4.entity.xhtml | 17 +++++++++ 12 files changed, 70 insertions(+), 50 deletions(-) create mode 100644 tests/data/Reader/HTML/xhtml4.entity.xhtml diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a69fdff53..1e3ede6f55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com) and this project adheres to [Semantic Versioning](https://semver.org). +## 2024-11-22 - 2.1.4 + +### Changed + +- Settings::libXmlLoaderOptions is ignored. Backport of [PR #4233](https://github.com/PHPOffice/PhpSpreadsheet/pull/4233) + +### Deprecated + +- Settings::setLibXmlLoaderOptions() and Settings::getLibXmlLoaderOptions() are no longer needed - no replacement. + ## 2024-11-10 - 2.1.3 ### Fixed diff --git a/docs/topics/reading-and-writing-to-file.md b/docs/topics/reading-and-writing-to-file.md index f1ac473c08..6fb3d1d1d0 100644 --- a/docs/topics/reading-and-writing-to-file.md +++ b/docs/topics/reading-and-writing-to-file.md @@ -298,7 +298,6 @@ versions of Microsoft Excel. **Excel 2003 XML limitations** Please note that Excel 2003 XML format has some limits regarding to styling cells and handling large spreadsheets via PHP. -Also, only files using charset UTF-8 or ISO-8859-* are supported. ### \PhpOffice\PhpSpreadsheet\Reader\Xml diff --git a/src/PhpSpreadsheet/Reader/Gnumeric.php b/src/PhpSpreadsheet/Reader/Gnumeric.php index 723e23bb1a..e57a07f492 100644 --- a/src/PhpSpreadsheet/Reader/Gnumeric.php +++ b/src/PhpSpreadsheet/Reader/Gnumeric.php @@ -11,7 +11,6 @@ use PhpOffice\PhpSpreadsheet\Reader\Security\XmlScanner; use PhpOffice\PhpSpreadsheet\ReferenceHelper; use PhpOffice\PhpSpreadsheet\RichText\RichText; -use PhpOffice\PhpSpreadsheet\Settings; use PhpOffice\PhpSpreadsheet\Shared\File; use PhpOffice\PhpSpreadsheet\Spreadsheet; use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet; @@ -104,7 +103,7 @@ public function listWorksheetNames(string $filename): array $xml = new XMLReader(); $contents = $this->gzfileGetContents($filename); - $xml->xml($contents, null, Settings::getLibXmlLoaderOptions()); + $xml->xml($contents); $xml->setParserProperty(2, true); $worksheetNames = []; @@ -133,7 +132,7 @@ public function listWorksheetInfo(string $filename): array $xml = new XMLReader(); $contents = $this->gzfileGetContents($filename); - $xml->xml($contents, null, Settings::getLibXmlLoaderOptions()); + $xml->xml($contents); $xml->setParserProperty(2, true); $worksheetInfo = []; @@ -247,7 +246,7 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp /** @var XmlScanner */ $securityScanner = $this->securityScanner; - $xml2 = simplexml_load_string($securityScanner->scan($gFileData), 'SimpleXMLElement', Settings::getLibXmlLoaderOptions()); + $xml2 = simplexml_load_string($securityScanner->scan($gFileData)); $xml = self::testSimpleXml($xml2); $gnmXML = $xml->children(self::NAMESPACE_GNM); diff --git a/src/PhpSpreadsheet/Reader/Html.php b/src/PhpSpreadsheet/Reader/Html.php index 12f8b42ef1..6b4dd997fa 100644 --- a/src/PhpSpreadsheet/Reader/Html.php +++ b/src/PhpSpreadsheet/Reader/Html.php @@ -32,7 +32,7 @@ class Html extends BaseReader private const STARTS_WITH_BOM = '/^(?:\xfe\xff|\xff\xfe|\xEF\xBB\xBF)/'; - private const DECLARES_CHARSET = '/ charset=/i'; + private const DECLARES_CHARSET = '/\\bcharset=/i'; /** * Input encoding. diff --git a/src/PhpSpreadsheet/Reader/Ods.php b/src/PhpSpreadsheet/Reader/Ods.php index ceb345dc3f..595cce8d0d 100644 --- a/src/PhpSpreadsheet/Reader/Ods.php +++ b/src/PhpSpreadsheet/Reader/Ods.php @@ -16,7 +16,6 @@ use PhpOffice\PhpSpreadsheet\Reader\Ods\Properties as DocumentProperties; use PhpOffice\PhpSpreadsheet\Reader\Security\XmlScanner; use PhpOffice\PhpSpreadsheet\RichText\RichText; -use PhpOffice\PhpSpreadsheet\Settings; use PhpOffice\PhpSpreadsheet\Shared\Date; use PhpOffice\PhpSpreadsheet\Shared\File; use PhpOffice\PhpSpreadsheet\Spreadsheet; @@ -57,9 +56,12 @@ public function canRead(string $filename): bool $mimeType = $zip->getFromName($stat['name']); } elseif ($zip->statName('META-INF/manifest.xml')) { $xml = simplexml_load_string( - $this->getSecurityScannerOrThrow()->scan($zip->getFromName('META-INF/manifest.xml')), - 'SimpleXMLElement', - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan( + $zip->getFromName( + 'META-INF/manifest.xml' + ) + ) ); if ($xml !== false) { $namespacesContent = $xml->getNamespaces(true); @@ -97,9 +99,8 @@ public function listWorksheetNames(string $filename): array $xml = new XMLReader(); $xml->xml( - $this->getSecurityScannerOrThrow()->scanFile('zip://' . realpath($filename) . '#' . self::INITIAL_FILE), - null, - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scanFile('zip://' . realpath($filename) . '#' . self::INITIAL_FILE) ); $xml->setParserProperty(2, true); @@ -144,9 +145,8 @@ public function listWorksheetInfo(string $filename): array $xml = new XMLReader(); $xml->xml( - $this->getSecurityScannerOrThrow()->scanFile('zip://' . realpath($filename) . '#' . self::INITIAL_FILE), - null, - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scanFile('zip://' . realpath($filename) . '#' . self::INITIAL_FILE) ); $xml->setParserProperty(2, true); @@ -252,9 +252,8 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp // Meta $xml = @simplexml_load_string( - $this->getSecurityScannerOrThrow()->scan($zip->getFromName('meta.xml')), - 'SimpleXMLElement', - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan($zip->getFromName('meta.xml')) ); if ($xml === false) { throw new Exception('Unable to read data from {$pFilename}'); @@ -268,8 +267,8 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp $dom = new DOMDocument('1.01', 'UTF-8'); $dom->loadXML( - $this->getSecurityScannerOrThrow()->scan($zip->getFromName('styles.xml')), - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan($zip->getFromName('styles.xml')) ); $pageSettings = new PageSettings($dom); @@ -278,8 +277,8 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp $dom = new DOMDocument('1.01', 'UTF-8'); $dom->loadXML( - $this->getSecurityScannerOrThrow()->scan($zip->getFromName(self::INITIAL_FILE)), - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan($zip->getFromName(self::INITIAL_FILE)) ); $officeNs = (string) $dom->lookupNamespaceUri('office'); @@ -655,8 +654,8 @@ private function processSettings(ZipArchive $zip, Spreadsheet $spreadsheet): voi { $dom = new DOMDocument('1.01', 'UTF-8'); $dom->loadXML( - $this->getSecurityScannerOrThrow()->scan($zip->getFromName('settings.xml')), - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan($zip->getFromName('settings.xml')) ); //$xlinkNs = $dom->lookupNamespaceUri('xlink'); $configNs = (string) $dom->lookupNamespaceUri('config'); diff --git a/src/PhpSpreadsheet/Reader/Xlsx.php b/src/PhpSpreadsheet/Reader/Xlsx.php index af5b11660d..eba5ae589a 100644 --- a/src/PhpSpreadsheet/Reader/Xlsx.php +++ b/src/PhpSpreadsheet/Reader/Xlsx.php @@ -25,7 +25,6 @@ use PhpOffice\PhpSpreadsheet\Reader\Xlsx\WorkbookView; use PhpOffice\PhpSpreadsheet\ReferenceHelper; use PhpOffice\PhpSpreadsheet\RichText\RichText; -use PhpOffice\PhpSpreadsheet\Settings; use PhpOffice\PhpSpreadsheet\Shared\Date; use PhpOffice\PhpSpreadsheet\Shared\Drawing; use PhpOffice\PhpSpreadsheet\Shared\File; @@ -120,7 +119,7 @@ private function loadZip(string $filename, string $ns = '', bool $replaceUnclose $rels = @simplexml_load_string( $this->getSecurityScannerOrThrow()->scan($contents), 'SimpleXMLElement', - Settings::getLibXmlLoaderOptions(), + 0, $ns ); @@ -135,7 +134,7 @@ private function loadZipNonamespace(string $filename, string $ns): SimpleXMLElem $rels = simplexml_load_string( $this->getSecurityScannerOrThrow()->scan($contents), 'SimpleXMLElement', - Settings::getLibXmlLoaderOptions(), + 0, ($ns === '' ? $ns : '') ); @@ -243,11 +242,13 @@ public function listWorksheetInfo(string $filename): array $xml = new XMLReader(); $xml->xml( - $this->getSecurityScannerOrThrow()->scan( - $this->getFromZipArchive($this->zip, $fileWorksheetPath) - ), - null, - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan( + $this->getFromZipArchive( + $this->zip, + $fileWorksheetPath + ) + ) ); $xml->setParserProperty(2, true); @@ -1950,9 +1951,8 @@ private function readRibbon(Spreadsheet $excel, string $customUITarget, ZipArchi if ($dataRels) { // exists and not empty if the ribbon have some pictures (other than internal MSO) $UIRels = simplexml_load_string( - $this->getSecurityScannerOrThrow()->scan($dataRels), - 'SimpleXMLElement', - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan($dataRels) ); if (false !== $UIRels) { // we need to save id and target to avoid parsing customUI.xml and "guess" if it's a pseudo callback who load the image diff --git a/src/PhpSpreadsheet/Reader/Xlsx/Properties.php b/src/PhpSpreadsheet/Reader/Xlsx/Properties.php index fb501e84ec..1a0517b19f 100644 --- a/src/PhpSpreadsheet/Reader/Xlsx/Properties.php +++ b/src/PhpSpreadsheet/Reader/Xlsx/Properties.php @@ -4,7 +4,6 @@ use PhpOffice\PhpSpreadsheet\Document\Properties as DocumentProperties; use PhpOffice\PhpSpreadsheet\Reader\Security\XmlScanner; -use PhpOffice\PhpSpreadsheet\Settings; use SimpleXMLElement; class Properties @@ -23,9 +22,7 @@ private function extractPropertyData(string $propertyData): ?SimpleXMLElement { // okay to omit namespace because everything will be processed by xpath $obj = simplexml_load_string( - $this->securityScanner->scan($propertyData), - 'SimpleXMLElement', - Settings::getLibXmlLoaderOptions() + $this->securityScanner->scan($propertyData) ); return $obj === false ? null : $obj; diff --git a/src/PhpSpreadsheet/Reader/Xml.php b/src/PhpSpreadsheet/Reader/Xml.php index 65cd282853..c229f1602c 100644 --- a/src/PhpSpreadsheet/Reader/Xml.php +++ b/src/PhpSpreadsheet/Reader/Xml.php @@ -15,7 +15,6 @@ use PhpOffice\PhpSpreadsheet\Reader\Xml\Properties; use PhpOffice\PhpSpreadsheet\Reader\Xml\Style; use PhpOffice\PhpSpreadsheet\RichText\RichText; -use PhpOffice\PhpSpreadsheet\Settings; use PhpOffice\PhpSpreadsheet\Shared\Date; use PhpOffice\PhpSpreadsheet\Shared\File; use PhpOffice\PhpSpreadsheet\Spreadsheet; @@ -134,9 +133,8 @@ private function trySimpleXMLLoadStringPrivate(string $filename, string $fileOrS } if ($continue) { $xml = @simplexml_load_string( - $this->getSecurityScannerOrThrow()->scan($data), - 'SimpleXMLElement', - Settings::getLibXmlLoaderOptions() + $this->getSecurityScannerOrThrow() + ->scan($data) ); } } catch (Throwable $e) { diff --git a/src/PhpSpreadsheet/Settings.php b/src/PhpSpreadsheet/Settings.php index 58d75164bb..2c9adbbf45 100644 --- a/src/PhpSpreadsheet/Settings.php +++ b/src/PhpSpreadsheet/Settings.php @@ -96,6 +96,8 @@ public static function htmlEntityFlags(): int * Set default options for libxml loader. * * @param ?int $options Default options for libxml loader + * + * @deprecated 3.5.0 no longer needed */ public static function setLibXmlLoaderOptions(?int $options): int { @@ -112,14 +114,12 @@ public static function setLibXmlLoaderOptions(?int $options): int * Defaults to LIBXML_DTDLOAD | LIBXML_DTDATTR when not set explicitly. * * @return int Default options for libxml loader + * + * @deprecated 3.5.0 no longer needed */ public static function getLibXmlLoaderOptions(): int { - if (self::$libXmlLoaderOptions === null) { - return self::setLibXmlLoaderOptions(null); - } - - return self::$libXmlLoaderOptions; + return self::$libXmlLoaderOptions ?? (defined('LIBXML_DTDLOAD') ? (LIBXML_DTDLOAD | LIBXML_DTDATTR) : 0); } /** diff --git a/tests/PhpSpreadsheetTests/Reader/Html/HtmlCharsetTest.php b/tests/PhpSpreadsheetTests/Reader/Html/HtmlCharsetTest.php index 2e1ef19dfd..1daacaf62b 100644 --- a/tests/PhpSpreadsheetTests/Reader/Html/HtmlCharsetTest.php +++ b/tests/PhpSpreadsheetTests/Reader/Html/HtmlCharsetTest.php @@ -40,6 +40,7 @@ public static function providerCharset(): array ['charset.UTF-16.lebom.html', 'À1'], ['charset.gb18030.html', '电视机'], ['charset.unknown.html', 'exception'], + ['xhtml4.entity.xhtml', 'exception'], ]; } } diff --git a/tests/data/Reader/HTML/charset.ISO-8859-1.html4.html b/tests/data/Reader/HTML/charset.ISO-8859-1.html4.html index 63e45ec081..6b81fc9e80 100644 --- a/tests/data/Reader/HTML/charset.ISO-8859-1.html4.html +++ b/tests/data/Reader/HTML/charset.ISO-8859-1.html4.html @@ -1,7 +1,7 @@
- +&test; |