From cbb003cb08017faaea7c6fd279ab11e8acd8a706 Mon Sep 17 00:00:00 2001
From: Ariel Allon
Date: Tue, 6 Oct 2020 00:06:34 -0500
Subject: [PATCH] Add XML control-character stripping

- Encountered issues with bad XML from some RETS feeds breaking parsing
  on response payloads.
- This matches xml-encoded character references (decimal or hex) for the
  ASCII control characters that XML 1.0 forbids, and then removes those
  from the XML string. References to tab, line feed and carriage return
  (&#9;, &#10;, &#13;) are valid XML and are left in place.
---
 src/Parsers/XML.php | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/Parsers/XML.php b/src/Parsers/XML.php
index 7309ca1..4e840c4 100644
--- a/src/Parsers/XML.php
+++ b/src/Parsers/XML.php
@@ -5,12 +5,43 @@
 
 class XML
 {
+    /**
+     * Matches XML numeric character references (decimal or hex) for the ASCII control
+     * characters that XML 1.0 forbids: 0-8, 11, 12 and 14-31 (hex 0-8, B, C and E-1F).
+     * Tab (&#9;), line feed (&#10;) and carriage return (&#13;) are legal in XML 1.0 and
+     * are left untouched so that encoded whitespace and line breaks survive.
+     * Will match any string that begins with &# and ends with ; and in between is either
+     *  0*([0-8]|1[124-9]|2[0-9]|3[01]) - a decimal integer 0-8, 11, 12 or 14-31
+     *  [xX]0*([0-8BbCcEeFf]|1[0-9A-Fa-f]) - a hex integer 0-8, B, C or E-1F (upper or lower case)
+     * Any number of leading zeros is accepted in both forms.
+     *
+     * @link https://www.w3.org/TR/xml/#charsets
+     * @link https://en.wikipedia.org/wiki/ASCII#Control_characters
+     * @link https://www.liquid-technologies.com/XML/CharRefs.aspx
+     */
+    const REGEX_ASCII_CONTROL_CHARACTERS = '/&#(?:0*(?:[0-8]|1[124-9]|2[0-9]|3[01])|[xX]0*(?:[0-8BbCcEeFf]|1[0-9A-Fa-f]));/';
+
+    /**
+     * Parses an XML payload after removing character references the XML parser would reject.
+     *
+     * @param string|ResponseInterface|Response $string XML text, or a response whose body holds it
+     * @return \SimpleXMLElement
+     * @throws \Exception when the cleaned string is still not well-formed XML
+     */
     public function parse($string)
     {
         if ($string instanceof ResponseInterface or $string instanceof Response) {
             $string = $string->getBody()->__toString();
         }
 
-        return new \SimpleXMLElement((string) $string);
+        $string = (string)$string;
+
+        // preg_replace() returns null on a PCRE failure; keep the unmodified payload in that case.
+        $cleaned = preg_replace(self::REGEX_ASCII_CONTROL_CHARACTERS, '', $string);
+        if ($cleaned !== null) {
+            $string = $cleaned;
+        }
+
+        return new \SimpleXMLElement($string);
     }
 }