From f0e3985d2a84463def286c73ace6e88d8eac1e47 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 8 Jan 2025 12:53:52 +0200 Subject: [PATCH 1/3] Test: grapheme_extract should slide properly past error bytes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a test to assert that the `$next` parameter of `grapheme_extract()` points to the next byte offset in the input `$haystack` after accounting for the moved offset, according to the docs: > If offset does not point to the first byte of a UTF-8 character, > the start position is moved to the next character boundary. It seems that the existing behavior is to find the next grapheme boundary from the original provided offset, but if the offset doesn’t point to a valid starting byte, the assigned `$next` value will point to the byte that was immediately decoded in the same call, leading to possible infinite loops in user-space code. ``` while ( $at < strlen( $s ) ) { $grapheme = grapheme_extract( "\x85PHP", 1, GRAPHEME_EXTR_COUNT, $at, $at ); // never moves past the second byte, always returns 'P' } ``` --- ext/intl/tests/grapheme2.phpt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ext/intl/tests/grapheme2.phpt b/ext/intl/tests/grapheme2.phpt index 1fb32a7368217..76aead806b43b 100644 --- a/ext/intl/tests/grapheme2.phpt +++ b/ext/intl/tests/grapheme2.phpt @@ -590,6 +590,8 @@ function ut_main() array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, 5, 11, "de" . $char_a_ring_nfd . "f" ), array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, -6, 11, "de" . $char_a_ring_nfd . "f" ), + array( "\x95\x00a\x85b", 1, 0, 2, "\x00" ), + array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 3, $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd ), array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 2, $char_a_ring_nfd . $char_o_diaeresis_nfd ), array( $char_a_ring_nfd . $char_o_diaeresis_nfd . "c", 1, $char_a_ring_nfd . "" ), @@ -1134,6 +1136,7 @@ extract from "a%CC%8Abcde" "2" graphemes - grapheme_extract starting at byte pos extract from "a%CC%8Abcde" "2" graphemes - grapheme_extract starting at byte position -7 with $next = a%CC%8Ab == a%CC%8Ab $next=4 == 4 extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position 5 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11 extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position -6 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11 +extract from "%95%00a%85b" "1" graphemes - grapheme_extract starting at byte position 0 with $next = %00 == %00 $next=2 == 2 extract from "a%CC%8Ao%CC%88o%CC%88" "3" graphemes - grapheme_extract = a%CC%8Ao%CC%88o%CC%88 == a%CC%8Ao%CC%88o%CC%88 extract from "a%CC%8Ao%CC%88o%CC%88" "2" graphemes - grapheme_extract = a%CC%8Ao%CC%88 == a%CC%8Ao%CC%88 extract from "a%CC%8Ao%CC%88c" "1" graphemes - grapheme_extract = a%CC%8A == a%CC%8A From be2b3fa85560f24a6d57b46f7c97a14300767435 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 9 Jan 2025 07:15:25 +0200 Subject: [PATCH 2/3] Update starting offset when scanning past the middle of an existing char. --- ext/intl/grapheme/grapheme_string.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index f33dab8eafa22..77bf4319928a8 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -781,6 +781,7 @@ PHP_FUNCTION(grapheme_extract) while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) { pstr++; + start++; if ( pstr >= str_end ) { intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: invalid input string", 0 ); From 19fa67ae3f8f0e12cff6fb82ac424d01bdebb956 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 10 Jan 2025 09:04:40 +0200 Subject: [PATCH 3/3] Add note in UPGRADING for bug fix. --- UPGRADING | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/UPGRADING b/UPGRADING index ff6d4bc048f94..f9e784357effd 100644 --- a/UPGRADING +++ b/UPGRADING @@ -109,6 +109,10 @@ PHP 8.5 UPGRADE NOTES . IntlDateFormatter::setTimeZone()/datefmt_set_timezone() throws an IntlException on uninitialised classes/clone failures. + . grapheme_extract() properly assigns $next value when skipping over + invalid starting bytes. Previously there were cases where it would + point to the start of the grapheme boundary instead of the end. + - PCNTL: . pcntl_exec() now has a formal return type of false.