diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c
index 4cdf129831278..13d088cf8e0a1 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c
@@ -11560,6 +11560,61 @@ static void mb_wchar_to_cp936(uint32_t *in, size_t len, mb_convert_buf *buf, boo
 	MB_CONVERT_BUF_STORE(buf, out, limit);
 }
 
+/* Step through a GB18030 string one character at a time. Find the last position at or
+ * before `limit` which falls directly after the end of a (single or multi-byte) character.
+ *
+ * Scanning starts at `p`, which is taken to be the start of a character. No byte at or
+ * past `limit` is read. The width of a multi-byte character is decided from its second
+ * byte alone; the bytes after that are skipped without being inspected. */
+static zend_always_inline unsigned char* step_through_gb18030_str(unsigned char *p, unsigned char *limit)
+{
+	while (p < limit) {
+		unsigned char c = *p;
+		if (c < 0x81 || c == 0xFF) {
+			/* Bytes below 0x81, and 0xFF, are stepped over as 1-byte characters */
+			p++;
+		} else {
+			size_t remaining = (size_t)(limit - p);
+			if (remaining == 1) {
+				/* No room for a second byte before `limit`; stop ahead of this character */
+				break;
+			}
+			unsigned char c2 = p[1];
+			/* For a 4-byte char, the 2nd byte will be 0x30-0x39 */
+			size_t w = (c2 >= 0x30 && c2 <= 0x39) ? 4 : 2;
+			if (remaining < w) {
+				break;
+			}
+			p += w;
+		}
+	}
+	return p;
+}
+
+/* Cut a substring out of the GB18030 string which begins at `str` and ends just before `end`.
+ * The cut starts at the last character boundary at or before byte offset `from`, and takes
+ * at most `len` bytes from there, stopping at a character boundary. If `len` bytes counted
+ * from that starting point reach `end`, everything up to `end` is returned as is.
+ * `from` must not be greater than the length of the string. */
+static zend_string* mb_cut_gb18030(unsigned char *str, size_t from, size_t len, unsigned char *end)
+{
+	ZEND_ASSERT(str + from <= end);
+	unsigned char *start = step_through_gb18030_str(str, str + from);
+	/* Clamp `len` by comparing byte counts. Forming `str + from + len` for a very large
+	 * `len` would step far past the end of the string, which is undefined behavior and
+	 * may wrap around so that the clamp is skipped. */
+	size_t avail = (size_t)(end - str) - from;
+	if (len > avail) {
+		len = avail;
+	}
+	if (len >= (size_t)(end - start)) {
+		return zend_string_init_fast((const char*)start, end - start);
+	} else {
+		unsigned char *cut_end = step_through_gb18030_str(start, start + len);
+		return zend_string_init_fast((const char*)start, cut_end - start);
+	}
+}
+
 static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL};
 
 static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
@@ -11594,7 +11649,7 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
 	mb_gb18030_to_wchar,
 	mb_wchar_to_gb18030,
 	NULL,
-	NULL,
+	mb_cut_gb18030,
 };
 
 static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL};
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index
14e7b71ed6287..237a2d2f62baf 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -2363,7 +2363,7 @@ PHP_FUNCTION(mb_strcut) zend_string *encoding = NULL; char *string_val; zend_long from, len; - bool len_is_null = 1; + bool len_is_null = true; mbfl_string string, result, *ret; ZEND_PARSE_PARAMETERS_START(2, 4) diff --git a/ext/mbstring/tests/mb_strcut.phpt b/ext/mbstring/tests/mb_strcut.phpt index d94ac30ceaaf6..a3703cbdfc4c6 100644 --- a/ext/mbstring/tests/mb_strcut.phpt +++ b/ext/mbstring/tests/mb_strcut.phpt @@ -26,6 +26,7 @@ $jis = mb_convert_encoding("漢字 abc カナ", 'JIS', 'UTF-8'); $iso2022jp2004 = mb_convert_encoding("漢字 abc カナ凜", 'ISO-2022-JP-2004', 'UTF-8'); // [1b242851 3441 3b7a 1b2842 20 61 62 63 20 1b242851 252b 254a 7425 1b2842] $iso2022jpms = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-MS', 'UTF-8'); // [1b2442 3441 3b7a 1b2842 20 61 62 63 20 1b2442 252b 254a 1b2842] $iso2022jp_kddi = mb_convert_encoding("漢字 abc カナ", 'ISO-2022-JP-KDDI', 'UTF-8'); +$gb18030 = mb_convert_encoding("漢字 abc カナ", 'GB18030', 'UTF-8'); print "== EUC-JP ==\n"; print MBStringChars(mb_strcut($euc_jp, 6, 5, 'EUC-JP'), 'EUC-JP') . "\n"; @@ -218,9 +219,37 @@ print "UTF-16 section is terminated improperly: [" . mb_strcut("&i6o\x83", 0, 10 print "== GB18030 ==\n"; +print "Empty string: [" . bin2hex(mb_strcut("", 0, 5, 'GB18030')) . "]\n"; +print "Empty string 2: [" . bin2hex(mb_strcut("", -2, 1, 'GB18030')) . "]\n"; +print "Empty string 3: [" . bin2hex(mb_strcut("", 0, -1, 'GB18030')) . "]\n"; print "Invalid byte 0xF5: [" . bin2hex(mb_strcut("\xF5a", 1, 100, 'GB18030')) . "]\n"; print "Double-byte char: [" . bin2hex(mb_strcut("\xAFw", -1, 100, "GB18030")) . "]\n"; +print MBStringChars(mb_strcut($gb18030, 0, 0, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($gb18030, 0, 1, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($gb18030, 0, 2, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($gb18030, 0, 3, 'GB18030'), 'GB18030') . 
"\n"; +print MBStringChars(mb_strcut($gb18030, 0, 4, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($gb18030, 0, 5, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($gb18030, 1, 2, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($gb18030, 1, 3, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($gb18030, 1, 4, 'GB18030'), 'GB18030') . "\n"; + +// U+210A is encoded using 4 bytes in GB18030 +print "Operating on 4-byte GB18030 character:\n"; +$fourbyte = mb_convert_encoding("\x21\x0A", 'GB18030', 'UTF-16BE'); +print MBStringChars(mb_strcut($fourbyte, 0, 4, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 1, 4, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 2, 4, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 3, 4, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 4, 4, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 1, 3, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 2, 3, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 2, 4, 'GB18030'), 'GB18030') . "\n"; +print MBStringChars(mb_strcut($fourbyte, 0, -1, 'GB18030'), 'GB18030') . "\n"; + +print "[" . bin2hex(mb_strcut(hex2bin("84308130"), 2, null, "GB18030")) . "]\n"; + print "== UHC ==\n"; print "Single byte 0x96: [" . bin2hex(mb_strcut("\x96", 1, 1280, "UHC")) . "]\n"; @@ -405,8 +434,31 @@ UTF-16 section ends abruptly: [] UTF-16 section ends abruptly in middle of 2nd codepoint: [] UTF-16 section is terminated improperly: [] == GB18030 == +Empty string: [] +Empty string 2: [] +Empty string 3: [] Invalid byte 0xF5: [] Double-byte char: [] +[] +[] +[9d68] +[9d68] +[9d68 d7d6] +[9d68 d7d6 20] +[9d68] +[9d68] +[9d68 d7d6] +Operating on 4-byte GB18030 character: +[8136bc32] +[] +[] +[] +[] +[] +[] +[] +[] +[] == UHC == Single byte 0x96: [96] == ASCII ==