From bdb60973cdd82310767e11911ec3666f610fe3df Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Tue, 25 Oct 2022 19:37:03 -0600 Subject: [PATCH 1/8] Updates Unicode data files in a background task Signed-off-by: Jon Stovell --- Sources/ScheduledTasks.php | 6 + Sources/Subs-Charset.php | 4 +- Sources/tasks/UpdateUnicode.php | 1948 +++++++++++++++++++++++++++++++ other/update_unicode_data.php | 1273 +------------------- 4 files changed, 1991 insertions(+), 1240 deletions(-) create mode 100644 Sources/tasks/UpdateUnicode.php diff --git a/Sources/ScheduledTasks.php b/Sources/ScheduledTasks.php index 93edf77c88..98511e62e2 100644 --- a/Sources/ScheduledTasks.php +++ b/Sources/ScheduledTasks.php @@ -1322,6 +1322,12 @@ function scheduled_weekly_maintenance() array('$sourcedir/tasks/UpdateTldRegex.php', 'Update_TLD_Regex', '', 0), array() ); + // Ensure Unicode data files are up to date + $smcFunc['db_insert']('insert', '{db_prefix}background_tasks', + array('task_file' => 'string-255', 'task_class' => 'string-255', 'task_data' => 'string', 'claimed_time' => 'int'), + array('$sourcedir/tasks/UpdateUnicode.php', 'Update_Unicode', '', 0), array() + ); + // Run Cache housekeeping if (!empty($cache_enable) && !empty($cacheAPI)) $cacheAPI->housekeeping(); diff --git a/Sources/Subs-Charset.php b/Sources/Subs-Charset.php index 79f0582832..af0c794196 100644 --- a/Sources/Subs-Charset.php +++ b/Sources/Subs-Charset.php @@ -14,7 +14,9 @@ if (!defined('SMF')) die('No direct access...'); -require_once($sourcedir . '/Unicode/Metadata.php'); +// If this file is missing, we're using an old version of Unicode. +if (!@include_once($sourcedir . '/Unicode/Metadata.php')) + define('SMF_UNICODE_VERSION', '14.0.0.0'); /** * Converts the given UTF-8 string into lowercase. 
diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php new file mode 100644 index 0000000000..3ed45ef9b2 --- /dev/null +++ b/Sources/tasks/UpdateUnicode.php @@ -0,0 +1,1948 @@ + 'Metadata.php', + 'regex' => '/if \(!defined\(\'SMF_UNICODE_VERSION\'\)\)\n\tdefine\(\'SMF_UNICODE_VERSION\', \'\d+(\.\d+)*\'\);/', + 'data' => array( + // 0.0.0.0 will be replaced with correct value at runtime. + "if (!defined('SMF_UNICODE_VERSION'))\n\tdefine('SMF_UNICODE_VERSION', '0.0.0.0');", + ), + ), + 'utf8_normalize_d_maps' => array( + 'file' => 'DecompositionCanonical.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_normalize_d.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Canonical Decomposition maps for Unicode normalization.', + ), + 'data' => array(), + ), + 'utf8_normalize_kd_maps' => array( + 'file' => 'DecompositionCompatibility.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_normalize_kd.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Compatibility Decomposition maps for Unicode normalization.', + ), + 'data' => array(), + ), + 'utf8_compose_maps' => array( + 'file' => 'Composition.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_compose.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Composition maps for Unicode normalization.', + ), + 'data' => array(), + ), + 'utf8_combining_classes' => array( + 'file' => 'CombiningClasses.php', + 'key_type' => 'hexchar', + 'val_type' => 'int', + 'desc' => array('Helper function for utf8_normalize_d.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Combining Class data for Unicode normalization.', + ), + 'data' => array(), + ), + 'utf8_strtolower_simple_maps' => array( + 'file' => 'CaseLower.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for 
utf8_strtolower.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Uppercase to lowercase maps.', + ), + 'data' => array(), + ), + 'utf8_strtolower_maps' => array( + 'file' => 'CaseLower.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_strtolower.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Uppercase to lowercase maps.', + ), + 'data' => array(), + ), + 'utf8_strtoupper_simple_maps' => array( + 'file' => 'CaseUpper.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_strtoupper.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Lowercase to uppercase maps.', + ), + 'data' => array(), + ), + 'utf8_strtoupper_maps' => array( + 'file' => 'CaseUpper.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_strtoupper.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Lowercase to uppercase maps.', + ), + 'data' => array(), + ), + 'utf8_titlecase_simple_maps' => array( + 'file' => 'CaseTitle.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_convert_case.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Simple title case maps.', + ), + 'data' => array(), + ), + 'utf8_titlecase_maps' => array( + 'file' => 'CaseTitle.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_convert_case.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Full title case maps.', + ), + 'data' => array(), + ), + 'utf8_casefold_simple_maps' => array( + 'file' => 'CaseFold.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_casefold.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Casefolding maps.', + ), + 'data' => array(), + ), + 'utf8_casefold_maps' => array( + 'file' => 'CaseFold.php', + 'key_type' => 'hexchar', + 'val_type' => 
'hexchar', + 'desc' => array('Helper function for utf8_casefold.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Casefolding maps.', + ), + 'data' => array(), + ), + 'utf8_default_ignorables' => array( + 'file' => 'DefaultIgnorables.php', + 'key_type' => 'int', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for utf8_normalize_kc_casefold.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Characters with the \'Default_Ignorable_Code_Point\' property.', + ), + 'data' => array(), + ), + 'utf8_regex_properties' => array( + 'file' => 'RegularExpressions.php', + 'key_type' => 'string', + 'val_type' => 'string', + 'propfiles' => array( + 'DerivedCoreProperties.txt', + 'PropList.txt', + 'emoji/emoji-data.txt', + 'extracted/DerivedGeneralCategory.txt', + ), + 'props' => array( + 'Bidi_Control', + 'Case_Ignorable', + 'Cn', + 'Default_Ignorable_Code_Point', + 'Emoji', + 'Emoji_Modifier', + 'Ideographic', + 'Join_Control', + 'Regional_Indicator', + 'Variation_Selector', + ), + 'desc' => array( + 'Helper function for utf8_sanitize_invisibles and utf8_convert_case.', + '', + 'Character class lists compiled from:', + 'https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt', + 'https://unicode.org/Public/UNIDATA/PropList.txt', + 'https://unicode.org/Public/UNIDATA/emoji/emoji-data.txt', + 'https://unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt', + ), + 'return' => array( + 'type' => 'array', + 'desc' => 'Character classes for various Unicode properties.', + ), + 'data' => array(), + ), + 'utf8_regex_variation_selectors' => array( + 'file' => 'RegularExpressions.php', + 'key_type' => 'string', + 'val_type' => 'string', + 'desc' => array( + 'Helper function for utf8_sanitize_invisibles.', + '', + 'Character class lists compiled from:', + 'https://unicode.org/Public/UNIDATA/StandardizedVariants.txt', + 'https://unicode.org/Public/UNIDATA/emoji/emoji-variation-sequences.txt', + ), + 'return' => array( + 'type' => 'array', + 
'desc' => 'Character classes for filtering variation selectors.', + ), + 'data' => array(), + ), + 'utf8_regex_joining_type' => array( + 'file' => 'RegularExpressions.php', + 'key_type' => 'string', + 'val_type' => 'string', + 'desc' => array( + 'Helper function for utf8_sanitize_invisibles.', + '', + 'Character class lists compiled from:', + 'https://unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt', + ), + 'return' => array( + 'type' => 'array', + 'desc' => 'Character classes for joining characters in certain scripts.', + ), + 'data' => array(), + ), + 'utf8_regex_indic' => array( + 'file' => 'RegularExpressions.php', + 'key_type' => 'string', + 'val_type' => 'string', + 'desc' => array( + 'Helper function for utf8_sanitize_invisibles.', + '', + 'Character class lists compiled from:', + 'https://unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt', + 'https://unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt', + ), + 'return' => array( + 'type' => 'array', + 'desc' => 'Character classes for Indic scripts that use viramas.', + ), + 'data' => array(), + ), + 'idna_maps' => array( + 'file' => 'Idna.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for idn_to_* polyfills.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Character maps for IDNA processing.', + ), + 'data' => array(), + ), + 'idna_maps_deviation' => array( + 'file' => 'Idna.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for idn_to_* polyfills.'), + 'return' => array( + 'type' => 'array', + 'desc' => '"Deviation" character maps for IDNA processing.', + ), + 'data' => array(), + ), + 'idna_maps_not_std3' => array( + 'file' => 'Idna.php', + 'key_type' => 'hexchar', + 'val_type' => 'hexchar', + 'desc' => array('Helper function for idn_to_* polyfills.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Non-STD3 character maps for IDNA processing.', + ), + 'data' => array(), + 
), + 'idna_regex' => array( + 'file' => 'Idna.php', + 'key_type' => 'string', + 'val_type' => 'string', + 'desc' => array('Helper function for idn_to_* polyfills.'), + 'return' => array( + 'type' => 'array', + 'desc' => 'Regular expressions useful for IDNA processing.', + ), + 'data' => array(), + ), + ); + + // Prefetching the files helps ensure the task runs smoothly. + private $prefetch = array( + self::DATA_URL_UCD => array( + 'CaseFolding.txt', + 'DerivedAge.txt', + 'DerivedCoreProperties.txt', + 'DerivedNormalizationProps.txt', + 'IndicSyllabicCategory.txt', + 'PropertyValueAliases.txt', + 'PropList.txt', + 'ScriptExtensions.txt', + 'Scripts.txt', + 'SpecialCasing.txt', + 'StandardizedVariants.txt', + 'UnicodeData.txt', + 'emoji/emoji-data.txt', + 'emoji/emoji-variation-sequences.txt', + 'extracted/DerivedGeneralCategory.txt', + 'extracted/DerivedJoiningType.txt', + ), + self::DATA_URL_IDNA => array( + 'IdnaMappingTable.txt', + ), + ); + + /** + * This executes the task. + * + * @return bool Always returns true + */ + public function execute() + { + global $sourcedir, $smcFunc; + + /***************** + * Part 1: Setup * + *****************/ + $this->unicodedir = $sourcedir . DIRECTORY_SEPARATOR . 'Unicode'; + + // We need a temporary directory to hold our files while we work on them. + $this->make_temp_dir(); + + if (empty($this->temp_dir)) + return true; + + // Do we even need to update? + if (!$this->should_update()) + { + $this->deltree($this->temp_dir); + return true; + } + + @ini_set('memory_limit', '256M'); + + foreach ($this->funcs as $func_name => &$func_info) + { + $file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file'])); + + if (!file_exists($file_paths['temp'])) + touch($file_paths['temp']); + + if (!is_file($file_paths['temp'])) + { + log_error($file_paths['temp'] . ' is not a file.'); + return true; + } + + if (!smf_chmod($file_paths['temp'])) + { + log_error($file_paths['temp'] . 
' is not writable.'); + return true; + } + + $file_contents['temp'] = file_get_contents($file_paths['temp']); + + if (empty($file_contents['temp'])) + { + file_put_contents($file_paths['temp'], $this->smf_file_header()); + } + elseif (substr($file_contents['temp'], -2) === '?' . '>') + { + file_put_contents($file_paths['temp'], substr($file_contents['temp'], 0, -2)); + } + } + + // Prefetch the files in case the network is slow. + foreach ($this->prefetch as $data_url => $files) + { + foreach ($files as $filename) + { + $local_file = $this->fetch_unicode_file($filename, $data_url); + + // If prefetch is taking a really long time, pause and try again later. + if ($local_file === false || microtime(true) - TIME_START >= MAX_CLAIM_THRESHOLD - 1) + { + $smcFunc['db_insert']('', + '{db_prefix}background_tasks', + array( + 'task_file' => 'string', + 'task_class' => 'string', + 'task_data' => 'string', + 'claimed_time' => 'int', + ), + array( + '$sourcedir/tasks/UpdateUnicode.php', + 'Update_Unicode', + '', + time() - MAX_CLAIM_THRESHOLD, + ), + array('id_task') + ); + + return true; + } + } + } + + /********************************************* + * Part 2: Normalization, case folding, etc. 
* + *********************************************/ + $this->process_derived_normalization_props(); + $this->process_main_unicode_data(); + $this->process_casing_data(); + $this->finalize_decomposition_forms(); + + $this->full_decomposition_maps = array(); + $this->derived_normalization_props = array(); + $this->export_funcs_to_file(); + + /*********************************** + * Part 3: Regular expression data * + ***********************************/ + $this->build_regex_properties(); + $this->build_regex_variation_selectors(); + $this->build_script_stats(); + $this->build_regex_joining_type(); + $this->build_regex_indic(); + + unset($this->funcs['utf8_combining_classes']['data']); + $this->export_funcs_to_file(); + + /********************************* + * Part 4: IDNA maps and regexes * + *********************************/ + $this->build_idna(); + $this->export_funcs_to_file(); + + /******************* + * Part 5: Wrapup. * + *******************/ + $done_files = array(); + + foreach ($this->funcs as $func_name => $func_info) + { + $file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file']; + $file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file']; + + if (in_array($file_paths['temp'], $done_files)) + continue; + + // Add closing PHP tag to the temp file. + file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND); + + $done_files[] = $file_paths['temp']; + + // Only move if the file has changed, discounting the license block. + foreach (array('temp', 'real') as $f) + { + if (file_exists($file_paths[$f])) + { + $file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f])); + } + else + $file_contents[$f] = ''; + } + + if ($file_contents['temp'] !== $file_contents['real']) + rename($file_paths['temp'], $file_paths['real']); + } + + // Clean up after ourselves. + $this->deltree($this->temp_dir); + + // All done. 
+ return true; + } + + /** + * Makes a temporary directory to hold our working files, and sets + * $this->temp_dir to the path of the created directory. + */ + private function make_temp_dir() + { + global $sourcedir; + + if (empty($this->temp_dir)) + { + require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php'); + + $this->temp_dir = rtrim(sm_temp_dir(), DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 'Unicode'; + + if (!is_dir($this->temp_dir)) + @mkdir($this->temp_dir); + + // Needs to be a writable directory. + if (!is_dir($this->temp_dir) || !smf_chmod($this->temp_dir)) + $this->temp_dir = null; + } + } + + /** + * Fetches the contents of a Unicode data file. + * + * Caches a local copy for subsequent lookups. + * + * @param string $filename Name of a Unicode datafile, relative to $data_url. + * @param string $data_url One of this class's DATA_URL_* constants. + * + * @return string Path to locally saved copy of the file. + */ + private function fetch_unicode_file($filename, $data_url) + { + global $sourcedir; + + $filename = ltrim($filename, '\\/'); + $file_url_name = strtr($filename, array('\\' => '/')); + $file_local_name = strtr($filename, array('\\' => DIRECTORY_SEPARATOR, '/' => DIRECTORY_SEPARATOR)); + + switch ($data_url) + { + case self::DATA_URL_IDNA: + $sub_dir = 'idna'; + break; + + default: + $sub_dir = 'ucd'; + break; + } + + $local_file = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $sub_dir, $file_local_name)); + + if (file_exists($local_file)) + return $local_file; + + if (!file_exists(dirname($local_file))) + { + @mkdir(dirname($local_file), 0777, true); + + if (!is_dir(dirname($local_file))) + return false; + } + + require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php'); + + $file_contents = fetch_web_data($data_url . '/' . 
$file_url_name); + + if (empty($file_contents)) + return false; + + file_put_contents($local_file, $file_contents); + + $this->files_to_fetch[$sub_dir][] = $filename; + + return $local_file; + } + + /** + * Deletes a directory and its contents. + * + * @param string Path to directory + */ + private function deltree($dir_path) + { + // For safety. + if (strpos($dir_path, $this->temp_dir) !== 0) + return; + + $dir = new DirectoryIterator($dir_path); + + $to_delete = array(); + foreach ($dir as $fileinfo) + { + if ($fileinfo->isDot()) + continue; + + if ($fileinfo->isDir()) + $this->deltree($fileinfo->getPathname()); + else + $to_delete[] = $fileinfo->getPathname(); + } + + foreach ($to_delete as $pathname) + unlink($pathname); + + rmdir($dir_path); + } + + /** + * Gets basic boilerplate for the PHP files that will be created. + * + * @return string Standard SMF file header. + */ + private function smf_file_header() + { + global $sourcedir; + + static $file_template; + + if (!empty($file_template)) + return $file_template; + + require_once($sourcedir . '/Subs-Admin.php'); + $settings_defs = get_settings_defs(); + + $license_block = ''; + + $keep_line = true; + foreach (explode("\n", $settings_defs[0]['text']) as $line) + { + if (strpos($line, 'SMF') !== false || strpos($line, 'Simple Machines') !== false) + $keep_line = true; + + if ($keep_line) + $license_block .= $line . "\n"; + + if ($line === '/**') + $keep_line = false; + } + + $file_template = implode("\n\n", array( + '<' . '?php', + trim($license_block), + "if (!defined('SMF'))\n\tdie('No direct access...');", + '', + )); + + return $file_template; + } + + /** + * Updates Unicode data functions in their designated files. + */ + function export_funcs_to_file() + { + foreach ($this->funcs as $func_name => $func_info) + { + if (empty($func_info['data'])) + continue; + + $temp_file_path = $this->temp_dir . '/' . 
$func_info['file']; + + list($func_code, $func_regex) = $this->get_function_code_and_regex($func_name); + + $file_contents = file_get_contents($temp_file_path); + + if (preg_match($func_regex, $file_contents)) + { + file_put_contents($temp_file_path, preg_replace($func_regex, $func_code, $file_contents)); + } + else + { + file_put_contents($temp_file_path, $func_code . "\n\n", FILE_APPEND); + } + + // Free up some memory. + if ($func_name != 'utf8_combining_classes') + unset($this->funcs[$func_name]['data']); + } + } + + /** + * Builds complete code for the specified element in $this->funcs + * to be inserted into the relevant PHP file. Also builds a regex + * to check whether a copy of the the function is already present + * in the file. + * + * @param string $func_name Key of an element in $this->funcs. + * + * @return array PHP code and a regular expression. + */ + private function get_function_code_and_regex($func_name) + { + // No function name means data is raw code. + if (!is_string($func_name)) + { + $func_code = implode("\n\n", $this->funcs[$func_name]['data']); + $func_regex = isset($this->funcs[$func_name]['regex']) ? $this->funcs[$func_name]['regex'] : '/' . preg_quote($func_code, '/') . '/'; + } + else + { + // The regex to look for this function in the existing file content. + $func_regex = "/(\/\*([^*]|\*(?!\/))*\*\/\n)?function $func_name\(\)\n{.+?\n}/s"; + + // The PHPDoc comment for this function. + $func_code = '/**' . implode("\n * ", array_merge( + array(''), + $this->funcs[$func_name]['desc'], + array( + '', + 'Developers: Do not update the data in this function manually. Instead,', + 'run "php -f other/update_unicode_data.php" on the command line.', + ), + empty($this->funcs[$func_name]['return']) ? array() : array( + '', + '@return ' . implode(' ', $this->funcs[$func_name]['return']) + ), + )) . "\n */\n"; + + // The code for this function. + $func_code .= implode("\n", array( + 'function ' . $func_name . '()', + '{', + "\t" . 
'return array(', + '', + )); + + $this->build_func_array( + $func_code, + $this->funcs[$func_name]['data'], + $this->funcs[$func_name]['key_type'], + $this->funcs[$func_name]['val_type'] + ); + + $func_code .= implode("\n", array( + "\t" . ');', + '}', + )); + } + + // Some final tidying. + $func_code = str_replace('\\\\x', '\x', $func_code); + $func_code = preg_replace('/\h+$/m', '', $func_code); + + return array($func_code, $func_regex); + } + + /** + * Helper for get_function_code_and_regex(). Builds the function's data array. + * + * @param string &$func_code The raw string that contains function code. + * @param array $data Data to format as an array. + * @param string $key_type How to format the array keys. + * @param string $val_type How to format the array values. + */ + private function build_func_array(&$func_code, $data, $key_type, $val_type) + { + static $indent = 2; + + foreach ($data as $key => $value) + { + $func_code .= str_repeat("\t", $indent); + + if ($key_type == 'hexchar') + { + $func_code .= '"'; + + $key = mb_decode_numericentity(str_replace(' ', '', $key), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8'); + + foreach (unpack('C*', $key) as $byte_value) + { + $func_code .= '\x' . strtoupper(dechex($byte_value)); + } + + $func_code .= '" => '; + } + elseif ($key_type == 'string' && !is_int($key)) + { + $func_code .= var_export($key, true) . ' => '; + } + + if (is_array($value)) + { + if ($val_type == 'string' && count($value) === count($value, COUNT_RECURSIVE)) + { + $nextline = "\n" . str_repeat("\t", $indent + 1); + + $func_code = rtrim($func_code); + + $func_code .= $nextline . implode(' .' . $nextline, array_map( + function ($v) + { + return var_export($v, true); + }, + $value + )); + } + else + { + $func_code .= 'array(' . "\n"; + + $indent++; + $this->build_func_array($func_code, $value, $key_type, $val_type); + $indent--; + + $func_code .= str_repeat("\t", $indent) . 
')';
+				}
+			}
+			elseif ($val_type == 'hexchar')
+			{
+				$func_code .= '"';
+
+				$value = mb_decode_numericentity(str_replace(' ', '', $value), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');
+				foreach (unpack('C*', $value) as $byte_value)
+				{
+					$func_code .= '\x' . strtoupper(dechex($byte_value));
+				}
+
+				$func_code .= '"';
+			}
+			elseif ($val_type == 'string')
+			{
+				$func_code .= var_export($value, true);
+			}
+			else
+			{
+				$func_code .= $value;
+			}
+
+			$func_code .= ',' . "\n";
+		}
+	}
+
+	/**
+	 * Compares version of SMF's local Unicode data with the latest release.
+	 *
+	 * This must always return an explicit boolean, because execute() abandons
+	 * the whole task whenever the returned value is falsy.
+	 *
+	 * @return bool True if the local Metadata.php file is missing or declares
+	 *    an older version than the latest release. False if the latest version
+	 *    could not be determined or the local data is already up to date.
+	 */
+	private function should_update()
+	{
+		$this->lookup_ucd_version();
+
+		// We can't do anything if lookup failed.
+		if (empty($this->ucd_version))
+			return false;
+
+		$metadata_file = $this->unicodedir . DIRECTORY_SEPARATOR . 'Metadata.php';
+
+		// If this file is missing, there is no local data to compare against, so force an update.
+		if (!is_file($metadata_file))
+			return true;
+
+		require_once($metadata_file);
+
+		// A metadata file that declares no version is treated like a missing one.
+		if (!defined('SMF_UNICODE_VERSION'))
+			return true;
+
+		return version_compare($this->ucd_version, SMF_UNICODE_VERSION, '>');
+	}
+
+	/**
+	 * Looks up the version number of the latest release of the Unicode
+	 * Character Database and saves it in $this->ucd_version.
+	 *
+	 * The version is read from the UCD's ReadMe.txt file and padded with '0'
+	 * components to a minimum of four (e.g. '15.0.0' becomes '15.0.0.0').
+	 * Once known, it also replaces the '0.0.0.0' placeholder in the data for
+	 * Metadata.php in $this->funcs.
+	 *
+	 * @return bool Whether the version number is known.
+	 */
+	private function lookup_ucd_version()
+	{
+		// Already looked it up.
+		if (!empty($this->ucd_version))
+			return true;
+
+		$local_file = $this->fetch_unicode_file('ReadMe.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		preg_match('/Version\s+(\d+(?:\.\d+)*)/', file_get_contents($local_file), $matches);
+
+		if (empty($matches[1]))
+			return false;
+
+		$this->ucd_version = implode('.', array_pad(explode('.', $matches[1]), 4, '0'));
+
+		// Update this while we are at it.
+		// Write through the key instead of a by-reference loop variable, so
+		// that no dangling reference into $this->funcs is left behind.
+		foreach ($this->funcs as $func_name => $func_info)
+		{
+			if ($func_info['file'] === 'Metadata.php')
+			{
+				$this->funcs[$func_name]['data'][0] = str_replace('0.0.0.0', $this->ucd_version, $func_info['data'][0]);
+
+				break;
+			}
+		}
+
+		return true;
+	}
+
+	/**
+	 * Processes DerivedNormalizationProps.txt in order to populate
+	 * $this->derived_normalization_props.
+	 */
+	private function process_derived_normalization_props()
+	{
+		$path = $this->fetch_unicode_file('DerivedNormalizationProps.txt', self::DATA_URL_UCD);
+
+		if (empty($path))
+			return false;
+
+		foreach (file($path) as $raw_line)
+		{
+			// Drop everything from the first '#' onward; only lines that still
+			// contain a ';' field separator carry data.
+			$record = substr($raw_line, 0, strcspn($raw_line, '#'));
+
+			if (strpos($record, ';') === false)
+				continue;
+
+			$fields = array_map('trim', explode(';', $record));
+
+			$code_points = $fields[0];
+			$prop = $fields[1];
+
+			if (!isset($this->derived_normalization_props[$prop]))
+				$this->derived_normalization_props[$prop] = array();
+
+			// Turn the code point, or range of code points, into a list of hex entities.
+			$entities = array();
+
+			if (strpos($code_points, '..') !== false)
+			{
+				list($first, $last) = explode('..', $code_points);
+
+				for ($ord = hexdec($first), $ord_max = hexdec($last); $ord <= $ord_max; $ord++)
+					$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
+			}
+			else
+				$entities[] = '&#x' . $code_points . ';';
+
+			// Work out what these characters map to. 'SAME' marks a character
+			// that maps to itself, which is the case when there is no third field.
+			if (!isset($fields[2]))
+				$mapped = 'SAME';
+			elseif (!in_array($prop, array('FC_NFKC', 'NFKC_CF')))
+				$mapped = $fields[2];
+			elseif ($fields[2] === '')
+				$mapped = '';
+			else
+				$mapped = '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
+
+			foreach ($entities as $entity)
+				$this->derived_normalization_props[$prop][$entity] = $mapped === 'SAME' ? $entity : $mapped;
+		}
+	}
+
+	/**
+	 * Processes UnicodeData.txt in order to populate $this->char_data,
+	 * $this->full_decomposition_maps, and the 'data' element of most elements
+	 * of $this->funcs.
+ */ + private function process_main_unicode_data() + { + $local_file = $this->fetch_unicode_file('UnicodeData.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + if (!empty($fields[3])) + { + $this->funcs['utf8_combining_classes']['data']['&#x' . $fields[0] . ';'] = $fields[3]; + } + + // Uppercase maps. + if ($fields[12] !== '') + { + $this->funcs['utf8_strtoupper_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[12] . ';'; + } + + // Lowercase maps. + if ($fields[13] !== '') + { + $this->funcs['utf8_strtolower_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[13] . ';'; + } + + // Titlecase maps, where different from uppercase maps. + if ($fields[14] !== '' && $fields[14] !== $fields[12]) + { + $this->funcs['utf8_titlecase_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[14] . ';'; + } + + // Remember this character's general category for later. + $this->char_data['&#x' . $fields[0] . ';']['General_Category'] = $fields[2]; + + if ($fields[5] === '') + { + continue; + } + + // All canonical decompositions AND all compatibility decompositions. + $this->full_decomposition_maps['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';'; + + // Just the canonical decompositions. + if (strpos($fields[5], '<') === false) + { + $this->funcs['utf8_normalize_d_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . ';'; + } + } + } + + /** + * Processes SpecialCasing.txt and CaseFolding.txt in order to get + * finalized versions of all case conversion data. + */ + private function process_casing_data() + { + // Full case conversion maps are the same as the simple ones, unless they're not. 
+ $this->funcs['utf8_strtoupper_maps']['data'] = $this->funcs['utf8_strtoupper_simple_maps']['data']; + $this->funcs['utf8_strtolower_maps']['data'] = $this->funcs['utf8_strtolower_simple_maps']['data']; + $this->funcs['utf8_titlecase_maps']['data'] = $this->funcs['utf8_titlecase_simple_maps']['data']; + + // Deal with the special casing data. + $local_file = $this->fetch_unicode_file('SpecialCasing.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + // Unconditional mappings. + // Note: conditional mappings need to be handled by more complex code. + if (empty($fields[4])) + { + $this->funcs['utf8_strtolower_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[1])) . ';'; + + $this->funcs['utf8_strtoupper_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[3])) . ';'; + + // Titlecase only where different from uppercase. + if ($fields[3] !== $fields[2]) + { + $this->funcs['utf8_titlecase_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; + } + } + } + + ksort($this->funcs['utf8_strtolower_maps']['data']); + ksort($this->funcs['utf8_strtoupper_maps']['data']); + ksort($this->funcs['utf8_titlecase_maps']['data']); + + // Deal with the case folding data. 
+ $local_file = $this->fetch_unicode_file('CaseFolding.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + // Full casefolding. + if (in_array($fields[1], array('C', 'F'))) + { + $this->funcs['utf8_casefold_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; + } + + // Simple casefolding. + if (in_array($fields[1], array('C', 'S'))) + $this->funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; + } + } + + /** + * Finalizes all the decomposition forms. + * + * This is necessary because some characters decompose to other characters + * that themselves decompose further. + */ + private function finalize_decomposition_forms() + { + // Iterate until we reach the final decomposition forms. + // First we do the compatibility decomposition forms. + $changed = true; + while ($changed) + { + $temp = array(); + foreach ($this->full_decomposition_maps as $composed => $decomposed) + { + $parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed; + + foreach ($parts as $partnum => $hex) + { + if (isset($this->full_decomposition_maps[$hex])) + { + $parts[$partnum] = $this->full_decomposition_maps[$hex]; + } + } + + $decomposed = implode(' ', $parts); + unset($parts); + + $temp[$composed] = $decomposed; + } + + $changed = $this->full_decomposition_maps !== $temp; + + $this->full_decomposition_maps = $temp; + } + + // Same as above, but using only canonical decompositions. 
+ $changed = true; + $iteration = 0; + while ($changed) + { + $temp = array(); + foreach ($this->funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed) + { + if ($iteration === 0 && !in_array($composed, $this->derived_normalization_props['Full_Composition_Exclusion'])) + { + $this->funcs['utf8_compose_maps']['data'][$decomposed] = $composed; + } + + $parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed; + + foreach ($parts as $partnum => $hex) + { + if (isset($this->funcs['utf8_normalize_d_maps']['data'][$hex])) + { + $parts[$partnum] = $this->funcs['utf8_normalize_d_maps']['data'][$hex]; + } + } + + $decomposed = implode(' ', $parts); + unset($parts); + + $temp[$composed] = $decomposed; + } + + $changed = $this->funcs['utf8_normalize_d_maps']['data'] !== $temp; + + $this->funcs['utf8_normalize_d_maps']['data'] = $temp; + $iteration++; + } + + // Avoid bloat. + $this->funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($this->full_decomposition_maps, $this->funcs['utf8_normalize_d_maps']['data']); + } + + /** + * Builds regular expression classes for extended Unicode properties. + */ + private function build_regex_properties() + { + foreach ($this->funcs['utf8_regex_properties']['propfiles'] as $filename) + { + $local_file = $this->fetch_unicode_file($filename, self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + if (in_array($fields[1], $this->funcs['utf8_regex_properties']['props'])) + { + if (!isset($this->funcs['utf8_regex_properties']['data'][$fields[1]])) + { + $this->funcs['utf8_regex_properties']['data'][$fields[1]] = array(); + } + + $this->funcs['utf8_regex_properties']['data'][$fields[1]][] = '\\x{' . 
str_replace('..', '}-\\x{', $fields[0]) . '}'; + } + + // We also track 'Default_Ignorable_Code_Point' property in a separate array. + if ($fields[1] !== 'Default_Ignorable_Code_Point') + { + continue; + } + + if (strpos($fields[0], '..') === false) + { + $this->funcs['utf8_default_ignorables']['data'][] = '&#x' . $fields[0] . ';'; + } + else + { + list($start, $end) = explode('..', $fields[0]); + + $ord_s = hexdec($start); + $ord_e = hexdec($end); + + $ord = $ord_s; + while ($ord <= $ord_e) + { + $this->funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; + } + } + } + } + + ksort($this->funcs['utf8_regex_properties']['data']); + } + + /** + * Builds regular expression classes for filtering variation selectors. + */ + private function build_regex_variation_selectors() + { + $files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt'); + + foreach ($files as $filename) + { + $local_file = $this->fetch_unicode_file($filename, self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + list($base_char, $variation_selector) = explode(' ', $fields[0]); + + $this->funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . 
'}'][] = hexdec($base_char); + } + } + + foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords) + { + $class_string = ''; + + $current_range = array('start' => null, 'end' => null); + foreach ($ords as $ord) + { + if (!isset($current_range['start'])) + { + $current_range['start'] = $ord; + } + + if (!isset($current_range['end']) || $ord == $current_range['end'] + 1) + { + $current_range['end'] = $ord; + continue; + } + else + { + $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; + + if ($current_range['start'] != $current_range['end']) + { + $class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; + } + + $current_range = array('start' => $ord, 'end' => $ord); + } + } + + if (isset($current_range['start'])) + { + $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; + + if ($current_range['start'] != $current_range['end']) + { + $class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; + } + } + + // As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters. 
+ if (($identical = array_search($class_string, $this->funcs['utf8_regex_variation_selectors']['data'])) !== false) + { + unset( + $this->funcs['utf8_regex_variation_selectors']['data'][$identical], + $this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] + ); + + $compound_selector = array($identical, $variation_selector); + sort($compound_selector); + + $variation_selector = implode('', $compound_selector); + } + + $this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string; + } + + foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $class_string) + { + $this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = preg_split('/(?<=})(?=\\\x{)/', $class_string); + } + + krsort($this->funcs['utf8_regex_variation_selectors']['data']); + } + + /** + * Helper function for build_regex_joining_type and build_regex_indic. + */ + private function build_script_stats() + { + $local_file = $this->fetch_unicode_file('PropertyValueAliases.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + if ($fields[0] !== 'sc') + { + continue; + } + + $this->script_aliases[$fields[1]] = $fields[2]; + } + + $local_file = $this->fetch_unicode_file('Scripts.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + if (in_array($fields[1], array('Common', 'Inherited'))) + { + continue; + } + + if (strpos($fields[0], '..') === 
false) + { + $this->char_data['&#x' . $fields[0] . ';']['scripts'][] = $fields[1]; + } + else + { + list($start, $end) = explode('..', $fields[0]); + + $ord_s = hexdec($start); + $ord_e = hexdec($end); + + $ord = $ord_s; + while ($ord <= $ord_e) + { + $this->char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $fields[1]; + } + } + } + + $local_file = $this->fetch_unicode_file('ScriptExtensions.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + $char_scripts = array(); + foreach (explode(' ', $fields[1]) as $alias) + { + if (!in_array($this->script_aliases[$alias], array('Common', 'Inherited'))) + { + $char_scripts[] = $this->script_aliases[$alias]; + } + } + + if (strpos($fields[0], '..') === false) + { + foreach ($char_scripts as $char_script) + { + $this->char_data['&#x' . $fields[0] . ';']['scripts'][] = $char_script; + } + } + else + { + list($start, $end) = explode('..', $fields[0]); + + $ord_s = hexdec($start); + $ord_e = hexdec($end); + + $ord = $ord_s; + while ($ord <= $ord_e) + { + foreach ($char_scripts as $char_script) + { + $this->char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $char_script; + } + } + } + } + + $local_file = $this->fetch_unicode_file('DerivedAge.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + $fields[1] = (float) $fields[1]; + + if (strpos($fields[0], '..') === false) + { + $entity = '&#x' . 
$fields[0] . ';'; + + if (empty($this->char_data[$entity]['scripts'])) + { + continue; + } + + foreach ($this->char_data[$entity]['scripts'] as $char_script) + { + if (!isset($this->script_stats[$char_script])) + { + $this->script_stats[$char_script]['age'] = (float) $fields[1]; + $this->script_stats[$char_script]['count'] = 1; + } + else + { + $this->script_stats[$char_script]['age'] = min((float) $fields[1], $this->script_stats[$char_script]['age']); + $this->script_stats[$char_script]['count']++; + } + } + } + else + { + list($start, $end) = explode('..', $fields[0]); + + $ord_s = hexdec($start); + $ord_e = hexdec($end); + + $ord = $ord_s; + while ($ord <= $ord_e) + { + $entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; + + if (empty($this->char_data[$entity]['scripts'])) + { + continue; + } + + foreach ($this->char_data[$entity]['scripts'] as $char_script) + { + if (!isset($this->script_stats[$char_script])) + { + $this->script_stats[$char_script]['age'] = $fields[1]; + $this->script_stats[$char_script]['count'] = 1; + } + else + { + $this->script_stats[$char_script]['age'] = min($fields[1], $this->script_stats[$char_script]['age']); + $this->script_stats[$char_script]['count']++; + } + } + } + } + } + } + + /** + * Builds regex classes for join control tests in utf8_sanitize_invisibles. + * Specifically, for cursive scripts like Arabic. 
+ */ + private function build_regex_joining_type() + { + $local_file = $this->fetch_unicode_file('extracted/DerivedJoiningType.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + switch ($fields[1]) + { + case 'C': + $joining_type = 'Join_Causing'; + break; + + case 'D': + $joining_type = 'Dual_Joining'; + break; + + case 'R': + $joining_type = 'Right_Joining'; + break; + + case 'L': + $joining_type = 'Left_Joining'; + break; + + case 'T': + $joining_type = 'Transparent'; + break; + + default: + $joining_type = null; + break; + } + + if (!isset($joining_type)) + { + continue; + } + + $entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';'; + + if (empty($this->char_data[$entity]['scripts'])) + { + continue; + } + + foreach ($this->char_data[$entity]['scripts'] as $char_script) + { + if (!isset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'])) + { + $this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $this->script_stats[$char_script]; + } + + if (!isset($this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type])) + { + $this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array(); + } + + $this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; + } + } + // This sort works decently well to ensure widely used scripts are ranked before rare scripts. 
+ uasort($this->funcs['utf8_regex_joining_type']['data'], function ($a, $b) + { + if ($a['stats']['age'] == $b['stats']['age']) + { + return $b['stats']['count'] - $a['stats']['count']; + } + else + { + return $a['stats']['age'] - $b['stats']['age']; + } + }); + foreach ($this->funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types) + { + unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']); + + // If the only joining type in this script is transparent, we don't care about it. + if (array_keys($joining_types) === array('Transparent')) + { + unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]); + continue; + } + + foreach ($joining_types as $joining_type => $value) + { + sort($value); + } + } + } + + /** + * Builds regex classes for join control tests in utf8_sanitize_invisibles. + * Specifically, for Indic scripts like Devanagari. + */ + private function build_regex_indic() + { + $local_file = $this->fetch_unicode_file('IndicSyllabicCategory.txt', self::DATA_URL_UCD); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = trim($value); + } + + $insc = $fields[1]; + + if (!in_array($insc, array('Virama', 'Vowel_Dependent'))) + { + continue; + } + + $char_scripts = $this->char_data['&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . 
';']['scripts']; + + if (empty($char_scripts)) + { + continue; + } + + foreach ($char_scripts as $char_script) + { + if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats'])) + { + $this->funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $this->script_stats[$char_script]; + } + + if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script][$insc])) + { + $this->funcs['utf8_regex_indic']['data'][$char_script][$insc] = array(); + } + + $this->funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; + } + } + // Again, sort commonly used scripts before rare scripts. + uasort($this->funcs['utf8_regex_indic']['data'], function ($a, $b) + { + if ($a['stats']['age'] == $b['stats']['age']) + { + return $b['stats']['count'] - $a['stats']['count']; + } + else + { + return $a['stats']['age'] - $b['stats']['age']; + } + }); + // We only want scripts with viramas. + foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs) + { + unset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats'], $inscs['stats']); + + if (!isset($inscs['Virama'])) + { + unset($this->funcs['utf8_regex_indic']['data'][$char_script]); + continue; + } + } + // Now add some more classes that we need for each script. 
+ foreach ($this->char_data as $entity => $info) + { + if (empty($info['scripts'])) + { + continue; + } + + $ord = hexdec(trim($entity, '&#x;')); + + foreach ($info['scripts'] as $char_script) + { + if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script])) + { + continue; + } + + $this->funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord; + + if (empty($info['General_Category'])) + { + continue; + } + elseif ($info['General_Category'] == 'Mn') + { + $this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord; + + if (!empty($this->funcs['utf8_combining_classes']['data'][$entity])) + { + $this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord; + } + } + elseif (substr($info['General_Category'], 0, 1) == 'L') + { + $this->funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord; + } + } + } + foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs) + { + foreach ($inscs as $insc => $value) + { + sort($value); + + if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark'))) + { + continue; + } + + $class_string = ''; + + $current_range = array('start' => null, 'end' => null); + foreach ($value as $ord) + { + if (!isset($current_range['start'])) + { + $current_range['start'] = $ord; + } + + if (!isset($current_range['end']) || $ord == $current_range['end'] + 1) + { + $current_range['end'] = $ord; + continue; + } + else + { + $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; + + if ($current_range['start'] != $current_range['end']) + { + $class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; + } + + $current_range = array('start' => $ord, 'end' => $ord); + } + } + + if (isset($current_range['start'])) + { + $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . 
'}'; + + if ($current_range['start'] != $current_range['end']) + { + $class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; + } + } + + $this->funcs['utf8_regex_indic']['data'][$char_script][$insc] = preg_split('/(?<=})(?=\\\x{)/', $class_string); + } + + ksort($this->funcs['utf8_regex_indic']['data'][$char_script]); + } + } + + /** + * Builds maps and regex classes for IDNA purposes. + */ + private function build_idna() + { + $local_file = $this->fetch_unicode_file('IdnaMappingTable.txt', self::DATA_URL_IDNA); + + if (empty($local_file)) + return false; + + foreach (file($local_file) as $line) + { + $line = substr($line, 0, strcspn($line, '#')); + + if (strpos($line, ';') === false) + { + continue; + } + + $fields = explode(';', $line); + + foreach ($fields as $key => $value) + { + $fields[$key] = preg_replace('/\b(0(?!\b))+/', '', trim($value)); + } + + if (strpos($fields[0], '..') === false) + { + $entities = array('&#x' . $fields[0] . ';'); + } + else + { + $entities = array(); + + list($start, $end) = explode('..', $fields[0]); + + $ord_s = hexdec($start); + $ord_e = hexdec($end); + + $ord = $ord_s; + while ($ord <= $ord_e) + { + $entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; + } + } + + if ($fields[1] === 'mapped') + { + foreach ($entities as $entity) + $this->funcs['idna_maps']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';'; + } + elseif ($fields[1] === 'deviation') + { + foreach ($entities as $entity) + $this->funcs['idna_maps_deviation']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';'; + + $this->funcs['idna_regex']['data']['deviation'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; + } + elseif ($fields[1] === 'ignored') + { + $this->funcs['idna_regex']['data']['ignored'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . 
'}'; + } + elseif ($fields[1] === 'disallowed') + { + if (in_array('�', $entities)) + continue; + + $this->funcs['idna_regex']['data']['disallowed'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; + } + elseif ($fields[1] === 'disallowed_STD3_mapped') + { + foreach ($entities as $entity) + $this->funcs['idna_maps_not_std3']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';'; + + $this->funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; + } + elseif ($fields[1] === 'disallowed_STD3_valid') + { + $this->funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; + } + } + } +} + +?> \ No newline at end of file diff --git a/other/update_unicode_data.php b/other/update_unicode_data.php index 4134d4cfb4..ceaf5a29cc 100644 --- a/other/update_unicode_data.php +++ b/other/update_unicode_data.php @@ -5,10 +5,20 @@ * any SMF distribution packages. * * This file exists to make it easy for developers to update the - * Unicode data in Subs-Charset.php whenever a new version of the + * Unicode data in $sourcedir/Unicode whenever a new version of the * Unicode Character Database is released. Just run this file from the * command line in order to perform the update. * + * Note: + * + * 1. Any updates to the Unicode data files SHOULD be included in the + * install and large upgrade packages. + * + * 2. Any updates to the Unicode data files SHOULD NOT be included in + * the patch packages. The Update_Unicode background task will take + * care of that on existing forums. 
+ * + * * Simple Machines Forum (SMF) * * @package SMF @@ -16,1258 +26,43 @@ * @copyright 2022 Simple Machines and individual contributors * @license https://www.simplemachines.org/about/smf/license.php BSD * - * @version 2.1.2 + * @version 2.1.3 */ -$unicode_data_url = 'https://unicode.org/Public/UCD/latest/ucd'; -$idna_data_url = 'https://www.unicode.org/Public/idna/latest'; - -$sourcedir = realpath(dirname(__DIR__) . '/Sources'); -$unicodedir = $sourcedir . '/Unicode'; - -$full_decomposition_maps = array(); -$funcs = array( - 'utf8_normalize_d_maps' => array( - 'file' => 'DecompositionCanonical.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_normalize_kd_maps' => array( - 'file' => 'DecompositionCompatibility.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_compose_maps' => array( - 'file' => 'Composition.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_combining_classes' => array( - 'file' => 'CombiningClasses.php', - 'key_type' => 'hexchar', - 'val_type' => 'int', - 'data' => array(), - ), - 'utf8_strtolower_simple_maps' => array( - 'file' => 'CaseLower.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_strtolower_maps' => array( - 'file' => 'CaseLower.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_strtoupper_simple_maps' => array( - 'file' => 'CaseUpper.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_strtoupper_maps' => array( - 'file' => 'CaseUpper.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_titlecase_simple_maps' => array( - 'file' => 'CaseTitle.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_titlecase_maps' => array( - 'file' => 'CaseTitle.php', - 'key_type' => 'hexchar', - 'val_type' => 
'hexchar', - 'data' => array(), - ), - 'utf8_casefold_simple_maps' => array( - 'file' => 'CaseFold.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_casefold_maps' => array( - 'file' => 'CaseFold.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_default_ignorables' => array( - 'file' => 'DefaultIgnorables.php', - 'key_type' => 'int', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'utf8_regex_properties' => array( - 'file' => 'RegularExpressions.php', - 'key_type' => 'string', - 'val_type' => 'string', - 'propfiles' => array( - 'DerivedCoreProperties.txt', - 'PropList.txt', - 'emoji/emoji-data.txt', - 'extracted/DerivedGeneralCategory.txt', - ), - 'props' => array( - 'Bidi_Control', - 'Case_Ignorable', - 'Cn', - 'Default_Ignorable_Code_Point', - 'Emoji', - 'Emoji_Modifier', - 'Ideographic', - 'Join_Control', - 'Regional_Indicator', - 'Variation_Selector', - ), - 'data' => array(), - ), - 'utf8_regex_variation_selectors' => array( - 'file' => 'RegularExpressions.php', - 'key_type' => 'string', - 'val_type' => 'string', - 'data' => array(), - ), - 'utf8_regex_joining_type' => array( - 'file' => 'RegularExpressions.php', - 'key_type' => 'string', - 'val_type' => 'string', - 'data' => array(), - ), - 'utf8_regex_indic' => array( - 'file' => 'RegularExpressions.php', - 'key_type' => 'string', - 'val_type' => 'string', - 'data' => array(), - ), - 'idna_maps' => array( - 'file' => 'Idna.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'idna_maps_deviation' => array( - 'file' => 'Idna.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'idna_maps_not_std3' => array( - 'file' => 'Idna.php', - 'key_type' => 'hexchar', - 'val_type' => 'hexchar', - 'data' => array(), - ), - 'idna_regex' => array( - 'file' => 'Idna.php', - 'key_type' => 'string', - 'val_type' => 'string', - 'data' => array(), - ), -); - 
-foreach ($funcs as $func_name => $func_info) -{ - if (!is_file($unicodedir . '/' . $func_info['file']) || !is_writable($unicodedir . '/' . $func_info['file'])) - { - die($unicodedir . '/' . $func_info['file'] . ' not found or not writable.'); - } -} - -@ini_set('memory_limit', '256M'); - -/********************************************* - * Part 1: Normalization, case folding, etc. * - *********************************************/ - -// We need some of these for further analysis below. -$derived_normalization_props = array(); -$unicode_version = ''; -foreach (file($unicode_data_url . '/DerivedNormalizationProps.txt') as $line) -{ - if ($unicode_version === '' && preg_match('/(\d+\.\d+\.\d+(?:\.\d+)?)\.txt$/', $line, $matches)) - { - $unicode_version = implode('.', array_pad(explode('.', $matches[1]), 4, '0')); - - $file_contents = file_get_contents($unicodedir . '/Metadata.php'); - $file_contents = preg_replace( - "~\bdefine\('SMF_UNICODE_VERSION', '[^']+'\)~", - "define('SMF_UNICODE_VERSION', '" . $unicode_version . "')", - $file_contents - ); - file_put_contents($unicodedir . '/Metadata.php', $file_contents); - } - - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - if (!isset($derived_normalization_props[$fields[1]])) - { - $derived_normalization_props[$fields[1]] = array(); - } - - if (strpos($fields[0], '..') === false) - { - $entities = array('&#x' . $fields[0] . ';'); - } - else - { - $entities = array(); - - list($start, $end) = explode('..', $fields[0]); - - $ord_s = hexdec($start); - $ord_e = hexdec($end); - - $ord = $ord_s; - while ($ord <= $ord_e) - { - $entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . 
';'; - } - } - - $value = ''; - if (!isset($fields[2])) - { - $value = 'SAME'; - } - elseif (in_array($fields[1], array('FC_NFKC', 'NFKC_CF'))) - { - $value = trim($fields[2]) !== '' ? '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';' : ''; - } - else - { - $value = $fields[2]; - } - - foreach ($entities as $entity) - { - $derived_normalization_props[$fields[1]][$entity] = $value === 'SAME' ? $entity : $value; - } -} - -// Go through all the characters in the Unicode database. -$char_data = array(); -foreach (file($unicode_data_url . '/UnicodeData.txt') as $line) -{ - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - if (!empty($fields[3])) - { - $funcs['utf8_combining_classes']['data']['&#x' . $fields[0] . ';'] = $fields[3]; - } - - // Uppercase maps. - if ($fields[12] !== '') - { - $funcs['utf8_strtoupper_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[12] . ';'; - } - - // Lowercase maps. - if ($fields[13] !== '') - { - $funcs['utf8_strtolower_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[13] . ';'; - } - - // Titlecase maps, where different from uppercase maps. - if ($fields[14] !== '' && $fields[14] !== $fields[12]) - { - $funcs['utf8_titlecase_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[14] . ';'; - } - - // Remember this character's general category for later. - $char_data['&#x' . $fields[0] . ';']['General_Category'] = $fields[2]; - - if ($fields[5] === '') - { - continue; - } - - // All canonical decompositions AND all compatibility decompositions. - $full_decomposition_maps['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';'; - - // Just the canonical decompositions. - if (strpos($fields[5], '<') === false) - { - $funcs['utf8_normalize_d_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . 
';'; - } -} - -// Full case conversion maps -$funcs['utf8_strtoupper_maps']['data'] = $funcs['utf8_strtoupper_simple_maps']['data']; -$funcs['utf8_strtolower_maps']['data'] = $funcs['utf8_strtolower_simple_maps']['data']; -$funcs['utf8_titlecase_maps']['data'] = $funcs['utf8_titlecase_simple_maps']['data']; -foreach (file($unicode_data_url . '/SpecialCasing.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - // Unconditional mappings. - // Note: conditional mappings need to be handled by more complex code. - if (empty($fields[4])) - { - $funcs['utf8_strtolower_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[1])) . ';'; - - $funcs['utf8_strtoupper_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[3])) . ';'; - - // Titlecase only where different from uppercase. - if ($fields[3] !== $fields[2]) - { - $funcs['utf8_titlecase_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; - } - } -} -ksort($funcs['utf8_strtolower_maps']['data']); -ksort($funcs['utf8_strtoupper_maps']['data']); -ksort($funcs['utf8_titlecase_maps']['data']); - -foreach (file($unicode_data_url . '/CaseFolding.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - // Full casefolding. - if (in_array($fields[1], array('C', 'F'))) - { - $funcs['utf8_casefold_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; - } - - // Simple casefolding. - if (in_array($fields[1], array('C', 'S'))) - $funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . 
';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; -} - -// Recursively iterate until we reach the final decomposition forms. -// This is necessary because some characters decompose to other characters that -// themselves decompose further. -$changed = true; -while ($changed) -{ - $temp = array(); - foreach ($full_decomposition_maps as $composed => $decomposed) - { - $parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed; - - foreach ($parts as $partnum => $hex) - { - if (isset($full_decomposition_maps[$hex])) - { - $parts[$partnum] = $full_decomposition_maps[$hex]; - } - } - - $decomposed = implode(' ', $parts); - unset($parts); - - $temp[$composed] = $decomposed; - } - - $changed = $full_decomposition_maps !== $temp; - - $full_decomposition_maps = $temp; -} - -// Same as above, but using only canonical decompositions. -$changed = true; -$iteration = 0; -while ($changed) -{ - $temp = array(); - foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed) - { - if ($iteration === 0 && !in_array($composed, $derived_normalization_props['Full_Composition_Exclusion'])) - { - $funcs['utf8_compose_maps']['data'][$decomposed] = $composed; - } - - $parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed; - - foreach ($parts as $partnum => $hex) - { - if (isset($funcs['utf8_normalize_d_maps']['data'][$hex])) - { - $parts[$partnum] = $funcs['utf8_normalize_d_maps']['data'][$hex]; - } - } - - $decomposed = implode(' ', $parts); - unset($parts); - - $temp[$composed] = $decomposed; - } - - $changed = $funcs['utf8_normalize_d_maps']['data'] !== $temp; - - $funcs['utf8_normalize_d_maps']['data'] = $temp; - $iteration++; -} - -$funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($full_decomposition_maps, $funcs['utf8_normalize_d_maps']['data']); -unset($full_decomposition_maps, $derived_normalization_props); - -// Now update the files with the data we've got so far. 
-foreach ($funcs as $func_name => $func_info) -{ - if (empty($func_info['data'])) - { - continue; - } - - export_func_to_file($func_name, $func_info); - - // Free up some memory. - if ($func_name != 'utf8_combining_classes') - { - unset($funcs[$func_name]); - } -} - -/*********************************** - * Part 2: Regular expression data * - ***********************************/ - -// Build regular expression classes for extended Unicode properties. -foreach ($funcs['utf8_regex_properties']['propfiles'] as $filename) -{ - foreach (file($unicode_data_url . '/' . $filename) as $line) - { - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - if (in_array($fields[1], $funcs['utf8_regex_properties']['props'])) - { - if (!isset($funcs['utf8_regex_properties']['data'][$fields[1]])) - { - $funcs['utf8_regex_properties']['data'][$fields[1]] = array(); - } - - $funcs['utf8_regex_properties']['data'][$fields[1]][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } - - // We also track 'Default_Ignorable_Code_Point' property in a separate array. - if ($fields[1] !== 'Default_Ignorable_Code_Point') - { - continue; - } - - if (strpos($fields[0], '..') === false) - { - $funcs['utf8_default_ignorables']['data'][] = '&#x' . $fields[0] . ';'; - } - else - { - list($start, $end) = explode('..', $fields[0]); - - $ord_s = hexdec($start); - $ord_e = hexdec($end); - - $ord = $ord_s; - while ($ord <= $ord_e) - { - $funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; - } - } - } -} -ksort($funcs['utf8_regex_properties']['data']); - -// Build regular expression classes for filtering variation selectors. -$files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt'); -foreach ($files as $filename) -{ - foreach (file($unicode_data_url . '/' . 
$filename) as $line) - { - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - list($base_char, $variation_selector) = explode(' ', $fields[0]); - - $funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char); - } -} -foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords) -{ - $class_string = ''; - - $current_range = array('start' => null, 'end' => null); - foreach ($ords as $ord) - { - if (!isset($current_range['start'])) - { - $current_range['start'] = $ord; - } - - if (!isset($current_range['end']) || $ord == $current_range['end'] + 1) - { - $current_range['end'] = $ord; - continue; - } - else - { - $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; - - if ($current_range['start'] != $current_range['end']) - { - $class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; - } - - $current_range = array('start' => $ord, 'end' => $ord); - } - } - - if (isset($current_range['start'])) - { - $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; - - if ($current_range['start'] != $current_range['end']) - { - $class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; - } - } - - // As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters. 
- if (($identical = array_search($class_string, $funcs['utf8_regex_variation_selectors']['data'])) !== false) - { - unset( - $funcs['utf8_regex_variation_selectors']['data'][$identical], - $funcs['utf8_regex_variation_selectors']['data'][$variation_selector] - ); - - $compound_selector = array($identical, $variation_selector); - sort($compound_selector); - - $variation_selector = implode('', $compound_selector); - } - - $funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string; -} -foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $class_string) -{ - $funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = preg_split('/(?<=})(?=\\\x{)/', $class_string); -} -krsort($funcs['utf8_regex_variation_selectors']['data']); - -// The regex classes for join control tests require info about language scripts. -$script_stats = array(); -$script_aliases = array(); -foreach (file($unicode_data_url . '/PropertyValueAliases.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - if ($fields[0] !== 'sc') - { - continue; - } - - $script_aliases[$fields[1]] = $fields[2]; -} -foreach (file($unicode_data_url . '/Scripts.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - if (in_array($fields[1], array('Common', 'Inherited'))) - { - continue; - } - - if (strpos($fields[0], '..') === false) - { - $char_data['&#x' . $fields[0] . ';']['scripts'][] = $fields[1]; - } - else - { - list($start, $end) = explode('..', $fields[0]); +// 1. Set a couple of variables that we'll need. +$boarddir = realpath(dirname(__DIR__)); +$sourcedir = $boarddir . 
'/Sources'; - $ord_s = hexdec($start); - $ord_e = hexdec($end); +// 2. Borrow a bit of stuff from cron.php. +$cron_php_start = file_get_contents($boarddir . '/cron.php', false, null, 0, 4096); - $ord = $ord_s; - while ($ord <= $ord_e) - { - $char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $fields[1]; - } - } -} -foreach (file($unicode_data_url . '/ScriptExtensions.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - $char_scripts = array(); - foreach (explode(' ', $fields[1]) as $alias) - { - if (!in_array($script_aliases[$alias], array('Common', 'Inherited'))) - { - $char_scripts[] = $script_aliases[$alias]; - } - } - - if (strpos($fields[0], '..') === false) - { - foreach ($char_scripts as $char_script) - { - $char_data['&#x' . $fields[0] . ';']['scripts'][] = $char_script; - } - } - else - { - list($start, $end) = explode('..', $fields[0]); - - $ord_s = hexdec($start); - $ord_e = hexdec($end); - - $ord = $ord_s; - while ($ord <= $ord_e) - { - foreach ($char_scripts as $char_script) - { - $char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $char_script; - } - } - } -} -foreach (file($unicode_data_url . '/DerivedAge.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - $fields[1] = (float) $fields[1]; - - if (strpos($fields[0], '..') === false) - { - $entity = '&#x' . $fields[0] . 
';'; - - if (empty($char_data[$entity]['scripts'])) - { - continue; - } - - foreach ($char_data[$entity]['scripts'] as $char_script) - { - if (!isset($script_stats[$char_script])) - { - $script_stats[$char_script]['age'] = (float) $fields[1]; - $script_stats[$char_script]['count'] = 1; - } - else - { - $script_stats[$char_script]['age'] = min((float) $fields[1], $script_stats[$char_script]['age']); - $script_stats[$char_script]['count']++; - } - } - } - else - { - list($start, $end) = explode('..', $fields[0]); - - $ord_s = hexdec($start); - $ord_e = hexdec($end); - - $ord = $ord_s; - while ($ord <= $ord_e) - { - $entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; - - if (empty($char_data[$entity]['scripts'])) - { - continue; - } - - foreach ($char_data[$entity]['scripts'] as $char_script) - { - if (!isset($script_stats[$char_script])) - { - $script_stats[$char_script]['age'] = $fields[1]; - $script_stats[$char_script]['count'] = 1; - } - else - { - $script_stats[$char_script]['age'] = min($fields[1], $script_stats[$char_script]['age']); - $script_stats[$char_script]['count']++; - } - } - } - } -} - -// Build regex classes for join control tests in utf8_sanitize_invisibles: -// 1. Cursive scripts like Arabic. -foreach (file($unicode_data_url . '/extracted/DerivedJoiningType.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - switch ($fields[1]) - { - case 'C': - $joining_type = 'Join_Causing'; - break; - - case 'D': - $joining_type = 'Dual_Joining'; - break; - - case 'R': - $joining_type = 'Right_Joining'; - break; - - case 'L': - $joining_type = 'Left_Joining'; - break; - - case 'T': - $joining_type = 'Transparent'; - break; - - default: - $joining_type = null; - break; - } - - if (!isset($joining_type)) - { - continue; - } - - $entity = '&#x' . 
substr($fields[0], 0, strcspn($fields[0], '.')) . ';'; - - if (empty($char_data[$entity]['scripts'])) - { - continue; - } - - foreach ($char_data[$entity]['scripts'] as $char_script) - { - if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats'])) - { - $funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $script_stats[$char_script]; - } - - if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type])) - { - $funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array(); - } - - $funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } -} -// This sort works decently well to ensure widely used scripts are ranked before rare scripts. -uasort($funcs['utf8_regex_joining_type']['data'], function ($a, $b) -{ - if ($a['stats']['age'] == $b['stats']['age']) - { - return $b['stats']['count'] - $a['stats']['count']; - } - else - { - return $a['stats']['age'] - $b['stats']['age']; - } -}); -foreach ($funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types) +foreach (array('SMF', 'SMF_VERSION', 'SMF_SOFTWARE_YEAR') as $const) { - unset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']); + preg_match("/define\('$const', '([^)]+)'\);/", $cron_php_start, $matches); - // If the only joining type in this script is transparent, we don't care about it. - if (array_keys($joining_types) === array('Transparent')) - { - unset($funcs['utf8_regex_joining_type']['data'][$char_script]); - continue; - } + if (empty($matches[1])) + die("Could not find value for $const in cron.php"); - foreach ($joining_types as $joining_type => $value) - { - sort($value); - } + define($const, $matches[1]); } -// 2. Indic scripts like Devanagari. -foreach (file($unicode_data_url . 
'/IndicSyllabicCategory.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = trim($value); - } - - $insc = $fields[1]; - - if (!in_array($insc, array('Virama', 'Vowel_Dependent'))) - { - continue; - } +define('SMF_USER_AGENT', 'SMF'); +define('MAX_CLAIM_THRESHOLD', 300); +define('TIME_START', microtime(true)); - $char_scripts = $char_data['&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';']['scripts']; - - if (empty($char_scripts)) - { - continue; - } - - foreach ($char_scripts as $char_script) - { - if (!isset($funcs['utf8_regex_indic']['data'][$char_script]['stats'])) - { - $funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $script_stats[$char_script]; - } - - if (!isset($funcs['utf8_regex_indic']['data'][$char_script][$insc])) - { - $funcs['utf8_regex_indic']['data'][$char_script][$insc] = array(); - } - - $funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } -} -// Again, sort commonly used scripts before rare scripts. -uasort($funcs['utf8_regex_indic']['data'], function ($a, $b) -{ - if ($a['stats']['age'] == $b['stats']['age']) - { - return $b['stats']['count'] - $a['stats']['count']; - } - else - { - return $a['stats']['age'] - $b['stats']['age']; - } -}); -// We only want scripts with viramas. -foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs) -{ - unset($funcs['utf8_regex_indic']['data'][$char_script]['stats'], $inscs['stats']); - - if (!isset($inscs['Virama'])) - { - unset($funcs['utf8_regex_indic']['data'][$char_script]); - continue; - } -} -// Now add some more classes that we need for each script. 
-foreach ($char_data as $entity => $info) +abstract class SMF_BackgroundTask { - if (empty($info['scripts'])) - { - continue; - } - - $ord = hexdec(trim($entity, '&#x;')); - - foreach ($info['scripts'] as $char_script) - { - if (!isset($funcs['utf8_regex_indic']['data'][$char_script])) - { - continue; - } - - $funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord; - - if (empty($info['General_Category'])) - { - continue; - } - elseif ($info['General_Category'] == 'Mn') - { - $funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord; - - if (!empty($funcs['utf8_combining_classes']['data'][$entity])) - { - $funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord; - } - } - elseif (substr($info['General_Category'], 0, 1) == 'L') - { - $funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord; - } - } + abstract public function execute(); } -foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs) -{ - foreach ($inscs as $insc => $value) - { - sort($value); - - if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark'))) - { - continue; - } - $class_string = ''; +// This should never be needed, but set it for completeness. +$smcFunc['db_insert'] = function($method, $table, $columns, $data, $keys, $returnmode = 0, $connection = null) {}; - $current_range = array('start' => null, 'end' => null); - foreach ($value as $ord) - { - if (!isset($current_range['start'])) - { - $current_range['start'] = $ord; - } +// 3. Do the job. +require_once($sourcedir . '/Subs.php'); +require_once($sourcedir . '/tasks/UpdateUnicode.php'); - if (!isset($current_range['end']) || $ord == $current_range['end'] + 1) - { - $current_range['end'] = $ord; - continue; - } - else - { - $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; - - if ($current_range['start'] != $current_range['end']) - { - $class_string .= '-\\x{' . 
strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; - } - - $current_range = array('start' => $ord, 'end' => $ord); - } - } - - if (isset($current_range['start'])) - { - $class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; - - if ($current_range['start'] != $current_range['end']) - { - $class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; - } - } - - $funcs['utf8_regex_indic']['data'][$char_script][$insc] = preg_split('/(?<=})(?=\\\x{)/', $class_string); - } - - ksort($funcs['utf8_regex_indic']['data'][$char_script]); -} -unset($funcs['utf8_combining_classes']); - -foreach ($funcs as $func_name => $func_info) -{ - if (empty($func_info['data'])) - { - continue; - } - - export_func_to_file($func_name, $func_info); -} - -/********************************* - * Part 3: IDNA maps and regexes * - *********************************/ - -foreach (file($idna_data_url . '/IdnaMappingTable.txt') as $line) -{ - $line = substr($line, 0, strcspn($line, '#')); - - if (strpos($line, ';') === false) - { - continue; - } - - $fields = explode(';', $line); - - foreach ($fields as $key => $value) - { - $fields[$key] = preg_replace('/\b(0(?!\b))+/', '', trim($value)); - } - - if (strpos($fields[0], '..') === false) - { - $entities = array('&#x' . $fields[0] . ';'); - } - else - { - $entities = array(); - - list($start, $end) = explode('..', $fields[0]); - - $ord_s = hexdec($start); - $ord_e = hexdec($end); - - $ord = $ord_s; - while ($ord <= $ord_e) - { - $entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; - } - } - - if ($fields[1] === 'mapped') - { - foreach ($entities as $entity) - $funcs['idna_maps']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';'; - } - elseif ($fields[1] === 'deviation') - { - foreach ($entities as $entity) - $funcs['idna_maps_deviation']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . 
str_replace(' ', '; &#x', $fields[2]) . ';'; - - $funcs['idna_regex']['data']['deviation'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } - elseif ($fields[1] === 'ignored') - { - $funcs['idna_regex']['data']['ignored'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } - elseif ($fields[1] === 'disallowed') - { - if (in_array('�', $entities)) - continue; - - $funcs['idna_regex']['data']['disallowed'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } - elseif ($fields[1] === 'disallowed_STD3_mapped') - { - foreach ($entities as $entity) - $funcs['idna_maps_not_std3']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';'; - - $funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } - elseif ($fields[1] === 'disallowed_STD3_valid') - { - $funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } -} - -foreach ($funcs as $func_name => $func_info) -{ - if (empty($func_info['data'])) - { - continue; - } - - export_func_to_file($func_name, $func_info); -} - -/** - * Updates a Unicode data function in its designated file. - * - * @param string $func_name The name of the function. - * @param array $func_info Info about the function, including its data. - */ -function export_func_to_file($func_name, $func_info) -{ - global $unicodedir; - - $file_contents = file_get_contents($unicodedir . '/' . $func_info['file']); - - $func_text = 'function ' . $func_name . '()' . "\n" . '{'; - - $func_regex = '/' . preg_quote($func_text, '/') . '.+?\n}/s'; - - $func_text .= "\n\t" . 'return array(' . "\n"; - - build_func_array($func_text, $func_info['data'], $func_info['key_type'], $func_info['val_type']); - - $func_text .= "\t" . ');' . "\n" . '}'; - - $file_contents = preg_replace($func_regex, $func_text, $file_contents); - - file_put_contents($unicodedir . '/' . 
$func_info['file'], $file_contents); -} - -/** - * Helper for export_func_to_file(). Builds the function's data array. - * - * @param string &$func_text The raw string that contains function code. - * @param array $data Data to format as an array. - * @param string $key_type How to format the array keys. - * @param string $val_type How to format the array values. - */ -function build_func_array(&$func_text, $data, $key_type, $val_type) -{ - static $indent = 2; - - foreach ($data as $key => $value) - { - $func_text .= str_repeat("\t", $indent); - - if ($key_type == 'hexchar') - { - $func_text .= '"'; - - $key = mb_decode_numericentity(str_replace(' ', '', $key), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8'); - - foreach (unpack('C*', $key) as $byte_value) - { - $func_text .= '\\x' . strtoupper(dechex($byte_value)); - } - - $func_text .= '" => '; - } - elseif ($key_type == 'string' && !is_int($key)) - { - $func_text .= var_export($key, true) . ' => '; - } - - if (is_array($value)) - { - if ($val_type == 'string' && ($string_count = count($value)) === count($value, COUNT_RECURSIVE)) - { - $nextline = "\n" . str_repeat("\t", $indent + 1); - - $func_text = rtrim($func_text); - - $func_text .= $nextline . implode(' .' . $nextline, array_map(function ($v) { return var_export($v, true); }, $value)); - } - else - { - $func_text .= 'array(' . "\n"; - - $indent++; - build_func_array($func_text, $value, $key_type, $val_type); - $indent--; - - $func_text .= str_repeat("\t", $indent) . ')'; - } - } - elseif ($val_type == 'hexchar') - { - $func_text .= '"'; - - $value = mb_decode_numericentity(str_replace(' ', '', $value), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8'); - foreach (unpack('C*', $value) as $byte_value) - { - $func_text .= '\\x' . strtoupper(dechex($byte_value)); - } - - $func_text .= '"'; - } - elseif ($val_type == 'string') - { - $func_text .= var_export($value, true); - } - else - { - $func_text .= $value; - } - - $func_text .= ',' . 
"\n"; - } -} +$unicode_updater = new Update_Unicode(); +$unicode_updater->execute(); ?> \ No newline at end of file From 8d03dfa890f7948911b1c7e20a2003a32c893d08 Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Thu, 27 Oct 2022 10:13:29 -0600 Subject: [PATCH 2/8] Improves documentation Signed-off-by: Jon Stovell --- Sources/tasks/UpdateUnicode.php | 40 ++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php index 3ed45ef9b2..169a713ee6 100644 --- a/Sources/tasks/UpdateUnicode.php +++ b/Sources/tasks/UpdateUnicode.php @@ -18,19 +18,55 @@ */ class Update_Unicode extends SMF_BackgroundTask { + /** + * URLs where we can fetch the Unicode data files. + */ const DATA_URL_UCD = 'https://unicode.org/Public/UCD/latest/ucd'; const DATA_URL_IDNA = 'https://www.unicode.org/Public/idna/latest'; + /** + * @var string The latest official release of the Unicode Character Database. + */ public $ucd_version = ''; + + /** + * @var string Path to temporary working directory. + */ public $temp_dir = ''; + + /** + * @var string Convenince alias of $sourcedir . '/Unicode'. + */ public $unicodedir = ''; + /** + * @var array Key-value pairs of character decompositions. + */ private $full_decomposition_maps = array(); + + /** + * @var array Character properties used during normalization. + */ private $derived_normalization_props = array(); + + /** + * @var array Assorted info about Unicode characters. + */ private $char_data = array(); + + /** + * @var array Statistical info about character scripts (e.g. Latin, Greek, Cyrillic, etc.) + */ private $script_stats = array(); + + /** + * @var array Tracks associations between character scripts' short and long names. + */ private $script_aliases = array(); + /** + * @var array Info about functions to build in SMF's Unicode data files. 
+ */ private $funcs = array( array( 'file' => 'Metadata.php', @@ -316,7 +352,9 @@ class Update_Unicode extends SMF_BackgroundTask ), ); - // Prefetching the files helps ensure the task runs smoothly. + /** + * @var array Files to fetch from unicode.org. + */ private $prefetch = array( self::DATA_URL_UCD => array( 'CaseFolding.txt', From a7e01e051305bc7f75577c5b8fd0a0ead9ef3263 Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Thu, 27 Oct 2022 11:27:15 -0600 Subject: [PATCH 3/8] Improves timeout avoidance in UpdateUnicode.php Signed-off-by: Jon Stovell --- Sources/tasks/UpdateUnicode.php | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php index 169a713ee6..27e0e7ae4f 100644 --- a/Sources/tasks/UpdateUnicode.php +++ b/Sources/tasks/UpdateUnicode.php @@ -39,6 +39,11 @@ class Update_Unicode extends SMF_BackgroundTask */ public $unicodedir = ''; + /** + * @var int Used to ensure we exit long running tasks cleanly. + */ + private $time_limit = 30; + /** * @var array Key-value pairs of character decompositions. */ @@ -408,6 +413,8 @@ public function execute() @ini_set('memory_limit', '256M'); + $this->time_limit = (empty(ini_get('max_execution_time')) || @set_time_limit(MAX_CLAIM_THRESHOLD) !== false) ? MAX_CLAIM_THRESHOLD : ini_get('max_execution_time'); + foreach ($this->funcs as $func_name => &$func_info) { $file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file'])); @@ -442,12 +449,18 @@ public function execute() // Prefetch the files in case the network is slow. foreach ($this->prefetch as $data_url => $files) { + $max_fetch_time = 0; + foreach ($files as $filename) { + $fetch_start = microtime(true); + $local_file = $this->fetch_unicode_file($filename, $data_url); + $max_fetch_time = max($max_fetch_time, microtime(true) - $fetch_start); + // If prefetch is taking a really long time, pause and try again later. 
- if ($local_file === false || microtime(true) - TIME_START >= MAX_CLAIM_THRESHOLD - 1) + if ($local_file === false || microtime(true) - TIME_START >= $this->time_limit - $max_fetch_time) { $smcFunc['db_insert']('', '{db_prefix}background_tasks', @@ -885,6 +898,8 @@ private function should_update() { $this->lookup_ucd_version(); + return true; // For testing + // We can't do anything if lookup failed. if (empty($this->ucd_version)) return false; From c595452dc7fdacb8fbc8c1655b0f1dd349e97901 Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Thu, 27 Oct 2022 15:25:04 -0600 Subject: [PATCH 4/8] Removes testing line that forced updates to happen even when unneeded Signed-off-by: Jon Stovell --- Sources/tasks/UpdateUnicode.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php index 27e0e7ae4f..9ae88ae09a 100644 --- a/Sources/tasks/UpdateUnicode.php +++ b/Sources/tasks/UpdateUnicode.php @@ -898,8 +898,6 @@ private function should_update() { $this->lookup_ucd_version(); - return true; // For testing - // We can't do anything if lookup failed. if (empty($this->ucd_version)) return false; From 293b8c362e3bdaaa0ea985225188891c1869a8ae Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Wed, 2 Nov 2022 12:50:10 -0600 Subject: [PATCH 5/8] Even more bulletproofing Signed-off-by: Jon Stovell --- Sources/tasks/UpdateUnicode.php | 104 ++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 40 deletions(-) diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php index 9ae88ae09a..c68354f002 100644 --- a/Sources/tasks/UpdateUnicode.php +++ b/Sources/tasks/UpdateUnicode.php @@ -484,67 +484,76 @@ public function execute() } } + // Track whether anything goes wrong along the way. + $success = true; + /********************************************* * Part 2: Normalization, case folding, etc. 
* *********************************************/ - $this->process_derived_normalization_props(); - $this->process_main_unicode_data(); - $this->process_casing_data(); - $this->finalize_decomposition_forms(); + $success = $this->process_derived_normalization_props() & $success; + $success = $this->process_main_unicode_data() & $success; + $success = $this->process_casing_data() & $success; + $success = $this->finalize_decomposition_forms() & $success; $this->full_decomposition_maps = array(); $this->derived_normalization_props = array(); + $this->export_funcs_to_file(); /*********************************** * Part 3: Regular expression data * ***********************************/ - $this->build_regex_properties(); - $this->build_regex_variation_selectors(); - $this->build_script_stats(); - $this->build_regex_joining_type(); - $this->build_regex_indic(); + $success = $this->build_regex_properties() & $success; + $success = $this->build_regex_variation_selectors() & $success; + $success = $this->build_script_stats() & $success; + $success = $this->build_regex_joining_type() & $success; + $success = $this->build_regex_indic() & $success; unset($this->funcs['utf8_combining_classes']['data']); + $this->export_funcs_to_file(); /********************************* * Part 4: IDNA maps and regexes * *********************************/ - $this->build_idna(); + $success = $this->build_idna() & $success; + $this->export_funcs_to_file(); /******************* * Part 5: Wrapup. * *******************/ - $done_files = array(); - - foreach ($this->funcs as $func_name => $func_info) + if ($success) { - $file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file']; - $file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file']; + $done_files = array(); - if (in_array($file_paths['temp'], $done_files)) - continue; + foreach ($this->funcs as $func_name => $func_info) + { + $file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . 
$func_info['file']; + $file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file']; - // Add closing PHP tag to the temp file. - file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND); + if (in_array($file_paths['temp'], $done_files)) + continue; - $done_files[] = $file_paths['temp']; + // Add closing PHP tag to the temp file. + file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND); - // Only move if the file has changed, discounting the license block. - foreach (array('temp', 'real') as $f) - { - if (file_exists($file_paths[$f])) + $done_files[] = $file_paths['temp']; + + // Only move if the file has changed, discounting the license block. + foreach (array('temp', 'real') as $f) { - $file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f])); + if (file_exists($file_paths[$f])) + { + $file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f])); + } + else + $file_contents[$f] = ''; } - else - $file_contents[$f] = ''; - } - if ($file_contents['temp'] !== $file_contents['real']) - rename($file_paths['temp'], $file_paths['real']); + if ($file_contents['temp'] !== $file_contents['real']) + rename($file_paths['temp'], $file_paths['real']); + } } // Clean up after ourselves. @@ -619,8 +628,6 @@ private function fetch_unicode_file($filename, $data_url) return false; } - require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php'); - $file_contents = fetch_web_data($data_url . '/' . $file_url_name); if (empty($file_contents)) @@ -628,8 +635,6 @@ private function fetch_unicode_file($filename, $data_url) file_put_contents($local_file, $file_contents); - $this->files_to_fetch[$sub_dir][] = $filename; - return $local_file; } @@ -902,16 +907,15 @@ private function should_update() if (empty($this->ucd_version)) return false; - require_once($this->unicodedir . DIRECTORY_SEPARATOR . 
'Metadata.php'); + // If this file is missing, force an update. + if (!@include_once($this->unicodedir . DIRECTORY_SEPARATOR . 'Metadata.php')) + return true; - if (version_compare($this->ucd_version, SMF_UNICODE_VERSION, '<=')) - return false; + return version_compare($this->ucd_version, SMF_UNICODE_VERSION, '>='); } /** - * Compares version of SMF's local Unicode data with the latest release. - * - * @return bool Whether SMF should update its local Unicode data or not. + * Sets $this->ucd_version to latest version number of the UCD. */ private function lookup_ucd_version() { @@ -1017,6 +1021,8 @@ private function process_derived_normalization_props() $this->derived_normalization_props[$fields[1]][$entity] = $value === 'SAME' ? $entity : $value; } } + + return true; } /** @@ -1080,6 +1086,8 @@ private function process_main_unicode_data() $this->funcs['utf8_normalize_d_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . ';'; } } + + return true; } /** @@ -1167,6 +1175,8 @@ private function process_casing_data() if (in_array($fields[1], array('C', 'S'))) $this->funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';'; } + + return true; } /** @@ -1243,6 +1253,8 @@ private function finalize_decomposition_forms() // Avoid bloat. 
$this->funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($this->full_decomposition_maps, $this->funcs['utf8_normalize_d_maps']['data']); + + return true; } /** @@ -1310,6 +1322,8 @@ private function build_regex_properties() } ksort($this->funcs['utf8_regex_properties']['data']); + + return true; } /** @@ -1411,6 +1425,8 @@ private function build_regex_variation_selectors() } krsort($this->funcs['utf8_regex_variation_selectors']['data']); + + return true; } /** @@ -1626,6 +1642,8 @@ private function build_script_stats() } } } + + return true; } /** @@ -1737,6 +1755,8 @@ private function build_regex_joining_type() sort($value); } } + + return true; } /** @@ -1910,6 +1930,8 @@ private function build_regex_indic() ksort($this->funcs['utf8_regex_indic']['data'][$char_script]); } + + return true; } /** @@ -1993,6 +2015,8 @@ private function build_idna() $this->funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; } } + + return true; } } From 1bbdef4aa37a178ef61006b0603a59fccce9169d Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Tue, 14 Mar 2023 18:41:53 -0600 Subject: [PATCH 6/8] Adds method to build QuickCheck data file Signed-off-by: Jon Stovell --- Sources/tasks/UpdateUnicode.php | 73 ++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php index c68354f002..582ca683b9 100644 --- a/Sources/tasks/UpdateUnicode.php +++ b/Sources/tasks/UpdateUnicode.php @@ -311,6 +311,22 @@ class Update_Unicode extends SMF_BackgroundTask ), 'data' => array(), ), + 'utf8_regex_quick_check' => array( + 'file' => 'QuickCheck.php', + 'key_type' => 'string', + 'val_type' => 'string', + 'desc' => array( + 'Helper function for utf8_is_normalized.', + '', + 'Character class lists compiled from:', + 'https://unicode.org/Public/UNIDATA/extracted/DerivedNormalizationProps.txt', + ), + 'return' => array( + 'type' => 'array', + 'desc' 
=> 'Character classes for disallowed characters in normalization forms.', + ), + 'data' => array(), + ), 'idna_maps' => array( 'file' => 'Idna.php', 'key_type' => 'hexchar', @@ -496,13 +512,16 @@ public function execute() $success = $this->finalize_decomposition_forms() & $success; $this->full_decomposition_maps = array(); - $this->derived_normalization_props = array(); $this->export_funcs_to_file(); /*********************************** * Part 3: Regular expression data * ***********************************/ + $success = $this->build_quick_check() & $success; + + $this->derived_normalization_props = array(); + $success = $this->build_regex_properties() & $success; $success = $this->build_regex_variation_selectors() & $success; $success = $this->build_script_stats() & $success; @@ -1257,6 +1276,58 @@ private function finalize_decomposition_forms() return true; } + /** + * Builds regular expressions for normalization quick check. + */ + private function build_quick_check() + { + foreach (array('NFC_QC', 'NFKC_QC', 'NFD_QC', 'NFKD_QC', 'Changes_When_NFKC_Casefolded') as $prop) + { + $current_range = array('start' => null, 'end' => null); + foreach ($this->derived_normalization_props[$prop] as $entity => $nm) + { + $range_string = ''; + + $ord = hexdec(trim($entity, '&#x;')); + + if (!isset($current_range['start'])) + { + $current_range['start'] = $ord; + } + + if (!isset($current_range['end']) || $ord == $current_range['end'] + 1) + { + $current_range['end'] = $ord; + } + else + { + $range_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; + + if ($current_range['start'] != $current_range['end']) + { + $range_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; + } + + $current_range = array('start' => $ord, 'end' => $ord); + + $this->funcs['utf8_regex_quick_check']['data'][$prop][] = $range_string; + } + } + + if (isset($current_range['start'])) + { + $range_string = '\\x{' . 
strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}'; + + if ($current_range['start'] != $current_range['end']) + { + $range_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}'; + } + + $this->funcs['utf8_regex_quick_check']['data'][$prop][] = $range_string; + } + } + } + /** * Builds regular expression classes for extended Unicode properties. */ From 9a8960718441ce1832da1583ae76ac0f92c39404 Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Tue, 14 Mar 2023 18:56:07 -0600 Subject: [PATCH 7/8] Syntax compatibility with PHP 7.0 Signed-off-by: Jon Stovell --- Sources/tasks/UpdateUnicode.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php index 582ca683b9..62602e5ec9 100644 --- a/Sources/tasks/UpdateUnicode.php +++ b/Sources/tasks/UpdateUnicode.php @@ -796,7 +796,7 @@ private function get_function_code_and_regex($func_name) empty($this->funcs[$func_name]['return']) ? array() : array( '', '@return ' . implode(' ', $this->funcs[$func_name]['return']) - ), + ) )) . "\n */\n"; // The code for this function. 
@@ -1326,6 +1326,8 @@ private function build_quick_check() $this->funcs['utf8_regex_quick_check']['data'][$prop][] = $range_string; } } + + return true; } /** From 60580a1d7f804f2887e89e65e18a4d261137f7b2 Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Wed, 20 Sep 2023 19:31:31 -0600 Subject: [PATCH 8/8] Informs the admin if Unicode files are not writable Signed-off-by: Jon Stovell --- Sources/tasks/UpdateUnicode.php | 21 ++++++++++++++------- Themes/default/languages/Errors.english.php | 2 ++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php index 62602e5ec9..516a57a9ce 100644 --- a/Sources/tasks/UpdateUnicode.php +++ b/Sources/tasks/UpdateUnicode.php @@ -433,20 +433,27 @@ public function execute() foreach ($this->funcs as $func_name => &$func_info) { - $file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file'])); + $file_paths['final'] = implode(DIRECTORY_SEPARATOR, array($this->unicodedir, $func_info['file'])); - if (!file_exists($file_paths['temp'])) - touch($file_paths['temp']); + if (!file_exists($file_paths['final'])) + touch($file_paths['final']); - if (!is_file($file_paths['temp'])) + if (!is_file($file_paths['final']) || !smf_chmod($file_paths['final'])) { - log_error($file_paths['temp'] . ' is not a file.'); + loadLanguage('Errors'); + log_error(sprintf($txt['unicode_update_failed'], $this->unicodedir)); return true; } - if (!smf_chmod($file_paths['temp'])) + $file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file'])); + + if (!file_exists($file_paths['temp'])) + touch($file_paths['temp']); + + if (!is_file($file_paths['temp']) || !smf_chmod($file_paths['temp'])) { - log_error($file_paths['temp'] . 
' is not writable.'); + loadLanguage('Errors'); + log_error(sprintf($txt['unicode_update_failed'], $this->temp_dir)); return true; } diff --git a/Themes/default/languages/Errors.english.php b/Themes/default/languages/Errors.english.php index 6745a1b0d9..3823c5d36f 100644 --- a/Themes/default/languages/Errors.english.php +++ b/Themes/default/languages/Errors.english.php @@ -522,4 +522,6 @@ $txt['fetch_web_data_bad_url'] = 'fetch_web_data(): Bad URL'; +$txt['unicode_update_failed'] = 'A new version of Unicode is available, but SMF could not update to it. Please make sure %1$s and all the files in it are writable. SMF will try to update its Unicode data files again automatically.'; + ?> \ No newline at end of file