From bdb60973cdd82310767e11911ec3666f610fe3df Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Tue, 25 Oct 2022 19:37:03 -0600
Subject: [PATCH 1/8] Updates Unicode data files in a background task

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/ScheduledTasks.php      |    6 +
 Sources/Subs-Charset.php        |    4 +-
 Sources/tasks/UpdateUnicode.php | 1948 +++++++++++++++++++++++++++++++
 other/update_unicode_data.php   | 1273 +-------------------
 4 files changed, 1991 insertions(+), 1240 deletions(-)
 create mode 100644 Sources/tasks/UpdateUnicode.php

diff --git a/Sources/ScheduledTasks.php b/Sources/ScheduledTasks.php
index 93edf77c88..98511e62e2 100644
--- a/Sources/ScheduledTasks.php
+++ b/Sources/ScheduledTasks.php
@@ -1322,6 +1322,12 @@ function scheduled_weekly_maintenance()
 		array('$sourcedir/tasks/UpdateTldRegex.php', 'Update_TLD_Regex', '', 0), array()
 	);
 
+	// Ensure Unicode data files are up to date
+	$smcFunc['db_insert']('insert', '{db_prefix}background_tasks',
+		array('task_file' => 'string-255', 'task_class' => 'string-255', 'task_data' => 'string', 'claimed_time' => 'int'),
+		array('$sourcedir/tasks/UpdateUnicode.php', 'Update_Unicode', '', 0), array()
+	);
+
 	// Run Cache housekeeping
 	if (!empty($cache_enable) && !empty($cacheAPI))
 		$cacheAPI->housekeeping();
diff --git a/Sources/Subs-Charset.php b/Sources/Subs-Charset.php
index 79f0582832..af0c794196 100644
--- a/Sources/Subs-Charset.php
+++ b/Sources/Subs-Charset.php
@@ -14,7 +14,9 @@
 if (!defined('SMF'))
 	die('No direct access...');
 
-require_once($sourcedir . '/Unicode/Metadata.php');
+// If this file is missing, we're using an old version of Unicode.
+if (!@include_once($sourcedir . '/Unicode/Metadata.php'))
+	define('SMF_UNICODE_VERSION', '14.0.0.0');
 
 /**
  * Converts the given UTF-8 string into lowercase.
diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
new file mode 100644
index 0000000000..3ed45ef9b2
--- /dev/null
+++ b/Sources/tasks/UpdateUnicode.php
@@ -0,0 +1,1948 @@
+<?php
+
+/**
+ * This file contains code used to update SMF's Unicode data files.
+ *
+ * Simple Machines Forum (SMF)
+ *
+ * @package SMF
+ * @author Simple Machines https://www.simplemachines.org
+ * @copyright 2022 Simple Machines and individual contributors
+ * @license https://www.simplemachines.org/about/smf/license.php BSD
+ *
+ * @version 2.1.3
+ */
+
+/**
+ * Class Update_Unicode
+ */
+class Update_Unicode extends SMF_BackgroundTask
+{
+	const DATA_URL_UCD = 'https://unicode.org/Public/UCD/latest/ucd';
+	const DATA_URL_IDNA = 'https://www.unicode.org/Public/idna/latest';
+
+	public $ucd_version = '';
+	public $temp_dir = '';
+	public $unicodedir = '';
+
+	private $full_decomposition_maps = array();
+	private $derived_normalization_props = array();
+	private $char_data = array();
+	private $script_stats = array();
+	private $script_aliases = array();
+
+	private $funcs = array(
+		array(
+			'file' => 'Metadata.php',
+			'regex' => '/if \(!defined\(\'SMF_UNICODE_VERSION\'\)\)\n\tdefine\(\'SMF_UNICODE_VERSION\', \'\d+(\.\d+)*\'\);/',
+			'data' => array(
+				// 0.0.0.0 will be replaced with correct value at runtime.
+				"if (!defined('SMF_UNICODE_VERSION'))\n\tdefine('SMF_UNICODE_VERSION', '0.0.0.0');",
+			),
+		),
+		'utf8_normalize_d_maps' => array(
+			'file' => 'DecompositionCanonical.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_normalize_d.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Canonical Decomposition maps for Unicode normalization.',
+			),
+			'data' => array(),
+		),
+		'utf8_normalize_kd_maps' => array(
+			'file' => 'DecompositionCompatibility.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_normalize_kd.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Compatibility Decomposition maps for Unicode normalization.',
+			),
+			'data' => array(),
+		),
+		'utf8_compose_maps' => array(
+			'file' => 'Composition.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_compose.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Composition maps for Unicode normalization.',
+			),
+			'data' => array(),
+		),
+		'utf8_combining_classes' => array(
+			'file' => 'CombiningClasses.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'int',
+			'desc' => array('Helper function for utf8_normalize_d.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Combining Class data for Unicode normalization.',
+			),
+			'data' => array(),
+		),
+		'utf8_strtolower_simple_maps' => array(
+			'file' => 'CaseLower.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_strtolower.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Uppercase to lowercase maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_strtolower_maps' => array(
+			'file' => 'CaseLower.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_strtolower.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Uppercase to lowercase maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_strtoupper_simple_maps' => array(
+			'file' => 'CaseUpper.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_strtoupper.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Lowercase to uppercase maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_strtoupper_maps' => array(
+			'file' => 'CaseUpper.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_strtoupper.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Lowercase to uppercase maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_titlecase_simple_maps' => array(
+			'file' => 'CaseTitle.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_convert_case.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Simple title case maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_titlecase_maps' => array(
+			'file' => 'CaseTitle.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_convert_case.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Full title case maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_casefold_simple_maps' => array(
+			'file' => 'CaseFold.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_casefold.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Casefolding maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_casefold_maps' => array(
+			'file' => 'CaseFold.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_casefold.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Casefolding maps.',
+			),
+			'data' => array(),
+		),
+		'utf8_default_ignorables' => array(
+			'file' => 'DefaultIgnorables.php',
+			'key_type' => 'int',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for utf8_normalize_kc_casefold.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Characters with the \'Default_Ignorable_Code_Point\' property.',
+			),
+			'data' => array(),
+		),
+		'utf8_regex_properties' => array(
+			'file' => 'RegularExpressions.php',
+			'key_type' => 'string',
+			'val_type' => 'string',
+			'propfiles' => array(
+				'DerivedCoreProperties.txt',
+				'PropList.txt',
+				'emoji/emoji-data.txt',
+				'extracted/DerivedGeneralCategory.txt',
+			),
+			'props' => array(
+				'Bidi_Control',
+				'Case_Ignorable',
+				'Cn',
+				'Default_Ignorable_Code_Point',
+				'Emoji',
+				'Emoji_Modifier',
+				'Ideographic',
+				'Join_Control',
+				'Regional_Indicator',
+				'Variation_Selector',
+			),
+			'desc' => array(
+				'Helper function for utf8_sanitize_invisibles and utf8_convert_case.',
+				'',
+				'Character class lists compiled from:',
+				'https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt',
+				'https://unicode.org/Public/UNIDATA/PropList.txt',
+				'https://unicode.org/Public/UNIDATA/emoji/emoji-data.txt',
+				'https://unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt',
+			),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Character classes for various Unicode properties.',
+			),
+			'data' => array(),
+		),
+		'utf8_regex_variation_selectors' => array(
+			'file' => 'RegularExpressions.php',
+			'key_type' => 'string',
+			'val_type' => 'string',
+			'desc' => array(
+				'Helper function for utf8_sanitize_invisibles.',
+				'',
+				'Character class lists compiled from:',
+				'https://unicode.org/Public/UNIDATA/StandardizedVariants.txt',
+				'https://unicode.org/Public/UNIDATA/emoji/emoji-variation-sequences.txt',
+			),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Character classes for filtering variation selectors.',
+			),
+			'data' => array(),
+		),
+		'utf8_regex_joining_type' => array(
+			'file' => 'RegularExpressions.php',
+			'key_type' => 'string',
+			'val_type' => 'string',
+			'desc' => array(
+				'Helper function for utf8_sanitize_invisibles.',
+				'',
+				'Character class lists compiled from:',
+				'https://unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt',
+			),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Character classes for joining characters in certain scripts.',
+			),
+			'data' => array(),
+		),
+		'utf8_regex_indic' => array(
+			'file' => 'RegularExpressions.php',
+			'key_type' => 'string',
+			'val_type' => 'string',
+			'desc' => array(
+				'Helper function for utf8_sanitize_invisibles.',
+				'',
+				'Character class lists compiled from:',
+				'https://unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt',
+				'https://unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt',
+			),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Character classes for Indic scripts that use viramas.',
+			),
+			'data' => array(),
+		),
+		'idna_maps' => array(
+			'file' => 'Idna.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for idn_to_* polyfills.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Character maps for IDNA processing.',
+			),
+			'data' => array(),
+		),
+		'idna_maps_deviation' => array(
+			'file' => 'Idna.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for idn_to_* polyfills.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => '"Deviation" character maps for IDNA processing.',
+			),
+			'data' => array(),
+		),
+		'idna_maps_not_std3' => array(
+			'file' => 'Idna.php',
+			'key_type' => 'hexchar',
+			'val_type' => 'hexchar',
+			'desc' => array('Helper function for idn_to_* polyfills.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Non-STD3 character maps for IDNA processing.',
+			),
+			'data' => array(),
+		),
+		'idna_regex' => array(
+			'file' => 'Idna.php',
+			'key_type' => 'string',
+			'val_type' => 'string',
+			'desc' => array('Helper function for idn_to_* polyfills.'),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Regular expressions useful for IDNA processing.',
+			),
+			'data' => array(),
+		),
+	);
+
+	// Prefetching the files helps ensure the task runs smoothly.
+	private $prefetch = array(
+		self::DATA_URL_UCD => array(
+			'CaseFolding.txt',
+			'DerivedAge.txt',
+			'DerivedCoreProperties.txt',
+			'DerivedNormalizationProps.txt',
+			'IndicSyllabicCategory.txt',
+			'PropertyValueAliases.txt',
+			'PropList.txt',
+			'ScriptExtensions.txt',
+			'Scripts.txt',
+			'SpecialCasing.txt',
+			'StandardizedVariants.txt',
+			'UnicodeData.txt',
+			'emoji/emoji-data.txt',
+			'emoji/emoji-variation-sequences.txt',
+			'extracted/DerivedGeneralCategory.txt',
+			'extracted/DerivedJoiningType.txt',
+		),
+		self::DATA_URL_IDNA => array(
+			'IdnaMappingTable.txt',
+		),
+	);
+
+	/**
+	 * Executes the task: rebuilds SMF's Unicode data files if an update is needed.
+	 *
+	 * @return bool Always returns true
+	 */
+	public function execute()
+	{
+		global $sourcedir, $smcFunc;
+
+		/*****************
+		 * Part 1: Setup *
+		 *****************/
+		$this->unicodedir = $sourcedir . DIRECTORY_SEPARATOR . 'Unicode';
+
+		// We need a temporary directory to hold our files while we work on them.
+		$this->make_temp_dir();
+
+		if (empty($this->temp_dir))
+			return true;
+
+		// Do we even need to update?
+		if (!$this->should_update())
+		{
+			$this->deltree($this->temp_dir);
+			return true;
+		}
+
+		@ini_set('memory_limit', '256M');
+		// Iterate by value: a leftover reference would be clobbered by the loop in Part 5.
+		foreach ($this->funcs as $func_name => $func_info)
+		{
+			$file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file']));
+
+			if (!file_exists($file_paths['temp']))
+				touch($file_paths['temp']);
+
+			if (!is_file($file_paths['temp']))
+			{
+				log_error($file_paths['temp'] . ' is not a file.');
+				return true;
+			}
+
+			if (!smf_chmod($file_paths['temp']))
+			{
+				log_error($file_paths['temp'] . ' is not writable.');
+				return true;
+			}
+
+			$file_contents['temp'] = file_get_contents($file_paths['temp']);
+
+			if (empty($file_contents['temp']))
+			{
+				file_put_contents($file_paths['temp'], $this->smf_file_header());
+			}
+			elseif (substr($file_contents['temp'], -2) === '?' . '>')
+			{
+				file_put_contents($file_paths['temp'], substr($file_contents['temp'], 0, -2));
+			}
+		}
+
+		// Prefetch the files in case the network is slow.
+		foreach ($this->prefetch as $data_url => $files)
+		{
+			foreach ($files as $filename)
+			{
+				$local_file = $this->fetch_unicode_file($filename, $data_url);
+
+				// If prefetch is taking a really long time, pause and try again later.
+				if ($local_file === false || microtime(true) - TIME_START >= MAX_CLAIM_THRESHOLD - 1)
+				{
+					$smcFunc['db_insert']('',
+						'{db_prefix}background_tasks',
+						array(
+							'task_file' => 'string',
+							'task_class' => 'string',
+							'task_data' => 'string',
+							'claimed_time' => 'int',
+						),
+						array(
+							'$sourcedir/tasks/UpdateUnicode.php',
+							'Update_Unicode',
+							'',
+							time() - MAX_CLAIM_THRESHOLD,
+						),
+						array('id_task')
+					);
+
+					return true;
+				}
+			}
+		}
+
+		/*********************************************
+		 * Part 2: Normalization, case folding, etc. *
+		 *********************************************/
+		$this->process_derived_normalization_props();
+		$this->process_main_unicode_data();
+		$this->process_casing_data();
+		$this->finalize_decomposition_forms();
+
+		$this->full_decomposition_maps = array();
+		$this->derived_normalization_props = array();
+		$this->export_funcs_to_file();
+
+		/***********************************
+		 * Part 3: Regular expression data *
+		 ***********************************/
+		$this->build_regex_properties();
+		$this->build_regex_variation_selectors();
+		$this->build_script_stats();
+		$this->build_regex_joining_type();
+		$this->build_regex_indic();
+
+		unset($this->funcs['utf8_combining_classes']['data']);
+		$this->export_funcs_to_file();
+
+		/*********************************
+		 * Part 4: IDNA maps and regexes *
+		 *********************************/
+		$this->build_idna();
+		$this->export_funcs_to_file();
+
+		/*******************
+		 * Part 5: Wrapup. *
+		 *******************/
+		$done_files = array();
+
+		foreach ($this->funcs as $func_name => $func_info)
+		{
+			$file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];
+			$file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file'];
+
+			if (in_array($file_paths['temp'], $done_files))
+				continue;
+
+			// Add closing PHP tag to the temp file.
+			file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND);
+
+			$done_files[] = $file_paths['temp'];
+
+			// Only move if the file has changed, discounting the license block.
+			foreach (array('temp', 'real') as $f)
+			{
+				if (file_exists($file_paths[$f]))
+				{
+					$file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f]));
+				}
+				else
+					$file_contents[$f] = '';
+			}
+
+			if ($file_contents['temp'] !== $file_contents['real'])
+				rename($file_paths['temp'], $file_paths['real']);
+		}
+
+		// Clean up after ourselves.
+		$this->deltree($this->temp_dir);
+
+		// All done.
+		return true;
+	}
+
+	/**
+	 * Creates the temporary working directory (if it does not exist yet) and
+	 * stores its path in $this->temp_dir, or null if it cannot be used.
+	 */
+	private function make_temp_dir()
+	{
+		global $sourcedir;
+
+		if (!empty($this->temp_dir))
+			return;
+
+		require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php');
+
+		$this->temp_dir = rtrim(sm_temp_dir(), DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 'Unicode';
+
+		if (!is_dir($this->temp_dir))
+			@mkdir($this->temp_dir);
+
+		// Without a writable directory we cannot do anything.
+		if (!(is_dir($this->temp_dir) && smf_chmod($this->temp_dir)))
+			$this->temp_dir = null;
+	}
+
+	/**
+	 * Fetches the contents of a Unicode data file.
+	 *
+	 * Caches a local copy for subsequent lookups.
+	 *
+	 * @param string $filename Name of a Unicode datafile, relative to $data_url.
+	 * @param string $data_url One of this class's DATA_URL_* constants.
+	 *
+	 * @return string|bool Path to locally saved copy of the file, or false on failure.
+	 */
+	private function fetch_unicode_file($filename, $data_url)
+	{
+		global $sourcedir;
+
+		$filename = ltrim($filename, '\\/');
+		$file_url_name = strtr($filename, array('\\' => '/'));
+		$file_local_name = strtr($filename, array('\\' => DIRECTORY_SEPARATOR, '/' => DIRECTORY_SEPARATOR));
+
+		switch ($data_url)
+		{
+			case self::DATA_URL_IDNA:
+				$sub_dir = 'idna';
+				break;
+
+			default:
+				$sub_dir = 'ucd';
+				break;
+		}
+
+		$local_file = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $sub_dir, $file_local_name));
+
+		if (file_exists($local_file))
+			return $local_file;
+
+		if (!file_exists(dirname($local_file)))
+		{
+			@mkdir(dirname($local_file), 0777, true);
+
+			if (!is_dir(dirname($local_file)))
+				return false;
+		}
+
+		require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php');
+
+		$file_contents = fetch_web_data($data_url . '/' . $file_url_name);
+
+		// Only report success if the download worked and the local copy was saved.
+		if (empty($file_contents) || file_put_contents($local_file, $file_contents) === false)
+			return false;
+
+		// NOTE(review): files_to_fetch is not declared with this class's other properties; confirm it is needed.
+		$this->files_to_fetch[$sub_dir][] = $filename;
+
+		return $local_file;
+	}
+
+	/**
+	 * Deletes a directory and its contents.
+	 *
+	 * @param string $dir_path Path to directory
+	 */
+	private function deltree($dir_path)
+	{
+		// For safety, only ever delete existing directories inside our own temp directory.
+		if (empty($this->temp_dir) || strpos($dir_path, $this->temp_dir) !== 0 || !is_dir($dir_path))
+			return;
+
+		$dir = new DirectoryIterator($dir_path);
+
+		$to_delete = array();
+		foreach ($dir as $fileinfo)
+		{
+			if ($fileinfo->isDot())
+				continue;
+
+			if ($fileinfo->isDir())
+				$this->deltree($fileinfo->getPathname());
+			else
+				$to_delete[] = $fileinfo->getPathname();
+		}
+
+		foreach ($to_delete as $pathname)
+			unlink($pathname);
+
+		rmdir($dir_path);
+	}
+
+	/**
+	 * Builds the standard header placed at the top of each generated PHP file.
+	 *
+	 * @return string Standard SMF file header.
+	 */
+	private function smf_file_header()
+	{
+		global $sourcedir;
+
+		static $file_template;
+
+		if (empty($file_template))
+		{
+			require_once($sourcedir . '/Subs-Admin.php');
+			$defs = get_settings_defs();
+
+			// Copy the license block, but drop whatever sits between the
+			// opening of the comment and the first line that mentions SMF.
+			$kept = array();
+			$copying = true;
+			foreach (explode("\n", $defs[0]['text']) as $text_line)
+			{
+				$copying = $copying || strpos($text_line, 'SMF') !== false || strpos($text_line, 'Simple Machines') !== false;
+
+				if ($copying)
+					$kept[] = $text_line . "\n";
+
+				if ($text_line === '/**')
+					$copying = false;
+			}
+
+			// Assemble the pieces, separated by blank lines.
+			$header_parts = array('<' . '?php', trim(implode('', $kept)));
+			$header_parts[] = "if (!defined('SMF'))\n\tdie('No direct access...');";
+			$header_parts[] = '';
+
+			$file_template = implode("\n\n", $header_parts);
+		}
+
+		return $file_template;
+	}
+
+	/**
+	 * Updates Unicode data functions in their designated files.
+	 */
+	public function export_funcs_to_file()
+	{
+		foreach ($this->funcs as $func_name => $func_info)
+		{
+			if (empty($func_info['data']))
+				continue;
+
+			$temp_file_path = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];
+
+			list($func_code, $func_regex) = $this->get_function_code_and_regex($func_name);
+
+			$file_contents = file_get_contents($temp_file_path);
+			// Replace the existing copy of this code if there is one, otherwise append it.
+			if (preg_match($func_regex, $file_contents))
+			{
+				file_put_contents($temp_file_path, preg_replace($func_regex, $func_code, $file_contents));
+			}
+			else
+			{
+				file_put_contents($temp_file_path, $func_code . "\n\n", FILE_APPEND);
+			}
+
+			// Free up some memory. Combining class data is kept here; execute() unsets it later.
+			if ($func_name != 'utf8_combining_classes')
+				unset($this->funcs[$func_name]['data']);
+		}
+	}
+
+	/**
+	 * Builds complete code for the specified element in $this->funcs
+	 * to be inserted into the relevant PHP file. Also builds a regex
+	 * to check whether a copy of the function is already present
+	 * in the file.
+	 *
+	 * @param string $func_name Key of an element in $this->funcs.
+	 *
+	 * @return array PHP code and a regular expression.
+	 */
+	private function get_function_code_and_regex($func_name)
+	{
+		// No function name means data is raw code.
+		if (!is_string($func_name))
+		{
+			$func_code = implode("\n\n", $this->funcs[$func_name]['data']);
+			$func_regex = isset($this->funcs[$func_name]['regex']) ? $this->funcs[$func_name]['regex'] : '/' . preg_quote($func_code, '/') . '/';
+		}
+		else
+		{
+			// The regex to look for this function in the existing file content.
+			$func_regex = "/(\/\*([^*]|\*(?!\/))*\*\/\n)?function $func_name\(\)\n{.+?\n}/s";
+
+			// The PHPDoc comment for this function.
+			$func_code = '/**' . implode("\n * ", array_merge(
+				array(''),
+				$this->funcs[$func_name]['desc'],
+				array(
+					'',
+					'Developers: Do not update the data in this function manually. Instead,',
+					'run "php -f other/update_unicode_data.php" on the command line.',
+				),
+				empty($this->funcs[$func_name]['return']) ? array() : array(
+					'',
+					'@return ' . implode(' ', $this->funcs[$func_name]['return'])
+				)
+			)) . "\n */\n";
+
+			// The code for this function.
+			$func_code .= implode("\n", array(
+				'function ' . $func_name . '()',
+				'{',
+				"\t" . 'return array(',
+				'',
+			));
+
+			$this->build_func_array(
+				$func_code,
+				$this->funcs[$func_name]['data'],
+				$this->funcs[$func_name]['key_type'],
+				$this->funcs[$func_name]['val_type']
+			);
+
+			$func_code .= implode("\n", array(
+				"\t" . ');',
+				'}',
+			));
+		}
+
+		// Some final tidying.
+		$func_code = str_replace('\\\\x', '\x', $func_code);
+		$func_code = preg_replace('/\h+$/m', '', $func_code);
+
+		return array($func_code, $func_regex);
+	}
+
+	/**
+	 * Helper for get_function_code_and_regex(). Appends PHP source for $data to $func_code, recursing into nested arrays.
+	 *
+	 * @param string &$func_code The raw string that contains function code. Modified in place.
+	 * @param array $data Data to format as an array.
+	 * @param string $key_type How to format the array keys: 'hexchar', 'string', or anything else to write no keys.
+	 * @param string $val_type How to format the array values: 'hexchar', 'string', or anything else to write them as is.
+	 */
+	private function build_func_array(&$func_code, $data, $key_type, $val_type)
+	{
+		static $indent = 2;
+
+		foreach ($data as $key => $value)
+		{
+			$func_code .= str_repeat("\t", $indent);
+
+			if ($key_type == 'hexchar')
+			{
+				$func_code .= '"';
+				// Decode the numeric entities, then write each resulting byte as a \x escape.
+				$key = mb_decode_numericentity(str_replace(' ', '', $key), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');
+
+				foreach (unpack('C*', $key) as $byte_value)
+				{
+					$func_code .= '\x' . strtoupper(dechex($byte_value));
+				}
+
+				$func_code .= '" => ';
+			}
+			elseif ($key_type == 'string' && !is_int($key))
+			{
+				$func_code .= var_export($key, true) . ' => ';
+			}
+
+			if (is_array($value))
+			{
+				if ($val_type == 'string' && count($value) === count($value, COUNT_RECURSIVE))
+				{
+					$nextline = "\n" . str_repeat("\t", $indent + 1);
+					// A flat list of strings is written as one concatenated string, one piece per line.
+					$func_code = rtrim($func_code);
+
+					$func_code .= $nextline . implode(' .' . $nextline, array_map(
+						function ($v)
+						{
+							return var_export($v, true);
+						},
+						$value
+					));
+				}
+				else
+				{
+					$func_code .= 'array(' . "\n";
+					// Any other array is written recursively, one indent level deeper.
+					$indent++;
+					$this->build_func_array($func_code, $value, $key_type, $val_type);
+					$indent--;
+
+					$func_code .= str_repeat("\t", $indent) . ')';
+				}
+			}
+			elseif ($val_type == 'hexchar')
+			{
+				$func_code .= '"';
+
+				$value = mb_decode_numericentity(str_replace(' ', '', $value), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');
+				foreach (unpack('C*', $value) as $byte_value)
+				{
+					$func_code .= '\x' . strtoupper(dechex($byte_value));
+				}
+
+				$func_code .= '"';
+			}
+			elseif ($val_type == 'string')
+			{
+				$func_code .= var_export($value, true);
+			}
+			else
+			{
+				$func_code .= $value;
+			}
+
+			$func_code .= ',' . "\n";
+		}
+	}
+
+	/**
+	 * Compares version of SMF's local Unicode data with the latest release.
+	 *
+	 * @return bool Whether SMF should update its local Unicode data or not.
+	 */
+	private function should_update()
+	{
+		$this->lookup_ucd_version();
+
+		// We can't do anything if lookup failed.
+		if (empty($this->ucd_version))
+			return false;
+
+		require_once($this->unicodedir . DIRECTORY_SEPARATOR . 'Metadata.php');
+
+		// Update only when the latest release is newer than our local data.
+		return version_compare($this->ucd_version, SMF_UNICODE_VERSION, '>');
+	}
+
+	/**
+	 * Finds the version of the latest Unicode release and sets $this->ucd_version.
+	 *
+	 * @return bool Whether the version of the latest release could be determined.
+	 */
+	private function lookup_ucd_version()
+	{
+		// Nothing to do if we already know the version.
+		if (!empty($this->ucd_version))
+			return true;
+
+		$local_file = $this->fetch_unicode_file('ReadMe.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		preg_match('/Version\s+(\d+(?:\.\d+)*)/', file_get_contents($local_file), $matches);
+
+		if (empty($matches[1]))
+			return false;
+
+		// Pad the version to four components (e.g. 15.0.0 becomes 15.0.0.0).
+		$this->ucd_version = implode('.', array_pad(explode('.', $matches[1]), 4, '0'));
+
+		// Update this while we are at it.
+		foreach ($this->funcs as $func_name => $func_info)
+		{
+			if ($func_info['file'] === 'Metadata.php')
+			{
+				$this->funcs[$func_name]['data'][0] = str_replace('0.0.0.0', $this->ucd_version, $func_info['data'][0]);
+
+				break;
+			}
+		}
+
+		return true;
+	}
+
+	/**
+	 * Reads DerivedNormalizationProps.txt and uses it to fill in
+	 * $this->derived_normalization_props.
+	 *
+	 * @return bool|null False if the data file could not be fetched.
+	 */
+	private function process_derived_normalization_props()
+	{
+		$local_file = $this->fetch_unicode_file('DerivedNormalizationProps.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			// Drop any trailing comment, then ignore lines that hold no data.
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+			{
+				continue;
+			}
+
+			$fields = array_map('trim', explode(';', $line));
+			$prop = $fields[1];
+
+			if (!isset($this->derived_normalization_props[$prop]))
+			{
+				$this->derived_normalization_props[$prop] = array();
+			}
+
+			// The first field is either a single code point or a range of them.
+			if (strpos($fields[0], '..') !== false)
+			{
+				list($first, $last) = explode('..', $fields[0]);
+				$last_ord = hexdec($last);
+
+				$entities = array();
+				for ($ord = hexdec($first); $ord <= $last_ord; $ord++)
+				{
+					$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
+				}
+			}
+			else
+			{
+				$entities = array('&#x' . $fields[0] . ';');
+			}
+
+			// No third field means that the character maps to itself.
+			if (!isset($fields[2]))
+			{
+				$value = 'SAME';
+			}
+			elseif (!in_array($prop, array('FC_NFKC', 'NFKC_CF')))
+			{
+				$value = $fields[2];
+			}
+			elseif ($fields[2] === '')
+			{
+				$value = '';
+			}
+			else
+			{
+				$value = '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
+			}
+
+			foreach ($entities as $entity)
+			{
+				$this->derived_normalization_props[$prop][$entity] = $value === 'SAME' ? $entity : $value;
+			}
+		}
+	}
+
+	/**
+	 * Processes UnicodeData.txt in order to populate $this->char_data,
+	 * $this->full_decomposition_maps, and the 'data' element of most elements
+	 * of $this->funcs.
+	 */
+	private function process_main_unicode_data()
+	{
+		$local_file = $this->fetch_unicode_file('UnicodeData.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$fields = array_map('trim', explode(';', $line));
+
+			// Skip blank or malformed lines. Every record in UnicodeData.txt
+			// has 15 fields, and we read fields up to index 14 below.
+			if (count($fields) < 15)
+				continue;
+
+			if (!empty($fields[3]))
+			{
+				$this->funcs['utf8_combining_classes']['data']['&#x' . $fields[0] . ';'] = $fields[3];
+			}
+
+			// Uppercase maps.
+			if ($fields[12] !== '')
+			{
+				$this->funcs['utf8_strtoupper_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[12] . ';';
+			}
+
+			// Lowercase maps.
+			if ($fields[13] !== '')
+			{
+				$this->funcs['utf8_strtolower_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[13] . ';';
+			}
+
+			// Titlecase maps, where different from uppercase maps.
+			if ($fields[14] !== '' && $fields[14] !== $fields[12])
+			{
+				$this->funcs['utf8_titlecase_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[14] . ';';
+			}
+
+			// Remember this character's general category for later.
+			$this->char_data['&#x' . $fields[0] . ';']['General_Category'] = $fields[2];
+
+			if ($fields[5] === '')
+			{
+				continue;
+			}
+
+			// All canonical decompositions AND all compatibility decompositions.
+			$this->full_decomposition_maps['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';';
+
+			// Just the canonical decompositions.
+			if (strpos($fields[5], '<') === false)
+			{
+				$this->funcs['utf8_normalize_d_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . ';';
+			}
+		}
+	}
+
+	/**
+	 * Builds the final case conversion maps, using SpecialCasing.txt for the
+	 * full upper, lower, and title case maps and CaseFolding.txt for case folding.
+	 */
+	private function process_casing_data()
+	{
+		// Turns a space separated list of hex code points into a list of entities.
+		$to_entities = function ($hex)
+		{
+			return '&#x' . str_replace(' ', '; &#x', trim($hex)) . ';';
+		};
+
+		// The full maps start out as copies of the simple maps.
+		foreach (array('strtoupper', 'strtolower', 'titlecase') as $type)
+		{
+			$this->funcs['utf8_' . $type . '_maps']['data'] = $this->funcs['utf8_' . $type . '_simple_maps']['data'];
+		}
+
+		// Deal with the special casing data.
+		$local_file = $this->fetch_unicode_file('SpecialCasing.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+				continue;
+
+			$fields = array_map('trim', explode(';', $line));
+
+			// Conditional mappings need to be handled by more complex code,
+			// so only the unconditional ones are recorded here.
+			if (!empty($fields[4]))
+				continue;
+
+			$char = '&#x' . $fields[0] . ';';
+
+			$this->funcs['utf8_strtolower_maps']['data'][$char] = $to_entities($fields[1]);
+			$this->funcs['utf8_strtoupper_maps']['data'][$char] = $to_entities($fields[3]);
+
+			// Titlecase only where different from uppercase.
+			if ($fields[3] !== $fields[2])
+			{
+				$this->funcs['utf8_titlecase_maps']['data'][$char] = $to_entities($fields[2]);
+			}
+		}
+
+		ksort($this->funcs['utf8_strtolower_maps']['data']);
+		ksort($this->funcs['utf8_strtoupper_maps']['data']);
+		ksort($this->funcs['utf8_titlecase_maps']['data']);
+
+		// Deal with the case folding data.
+		$local_file = $this->fetch_unicode_file('CaseFolding.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+			{
+				continue;
+			}
+
+			$fields = array_map('trim', explode(';', $line));
+
+			$char = '&#x' . $fields[0] . ';';
+
+			// Full casefolding.
+			if (in_array($fields[1], array('C', 'F')))
+			{
+				$this->funcs['utf8_casefold_maps']['data'][$char] = $to_entities($fields[2]);
+			}
+
+			// Simple casefolding.
+			if (in_array($fields[1], array('C', 'S')))
+			{
+				$this->funcs['utf8_casefold_simple_maps']['data'][$char] = $to_entities($fields[2]);
+			}
+		}
+	}
+
+	/**
+	 * Finalizes all the decomposition forms, and builds the composition map.
+	 *
+	 * This is necessary because some characters decompose to other characters
+	 * that themselves decompose further.
+	 */
+	private function finalize_decomposition_forms()
+	{
+		// Iterate until a pass changes nothing, meaning we have the final decomposition forms.
+		// First we do the full (canonical and compatibility) decomposition forms.
+		$changed = true;
+		while ($changed)
+		{
+			$temp = array();
+			foreach ($this->full_decomposition_maps as $composed => $decomposed)
+			{
+				$parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed;
+				// Replace each part that has a decomposition of its own.
+				foreach ($parts as $partnum => $hex)
+				{
+					if (isset($this->full_decomposition_maps[$hex]))
+					{
+						$parts[$partnum] = $this->full_decomposition_maps[$hex];
+					}
+				}
+
+				$decomposed = implode(' ', $parts);
+				unset($parts);
+
+				$temp[$composed] = $decomposed;
+			}
+			// If this pass changed anything, another pass is needed.
+			$changed = $this->full_decomposition_maps !== $temp;
+
+			$this->full_decomposition_maps = $temp;
+		}
+
+		// Same as above, but using only canonical decompositions.
+		$changed = true;
+		$iteration = 0;
+		while ($changed)
+		{
+			$temp = array();
+			foreach ($this->funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
+			{
+				if ($iteration === 0 && !in_array($composed, $this->derived_normalization_props['Full_Composition_Exclusion']))
+				{
+					$this->funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
+				}
+				// The first pass recorded the unexpanded mapping as a composition, unless excluded. Now expand it.
+				$parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed;
+
+				foreach ($parts as $partnum => $hex)
+				{
+					if (isset($this->funcs['utf8_normalize_d_maps']['data'][$hex]))
+					{
+						$parts[$partnum] = $this->funcs['utf8_normalize_d_maps']['data'][$hex];
+					}
+				}
+
+				$decomposed = implode(' ', $parts);
+				unset($parts);
+
+				$temp[$composed] = $decomposed;
+			}
+
+			$changed = $this->funcs['utf8_normalize_d_maps']['data'] !== $temp;
+
+			$this->funcs['utf8_normalize_d_maps']['data'] = $temp;
+			$iteration++;
+		}
+
+		// Avoid bloat: keep only the mappings that differ from the canonical ones.
+		$this->funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($this->full_decomposition_maps, $this->funcs['utf8_normalize_d_maps']['data']);
+	}
+
+	/**
+	 * Builds regular expression classes for extended Unicode properties.
+	 *
+	 * Characters with the Default_Ignorable_Code_Point property are also
+	 * listed individually in the utf8_default_ignorables data.
+	 *
+	 * @return false|null False if a property file could not be fetched.
+	 */
+	private function build_regex_properties()
+	{
+		$wanted_props = $this->funcs['utf8_regex_properties']['props'];
+
+		foreach ($this->funcs['utf8_regex_properties']['propfiles'] as $propfile)
+		{
+			$path = $this->fetch_unicode_file($propfile, self::DATA_URL_UCD);
+
+			if (empty($path))
+				return false;
+
+			foreach (file($path) as $row)
+			{
+				// Strip the trailing comment. Rows without a semicolon hold no data.
+				$row = substr($row, 0, strcspn($row, '#'));
+
+				if (strpos($row, ';') === false)
+					continue;
+
+				$cols = array_map('trim', explode(';', $row));
+
+				$chars = $cols[0];
+				$prop = $cols[1];
+
+				// For the properties we want, a range such as 'XXXX..YYYY'
+				// is stored in the form '\x{XXXX}-\x{YYYY}'.
+				if (in_array($prop, $wanted_props))
+				{
+					$this->funcs['utf8_regex_properties']['data'][$prop][] = '\\x{' . str_replace('..', '}-\\x{', $chars) . '}';
+				}
+
+				// One property is also tracked in a separate array, with
+				// every character in a range getting an entry of its own.
+				if ($prop !== 'Default_Ignorable_Code_Point')
+					continue;
+
+				if (strpos($chars, '..') === false)
+				{
+					$this->funcs['utf8_default_ignorables']['data'][] = '&#x' . $chars . ';';
+					continue;
+				}
+
+				list($first, $last) = explode('..', $chars);
+
+				$last_ord = hexdec($last);
+
+				for ($ord = hexdec($first); $ord <= $last_ord; $ord++)
+				{
+					$entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
+
+					$this->funcs['utf8_default_ignorables']['data'][] = $entity;
+				}
+			}
+		}
+
+		// Sort the classes by property name.
+		ksort($this->funcs['utf8_regex_properties']['data']);
+	}
+
+	/**
+	 * Builds regular expression classes for filtering variation selectors. Returns false if a data file is unavailable.
+	 */
+	private function build_regex_variation_selectors()
+	{
+		$files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt');
+
+		foreach ($files as $filename)
+		{
+			$local_file = $this->fetch_unicode_file($filename, self::DATA_URL_UCD);
+
+			if (empty($local_file))
+				return false;
+
+			foreach (file($local_file) as $line)
+			{
+				$line = substr($line, 0, strcspn($line, '#'));
+
+				if (strpos($line, ';') === false)
+				{
+					continue;
+				}
+
+				$fields = explode(';', $line);
+
+				foreach ($fields as $key => $value)
+				{
+					$fields[$key] = trim($value);
+				}
+				// The first field holds a base character followed by a variation selector.
+				list($base_char, $variation_selector) = explode(' ', $fields[0]);
+
+				$this->funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
+			}
+		}
+
+		foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords)
+		{
+			$class_string = '';
+			// Merge runs of consecutive code points into ranges.
+			$current_range = array('start' => null, 'end' => null);
+			foreach ($ords as $ord)
+			{
+				if (!isset($current_range['start']))
+				{
+					$current_range['start'] = $ord;
+				}
+
+				if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
+				{
+					$current_range['end'] = $ord;
+					continue;
+				}
+				else
+				{
+					$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
+
+					if ($current_range['start'] != $current_range['end'])
+					{
+						$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
+					}
+
+					$current_range = array('start' => $ord, 'end' => $ord);
+				}
+			}
+			// Add the last range, which the loop above left pending.
+			if (isset($current_range['start']))
+			{
+				$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
+
+				if ($current_range['start'] != $current_range['end'])
+				{
+					$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
+				}
+			}
+
+			// Selectors with identical classes share one entry. As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
+			if (($identical = array_search($class_string, $this->funcs['utf8_regex_variation_selectors']['data'])) !== false)
+			{
+				unset(
+					$this->funcs['utf8_regex_variation_selectors']['data'][$identical],
+					$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
+				);
+
+				$compound_selector = array($identical, $variation_selector);
+				sort($compound_selector);
+
+				$variation_selector = implode('', $compound_selector);
+			}
+
+			$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string;
+		}
+		// Split each class string into its separate code points and ranges.
+		foreach ($this->funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $class_string)
+		{
+			$this->funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = preg_split('/(?<=})(?=\\\x{)/', $class_string);
+		}
+
+		krsort($this->funcs['utf8_regex_variation_selectors']['data']);
+	}
+
+	/**
+	 * Helper function for build_regex_joining_type and build_regex_indic.
+	 *
+	 * Records the scripts that each character belongs to in $this->char_data,
+	 * then records the age and size of each script in $this->script_stats.
+	 *
+	 * @return false|null False if a data file could not be fetched.
+	 */
+	private function build_script_stats()
+	{
+		// Map the script aliases used in ScriptExtensions.txt to the names used in Scripts.txt.
+		$local_file = $this->fetch_unicode_file('PropertyValueAliases.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+				continue;
+
+			$fields = explode(';', $line);
+
+			foreach ($fields as $key => $value)
+			{
+				$fields[$key] = trim($value);
+			}
+
+			if ($fields[0] !== 'sc')
+				continue;
+
+			$this->script_aliases[$fields[1]] = $fields[2];
+		}
+
+		// Find the script of each character, apart from Common and Inherited.
+		$local_file = $this->fetch_unicode_file('Scripts.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+				continue;
+
+			$fields = explode(';', $line);
+
+			foreach ($fields as $key => $value)
+			{
+				$fields[$key] = trim($value);
+			}
+
+			if (in_array($fields[1], array('Common', 'Inherited')))
+				continue;
+
+			if (strpos($fields[0], '..') === false)
+			{
+				$this->char_data['&#x' . $fields[0] . ';']['scripts'][] = $fields[1];
+			}
+			else
+			{
+				list($start, $end) = explode('..', $fields[0]);
+
+				$ord_s = hexdec($start);
+				$ord_e = hexdec($end);
+
+				$ord = $ord_s;
+				while ($ord <= $ord_e)
+				{
+					$this->char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $fields[1];
+				}
+			}
+		}
+
+		// Add the other scripts that each character is used with.
+		$local_file = $this->fetch_unicode_file('ScriptExtensions.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+				continue;
+
+			$fields = explode(';', $line);
+
+			foreach ($fields as $key => $value)
+			{
+				$fields[$key] = trim($value);
+			}
+
+			// Ignore Common and Inherited, as well as any alias that we don't recognize.
+			$char_scripts = array();
+			foreach (explode(' ', $fields[1]) as $alias)
+			{
+				if (isset($this->script_aliases[$alias]) && !in_array($this->script_aliases[$alias], array('Common', 'Inherited')))
+				{
+					$char_scripts[] = $this->script_aliases[$alias];
+				}
+			}
+
+			if (strpos($fields[0], '..') === false)
+			{
+				foreach ($char_scripts as $char_script)
+				{
+					$this->char_data['&#x' . $fields[0] . ';']['scripts'][] = $char_script;
+				}
+			}
+			else
+			{
+				list($start, $end) = explode('..', $fields[0]);
+
+				$ord_s = hexdec($start);
+				$ord_e = hexdec($end);
+
+				// Each character in the range gets every script, so advance once per character.
+				$ord = $ord_s;
+				while ($ord <= $ord_e)
+				{
+					$entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
+					foreach ($char_scripts as $char_script)
+					{
+						$this->char_data[$entity]['scripts'][] = $char_script;
+					}
+				}
+			}
+		}
+
+		// Finally, work out the age (the lowest of its characters' ages) and character count of each script.
+		$local_file = $this->fetch_unicode_file('DerivedAge.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+				continue;
+
+			$fields = explode(';', $line);
+
+			foreach ($fields as $key => $value)
+			{
+				$fields[$key] = trim($value);
+			}
+
+			$fields[1] = (float) $fields[1];
+
+			if (strpos($fields[0], '..') === false)
+			{
+				$entity = '&#x' . $fields[0] . ';';
+
+				if (empty($this->char_data[$entity]['scripts']))
+				{
+					continue;
+				}
+
+				foreach ($this->char_data[$entity]['scripts'] as $char_script)
+				{
+					if (!isset($this->script_stats[$char_script]))
+					{
+						$this->script_stats[$char_script]['age'] = $fields[1];
+						$this->script_stats[$char_script]['count'] = 1;
+					}
+					else
+					{
+						$this->script_stats[$char_script]['age'] = min($fields[1], $this->script_stats[$char_script]['age']);
+						$this->script_stats[$char_script]['count']++;
+					}
+				}
+			}
+			else
+			{
+				list($start, $end) = explode('..', $fields[0]);
+
+				$ord_s = hexdec($start);
+				$ord_e = hexdec($end);
+
+				$ord = $ord_s;
+				while ($ord <= $ord_e)
+				{
+					$entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
+
+					if (empty($this->char_data[$entity]['scripts']))
+					{
+						continue;
+					}
+
+					foreach ($this->char_data[$entity]['scripts'] as $char_script)
+					{
+						if (!isset($this->script_stats[$char_script]))
+						{
+							$this->script_stats[$char_script]['age'] = $fields[1];
+							$this->script_stats[$char_script]['count'] = 1;
+						}
+						else
+						{
+							$this->script_stats[$char_script]['age'] = min($fields[1], $this->script_stats[$char_script]['age']);
+							$this->script_stats[$char_script]['count']++;
+						}
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Builds regex classes for join control tests in utf8_sanitize_invisibles.
+	 * Specifically, for cursive scripts like Arabic.
+	 *
+	 * @return false|null False if the data file could not be fetched.
+	 */
+	private function build_regex_joining_type()
+	{
+		$local_file = $this->fetch_unicode_file('extracted/DerivedJoiningType.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+			{
+				continue;
+			}
+
+			$fields = explode(';', $line);
+
+			foreach ($fields as $key => $value)
+			{
+				$fields[$key] = trim($value);
+			}
+
+			switch ($fields[1])
+			{
+				case 'C':
+					$joining_type = 'Join_Causing';
+					break;
+
+				case 'D':
+					$joining_type = 'Dual_Joining';
+					break;
+
+				case 'R':
+					$joining_type = 'Right_Joining';
+					break;
+
+				case 'L':
+					$joining_type = 'Left_Joining';
+					break;
+
+				case 'T':
+					$joining_type = 'Transparent';
+					break;
+
+				default:
+					$joining_type = null;
+					break;
+			}
+
+			if (!isset($joining_type))
+			{
+				continue;
+			}
+
+			// For a range, the scripts of its first character are used.
+			$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';
+
+			if (empty($this->char_data[$entity]['scripts']))
+			{
+				continue;
+			}
+
+			foreach ($this->char_data[$entity]['scripts'] as $char_script)
+			{
+				if (!isset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
+				{
+					$this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $this->script_stats[$char_script];
+				}
+
+				if (!isset($this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]))
+				{
+					$this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array();
+				}
+
+				$this->funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
+			}
+		}
+
+		// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
+		uasort($this->funcs['utf8_regex_joining_type']['data'], function ($a, $b)
+		{
+			// Don't return the difference between the ages. It is a float, and uasort
+			// would truncate it to an integer, so that 5.1 and 5.2 would compare as equal.
+			if ($a['stats']['age'] != $b['stats']['age'])
+			{
+				return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
+			}
+
+			return $b['stats']['count'] - $a['stats']['count'];
+		});
+
+		foreach ($this->funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types)
+		{
+			// The stats were only needed for sorting.
+			unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']);
+
+			// If the only joining type in this script is transparent, we don't care about it.
+			if (array_keys($joining_types) === array('Transparent'))
+			{
+				unset($this->funcs['utf8_regex_joining_type']['data'][$char_script]);
+			}
+		}
+	}
+
+	/**
+	 * Builds regex classes for join control tests in utf8_sanitize_invisibles.
+	 * Specifically, for Indic scripts like Devanagari.
+	 *
+	 * @return false|null False if the data file could not be fetched.
+	 */
+	private function build_regex_indic()
+	{
+		$local_file = $this->fetch_unicode_file('IndicSyllabicCategory.txt', self::DATA_URL_UCD);
+
+		if (empty($local_file))
+			return false;
+
+		foreach (file($local_file) as $line)
+		{
+			$line = substr($line, 0, strcspn($line, '#'));
+
+			if (strpos($line, ';') === false)
+			{
+				continue;
+			}
+
+			$fields = explode(';', $line);
+
+			foreach ($fields as $key => $value)
+			{
+				$fields[$key] = trim($value);
+			}
+
+			$insc = $fields[1];
+
+			if (!in_array($insc, array('Virama', 'Vowel_Dependent')))
+				continue;
+
+			// For a range, the scripts of its first character are used.
+			$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';
+
+			if (empty($this->char_data[$entity]['scripts']))
+			{
+				continue;
+			}
+
+			foreach ($this->char_data[$entity]['scripts'] as $char_script)
+			{
+				if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats']))
+				{
+					$this->funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $this->script_stats[$char_script];
+				}
+
+				if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script][$insc]))
+				{
+					$this->funcs['utf8_regex_indic']['data'][$char_script][$insc] = array();
+				}
+
+				$this->funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
+			}
+		}
+		// Again, sort commonly used scripts before rare scripts.
+		uasort($this->funcs['utf8_regex_indic']['data'], function ($a, $b)
+		{
+			// Don't return the difference between the ages. It is a float, and uasort
+			// would truncate it to an integer, so that 5.1 and 5.2 would compare as equal.
+			if ($a['stats']['age'] != $b['stats']['age'])
+			{
+				return $a['stats']['age'] < $b['stats']['age'] ? -1 : 1;
+			}
+
+			return $b['stats']['count'] - $a['stats']['count'];
+		});
+		// We only want scripts with viramas.
+		foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
+		{
+			unset($this->funcs['utf8_regex_indic']['data'][$char_script]['stats'], $inscs['stats']);
+
+			if (!isset($inscs['Virama']))
+			{
+				unset($this->funcs['utf8_regex_indic']['data'][$char_script]);
+			}
+		}
+		// Now add some more classes that we need for each script.
+		foreach ($this->char_data as $entity => $info)
+		{
+			if (empty($info['scripts']))
+			{
+				continue;
+			}
+
+			$ord = hexdec(trim($entity, '&#x;'));
+
+			foreach ($info['scripts'] as $char_script)
+			{
+				if (!isset($this->funcs['utf8_regex_indic']['data'][$char_script]))
+				{
+					continue;
+				}
+
+				$this->funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord;
+
+				if (empty($info['General_Category']))
+				{
+					continue;
+				}
+				elseif ($info['General_Category'] == 'Mn')
+				{
+					$this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord;
+
+					if (!empty($this->funcs['utf8_combining_classes']['data'][$entity]))
+					{
+						$this->funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord;
+					}
+				}
+				elseif (substr($info['General_Category'], 0, 1) == 'L')
+				{
+					$this->funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord;
+				}
+			}
+		}
+		foreach ($this->funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
+		{
+			foreach ($inscs as $insc => $value)
+			{
+				sort($value);
+
+				if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark')))
+				{
+					continue;
+				}
+
+				$class_string = '';
+
+				$current_range = array('start' => null, 'end' => null);
+				foreach ($value as $ord)
+				{
+					if (!isset($current_range['start']))
+					{
+						$current_range['start'] = $ord;
+					}
+
+					if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
+					{
+						$current_range['end'] = $ord;
+						continue;
+					}
+					else
+					{
+						$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
+
+						if ($current_range['start'] != $current_range['end'])
+						{
+							$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
+						}
+
+						$current_range = array('start' => $ord, 'end' => $ord);
+					}
+				}
+
+				if (isset($current_range['start']))
+				{
+					$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
+
+					if ($current_range['start'] != $current_range['end'])
+					{
+						$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
+					}
+				}
+
+				$this->funcs['utf8_regex_indic']['data'][$char_script][$insc] = preg_split('/(?<=})(?=\\\x{)/', $class_string);
+			}
+
+			ksort($this->funcs['utf8_regex_indic']['data'][$char_script]);
+		}
+	}
+
+	/**
+	 * Builds maps and regex classes for IDNA purposes.
+	 *
+	 * @return false|null False if the mapping table could not be fetched.
+	 */
+	private function build_idna()
+	{
+		// Turns a space-separated string of hex code points into entities.
+		$to_entities = function ($hex)
+		{
+			return $hex === '' ? '' : '&#x' . str_replace(' ', '; &#x', $hex) . ';';
+		};
+
+		$table = $this->fetch_unicode_file('IdnaMappingTable.txt', self::DATA_URL_IDNA);
+
+		if (empty($table))
+			return false;
+
+		foreach (file($table) as $row)
+		{
+			$row = substr($row, 0, strcspn($row, '#'));
+
+			if (strpos($row, ';') === false)
+				continue;
+
+			// Leading zeros are removed from all the hex numbers.
+			$cols = array_map(function ($col)
+			{
+				return preg_replace('/\b(0(?!\b))+/', '', trim($col));
+			}, explode(';', $row));
+
+			$chars = $cols[0];
+			$status = $cols[1];
+
+			// List every character that this row covers.
+			$entities = array();
+
+			if (strpos($chars, '..') === false)
+				$entities[] = '&#x' . $chars . ';';
+			else
+			{
+				list($first, $last) = explode('..', $chars);
+				$last_ord = hexdec($last);
+				for ($ord = hexdec($first); $ord <= $last_ord; $ord++)
+					$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord))) . ';';
+			}
+
+			// The same characters, in the form used inside a regex character class.
+			$class = '\\x{' . str_replace('..', '}-\\x{', $chars) . '}';
+
+			switch ($status)
+			{
+				case 'mapped':
+					foreach ($entities as $entity)
+						$this->funcs['idna_maps']['data'][$entity] = $to_entities($cols[2]);
+					break;
+
+				case 'deviation':
+					foreach ($entities as $entity)
+						$this->funcs['idna_maps_deviation']['data'][$entity] = $to_entities($cols[2]);
+					$this->funcs['idna_regex']['data']['deviation'][] = $class;
+					break;
+
+				case 'ignored':
+					$this->funcs['idna_regex']['data']['ignored'][] = $class;
+					break;
+
+				case 'disallowed':
+					if (!in_array('&#xD800;', $entities))
+						$this->funcs['idna_regex']['data']['disallowed'][] = $class;
+					break;
+
+				case 'disallowed_STD3_mapped':
+					foreach ($entities as $entity)
+						$this->funcs['idna_maps_not_std3']['data'][$entity] = $to_entities($cols[2]);
+					// no break: both STD3 statuses go into the same regex class.
+				case 'disallowed_STD3_valid':
+					$this->funcs['idna_regex']['data']['disallowed_std3'][] = $class;
+					break;
+			}
+		}
+	}
+}
+
+?>
\ No newline at end of file
diff --git a/other/update_unicode_data.php b/other/update_unicode_data.php
index 4134d4cfb4..ceaf5a29cc 100644
--- a/other/update_unicode_data.php
+++ b/other/update_unicode_data.php
@@ -5,10 +5,20 @@
  * any SMF distribution packages.
  *
  * This file exists to make it easy for developers to update the
- * Unicode data in Subs-Charset.php whenever a new version of the
+ * Unicode data in $sourcedir/Unicode whenever a new version of the
  * Unicode Character Database is released. Just run this file from the
  * command line in order to perform the update.
  *
+ * Note:
+ *
+ *  1. Any updates to the Unicode data files SHOULD be included in the
+ *     install and large upgrade packages.
+ *
+ *  2. Any updates to the Unicode data files SHOULD NOT be included in
+ *     the patch packages. The Update_Unicode background task will take
+ *     care of that on existing forums.
+ *
+ *
  * Simple Machines Forum (SMF)
  *
  * @package SMF
@@ -16,1258 +26,43 @@
  * @copyright 2022 Simple Machines and individual contributors
  * @license https://www.simplemachines.org/about/smf/license.php BSD
  *
- * @version 2.1.2
+ * @version 2.1.3
  */
 
-$unicode_data_url = 'https://unicode.org/Public/UCD/latest/ucd';
-$idna_data_url = 'https://www.unicode.org/Public/idna/latest';
-
-$sourcedir = realpath(dirname(__DIR__) . '/Sources');
-$unicodedir = $sourcedir . '/Unicode';
-
-$full_decomposition_maps = array();
-$funcs = array(
-	'utf8_normalize_d_maps' => array(
-		'file' => 'DecompositionCanonical.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_normalize_kd_maps' => array(
-		'file' => 'DecompositionCompatibility.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_compose_maps' => array(
-		'file' => 'Composition.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_combining_classes' => array(
-		'file' => 'CombiningClasses.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'int',
-		'data' => array(),
-	),
-	'utf8_strtolower_simple_maps' => array(
-		'file' => 'CaseLower.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_strtolower_maps' => array(
-		'file' => 'CaseLower.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_strtoupper_simple_maps' => array(
-		'file' => 'CaseUpper.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_strtoupper_maps' => array(
-		'file' => 'CaseUpper.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_titlecase_simple_maps' => array(
-		'file' => 'CaseTitle.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_titlecase_maps' => array(
-		'file' => 'CaseTitle.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_casefold_simple_maps' => array(
-		'file' => 'CaseFold.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_casefold_maps' => array(
-		'file' => 'CaseFold.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_default_ignorables' => array(
-		'file' => 'DefaultIgnorables.php',
-		'key_type' => 'int',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'utf8_regex_properties' => array(
-		'file' => 'RegularExpressions.php',
-		'key_type' => 'string',
-		'val_type' => 'string',
-		'propfiles' => array(
-			'DerivedCoreProperties.txt',
-			'PropList.txt',
-			'emoji/emoji-data.txt',
-			'extracted/DerivedGeneralCategory.txt',
-		),
-		'props' => array(
-			'Bidi_Control',
-			'Case_Ignorable',
-			'Cn',
-			'Default_Ignorable_Code_Point',
-			'Emoji',
-			'Emoji_Modifier',
-			'Ideographic',
-			'Join_Control',
-			'Regional_Indicator',
-			'Variation_Selector',
-		),
-		'data' => array(),
-	),
-	'utf8_regex_variation_selectors' => array(
-		'file' => 'RegularExpressions.php',
-		'key_type' => 'string',
-		'val_type' => 'string',
-		'data' => array(),
-	),
-	'utf8_regex_joining_type' => array(
-		'file' => 'RegularExpressions.php',
-		'key_type' => 'string',
-		'val_type' => 'string',
-		'data' => array(),
-	),
-	'utf8_regex_indic' => array(
-		'file' => 'RegularExpressions.php',
-		'key_type' => 'string',
-		'val_type' => 'string',
-		'data' => array(),
-	),
-	'idna_maps' => array(
-		'file' => 'Idna.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'idna_maps_deviation' => array(
-		'file' => 'Idna.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'idna_maps_not_std3' => array(
-		'file' => 'Idna.php',
-		'key_type' => 'hexchar',
-		'val_type' => 'hexchar',
-		'data' => array(),
-	),
-	'idna_regex' => array(
-		'file' => 'Idna.php',
-		'key_type' => 'string',
-		'val_type' => 'string',
-		'data' => array(),
-	),
-);
-
-foreach ($funcs as $func_name => $func_info)
-{
-	if (!is_file($unicodedir . '/' . $func_info['file']) || !is_writable($unicodedir . '/' . $func_info['file']))
-	{
-		die($unicodedir . '/' . $func_info['file'] . ' not found or not writable.');
-	}
-}
-
-@ini_set('memory_limit', '256M');
-
-/*********************************************
- * Part 1: Normalization, case folding, etc. *
- *********************************************/
-
-// We need some of these for further analysis below.
-$derived_normalization_props = array();
-$unicode_version = '';
-foreach (file($unicode_data_url . '/DerivedNormalizationProps.txt') as $line)
-{
-	if ($unicode_version === '' && preg_match('/(\d+\.\d+\.\d+(?:\.\d+)?)\.txt$/', $line, $matches))
-	{
-		$unicode_version = implode('.', array_pad(explode('.', $matches[1]), 4, '0'));
-
-		$file_contents = file_get_contents($unicodedir . '/Metadata.php');
-		$file_contents = preg_replace(
-			"~\bdefine\('SMF_UNICODE_VERSION', '[^']+'\)~",
-			"define('SMF_UNICODE_VERSION', '" . $unicode_version . "')",
-			$file_contents
-		);
-		file_put_contents($unicodedir . '/Metadata.php', $file_contents);
-	}
-
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	if (!isset($derived_normalization_props[$fields[1]]))
-	{
-		$derived_normalization_props[$fields[1]] = array();
-	}
-
-	if (strpos($fields[0], '..') === false)
-	{
-		$entities = array('&#x' . $fields[0] . ';');
-	}
-	else
-	{
-		$entities = array();
-
-		list($start, $end) = explode('..', $fields[0]);
-
-		$ord_s = hexdec($start);
-		$ord_e = hexdec($end);
-
-		$ord = $ord_s;
-		while ($ord <= $ord_e)
-		{
-			$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
-		}
-	}
-
-	$value = '';
-	if (!isset($fields[2]))
-	{
-		$value = 'SAME';
-	}
-	elseif (in_array($fields[1], array('FC_NFKC', 'NFKC_CF')))
-	{
-		$value = trim($fields[2]) !== '' ? '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';' : '';
-	}
-	else
-	{
-		$value = $fields[2];
-	}
-
-	foreach ($entities as $entity)
-	{
-		$derived_normalization_props[$fields[1]][$entity] = $value === 'SAME' ? $entity : $value;
-	}
-}
-
-// Go through all the characters in the Unicode database.
-$char_data = array();
-foreach (file($unicode_data_url . '/UnicodeData.txt') as $line)
-{
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	if (!empty($fields[3]))
-	{
-		$funcs['utf8_combining_classes']['data']['&#x' . $fields[0] . ';'] = $fields[3];
-	}
-
-	// Uppercase maps.
-	if ($fields[12] !== '')
-	{
-		$funcs['utf8_strtoupper_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[12] . ';';
-	}
-
-	// Lowercase maps.
-	if ($fields[13] !== '')
-	{
-		$funcs['utf8_strtolower_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[13] . ';';
-	}
-
-	// Titlecase maps, where different from uppercase maps.
-	if ($fields[14] !== '' && $fields[14] !== $fields[12])
-	{
-		$funcs['utf8_titlecase_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . $fields[14] . ';';
-	}
-
-	// Remember this character's general category for later.
-	$char_data['&#x' . $fields[0] . ';']['General_Category'] = $fields[2];
-
-	if ($fields[5] === '')
-	{
-		continue;
-	}
-
-	// All canonical decompositions AND all compatibility decompositions.
-	$full_decomposition_maps['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim(strip_tags($fields[5]))) . ';';
-
-	// Just the canonical decompositions.
-	if (strpos($fields[5], '<') === false)
-	{
-		$funcs['utf8_normalize_d_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . ';';
-	}
-}
-
-// Full case conversion maps
-$funcs['utf8_strtoupper_maps']['data'] = $funcs['utf8_strtoupper_simple_maps']['data'];
-$funcs['utf8_strtolower_maps']['data'] = $funcs['utf8_strtolower_simple_maps']['data'];
-$funcs['utf8_titlecase_maps']['data'] = $funcs['utf8_titlecase_simple_maps']['data'];
-foreach (file($unicode_data_url . '/SpecialCasing.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	// Unconditional mappings.
-	// Note: conditional mappings need to be handled by more complex code.
-	if (empty($fields[4]))
-	{
-		$funcs['utf8_strtolower_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[1])) . ';';
-
-		$funcs['utf8_strtoupper_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[3])) . ';';
-
-		// Titlecase only where different from uppercase.
-		if ($fields[3] !== $fields[2])
-		{
-			$funcs['utf8_titlecase_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
-		}
-	}
-}
-ksort($funcs['utf8_strtolower_maps']['data']);
-ksort($funcs['utf8_strtoupper_maps']['data']);
-ksort($funcs['utf8_titlecase_maps']['data']);
-
-foreach (file($unicode_data_url . '/CaseFolding.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	// Full casefolding.
-	if (in_array($fields[1], array('C', 'F')))
-	{
-		$funcs['utf8_casefold_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
-	}
-
-	// Simple casefolding.
-	if (in_array($fields[1], array('C', 'S')))
-		$funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
-}
-
-// Recursively iterate until we reach the final decomposition forms.
-// This is necessary because some characters decompose to other characters that
-// themselves decompose further.
-$changed = true;
-while ($changed)
-{
-	$temp = array();
-	foreach ($full_decomposition_maps as $composed => $decomposed)
-	{
-		$parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed;
-
-		foreach ($parts as $partnum => $hex)
-		{
-			if (isset($full_decomposition_maps[$hex]))
-			{
-				$parts[$partnum] = $full_decomposition_maps[$hex];
-			}
-		}
-
-		$decomposed = implode(' ', $parts);
-		unset($parts);
-
-		$temp[$composed] = $decomposed;
-	}
-
-	$changed = $full_decomposition_maps !== $temp;
-
-	$full_decomposition_maps = $temp;
-}
-
-// Same as above, but using only canonical decompositions.
-$changed = true;
-$iteration = 0;
-while ($changed)
-{
-	$temp = array();
-	foreach ($funcs['utf8_normalize_d_maps']['data'] as $composed => $decomposed)
-	{
-		if ($iteration === 0 && !in_array($composed, $derived_normalization_props['Full_Composition_Exclusion']))
-		{
-			$funcs['utf8_compose_maps']['data'][$decomposed] = $composed;
-		}
-
-		$parts = strpos($decomposed, ' ') !== false ? explode(' ', $decomposed) : (array) $decomposed;
-
-		foreach ($parts as $partnum => $hex)
-		{
-			if (isset($funcs['utf8_normalize_d_maps']['data'][$hex]))
-			{
-				$parts[$partnum] = $funcs['utf8_normalize_d_maps']['data'][$hex];
-			}
-		}
-
-		$decomposed = implode(' ', $parts);
-		unset($parts);
-
-		$temp[$composed] = $decomposed;
-	}
-
-	$changed = $funcs['utf8_normalize_d_maps']['data'] !== $temp;
-
-	$funcs['utf8_normalize_d_maps']['data'] = $temp;
-	$iteration++;
-}
-
-$funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($full_decomposition_maps, $funcs['utf8_normalize_d_maps']['data']);
-unset($full_decomposition_maps, $derived_normalization_props);
-
-// Now update the files with the data we've got so far.
-foreach ($funcs as $func_name => $func_info)
-{
-	if (empty($func_info['data']))
-	{
-		continue;
-	}
-
-	export_func_to_file($func_name, $func_info);
-
-	// Free up some memory.
-	if ($func_name != 'utf8_combining_classes')
-	{
-		unset($funcs[$func_name]);
-	}
-}
-
-/***********************************
- * Part 2: Regular expression data *
- ***********************************/
-
-// Build regular expression classes for extended Unicode properties.
-foreach ($funcs['utf8_regex_properties']['propfiles'] as $filename)
-{
-	foreach (file($unicode_data_url . '/' . $filename) as $line)
-	{
-		$line = substr($line, 0, strcspn($line, '#'));
-
-		if (strpos($line, ';') === false)
-		{
-			continue;
-		}
-
-		$fields = explode(';', $line);
-
-		foreach ($fields as $key => $value)
-		{
-			$fields[$key] = trim($value);
-		}
-
-		if (in_array($fields[1], $funcs['utf8_regex_properties']['props']))
-		{
-			if (!isset($funcs['utf8_regex_properties']['data'][$fields[1]]))
-			{
-				$funcs['utf8_regex_properties']['data'][$fields[1]] = array();
-			}
-
-			$funcs['utf8_regex_properties']['data'][$fields[1]][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-		}
-
-		// We also track 'Default_Ignorable_Code_Point' property in a separate array.
-		if ($fields[1] !== 'Default_Ignorable_Code_Point')
-		{
-			continue;
-		}
-
-		if (strpos($fields[0], '..') === false)
-		{
-			$funcs['utf8_default_ignorables']['data'][] = '&#x' . $fields[0] . ';';
-		}
-		else
-		{
-			list($start, $end) = explode('..', $fields[0]);
-
-			$ord_s = hexdec($start);
-			$ord_e = hexdec($end);
-
-			$ord = $ord_s;
-			while ($ord <= $ord_e)
-			{
-				$funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
-			}
-		}
-	}
-}
-ksort($funcs['utf8_regex_properties']['data']);
-
-// Build regular expression classes for filtering variation selectors.
-$files = array('StandardizedVariants.txt', 'emoji/emoji-variation-sequences.txt');
-foreach ($files as $filename)
-{
-	foreach (file($unicode_data_url . '/' . $filename) as $line)
-	{
-		$line = substr($line, 0, strcspn($line, '#'));
-
-		if (strpos($line, ';') === false)
-		{
-			continue;
-		}
-
-		$fields = explode(';', $line);
-
-		foreach ($fields as $key => $value)
-		{
-			$fields[$key] = trim($value);
-		}
-
-		list($base_char, $variation_selector) = explode(' ', $fields[0]);
-
-		$funcs['utf8_regex_variation_selectors']['data']['\\x{' . $variation_selector . '}'][] = hexdec($base_char);
-	}
-}
-foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $ords)
-{
-	$class_string = '';
-
-	$current_range = array('start' => null, 'end' => null);
-	foreach ($ords as $ord)
-	{
-		if (!isset($current_range['start']))
-		{
-			$current_range['start'] = $ord;
-		}
-
-		if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
-		{
-			$current_range['end'] = $ord;
-			continue;
-		}
-		else
-		{
-			$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
-
-			if ($current_range['start'] != $current_range['end'])
-			{
-				$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
-			}
-
-			$current_range = array('start' => $ord, 'end' => $ord);
-		}
-	}
-
-	if (isset($current_range['start']))
-	{
-		$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
-
-		if ($current_range['start'] != $current_range['end'])
-		{
-			$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
-		}
-	}
-
-	// As of Unicode 14.0, \x{FE0E} and \x{FE0F} work with identical ranges of base characters.
-	if (($identical = array_search($class_string, $funcs['utf8_regex_variation_selectors']['data'])) !== false)
-	{
-		unset(
-			$funcs['utf8_regex_variation_selectors']['data'][$identical],
-			$funcs['utf8_regex_variation_selectors']['data'][$variation_selector]
-		);
-
-		$compound_selector = array($identical, $variation_selector);
-		sort($compound_selector);
-
-		$variation_selector = implode('', $compound_selector);
-	}
-
-	$funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = $class_string;
-}
-foreach ($funcs['utf8_regex_variation_selectors']['data'] as $variation_selector => $class_string)
-{
-	$funcs['utf8_regex_variation_selectors']['data'][$variation_selector] = preg_split('/(?<=})(?=\\\x{)/', $class_string);
-}
-krsort($funcs['utf8_regex_variation_selectors']['data']);
-
-// The regex classes for join control tests require info about language scripts.
-$script_stats = array();
-$script_aliases = array();
-foreach (file($unicode_data_url . '/PropertyValueAliases.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	if ($fields[0] !== 'sc')
-	{
-		continue;
-	}
-
-	$script_aliases[$fields[1]] = $fields[2];
-}
-foreach (file($unicode_data_url . '/Scripts.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	if (in_array($fields[1], array('Common', 'Inherited')))
-	{
-		continue;
-	}
-
-	if (strpos($fields[0], '..') === false)
-	{
-		$char_data['&#x' . $fields[0] . ';']['scripts'][] = $fields[1];
-	}
-	else
-	{
-		list($start, $end) = explode('..', $fields[0]);
+// 1. Set a couple of variables that we'll need.
+$boarddir = realpath(dirname(__DIR__));
+$sourcedir = $boarddir . '/Sources';
 
-		$ord_s = hexdec($start);
-		$ord_e = hexdec($end);
+// 2. Borrow a bit of stuff from cron.php.
+$cron_php_start = file_get_contents($boarddir . '/cron.php', false, null, 0, 4096);
 
-		$ord = $ord_s;
-		while ($ord <= $ord_e)
-		{
-			$char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $fields[1];
-		}
-	}
-}
-foreach (file($unicode_data_url . '/ScriptExtensions.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	$char_scripts = array();
-	foreach (explode(' ', $fields[1]) as $alias)
-	{
-		if (!in_array($script_aliases[$alias], array('Common', 'Inherited')))
-		{
-			$char_scripts[] = $script_aliases[$alias];
-		}
-	}
-
-	if (strpos($fields[0], '..') === false)
-	{
-		foreach ($char_scripts as $char_script)
-		{
-			$char_data['&#x' . $fields[0] . ';']['scripts'][] = $char_script;
-		}
-	}
-	else
-	{
-		list($start, $end) = explode('..', $fields[0]);
-
-		$ord_s = hexdec($start);
-		$ord_e = hexdec($end);
-
-		$ord = $ord_s;
-		while ($ord <= $ord_e)
-		{
-			foreach ($char_scripts as $char_script)
-			{
-				$char_data['&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';']['scripts'][] = $char_script;
-			}
-		}
-	}
-}
-foreach (file($unicode_data_url . '/DerivedAge.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	$fields[1] = (float) $fields[1];
-
-	if (strpos($fields[0], '..') === false)
-	{
-		$entity = '&#x' . $fields[0] . ';';
-
-		if (empty($char_data[$entity]['scripts']))
-		{
-			continue;
-		}
-
-		foreach ($char_data[$entity]['scripts'] as $char_script)
-		{
-			if (!isset($script_stats[$char_script]))
-			{
-				$script_stats[$char_script]['age'] = (float) $fields[1];
-				$script_stats[$char_script]['count'] = 1;
-			}
-			else
-			{
-				$script_stats[$char_script]['age'] = min((float) $fields[1], $script_stats[$char_script]['age']);
-				$script_stats[$char_script]['count']++;
-			}
-		}
-	}
-	else
-	{
-		list($start, $end) = explode('..', $fields[0]);
-
-		$ord_s = hexdec($start);
-		$ord_e = hexdec($end);
-
-		$ord = $ord_s;
-		while ($ord <= $ord_e)
-		{
-			$entity = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
-
-			if (empty($char_data[$entity]['scripts']))
-			{
-				continue;
-			}
-
-			foreach ($char_data[$entity]['scripts'] as $char_script)
-			{
-				if (!isset($script_stats[$char_script]))
-				{
-					$script_stats[$char_script]['age'] = $fields[1];
-					$script_stats[$char_script]['count'] = 1;
-				}
-				else
-				{
-					$script_stats[$char_script]['age'] = min($fields[1], $script_stats[$char_script]['age']);
-					$script_stats[$char_script]['count']++;
-				}
-			}
-		}
-	}
-}
-
-// Build regex classes for join control tests in utf8_sanitize_invisibles:
-// 1. Cursive scripts like Arabic.
-foreach (file($unicode_data_url . '/extracted/DerivedJoiningType.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	switch ($fields[1])
-	{
-		case 'C':
-			$joining_type = 'Join_Causing';
-			break;
-
-		case 'D':
-			$joining_type = 'Dual_Joining';
-			break;
-
-		case 'R':
-			$joining_type = 'Right_Joining';
-			break;
-
-		case 'L':
-			$joining_type = 'Left_Joining';
-			break;
-
-		case 'T':
-			$joining_type = 'Transparent';
-			break;
-
-		default:
-			$joining_type = null;
-			break;
-	}
-
-	if (!isset($joining_type))
-	{
-		continue;
-	}
-
-	$entity = '&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';';
-
-	if (empty($char_data[$entity]['scripts']))
-	{
-		continue;
-	}
-
-	foreach ($char_data[$entity]['scripts'] as $char_script)
-	{
-		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats']))
-		{
-			$funcs['utf8_regex_joining_type']['data'][$char_script]['stats'] = $script_stats[$char_script];
-		}
-
-		if (!isset($funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type]))
-		{
-			$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type] = array();
-		}
-
-		$funcs['utf8_regex_joining_type']['data'][$char_script][$joining_type][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-	}
-}
-// This sort works decently well to ensure widely used scripts are ranked before rare scripts.
-uasort($funcs['utf8_regex_joining_type']['data'], function ($a, $b)
-{
-	if ($a['stats']['age'] == $b['stats']['age'])
-	{
-		return $b['stats']['count'] - $a['stats']['count'];
-	}
-	else
-	{
-		return $a['stats']['age'] - $b['stats']['age'];
-	}
-});
-foreach ($funcs['utf8_regex_joining_type']['data'] as $char_script => $joining_types)
+foreach (array('SMF', 'SMF_VERSION', 'SMF_SOFTWARE_YEAR') as $const)
 {
-	unset($funcs['utf8_regex_joining_type']['data'][$char_script]['stats'], $joining_types['stats']);
+	preg_match("/define\('$const', '([^)]+)'\);/", $cron_php_start, $matches);
 
-	// If the only joining type in this script is transparent, we don't care about it.
-	if (array_keys($joining_types) === array('Transparent'))
-	{
-		unset($funcs['utf8_regex_joining_type']['data'][$char_script]);
-		continue;
-	}
+	if (empty($matches[1]))
+		die("Could not find value for $const in cron.php");
 
-	foreach ($joining_types as $joining_type => $value)
-	{
-		sort($value);
-	}
+	define($const, $matches[1]);
 }
 
-// 2. Indic scripts like Devanagari.
-foreach (file($unicode_data_url . '/IndicSyllabicCategory.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = trim($value);
-	}
-
-	$insc = $fields[1];
-
-	if (!in_array($insc, array('Virama', 'Vowel_Dependent')))
-	{
-		continue;
-	}
+define('SMF_USER_AGENT', 'SMF');
+define('MAX_CLAIM_THRESHOLD', 300);
+define('TIME_START', microtime(true));
 
-	$char_scripts = $char_data['&#x' . substr($fields[0], 0, strcspn($fields[0], '.')) . ';']['scripts'];
-
-	if (empty($char_scripts))
-	{
-		continue;
-	}
-
-	foreach ($char_scripts as $char_script)
-	{
-		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]['stats']))
-		{
-			$funcs['utf8_regex_indic']['data'][$char_script]['stats'] = $script_stats[$char_script];
-		}
-
-		if (!isset($funcs['utf8_regex_indic']['data'][$char_script][$insc]))
-		{
-			$funcs['utf8_regex_indic']['data'][$char_script][$insc] = array();
-		}
-
-		$funcs['utf8_regex_indic']['data'][$char_script][$insc][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-	}
-}
-// Again, sort commonly used scripts before rare scripts.
-uasort($funcs['utf8_regex_indic']['data'], function ($a, $b)
-{
-	if ($a['stats']['age'] == $b['stats']['age'])
-	{
-		return $b['stats']['count'] - $a['stats']['count'];
-	}
-	else
-	{
-		return $a['stats']['age'] - $b['stats']['age'];
-	}
-});
-// We only want scripts with viramas.
-foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
-{
-	unset($funcs['utf8_regex_indic']['data'][$char_script]['stats'], $inscs['stats']);
-
-	if (!isset($inscs['Virama']))
-	{
-		unset($funcs['utf8_regex_indic']['data'][$char_script]);
-		continue;
-	}
-}
-// Now add some more classes that we need for each script.
-foreach ($char_data as $entity => $info)
+abstract class SMF_BackgroundTask
 {
-	if (empty($info['scripts']))
-	{
-		continue;
-	}
-
-	$ord = hexdec(trim($entity, '&#x;'));
-
-	foreach ($info['scripts'] as $char_script)
-	{
-		if (!isset($funcs['utf8_regex_indic']['data'][$char_script]))
-		{
-			continue;
-		}
-
-		$funcs['utf8_regex_indic']['data'][$char_script]['All'][] = $ord;
-
-		if (empty($info['General_Category']))
-		{
-			continue;
-		}
-		elseif ($info['General_Category'] == 'Mn')
-		{
-			$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Mark'][] = $ord;
-
-			if (!empty($funcs['utf8_combining_classes']['data'][$entity]))
-			{
-				$funcs['utf8_regex_indic']['data'][$char_script]['Nonspacing_Combining_Mark'][] = $ord;
-			}
-		}
-		elseif (substr($info['General_Category'], 0, 1) == 'L')
-		{
-			$funcs['utf8_regex_indic']['data'][$char_script]['Letter'][] = $ord;
-		}
-	}
+	abstract public function execute();
 }
-foreach ($funcs['utf8_regex_indic']['data'] as $char_script => $inscs)
-{
-	foreach ($inscs as $insc => $value)
-	{
-		sort($value);
-
-		if (!in_array($insc, array('All', 'Letter', 'Nonspacing_Mark', 'Nonspacing_Combining_Mark')))
-		{
-			continue;
-		}
 
-		$class_string = '';
+// This should never be needed, but set it for completeness.
+$smcFunc['db_insert'] = function($method, $table, $columns, $data, $keys, $returnmode = 0, $connection = null) {};
 
-		$current_range = array('start' => null, 'end' => null);
-		foreach ($value as $ord)
-		{
-			if (!isset($current_range['start']))
-			{
-				$current_range['start'] = $ord;
-			}
+// 3. Do the job.
+require_once($sourcedir . '/Subs.php');
+require_once($sourcedir . '/tasks/UpdateUnicode.php');
 
-			if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
-			{
-				$current_range['end'] = $ord;
-				continue;
-			}
-			else
-			{
-				$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
-
-				if ($current_range['start'] != $current_range['end'])
-				{
-					$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
-				}
-
-				$current_range = array('start' => $ord, 'end' => $ord);
-			}
-		}
-
-		if (isset($current_range['start']))
-		{
-			$class_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
-
-			if ($current_range['start'] != $current_range['end'])
-			{
-				$class_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
-			}
-		}
-
-		$funcs['utf8_regex_indic']['data'][$char_script][$insc] = preg_split('/(?<=})(?=\\\x{)/', $class_string);
-	}
-
-	ksort($funcs['utf8_regex_indic']['data'][$char_script]);
-}
-unset($funcs['utf8_combining_classes']);
-
-foreach ($funcs as $func_name => $func_info)
-{
-	if (empty($func_info['data']))
-	{
-		continue;
-	}
-
-	export_func_to_file($func_name, $func_info);
-}
-
-/*********************************
- * Part 3: IDNA maps and regexes *
- *********************************/
-
-foreach (file($idna_data_url . '/IdnaMappingTable.txt') as $line)
-{
-	$line = substr($line, 0, strcspn($line, '#'));
-
-	if (strpos($line, ';') === false)
-	{
-		continue;
-	}
-
-	$fields = explode(';', $line);
-
-	foreach ($fields as $key => $value)
-	{
-		$fields[$key] = preg_replace('/\b(0(?!\b))+/', '', trim($value));
-	}
-
-	if (strpos($fields[0], '..') === false)
-	{
-		$entities = array('&#x' . $fields[0] . ';');
-	}
-	else
-	{
-		$entities = array();
-
-		list($start, $end) = explode('..', $fields[0]);
-
-		$ord_s = hexdec($start);
-		$ord_e = hexdec($end);
-
-		$ord = $ord_s;
-		while ($ord <= $ord_e)
-		{
-			$entities[] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';';
-		}
-	}
-
-	if ($fields[1] === 'mapped')
-	{
-		foreach ($entities as $entity)
-			$funcs['idna_maps']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
-	}
-	elseif ($fields[1] === 'deviation')
-	{
-		foreach ($entities as $entity)
-			$funcs['idna_maps_deviation']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
-
-		$funcs['idna_regex']['data']['deviation'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-	}
-	elseif ($fields[1] === 'ignored')
-	{
-		$funcs['idna_regex']['data']['ignored'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-	}
-	elseif ($fields[1] === 'disallowed')
-	{
-		if (in_array('&#xD800;', $entities))
-			continue;
-
-		$funcs['idna_regex']['data']['disallowed'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-	}
-	elseif ($fields[1] === 'disallowed_STD3_mapped')
-	{
-		foreach ($entities as $entity)
-			$funcs['idna_maps_not_std3']['data'][$entity] = $fields[2] === '' ? '' : '&#x' . str_replace(' ', '; &#x', $fields[2]) . ';';
-
-		$funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-	}
-	elseif ($fields[1] === 'disallowed_STD3_valid')
-	{
-		$funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
-	}
-}
-
-foreach ($funcs as $func_name => $func_info)
-{
-	if (empty($func_info['data']))
-	{
-		continue;
-	}
-
-	export_func_to_file($func_name, $func_info);
-}
-
-/**
- * Updates a Unicode data function in its designated file.
- *
- * @param string $func_name The name of the function.
- * @param array $func_info Info about the function, including its data.
- */
-function export_func_to_file($func_name, $func_info)
-{
-	global $unicodedir;
-
-	$file_contents = file_get_contents($unicodedir . '/' . $func_info['file']);
-
-	$func_text = 'function ' . $func_name . '()' . "\n" . '{';
-
-	$func_regex = '/' . preg_quote($func_text, '/') . '.+?\n}/s';
-
-	$func_text .= "\n\t" . 'return array(' . "\n";
-
-	build_func_array($func_text, $func_info['data'], $func_info['key_type'], $func_info['val_type']);
-
-	$func_text .= "\t" . ');' . "\n" . '}';
-
-	$file_contents = preg_replace($func_regex, $func_text, $file_contents);
-
-	file_put_contents($unicodedir . '/' . $func_info['file'], $file_contents);
-}
-
-/**
- * Helper for export_func_to_file(). Builds the function's data array.
- *
- * @param string &$func_text The raw string that contains function code.
- * @param array $data Data to format as an array.
- * @param string $key_type How to format the array keys.
- * @param string $val_type How to format the array values.
- */
-function build_func_array(&$func_text, $data, $key_type, $val_type)
-{
-	static $indent = 2;
-
-	foreach ($data as $key => $value)
-	{
-		$func_text .= str_repeat("\t", $indent);
-
-		if ($key_type == 'hexchar')
-		{
-			$func_text .= '"';
-
-			$key = mb_decode_numericentity(str_replace(' ', '', $key), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');
-
-			foreach (unpack('C*', $key) as $byte_value)
-			{
-				$func_text .= '\\x' . strtoupper(dechex($byte_value));
-			}
-
-			$func_text .= '" => ';
-		}
-		elseif ($key_type == 'string' && !is_int($key))
-		{
-			$func_text .= var_export($key, true) . ' => ';
-		}
-
-		if (is_array($value))
-		{
-			if ($val_type == 'string' && ($string_count = count($value)) === count($value, COUNT_RECURSIVE))
-			{
-				$nextline = "\n" . str_repeat("\t", $indent + 1);
-
-				$func_text = rtrim($func_text);
-
-				$func_text .= $nextline . implode(' .' . $nextline, array_map(function ($v) { return var_export($v, true); }, $value));
-			}
-			else
-			{
-				$func_text .= 'array(' . "\n";
-
-				$indent++;
-				build_func_array($func_text, $value, $key_type, $val_type);
-				$indent--;
-
-				$func_text .= str_repeat("\t", $indent) . ')';
-			}
-		}
-		elseif ($val_type == 'hexchar')
-		{
-			$func_text .= '"';
-
-			$value = mb_decode_numericentity(str_replace(' ', '', $value), array(0, 0x10FFFF, 0, 0xFFFFFF), 'UTF-8');
-			foreach (unpack('C*', $value) as $byte_value)
-			{
-				$func_text .= '\\x' . strtoupper(dechex($byte_value));
-			}
-
-			$func_text .= '"';
-		}
-		elseif ($val_type == 'string')
-		{
-			$func_text .= var_export($value, true);
-		}
-		else
-		{
-			$func_text .= $value;
-		}
-
-		$func_text .= ',' . "\n";
-	}
-}
+$unicode_updater = new Update_Unicode();
+$unicode_updater->execute();
 
 ?>
\ No newline at end of file

From 8d03dfa890f7948911b1c7e20a2003a32c893d08 Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Thu, 27 Oct 2022 10:13:29 -0600
Subject: [PATCH 2/8] Improves documentation

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/tasks/UpdateUnicode.php | 40 ++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
index 3ed45ef9b2..169a713ee6 100644
--- a/Sources/tasks/UpdateUnicode.php
+++ b/Sources/tasks/UpdateUnicode.php
@@ -18,19 +18,55 @@
  */
 class Update_Unicode extends SMF_BackgroundTask
 {
+	/**
+	 * URLs where we can fetch the Unicode data files.
+	 */
 	const DATA_URL_UCD = 'https://unicode.org/Public/UCD/latest/ucd';
 	const DATA_URL_IDNA = 'https://www.unicode.org/Public/idna/latest';
 
+	/**
+	 * @var string The latest official release of the Unicode Character Database.
+	 */
 	public $ucd_version = '';
+
+	/**
+	 * @var string Path to temporary working directory.
+	 */
 	public $temp_dir = '';
+
+	/**
+	 * @var string Convenience alias of $sourcedir . '/Unicode'.
+	 */
 	public $unicodedir = '';
 
+	/**
+	 * @var array Key-value pairs of character decompositions.
+	 */
 	private $full_decomposition_maps = array();
+
+	/**
+	 * @var array Character properties used during normalization.
+	 */
 	private $derived_normalization_props = array();
+
+	/**
+	 * @var array Assorted info about Unicode characters.
+	 */
 	private $char_data = array();
+
+	/**
+	 * @var array Statistical info about character scripts (e.g. Latin, Greek, Cyrillic).
+	 */
 	private $script_stats = array();
+
+	/**
+	 * @var array Tracks associations between character scripts' short and long names.
+	 */
 	private $script_aliases = array();
 
+	/**
+	 * @var array Info about functions to build in SMF's Unicode data files.
+	 */
 	private $funcs = array(
 		array(
 			'file' => 'Metadata.php',
@@ -316,7 +352,9 @@ class Update_Unicode extends SMF_BackgroundTask
 		),
 	);
 
-	// Prefetching the files helps ensure the task runs smoothly.
+	/**
+	 * @var array Files to fetch from unicode.org.
+	 */
 	private $prefetch = array(
 		self::DATA_URL_UCD => array(
 			'CaseFolding.txt',

From a7e01e051305bc7f75577c5b8fd0a0ead9ef3263 Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Thu, 27 Oct 2022 11:27:15 -0600
Subject: [PATCH 3/8] Improves timeout avoidance in UpdateUnicode.php

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/tasks/UpdateUnicode.php | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
index 169a713ee6..27e0e7ae4f 100644
--- a/Sources/tasks/UpdateUnicode.php
+++ b/Sources/tasks/UpdateUnicode.php
@@ -39,6 +39,11 @@ class Update_Unicode extends SMF_BackgroundTask
 	 */
 	public $unicodedir = '';
 
+	/**
+	 * @var int Time limit in seconds; used to ensure we exit long-running tasks cleanly.
+	 */
+	private $time_limit = 30;
+
 	/**
 	 * @var array Key-value pairs of character decompositions.
 	 */
@@ -408,6 +413,8 @@ public function execute()
 
 		@ini_set('memory_limit', '256M');
 
+		$this->time_limit = (empty(ini_get('max_execution_time')) || @set_time_limit(MAX_CLAIM_THRESHOLD) !== false) ? MAX_CLAIM_THRESHOLD : ini_get('max_execution_time');
+
 		foreach ($this->funcs as $func_name => &$func_info)
 		{
 			$file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file']));
@@ -442,12 +449,18 @@ public function execute()
 		// Prefetch the files in case the network is slow.
 		foreach ($this->prefetch as $data_url => $files)
 		{
+			$max_fetch_time = 0;
+
 			foreach ($files as $filename)
 			{
+				$fetch_start = microtime(true);
+
 				$local_file = $this->fetch_unicode_file($filename, $data_url);
 
+				$max_fetch_time = max($max_fetch_time, microtime(true) - $fetch_start);
+
 				// If prefetch is taking a really long time, pause and try again later.
-				if ($local_file === false || microtime(true) - TIME_START >= MAX_CLAIM_THRESHOLD - 1)
+				if ($local_file === false || microtime(true) - TIME_START >= $this->time_limit - $max_fetch_time)
 				{
 					$smcFunc['db_insert']('',
 						'{db_prefix}background_tasks',
@@ -885,6 +898,8 @@ private function should_update()
 	{
 		$this->lookup_ucd_version();
 
+		return true; // For testing
+
 		// We can't do anything if lookup failed.
 		if (empty($this->ucd_version))
 			return false;

From c595452dc7fdacb8fbc8c1655b0f1dd349e97901 Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Thu, 27 Oct 2022 15:25:04 -0600
Subject: [PATCH 4/8] Removes testing line that forced updates to happen even
 when unneeded

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/tasks/UpdateUnicode.php | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
index 27e0e7ae4f..9ae88ae09a 100644
--- a/Sources/tasks/UpdateUnicode.php
+++ b/Sources/tasks/UpdateUnicode.php
@@ -898,8 +898,6 @@ private function should_update()
 	{
 		$this->lookup_ucd_version();
 
-		return true; // For testing
-
 		// We can't do anything if lookup failed.
 		if (empty($this->ucd_version))
 			return false;

From 293b8c362e3bdaaa0ea985225188891c1869a8ae Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Wed, 2 Nov 2022 12:50:10 -0600
Subject: [PATCH 5/8] Even more bulletproofing

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/tasks/UpdateUnicode.php | 104 ++++++++++++++++++++------------
 1 file changed, 64 insertions(+), 40 deletions(-)

diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
index 9ae88ae09a..c68354f002 100644
--- a/Sources/tasks/UpdateUnicode.php
+++ b/Sources/tasks/UpdateUnicode.php
@@ -484,67 +484,76 @@ public function execute()
 			}
 		}
 
+		// Track whether anything goes wrong along the way.
+		$success = true;
+
 		/*********************************************
 		 * Part 2: Normalization, case folding, etc. *
 		 *********************************************/
-		$this->process_derived_normalization_props();
-		$this->process_main_unicode_data();
-		$this->process_casing_data();
-		$this->finalize_decomposition_forms();
+		$success = $this->process_derived_normalization_props() & $success;
+		$success = $this->process_main_unicode_data() & $success;
+		$success = $this->process_casing_data() & $success;
+		$success = $this->finalize_decomposition_forms() & $success;
 
 		$this->full_decomposition_maps = array();
 		$this->derived_normalization_props = array();
+
 		$this->export_funcs_to_file();
 
 		/***********************************
 		 * Part 3: Regular expression data *
 		 ***********************************/
-		$this->build_regex_properties();
-		$this->build_regex_variation_selectors();
-		$this->build_script_stats();
-		$this->build_regex_joining_type();
-		$this->build_regex_indic();
+		$success = $this->build_regex_properties() & $success;
+		$success = $this->build_regex_variation_selectors() & $success;
+		$success = $this->build_script_stats() & $success;
+		$success = $this->build_regex_joining_type() & $success;
+		$success = $this->build_regex_indic() & $success;
 
 		unset($this->funcs['utf8_combining_classes']['data']);
+
 		$this->export_funcs_to_file();
 
 		/*********************************
 		 * Part 4: IDNA maps and regexes *
 		 *********************************/
-		$this->build_idna();
+		$success = $this->build_idna() & $success;
+
 		$this->export_funcs_to_file();
 
 		/*******************
 		 * Part 5: Wrapup. *
 		 *******************/
-		$done_files = array();
-
-		foreach ($this->funcs as $func_name => $func_info)
+		if ($success)
 		{
-			$file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];
-			$file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file'];
+			$done_files = array();
 
-			if (in_array($file_paths['temp'], $done_files))
-				continue;
+			foreach ($this->funcs as $func_name => $func_info)
+			{
+				$file_paths['temp'] = $this->temp_dir . DIRECTORY_SEPARATOR . $func_info['file'];
+				$file_paths['real'] = $this->unicodedir . DIRECTORY_SEPARATOR . $func_info['file'];
 
-			// Add closing PHP tag to the temp file.
-			file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND);
+				if (in_array($file_paths['temp'], $done_files))
+					continue;
 
-			$done_files[] = $file_paths['temp'];
+				// Add closing PHP tag to the temp file.
+				file_put_contents($file_paths['temp'], '?' . '>', FILE_APPEND);
 
-			// Only move if the file has changed, discounting the license block.
-			foreach (array('temp', 'real') as $f)
-			{
-				if (file_exists($file_paths[$f]))
+				$done_files[] = $file_paths['temp'];
+
+				// Only move if the file has changed, discounting the license block.
+				foreach (array('temp', 'real') as $f)
 				{
-					$file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f]));
+					if (file_exists($file_paths[$f]))
+					{
+						$file_contents[$f] = preg_replace('~/\*\*.*?@package\h+SMF\b.*?\*/~s', '', file_get_contents($file_paths[$f]));
+					}
+					else
+						$file_contents[$f] = '';
 				}
-				else
-					$file_contents[$f] = '';
-			}
 
-			if ($file_contents['temp'] !== $file_contents['real'])
-				rename($file_paths['temp'], $file_paths['real']);
+				if ($file_contents['temp'] !== $file_contents['real'])
+					rename($file_paths['temp'], $file_paths['real']);
+			}
 		}
 
 		// Clean up after ourselves.
@@ -619,8 +628,6 @@ private function fetch_unicode_file($filename, $data_url)
 				return false;
 		}
 
-		require_once($sourcedir . DIRECTORY_SEPARATOR . 'Subs-Admin.php');
-
 		$file_contents = fetch_web_data($data_url . '/' . $file_url_name);
 
 		if (empty($file_contents))
@@ -628,8 +635,6 @@ private function fetch_unicode_file($filename, $data_url)
 
 		file_put_contents($local_file, $file_contents);
 
-		$this->files_to_fetch[$sub_dir][] = $filename;
-
 		return $local_file;
 	}
 
@@ -902,16 +907,15 @@ private function should_update()
 		if (empty($this->ucd_version))
 			return false;
 
-		require_once($this->unicodedir . DIRECTORY_SEPARATOR . 'Metadata.php');
+		// If this file is missing, force an update.
+		if (!@include_once($this->unicodedir . DIRECTORY_SEPARATOR . 'Metadata.php'))
+			return true;
 
-		if (version_compare($this->ucd_version, SMF_UNICODE_VERSION, '<='))
-			return false;
+		return version_compare($this->ucd_version, SMF_UNICODE_VERSION, '>=');
 	}
 
 	/**
-	 * Compares version of SMF's local Unicode data with the latest release.
-	 *
-	 * @return bool Whether SMF should update its local Unicode data or not.
+	 * Sets $this->ucd_version to the latest version number of the UCD.
 	 */
 	private function lookup_ucd_version()
 	{
@@ -1017,6 +1021,8 @@ private function process_derived_normalization_props()
 				$this->derived_normalization_props[$fields[1]][$entity] = $value === 'SAME' ? $entity : $value;
 			}
 		}
+
+		return true;
 	}
 
 	/**
@@ -1080,6 +1086,8 @@ private function process_main_unicode_data()
 				$this->funcs['utf8_normalize_d_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', $fields[5]) . ';';
 			}
 		}
+
+		return true;
 	}
 
 	/**
@@ -1167,6 +1175,8 @@ private function process_casing_data()
 			if (in_array($fields[1], array('C', 'S')))
 				$this->funcs['utf8_casefold_simple_maps']['data']['&#x' . $fields[0] . ';'] = '&#x' . str_replace(' ', '; &#x', trim($fields[2])) . ';';
 		}
+
+		return true;
 	}
 
 	/**
@@ -1243,6 +1253,8 @@ private function finalize_decomposition_forms()
 
 		// Avoid bloat.
 		$this->funcs['utf8_normalize_kd_maps']['data'] = array_diff_assoc($this->full_decomposition_maps, $this->funcs['utf8_normalize_d_maps']['data']);
+
+		return true;
 	}
 
 	/**
@@ -1310,6 +1322,8 @@ private function build_regex_properties()
 		}
 
 		ksort($this->funcs['utf8_regex_properties']['data']);
+
+		return true;
 	}
 
 	/**
@@ -1411,6 +1425,8 @@ private function build_regex_variation_selectors()
 		}
 
 		krsort($this->funcs['utf8_regex_variation_selectors']['data']);
+
+		return true;
 	}
 
 	/**
@@ -1626,6 +1642,8 @@ private function build_script_stats()
 				}
 			}
 		}
+
+		return true;
 	}
 
 	/**
@@ -1737,6 +1755,8 @@ private function build_regex_joining_type()
 				sort($value);
 			}
 		}
+
+		return true;
 	}
 
 	/**
@@ -1910,6 +1930,8 @@ private function build_regex_indic()
 
 			ksort($this->funcs['utf8_regex_indic']['data'][$char_script]);
 		}
+
+		return true;
 	}
 
 	/**
@@ -1993,6 +2015,8 @@ private function build_idna()
 				$this->funcs['idna_regex']['data']['disallowed_std3'][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}';
 			}
 		}
+
+		return true;
 	}
 }
 

From 1bbdef4aa37a178ef61006b0603a59fccce9169d Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Tue, 14 Mar 2023 18:41:53 -0600
Subject: [PATCH 6/8] Adds method to build QuickCheck data file

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/tasks/UpdateUnicode.php | 73 ++++++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 1 deletion(-)

diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
index c68354f002..582ca683b9 100644
--- a/Sources/tasks/UpdateUnicode.php
+++ b/Sources/tasks/UpdateUnicode.php
@@ -311,6 +311,22 @@ class Update_Unicode extends SMF_BackgroundTask
 			),
 			'data' => array(),
 		),
+		'utf8_regex_quick_check' => array(
+			'file' => 'QuickCheck.php',
+			'key_type' => 'string',
+			'val_type' => 'string',
+			'desc' => array(
+				'Helper function for utf8_is_normalized.',
+				'',
+				'Character class lists compiled from:',
+				'https://unicode.org/Public/UNIDATA/extracted/DerivedNormalizationProps.txt',
+			),
+			'return' => array(
+				'type' => 'array',
+				'desc' => 'Character classes for disallowed characters in normalization forms.',
+			),
+			'data' => array(),
+		),
 		'idna_maps' => array(
 			'file' => 'Idna.php',
 			'key_type' => 'hexchar',
@@ -496,13 +512,16 @@ public function execute()
 		$success = $this->finalize_decomposition_forms() & $success;
 
 		$this->full_decomposition_maps = array();
-		$this->derived_normalization_props = array();
 
 		$this->export_funcs_to_file();
 
 		/***********************************
 		 * Part 3: Regular expression data *
 		 ***********************************/
+		$success = $this->build_quick_check() & $success;
+
+		$this->derived_normalization_props = array();
+
 		$success = $this->build_regex_properties() & $success;
 		$success = $this->build_regex_variation_selectors() & $success;
 		$success = $this->build_script_stats() & $success;
@@ -1257,6 +1276,58 @@ private function finalize_decomposition_forms()
 		return true;
 	}
 
+	/**
+	 * Builds regular expressions for normalization quick check: one list of \x{...} code point ranges per property.
+	 */
+	private function build_quick_check()
+	{
+		foreach (array('NFC_QC', 'NFKC_QC', 'NFD_QC', 'NFKD_QC', 'Changes_When_NFKC_Casefolded') as $prop)
+		{
+			$current_range = array('start' => null, 'end' => null);
+			foreach ($this->derived_normalization_props[$prop] as $entity => $nm)
+			{
+				$range_string = '';
+				// Strip the '&', '#', 'x', and ';' characters from the key and read the rest as hexadecimal.
+				$ord = hexdec(trim($entity, '&#x;'));
+				// The first entry for this property opens the first range.
+				if (!isset($current_range['start']))
+				{
+					$current_range['start'] = $ord;
+				}
+				// Extend the range when no end is set yet or this code point directly follows it; otherwise close it.
+				if (!isset($current_range['end']) || $ord == $current_range['end'] + 1)
+				{
+					$current_range['end'] = $ord;
+				}
+				else
+				{
+					$range_string .= '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
+					// Only a range covering more than one code point gets a '-\x{...}' end part.
+					if ($current_range['start'] != $current_range['end'])
+					{
+						$range_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
+					}
+					// Begin the next range at the current code point.
+					$current_range = array('start' => $ord, 'end' => $ord);
+
+					$this->funcs['utf8_regex_quick_check']['data'][$prop][] = $range_string;
+				}
+			}
+			// Write out whichever range was still open when the entries ran out.
+			if (isset($current_range['start']))
+			{
+				$range_string = '\\x{' . strtoupper(sprintf('%04s', dechex($current_range['start']))) . '}';
+
+				if ($current_range['start'] != $current_range['end'])
+				{
+					$range_string .= '-\\x{' . strtoupper(sprintf('%04s', dechex($current_range['end']))) . '}';
+				}
+
+				$this->funcs['utf8_regex_quick_check']['data'][$prop][] = $range_string;
+			}
+		}
+	}
+
 	/**
 	 * Builds regular expression classes for extended Unicode properties.
 	 */

From 9a8960718441ce1832da1583ae76ac0f92c39404 Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Tue, 14 Mar 2023 18:56:07 -0600
Subject: [PATCH 7/8] Syntax compatibility with PHP 7.0

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/tasks/UpdateUnicode.php | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
index 582ca683b9..62602e5ec9 100644
--- a/Sources/tasks/UpdateUnicode.php
+++ b/Sources/tasks/UpdateUnicode.php
@@ -796,7 +796,7 @@ private function get_function_code_and_regex($func_name)
 				empty($this->funcs[$func_name]['return']) ? array() : array(
 					'',
 					'@return ' . implode(' ', $this->funcs[$func_name]['return'])
-				),
+				)
 			)) . "\n */\n";
 
 			// The code for this function.
@@ -1326,6 +1326,8 @@ private function build_quick_check()
 				$this->funcs['utf8_regex_quick_check']['data'][$prop][] = $range_string;
 			}
 		}
+
+		return true;
 	}
 
 	/**

From 60580a1d7f804f2887e89e65e18a4d261137f7b2 Mon Sep 17 00:00:00 2001
From: Jon Stovell <jonstovell@gmail.com>
Date: Wed, 20 Sep 2023 19:31:31 -0600
Subject: [PATCH 8/8] Informs the admin if Unicode files are not writable

Signed-off-by: Jon Stovell <jonstovell@gmail.com>
---
 Sources/tasks/UpdateUnicode.php             | 21 ++++++++++++++-------
 Themes/default/languages/Errors.english.php |  2 ++
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/Sources/tasks/UpdateUnicode.php b/Sources/tasks/UpdateUnicode.php
index 62602e5ec9..516a57a9ce 100644
--- a/Sources/tasks/UpdateUnicode.php
+++ b/Sources/tasks/UpdateUnicode.php
@@ -433,20 +433,27 @@ public function execute()
 
 		foreach ($this->funcs as $func_name => &$func_info)
 		{
-			$file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file']));
+			$file_paths['final'] = implode(DIRECTORY_SEPARATOR, array($this->unicodedir, $func_info['file']));
 
-			if (!file_exists($file_paths['temp']))
-				touch($file_paths['temp']);
+			if (!file_exists($file_paths['final']))
+				touch($file_paths['final']);
 
-			if (!is_file($file_paths['temp']))
+			if (!is_file($file_paths['final']) || !smf_chmod($file_paths['final']))
 			{
-				log_error($file_paths['temp'] . ' is not a file.');
+				loadLanguage('Errors');
+				log_error(sprintf($txt['unicode_update_failed'], $this->unicodedir));
 				return true;
 			}
 
-			if (!smf_chmod($file_paths['temp']))
+			$file_paths['temp'] = implode(DIRECTORY_SEPARATOR, array($this->temp_dir, $func_info['file']));
+
+			if (!file_exists($file_paths['temp']))
+				touch($file_paths['temp']);
+
+			if (!is_file($file_paths['temp']) || !smf_chmod($file_paths['temp']))
 			{
-				log_error($file_paths['temp'] . ' is not writable.');
+				loadLanguage('Errors');
+				log_error(sprintf($txt['unicode_update_failed'], $this->temp_dir));
 				return true;
 			}
 
diff --git a/Themes/default/languages/Errors.english.php b/Themes/default/languages/Errors.english.php
index 6745a1b0d9..3823c5d36f 100644
--- a/Themes/default/languages/Errors.english.php
+++ b/Themes/default/languages/Errors.english.php
@@ -522,4 +522,6 @@
 
 $txt['fetch_web_data_bad_url'] = 'fetch_web_data(): Bad URL';
 
+$txt['unicode_update_failed'] = 'A new version of Unicode is available, but SMF could not update to it. Please make sure %1$s and all the files in it are writable. SMF will try to update its Unicode data files again automatically.';
+
 ?>
\ No newline at end of file