From 3157c392aa625d248ffaaeb9a1a539458dde13bb Mon Sep 17 00:00:00 2001 From: Balearica Date: Wed, 14 Aug 2024 22:59:36 -0700 Subject: [PATCH] Minor updates --- .gitignore | 6 ++++++ .npmignore | 8 +++++++- dev/check_chars.js | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 dev/check_chars.js diff --git a/.gitignore b/.gitignore index 37c3456..10b4922 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,14 @@ ignore .DS_Store .env +.vscode node_modules/ +dev/debug cypress/downloads cypress/videos browserstack.json +*.traineddata +*.traineddata.gz log results **/screenshots @@ -14,3 +18,5 @@ results /playwright/.cache/ *.traineddata *.traineddata.gz +settings.json +tmp \ No newline at end of file diff --git a/.npmignore b/.npmignore index bf0a0b3..981e7e6 100644 --- a/.npmignore +++ b/.npmignore @@ -1,8 +1,14 @@ +ignore +.DS_Store +.env +.vscode +node_modules +dev/debug +browserstack.json *.traineddata *.traineddata.gz .github .clinic -node_modules gui tests dev diff --git a/dev/check_chars.js b/dev/check_chars.js new file mode 100644 index 0000000..d145863 --- /dev/null +++ b/dev/check_chars.js @@ -0,0 +1,38 @@ +/* eslint-disable max-len */ + +// Note: This script currently does not work as intended, because the desired_characters files are not correct and/or up-to-date. +// We should still check that all characters in the languages we claim to support are in the fonts, +// however doing this correctly will likely require extracting character lists from the .traineddata files. 
// Language codes (directory names in the tesseract-ocr/langdata repo) grouped by the script they are checked against.
const langsLatin = ['cat', 'nld', 'eng', 'fra', 'deu', 'ita', 'pol', 'por', 'spa', 'swe'];
const langsCyrillic = ['rus', 'ukr'];

/**
 * Downloads the `desired_characters` file for one language from the tesseract-ocr/langdata repository
 * and returns its non-empty lines.
 *
 * Failures are reported with `console.error` and produce an empty array rather than an exception,
 * so a single unavailable language does not abort the whole check.
 *
 * @param {string} lang - Language code used as the directory name in the langdata repo (e.g. 'eng').
 * @returns {Promise<Array<string>>} One entry per non-empty line of the file, or `[]` if the download failed.
 */
export const getChars = async (lang) => {
  const url = `https://raw.githubusercontent.com/tesseract-ocr/langdata/main/${lang}/desired_characters`;

  let response;
  try {
    response = await fetch(url);
  } catch (err) {
    // `fetch` rejects on network-level failures (DNS, connection refused, ...).
    // Handle them the same way as an HTTP error status below.
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Failed to fetch ${url} (${reason})`);
    return [];
  }

  if (!response.ok) {
    console.error(`Failed to fetch ${url} (${response.status})`);
    return [];
  }

  const text = await response.text();
  // Accept both LF and CRLF line endings, and drop empty lines (e.g. the one produced by a trailing newline)
  // so that '' and '\r'-suffixed entries are not reported as missing characters.
  // Lines are deliberately not trimmed: a line holding a single space is a legitimate entry.
  return text.split(/\r?\n/).filter((line) => line !== '');
};

/**
 * Fetches the character lists for every language in `langs` (in parallel) and merges them,
 * removing duplicates while keeping first-seen order.
 *
 * @param {Array<string>} langs - Language codes to fetch.
 * @returns {Promise<Array<string>>} Unique entries across all requested languages.
 */
const getUniqueChars = async (langs) => {
  const charsPerLang = await Promise.all(langs.map((lang) => getChars(lang)));
  return [...new Set(charsPerLang.flat())];
};

const charsLatinArr = await getUniqueChars(langsLatin);
const charsCyrillicArr = await getUniqueChars(langsCyrillic);

// TODO: This should import the current lists directly rather than hardcoding them here.
const charSetLatinBaseArr = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789fiffflffiſé.,;:!?()[]{}-–—_\'\\"/\\@*#&$¢£¥¶§©‹›«»®™°•€%^+<=>`~|‘’“”… '.split('');
const charSetLatinExtArr = 'ÀÁÂÄÆÃÅĀĂĄÇĆĈČĎĐÈÉÊËĒĖĘĚĜĞĠĢĤĦÌÍÎÏĪĮİĴĶĹĻĽŁŃŅŇÑÒÓÔÖŒÕØŌŐŔŖŘŚŜŞŠȘŤŢȚÙÚÛÜŪŮŰŲŴÝŶŸŹŻŽàáâäæãåāăąçćĉčďđèéêëēėęěĝğġģĥħìíîïīįıĵķĸĺļľłńņňñòóôöœõøōőŕŗřśŝşšșßťţțùúûüūůűųŵýÿŷźżž¿¡‚„'.split('');
const charSetLatinArr = [...charSetLatinBaseArr, ...charSetLatinExtArr];

const charSetCyrillicOnlyArr = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюяіІ№ЄЇҐєїґ'.split('');
// The Cyrillic set is a superset of the Latin set.
const charSetCyrillicArr = [...charSetCyrillicOnlyArr, ...charSetLatinArr];

// Sets give constant-time membership checks for the comparisons below.
const charSetLatin = new Set(charSetLatinArr);
const charSetCyrillic = new Set(charSetCyrillicArr);

// Report every fetched entry that is absent from the corresponding hard-coded character set.
console.log('Missing latin characters:', charsLatinArr.filter((char) => !charSetLatin.has(char)));
console.log('Missing cyrillic characters:', charsCyrillicArr.filter((char) => !charSetCyrillic.has(char)));