Skip to content

Commit

Permalink
Minor updates
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Aug 15, 2024
1 parent 19e95bb commit 3157c39
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 1 deletion.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
ignore
.DS_Store
.env
.vscode
node_modules/
dev/debug
cypress/downloads
cypress/videos
browserstack.json
*.traineddata
*.traineddata.gz
log
results
**/screenshots
Expand All @@ -14,3 +18,5 @@ results
/playwright/.cache/
*.traineddata
*.traineddata.gz
settings.json
tmp
8 changes: 7 additions & 1 deletion .npmignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
ignore
.DS_Store
.env
.vscode
node_modules
dev/debug
browserstack.json
*.traineddata
*.traineddata.gz
.github
.clinic
node_modules
gui
tests
dev
Expand Down
38 changes: 38 additions & 0 deletions dev/check_chars.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/* eslint-disable max-len */

// Note: This script currently does not work as intended, because the desired_characters files are not correct and/or up-to-date.
// We should still check that all characters in the languages we claim to support are in the fonts,
// however doing this correctly will likely require extracting character lists from the .traineddata files.

// Language codes whose `desired_characters` lists are compared against the Latin character set.
const langsLatin = [
  'cat',
  'nld',
  'eng',
  'fra',
  'deu',
  'ita',
  'pol',
  'por',
  'spa',
  'swe',
];

// Language codes whose `desired_characters` lists are compared against the Cyrillic character set.
const langsCyrillic = [
  'rus',
  'ukr',
];

/**
 * Downloads the `desired_characters` file for one language from the
 * tesseract-ocr/langdata repository and returns its lines.
 *
 * @param {string} lang - Language directory name interpolated into the langdata URL (e.g. 'eng').
 * @returns {Promise<string[]>} One entry per non-empty line of the file.
 *    Resolves to an empty array (after logging to stderr) when the response status is not OK.
 */
export const getChars = async (lang) => {
  const url = `https://raw.githubusercontent.com/tesseract-ocr/langdata/main/${lang}/desired_characters`;
  const response = await fetch(url);
  if (!response.ok) {
    console.error(`Failed to fetch ${url} (${response.status})`);
    return [];
  }
  const text = await response.text();
  // Split on LF or CRLF so no entry carries a stray '\r', and drop empty lines
  // (e.g. the one produced by a trailing newline) so '' is never reported as a character.
  // Lines are deliberately not trimmed: a line holding only a space is a legitimate entry.
  return text.split(/\r?\n/).filter((line) => line !== '');
};

// Fetch the character lists for all Latin-script languages in parallel,
// then collapse them into a single list of unique entries (first-seen order).
const charsLatin0Arr = await Promise.all(langsLatin.map((code) => getChars(code)));
const charsLatinSet = new Set();
for (const langChars of charsLatin0Arr) {
  for (const entry of langChars) {
    charsLatinSet.add(entry);
  }
}
const charsLatinArr = [...charsLatinSet];

// Same procedure for the Cyrillic-script languages; these requests start only
// after the Latin batch above has finished.
const charsCyrillic0Arr = await Promise.all(langsCyrillic.map((code) => getChars(code)));
const charsCyrillicSet = new Set();
for (const langChars of charsCyrillic0Arr) {
  for (const entry of langChars) {
    charsCyrillicSet.add(entry);
  }
}
const charsCyrillicArr = [...charsCyrillicSet];

// TODO: This should import the current lists directly rather than hardcoding them here.

// Basic Latin set: ASCII letters and digits, ligatures, common punctuation and symbols, and the space character.
const charSetLatinBaseArr = [...'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789fiffflffiſé.,;:!?()[]{}-–—_\'\\"/\\@*#&$¢£¥¶§©‹›«»®™°•€%^+<=>`~|‘’“”… '];

// Extended Latin set: accented and other additional letters plus a few extra punctuation marks.
const charSetLatinExtArr = [...'ÀÁÂÄÆÃÅĀĂĄÇĆĈČĎĐÈÉÊËĒĖĘĚĜĞĠĢĤĦÌÍÎÏĪĮİĴĶĹĻĽŁŃŅŇÑÒÓÔÖŒÕØŌŐŔŖŘŚŜŞŠȘŤŢȚÙÚÛÜŪŮŰŲŴÝŶŸŹŻŽàáâäæãåāăąçćĉčďđèéêëēėęěĝğġģĥħìíîïīįıĵķĸĺļľłńņňñòóôöœõøōőŕŗřśŝşšșßťţțùúûüūůűųŵýÿŷźżž¿¡‚„'];

// Full Latin set: base characters followed by the extended characters.
const charSetLatinArr = charSetLatinBaseArr.concat(charSetLatinExtArr);

// Cyrillic-specific characters only.
const charSetCyrillicOnlyArr = [...'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюяіІ№ЄЇҐєїґ'];

// Full Cyrillic set: the Cyrillic-specific characters followed by the entire Latin set.
const charSetCyrillicArr = charSetCyrillicOnlyArr.concat(charSetLatinArr);

// Report every fetched entry that is absent from the corresponding hardcoded character set.
const knownLatinChars = new Set(charSetLatinArr);
const missingLatinChars = charsLatinArr.filter((entry) => !knownLatinChars.has(entry));
console.log('Missing latin characters:', missingLatinChars);

const knownCyrillicChars = new Set(charSetCyrillicArr);
const missingCyrillicChars = charsCyrillicArr.filter((entry) => !knownCyrillicChars.has(entry));
console.log('Missing cyrillic characters:', missingCyrillicChars);

0 comments on commit 3157c39

Please sign in to comment.