From 73b91cc5b72f64bbba08f216d9a98bb870178d53 Mon Sep 17 00:00:00 2001 From: Bent Date: Tue, 25 Jun 2024 16:01:39 +0200 Subject: [PATCH] added PreanalyzedWord to js bindings --- bindings/wasm/kiwi_wasm.cpp | 30 ++++++++++++++++++- bindings/wasm/package/src/build-args.ts | 39 +++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/bindings/wasm/kiwi_wasm.cpp b/bindings/wasm/kiwi_wasm.cpp index cc0ad4df..7652989d 100644 --- a/bindings/wasm/kiwi_wasm.cpp +++ b/bindings/wasm/kiwi_wasm.cpp @@ -229,8 +229,36 @@ json build(const json& args) { } } - const auto typos = buildArgs.value("typos", json(nullptr)); + const auto preanalyzedWords = buildArgs.value("preanalyzedWords", json::array()); + for (const auto& preanalyzedWord : preanalyzedWords) { + const std::string form8 = preanalyzedWord["form"]; + const std::u16string form = utf8To16(form8); + const float score = preanalyzedWord.value("score", 0.0f); + + std::vector<std::pair<std::u16string, POSTag>> analyzed; + std::vector<std::pair<size_t, size_t>> positions; + + for (const auto& analyzedToken : preanalyzedWord["analyzed"]) { + const std::string form8 = analyzedToken["form"]; + const std::u16string form = utf8To16(form8); + + const std::string tag8 = analyzedToken["tag"]; + const std::u16string tag16 = utf8To16(tag8); + const POSTag tag = toPOSTag(tag16); + + analyzed.push_back({ form, tag }); + if (analyzedToken.contains("start") && analyzedToken.contains("end")) { + const size_t start = analyzedToken["start"]; + const size_t end = analyzedToken["end"]; + positions.push_back({ start, end }); + } + } + + builder.addPreAnalyzedWord(form, analyzed, positions, score); + } + + const auto typos = buildArgs.value("typos", json(nullptr)); const float typoCostThreshold = buildArgs.value("typoCostThreshold", 2.5f); if (typos.is_null()) { diff --git a/bindings/wasm/package/src/build-args.ts b/bindings/wasm/package/src/build-args.ts index 49a5cfd5..75316d4b 100644 --- a/bindings/wasm/package/src/build-args.ts +++ b/bindings/wasm/package/src/build-args.ts 
@@ -30,6 +30,41 @@ export interface UserWord { origWord?: string; }; +export interface PreanalyzedToken { + /** + * Form of the token. + */ + form: string; + /** + * Part-of-speech tag of the token. + */ + tag: string; + /** + * Start position of the token in the preanalyzed word. If omitted, all token positions are automatically calculated. + */ + start?: number; + /** + * End position of the token in the preanalyzed word. If omitted, all token positions are automatically calculated. + */ + end?: number; +} + +export interface PreanalyzedWord { + /** + * Form to add. + */ + form: string; + /** + * The result of the morphological analysis of form. + */ + analyzed: PreanalyzedToken[]; + /** + * The weighted score of the morpheme sequence to add. + * If there are multiple morpheme combinations that match the form, the word with the higher score will be prioritized. + */ + score?: number; +} + export interface TypoDefinition { /** * Source strings @@ -99,6 +134,10 @@ export interface BuildArgs { * Additional user words to load. */ userWords?: UserWord[]; + /** + * Preanalyzed words to load. + */ + preanalyzedWords?: PreanalyzedWord[]; /** * Specifies the language model to use for morphological analysis. Defaults to 'knlm'. * - `knlm`: Fast and can model the relationships between morphemes within a short distance (usually two or three) with high accuracy. However, it has the limitation that it cannot take into account the relationships between morphemes over a long distance.