added PreanalyzedWord to js bindings

bab2min · Jun 25, 2024 · 73b91cc · 73b91cc
1 parent d76603c
commit 73b91cc
Show file tree

Hide file tree

Showing 2 changed files with 68 additions and 1 deletion.
diff --git a/bindings/wasm/kiwi_wasm.cpp b/bindings/wasm/kiwi_wasm.cpp
@@ -229,8 +229,36 @@ json build(const json& args) {
         }
     }
 
-    const auto typos = buildArgs.value("typos", json(nullptr));
+    const auto preanalyzedWords = buildArgs.value("preanalyzedWords", json::array());
+    for (const auto& preanalyzedWord : preanalyzedWords) {
+        const std::string form8 = preanalyzedWord["form"];
+        const std::u16string form = utf8To16(form8);
+        const float score = preanalyzedWord.value("score", 0.0f);
+
+        std::vector<std::pair<std::u16string, POSTag>> analyzed;
+        std::vector<std::pair<size_t, size_t>> positions;
+
+        for (const auto& analyzedToken : preanalyzedWord["analyzed"]) {
+            const std::string form8 = analyzedToken["form"];
+            const std::u16string form = utf8To16(form8);
+
+            const std::string tag8 = analyzedToken["tag"];
+            const std::u16string tag16 = utf8To16(tag8);
+            const POSTag tag = toPOSTag(tag16);
+
+            analyzed.push_back({ form, tag });
 
+            if (analyzedToken.contains("start") && analyzedToken.contains("end")) {
+                const size_t start = analyzedToken["start"];
+                const size_t end = analyzedToken["end"];
+                positions.push_back({ start, end });
+            }
+        }
+
+        builder.addPreAnalyzedWord(form, analyzed, positions, score);
+    }
+
+    const auto typos = buildArgs.value("typos", json(nullptr));
     const float typoCostThreshold = buildArgs.value("typoCostThreshold", 2.5f);
 
     if (typos.is_null()) {

diff --git a/bindings/wasm/package/src/build-args.ts b/bindings/wasm/package/src/build-args.ts
@@ -30,6 +30,41 @@ export interface UserWord {
     origWord?: string;
 };
 
+/**
+ * A single token (form plus part-of-speech tag) in the analysis of a {@link PreanalyzedWord}.
+ */
+export interface PreanalyzedToken {
+    /**
+     * Form of the token.
+     */
+    form: string;
+    /**
+     * Part-of-speech tag of the token.
+     */
+    tag: string;
+    /**
+     * Start position of the token in the preanalyzed word. If omitted, all token positions are automatically calculated.
+     * The native binding only records a position when both `start` and `end` are present on the token.
+     * NOTE(review): the unit of the position (presumably a character offset into the word's form) is not visible here; confirm.
+     */
+    start?: number;
+    /**
+     * End position of the token in the preanalyzed word. If omitted, all token positions are automatically calculated.
+     * The native binding only records a position when both `start` and `end` are present on the token.
+     * NOTE(review): whether this bound is inclusive or exclusive is not visible here; confirm.
+     */
+    end?: number;
+}
+
+/**
+ * A surface form together with a fixed morphological analysis to register with the builder.
+ */
+export interface PreanalyzedWord {
+    /**
+     * Form to add.
+     */
+    form: string;
+    /**
+     * The result of the morphological analysis of form.
+     * NOTE(review): the native binding collects positions only from tokens that specify both `start` and `end`,
+     * so giving positions on some tokens but not others is presumably unsupported; confirm.
+     */
+    analyzed: PreanalyzedToken[];
+    /**
+     * The weighted score of the morpheme sequence to add.
+     * If there are multiple morpheme combinations that match the form, the word with the higher score will be prioritized.
+     * The native binding uses 0 when this is omitted.
+     */
+    score?: number;
+}
+
 export interface TypoDefinition {
     /**
      * Source strings
@@ -99,6 +134,10 @@ export interface BuildArgs {
      * Additional user words to load.
      */
     userWords?: UserWord[];
+    /**
+     * Preanalyzed words to load.
+     */
+    preanalyzedWords?: PreanalyzedWord[];
     /**
      * Specifies the language model to use for morphological analysis. Defaults to 'knlm'.
      * - `knlm`: Fast and can model the relationships between morphemes within a short distance (usually two or three) with high accuracy. However, it has the limitation that it cannot take into account the relationships between morphemes over a long distance.