Skip to content

Commit

Permalink
added PreanalyzedWord to js bindings
Browse files Browse the repository at this point in the history
  • Loading branch information
RicBent committed Jun 25, 2024
1 parent d76603c commit 73b91cc
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 1 deletion.
30 changes: 29 additions & 1 deletion bindings/wasm/kiwi_wasm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,36 @@ json build(const json& args) {
}
}

const auto typos = buildArgs.value("typos", json(nullptr));
const auto preanalyzedWords = buildArgs.value("preanalyzedWords", json::array());
for (const auto& preanalyzedWord : preanalyzedWords) {
const std::string form8 = preanalyzedWord["form"];
const std::u16string form = utf8To16(form8);
const float score = preanalyzedWord.value("score", 0.0f);

std::vector<std::pair<std::u16string, POSTag>> analyzed;
std::vector<std::pair<size_t, size_t>> positions;

for (const auto& analyzedToken : preanalyzedWord["analyzed"]) {
const std::string form8 = analyzedToken["form"];
const std::u16string form = utf8To16(form8);

const std::string tag8 = analyzedToken["tag"];
const std::u16string tag16 = utf8To16(tag8);
const POSTag tag = toPOSTag(tag16);

analyzed.push_back({ form, tag });

if (analyzedToken.contains("start") && analyzedToken.contains("end")) {
const size_t start = analyzedToken["start"];
const size_t end = analyzedToken["end"];
positions.push_back({ start, end });
}
}

builder.addPreAnalyzedWord(form, analyzed, positions, score);
}

const auto typos = buildArgs.value("typos", json(nullptr));
const float typoCostThreshold = buildArgs.value("typoCostThreshold", 2.5f);

if (typos.is_null()) {
Expand Down
39 changes: 39 additions & 0 deletions bindings/wasm/package/src/build-args.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,41 @@ export interface UserWord {
origWord?: string;
};

/**
 * A single token (morpheme) in the analysis result of a {@link PreanalyzedWord}.
 *
 * `start` and `end` are meant to be supplied together: the native binding
 * records a position for a token only when both properties are present.
 */
export interface PreanalyzedToken {
/**
* Form of the token.
*/
form: string;
/**
* Part-of-speech tag of the token, given as its string name.
* The native binding converts this string to a POS tag value.
*/
tag: string;
/**
* Start position of the token in the preanalyzed word. If omitted, all token positions are automatically calculated.
* NOTE(review): the unit of the offset (e.g. UTF-16 code units vs. characters) is not stated here; confirm against the native builder.
*/
start?: number;
/**
* End position of the token in the preanalyzed word. If omitted, all token positions are automatically calculated.
* NOTE(review): a token that specifies only one of `start`/`end` gets no position recorded by the binding; confirm how the builder handles a mix of positioned and unpositioned tokens.
*/
end?: number;
}

/**
 * A word form paired with a fixed morphological analysis, registered with the
 * builder through the `preanalyzedWords` build argument.
 */
export interface PreanalyzedWord {
/**
* Form to add.
*/
form: string;
/**
* The result of the morphological analysis of form, as an ordered list of tokens.
*/
analyzed: PreanalyzedToken[];
/**
* The weighted score of the morpheme sequence to add.
* If there are multiple morpheme combinations that match the form, the word with the higher score will be prioritized.
* When omitted, the native binding uses a score of 0.
*/
score?: number;
}

export interface TypoDefinition {
/**
* Source strings
Expand Down Expand Up @@ -99,6 +134,10 @@ export interface BuildArgs {
* Additional user words to load.
*/
userWords?: UserWord[];
/**
* Preanalyzed words to load.
*/
preanalyzedWords?: PreanalyzedWord[];
/**
* Specifies the language model to use for morphological analysis. Defaults to 'knlm'.
* - `knlm`: Fast and can model the relationships between morphemes within a short distance (usually two or three) with high accuracy. However, it has the limitation that it cannot take into account the relationships between morphemes over a long distance.
Expand Down

0 comments on commit 73b91cc

Please sign in to comment.