From f3dfb4c1ae5a1eaff7b6c71b09de0db5a4d36f80 Mon Sep 17 00:00:00 2001 From: bab2min Date: Mon, 5 Feb 2024 02:07:26 +0900 Subject: [PATCH] added `loadMultiDict` options --- include/kiwi/Types.h | 4 +++- include/kiwi/capi.h | 3 ++- src/KiwiBuilder.cpp | 21 ++++++++++++++++++--- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/include/kiwi/Types.h b/include/kiwi/Types.h index 18c39e6b..e130a931 100644 --- a/include/kiwi/Types.h +++ b/include/kiwi/Types.h @@ -274,7 +274,9 @@ namespace kiwi loadTypoDict = 1 << 2, /**< 오타 사전(typo.dict)의 로딩 여부를 설정한다.*/ - default_ = integrateAllomorph | loadDefaultDict | loadTypoDict, + loadMultiDict = 1 << 3, /**< 복합명사 사전(multi.dict)의 로딩 여부를 설정한다. 복합명사 사전은 복합명사의 구성 형태소를 저장하고 있다. */ + + default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict, }; struct Morpheme; diff --git a/include/kiwi/capi.h b/include/kiwi/capi.h index 25d6e4f1..21fae687 100644 --- a/include/kiwi/capi.h +++ b/include/kiwi/capi.h @@ -92,7 +92,8 @@ enum KIWI_BUILD_INTEGRATE_ALLOMORPH = 1, KIWI_BUILD_LOAD_DEFAULT_DICT = 2, KIWI_BUILD_LOAD_TYPO_DICT = 4, - KIWI_BUILD_DEFAULT = 7, + KIWI_BUILD_LOAD_MULTI_DICT = 8, + KIWI_BUILD_DEFAULT = 15, KIWI_BUILD_MODEL_TYPE_KNLM = 0x0000, KIWI_BUILD_MODEL_TYPE_SBG = 0x0100, }; diff --git a/src/KiwiBuilder.cpp b/src/KiwiBuilder.cpp index 2255c5e7..0f3129a0 100644 --- a/src/KiwiBuilder.cpp +++ b/src/KiwiBuilder.cpp @@ -632,6 +632,11 @@ KiwiBuilder::KiwiBuilder(const string& modelPath, size_t _numThreads, BuildOptio loadDictionary(modelPath + "/typo.dict"); } + if (!!(options & BuildOption::loadMultiDict)) + { + loadDictionary(modelPath + "/multi.dict"); + } + { ifstream ifs; combiningRule = make_shared<cmb::CompiledRule>(cmb::RuleSet{ openFile(ifs, modelPath + string{ "/combiningRule.txt" }) }.compile()); @@ -1807,10 +1812,20 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c auto ptypos = typos.prepare(); for (auto f : sortedForms) { - for (auto t : ptypos._generate(f->form, 
typoCostThreshold)) + // 현재는 공백이 없는 단일 단어에 대해서만 오타 교정을 수행 + // 공백이 포함된 복합 명사류의 경우 오타 후보가 지나치게 많아져 + // 메모리 요구량이 급격히 증가하기 때문. + if (f->numSpaces == 0) + { + for (auto t : ptypos._generate(f->form, typoCostThreshold)) + { + if (t.leftCond != CondVowel::none && f->vowel != CondVowel::none && t.leftCond != f->vowel) continue; + typoGroup[removeSpace(t.str)].emplace_back(f - ret.forms.data(), t.cost, t.leftCond); + } + } + else { - if (t.leftCond != CondVowel::none && f->vowel != CondVowel::none && t.leftCond != f->vowel) continue; - typoGroup[removeSpace(t.str)].emplace_back(f - ret.forms.data(), t.cost, t.leftCond); + typoGroup[removeSpace(f->form)].emplace_back(f - ret.forms.data(), 0, CondVowel::none); } }