Skip to content

Commit

Permalink
Merge pull request #199 from bab2min/dev/saisiot
Browse files (browse the repository at this point in the history)
사이시옷 분석 기능 보강
  • Loading branch information
bab2min authored Oct 27, 2024
2 parents 2a5291f + 2c162c7 commit a9ee8c3
Show file tree
Hide file tree
Showing 14 changed files with 57 additions and 33 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.12)

project(kiwi VERSION 0.19.1 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
project(kiwi VERSION 0.20.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")

set ( CMAKE_CXX_STANDARD 14 )
set ( CMAKE_VERBOSE_MAKEFILE true )
Expand Down
8 changes: 4 additions & 4 deletions ModelGenerator/morphemes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6370,7 +6370,7 @@
와인 NNG 270
은혜 NNG 270
공평 NNG 270
횟수 NNG 270
횟수 NNG 270 complex 회/NNG ᆺ/Z_SIOT 수/NNG 010112
반짝이 VV 270 complex 반짝/MAG 이/XSV 0223
서랍 NNG 270
허무 NNG 270
Expand Down Expand Up @@ -14689,7 +14689,7 @@ LG화학 NNP 82
조흥은행 NNP 75
노라 EC 75
영양가 NNG 75
툇마루 NNG 75
툇마루 NNG 75 complex 퇴/NNG ᆺ/Z_SIOT 마루/NNG 010113
오묘 XR 75
의미심장 XR 75
주인집 NNG 75
Expand Down Expand Up @@ -16670,7 +16670,7 @@ LG화학 NNP 82
막중 XR 61
엄중 XR 61
경박 XR 61
셋방 NNG 61
셋방 NNG 61 complex 세/NNG ᆺ/Z_SIOT 방/NNG 010112
애무 NNG 61
천진 NNG 61
맞아들이 VV 61 complex 맞/VV 어/EC 들이/VV 011224
Expand Down Expand Up @@ -23962,7 +23962,7 @@ SK그룹 NNP 33
판매자 NNG 33
차두리 NNP 33
자필 NNG 33
곳간 NNG 33
곳간 NNG 33 complex 고/NNG ᆺ/Z_SIOT 간/NNB 010112
에베레스트 NNP 33
국전 NNG 33
온존 NNG 33
Expand Down
2 changes: 1 addition & 1 deletion bindings/java/kr/pe/bab2min/Kiwi.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

public class Kiwi implements AutoCloseable {
private long _inst;
final private static String _version = "0.19.1";
final private static String _version = "0.20.0";

public static class Match {
final static public int none = 0,
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Form.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Form.h
* @author bab2min ([email protected])
* @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Kiwi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Kiwi.h
* @author bab2min ([email protected])
* @brief Kiwi C++ API를 담고 있는 헤더 파일
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
4 changes: 2 additions & 2 deletions include/kiwi/Macro.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#define KIWI_STR(x) KIWI_STR_HELPER(x)

#define KIWI_VERSION_MAJOR 0
#define KIWI_VERSION_MINOR 19
#define KIWI_VERSION_PATCH 1
#define KIWI_VERSION_MINOR 20
#define KIWI_VERSION_PATCH 0

#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." KIWI_STR(KIWI_VERSION_PATCH)
2 changes: 1 addition & 1 deletion include/kiwi/SwTokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file SwTokenizer.h
* @author bab2min ([email protected])
* @brief Subword Tokenizer
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Types.h
* @author bab2min ([email protected])
* @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/TypoTransformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file TypoTransformer.h
* @author bab2min ([email protected])
* @brief 오타 교정에 사용되는 TypoTransformer 및 관련 클래스들을 정의합니다.
* @version 0.19.0
* @version 0.20.0
* @date 2024-09-15
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file capi.h
* @author bab2min ([email protected])
* @brief Kiwi C API를 담고 있는 헤더 파일
* @version 0.19.0
* @version 0.20.0
* @date 2024-07-01
*
*
Expand Down
4 changes: 2 additions & 2 deletions models/base/sj.morph
Git LFS file not shown
1 change: 1 addition & 0 deletions src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,7 @@ namespace kiwi
false,
!!(matchOptions & Match::splitComplex),
!!(matchOptions & Match::splitSaisiot),
!!(matchOptions & Match::mergeSaisiot),
blocklist
);
insertPathIntoResults(ret, spStatesByRet, res, topN, matchOptions, integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);
Expand Down
46 changes: 29 additions & 17 deletions src/PathEvaluator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ namespace kiwi
bool openEnd,
bool splitComplex = false,
bool splitSaisiot = false,
bool mergeSaisiot = false,
const std::unordered_set<const Morpheme*>* blocklist = nullptr
);

Expand All @@ -136,6 +137,7 @@ namespace kiwi
const Vector<SpecialState>& prevSpStates,
bool splitComplex = false,
bool splitSaisiot = false,
bool mergeSaisiot = false,
const std::unordered_set<const Morpheme*>* blocklist = nullptr
);

Expand Down Expand Up @@ -525,7 +527,7 @@ namespace kiwi

// fill the rest information of resultOut
newPath.wid = lastSeqId;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
newPath.combineSocket = curMorph->combineSocket;
newPath.ownFormId = ownFormId;
Expand Down Expand Up @@ -570,7 +572,7 @@ namespace kiwi

// fill the rest information of resultOut
newPath.wid = lastSeqId;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
newPath.combineSocket = curMorph->combineSocket;
newPath.ownFormId = ownFormId;
Expand Down Expand Up @@ -622,7 +624,7 @@ namespace kiwi

// fill the rest information of resultOut
newPath.wid = lastSeqId;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
newPath.combineSocket = curMorph->combineSocket;
newPath.ownFormId = ownFormId;
Expand Down Expand Up @@ -659,7 +661,7 @@ namespace kiwi

const Morpheme* lastMorph;
Wid firstWid;
if (curMorph->chunks.empty() || curMorph->complex)
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
{
lastMorph = curMorph->getCombined() ? curMorph->getCombined() : curMorph;
firstWid = curMorph->lmMorphemeId;
Expand Down Expand Up @@ -691,8 +693,10 @@ namespace kiwi
{
for (auto& prevPath : cache[prev - startNode])
{
// 사이시옷 뒤에 명사가 아닌 태그가 오는 경우 제외
if (prevPath.morpheme->tag == POSTag::z_siot && !isNNClass(curMorph->tag))
// 사이시옷 뒤에 명사가 아닌 태그가 오거나 공백이 있는 경우 제외
if (prevPath.morpheme->tag == POSTag::z_siot && (
!isNNClass(curMorph->tag) || prev->endPos < node->startPos
))
{
continue;
}
Expand All @@ -701,7 +705,7 @@ namespace kiwi
if (prevPath.combineSocket)
{
// merge <v> <chunk> with only the same socket
if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex))
if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
continue;
}
Expand Down Expand Up @@ -747,7 +751,7 @@ namespace kiwi
}

auto cLmState = prevPath.lmState;
if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex))
if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
// no-op
}
Expand All @@ -760,7 +764,7 @@ namespace kiwi
}
float ll = cLmState.next(langMdl, firstWid);
candScore += ll;
if (!(curMorph->chunks.empty() || curMorph->complex))
if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
for (size_t i = 1; i < curMorph->chunks.size(); ++i)
{
Expand Down Expand Up @@ -833,6 +837,7 @@ namespace kiwi
const Vector<SpecialState>& prevSpStates,
bool splitComplex,
bool splitSaisiot,
bool mergeSaisiot,
const std::unordered_set<const Morpheme*>* blocklist
)
{
Expand Down Expand Up @@ -893,6 +898,11 @@ namespace kiwi
// 사이시옷(zSiot)을 위한 지름길
if (curMorph->tag == POSTag::z_siot)
{
if (!(splitSaisiot || mergeSaisiot))
{
continue;
}

for (auto* prev = node->getPrev(); prev; prev = prev->getSibling())
{
for (auto& p : cache[prev - startNode])
Expand All @@ -912,7 +922,7 @@ namespace kiwi
}

// if the morpheme has chunk set
if (!(curMorph->chunks.empty()|| curMorph->complex))
if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
{
// '하다/하게/하지'가 '다/게/지'로 축약된 경우인데 앞에 공백이 있는 경우는 탐색후보에서 제외
if (node->prev && node[-(int)node->prev].endPos < node->startPos
Expand Down Expand Up @@ -1019,13 +1029,13 @@ namespace kiwi
float scoreDiff = cur->accScore - prev->accScore;
float typoCostDiff = cur->accTypoCost - prev->accTypoCost;
auto morpheme = cur->morpheme;
size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex) ? 1 : morpheme->chunks.size();
const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size();
auto& gNode = graph[csearcher(cur)];
scoreDiff += typoCostDiff * typoCostWeight;
scoreDiff /= numNewTokens;
typoCostDiff /= numNewTokens;

if (morpheme->chunks.empty() || morpheme->complex)
if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
{
ret.emplace_back(
unifyMorpheme(morpheme),
Expand Down Expand Up @@ -1093,6 +1103,7 @@ namespace kiwi
bool openEnd,
bool splitComplex,
bool splitSaisiot,
bool mergeSaisiot,
const std::unordered_set<const Morpheme*>* blocklist
)
{
Expand Down Expand Up @@ -1148,24 +1159,24 @@ namespace kiwi
{
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, node->form->candidate,
false, uniqStates, splitComplex, splitSaisiot, blocklist);
false, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
if (all_of(node->form->candidate.begin(), node->form->candidate.end(), [](const Morpheme* m)
{
return m->combineSocket || (!m->chunks.empty() && !m->complex);
return m->combineSocket || !(m->chunks.empty() || m->complex || m->saisiot);
}))
{
ownFormList.emplace_back(node->form->form);
ownFormId = ownFormList.size();
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, unknownNodeLCands,
true, uniqStates, splitComplex, splitSaisiot, blocklist);
true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
};
}
else
{
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, unknownNodeCands,
true, uniqStates, splitComplex, splitSaisiot, blocklist);
true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
}

#ifdef DEBUG_PRINT
Expand All @@ -1186,13 +1197,14 @@ namespace kiwi
for (auto& p : cache[prev - startNode])
{
if (p.combineSocket) continue;
if (!p.morpheme->chunks.empty() && !p.morpheme->complex)
if (!(p.morpheme->chunks.empty() || p.morpheme->complex || p.morpheme->saisiot))
{
if (p.morpheme->chunks.size() <= (p.morpheme->combineSocket ? 2 : 1))
{
if (!FeatureTestor::isMatched(nullptr, p.morpheme->vowel)) continue;
}
}
if (p.morpheme->tag == POSTag::z_siot) continue;

float c = p.accScore + (openEnd ? 0 : p.lmState.next(kw->langMdl, eosId));
if (p.spState.singleQuote) c -= 2;
Expand Down
11 changes: 11 additions & 0 deletions test/test_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,13 +994,24 @@ TEST(KiwiCpp, ZSiot)
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
EXPECT_EQ(resSplit.first.size(), 3);
EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
EXPECT_EQ(resSplit.first[1].tag, POSTag::z_siot);
EXPECT_EQ(resSplit.first[2].tag, POSTag::nng);
EXPECT_EQ(resMerge.first.size(), 1);
EXPECT_EQ(resMerge.first[0].tag, POSTag::nng);
}

for (auto s : {u"발렛 파킹", u"미닛"})
{
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
EXPECT_EQ(resNone.second, resSplit.second);
EXPECT_EQ(resNone.second, resMerge.second);
EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
}
}

TEST(KiwiCpp, AnalyzeWithWordPosition)
Expand Down

0 comments on commit a9ee8c3

Please sign in to comment.