Skip to content

Commit

Permalink
Merge pull request #203 from bab2min/dev/fix_issue_189
Browse files Browse the repository at this point in the history
숫자가 종종 잘못된 문자로 매칭되는 버그 수정
  • Loading branch information
bab2min authored Nov 20, 2024
2 parents 979432f + 8a938f1 commit 8c346b3
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 9 deletions.
9 changes: 0 additions & 9 deletions src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -818,15 +818,6 @@ size_t kiwi::splitByTrie(
if (curNode->fail())
{
curNode = curNode->fail();
for (auto submatcher = curNode; submatcher; submatcher = submatcher->fail())
{
const Form* cand = submatcher->val(trie);
if (!cand) break;
else if (!trie.hasSubmatch(cand))
{
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break;
}
}
nextNode = curNode->template nextOpt<arch>(trie, str[n + i]);
}
else
Expand Down
48 changes: 48 additions & 0 deletions test/test_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,12 @@ TEST(KiwiCpp, ZCoda)
TEST(KiwiCpp, ZSiot)
{
Kiwi& kiwi = reuseKiwiInstance();

auto resSplit = kiwi.analyze(u"찰랑찰랑한 머릿결과 볼륨감", Match::allWithNormalizing | Match::splitSaisiot);
EXPECT_EQ(resSplit.first.size(), 8);
EXPECT_EQ(resSplit.first[3].str, u"머리");
EXPECT_EQ(resSplit.first[4].tag, POSTag::z_siot);
EXPECT_EQ(resSplit.first[5].str, u"");

for (auto s : {u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방"})
{
Expand All @@ -1014,6 +1020,35 @@ TEST(KiwiCpp, ZSiot)
}
}

// Checks saisiot (z_siot) handling on a Kiwi instance built with the
// basicTypoSetWithContinual default typo set.
//
// Part 1: for each listed compound word,
//   - analysis without any saisiot option must produce no z_siot token,
//   - Match::splitSaisiot must produce exactly nng + z_siot + nng,
//   - Match::mergeSaisiot must produce a single nng token.
// Part 2: for the listed loanword inputs, the saisiot options must not change
//   the `.second` member of the result and must not produce a z_siot token.
TEST(KiwiCpp, ZSiotWithTypo)
{
    Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build(getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual));

    // Shared predicate: true when a token is tagged as saisiot.
    const auto isZSiot = [](const TokenInfo& token) { return token.tag == POSTag::z_siot; };

    for (auto s : { u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방" })
    {
        auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
        auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
        auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
        EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), isZSiot));

        // Fatal assertion: the indexed checks below would read out of bounds
        // if the token count were smaller than expected.
        ASSERT_EQ(resSplit.first.size(), 3);
        EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
        EXPECT_EQ(resSplit.first[1].tag, POSTag::z_siot);
        EXPECT_EQ(resSplit.first[2].tag, POSTag::nng);

        // Fatal for the same reason: first[0] is accessed right after.
        ASSERT_EQ(resMerge.first.size(), 1);
        EXPECT_EQ(resMerge.first[0].tag, POSTag::nng);
    }

    for (auto s : { u"발렛 파킹", u"미닛" })
    {
        auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
        auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
        auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
        // NOTE(review): `.second` is presumably the analysis score - confirm
        // against the return type of Kiwi::analyze.
        EXPECT_EQ(resNone.second, resSplit.second);
        EXPECT_EQ(resNone.second, resMerge.second);
        EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), isZSiot));
    }
}

TEST(KiwiCpp, AnalyzeWithWordPosition)
{
std::u16string testSentence = u"나 정말 배불렄ㅋㅋ";
Expand Down Expand Up @@ -1609,3 +1644,16 @@ TEST(KiwiCpp, IssueP172_LengthError)
auto res = kiwi.analyze(text, Match::allWithNormalizing).first;
EXPECT_GT(res.size(), 0);
}

// Regression test for issue #189: analyzes a phrase containing a digit and
// checks the surface form of each of the five expected tokens, in particular
// that the digit is kept as the token "1".
TEST(KiwiCpp, IssueP189)
{
    Kiwi& kiwi = reuseKiwiInstance();
    auto res = kiwi.analyze(u"담아 1팩 무료", Match::allWithNormalizing).first;

    // Fatal assertion: res[0]..res[4] are indexed below, so a shorter result
    // must stop the test here instead of reading out of bounds.
    ASSERT_EQ(res.size(), 5);
    // NOTE(review): the empty expectations for res[0], res[1] and res[3] look
    // like single-syllable literals that were lost when this text was
    // extracted - confirm the intended values against the upstream test file.
    EXPECT_EQ(res[0].str, u"");
    EXPECT_EQ(res[1].str, u"");
    EXPECT_EQ(res[2].str, u"1");
    EXPECT_EQ(res[3].str, u"");
    EXPECT_EQ(res[4].str, u"무료");
}

0 comments on commit 8c346b3

Please sign in to comment.