Skip to content

Commit

Permalink
Merge pull request #157 from bab2min/dev_0170
Browse files (browse the repository at this point in the history)
Dev 0.17.0
  • Loading branch information
bab2min authored Mar 9, 2024
2 parents 7c337b6 + a0795c1 commit 87bd2b4
Show file tree
Hide file tree
Showing 11 changed files with 252,277 additions and 274,608 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.12)

project(kiwi VERSION 0.16.1 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
project(kiwi VERSION 0.17.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")

set ( CMAKE_CXX_STANDARD 14 )
set ( CMAKE_VERBOSE_MAKEFILE true )
Expand Down
526,825 changes: 252,228 additions & 274,597 deletions ModelGenerator/multi.dict

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bindings/java/kr/pe/bab2min/Kiwi.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

public class Kiwi implements AutoCloseable {
private long _inst;
final private static String _version = "0.16.1";
final private static String _version = "0.17.0";

public static class Match {
final static public int none = 0,
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Form.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Form.h
* @author bab2min ([email protected])
* @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더
* @version 0.16.1
* @version 0.17.0
* @date 2022-09-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Kiwi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Kiwi.h
* @author bab2min ([email protected])
* @brief Kiwi C++ API를 담고 있는 헤더 파일
* @version 0.16.1
* @version 0.17.0
* @date 2022-09-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file Types.h
* @author bab2min ([email protected])
* @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일
* @version 0.16.1
* @version 0.17.0
* @date 2022-09-01
*
*
Expand Down
16 changes: 16 additions & 0 deletions include/kiwi/Utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,23 @@ namespace kiwi
case u'\r':
case u'\t':
case u'\v':
case u'\xA0':
case u'\u1680':
case u'\u2000':
case u'\u2001':
case u'\u2002':
case u'\u2003':
case u'\u2004':
case u'\u2005':
case u'\u2006':
case u'\u2007':
case u'\u2008':
case u'\u2009':
case u'\u200A':
case u'\u202F':
case u'\u205F':
case u'\u2800':
case u'\u3000':
return true;
}
return false;
Expand Down
2 changes: 1 addition & 1 deletion include/kiwi/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file capi.h
* @author bab2min ([email protected])
* @brief Kiwi C API를 담고 있는 헤더 파일
* @version 0.16.1
* @version 0.17.0
* @date 2022-09-01
*
*
Expand Down
2 changes: 1 addition & 1 deletion src/Combiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -779,7 +779,7 @@ void RuleSet::loadRules(istream& istr)
while (getline(istr, line))
{
if (line[0] == '#') continue;
while (!line.empty() && isSpace(line.back())) line.pop_back();
while (!line.empty() && line.back() < 0x80 && isSpace(line.back())) line.pop_back();
if (line.empty()) continue;

auto fields = split(line, '\t');
Expand Down
7 changes: 6 additions & 1 deletion src/PatternMatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,15 @@ size_t PatternMatcherImpl::testSerial(const char16_t* first, const char16_t* las
{
++b;
if (b != last && *b == ' ') ++b;
if (b == last || !isDigit(*b)) return b - first;
if (b == last || !isDigit(*b))
{
if (b[-1] == ' ') --b;
return b - first;
}
++b;
while (b != last && isDigit(*b)) ++b;
}
if (b[-1] == ' ') --b;
return b - first;
}

Expand Down
23 changes: 20 additions & 3 deletions test/test_cpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -531,13 +531,13 @@ TEST(KiwiCpp, SpaceTolerant)
TEST(KiwiCpp, MultiWordDictionary)
{
auto& kiwi = reuseKiwiInstance();
const auto text = u"밀리언 달러 베이비랑 바람과 함께 사라지다랑 뭐가 더 재밌었어?";
const auto text = u"밀리언 달러 베이비랑 더 웨이 백 중 뭐가 더 재밌었어?";

auto res = kiwi.analyze(text, Match::allWithNormalizing).first;
EXPECT_EQ(res[0].str, u"밀리언 달러 베이비");
EXPECT_EQ(res[0].tag, POSTag::nnp);

EXPECT_EQ(res[2].str, u"바람과 함께 사라지다");
EXPECT_EQ(res[2].str, u"더 웨이 백");
EXPECT_EQ(res[2].tag, POSTag::nnp);

auto kiwi2 = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_ & ~BuildOption::loadMultiDict, }.build();
Expand All @@ -547,7 +547,7 @@ TEST(KiwiCpp, MultiWordDictionary)

TEST(KiwiCpp, WordsWithSpaces)
{
KiwiBuilder kw{ MODEL_PATH, 0, BuildOption::default_, };
KiwiBuilder kw{ MODEL_PATH, 0, BuildOption::default_ & ~BuildOption::loadMultiDict, };
EXPECT_TRUE(kw.addWord(u"대학생 선교회", POSTag::nnp, 0.0).second);
Kiwi kiwi = kw.build();

Expand Down Expand Up @@ -643,6 +643,22 @@ TEST(KiwiCpp, WordsWithSpaces)
EXPECT_EQ(res5.first[1].lineNumber, 2);
}

// Analyzes two space-separated compound phrases with Match::all and checks
// that no resulting token has a surface form that straddles the space
// between two adjacent words ("리 크리스마스", "리 빙").
// NOTE(review): presumably these are regressions for multi-word dictionary
// entries being matched across word boundaries — confirm against the
// multi.dict change in this commit.
TEST(KiwiCpp, MultiDict)
{
	Kiwi& kiwi = reuseKiwiInstance();

	// The tail of "로스터리" plus the head of "크리스마스에디션" must not
	// come out as a single token.
	const auto christmasTokens = kiwi.analyze(u"프렌치카페 로스터리 크리스마스에디션 인증샷", Match::all).first;
	for (const auto& token : christmasTokens)
	{
		EXPECT_NE(token.str, u"리 크리스마스");
	}

	// Likewise, no token may have the surface form "리 빙" for this input.
	const auto livingTokens = kiwi.analyze(u"추첨이벤트 2018년 리빙디자인페어 행사기간", Match::all).first;
	for (const auto& token : livingTokens)
	{
		EXPECT_NE(token.str, u"리 빙");
	}
}

TEST(KiwiCpp, Pattern)
{
Kiwi& kiwi = reuseKiwiInstance();
Expand Down Expand Up @@ -716,6 +732,7 @@ TEST(KiwiCpp, Pattern)
tokens = kiwi.analyze(u"2001. 01. 02. 에", Match::all).first;
EXPECT_EQ(tokens.size(), 2);
EXPECT_EQ(tokens[0].tag, POSTag::w_serial);
EXPECT_EQ(tokens[0].str.back(), u'.');

tokens = kiwi.analyze(u"010-1234-5678에", Match::all).first;
EXPECT_EQ(tokens.size(), 2);
Expand Down

0 comments on commit 87bd2b4

Please sign in to comment.