Skip to content

Commit

Permalink
Merge pull request #165 from bab2min/dev_script_type
Browse files Browse the repository at this point in the history
Add `ScriptType`
  • Loading branch information
bab2min authored May 15, 2024
2 parents 95f1e24 + 230188e commit 0db05c4
Show file tree
Hide file tree
Showing 8 changed files with 1,087 additions and 5 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ set ( CORE_SRCS
src/KTrie.cpp
src/PatternMatcher.cpp
src/search.cpp
src/ScriptType.cpp
src/SwTokenizer.cpp
src/TagUtils.cpp
src/TypoTransformer.cpp
Expand Down
245 changes: 245 additions & 0 deletions include/kiwi/ScriptType.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
#pragma once
#include <cstdint>

namespace kiwi
{
/**
 * Script / character-category label assigned to a single code point
 * (see chr2ScriptType, which maps a char32_t to one of these values).
 *
 * The enumerator names appear to follow Unicode block and script names;
 * NOTE(review): confirm the exact code point ranges against the
 * implementation of chr2ScriptType.
 *
 * The underlying type is uint8_t, so a value occupies one byte. Enumerator
 * values are implicit and sequential starting from `unknown` (0); inserting
 * or reordering enumerators therefore changes the numeric value of every
 * enumerator after the change, and the total must stay within 256 entries.
 */
enum class ScriptType : uint8_t
{
unknown,
latin,
ipa_extensions,
spacing_modifier_letters,
combining_diacritical_marks,
greek_and_coptic,
cyrillic,
armenian,
hebrew,
arabic,
syriac,
thaana,
nko,
samaritan,
mandaic,
devanagari,
bengali,
gurmukhi,
gujarati,
oriya,
tamil,
telugu,
kannada,
malayalam,
sinhala,
thai,
lao,
tibetan,
myanmar,
georgian,
hangul,
ethiopic,
cherokee,
unified_canadian_aboriginal_syllabics,
ogham,
runic,
tagalog,
hanunoo,
buhid,
tagbanwa,
khmer,
mongolian,
limbu,
tai_le,
new_tai_lue,
khmer_symbols,
buginese,
tai_tham,
balinese,
sundanese,
batak,
lepcha,
ol_chiki,
phonetic_extensions,
punctuation,
superscripts_and_subscripts,
currency_symbols,
combining_diacritical_marks_for_symbols,
letterlike_symbols,
number_forms,
arrows,
mathematical,
miscellaneous_technical,
control_pictures,
optical_character_recognition,
enclosed_alphanumerics,
box_drawing,
block_elements,
geometric_shapes,
miscellaneous_symbols,
dingbats,
braille_patterns,
glagolitic,
tifinagh,
hanja,
ideographic_description_characters,
kana,
bopomofo,
kanbun,
yijing_hexagram_symbols,
yi,
lisu,
vai,
bamum,
modifier_tone_letters,
syloti_nagri,
common_indic_number_forms,
phags_pa,
saurashtra,
kayah_li,
rejang,
javanese,
cham,
tai_viet,
meetei_mayek,
private_use_area,
alphabetic_presentation_forms,
arabic_presentation_forms_a,
variation_selectors,
vertical_forms,
combining_half_marks,
small_form_variants,
arabic_presentation_forms_b,
halfwidth_and_fullwidth_forms,
specials,
linear_b,
aegean_numbers,
ancient_greek_numbers,
ancient_symbols,
phaistos_disc,
lycian,
carian,
coptic_epact_numbers,
old_italic,
gothic,
old_permic,
ugaritic,
old_persian,
deseret,
shavian,
osmanya,
osage,
elbasan,
caucasian_albanian,
vithkuqi,
linear_a,
cypriot_syllabary,
imperial_aramaic,
palmyrene,
nabataean,
hatran,
phoenician,
lydian,
meroitic_hieroglyphs,
meroitic_cursive,
kharoshthi,
old_south_arabian,
old_north_arabian,
manichaean,
avestan,
inscriptional_parthian,
inscriptional_pahlavi,
psalter_pahlavi,
old_turkic,
old_hungarian,
hanifi_rohingya,
rumi_numeral_symbols,
yezidi,
old_sogdian,
sogdian,
old_uyghur,
chorasmian,
elymaic,
brahmi,
kaithi,
sora_sompeng,
chakma,
mahajani,
sharada,
sinhala_archaic_numbers,
khojki,
multani,
khudawadi,
grantha,
newa,
tirhuta,
siddham,
modi,
takri,
ahom,
dogra,
warang_citi,
dives_akuru,
nandinagari,
zanabazar_square,
soyombo,
pau_cin_hau,
bhaiksuki,
marchen,
masaram_gondi,
gunjala_gondi,
makasar,
kawi,
cuneiform,
early_dynastic_cuneiform,
cypro_minoan,
egyptian_hieroglyphs,
anatolian_hieroglyphs,
mro,
tangsa,
bassa_vah,
pahawh_hmong,
medefaidrin,
miao,
ideographic_symbols_and_punctuation,
tangut,
khitan_small_script,
nushu,
duployan,
shorthand_format_controls,
znamenny_musical_notation,
byzantine_musical_symbols,
musical_symbols,
ancient_greek_musical_notation,
kaktovik_numerals,
mayan_numerals,
tai_xuan_jing_symbols,
counting_rod_numerals,
mathematical_alphanumeric_symbols,
sutton_signwriting,
nyiakeng_puachue_hmong,
toto,
wancho,
nag_mundari,
mende_kikakui,
adlam,
indic_siyaq_numbers,
ottoman_siyaq_numbers,
arabic_mathematical_alphabetic_symbols,
mahjong_tiles,
domino_tiles,
playing_cards,
enclosed_ideographic_supplement,
symbols_and_pictographs,
emoticons,
transport_and_map_symbols,
alchemical_symbols,
chess_symbols,
symbols_for_legacy_computing,
tags,
};

/**
 * Maps a single code point to its ScriptType.
 *
 * @param c code point to classify
 * @return the ScriptType for `c`
 *         (NOTE(review): presumably ScriptType::unknown for code points
 *         outside every known range — confirm in the implementation)
 */
ScriptType chr2ScriptType(char32_t c);

/**
 * Returns a name for the given ScriptType as a C string.
 *
 * @param type script type to look up
 * @return pointer to the name
 *         (NOTE(review): lifetime/ownership is not visible from this header —
 *         presumably a string with static storage; confirm before storing it)
 */
const char* getScriptName(ScriptType type);

/**
 * Reports whether the given code point(s) are an emoji.
 *
 * @param c0 first code point
 * @param c1 optional second code point; defaults to 0
 *           (NOTE(review): presumably 0 means "no second code point" — confirm)
 * @return true if the input is recognized as an emoji
 */
bool isEmoji(char32_t c0, char32_t c1 = 0);
}
6 changes: 5 additions & 1 deletion include/kiwi/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#endif

#include "TemplateUtils.hpp"
#include "ScriptType.h"

#define KIWI_DEFINE_ENUM_FLAG_OPERATORS(Type) \
inline Type operator~(Type a)\
Expand Down Expand Up @@ -306,7 +307,10 @@ namespace kiwi
uint32_t lineNumber = 0; /**< 줄 번호*/
uint16_t length = 0; /**< 길이(UTF16 문자 기준) */
POSTag tag = POSTag::unknown; /**< 품사 태그 */
uint8_t senseId = 0; /**< 의미 번호 */
/* Anonymous union: `senseId` and `script` are both one byte wide and share
 * the same storage, so writing one of them overwrites the other. */
union {
uint8_t senseId = 0; /**< sense number */
ScriptType script; /**< character type based on the Unicode range */
};
float score = 0; /**< 해당 형태소의 언어모델 점수 */
float typoCost = 0; /**< 오타가 교정된 경우 오타 비용. 그렇지 않은 경우 0 */
uint32_t typoFormId = 0; /**< 교정 전 오타의 형태에 대한 정보 (typoCost가 0인 경우 PreTokenizedSpan의 ID값) */
Expand Down
25 changes: 24 additions & 1 deletion src/KTrie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,16 @@ namespace kiwi
}
}

/**
 * Tells whether the current character starts a new run relative to the
 * previous one.
 *
 * When both tags are among sl / sh / sw, the tags themselves are ignored and
 * the decision is made purely on the script: the run breaks exactly when the
 * scripts differ. In every other case the run breaks exactly when the tags
 * differ.
 *
 * @param prevTag    tag of the previous character
 * @param curTag     tag of the current character
 * @param prevScript script of the previous character
 * @param curScript  script of the current character
 * @return true if the two characters must not be merged into one run
 */
inline bool isDiscontinuous(POSTag prevTag, POSTag curTag, ScriptType prevScript, ScriptType curScript)
{
	// Tags whose runs are delimited by script rather than by the tag itself.
	const auto comparedByScript = [](POSTag tag)
	{
		return tag == POSTag::sl || tag == POSTag::sh || tag == POSTag::sw;
	};

	if (!comparedByScript(prevTag) || !comparedByScript(curTag))
	{
		return prevTag != curTag;
	}
	return prevScript != curScript;
}

template<ArchType arch, bool typoTolerant, bool continualTypoTolerant>
size_t kiwi::splitByTrie(
Vector<KGraphNode>& ret,
Expand Down Expand Up @@ -585,6 +595,7 @@ size_t kiwi::splitByTrie(

size_t lastSpecialEndPos = 0, specialStartPos = 0;
POSTag chrType, lastChrType = POSTag::unknown, lastMatchedPattern = POSTag::unknown;
ScriptType scriptType, lastScriptType = ScriptType::unknown;
auto flushBranch = [&](size_t unkFormEndPos = 0, size_t unkFormEndPosWithSpace = 0, bool specialMatched = false)
{
if (!candidates.empty())
Expand Down Expand Up @@ -836,8 +847,17 @@ size_t kiwi::splitByTrie(
}

chrType = identifySpecialChr(c32);
scriptType = chr2ScriptType(c32);
if (lastChrType == POSTag::sw &&
(c32 == 0x200d || // zero width joiner
(0x1f3fb <= c32 && c32 <= 0x1f3ff) || // skin color modifier
scriptType == ScriptType::variation_selectors)) // variation selectors
{
chrType = lastChrType;
scriptType = lastScriptType;
}

if (lastChrType != chrType || lastChrType == POSTag::sso || lastChrType == POSTag::ssc)
if (isDiscontinuous(lastChrType, chrType, lastScriptType, scriptType) || lastChrType == POSTag::sso || lastChrType == POSTag::ssc)
{
// sequence of special characters found
if (lastChrType != POSTag::max && lastChrType != POSTag::unknown && lastChrType != lastMatchedPattern)
Expand Down Expand Up @@ -875,13 +895,15 @@ size_t kiwi::splitByTrie(
if (!isSpace(str[n - 3]) && !isSpace(str[n - 2]))
{
lastChrType = chrType;
lastScriptType = scriptType;
break;
}
}
// 혹은 공백 문자가 아예 없는 경우 너무 길어지는 것을 방지하기 위해 강제로 중단
else if (n >= 8192)
{
lastChrType = chrType;
lastScriptType = scriptType;
break;
}

Expand Down Expand Up @@ -1021,6 +1043,7 @@ size_t kiwi::splitByTrie(
}
continueFor:
lastChrType = chrType;
lastScriptType = scriptType;
}

// sequence of special characters found
Expand Down
19 changes: 19 additions & 0 deletions src/Kiwi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,23 @@ namespace kiwi
return ++first;
}

/**
 * Sets `info.script` from the first code point of `info.str`, and retags the
 * token as sl when that script is latin.
 *
 * The token is left untouched when:
 *  - its tag is not one of sl / sh / sw,
 *  - its morpheme has a non-empty `kform`, or
 *  - `info.str` is empty.
 *
 * `info.str` is read as UTF-16: a leading high surrogate is combined with the
 * following code unit before classification.
 *
 * @param info token to update in place (`script`, and possibly `tag`)
 */
inline void updateTokenInfoScript(TokenInfo& info)
{
	if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw)) return;
	if ((info.morph && info.morph->kform && !info.morph->kform->empty())) return;
	if (info.str.empty()) return;
	char32_t c = info.str[0];
	// Merge only when a second code unit is actually present. An unpaired
	// high surrogate at the end of the string would otherwise be combined
	// with whatever lies at index 1 and classified as a bogus code point.
	if (isHighSurrogate(c) && info.str.size() > 1)
	{
		c = mergeSurrogate(c, info.str[1]);
	}
	info.script = chr2ScriptType(c);
	if (info.script == ScriptType::latin)
	{
		info.tag = POSTag::sl;
	}
}

inline void insertPathIntoResults(
vector<TokenResult>& ret,
Vector<SpecialState>& spStatesByRet,
Expand Down Expand Up @@ -718,6 +735,8 @@ namespace kiwi
token.score = s.wordScore;
token.typoCost = s.typoCost;
token.typoFormId = s.typoFormId;
token.senseId = s.morph->senseId;
updateTokenInfoScript(token);
auto ptId = nodeInWhichPretokenized[s.nodeId] + 1;
if (ptId)
{
Expand Down
Loading

0 comments on commit 0db05c4

Please sign in to comment.