Skip to content

Commit

Permalink
Remove hardcoded references in segmenter datagen. (#4298)
Browse files Browse the repository at this point in the history
clean up datagen for segmenter to use icu_properties.
  • Loading branch information
makotokato authored Nov 15, 2023
1 parent 3e431a5 commit 666ef8a
Showing 1 changed file with 26 additions and 125 deletions.
151 changes: 26 additions & 125 deletions provider/datagen/src/transform/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,19 @@ fn generate_rule_break_data(
let data = sets::load_extended_pictographic(provider).expect("The data should be valid!");
let extended_pictographic = data.as_borrowed();

// Load the name-to-enum mappers for each break property. They are used further down to
// resolve property names into enum values via `get_loose`.
//
// Each `data` binding is shadowed rather than dropped, so the borrowed views taken from it
// stay valid for the rest of the enclosing scope.

// Grapheme_Cluster_Break names.
let data =
    GraphemeClusterBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
let gcb_name_to_enum = data.as_borrowed();

// Line_Break names.
let data = LineBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
let lb_name_to_enum = data.as_borrowed();

// Sentence_Break names.
let data = SentenceBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
let sb_name_to_enum = data.as_borrowed();

// Word_Break names.
let data = WordBreak::get_name_to_enum_mapper(provider).expect("The data should be valid!");
let wb_name_to_enum = data.as_borrowed();

fn set_break_state(
break_state_table: &mut [i8],
property_length: usize,
Expand All @@ -133,126 +146,6 @@ fn generate_rule_break_data(
properties_names.iter().position(|n| n.eq(s))
}

/// Maps a word-break property value name to the corresponding [`WordBreak`] value.
///
/// Only the exact, case-sensitive spellings listed in the match arms are accepted.
///
/// # Panics
///
/// Panics if `name` is not one of the listed names. The panic message includes the
/// offending name.
fn get_word_segmenter_value_from_name(name: &str) -> WordBreak {
    match name {
        "ALetter" => WordBreak::ALetter,
        "CR" => WordBreak::CR,
        "Double_Quote" => WordBreak::DoubleQuote,
        "Extend" => WordBreak::Extend,
        "ExtendNumLet" => WordBreak::ExtendNumLet,
        "Format" => WordBreak::Format,
        "Katakana" => WordBreak::Katakana,
        "Hebrew_Letter" => WordBreak::HebrewLetter,
        "LF" => WordBreak::LF,
        "MidLetter" => WordBreak::MidLetter,
        "MidNum" => WordBreak::MidNum,
        "MidNumLet" => WordBreak::MidNumLet,
        "Newline" => WordBreak::Newline,
        "Numeric" => WordBreak::Numeric,
        "Regional_Indicator" => WordBreak::RegionalIndicator,
        "Single_Quote" => WordBreak::SingleQuote,
        "WSegSpace" => WordBreak::WSegSpace,
        "ZWJ" => WordBreak::ZWJ,
        _ => {
            // Report the unrecognized name so the bad input can be identified.
            panic!("Invalid property name: {name}")
        }
    }
}

/// Maps a grapheme-cluster-break property value name to the corresponding
/// [`GraphemeClusterBreak`] value.
///
/// Only the exact, case-sensitive spellings listed in the match arms are accepted.
///
/// # Panics
///
/// Panics if `name` is not one of the listed names. The panic message includes the
/// offending name.
fn get_grapheme_segmenter_value_from_name(name: &str) -> GraphemeClusterBreak {
    match name {
        "Control" => GraphemeClusterBreak::Control,
        "CR" => GraphemeClusterBreak::CR,
        "Extend" => GraphemeClusterBreak::Extend,
        "L" => GraphemeClusterBreak::L,
        "LF" => GraphemeClusterBreak::LF,
        "LV" => GraphemeClusterBreak::LV,
        "LVT" => GraphemeClusterBreak::LVT,
        "Prepend" => GraphemeClusterBreak::Prepend,
        "Regional_Indicator" => GraphemeClusterBreak::RegionalIndicator,
        "SpacingMark" => GraphemeClusterBreak::SpacingMark,
        "T" => GraphemeClusterBreak::T,
        "V" => GraphemeClusterBreak::V,
        "ZWJ" => GraphemeClusterBreak::ZWJ,
        _ => {
            // Report the unrecognized name so the bad input can be identified.
            panic!("Invalid property name: {name}")
        }
    }
}

/// Maps a sentence-break property value name to the corresponding [`SentenceBreak`] value.
///
/// Only the exact, case-sensitive spellings listed in the match arms are accepted.
///
/// # Panics
///
/// Panics if `name` is not one of the listed names. The panic message includes the
/// offending name.
fn get_sentence_segmenter_value_from_name(name: &str) -> SentenceBreak {
    match name {
        "ATerm" => SentenceBreak::ATerm,
        "Close" => SentenceBreak::Close,
        "CR" => SentenceBreak::CR,
        "Extend" => SentenceBreak::Extend,
        "Format" => SentenceBreak::Format,
        "LF" => SentenceBreak::LF,
        "Lower" => SentenceBreak::Lower,
        "Numeric" => SentenceBreak::Numeric,
        "OLetter" => SentenceBreak::OLetter,
        "SContinue" => SentenceBreak::SContinue,
        "Sep" => SentenceBreak::Sep,
        "Sp" => SentenceBreak::Sp,
        "STerm" => SentenceBreak::STerm,
        "Upper" => SentenceBreak::Upper,
        _ => {
            // Report the unrecognized name so the bad input can be identified.
            panic!("Invalid property name: {name}")
        }
    }
}

/// Resolves a line-break property value abbreviation (for example `"AL"` or `"ZWJ"`)
/// to the corresponding [`LineBreak`] value.
///
/// Only the exact, case-sensitive abbreviations in the table below are accepted.
///
/// # Panics
///
/// Panics if `name` does not appear in the table.
fn get_line_segmenter_value_from_name(name: &str) -> LineBreak {
    // (abbreviation, value) pairs; abbreviations are unique, so scan order is irrelevant.
    let abbreviation_table = [
        ("AI", LineBreak::Ambiguous),
        ("AL", LineBreak::Alphabetic),
        ("B2", LineBreak::BreakBoth),
        ("BA", LineBreak::BreakAfter),
        ("BB", LineBreak::BreakBefore),
        ("BK", LineBreak::MandatoryBreak),
        ("CB", LineBreak::ContingentBreak),
        ("CJ", LineBreak::ConditionalJapaneseStarter),
        ("CL", LineBreak::ClosePunctuation),
        ("CM", LineBreak::CombiningMark),
        ("CP", LineBreak::CloseParenthesis),
        ("CR", LineBreak::CarriageReturn),
        ("EB", LineBreak::EBase),
        ("EM", LineBreak::EModifier),
        ("EX", LineBreak::Exclamation),
        ("GL", LineBreak::Glue),
        ("H2", LineBreak::H2),
        ("H3", LineBreak::H3),
        ("HL", LineBreak::HebrewLetter),
        ("HY", LineBreak::Hyphen),
        ("ID", LineBreak::Ideographic),
        ("IN", LineBreak::Inseparable),
        ("IS", LineBreak::InfixNumeric),
        ("JL", LineBreak::JL),
        ("JT", LineBreak::JT),
        ("JV", LineBreak::JV),
        ("LF", LineBreak::LineFeed),
        ("NL", LineBreak::NextLine),
        ("NS", LineBreak::Nonstarter),
        ("NU", LineBreak::Numeric),
        ("OP", LineBreak::OpenPunctuation),
        ("PO", LineBreak::PostfixNumeric),
        ("PR", LineBreak::PrefixNumeric),
        ("QU", LineBreak::Quotation),
        ("RI", LineBreak::RegionalIndicator),
        ("SA", LineBreak::ComplexContext),
        ("SG", LineBreak::Surrogate),
        ("SP", LineBreak::Space),
        ("SY", LineBreak::BreakSymbols),
        ("WJ", LineBreak::WordJoiner),
        ("XX", LineBreak::Unknown),
        ("ZW", LineBreak::ZWSpace),
        ("ZWJ", LineBreak::ZWJ),
    ];

    // Iterate the array by value so the matching entry can be returned directly.
    for (abbreviation, value) in abbreviation_table {
        if abbreviation == name {
            return value;
        }
    }

    panic!("Invalid property name: {name}")
}

fn is_cjk_fullwidth(
eaw: maps::CodePointMapDataBorrowed<EastAsianWidth>,
codepoint: u32,
Expand Down Expand Up @@ -323,19 +216,21 @@ fn generate_rule_break_data(
Script::Han | Script::Hiragana => {
properties_map[c as usize] = property_index;
}

_ => {}
}
}
}

continue;
}

// TODO(#2239):
// How should Katakana be handled in UAX #29? UAX #29 defines a Katakana rule, but the CJ dictionary has different rules.
// Katakana will use the UAX #29 rules instead of the dictionary.

let prop = get_word_segmenter_value_from_name(&p.name);
let prop = wb_name_to_enum
.get_loose(&p.name)
.expect("property name should be valid!");
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
if wb.get32(c) == prop {
properties_map[c as usize] = property_index;
Expand All @@ -358,7 +253,9 @@ fn generate_rule_break_data(
continue;
}

let prop = get_grapheme_segmenter_value_from_name(&p.name);
let prop = gcb_name_to_enum
.get_loose(&p.name)
.expect("property name should be valid!");
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
if gb.get32(c) == prop {
properties_map[c as usize] = property_index;
Expand All @@ -368,7 +265,9 @@ fn generate_rule_break_data(
}

"sentence" => {
let prop = get_sentence_segmenter_value_from_name(&p.name);
let prop = sb_name_to_enum
.get_loose(&p.name)
.expect("property name should be valid!");
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
if sb.get32(c) == prop {
properties_map[c as usize] = property_index;
Expand Down Expand Up @@ -442,7 +341,9 @@ fn generate_rule_break_data(
continue;
}

let prop = get_line_segmenter_value_from_name(&p.name);
let prop = lb_name_to_enum
.get_loose(&p.name)
.expect("property name should be valid!");
for c in 0..(CODEPOINT_TABLE_LEN as u32) {
if lb.get32(c) == prop {
properties_map[c as usize] = property_index;
Expand Down

0 comments on commit 666ef8a

Please sign in to comment.