From 317c8afebca0c2fcc304762fc0d2130082750b2f Mon Sep 17 00:00:00 2001
From: Robert Bastian <4706271+robertbastian@users.noreply.github.com>
Date: Thu, 16 Nov 2023 18:16:11 +0100
Subject: [PATCH] Caching segmenter data (#4316)

---
 provider/datagen/src/provider.rs              | 74 ++-------------
 .../datagen/src/transform/segmenter/mod.rs    | 90 ++++++++++++++++++-
 2 files changed, 94 insertions(+), 70 deletions(-)

diff --git a/provider/datagen/src/provider.rs b/provider/datagen/src/provider.rs
index 4c3e8f576b6..7fb09fccc7c 100644
--- a/provider/datagen/src/provider.rs
+++ b/provider/datagen/src/provider.rs
@@ -279,70 +279,6 @@ impl DatagenProvider {
     ) -> Result<Vec<icu_locid::LanguageIdentifier>, DataError> {
         self.cldr()?.locales(levels)
     }
-
-    pub(crate) fn new_hardcoded_segmenter_data() -> Self {
-        Self {
-            source: SourceData {
-                trie_type: TrieType::Small,
-                icuexport_paths: Some(Arc::new(SerdeCache::new(AbstractFs::Memory(
-                    [
-                        (
-                            "uprops/small/ea.toml",
-                            include_bytes!("../data/segmenter/uprops/small/ea.toml").as_slice(),
-                        ),
-                        (
-                            "uprops/small/ExtPict.toml",
-                            include_bytes!("../data/segmenter/uprops/small/ExtPict.toml")
-                                .as_slice(),
-                        ),
-                        (
-                            "uprops/small/gc.toml",
-                            include_bytes!("../data/segmenter/uprops/small/gc.toml").as_slice(),
-                        ),
-                        (
-                            "uprops/small/GCB.toml",
-                            include_bytes!("../data/segmenter/uprops/small/GCB.toml").as_slice(),
-                        ),
-                        (
-                            "uprops/small/lb.toml",
-                            include_bytes!("../data/segmenter/uprops/small/lb.toml").as_slice(),
-                        ),
-                        (
-                            "uprops/small/SB.toml",
-                            include_bytes!("../data/segmenter/uprops/small/SB.toml").as_slice(),
-                        ),
-                        (
-                            "uprops/small/sc.toml",
-                            include_bytes!("../data/segmenter/uprops/small/sc.toml").as_slice(),
-                        ),
-                        (
-                            "uprops/small/WB.toml",
-                            include_bytes!("../data/segmenter/uprops/small/WB.toml").as_slice(),
-                        ),
-                        (
-                            "segmenter/grapheme.toml",
-                            include_bytes!("../data/segmenter/grapheme.toml").as_slice(),
-                        ),
-                        (
-                            "segmenter/line.toml",
-                            include_bytes!("../data/segmenter/line.toml").as_slice(),
-                        ),
-                        (
-                            "segmenter/sentence.toml",
-                            include_bytes!("../data/segmenter/sentence.toml").as_slice(),
-                        ),
-                        (
-                            "segmenter/word.toml",
-                            include_bytes!("../data/segmenter/word.toml").as_slice(),
-                        ),
-                    ]
-                    .into_iter()
-                    .collect(),
-                )))),
-                ..Self::new_custom().source
-            },
-        }
-    }
 }
 
 /// Specifies the trie type to use.
@@ -381,11 +317,11 @@ impl std::fmt::Display for TrieType {
 #[non_exhaustive]
 #[deprecated(since = "1.3.0", note = "use `DatagenProvider`")]
 pub struct SourceData {
-    cldr_paths: Option<Arc<CldrCache>>,
-    icuexport_paths: Option<Arc<SerdeCache>>,
-    segmenter_lstm_paths: Option<Arc<SerdeCache>>,
-    trie_type: TrieType,
-    collation_han_database: CollationHanDatabase,
+    pub(crate) cldr_paths: Option<Arc<CldrCache>>,
+    pub(crate) icuexport_paths: Option<Arc<SerdeCache>>,
+    pub(crate) segmenter_lstm_paths: Option<Arc<SerdeCache>>,
+    pub(crate) trie_type: TrieType,
+    pub(crate) collation_han_database: CollationHanDatabase,
     #[cfg(feature = "legacy_api")]
     // populated if constructed through `SourceData` constructor only
     pub(crate) icuexport_dictionary_fallback: Option<Arc<SerdeCache>>,
diff --git a/provider/datagen/src/transform/segmenter/mod.rs b/provider/datagen/src/transform/segmenter/mod.rs
index cef4b7fc01f..5e733dbc041 100644
--- a/provider/datagen/src/transform/segmenter/mod.rs
+++ b/provider/datagen/src/transform/segmenter/mod.rs
@@ -534,7 +534,7 @@ macro_rules! implement {
                 return {
                     self.check_req::<$marker>(req)?;
                     let data = generate_rule_break_data(
-                        &Self::new_hardcoded_segmenter_data(),
+                        &hardcoded_segmenter_provider(),
                         $rules,
                         self.trie_type(),
                     );
@@ -555,6 +555,94 @@ macro_rules! implement {
     }
 }
 
+/// Returns a provider backed by the segmenter property and rule files that are
+/// compiled into this crate with `include_bytes!`.
+///
+/// The provider is constructed on first use and kept in a process-wide singleton;
+/// each call hands out a clone of it.
+fn hardcoded_segmenter_provider() -> crate::DatagenProvider {
+    #![allow(deprecated)]
+    use crate::{
+        source::{AbstractFs, SerdeCache},
+        DatagenProvider, TrieType,
+    };
+    // Singleton so that all instantiations share the same cache.
+    static SINGLETON: once_cell::sync::OnceCell<DatagenProvider> = once_cell::sync::OnceCell::new();
+    SINGLETON
+        .get_or_init(|| {
+            let mut provider = DatagenProvider::new_custom();
+            // The only embedded property files are the `uprops/small/` ones, so pin
+            // the matching trie type instead of depending on the `new_custom()`
+            // default.
+            provider.source.trie_type = TrieType::Small;
+            provider.source.icuexport_paths =
+                Some(std::sync::Arc::new(SerdeCache::new(AbstractFs::Memory(
+                    [
+                        (
+                            "uprops/small/ea.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/ea.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "uprops/small/ExtPict.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/ExtPict.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "uprops/small/gc.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/gc.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "uprops/small/GCB.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/GCB.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "uprops/small/lb.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/lb.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "uprops/small/SB.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/SB.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "uprops/small/sc.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/sc.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "uprops/small/WB.toml",
+                            include_bytes!("../../../data/segmenter/uprops/small/WB.toml")
+                                .as_slice(),
+                        ),
+                        (
+                            "segmenter/grapheme.toml",
+                            include_bytes!("../../../data/segmenter/grapheme.toml").as_slice(),
+                        ),
+                        (
+                            "segmenter/line.toml",
+                            include_bytes!("../../../data/segmenter/line.toml").as_slice(),
+                        ),
+                        (
+                            "segmenter/sentence.toml",
+                            include_bytes!("../../../data/segmenter/sentence.toml").as_slice(),
+                        ),
+                        (
+                            "segmenter/word.toml",
+                            include_bytes!("../../../data/segmenter/word.toml").as_slice(),
+                        ),
+                    ]
+                    .into_iter()
+                    .collect(),
+                ))));
+            provider
+        })
+        .clone()
+}
+
 implement!(LineBreakDataV1Marker, "segmenter/line.toml");
 implement!(GraphemeClusterBreakDataV1Marker, "segmenter/grapheme.toml");
 implement!(WordBreakDataV1Marker, "segmenter/word.toml");