Skip to content

Commit

Permalink
Caching segmenter data (#4316)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian authored Nov 16, 2023
1 parent 5df2e19 commit 317c8af
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 70 deletions.
74 changes: 5 additions & 69 deletions provider/datagen/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,70 +279,6 @@ impl DatagenProvider {
) -> Result<impl IntoIterator<Item = icu_locid::LanguageIdentifier>, DataError> {
self.cldr()?.locales(levels)
}

/// Builds a provider whose ICU export source is an in-memory filesystem
/// populated with the segmenter TOML files compiled into this crate via
/// `include_bytes!`.
///
/// The trie type is set to [`TrieType::Small`]; every other source setting
/// is taken from [`Self::new_custom`].
pub(crate) fn new_hardcoded_segmenter_data() -> Self {
    // (path, bytes) pairs handed to `AbstractFs::Memory`.
    let embedded_files = [
        (
            "uprops/small/ea.toml",
            include_bytes!("../data/segmenter/uprops/small/ea.toml").as_slice(),
        ),
        (
            "uprops/small/ExtPict.toml",
            include_bytes!("../data/segmenter/uprops/small/ExtPict.toml").as_slice(),
        ),
        (
            "uprops/small/gc.toml",
            include_bytes!("../data/segmenter/uprops/small/gc.toml").as_slice(),
        ),
        (
            "uprops/small/GCB.toml",
            include_bytes!("../data/segmenter/uprops/small/GCB.toml").as_slice(),
        ),
        (
            "uprops/small/lb.toml",
            include_bytes!("../data/segmenter/uprops/small/lb.toml").as_slice(),
        ),
        (
            "uprops/small/SB.toml",
            include_bytes!("../data/segmenter/uprops/small/SB.toml").as_slice(),
        ),
        (
            "uprops/small/sc.toml",
            include_bytes!("../data/segmenter/uprops/small/sc.toml").as_slice(),
        ),
        (
            "uprops/small/WB.toml",
            include_bytes!("../data/segmenter/uprops/small/WB.toml").as_slice(),
        ),
        (
            "segmenter/grapheme.toml",
            include_bytes!("../data/segmenter/grapheme.toml").as_slice(),
        ),
        (
            "segmenter/line.toml",
            include_bytes!("../data/segmenter/line.toml").as_slice(),
        ),
        (
            "segmenter/sentence.toml",
            include_bytes!("../data/segmenter/sentence.toml").as_slice(),
        ),
        (
            "segmenter/word.toml",
            include_bytes!("../data/segmenter/word.toml").as_slice(),
        ),
    ];
    let icuexport_cache = Arc::new(SerdeCache::new(AbstractFs::Memory(
        embedded_files.into_iter().collect(),
    )));

    // Start from the custom defaults and override only the two fields that
    // differ for the hardcoded segmenter data.
    let mut source = Self::new_custom().source;
    source.trie_type = TrieType::Small;
    source.icuexport_paths = Some(icuexport_cache);
    Self { source }
}
}

/// Specifies the trie type to use.
Expand Down Expand Up @@ -381,11 +317,11 @@ impl std::fmt::Display for TrieType {
#[non_exhaustive]
#[deprecated(since = "1.3.0", note = "use `DatagenProvider`")]
pub struct SourceData {
cldr_paths: Option<Arc<CldrCache>>,
icuexport_paths: Option<Arc<SerdeCache>>,
segmenter_lstm_paths: Option<Arc<SerdeCache>>,
trie_type: TrieType,
collation_han_database: CollationHanDatabase,
pub(crate) cldr_paths: Option<Arc<CldrCache>>,
pub(crate) icuexport_paths: Option<Arc<SerdeCache>>,
pub(crate) segmenter_lstm_paths: Option<Arc<SerdeCache>>,
pub(crate) trie_type: TrieType,
pub(crate) collation_han_database: CollationHanDatabase,
#[cfg(feature = "legacy_api")]
// populated if constructed through `SourceData` constructor only
pub(crate) icuexport_dictionary_fallback: Option<Arc<SerdeCache>>,
Expand Down
81 changes: 80 additions & 1 deletion provider/datagen/src/transform/segmenter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ macro_rules! implement {
return {
self.check_req::<$marker>(req)?;
let data = generate_rule_break_data(
&Self::new_hardcoded_segmenter_data(),
&hardcoded_segmenter_provider(),
$rules,
self.trie_type(),
);
Expand All @@ -555,6 +555,85 @@ macro_rules! implement {
}
}

/// Returns a [`DatagenProvider`](crate::DatagenProvider) whose ICU export
/// source is an in-memory filesystem holding the segmenter TOML files that
/// are compiled into this crate via `include_bytes!`.
///
/// The provider is constructed at most once, on first use, and stored in a
/// process-wide `OnceCell`; every call returns a clone of that instance.
fn hardcoded_segmenter_provider() -> crate::DatagenProvider {
    // NOTE(review): presumably required because `provider.source` is a
    // `SourceData`, which is marked `#[deprecated]` — confirm.
    #![allow(deprecated)]
    use crate::{
        source::{AbstractFs, SerdeCache},
        DatagenProvider, TrieType,
    };
    // Singleton so that all instantiations share the same cache.
    static SINGLETON: once_cell::sync::OnceCell<DatagenProvider> = once_cell::sync::OnceCell::new();
    SINGLETON
        .get_or_init(|| {
            // (path, bytes) pairs handed to `AbstractFs::Memory`.
            let embedded_files = [
                (
                    "uprops/small/ea.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/ea.toml").as_slice(),
                ),
                (
                    "uprops/small/ExtPict.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/ExtPict.toml")
                        .as_slice(),
                ),
                (
                    "uprops/small/gc.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/gc.toml").as_slice(),
                ),
                (
                    "uprops/small/GCB.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/GCB.toml").as_slice(),
                ),
                (
                    "uprops/small/lb.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/lb.toml").as_slice(),
                ),
                (
                    "uprops/small/SB.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/SB.toml").as_slice(),
                ),
                (
                    "uprops/small/sc.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/sc.toml").as_slice(),
                ),
                (
                    "uprops/small/WB.toml",
                    include_bytes!("../../../data/segmenter/uprops/small/WB.toml").as_slice(),
                ),
                (
                    "segmenter/grapheme.toml",
                    include_bytes!("../../../data/segmenter/grapheme.toml").as_slice(),
                ),
                (
                    "segmenter/line.toml",
                    include_bytes!("../../../data/segmenter/line.toml").as_slice(),
                ),
                (
                    "segmenter/sentence.toml",
                    include_bytes!("../../../data/segmenter/sentence.toml").as_slice(),
                ),
                (
                    "segmenter/word.toml",
                    include_bytes!("../../../data/segmenter/word.toml").as_slice(),
                ),
            ];

            let mut provider = DatagenProvider::new_custom();
            // Only the `uprops/small/*` property files are embedded above, so
            // pin the trie type explicitly instead of relying on whatever
            // `new_custom()` happens to default to.
            provider.source.trie_type = TrieType::Small;
            provider.source.icuexport_paths = Some(std::sync::Arc::new(SerdeCache::new(
                AbstractFs::Memory(embedded_files.into_iter().collect()),
            )));
            provider
        })
        .clone()
}

// One `implement!` expansion per rule-based segmenter marker. The second
// argument is the rule file path that the macro forwards as `$rules` to
// `generate_rule_break_data`; each path used here matches an entry embedded
// by `hardcoded_segmenter_provider`.
implement!(LineBreakDataV1Marker, "segmenter/line.toml");
implement!(GraphemeClusterBreakDataV1Marker, "segmenter/grapheme.toml");
implement!(WordBreakDataV1Marker, "segmenter/word.toml");
Expand Down

0 comments on commit 317c8af

Please sign in to comment.