Skip to content

Commit

Permalink
Add marker attribute prefix matching (#5207)
Browse files Browse the repository at this point in the history
Fixes #4511
  • Loading branch information
robertbastian authored Jul 10, 2024
1 parent 05769e3 commit 6ebbcb2
Show file tree
Hide file tree
Showing 146 changed files with 481 additions and 291 deletions.
13 changes: 5 additions & 8 deletions components/segmenter/src/complex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,13 @@ pub(crate) struct ComplexPayloads {
}

#[cfg(feature = "lstm")]
const MY_LSTM: &DataMarkerAttributes =
DataMarkerAttributes::from_str_or_panic("Burmese_codepoints_exclusive_model4_heavy");
// Prefix of the Burmese LSTM model attributes rather than a full model name; `try_load` sets
// `attributes_prefix_match` on its requests so that a prefix like this can resolve.
const MY_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Burmese_");
#[cfg(feature = "lstm")]
const KM_LSTM: &DataMarkerAttributes =
DataMarkerAttributes::from_str_or_panic("Khmer_codepoints_exclusive_model4_heavy");
// Prefix of the Khmer LSTM model attributes (resolved via `attributes_prefix_match`).
const KM_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Khmer_");
#[cfg(feature = "lstm")]
const LO_LSTM: &DataMarkerAttributes =
DataMarkerAttributes::from_str_or_panic("Lao_codepoints_exclusive_model4_heavy");
// Prefix of the Lao LSTM model attributes (resolved via `attributes_prefix_match`).
const LO_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Lao_");
#[cfg(feature = "lstm")]
const TH_LSTM: &DataMarkerAttributes =
DataMarkerAttributes::from_str_or_panic("Thai_codepoints_exclusive_model4_heavy");
// Prefix of the Thai LSTM model attributes (resolved via `attributes_prefix_match`).
const TH_LSTM: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("Thai_");

const MY_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("burmesedict");
const KM_DICT: &DataMarkerAttributes = DataMarkerAttributes::from_str_or_panic("khmerdict");
Expand Down Expand Up @@ -341,6 +337,7 @@ fn try_load<M: DataMarker, P: DataProvider<M> + ?Sized>(
metadata: {
let mut m = DataRequestMetadata::default();
m.silent = true;
m.attributes_prefix_match = true;
m
},
}) {
Expand Down
13 changes: 12 additions & 1 deletion provider/baked/src/binary_search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,20 @@ pub(crate) fn bake(
pub struct Data<K: BinarySearchKey, M: DataMarker>(pub &'static [(K::Type, &'static M::Yokeable)]);

impl<K: BinarySearchKey, M: DataMarker> super::DataStore<M> for Data<K, M> {
fn get(&self, id: DataIdentifierBorrowed) -> Option<&'static M::Yokeable> {
/// Looks up the payload stored for `id` in the sorted table.
///
/// An exact key match always wins. If there is no exact match and
/// `attributes_prefix_match` is `true`, the entry at the binary-search insertion
/// point (the first entry ordered after `id`) is returned instead.
///
/// Returns `None` when there is no exact match and prefix matching is disabled, or
/// when the insertion point lies past the last entry.
///
/// NOTE(review): the entry at the insertion point is returned without checking that
/// its key actually has the requested locale or starts with the requested
/// attributes; confirm whether that check must happen here.
fn get(
    &self,
    id: DataIdentifierBorrowed,
    attributes_prefix_match: bool,
) -> Option<&'static M::Yokeable> {
    let index = match self.0.binary_search_by(|&(k, _)| K::cmp(k, id)) {
        Ok(found) => found,
        Err(insertion_point) if attributes_prefix_match => insertion_point,
        Err(_) => return None,
    };
    // The insertion point can equal `self.0.len()` (when `id` sorts after every key),
    // so the lookup must be bounds-checked; unchecked indexing here would read past
    // the end of the table.
    self.0.get(index).map(|entry| entry.1)
}
Expand Down
6 changes: 3 additions & 3 deletions provider/baked/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ impl DataExporter for BakedExporter {
let search = if !needs_fallback {
quote! {
let metadata = Default::default();
let Some(payload) = icu_provider_baked::DataStore::get(&Self::#data_ident, req.id) else {
let Some(payload) = icu_provider_baked::DataStore::get(&Self::#data_ident, req.id, req.metadata.attributes_prefix_match) else {
return Err(icu_provider::DataErrorKind::IdentifierNotFound.with_req(<#marker_bake as icu_provider::DataMarker>::INFO, req))
};
}
Expand All @@ -557,15 +557,15 @@ impl DataExporter for BakedExporter {
quote! {
let mut metadata = icu_provider::DataResponseMetadata::default();

let payload = if let Some(payload) = icu_provider_baked::DataStore::get(&Self::#data_ident, req.id) {
let payload = if let Some(payload) = icu_provider_baked::DataStore::get(&Self::#data_ident, req.id, req.metadata.attributes_prefix_match) {
payload
} else {
const FALLBACKER: icu_locale::fallback::LocaleFallbackerWithConfig<'static> =
icu_locale::fallback::LocaleFallbacker::new()
.for_config(<#marker_bake as icu_provider::DataMarker>::INFO.fallback_config);
let mut fallback_iterator = FALLBACKER.fallback_for(req.id.locale.clone());
loop {
if let Some(payload) = icu_provider_baked::DataStore::get(&Self::#data_ident, icu_provider::DataIdentifierBorrowed::for_marker_attributes_and_locale(req.id.marker_attributes, fallback_iterator.get())) {
if let Some(payload) = icu_provider_baked::DataStore::get(&Self::#data_ident, icu_provider::DataIdentifierBorrowed::for_marker_attributes_and_locale(req.id.marker_attributes, fallback_iterator.get()), req.metadata.attributes_prefix_match) {
metadata.locale = Some(fallback_iterator.take());
break payload;
}
Expand Down
6 changes: 5 additions & 1 deletion provider/baked/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ pub mod binary_search;
pub mod zerotrie;

pub trait DataStore<M: DataMarker> {
fn get(&self, req: DataIdentifierBorrowed) -> Option<&'static M::Yokeable>;
/// Returns the statically stored payload for `req`, or `None` if the store has no
/// matching entry.
///
/// `attributes_prefix_match` is passed through by the generated provider code from
/// `req.metadata.attributes_prefix_match`. When it is `true`, an implementation may
/// fall back to an entry whose marker attributes extend the requested ones if no exact
/// entry exists.
fn get(
&self,
req: DataIdentifierBorrowed,
attributes_prefix_match: bool,
) -> Option<&'static M::Yokeable>;

type IterReturn: Iterator<Item = DataIdentifierCow<'static>>;
fn iter(&'static self) -> Self::IterReturn;
Expand Down
20 changes: 16 additions & 4 deletions provider/baked/src/zerotrie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,29 @@ pub struct Data<M: DataMarker> {
}

impl<M: DataMarker> super::DataStore<M> for Data<M> {
fn get(&self, id: DataIdentifierBorrowed) -> Option<&'static <M>::Yokeable> {
/// Looks up the payload for `id` by walking the trie with the locale, then (when
/// present) the separator and the marker attributes.
///
/// With non-empty marker attributes and `attributes_prefix_match` set, a miss on the
/// exact key keeps descending through the trie until a value is found or the cursor
/// cannot advance any further.
fn get(
&self,
id: DataIdentifierBorrowed,
attributes_prefix_match: bool,
) -> Option<&'static <M>::Yokeable> {
use writeable::Writeable;
let mut cursor = self.trie.cursor();
// The write result is deliberately ignored; only the cursor movement matters here.
let _is_ascii = id.locale.write_to(&mut cursor);
if !id.marker_attributes.is_empty() {
cursor.step(ID_SEPARATOR);
id.marker_attributes.write_to(&mut cursor).ok()?;
loop {
// A value at the current position is a hit; on the first iteration this is
// the exact-match case.
if let Some(v) = cursor.take_value() {
break Some(v);
}
// Stop if prefix matching is off or `probe(0)` cannot advance. Otherwise
// `probe(0)` has moved the cursor one step deeper (presumably along the first
// outgoing edge; confirm against the zerotrie cursor docs) and we retry.
if !attributes_prefix_match || cursor.probe(0).is_none() {
break None;
}
}
} else {
// Empty attributes: only an exact match on the locale is considered.
cursor.take_value()
}
cursor
.take_value()
.map(|i| unsafe { self.values.get_unchecked(i) })
// NOTE(review): unchecked indexing presumes every value stored in the trie is a valid
// index into `self.values`; confirm the data generator guarantees this.
.map(|i| unsafe { self.values.get_unchecked(i) })
}

type IterReturn = core::iter::FilterMap<
Expand Down
2 changes: 1 addition & 1 deletion provider/baked/tests/data/hello_world_v1_marker.rs.data
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ macro_rules! __impl_hello_world_v1_marker {
impl icu_provider::DataProvider<icu_provider::hello_world::HelloWorldV1Marker> for $provider {
fn load(&self, req: icu_provider::DataRequest) -> Result<icu_provider::DataResponse<icu_provider::hello_world::HelloWorldV1Marker>, icu_provider::DataError> {
let metadata = Default::default();
let Some(payload) = icu_provider_baked::DataStore::get(&Self::DATA_HELLO_WORLD_V1_MARKER, req.id) else { return Err(icu_provider::DataErrorKind::IdentifierNotFound.with_req(<icu_provider::hello_world::HelloWorldV1Marker as icu_provider::DataMarker>::INFO, req)) };
let Some(payload) = icu_provider_baked::DataStore::get(&Self::DATA_HELLO_WORLD_V1_MARKER, req.id, req.metadata.attributes_prefix_match) else { return Err(icu_provider::DataErrorKind::IdentifierNotFound.with_req(<icu_provider::hello_world::HelloWorldV1Marker as icu_provider::DataMarker>::INFO, req)) };
Ok(icu_provider::DataResponse { payload: icu_provider::DataPayload::from_static_ref(payload), metadata })
}
}
Expand Down
51 changes: 51 additions & 0 deletions provider/baked/tests/test-baked-source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,54 @@ fn load() {
assert_eq!(baked, expected);
}
}

#[test]
fn prefix_match() {
    use icu_provider::hello_world::HelloWorldV1Marker;
    use icu_provider::prelude::*;

    // Request metadata with marker-attribute prefix matching switched on.
    let with_prefix_matching = || {
        let mut metadata = DataRequestMetadata::default();
        metadata.attributes_prefix_match = true;
        metadata
    };

    // "reve" is expected to have no exact entry for `ja`, but to be resolvable as a prefix.
    let partial = DataIdentifierCow::from_owned(
        DataMarkerAttributes::from_str_or_panic("reve").to_owned(),
        "ja".parse().unwrap(),
    );

    // Default metadata: only exact matches are allowed, so the load must fail.
    let exact_lookup = DataProvider::<HelloWorldV1Marker>::load(
        &Baked,
        DataRequest {
            id: partial.as_borrowed(),
            ..Default::default()
        },
    );
    assert!(exact_lookup.is_err());

    // Same identifier with prefix matching enabled: the load must succeed.
    let prefix_lookup = DataProvider::<HelloWorldV1Marker>::load(
        &Baked,
        DataRequest {
            id: partial.as_borrowed(),
            metadata: with_prefix_matching(),
        },
    );
    assert!(prefix_lookup.is_ok());

    // Attributes that are not a prefix of anything must still fail with prefix matching on.
    let unknown = DataIdentifierCow::from_owned(
        DataMarkerAttributes::from_str_or_panic("non-existent").to_owned(),
        "ja".parse().unwrap(),
    );

    let unknown_lookup = DataProvider::<HelloWorldV1Marker>::load(
        &Baked,
        DataRequest {
            id: unknown.as_borrowed(),
            metadata: with_prefix_matching(),
        },
    );
    assert!(unknown_lookup.is_err());
}
16 changes: 12 additions & 4 deletions provider/blob/src/blob_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,16 +233,24 @@ impl<'data, LocaleVecFormat: VarZeroVecFormat> BlobSchemaV2<'data, LocaleVecForm
.ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(marker, req))?;
let mut cursor = ZeroTrieSimpleAscii::from_store(zerotrie).into_cursor();
let _infallible_ascii = req.id.locale.write_to(&mut cursor);
if !req.id.marker_attributes.is_empty() {
let blob_index = if !req.id.marker_attributes.is_empty() {
let _infallible_ascii = cursor.write_char(REQUEST_SEPARATOR);
req.id
.marker_attributes
.write_to(&mut cursor)
.map_err(|_| DataErrorKind::IdentifierNotFound.with_req(marker, req))?;
loop {
if let Some(v) = cursor.take_value() {
break Some(v);
}
if !req.metadata.attributes_prefix_match || cursor.probe(0).is_none() {
break None;
}
}
} else {
cursor.take_value()
}
let blob_index = cursor
.take_value()
.ok_or_else(|| DataErrorKind::IdentifierNotFound.with_req(marker, req))?;
.ok_or_else(|| DataErrorKind::IdentifierNotFound.with_req(marker, req))?;
let buffer = self
.buffers
.get(blob_index)
Expand Down
49 changes: 46 additions & 3 deletions provider/blob/tests/test_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ where
exporter.close().unwrap();
}

fn check_hello_world(blob_provider: impl DataProvider<HelloWorldV1Marker>) {
fn check_hello_world(
blob_provider: impl DataProvider<HelloWorldV1Marker>,
test_prefix_match: bool,
) {
let hello_world_provider = HelloWorldProvider;
for id in hello_world_provider.iter_ids().unwrap() {
let blob_result = blob_provider
Expand All @@ -57,6 +60,46 @@ fn check_hello_world(blob_provider: impl DataProvider<HelloWorldV1Marker>) {
.payload;
assert_eq!(blob_result, expected_result, "{:?}", id);
}

if test_prefix_match {
let id = DataIdentifierCow::from_owned(
DataMarkerAttributes::from_str_or_panic("reve").to_owned(),
"ja".parse().unwrap(),
);
assert!(blob_provider
.load(DataRequest {
id: id.as_borrowed(),
..Default::default()
})
.is_err());

assert!(blob_provider
.load(DataRequest {
id: id.as_borrowed(),
metadata: {
let mut metadata = DataRequestMetadata::default();
metadata.attributes_prefix_match = true;
metadata
}
})
.is_ok());

let id = DataIdentifierCow::from_owned(
DataMarkerAttributes::from_str_or_panic("non-existent").to_owned(),
"ja".parse().unwrap(),
);

assert!(blob_provider
.load(DataRequest {
id: id.as_borrowed(),
metadata: {
let mut metadata = DataRequestMetadata::default();
metadata.attributes_prefix_match = true;
metadata
}
})
.is_err());
}
}

#[test]
Expand All @@ -67,7 +110,7 @@ fn test_v1() {
assert_eq!(BLOB_V1, blob.as_slice());

let blob_provider = BlobDataProvider::try_new_from_blob(blob.into_boxed_slice()).unwrap();
check_hello_world(blob_provider.as_deserializing());
check_hello_world(blob_provider.as_deserializing(), false);
}

#[test]
Expand All @@ -82,7 +125,7 @@ fn test_v2() {
!blob_provider.internal_is_using_v2_bigger_format(),
"Should have exported to smaller V2 format"
);
check_hello_world(blob_provider.as_deserializing());
check_hello_world(blob_provider.as_deserializing(), true);
}

// This tests that the V2Bigger format works by attempting to export something with 26^4 = 456976 data entries
Expand Down
2 changes: 2 additions & 0 deletions provider/core/src/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ pub struct DataRequest<'a> {
pub struct DataRequestMetadata {
/// Silent requests do not log errors. This can be used for exploratory querying, such as fallbacks.
pub silent: bool,
/// Whether to allow prefix matches for the data marker attributes.
///
/// When `true`, a provider that supports this may answer a request that has no exact
/// entry with an entry whose marker attributes begin with the requested attributes
/// (for example, requesting `Thai_` to find a longer `Thai_...` model name).
pub attributes_prefix_match: bool,
}

/// The borrowed version of a [`DataIdentifierCow`].
Expand Down
4 changes: 2 additions & 2 deletions provider/data/calendar/data/week_data_v1_marker.rs.data

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions provider/data/calendar/data/week_data_v2_marker.rs.data

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions provider/data/collator/data/collation_data_v1_marker.rs.data

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 6ebbcb2

Please sign in to comment.