Skip to content

Commit

Permalink
Add is_normalized_up_to to Normalizer (#4334)
Browse files Browse the repository at this point in the history
Closes #4256. Also exposes the UTF-8 variant over FFI as `is_normalized_up_to`.
There are no UTF-16 tests or fuzzing yet.

Co-authored-by: Henri Sivonen <[email protected]>
  • Loading branch information
Oliver Medhurst and hsivonen authored Jul 10, 2024
1 parent 620242e commit 34c0a2e
Show file tree
Hide file tree
Showing 16 changed files with 539 additions and 2 deletions.
30 changes: 30 additions & 0 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1503,6 +1503,13 @@ macro_rules! normalizer_methods {
ret
}

/// Returns the byte index up to which the string slice is already normalized.
///
/// The input is run through `normalize_to` into an `IsNormalizedSinkStr`
/// created from the same `text`; the result is `text.len()` minus whatever
/// the sink still reports as remaining. A return value equal to
/// `text.len()` therefore means the sink had nothing left over.
pub fn is_normalized_up_to(&self, text: &str) -> usize {
    let mut checker = IsNormalizedSinkStr::new(text);
    // The write result is intentionally discarded: only the amount the
    // sink has left over is needed to compute the index.
    let _ = self.normalize_to(text, &mut checker);
    let unconsumed = checker.remaining_len();
    text.len() - unconsumed
}

/// Check whether a string slice is normalized.
pub fn is_normalized(&self, text: &str) -> bool {
let mut sink = IsNormalizedSinkStr::new(text);
Expand All @@ -1522,6 +1529,13 @@ macro_rules! normalizer_methods {
ret
}

/// Returns the index, counted in `u16` code units, up to which the slice of
/// potentially-invalid UTF-16 is already normalized.
///
/// The input is run through `normalize_utf16_to` into an
/// `IsNormalizedSinkUtf16` created from the same `text`; the result is
/// `text.len()` minus whatever the sink still reports as remaining.
pub fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
    let mut checker = IsNormalizedSinkUtf16::new(text);
    // The write result is intentionally discarded: only the amount the
    // sink has left over is needed to compute the index.
    let _ = self.normalize_utf16_to(text, &mut checker);
    let unconsumed = checker.remaining_len();
    text.len() - unconsumed
}

/// Checks whether a slice of potentially-invalid UTF-16 is normalized.
///
/// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
Expand All @@ -1544,6 +1558,13 @@ macro_rules! normalizer_methods {
ret
}

/// Returns the byte index up to which the slice of potentially-invalid UTF-8
/// is already normalized.
///
/// The input is run through `normalize_utf8_to` into an
/// `IsNormalizedSinkUtf8` created from the same `text`; the result is
/// `text.len()` minus whatever the sink still reports as remaining.
pub fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
    let mut checker = IsNormalizedSinkUtf8::new(text);
    // The write result is intentionally discarded: only the amount the
    // sink has left over is needed to compute the index.
    let _ = self.normalize_utf8_to(text, &mut checker);
    let unconsumed = checker.remaining_len();
    text.len() - unconsumed
}

/// Check if a slice of potentially-invalid UTF-8 is normalized.
///
/// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
Expand Down Expand Up @@ -2673,6 +2694,9 @@ impl<'a> IsNormalizedSinkUtf16<'a> {
/// Reports whether the sink's `expect` buffer is empty, i.e. nothing of the
/// expected input is left.
pub fn finished(&self) -> bool {
    let pending = &self.expect;
    pending.is_empty()
}
/// Returns the length of the sink's `expect` buffer, i.e. how much of the
/// expected input is still left over.
///
/// This is zero exactly when `finished()` returns `true`.
pub fn remaining_len(&self) -> usize {
    let pending = &self.expect;
    pending.len()
}
}

impl<'a> Write16 for IsNormalizedSinkUtf16<'a> {
Expand Down Expand Up @@ -2712,6 +2736,9 @@ impl<'a> IsNormalizedSinkUtf8<'a> {
/// Reports whether the sink's `expect` buffer is empty, i.e. nothing of the
/// expected input is left.
pub fn finished(&self) -> bool {
    let pending = &self.expect;
    pending.is_empty()
}
/// Returns the length of the sink's `expect` buffer, i.e. how much of the
/// expected input is still left over.
///
/// This is zero exactly when `finished()` returns `true`.
pub fn remaining_len(&self) -> usize {
    let pending = &self.expect;
    pending.len()
}
}

impl<'a> core::fmt::Write for IsNormalizedSinkUtf8<'a> {
Expand Down Expand Up @@ -2751,6 +2778,9 @@ impl<'a> IsNormalizedSinkStr<'a> {
/// Reports whether the sink's `expect` buffer is empty, i.e. nothing of the
/// expected input is left.
pub fn finished(&self) -> bool {
    let pending = &self.expect;
    pending.is_empty()
}
/// Returns the length of the sink's `expect` buffer, i.e. how much of the
/// expected input is still left over.
///
/// This is zero exactly when `finished()` returns `true`.
pub fn remaining_len(&self) -> usize {
    let pending = &self.expect;
    pending.len()
}
}

impl<'a> core::fmt::Write for IsNormalizedSinkStr<'a> {
Expand Down
163 changes: 163 additions & 0 deletions components/normalizer/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1547,3 +1547,166 @@ fn test_is_normalized() {
assert!(nfc.is_normalized_utf16(fraction16));
assert!(!nfkc.is_normalized_utf16(fraction16));
}

#[test]
fn test_is_normalized_up_to() {
    let nfd: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
    let nfkd: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
    let nfc: ComposingNormalizer = ComposingNormalizer::new_nfc();
    let nfkc: ComposingNormalizer = ComposingNormalizer::new_nfkc();

    // Property check for string slices: keep the prefix reported by
    // `is_normalized_up_to` as-is, normalize only the tail, and require the
    // concatenation to be normalized. A macro is used because the decomposing
    // and composing normalizers are distinct types.
    macro_rules! assert_str_prefix_ok {
        ($normalizer:ident, $input:expr) => {{
            let up_to = $normalizer.is_normalized_up_to($input);
            let (head, tail) = $input.split_at(up_to);
            let mut normalized = String::from(head);
            let _ = $normalizer.normalize_to(tail, &mut normalized);
            assert!(
                $normalizer.is_normalized(&normalized),
                "{}: prefix of length {} plus normalized tail is not normalized for {:?}",
                stringify!($normalizer),
                up_to,
                $input
            );
        }};
    }

    // Same property check for UTF-8 byte slices.
    // Note: `String::from_utf8` returns `Err` when the reported prefix is not
    // valid UTF-8, in which case the `expect` below panics and fails the test.
    macro_rules! assert_utf8_prefix_ok {
        ($normalizer:ident, $input:expr) => {{
            let up_to = $normalizer.is_normalized_utf8_up_to($input);
            let (head, tail) = $input.split_at(up_to);
            let mut normalized = String::from_utf8(head.to_vec())
                .expect("prefix reported as normalized should be valid UTF-8");
            let _ = $normalizer.normalize_utf8_to(tail, &mut normalized);
            assert!(
                $normalizer.is_normalized(&normalized),
                "{}: prefix of length {} plus normalized tail is not normalized for {:?}",
                stringify!($normalizer),
                up_to,
                $input
            );
        }};
    }

    // Check a string slice is normalized up to where is_normalized_up_to reports
    let check_str = |input: &str| {
        assert_str_prefix_ok!(nfd, input);
        assert_str_prefix_ok!(nfkd, input);
        assert_str_prefix_ok!(nfc, input);
        assert_str_prefix_ok!(nfkc, input);
    };

    // Check a slice of UTF-8 bytes is normalized up to where
    // is_normalized_utf8_up_to reports
    let check_utf8 = |input: &[u8]| {
        assert_utf8_prefix_ok!(nfd, input);
        assert_utf8_prefix_ok!(nfkd, input);
        assert_utf8_prefix_ok!(nfc, input);
        assert_utf8_prefix_ok!(nfkc, input);
    };

    // TODO: add UTF-16 coverage for `is_normalized_utf16_up_to`.

    let aaa = "aaa";
    check_str(aaa);

    let aaa_utf8 = aaa.as_bytes();
    check_utf8(aaa_utf8);

    assert_eq!(nfd.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfkd.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfc.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfkc.is_normalized_up_to(aaa), aaa.len());
    assert_eq!(nfd.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());
    assert_eq!(nfkd.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());
    assert_eq!(nfc.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());
    assert_eq!(nfkc.is_normalized_utf8_up_to(aaa_utf8), aaa_utf8.len());

    let note = "a𝅗\u{1D165}a";
    check_str(note);

    let note_utf8 = note.as_bytes();
    check_utf8(note_utf8);

    assert_eq!(nfd.is_normalized_up_to(note), note.len());
    assert_eq!(nfkd.is_normalized_up_to(note), note.len());
    assert_eq!(nfc.is_normalized_up_to(note), note.len());
    assert_eq!(nfkc.is_normalized_up_to(note), note.len());
    assert_eq!(nfd.is_normalized_utf8_up_to(note_utf8), note_utf8.len());
    assert_eq!(nfkd.is_normalized_utf8_up_to(note_utf8), note_utf8.len());
    assert_eq!(nfc.is_normalized_utf8_up_to(note_utf8), note_utf8.len());
    assert_eq!(nfkc.is_normalized_utf8_up_to(note_utf8), note_utf8.len());

    let umlaut = "aäa";
    check_str(umlaut);

    let umlaut_utf8 = umlaut.as_bytes();
    check_utf8(umlaut_utf8);

    assert_eq!(nfd.is_normalized_up_to(umlaut), 1);
    assert_eq!(nfkd.is_normalized_up_to(umlaut), 1);
    assert_eq!(nfc.is_normalized_up_to(umlaut), 4);
    assert_eq!(nfkc.is_normalized_up_to(umlaut), 4);
    assert_eq!(nfd.is_normalized_utf8_up_to(umlaut_utf8), 1);
    assert_eq!(nfkd.is_normalized_utf8_up_to(umlaut_utf8), 1);
    assert_eq!(nfc.is_normalized_utf8_up_to(umlaut_utf8), 4);
    assert_eq!(nfkc.is_normalized_utf8_up_to(umlaut_utf8), 4);

    let fraction = "a½a";
    check_str(fraction);

    let fraction_utf8 = fraction.as_bytes();
    check_utf8(fraction_utf8);

    assert_eq!(nfd.is_normalized_up_to(fraction), 4);
    assert_eq!(nfkd.is_normalized_up_to(fraction), 1);
    assert_eq!(nfc.is_normalized_up_to(fraction), 4);
    assert_eq!(nfkc.is_normalized_up_to(fraction), 1);
    assert_eq!(nfd.is_normalized_utf8_up_to(fraction_utf8), 4);
    assert_eq!(nfkd.is_normalized_utf8_up_to(fraction_utf8), 1);
    assert_eq!(nfc.is_normalized_utf8_up_to(fraction_utf8), 4);
    assert_eq!(nfkc.is_normalized_utf8_up_to(fraction_utf8), 1);

    let reversed_vietnamese = "e\u{0302}\u{0323}";
    check_str(reversed_vietnamese);

    let reversed_vietnamese_utf8 = reversed_vietnamese.as_bytes();
    check_utf8(reversed_vietnamese_utf8);

    assert_eq!(nfd.is_normalized_up_to(reversed_vietnamese), 1);
    assert_eq!(nfkd.is_normalized_up_to(reversed_vietnamese), 1);
    assert_eq!(nfc.is_normalized_up_to(reversed_vietnamese), 0);
    assert_eq!(nfkc.is_normalized_up_to(reversed_vietnamese), 0);
    assert_eq!(nfd.is_normalized_utf8_up_to(reversed_vietnamese_utf8), 1);
    assert_eq!(nfkd.is_normalized_utf8_up_to(reversed_vietnamese_utf8), 1);
    assert_eq!(nfc.is_normalized_utf8_up_to(reversed_vietnamese_utf8), 0);
    assert_eq!(nfkc.is_normalized_utf8_up_to(reversed_vietnamese_utf8), 0);

    let truncated_vietnamese = "e\u{0302}";
    check_str(truncated_vietnamese);

    let truncated_vietnamese_utf8 = truncated_vietnamese.as_bytes();
    check_utf8(truncated_vietnamese_utf8);

    assert_eq!(nfd.is_normalized_up_to(truncated_vietnamese), 3);
    assert_eq!(nfkd.is_normalized_up_to(truncated_vietnamese), 3);
    assert_eq!(nfc.is_normalized_up_to(truncated_vietnamese), 0);
    assert_eq!(nfkc.is_normalized_up_to(truncated_vietnamese), 0);
    assert_eq!(nfd.is_normalized_utf8_up_to(truncated_vietnamese_utf8), 3);
    assert_eq!(nfkd.is_normalized_utf8_up_to(truncated_vietnamese_utf8), 3);
    assert_eq!(nfc.is_normalized_utf8_up_to(truncated_vietnamese_utf8), 0);
    assert_eq!(nfkc.is_normalized_utf8_up_to(truncated_vietnamese_utf8), 0);
}
6 changes: 6 additions & 0 deletions ffi/capi/bindings/c/ICU4XComposingNormalizer.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions ffi/capi/bindings/c/ICU4XDecomposingNormalizer.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions ffi/capi/bindings/cpp/ICU4XComposingNormalizer.d.hpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions ffi/capi/bindings/cpp/ICU4XComposingNormalizer.hpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions ffi/capi/bindings/cpp/ICU4XDecomposingNormalizer.d.hpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions ffi/capi/bindings/cpp/ICU4XDecomposingNormalizer.hpp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 34c0a2e

Please sign in to comment.