diff --git a/src/string/duval_algorithm.rs b/src/string/duval_algorithm.rs index 0ff93d8f132..69e9dbff2a9 100644 --- a/src/string/duval_algorithm.rs +++ b/src/string/duval_algorithm.rs @@ -1,34 +1,60 @@ -// A string is called simple (or a Lyndon word), if it is strictly smaller than any of its own nontrivial suffixes. -// Duval (1983) developed an algorithm for finding the standard factorization that runs in linear time and constant space. Source: https://en.wikipedia.org/wiki/Lyndon_word -fn factorization_with_duval(s: Vec) -> Vec { - let n = s.len(); - let mut i = 0; - let mut factorization: Vec = Vec::new(); +//! Implementation of Duval's Algorithm to compute the standard factorization of a string +//! into Lyndon words. A Lyndon word is defined as a string that is strictly smaller +//! (lexicographically) than any of its nontrivial suffixes. This implementation operates +//! in linear time and space. - while i < n { - let mut j = i + 1; - let mut k = i; +/// Performs Duval's algorithm to factorize a given string into its Lyndon words. +/// +/// # Arguments +/// +/// * `s` - A slice of characters representing the input string. +/// +/// # Returns +/// +/// A vector of strings, where each string is a Lyndon word, representing the factorization +/// of the input string. +/// +/// # Time Complexity +/// +/// The algorithm runs in O(n) time, where `n` is the length of the input string. +pub fn duval_algorithm(s: &str) -> Vec { + factorize_duval(&s.chars().collect::>()) +} + +/// Helper function that takes a string slice, converts it to a vector of characters, +/// and then applies the Duval factorization algorithm to find the Lyndon words. +/// +/// # Arguments +/// +/// * `s` - A string slice representing the input text. +/// +/// # Returns +/// +/// A vector of strings, each representing a Lyndon word in the factorization. +fn factorize_duval(s: &[char]) -> Vec { + let mut start = 0; + let mut factors: Vec = Vec::new(); - while j < n && s[k] <= s[j] { - if s[k] < s[j] { - k = i; + while start < s.len() { + let mut end = start + 1; + let mut repeat = start; + + while end < s.len() && s[repeat] <= s[end] { + if s[repeat] < s[end] { + repeat = start; } else { - k += 1; + repeat += 1; } - j += 1; + end += 1; } - while i <= k { - factorization.push(s[i..i + j - k].iter().collect::()); - i += j - k; + while start <= repeat { + factors.push(s[start..start + end - repeat].iter().collect::()); + start += end - repeat; } } - factorization -} - -pub fn duval_algorithm(s: &str) -> Vec { - return factorization_with_duval(s.chars().collect::>()); + factors } #[cfg(test)] @@ -37,29 +63,35 @@ mod test { macro_rules! test_duval_algorithm { ($($name:ident: $inputs:expr,)*) => { - $( - #[test] - fn $name() { - let (text, expected) = $inputs; - assert_eq!(duval_algorithm(text), expected); - } - )* + $( + #[test] + fn $name() { + let (text, expected) = $inputs; + assert_eq!(duval_algorithm(text), expected); + } + )* } } test_duval_algorithm! { - multiple: ("abcdabcdababc", ["abcd", "abcd", "ababc"]), - all: ("aaa", ["a", "a", "a"]), + repeating_with_suffix: ("abcdabcdababc", ["abcd", "abcd", "ababc"]), + single_repeating_char: ("aaa", ["a", "a", "a"]), single: ("ababb", ["ababb"]), unicode: ("അഅഅ", ["അ", "അ", "അ"]), - } - - #[test] - fn test_factorization_with_duval_multiple() { - let text = "abcdabcdababc"; - assert_eq!( - factorization_with_duval(text.chars().collect::>()), - ["abcd", "abcd", "ababc"] - ); + empty_string: ("", Vec::::new()), + single_char: ("x", ["x"]), + palindrome: ("racecar", ["r", "acecar"]), + long_repeating: ("aaaaaa", ["a", "a", "a", "a", "a", "a"]), + mixed_repeating: ("ababcbabc", ["ababcbabc"]), + non_repeating_sorted: ("abcdefg", ["abcdefg"]), + alternating_increasing: ("abababab", ["ab", "ab", "ab", "ab"]), + long_repeating_lyndon: ("abcabcabcabc", ["abc", "abc", "abc", "abc"]), + decreasing_order: ("zyxwvutsrqponm", ["z", "y", "x", "w", "v", "u", "t", "s", "r", "q", "p", "o", "n", "m"]), + alphanumeric_mixed: ("a1b2c3a1", ["a", "1b2c3a", "1"]), + special_characters: ("a@b#c$d", ["a", "@b", "#c$d"]), + unicode_complex: ("αβγδ", ["αβγδ"]), + long_string_performance: (&"a".repeat(1_000_000), vec!["a"; 1_000_000]), + palindrome_repeating_prefix: ("abccba", ["abccb", "a"]), + interrupted_lyndon: ("abcxabc", ["abcx", "abc"]), } }