From 8c20f0fab92706defa37fc86816412e5763b8e40 Mon Sep 17 00:00:00 2001
From: Truong Nhan Nguyen <nhantruongnguyen123@gmail.com>
Date: Tue, 17 Sep 2024 20:10:18 +0700
Subject: [PATCH] ref: refactor Boyer-Moore search algorithm

- Decompse the functions with helper functions
- Write docstring for explaination
- Rewrite tests using macro and add some edge tests
---
 src/string/boyer_moore_search.rs | 177 ++++++++++++++++++++++++-------
 1 file changed, 139 insertions(+), 38 deletions(-)
diff --git a/src/string/boyer_moore_search.rs b/src/string/boyer_moore_search.rs
index eb4297a3d03..e9c46a8c980 100644
--- a/src/string/boyer_moore_search.rs
+++ b/src/string/boyer_moore_search.rs
@@ -1,46 +1,126 @@
-// In computer science, the Boyer–Moore string-search algorithm is an efficient string-searching algorithm,
-// that is the standard benchmark for practical string-search literature. Source: https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
+//! This module implements the Boyer-Moore string search algorithm, an efficient method
+//! for finding all occurrences of a pattern within a given text. The algorithm skips
+//! sections of the text by leveraging two key rules: the bad character rule and the
+//! good suffix rule (only the bad character rule is implemented here for simplicity).
 
 use std::collections::HashMap;
 
-pub fn boyer_moore_search(text: &str, pattern: &str) -> Vec<usize> {
+/// Builds the bad character table for the Boyer-Moore algorithm.
+/// This table stores the last occurrence of each character in the pattern.
+///
+/// # Arguments
+/// * `pat` - The pattern as a slice of characters.
+///
+/// # Returns
+/// A `HashMap` where the keys are characters from the pattern and the values are their
+/// last known positions within the pattern.
+fn build_bad_char_table(pat: &[char]) -> HashMap<char, isize> {
+    let mut bad_char_table = HashMap::new();
+    for (i, &ch) in pat.iter().enumerate() {
+        bad_char_table.insert(ch, i as isize);
+    }
+    bad_char_table
+}
+
+/// Calculates the shift when a full match occurs in the Boyer-Moore algorithm.
+/// It uses the bad character table to determine how much to shift the pattern.
+///
+/// # Arguments
+/// * `shift` - The current shift of the pattern on the text.
+/// * `pat_len` - The length of the pattern.
+/// * `text_len` - The length of the text.
+/// * `bad_char_table` - The bad character table built for the pattern.
+/// * `text` - The text as a slice of characters.
+///
+/// # Returns
+/// The number of positions to shift the pattern after a match.
+fn calc_match_shift(
+    shift: isize,
+    pat_len: isize,
+    text_len: isize,
+    bad_char_table: &HashMap<char, isize>,
+    text: &[char],
+) -> isize {
+    if shift + pat_len >= text_len {
+        return 1;
+    }
+    let next_ch = text[(shift + pat_len) as usize];
+    pat_len - bad_char_table.get(&next_ch).unwrap_or(&-1)
+}
+
+/// Calculates the shift when a mismatch occurs in the Boyer-Moore algorithm.
+/// The bad character rule is used to determine how far to shift the pattern.
+///
+/// # Arguments
+/// * `mis_idx` - The mismatch index in the pattern.
+/// * `shift` - The current shift of the pattern on the text.
+/// * `text` - The text as a slice of characters.
+/// * `bad_char_table` - The bad character table built for the pattern.
+///
+/// # Returns
+/// The number of positions to shift the pattern after a mismatch.
+fn calc_mismatch_shift(
+    mis_idx: isize,
+    shift: isize,
+    text: &[char],
+    bad_char_table: &HashMap<char, isize>,
+) -> isize {
+    let mis_ch = text[(shift + mis_idx) as usize];
+    let bad_char_shift = bad_char_table.get(&mis_ch).unwrap_or(&-1);
+    std::cmp::max(1, mis_idx - bad_char_shift)
+}
+
+/// Performs the Boyer-Moore string search algorithm, which searches for all
+/// occurrences of a pattern within a text.
+///
+/// The Boyer-Moore algorithm is efficient for large texts and patterns, as it
+/// skips sections of the text based on the bad character rule and other optimizations.
+///
+/// # Arguments
+/// * `text` - The text to search within as a string slice.
+/// * `pat` - The pattern to search for as a string slice.
+///
+/// # Returns
+/// A vector of starting indices where the pattern occurs in the text.
+pub fn boyer_moore_search(text: &str, pat: &str) -> Vec<usize> {
     let mut positions = Vec::new();
-    let n = text.len() as i32;
-    let m = pattern.len() as i32;
-    let pattern: Vec<char> = pattern.chars().collect();
-    let text: Vec<char> = text.chars().collect();
-    if n == 0 || m == 0 {
+
+    let text_len = text.len() as isize;
+    let pat_len = pat.len() as isize;
+
+    // Handle edge cases where the text or pattern is empty, or the pattern is longer than the text
+    if text_len == 0 || pat_len == 0 || pat_len > text_len {
         return positions;
     }
-    let mut collection = HashMap::new();
-    for (i, c) in pattern.iter().enumerate() {
-        collection.insert(c, i as i32);
-    }
-    let mut shift: i32 = 0;
-    while shift <= (n - m) {
-        let mut j = m - 1;
-        while j >= 0 && pattern[j as usize] == text[(shift + j) as usize] {
+
+    // Convert text and pattern to character vectors for easier indexing
+    let pat: Vec<char> = pat.chars().collect();
+    let text: Vec<char> = text.chars().collect();
+
+    // Build the bad character table for the pattern
+    let bad_char_table = build_bad_char_table(&pat);
+
+    let mut shift = 0;
+
+    // Main loop: shift the pattern over the text
+    while shift <= text_len - pat_len {
+        let mut j = pat_len - 1;
+
+        // Compare pattern from right to left
+        while j >= 0 && pat[j as usize] == text[(shift + j) as usize] {
             j -= 1;
         }
+
+        // If we found a match (j < 0), record the position
         if j < 0 {
             positions.push(shift as usize);
-            let add_to_shift = {
-                if (shift + m) < n {
-                    let c = text[(shift + m) as usize];
-                    let index = collection.get(&c).unwrap_or(&-1);
-                    m - index
-                } else {
-                    1
-                }
-            };
-            shift += add_to_shift;
+            shift += calc_match_shift(shift, pat_len, text_len, &bad_char_table, &text);
         } else {
-            let c = text[(shift + j) as usize];
-            let index = collection.get(&c).unwrap_or(&-1);
-            let add_to_shift = std::cmp::max(1, j - index);
-            shift += add_to_shift;
+            // If mismatch, calculate how far to shift based on the bad character rule
+            shift += calc_mismatch_shift(j, shift, &text, &bad_char_table);
         }
     }
+
     positions
 }
 
@@ -48,13 +128,34 @@ pub fn boyer_moore_search(text: &str, pattern: &str) -> Vec<usize> {
 mod tests {
     use super::*;
 
-    #[test]
-    fn test_boyer_moore_search() {
-        let a = boyer_moore_search("AABCAB12AFAABCABFFEGABCAB", "ABCAB");
-        assert_eq!(a, [1, 11, 20]);
-        let a = boyer_moore_search("AABCAB12AFAABCABFFEGABCAB", "FFF");
-        assert_eq!(a, []);
-        let a = boyer_moore_search("AABCAB12AFAABCABFFEGABCAB", "CAB");
-        assert_eq!(a, [3, 13, 22]);
+    macro_rules! boyer_moore_tests {
+        ($($name:ident: $tc:expr,)*) => {
+            $(
+                #[test]
+                fn $name() {
+                    let (text, pattern, expected) = $tc;
+                    assert_eq!(boyer_moore_search(text, pattern), expected);
+                }
+            )*
+        };
+    }
+
+    boyer_moore_tests! {
+        test_simple_match: ("AABCAB12AFAABCABFFEGABCAB", "ABCAB", vec![1, 11, 20]),
+        test_no_match: ("AABCAB12AFAABCABFFEGABCAB", "FFF", vec![]),
+        test_partial_match: ("AABCAB12AFAABCABFFEGABCAB", "CAB", vec![3, 13, 22]),
+        test_empty_text: ("", "A", vec![]),
+        test_empty_pattern: ("ABC", "", vec![]),
+        test_both_empty: ("", "", vec![]),
+        test_pattern_longer_than_text: ("ABC", "ABCDEFG", vec![]),
+        test_single_character_text: ("A", "A", vec![0]),
+        test_single_character_pattern: ("AAAA", "A", vec![0, 1, 2, 3]),
+        test_case_sensitivity: ("ABCabcABC", "abc", vec![3]),
+        test_overlapping_patterns: ("AAAAA", "AAA", vec![0, 1, 2]),
+        test_special_characters: ("@!#$$%^&*", "$$", vec![3]),
+        test_numerical_pattern: ("123456789123456", "456", vec![3, 12]),
+        test_partial_overlap_no_match: ("ABCD", "ABCDE", vec![]),
+        test_single_occurrence: ("XXXXXXXXXXXXXXXXXXPATTERNXXXXXXXXXXXXXXXXXX", "PATTERN", vec![18]),
+        test_single_occurrence_with_noise: ("PATPATPATPATTERNPAT", "PATTERN", vec![9]),
     }
 }