The Algorithms logo
The Algorithms
Acerca deDonar

Z Algorithm

//! This module provides functionalities to match patterns in strings
//! and compute the Z-array for a given input string.

/// Calculates the Z-value for a given substring of the input string
/// based on a specified pattern.
///
/// # Parameters
/// - `input_string`: A slice of elements that represents the input string.
/// - `pattern`: A slice of elements representing the pattern to match.
/// - `start_index`: The index in the input string to start checking for matches.
/// - `z_value`: The initial Z-value to be computed.
///
/// # Returns
/// The computed Z-value indicating the length of the matching prefix.
fn calculate_z_value<T: Eq>(
    input_string: &[T],
    pattern: &[T],
    start_index: usize,
    mut z_value: usize,
) -> usize {
    let size = input_string.len();
    let pattern_size = pattern.len();

    while (start_index + z_value) < size && z_value < pattern_size {
        if input_string[start_index + z_value] != pattern[z_value] {
            break;
        }
        z_value += 1;
    }
    z_value
}

/// Initializes the Z-array value based on a previous match and updates
/// it to optimize further calculations.
///
/// # Parameters
/// - `z_array`: A mutable slice of the Z-array to be updated.
/// - `i`: The current index in the input string.
/// - `match_end`: The index of the last character matched in the pattern.
/// - `last_match`: The index of the last match found.
///
/// # Returns
/// The initialized Z-array value for the current index.
fn initialize_z_array_from_previous_match(
    z_array: &mut [usize],
    i: usize,
    match_end: usize,
    last_match: usize,
) -> usize {
    std::cmp::min(z_array[i - last_match], match_end - i + 1)
}

/// Finds the starting indices of all full matches of the pattern
/// in the Z-array.
///
/// # Parameters
/// - `z_array`: A slice of the Z-array containing computed Z-values.
/// - `pattern_size`: The length of the pattern to find in the Z-array.
///
/// # Returns
/// A vector containing the starting indices of full matches.
fn find_full_matches(z_array: &[usize], pattern_size: usize) -> Vec<usize> {
    z_array
        .iter()
        .enumerate()
        .filter_map(|(idx, &z_value)| (z_value == pattern_size).then_some(idx))
        .collect()
}

/// Matches the occurrences of a pattern in an input string starting
/// from a specified index.
///
/// # Parameters
/// - `input_string`: A slice of elements to search within.
/// - `pattern`: A slice of elements that represents the pattern to match.
/// - `start_index`: The index in the input string to start the search.
/// - `only_full_matches`: If true, only full matches of the pattern will be returned.
///
/// # Returns
/// A vector containing the starting indices of the matches.
fn match_with_z_array<T: Eq>(
    input_string: &[T],
    pattern: &[T],
    start_index: usize,
    only_full_matches: bool,
) -> Vec<usize> {
    let size = input_string.len();
    let pattern_size = pattern.len();
    let mut last_match: usize = 0;
    let mut match_end: usize = 0;
    let mut z_array = vec![0usize; size];

    for i in start_index..size {
        if i <= match_end {
            z_array[i] =
                initialize_z_array_from_previous_match(&mut z_array, i, match_end, last_match);
        }

        z_array[i] = calculate_z_value(input_string, pattern, i, z_array[i]);

        if i + z_array[i] > match_end + 1 {
            match_end = i + z_array[i] - 1;
            last_match = i;
        }
    }

    if !only_full_matches {
        z_array
    } else {
        find_full_matches(&z_array, pattern_size)
    }
}

/// Constructs the Z-array for the given input string.
///
/// The Z-array is an array where the i-th element is the length of the longest
/// substring starting from s[i] that is also a prefix of s.
///
/// # Parameters
/// - `input`: A slice of the input string for which the Z-array is to be constructed.
///
/// # Returns
/// A vector representing the Z-array of the input string.
pub fn z_array<T: Eq>(input: &[T]) -> Vec<usize> {
    match_with_z_array(input, input, 1, false)
}

/// Matches the occurrences of a given pattern in an input string.
///
/// This function acts as a wrapper around `match_with_z_array` to provide a simpler
/// interface for pattern matching, returning only full matches.
///
/// # Parameters
/// - `input`: A slice of the input string where the pattern will be searched.
/// - `pattern`: A slice of the pattern to search for in the input string.
///
/// # Returns
/// A vector of indices where the pattern matches the input string.
pub fn match_pattern<T: Eq>(input: &[T], pattern: &[T]) -> Vec<usize> {
    match_with_z_array(input, pattern, 0, true)
}

#[cfg(test)]
mod tests {
    use super::*;

    macro_rules! test_match_pattern {
        ($($name:ident: ($input:expr, $pattern:expr, $expected:expr),)*) => {
            $(
                #[test]
                fn $name() {
                    let (input, pattern, expected) = ($input, $pattern, $expected);
                    assert_eq!(match_pattern(input.as_bytes(), pattern.as_bytes()), expected);
                }
            )*
        };
    }

    macro_rules! test_z_array_cases {
        ($($name:ident: ($input:expr, $expected:expr),)*) => {
            $(
                #[test]
                fn $name() {
                    let (input, expected) = ($input, $expected);
                    assert_eq!(z_array(input.as_bytes()), expected);
                }
            )*
        };
    }

    test_match_pattern! {
        simple_match: ("abcabcabc", "abc", vec![0, 3, 6]),
        no_match: ("abcdef", "xyz", vec![]),
        single_char_match: ("aaaaaa", "a", vec![0, 1, 2, 3, 4, 5]),
        overlapping_match: ("abababa", "aba", vec![0, 2, 4]),
        full_string_match: ("pattern", "pattern", vec![0]),
        empty_pattern: ("nonempty", " ", vec![]),
        pattern_larger_than_text: ("small", "largerpattern", vec![]),
        repeated_pattern_in_text: (
            "aaaaaaaa",
            "aaa",
            vec![0, 1, 2, 3, 4, 5]
        ),
        pattern_not_in_lipsum: (
            concat!(
                "lorem ipsum dolor sit amet, consectetur ",
                "adipiscing elit, sed do eiusmod tempor ",
                "incididunt ut labore et dolore magna aliqua"
            ),
            ";alksdjfoiwer",
            vec![]
        ),
        pattern_in_lipsum: (
            concat!(
                "lorem ipsum dolor sit amet, consectetur ",
                "adipiscing elit, sed do eiusmod tempor ",
                "incididunt ut labore et dolore magna aliqua"
            ),
            "m",
            vec![4, 10, 23, 68, 74, 110]
        ),
    }

    test_z_array_cases! {
        basic_z_array: ("aabaabab", vec![0, 1, 0, 4, 1, 0, 1, 0]),
        empty_string: ("", vec![]),
        single_char_z_array: ("a", vec![0]),
        repeated_char_z_array: ("aaaaaa", vec![0, 5, 4, 3, 2, 1]),
    }
}