The Algorithms logo
The Algorithms
Über unsSpenden

Bitap String Match

p
"""
Bitap exact string matching
https://en.wikipedia.org/wiki/Bitap_algorithm

Searches for a pattern inside text, and returns the index of the first occurrence
of the pattern. Both text and pattern consist of lowercase alphabetical characters only.

Complexity: O(m*n)
    n = length of text
    m = length of pattern

Python doctests can be run using this command:
python3 -m doctest -v bitap_string_match.py
"""


def bitap_string_match(text: str, pattern: str) -> int:
    """
    Retrieves the index of the first occurrence of pattern in text.

    Args:
        text: A string consisting only of lowercase alphabetical characters.
        pattern: A string consisting only of lowercase alphabetical characters.

    Returns:
        int: The index where pattern first occurs. Return -1  if not found.

    >>> bitap_string_match('abdabababc', 'ababc')
    5
    >>> bitap_string_match('aaaaaaaaaaaaaaaaaa', 'a')
    0
    >>> bitap_string_match('zxywsijdfosdfnso', 'zxywsijdfosdfnso')
    0
    >>> bitap_string_match('abdabababc', '')
    0
    >>> bitap_string_match('abdabababc', 'c')
    9
    >>> bitap_string_match('abdabababc', 'fofosdfo')
    -1
    >>> bitap_string_match('abdab', 'fofosdfo')
    -1
    """
    if not pattern:
        return 0
    m = len(pattern)
    if m > len(text):
        return -1

    # Initial state of bit string 1110
    state = ~1
    # Bit = 0 if character appears at index, and 1 otherwise
    pattern_mask: list[int] = [~0] * 27  # 1111

    for i, char in enumerate(pattern):
        # For the pattern mask for this character, set the bit to 0 for each i
        # the character appears.
        pattern_index: int = ord(char) - ord("a")
        pattern_mask[pattern_index] &= ~(1 << i)

    for i, char in enumerate(text):
        text_index = ord(char) - ord("a")
        # If this character does not appear in pattern, it's pattern mask is 1111.
        # Performing a bitwise OR between state and 1111 will reset the state to 1111
        # and start searching the start of pattern again.
        state |= pattern_mask[text_index]
        state <<= 1

        # If the mth bit (counting right to left) of the state is 0, then we have
        # found pattern in text
        if (state & (1 << m)) == 0:
            return i - m + 1

    return -1


if __name__ == "__main__":
    import doctest

    doctest.testmod()