using System;
namespace Algorithms.Strings.PatternMatching;
/// <summary>
/// The Bitap algorithm is a fuzzy string matching technique. It ains to find approximate matches of a pattern within a
/// text, allowing for a certain degree of mismatch (e.g., mistypes, minor variations etc.). It's knowd for its efficiency,
/// using bitwise operations for fast comparisons.
///
/// <para>
/// <b>How it works:</b>
/// <list type="number">
/// <item>
/// <term>Initialization</term>
/// <description>
/// Bitmasks are created for each character in the pattern. These bitmasks are essentially binary numbers where each bit
/// represents a specific character's position within the pattern. An initial state variable <c>R</c> is set to all 1s,
/// indicating that all characters in the pattern are initially unmatched.
/// </description>
/// </item>
/// <item>
/// <term>Iteration</term>
/// <description>
/// The algorithm iterates through each character in the text. For each character, the state <c>R</c> is updated using
/// bitwise operations (shifts and logical ORs). This update reflects whether the current character in the text matches
/// the corresponding character in the pattern.
/// </description>
/// </item>
/// <item>
/// <term>Matching</term>
/// <description>
/// After each iteration, the algorithm checks if the least significant bit of <c>R</c> is set to 1.
/// If it is, it means there's a potential match at that position, with a mismatch distance that's within the allowed
/// threshold.
/// </description>
/// </item>
/// </list>
/// </para>
/// <para>
/// <b> Finding Matches </b>
/// </para>
/// <para>
/// If the least significant bit of <c>R</c> is 1, it means a potential match is found.
/// The number of leading zeros in <c>R</c> indicates the mismatch distance.
/// If this distance is within the allowed threshold, it's considered a valid match.
/// </para>
/// </summary>
public static class Bitap
{
/// <summary>
/// <para>
/// This function implements the Bitap algorithm for finding exact matches of a pattern within a text.
/// It aims to find the first occurrence of the pattern in the text, allowing for no mismatches.
/// </para>
/// <para>
/// The algorithm iterates through each character in the text. For each character, the state <c>R</c> is updated using
/// bitwise operations (shifts and logical ORs). This update reflects whether the current character in the text matches
/// the corresponding character in the pattern.
/// </para>
/// <para>
/// After each iteration, the algorithm checks if the least significant bit of <c>R</c> is set to 1.
/// If it is, it means there's a potential match at that position, with a mismatch distance of 0.
/// The function returns the index of the first occurrence of the pattern in the text, or -1 if not found.
/// </para>
/// <para>
/// The function throws an <see cref="ArgumentException"/> if the pattern is longer than 31 characters.
/// This is because the maximum length of the pattern is 31, because if it's longer than that,
/// we won't be able to represent the pattern mask in an int.
/// </para>
/// </summary>
/// <param name="text">The text to search in.</param>
/// <param name="pattern">The pattern to search for.</param>
/// <returns>The index of the first occurrence of the pattern in the text, or -1 if not found.</returns>
/// <exception cref="ArgumentException">The pattern is longer than 31 characters.</exception>
public static int FindExactPattern(string text, string pattern)
{
// The length of the pattern.
var len = pattern.Length;
// An array of integers that will be used to mask the pattern.
// The pattern mask is a bitmask that we will use to search for the pattern characters
// in the text. We'll set the bit corresponding to the character in the pattern
// to 0, and then use bitwise operations to check for the pattern.
var patternMask = new int[128];
int index;
// Check if the pattern is empty.
if (string.IsNullOrEmpty(pattern))
{
return 0;
}
// Check if the pattern is longer than 31 characters.
if (len > 31)
{
throw new ArgumentException("The pattern is longer than 31 characters.");
}
// Initialize the register <c>R</c> to all 1s.
var r = ~1;
// Initialize the pattern mask to all 1s.
for (index = 0; index <= 127; ++index)
{
patternMask[index] = ~0;
}
// Set the bits corresponding to the characters in the pattern to 0 in the pattern mask.
for (index = 0; index < len; ++index)
{
patternMask[pattern[index]] &= ~(1 << index);
}
// Iterate through each character in the text.
for (index = 0; index < text.Length; ++index)
{
// Update the state <c>R</c> by ORing the pattern mask with the character in the text,
// and then shift it to the left by 1.
r |= patternMask[text[index]];
r <<= 1;
// Check if the least significant bit of <c>R</c> is set to 1.
// If there's a potential match at that position, with a mismatch distance of 0,
// return the index of the first occurrence of the pattern in the text.
if ((r & 1 << len) == 0)
{
return index - len + 1;
}
}
// If no match is found, return -1.
return -1;
}
/// <summary>
/// Finds the first occurrence of a pattern in a given text with a given threshold for mismatches.
/// </summary>
/// <param name="text">The text to search in.</param>
/// <param name="pattern">The pattern to search for.</param>
/// <param name="threshold">The maximum number of mismatches allowed.</param>
/// <returns>The index of the first occurrence of the pattern in the text, or -1 if not found.</returns>
public static int FindFuzzyPattern(string text, string pattern, int threshold)
{
// Create a pattern mask for each character in the pattern.
// The pattern mask is a bitmask that we will use to search for the pattern characters
// in the text. We'll set the bit corresponding to the character in the pattern
// to 0, and then use bitwise operations to check for the pattern.
var patternMask = new int[128];
// Create a register array.
// The register array is used to keep track of the pattern mask as we search for the pattern.
// We'll start with a register that has all bits set to 1, because all bits in the pattern mask
// will be set to 1 initially.
var r = new int[(threshold + 1) * sizeof(int)];
var len = pattern.Length;
// Check for empty strings.
// If the text is empty, return 0.
// If the pattern is empty, return 0.
if (string.IsNullOrEmpty(text))
{
return 0;
}
if (string.IsNullOrEmpty(pattern))
{
return 0;
}
// Check for a pattern that is too long.
// If the pattern is longer than 31 characters, return -1.
// The maximum length of the pattern is 31, because if it's longer than that,
// we won't be able to represent the pattern mask in an int.
if (len > 31)
{
return -1;
}
// Initialize the register.
// Set the least significant bit in the register to 0 or 1
// depending on whether the current character in the text matches the pattern.
// This will make it easier to check for the pattern later.
for (var i = 0; i <= threshold; ++i)
{
r[i] = ~1;
}
// Initialize the pattern mask.
// Set the bit corresponding to each character in the pattern to 0 in the pattern mask.
// This will make it easier to check for the pattern later.
for (var i = 0; i <= 127; i++)
{
patternMask[i] = ~0;
}
// Set the pattern mask for each character in the pattern.
// Use bitwise AND to clear the bit corresponding to the current character.
for (var i = 0; i < len; ++i)
{
patternMask[pattern[i]] &= ~(1 << i);
}
// Search for the pattern in the text.
// Loop through each character in the text.
for (var i = 0; i < text.Length; ++i)
{
// Update the register.
// Set the least significant bit in the register to 0 or 1
// depending on whether the current character in the text matches the pattern.
// This will make it easier to check for the pattern later.
var oldR = r[0];
r[0] |= patternMask[text[i]];
r[0] <<= 1;
// Update the other registers.
// Set the least significant bit in each register to 0 or 1
// depending on whether the current character in the text matches the pattern.
// This will make it easier to check for the pattern later.
for (var j = 1; j <= threshold; ++j)
{
var tmp = r[j];
r[j] = (oldR & (r[j] | patternMask[text[i]])) << 1;
oldR = tmp;
}
// If the pattern has been found, return the index.
// Check the most significant bit in the register.
// If it's 0, then the pattern has been found.
if ((r[threshold] & 1 << len) == 0)
{
// The pattern has been found.
// Return the index of the first character in the pattern.
return i - len + 1;
}
}
// The pattern has not been found.
return -1;
}
}