/**
* @file
* @brief [McNaughton–Yamada–Thompson algorithm](https://en.wikipedia.org/wiki/Thompson%27s_construction)
* @details
* From Wikipedia:
* In computer science, Thompson's construction algorithm,
* also called the McNaughton–Yamada–Thompson algorithm,
* is a method of transforming a regular expression into
* an equivalent nondeterministic finite automaton (NFA).
* This implementation implements the all three operations
* (implicit concatenation, '|' for union, '*' for Kleene star)
* required by the formal definition of regular expressions.
* @author [Sharon Cassidy](https://github.com/CascadingCascade)
*/
#include <assert.h> /// for assert()
#include <stdio.h> /// for IO operations
#include <string.h> /// for string operations
#include <stdlib.h> /// for memory management
/* Begin declarations, I opted to place various helper / utility functions
* close to their usages and didn't split their declaration / definition */
/**
* @brief Definition for a binary abstract syntax tree (AST) node
*/
struct ASTNode {
char content; ///< the content of this node
struct ASTNode* left; ///< left child
struct ASTNode* right; ///< right child
};
struct ASTNode* createNode(char content);
void destroyNode(struct ASTNode* node);
char* preProcessing(const char* input);
struct ASTNode* buildAST(const char* input);
/**
* @brief Definition for a NFA state transition rule
*/
struct transRule {
struct NFAState* target; ///< pointer to the state to transit to
char cond; ///< the input required to activate this transition
};
struct transRule* createRule(struct NFAState* state, char c);
void destroyRule(struct transRule* rule);
/**
* @brief Definition for a NFA state. Each NFAState object is initialized
* to have a capacity of three rules, since there will only be at most two
* outgoing rules and one empty character circular rule in this algorithm
*/
struct NFAState {
int ruleCount; ///< number of transition rules this state have
struct transRule** rules; ///< the transition rules
};
struct NFAState* createState(void);
void destroyState(struct NFAState* state);
/**
* @brief Definition for the NFA itself.
* statePool[0] is defined to be its starting state,
* and statePool[1] is defined to be its accepting state.
* for simplicity's sake all NFAs are initialized to have
* a small fixed capacity, although due to the recursive nature
* of this algorithm this capacity is believed to be sufficient
*/
struct NFA {
int stateCount; ///< the total number of states this NFA have
struct NFAState** statePool; ///< the pool of all available states
int ruleCount; ///< the total number of transition rules in this NFA
struct transRule** rulePool; ///< the pool of all transition rules
int CSCount; ///< the number of currently active states
struct NFAState** currentStates; ///< the pool of all active states
int subCount; ///< the number of sub NFAs
struct NFA** subs; ///< the pool of all sub NFAs
int wrapperFlag; ///< whether this NFA is a concatenation wrapper
};
struct NFA* createNFA(void);
void destroyNFA(struct NFA* nfa);
void addState(struct NFA* nfa, struct NFAState* state);
void addRule(struct NFA* nfa, struct transRule* rule, int loc);
void postProcessing(struct NFA* nfa);
void transit(struct NFA* nfa, char input);
int isAccepting(const struct NFA* nfa);
/* End definitions, begin abstract syntax tree construction */
/**
* @brief helper function to determine whether a character should be
* considered a character literal
* @param ch the character to be tested
* @returns `1` if it is a character literal
* @returns `0` otherwise
*/
int isLiteral(const char ch) {
return !(ch == '(' || ch == ')' || ch == '*' || ch == '\n' || ch == '|');
}
/**
* @brief performs preprocessing on a regex string,
* making all implicit concatenations explicit
* @param input target regex string
* @returns pointer to the processing result
*/
char* preProcessing(const char* input) {
const size_t len = strlen(input);
if(len == 0) {
char* str = malloc(1);
str[0] = '\0';
return str;
}
char* str = malloc(len * 2);
size_t op = 0;
for (size_t i = 0; i < len - 1; ++i) {
char c = input[i];
str[op++] = c;
// one character lookahead
char c1 = input[i + 1];
if( (isLiteral(c) && isLiteral(c1)) ||
(isLiteral(c) && c1 == '(') ||
(c == ')' && c1 == '(') ||
(c == ')' && isLiteral(c1)) ||
(c == '*' && isLiteral(c1)) ||
(c == '*' && c1 == '(')
) {
// '\n' is used to represent concatenation
// in this implementation
str[op++] = '\n';
}
}
str[op++] = input[len - 1];
str[op] = '\0';
return str;
}
/**
* @brief utility function to locate the first occurrence
* of a character in a string while respecting parentheses
* @param str target string
* @param key the character to be located
* @returns the index of its first occurrence, `0` if it could not be found
*/
size_t indexOf(const char* str, char key) {
int depth = 0;
for (size_t i = 0; i < strlen(str); ++i) {
const char c = str[i];
if(depth == 0 && c == key) {
return i;
}
if(c == '(') depth++;
if(c == ')') depth--;
}
// Due to the way this function is intended to be used,
// it's safe to assume the character will not appear as
// the string's first character
// thus `0` is used as the `not found` value
return 0;
}
/**
* @brief utility function to create a subString
* @param str target string
* @param begin starting index, inclusive
* @param end ending index, inclusive
* @returns pointer to the newly created subString
*/
char* subString(const char* str, size_t begin, size_t end) {
char* res = malloc(end - begin + 2);
strncpy(res, str + begin, end - begin + 1);
res[end - begin + 1] = '\0';
return res;
}
/**
* @brief recursively constructs a AST from a preprocessed regex string
* @param input regex
* @returns pointer to the resulting tree
*/
struct ASTNode* buildAST(const char* input) {
struct ASTNode* node = createNode('\0');
node->left = NULL;
node->right = NULL;
const size_t len = strlen(input);
size_t index;
// Empty input
if(len == 0) return node;
// Character literals
if(len == 1) {
node->content = input[0];
return node;
}
// Discard parentheses
if(input[0] == '(' && input[len - 1] == ')') {
char* temp = subString(input, 1, len - 2);
destroyNode(node);
node = buildAST(temp);
free(temp);
return node;
}
// Union
index = indexOf(input, '|');
if(index) {
node->content = '|';
char* temp1 = subString(input, 0, index - 1);
char* temp2 = subString(input, index + 1, len - 1);
node->left = buildAST(temp1);
node->right = buildAST(temp2);
free(temp2);
free(temp1);
return node;
}
// Concatenation
index = indexOf(input, '\n');
if(index) {
node->content = '\n';
char* temp1 = subString(input, 0, index - 1);
char* temp2 = subString(input, index + 1, len - 1);
node->left = buildAST(temp1);
node->right = buildAST(temp2);
free(temp2);
free(temp1);
return node;
}
// Kleene star
// Testing with indexOf() is unnecessary here,
// Since all other possibilities have been exhausted
node->content = '*';
char* temp = subString(input, 0, len - 2);
node->left = buildAST(temp);
node->right = NULL;
free(temp);
return node;
}
/* End AST construction, begins the actual algorithm itself */
/**
* @brief helper function to recursively redirect transition rule targets
* @param nfa target NFA
* @param src the state to redirect away from
* @param dest the state to redirect to
* @returns void
*/
void redirect(struct NFA* nfa, struct NFAState* src, struct NFAState* dest) {
for (int i = 0; i < nfa->subCount; ++i) {
redirect(nfa->subs[i], src, dest);
}
for (int i = 0; i < nfa->ruleCount; ++i) {
struct transRule* rule = nfa->rulePool[i];
if (rule->target == src) {
rule->target = dest;
}
}
}
struct NFA* compileFromAST(struct ASTNode* root) {
struct NFA* nfa = createNFA();
// Empty input
if (root->content == '\0') {
addRule(nfa, createRule(nfa->statePool[1], '\0'), 0);
return nfa;
}
// Character literals
if (isLiteral(root->content)) {
addRule(nfa, createRule(nfa->statePool[1], root->content), 0);
return nfa;
}
switch (root->content) {
case '\n': {
struct NFA* ln = compileFromAST(root->left);
struct NFA* rn = compileFromAST(root->right);
// Redirects all rules targeting ln's accepting state to
// target rn's starting state
redirect(ln, ln->statePool[1], rn->statePool[0]);
// Manually creates and initializes a special
// "wrapper" NFA
destroyNFA(nfa);
struct NFA* wrapper = malloc(sizeof(struct NFA));
wrapper->stateCount = 2;
wrapper->statePool = malloc(sizeof(struct NFAState*) * 2);
wrapper->subCount = 0;
wrapper->subs = malloc(sizeof(struct NFA*) * 2);
wrapper->ruleCount = 0;
wrapper->rulePool = malloc(sizeof(struct transRule*) * 3);
wrapper->CSCount = 0;
wrapper->currentStates = malloc(sizeof(struct NFAState*) * 2);
wrapper->wrapperFlag = 1;
wrapper->subs[wrapper->subCount++] = ln;
wrapper->subs[wrapper->subCount++] = rn;
// Maps the wrapper NFA's starting and ending states
// to its sub NFAs
wrapper->statePool[0] = ln->statePool[0];
wrapper->statePool[1] = rn->statePool[1];
return wrapper;
}
case '|': {
struct NFA* ln = compileFromAST(root->left);
struct NFA* rn = compileFromAST(root->right);
nfa->subs[nfa->subCount++] = ln;
nfa->subs[nfa->subCount++] = rn;
// Adds empty character transition rules
addRule(nfa, createRule(ln->statePool[0], '\0'), 0);
addRule(ln, createRule(nfa->statePool[1], '\0'), 1);
addRule(nfa, createRule(rn->statePool[0], '\0'), 0);
addRule(rn, createRule(nfa->statePool[1], '\0'), 1);
return nfa;
}
case '*': {
struct NFA* ln = compileFromAST(root->left);
nfa->subs[nfa->subCount++] = ln;
addRule(ln, createRule(ln->statePool[0], '\0'), 1);
addRule(nfa, createRule(ln->statePool[0], '\0'), 0);
addRule(ln, createRule(nfa->statePool[1], '\0'), 1);
addRule(nfa, createRule(nfa->statePool[1], '\0'), 0);
return nfa;
}
}
// Fallback, shouldn't happen in normal operation
destroyNFA(nfa);
return NULL;
}
/* Ends the algorithm, begins NFA utility functions*/
/**
* @brief adds a state to a NFA
* @param nfa target NFA
* @param state the NFA state to be added
* @returns void
*/
void addState(struct NFA* nfa, struct NFAState* state) {
nfa->statePool[nfa->stateCount++] = state;
}
/**
* @brief adds a transition rule to a NFA
* @param nfa target NFA
* @param rule the rule to be added
* @param loc which state this rule should be added to
* @returns void
*/
void addRule(struct NFA* nfa, struct transRule* rule, int loc) {
nfa->rulePool[nfa->ruleCount++] = rule;
struct NFAState* state = nfa->statePool[loc];
state->rules[state->ruleCount++] = rule;
}
/**
* @brief performs postprocessing on a compiled NFA,
* add circular empty character transition rules where
* it's needed for the NFA to function correctly
* @param nfa target NFA
* @returns void
*/
void postProcessing(struct NFA* nfa) {
// Since the sub NFA's states and rules are managed
// through their own pools, recursion is necessary
for (int i = 0; i < nfa->subCount; ++i) {
postProcessing(nfa->subs[i]);
}
// If a state does not have any empty character accepting rule,
// we add a rule that circles back to itself
// So this state will be preserved when
// empty characters are inputted
for (int i = 0; i < nfa->stateCount; ++i) {
struct NFAState* pState = nfa->statePool[i];
int f = 0;
for (int j = 0; j < pState->ruleCount; ++j) {
if(pState->rules[j]->cond == '\0') {
f = 1;
break;
}
}
if (!f) {
addRule(nfa, createRule(pState, '\0'), i);
}
}
}
/**
* @brief helper function to determine an element's presence in an array
* @param states target array
* @param len length of the target array
* @param state the element to search for
* @returns `1` if the element is present, `0` otherwise
*/
int contains(struct NFAState** states, int len, struct NFAState* state) {
int f = 0;
for (int i = 0; i < len; ++i) {
if(states[i] == state) {
f = 1;
break;
}
}
return f;
}
/**
* @brief helper function to manage empty character transitions
* @param target target NFA
* @param states pointer to results storage location
* @param sc pointer to results count storage location
* @returns void
*/
void findEmpty(struct NFAState* target, struct NFAState** states, int *sc) {
for (int i = 0; i < target->ruleCount; ++i) {
const struct transRule *pRule = target->rules[i];
if (pRule->cond == '\0' && !contains(states, *sc, pRule->target)) {
states[(*sc)++] = pRule->target;
// the use of `states` and `sc` is necessary
// to sync data across recursion levels
findEmpty(pRule->target, states, sc);
}
}
}
/**
* @brief moves a NFA forward
* @param nfa target NFA
* @param input the character to be fed into the NFA
* @returns void
*/
void transit(struct NFA* nfa, char input) {
struct NFAState** newStates = malloc(sizeof(struct NFAState*) * 10);
int NSCount = 0;
if (input == '\0') {
// In case of empty character input, it's possible for
// a state to transit to another state that's more than
// one rule away, we need to take that into account
for (int i = nfa->CSCount - 1; i > -1; --i) {
struct NFAState *pState = nfa->currentStates[i];
nfa->CSCount--;
struct NFAState** states = malloc(sizeof(struct NFAState*) * 10);
int sc = 0;
findEmpty(pState, states, &sc);
for (int j = 0; j < sc; ++j) {
if(!contains(newStates,NSCount, states[j])) {
newStates[NSCount++] = states[j];
}
}
free(states);
}
} else {
// Iterates through all current states
for (int i = nfa->CSCount - 1; i > -1; --i) {
struct NFAState *pState = nfa->currentStates[i];
// Gradually empties the current states pool, so
// it can be refilled
nfa->CSCount--;
// Iterates through rules of this state
for (int j = 0; j < pState->ruleCount; ++j) {
const struct transRule *pRule = pState->rules[j];
if(pRule->cond == input) {
if(!contains(newStates, NSCount, pRule->target)) {
newStates[NSCount++] = pRule->target;
}
}
}
}
}
nfa->CSCount = NSCount;
for (int i = 0; i < NSCount; ++i) {
nfa->currentStates[i] = newStates[i];
}
free(newStates);
}
/**
* @brief determines whether the NFA is currently in its accepting state
* @param nfa target NFA
* @returns `1` if the NFA is in its accepting state
* @returns `0` otherwise
*/
int isAccepting(const struct NFA* nfa) {
for (int i = 0; i < nfa->CSCount; ++i) {
if(nfa->currentStates[i] == nfa->statePool[1]) {
return 1;
}
}
return 0;
}
/* Ends NFA utilities, begins testing function*/
/**
* @brief Testing helper function
* @param regex the regular expression to be used
* @param string the string to match against
* @param expected expected results
* @returns void
*/
void testHelper(const char* regex, const char* string, const int expected) {
char* temp = preProcessing(regex);
struct ASTNode* node = buildAST(temp);
struct NFA* nfa = compileFromAST(node);
postProcessing(nfa);
// reallocates the outermost NFA's current states pool
// because it will actually be used to store all the states
nfa->currentStates = realloc(nfa->currentStates, sizeof(struct NFAState*) * 100);
// Starts the NFA by adding its starting state to the pool
nfa->currentStates[nfa->CSCount++] = nfa->statePool[0];
// feeds empty characters into the NFA before and after
// every normal character
for (size_t i = 0; i < strlen(string); ++i) {
transit(nfa, '\0');
transit(nfa, string[i]);
}
transit(nfa, '\0');
assert(isAccepting(nfa) == expected);
destroyNFA(nfa);
destroyNode(node);
free(temp);
}
/**
* @brief Self-test implementations
* @returns void
*/
static void test(void) {
testHelper("(c|a*b)", "c", 1);
testHelper("(c|a*b)", "aab", 1);
testHelper("(c|a*b)", "ca", 0);
testHelper("(c|a*b)*", "caaab", 1);
testHelper("(c|a*b)*", "caba", 0);
testHelper("", "", 1);
testHelper("", "1", 0);
testHelper("(0|(1(01*(00)*0)*1)*)*","11",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","110",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","1100",1);
testHelper("(0|(1(01*(00)*0)*1)*)*","10000",0);
testHelper("(0|(1(01*(00)*0)*1)*)*","00000",1);
printf("All tests have successfully passed!\n");
}
/**
* @brief Main function
* @returns 0 on exit
*/
int main(void) {
test(); // run self-test implementations
return 0;
}
/* I opted to place these more-or-less boilerplate code and their docs
* at the end of file for better readability */
/**
* @brief creates and initializes a AST node
* @param content data to initializes the node with
* @returns pointer to the newly created node
*/
struct ASTNode* createNode(const char content) {
struct ASTNode* node = malloc(sizeof(struct ASTNode));
node->content = content;
node->left = NULL;
node->right = NULL;
return node;
}
/**
* @brief recursively destroys a AST
* @param node the root node of the tree to be deleted
* @returns void
*/
void destroyNode(struct ASTNode* node) {
if(node->left != NULL) {
destroyNode(node->left);
}
if(node->right != NULL) {
destroyNode(node->right);
}
free(node);
}
/**
* @brief creates and initializes a transition rule
* @param state transition target
* @param c transition condition
* @returns pointer to the newly created rule
*/
struct transRule* createRule(struct NFAState* state, char c) {
struct transRule* rule = malloc(sizeof(struct transRule));
rule->target = state;
rule->cond = c;
return rule;
}
/**
* @brief destroys a transition rule object
* @param rule pointer to the object to be deleted
* @returns void
*/
void destroyRule(struct transRule* rule) {
free(rule);
}
/**
* @brief creates and initializes a NFA state
* @returns pointer to the newly created NFA state
*/
struct NFAState* createState(void) {
struct NFAState* state = malloc(sizeof(struct NFAState));
state->ruleCount = 0;
state->rules = malloc(sizeof(struct transRule*) * 3);
return state;
}
/**
* @brief destroys a NFA state
* @param state pointer to the object to be deleted
* @returns void
*/
void destroyState(struct NFAState* state) {
free(state->rules);
free(state);
}
/**
* @brief creates and initializes a NFA
* @returns pointer to the newly created NFA
*/
struct NFA* createNFA(void) {
struct NFA* nfa = malloc(sizeof(struct NFA));
nfa->stateCount = 0;
nfa->statePool = malloc(sizeof(struct NFAState*) * 5);
nfa->ruleCount = 0;
nfa->rulePool = malloc(sizeof(struct transRule*) * 10);
nfa->CSCount = 0;
nfa->currentStates = malloc(sizeof(struct NFAState*) * 5);
nfa->subCount = 0;
nfa->subs = malloc(sizeof(struct NFA*) * 5);
nfa->wrapperFlag = 0;
addState(nfa, createState());
addState(nfa, createState());
return nfa;
}
/**
* @brief recursively destroys a NFA
* @param nfa pointer to the object to be deleted
* @returns void
*/
void destroyNFA(struct NFA* nfa) {
for (int i = 0; i < nfa->subCount; ++i) {
destroyNFA(nfa->subs[i]);
}
// In case of a wrapper NFA, do not free its states
// because it doesn't really have any states of its own
if (!nfa->wrapperFlag) {
for (int i = 0; i < nfa->stateCount; ++i) {
destroyState(nfa->statePool[i]);
}
}
for (int i = 0; i < nfa->ruleCount; ++i) {
destroyRule(nfa->rulePool[i]);
}
free(nfa->statePool);
free(nfa->currentStates);
free(nfa->rulePool);
free(nfa->subs);
free(nfa);
}