/**********************************************************************\ * * DFATREE.C * * Creates a syntax tree from the regular expression. * * Before parsing the regular expression is changed from the character * array to an integer array and also changed to a simpler syntax. Same * process does also lexical analyzing so that special symbols are in * the higher byte and normal characters in the lower byte. * * Parser is a recursive descent parser that builds the syntax tree. * * Parser syntax: * * primary-expression: * constant * character-class * empty * ( expression ) * * closure-expression: * primary-expression * closure-expression primary-expression * * cat-expression: * closure-expression * cat-expression closure-expression * * or-expression: * cat-expression * or-expression cat-expression * * expression: * or-expression * * Author: Jarmo Ruuth 15-Mar-1988 * * Copyright (C) 1988-90 by Jarmo Ruuth * May be freely copied for any non-commercial usage. \**********************************************************************/ #include "system.h" #include "dfa.h" #define NCLASSITEMS NLANG #define REGBLOCK 10 #define NEGATE_CLASS '^' #define EMPTY ( '@' << CHARBITS ) /* epsilon */ #define CLASS ( '[' << CHARBITS ) #define L_PAR ( '(' << CHARBITS ) #define R_PAR ( ')' << CHARBITS ) #define CLOSURE_OP ( '*' << CHARBITS ) #define OR_OP ( '|' << CHARBITS ) #define BOL_OP ( '^' << CHARBITS ) #define EOL_OP ( '$' << CHARBITS ) #define EOS 0 static node_t* treeptr; /* current syntax tree pointer */ static node_t* treestart; /* pointer to start of the syntax tree */ static int* regexp; /* modified regular expression */ static int* regptr; /* current pointer to regexp */ static int memerr; /* flag for out of memory error */ static int current; /* current regexp token type */ static int next; /* next regexp token type */ static int pos; /* position in syntax tree */ static int regsize; /* allocated size for regexp */ static int regpos; /* index to regexp */ static int last_class; /* last character class in classes */ static set_t** classes; /* array of character classes for regexp */ static uchar* language; /* boolean array of characters in language */ static int expression (void); /********************************************************************** * * CLEAR_LANGUAGE */ #ifdef BSD #define CLEAR_LANGUAGE(l) bzero(l, NLANG) #else #define CLEAR_LANGUAGE(l) memset(l, FALSE, NLANG) #endif /********************************************************************** * * tree_error */ static int tree_error(char* msg) { current = next = pos = ERROR; /* stop parsing */ rbuf.reg_error = msg; return ERROR; } /********************************************************************** * * new_pos * * Gets new tree node and sets pos to point to it. */ static int new_pos(void) { ++pos; if (pos == 0) { if ((treestart = malloc(sizeof(node_t))) == NULL) return tree_error("Out of memory"); treeptr = treestart; } else { if ((treestart=realloc(treestart,(pos+1)*sizeof(node_t)))==NULL) return tree_error("Out of memory"); treeptr = treestart + pos; } return TRUE; } /********************************************************************** * * add_language * * Adds a new character to the recognized language. */ static void add_language(int ch) { language[ch] = TRUE; if (rbuf.ignorecase && isalpha(ch)) language[CHCASE(ch)] = TRUE; } /********************************************************************** * * add_item * * Adds a new leaf item to the syntax tree and returns tree position. */ static int add_item(node_type_t type, unsigned value) { if (new_pos() == ERROR) return ERROR; treeptr->type = type; treeptr->val.item = value; if (type != EMPTY_ID) add_language(value); return pos; } /********************************************************************** * * add_node * * Adds a new node to the syntax tree and returns tree position. */ static int add_node(node_type_t type, int left, int right) { if (new_pos() == ERROR) return ERROR; treeptr->type = type; treeptr->val.next.left = left; treeptr->val.next.right = right; return pos; } /********************************************************************** * * add_regexp * * Adds a new item to the modified regular expression. */ static void add_regexp(int item) { if (regexp == NULL) { regpos = -1; regsize = REGBLOCK; if ((regexp = malloc(regsize*sizeof(int))) == NULL) memerr = TRUE; } if (regpos == regsize-1) { regsize += REGBLOCK; if ((regexp = realloc(regexp, regsize*sizeof(int))) == NULL) memerr = TRUE; } ++regpos; if (!(item & ~MAXCHAR) && rbuf.ignorecase && islower(item)) regexp[regpos] = toupper(item); else regexp[regpos] = item; } /********************************************************************** * * block_beg * * Finds the beginning of a possible block that starts from regpos. */ static int block_beg(int regpos) { REG1 int i, cnt; if (regexp[regpos] != R_PAR) return regpos; for (cnt = 1, i = regpos-1; i >= 0 && cnt; i--) { if (regexp[i] == L_PAR) --cnt; if (regexp[i] == R_PAR) ++cnt; } return i+1; } /********************************************************************** * * copy_class * * Copies class to the classes array. We can't use the same class twice, * because each class is released by calling free(). */ static int copy_class(set_t *class) { REG1 int bsize; REG2 set_t *tmpclass; bsize = set_bytesize(NCLASSITEMS); if ((tmpclass = malloc(bsize)) == NULL) { memerr = TRUE; return 0; } set_copy(tmpclass, class, bsize); if ((classes=realloc(classes,(++last_class+1)*sizeof(set_t *)))==NULL) { memerr = TRUE; return 0; } classes[last_class] = tmpclass; return last_class; } /********************************************************************** * * add_one_or_more * * Changes r+ to rr*. */ static void add_one_or_more(void) { REG1 int i, pos; for (i = block_beg(regpos), pos = regpos; i <= pos; i++) add_regexp(regexp[i]); for (i = pos+1, pos = regpos; i <= pos; i++) if (regexp[i] == CLASS) regexp[i+1] = copy_class(classes[regexp[i+1]]); add_regexp(CLOSURE_OP); } /********************************************************************** * * add_left_paren * * Finds previous block in regexp and adds left parethesis before it. */ static void add_left_paren(int regpos) { REG1 int i, pos, save; save = regexp[regpos]; pos = block_beg(regpos); for (i = regpos; i > pos; i--) regexp[i] = regexp[i-1]; regexp[pos] = L_PAR; add_regexp(save); } /********************************************************************** * * add_zero_or_one * * Changes r? to (r|î). */ static void add_zero_or_one(void) { add_left_paren(regpos); add_regexp(OR_OP); add_regexp(EMPTY); add_regexp(R_PAR); } /********************************************************************** * * add_class * * Adds character class to regexp. Classes are added to regexp so that * it is surrounded by parenthesis and it contains CLASS marker followed * by an integer that is class index to the classes array. */ static void add_class(REG1 uchar* class) { REG2 int i; class[rbuf.eol] = class[EOL] = class[REM] = FALSE; if (classes == NULL) { if ((classes = malloc(sizeof(set_t *))) == NULL) { memerr = TRUE; return; } last_class = 0; } else { if ((classes=realloc(classes,(++last_class+1)*sizeof(set_t *))) == NULL) { memerr = TRUE; return; } } if ((classes[last_class] = malloc(set_bytesize(NCLASSITEMS))) == NULL) { memerr = TRUE; return; } set_clear(classes[last_class], set_bytesize(NCLASSITEMS)); for (i = 0; i < NCLASSITEMS; i++) if (class[i]) { if (!language[i]) add_language(i); add_set(classes[last_class], i); } if (set_empty(classes[last_class], set_bytesize(NCLASSITEMS))) add_regexp(EMPTY); else { add_regexp(L_PAR); add_regexp(CLASS); add_regexp(last_class); add_regexp(R_PAR); } } /********************************************************************** * * init_class */ #define init_class(c, v) memset(c, v, NCLASSITEMS); /********************************************************************** * * get_class * * Collects class characters to boolean array indexed by characters * and adds it to regexp. */ static uchar* get_class(REG3 uchar* str) { REG1 int i, end_range; REG4 int neg = FALSE; uchar class[NCLASSITEMS]; init_class(class, FALSE); ++str; /* skip starting '[' */ if (*str == NEGATE_CLASS) { ++str; neg = TRUE; } if (*str == ']') { ++str; class[']'] = TRUE; } for (; *str && *str != ']'; str++) { if (*str == '\\' && !*(++str)) return NULL; if (*(str+1) == '-' && *(str+2) != ']') { i = *str; str += *(str+2) == '\\' ? 3 : 2; if ((end_range = *str) == '\0') return NULL; if (end_range < i) { int tmp = i; i = end_range; end_range = tmp; } for (; i <= end_range; i++) class[i] = TRUE; continue; } class[*str] = TRUE; } if (rbuf.ignorecase) { for (i = 1; i < NCHARS; i++) if (isalpha(i)) class[i] |= class[CHCASE(i)]; } if (neg) for (i = 1; i < NCLASSITEMS; i++) class[i] = !class[i]; return *str == ']' ? add_class(class),str : NULL; } /********************************************************************** * * add_any * * Adds a character class to the regexp that matches any character except * end of line. */ static void add_any(void) { uchar class[NCLASSITEMS]; init_class(class, TRUE); class[BOL] = class[rbuf.eol] = class[EOL] = FALSE; add_class(class); } /********************************************************************** * * add_search_init * * Adds [0-MAXCHAR]*( to the beginning of the regexp if we are building * a substring searcher. Note, that no characters from here are added * to the language. */ static void add_search_init(void) { uchar class[NCLASSITEMS]; init_class(class, TRUE); add_class(class); add_regexp(CLOSURE_OP); add_regexp(L_PAR); } /********************************************************************** * * make_regexp * * Changes regular expression to simpler syntax (uses only '(', ')', '|', * '*' and 'î'). Integer array is used, so full eight bit character set * except '\0' can be used. */ static char* make_regexp(REG1 uchar* str) { REG2 uchar* strbeg = str; REG3 int paren_cnt = 0; regexp = NULL; /* Check special cases and make sure, that they match to every line. */ /* Is this really necessary and correct ? */ if (((*str == '$' || *str == '^') && *(str+1) == '\0') || *str == '\0') str = "a*"; /* If we are building a substring searcher, change regexp r to form [0-MAXCHAR]*(r). It guarantees, that substring is found in O(n+m)-time. */ memerr = FALSE; if (rbuf.search) add_search_init(); CLEAR_LANGUAGE(language); rbuf.begline = rbuf.endline = FALSE; for (; *str && !memerr; str++) { switch (*str) { case '\\': if (*(str+1) != '\0') add_regexp(*(++str)); else add_regexp(*str); break; case '(': if (*(str+1) == ')') ++str; else if (*(str+1) == '\0') add_regexp(*str); else { add_regexp(L_PAR); ++paren_cnt; } break; case ')': --paren_cnt; add_regexp(R_PAR); break; case '[': str = get_class(str); if (str == NULL) return "Unterminated character class"; break; case ']': add_regexp(*str); break; case '*': if (str != strbeg) add_regexp(CLOSURE_OP); else add_regexp(*str); break; case '?': if (str != strbeg) add_zero_or_one(); else add_regexp(*str); continue; case '+': if (str != strbeg) add_one_or_more(); else add_regexp(*str); break; case '.': add_any(); break; case '|': if (str == strbeg || *(str+1) == '\0') add_regexp(*str); else if (*(str-1) == '(') { add_regexp(EMPTY); add_regexp(OR_OP); } else if (*(str+1) == ')') { add_regexp(OR_OP); add_regexp(EMPTY); } else add_regexp(OR_OP); break; case '^': rbuf.begline = TRUE; add_regexp(BOL_OP); break; case '$': rbuf.endline = TRUE; add_regexp(EOL_OP); break; default: if (*str == rbuf.eol || *str == rbuf.eol2) return "End of line character " "in regular expression"; add_regexp(*str); break; } } /* Finish L_PAR started at add_search_init() */ if (rbuf.search) add_regexp(R_PAR); add_regexp(EOS); if (memerr) return "Out of memory"; return paren_cnt ? "Unbalanced parenthesis" : NULL; } /********************************************************************** * * fetch_next */ #define fetch_next() \ { \ current = next; \ next = *(++regptr); \ } /********************************************************************** * * fetch_new */ #define fetch_new() \ { \ current = *(++regptr); \ next = *(++regptr); \ } /********************************************************************** * * primary_exp */ static int primary_exp(void) { REG1 int last_pos; switch (current) { case L_PAR: fetch_next(); /* skip L_PAR */ last_pos = expression(); if (next != R_PAR) return tree_error("Error in expression"); fetch_next(); /* skip R_PAR */ return last_pos; case R_PAR: case CLOSURE_OP: case OR_OP: return tree_error("Error in expression"); case EOS: return tree_error("Premature end of expression"); case CLASS: fetch_next(); if (new_pos() == ERROR) return ERROR; treeptr->type = CLASS_ID; treeptr->val.class = classes[current]; return pos; case EMPTY: return add_item(EMPTY_ID, 0); case EOL_OP: add_language(EOL); return add_item(ID, rbuf.eol); case BOL_OP: return add_item(ID, BOL); case ERROR: return ERROR; default: return add_item(ID, current); } } /********************************************************************** * * closure_exp */ static int closure_exp(void) { REG1 int left; left = primary_exp(); switch (next) { case CLOSURE_OP: fetch_next(); return add_node(CLOSURE, left, 0); default: return left; } } /********************************************************************** * * cat_exp */ static int cat_exp(void) { REG1 int left, right; left = closure_exp(); for (;;) { switch (next) { case R_PAR: case CLOSURE_OP: case OR_OP: case EOS: case ERROR: return left; case L_PAR: default: fetch_next(); right = closure_exp(); if (right == ERROR) return ERROR; if (add_node(CAT, left, right) == ERROR) return ERROR; left = pos; continue; } } } /********************************************************************** * * expression */ static int expression(void) { REG1 int left, right; left = cat_exp(); for (;;) { switch (next) { case OR_OP: fetch_new(); right = cat_exp(); if (right == ERROR) return ERROR; if (add_node(OR, left, right) == ERROR) return ERROR; left = pos; continue; default: return left; } } } /********************************************************************** * * add_endmarker * * Adds unique right-end marker to the regular expression. */ static int add_endmarker(REG1 int left) { REG2 int right; if (add_item(ID, REM) == ERROR) return ERROR; /* catenate original expression and right-end marker */ right = pos; if (add_node(CAT, left, right) == ERROR) return ERROR; return TRUE; } /********************************************************************** * * convert_language * * Converts language from boolean array to a list of those characters * that belong to the language. Rbuf.language[0] is number of characters * in language. */ static bool convert_language(void) { REG1 unsigned i, j; /* calculate number of items in language */ for (i = j = 0; i < NLANG; i++) if (language[i]) j++; /* allocate buffer for language items */ if ((rbuf.language = malloc((j+1)*sizeof(rbuf.language[0]))) == NULL) return FALSE; /* Copy language elements to the rbuf. Note that the zeroth element contains the number of characters in language. */ rbuf.language[0] = j; for (i = 0, j = 1; i < NLANG; i++) if (language[i]) rbuf.language[j++] = i; return TRUE; } #ifdef TEST #if defined(__TURBOC__) || defined(MSDOS) #define EMPTYCH 0xee #else #define EMPTYCH '@' #endif extern int debug; /********************************************************************** * * print_regexp */ static void print_regexp(void) { int* ptr; for (ptr = regexp; *ptr; ptr++) { switch (*ptr) { case L_PAR: putchar('('); break; case R_PAR: putchar(')'); break; case CLOSURE_OP: putchar('*'); break; case OR_OP: putchar('|'); break; case EMPTY: putchar(EMPTYCH); break; case BOL_OP: putchar('^'); break; case EOL_OP: putchar('$'); break; case CLASS: putchar('['); ++ptr; printf("%d", *ptr); putchar(']'); break; default: putchar(*ptr); break; } } putchar('\n'); } #endif /* TEST */ /********************************************************************** * * free_treemem * * Releases all unnecessary memory. */ static void free_treemem(void) { d_free(&language); d_free(&classes); d_free(®exp); } /********************************************************************** * * error_return */ static int error_return(REG1 char* msg) { d_free(&treestart); free_treemem(); if (msg) rbuf.reg_error = msg; else if (!rbuf.reg_error) rbuf.reg_error = "Error in expression"; return ERROR; } /********************************************************************** * * init_tree */ static char* init_tree(void) { classes = NULL; treestart = treeptr = rbuf.tree = NULL; rbuf.root = -1; regexp = NULL; if ((language = malloc(NLANG)) == NULL) return "Out of memory"; return NULL; } /********************************************************************** * * create_tree * * Creates syntax tree from given regular expression. Returns root * of the tree or -1, if there was errors. */ int create_tree(REG2 uchar* str) { REG1 int left; REG3 char* msg; if ((msg = init_tree()) != NULL) return error_return(msg); if ((msg = make_regexp(str)) != NULL) return error_return(msg); #ifdef TEST if (debug) print_regexp(); #endif regptr = regexp; current = *regptr; next = *(++regptr); pos = -1; left = expression(); if (left == ERROR || next != EOS) return error_return(NULL); if (add_endmarker(left) == ERROR) return error_return(NULL); if (!convert_language()) return error_return("Out of memory"); free_treemem(); rbuf.tree = treestart; rbuf.root = pos; return pos; }