static char rcsid[] = "$Id: regex.c,v 1.1 1992/09/06 19:31:32 mike Exp $"; /* $Log: regex.c,v $ * Revision 1.1 1992/09/06 19:31:32 mike * Initial revision * */ /* * regex - Regular expression pattern matching * and replacement * * By: Ozan S. Yigit (oz), Dept. of Computer Science, York University * Mods: C Durland * * These routines are the PUBLIC DOMAIN equivalents of regex routines as * found in 4.nBSD UN*X, with minor extensions. * * These routines are derived from various implementations found in software * tools books, and Conroy's grep. They are NOT derived from * licensed/restricted software. For more interesting/academic/complicated * implementations, see Henry Spencer's regexp routines, or GNU Emacs * pattern matching module. * * dfa = deterministic finite automata * Routines: * re_comp: compile a regular expression into a DFA. * char *re_comp(s) * char *s; * returns: NULL if OK, else error string * If s is NULL or 0 length, last compiled pattern is used. * re_exec: execute the DFA to match a pattern. * int re_exec(s) * char *s; * re_subs: substitute the matched portions in a new string. * int re_subs(src, dst) * char *src; * char *dst; * re_fail: failure routine for re_exec. * void re_fail(msg, op) * char *msg; * char op; * * Regular Expressions: * * [1] char matches itself, unless it is a special * character (metachar): . \ [ ] * + ^ $ * * [2] . matches any character. * * [3] \ matches the character following it, except * when followed by one of: ()123456789<> adnwW * (see [7], [8] and [9]) * It is used as an escape character for all other * meta-characters, and itself. When used in a set * ([4]), it is treated as an ordinary character. * * [4] [set] matches one of the characters in the set. * If the first character in the set is "^", * it matches a character NOT in the set. A * shorthand S-E is used to specify a set of * characters S upto E, inclusive. The special * characters "]" and "-" have no special * meaning if they appear as the first chars * in the set. * Example: Matches: * -------- ------- * [a-z] Any lowercase alpha. * * [^]-] Any char except ] and -. * * [^A-Z] Any char except uppercase alpha. * * [a-zA-Z] Any alpha. * * [a-b-c] == [a-bb-c] == [a-c] * [a-a] == [a] * [-abc] == [abc-] Match -, a, b, c. * * []] == ] Match "]" * [-]] Match only "-]". This is a set ([-]) * and a character (]). * * [z-a] Nothing and is an error. * * [5] * any regular expression form [1] to [4], followed by * closure char (*) matches zero or more matches of * that form. * * [6] + same as [5], except it matches one or more. * * [7] a regular expression in the form [1] to [10], enclosed * as \(form\) matches what form matches. The enclosure * creates a set of tags, used for [8] and for * pattern substution. The tagged forms are numbered * starting from 1. * * [8] a \ followed by a digit 1 to 9 matches whatever a * previously tagged regular expression ([7]) matched. * * [9] \< a regular expression starting with a \< construct * \> and/or ending with a \> construct, restricts the * pattern matching to the beginning of a word, and/or * the end of a word. A word is defined to be a character * string beginning and/or ending with the characters * A-Z a-z 0-9 and _. It must also be preceded and/or * followed by any character outside those mentioned. * * [10] a composite regular expression xy where x and y * are in the form [1] to [10] matches the longest * match of x followed by a match for y. * * [11] ^ a regular expression starting with a ^ character * $ and/or ending with a $ character, restricts the * pattern matching to the beginning of the line, * or the end of line. [anchors] Elsewhere in the * pattern, ^ and $ are treated as ordinary characters. * * Acknowledgements: * HCR's Hugh Redelmeier has been most helpful in various stages of * development. He convinced me to include BOW and EOW constructs, * originally invented by Rob Pike at the University of Toronto. * References: * Software tools Kernighan & Plauger * Software tools in Pascal Kernighan & Plauger * Grep [rsx-11 C dist] David Conroy * ed - text editor Un*x Programmer's Manual * Advanced editing on Un*x B. W. Kernighan * RegExp routines Henry Spencer * Notes: * This implementation uses a bit-set representation for character sets for * speed and compactness. Each character is represented by one bit in a * N-bit block. Thus, SET or NSET always takes a constant M bytes in the * internal dfa, and re_exec does a single bit comparison to locate the * character in the set. N is 128 for 7 bits ASCII and 256 for 8 bit * data. Thus M is 16 or 32 bytes. * Put CLO in front of what gets closed for ease of interpreting. * Put END at end of what gets closed to limit recursion. * Examples: * pattern: foo*.* * compile: CHR f CHR o CLO CHR o END CLO ANY END END * matches: fo foo fooo foobar fobar foxx ... * * pattern: fo[ob]a[rz] * compile: CHR f CHR o SET bitset CHR a SET bitset END * matches: fobar fooar fobaz fooaz * * pattern: foo\\+ * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END * matches: foo\ foo\\ foo\\\ ... * * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo) * compile: BOT 1 CHR f CHR o CHR o EOT 1 SET bitset REF 1 END * matches: foo1foo foo2foo foo3foo * * pattern: \(fo.*\)-\1 * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END * matches: foo-foo fo-fo fob-fob foobar-foobar ... */ #include #include typedef unsigned char Char; #define MAXDFA 768 /* amount of space for compiled RE */ #define MAXTAG 10 #define CHR 1 /* character :: CHR */ #define ANY 2 /* . :: ANY */ #define SET 3 /* set: [...] :: SET bitset */ #define NSET 4 /* not set: [^...] :: SET bitset */ #define BOL 5 /* beginning of line: ^ :: BOL */ #define EOL 6 /* end of line: $ :: EOL */ #define BOT 7 /* beginning of tag: \( */ #define EOT 8 /* end of tag: \) */ #define BOW 9 /* beginning of word: \< */ #define EOW 10 /* end of word: \> */ #define REF 11 /* tag reference: \1,...,\9 */ #define CLO 12 /* closure: +, * :: CLO dfa END */ #define SPACE 13 /* ": ": match isspace() */ #define ALPHA 14 /* :a match isalpha() */ #define DIGIT 15 /* :d match isdigit() */ #define ALNUM 16 /* :n match isalnum() */ #define WORD 17 /* :w match isword() */ #define NWORD 18 /* :W match !isword() */ #define END 0 /* ******************************************************************** */ /* **************************** Bit Tables **************************** */ /* ******************************************************************** */ /* * Bit table: a string of bits stored in an array of char * .-----------------------------------. * |01234567|89012345|67890123|45678901| * `-----------------------------------' * bits[0] [1] [2] [3] * To find what bucket the nth bit is in (8 bits per bucket): * bit_bucket(n) = bits[n/8] * It might be a good idea to restrict n so it doesn't index outside its * range (only works if number of bits is a power of 2): * n = n & ((max_n - 1) & ~7) where max_n is a power of 2 * The ~7 is just to get rid of the lower bits that won't do anything * anyway. * The nth bit in the bucket is n mod 8 (ie the lower 3 bits of n (0-7) are * the bit position): * bit_flag(n) = 1 << (n & 7) * To find the state of the nth bit (0 == off and !0 == on): * bit_bucket(n) & bit_flag(n) * To set the nth bit: * bits[bit_bucket(n)] |= bit_flag(n) * Notes: * The bits are stored in a character array so that the array can be * easily copied without worrying about alignment (ie can use a loop as * well as memcpy()). * This is based on two's complement math. */ /* The following defines are for character set size. 128 for straight * ASCII, 256 for Euro ASCII (8 bit characters). */ #define MAXCHR 256 /* 128 or 256 */ #define BLKIND 0xf8 /* 0x78 or 0xf8 */ /* * The following defines are not meant to be changeable. * They are for readability only. */ #define CHRBIT 8 #define BITBLK MAXCHR/CHRBIT #define BITIND 0x7 static Char bittab[BITBLK]; /* bit table for SET */ /* Add or check to see if a character is in the bit table (character * set). * Note: * When calling these routines, make sure c is an unsigned char (or * int) so if it has the high bit set, casting it to an int won't * make it a large negative number. */ #define ISINSET(bittab,c) ((bittab)[((c) & BLKIND)>>3] & (1<<((c) & BITIND))) #define CHSET(bittab,c) (bittab)[((c) & BLKIND)>>3] |= 1<<((c) & BITIND) static void chset(c) register int c; { CHSET(bittab,c); } /* ******************************************************************** */ #define SLOP 50 /* dfa overflow protection */ static Char dfa[MAXDFA + SLOP]; /* automaton */ static int tagstk[MAXTAG], /* subpat tag stack */ pattern_compiled = FALSE; /* status of lastpat */ #define BADPAT(msg) return (*dfa = END, (Char *)msg) #define STORE(x) (*mp++ = x) /* !!! should check for dfa overflow */ /* Compile RE to internal format & store in dfa[] * Input: * pat : pointer to regular expression string to compile. * Returns: * NULL: RE compiled OK. * pointer to error message. */ Char *re_comp(pat) Char *pat; { register Char *p, /* pattern pointer */ *mp = dfa, /* dfa pointer */ *lp, /* saved pointer */ *sp = dfa; /* another one */ register int tagi = 0, /* tag stack index */ tagc = 1, /* actual tag count */ n; if (pat == NULL || *pat == '\0') if (pattern_compiled) return NULL; else BADPAT("No previous regular expression"); pattern_compiled = FALSE; *dfa = END; /* init dfa for lp and sp */ for (p = pat; *p; p++) { lp = mp; /* start of next dfa state */ switch(*p) { case '.': STORE(ANY); break; /* match any character */ case '^': /* match beginning of line */ if (p == pat) STORE(BOL); else { STORE(CHR); STORE(*p); } /* match ^ */ break; case '$': /* match end of line */ if (*(p+1) == '\0') STORE(EOL); else { STORE(CHR); STORE(*p); } /* match $ */ break; case '[': /* match a set of characters */ if (*++p == '^') { STORE(NSET); p++; } else STORE(SET); if (*p == ']') chset(*p++); /* real bracket, match ] */ if (*p == '-') chset(*p++); /* real dash, match - */ while (*p && *p != ']') { if (*p == '-' && *(p+1) != '\0' && *(p+1) != ']') /* [a-z] */ { int c1, c2; p++; c1 = *(p-2); /* 'a' */ c2 = *p++; /* 'z' */ if (c1 > c2) BADPAT("Empty set"); /* something like [z-a] */ /* remember that 'a' has already been put into bittab */ while (++c1 <= c2) chset(c1); /* build bit table */ } #ifdef EXTEND else if (*p == '\\' && *(p+1)) { p++; chset(*p++); } #endif else chset(*p++); } if (*p == '\0') BADPAT("Missing ]"); for (n = 0; n < BITBLK; bittab[n++] = '\0') STORE(bittab[n]); break; case '*': /* match 0 or more of preceding RE */ case '+': /* match 1 or more. Note: x+ == xx* */ if (p == pat) BADPAT("Empty closure"); lp = sp; /* previous opcode */ if (*lp == CLO) break; /* equivalence: x** == x* */ switch(*lp) { case BOL: case BOT: case EOT: case BOW: case EOW: case REF: BADPAT("Illegal closure"); } if (*p == '+') for (sp = mp; lp < sp; lp++) STORE(*lp); STORE(END); STORE(END); sp = mp; while (--mp > lp) *mp = mp[-1]; STORE(CLO); /* open hole for CLO */ mp = sp; break; case '\\': /* tags, backrefs */ switch(*++p) { case '\0': BADPAT("Bad quote"); case '(': if (tagc < MAXTAG) { tagstk[++tagi] = tagc; STORE(BOT); STORE(tagc++); } else BADPAT("Too many \\(\\) pairs"); break; case ')': if (*sp == BOT) BADPAT("Null pattern inside \\(\\)"); if (tagi > 0) { STORE(EOT); STORE(tagstk[tagi--]); } else BADPAT("Unmatched \\)"); break; case '<': STORE(BOW); break; case '>': if (*sp == BOW) BADPAT("Null pattern inside \\<\\>"); STORE(EOW); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': n = *p - '0'; if (tagi > 0 && tagstk[tagi] == n) BADPAT("Cyclical reference"); if (tagc > n) { STORE(REF); STORE(n); } else BADPAT("Undetermined reference"); break; case ' ': STORE(SPACE); break; case 'a': STORE(ALPHA); break; case 'd': STORE(DIGIT); break; case 'n': STORE(ALNUM); break; case 'w': STORE(WORD); break; case 'W': STORE(NWORD); break; #ifdef EXTEND case 'b': STORE(CHR); STORE('\b'); break; case 'n': STORE(CHR); STORE('\n'); break; case 'f': STORE(CHR); STORE('\f'); break; case 'r': STORE(CHR); STORE('\r'); break; case 't': STORE(CHR); STORE('\t'); break; #endif default: STORE(CHR); STORE(*p); } break; default : STORE(CHR); STORE(*p); break; /* an ordinary character */ } sp = lp; /* start of previous state */ if (mp > dfa + MAXDFA) BADPAT("Pattern too long (braindead re_comp())"); } if (tagi > 0) BADPAT("Unmatched \\("); STORE(END); pattern_compiled = TRUE; return NULL; } static Char *pmatch(); Char *bopat[MAXTAG], *eopat[MAXTAG]; int re_errorcode; /* sleaze */ static Char *bol; /* re_exec: execute dfa to find a match. * * special cases: (dfa[0]) * BOL * Match only once, starting from the beginning. * CHR * First locate the character without calling pmatch(), and if found, * call pmatch() for the remaining string. * END * re_comp() failed, poor luser did not check for it. Fail fast. * * If a match is found, bopat[0] and eopat[0] are set to the beginning and * the end of the matched fragment, respectively. * * Input: * lp: string to search * SoL==TRUE if lp starts line * move==TRUE if search the entire string for match * Returns: * TRUE: * FALSE: * ???? * Munges: * re_errorcode: Set to FALSE, re_fail() might want to set it. * Notes: * If SoL is FALSE, lp[-1] MUST be at valid! A couple of REs will look * there if they can. */ int re_exec(lp,SoL,move) register Char *lp; int SoL, move; { register Char *ap = dfa, c; Char *ep = NULL; re_errorcode = FALSE; /* assume no match */ bol = (SoL ? lp : NULL); bopat[0] = bopat[1] = bopat[2] = bopat[3] = bopat[4] = bopat[5] = bopat[6] = bopat[7] = bopat[8] = bopat[9] = NULL; switch(*ap) { case END: return FALSE; /* munged automaton. fail always */ case BOL: /* anchored: match from BOL only */ if (!SoL) return FALSE; /* BoL can only be at front of dfa */ ep = pmatch(lp,++ap); break; case CHR: /* ordinary char: locate it fast */ if (move) { c = *(ap+1); while (*lp && !ceq(*lp,c)) lp++; if (!*lp) return FALSE; /* if EoS, fail. else fall thru */ } default: /* regular matching all the way. */ if (!move) { ep = pmatch(lp,ap); break; } #if 0 while (*lp) { if ((ep = pmatch(lp,ap))) break; lp++; } #else while (TRUE) { if ((ep = pmatch(lp,ap))) break; if ('\0' == *lp++) break; } #endif break; } if (!ep) return re_errorcode; /* only if pmatch() returns NULL */ bopat[0] = lp; eopat[0] = ep; return TRUE; } /* * pmatch: internal routine for the hard part * * This code is mostly snarfed from an early grep written by David Conroy. * The backref and tag stuff, and various other mods are by oZ. * * special cases: (dfa[n], dfa[n+1]) * CLO ANY * We KNOW ".*" will match ANYTHING upto the end of line. Thus, go to * the end of line straight, without calling pmatch() recursively. As in * the other closure cases, the remaining pattern must be matched by * moving backwards on the string recursively, to find a match for xy (x * is ".*" and y is the remaining pattern) where the match satisfies the * LONGEST match for x followed by a match for y. * CLO CHR * Scan forward matching the single char without recursion, and at the * point of failure, we execute the remaining dfa recursively, as * described above. * * At the end of a successful match, bopat[n] and eopat[n] are set to the * beginning and end of subpatterns matched by tagged expressions (n = 1 * to 9). * * Input: * Returns: * NULL: No match, re_fail() may have been called. * else: pointer to end of match. */ extern void re_fail(); /* skip values for CLO XXX to skip past the closure */ #define ANYSKIP 2 /* CLO ANY END ... */ #define CHRSKIP 3 /* CLO CHR chr END ... */ #define SETSKIP (2 +BITBLK) /* CLO SET 16bytes END ... */ static Char *pmatch(lp,dfa) register Char *lp, *dfa; { register Char *e, /* extra pointer for CLO */ *bp, *ep; /* beginning and ending of subpat */ Char *are; /* to save the line ptr */ register int op, c, n; while ((op = *dfa++) != END) switch(op) { case CHR: if (!ceq(*lp++,*dfa++)) return NULL; break; case ANY: if (*lp++ == '\0') return NULL; break; case SET: c = *lp++; if (!ISINSET(dfa,c)) return NULL; /* ISINSET(dfa,0) is FALSE */ dfa += BITBLK; break; case NSET: if ((c = *lp++) == '\0' || ISINSET(dfa,c)) return NULL; dfa += BITBLK; break; case BOT: bopat[*dfa++] = lp; break; case EOT: eopat[*dfa++] = lp; break; case EOL: if (*lp != '\0') return NULL; break; case ALNUM: if (!isalnum(*lp++)) return NULL; break; case ALPHA: if (!isalpha(*lp++)) return NULL; break; case DIGIT: if (!isdigit(*lp++)) return NULL; break; case SPACE: if (!isspace(*lp++)) return NULL; break; case WORD: if (!isword (*lp++)) return NULL; break; case NWORD: if (*lp == '\0' || isword(*lp++)) return NULL; break; case BOW: if (!(lp != bol && isword(lp[-1])) && isword(*lp)) break; return NULL; case EOW: /* 'w\0' is OK here */ if ((lp != bol && isword(lp[-1])) && !isword(*lp)) break; return NULL; case REF: /* !!! case_fold? */ n = *dfa++; bp = bopat[n]; ep = eopat[n]; while (bp < ep) if (*bp++ != *lp++) return NULL; /* !!! recurse? */ break; case CLO: are = lp; n = ANYSKIP; switch(*dfa) { case ANY: while (*lp) lp++; break; case ALNUM: while (isalnum(*lp)) lp++; break; case ALPHA: while (isalpha(*lp)) lp++; break; case DIGIT: while (isdigit(*lp)) lp++; break; case SPACE: while (isspace(*lp)) lp++; break; case WORD: while (isword (*lp)) lp++; break; case NWORD: while (*lp && !isword(*lp)) lp++; break; case CHR: c = *(dfa+1); /* we know c!='\0' */ while (ceq(*lp,c)) lp++; n = CHRSKIP; break; case SET: case NSET: while (*lp && (e = pmatch(lp,dfa))) lp = e; n = SETSKIP; break; default: re_fail("closure: bad dfa.",*dfa); return NULL; } dfa += n; while (lp >= are) /* backup up till match next pattern */ { if (e = pmatch(lp,dfa)) return e; --lp; } return NULL; default: re_fail("re_exec: bad dfa.",op); return NULL; } return lp; } /* * re_subs: substitute the matched portions of the src in dst. * & substitute the entire matched pattern. * \digit substitute a subpattern, with the given * tag number. Tags are numbered from 1 to * 9. If the particular tagged subpattern * does not exist, null is substituted. * !!!Note: if the line that was used re_exec() has gone byebye * then \digit will blow cookies since the tags point into the line. * Input: * src: * dst: * Returns: * TRUE: everything went as expected * FALSE: Bad src or no match. */ int re_subs(src,dst) register Char *src, *dst; { register Char c, *bp, *ep; register int pin; if (!bopat[0]) return FALSE; while (c = *src++) { switch(c) { case '&': pin = 0; break; case '\\': c = *src++; if (c >= '0' && c <= '9') { pin = c - '0'; break; } default: *dst++ = c; continue; } if ((bp = bopat[pin]) && (ep = eopat[pin])) { while (*bp && bp < ep) *dst++ = *bp++; if (bp < ep) return FALSE; } } *dst = '\0'; return TRUE; } /* ******************************************************************** */ /* ************************* DEBUG ************************************ */ /* ******************************************************************** */ #ifdef DEBUG /* * symbolic - produce a symbolic dump of the dfa */ symbolic(s) char *s; { printf("pattern: %s\n", s); printf("dfacode:\n"); dfadump(dfa); } static dfadump(dfa) Char *dfa; { register int n; while (*dfa != END) switch(*dfa++) { case CLO: printf("CLOSURE"); dfadump(dfa); switch(*dfa) { case CHR: n = CHRSKIP; break; case ANY: n = ANYSKIP; break; case SET: case NSET: n = SETSKIP; break; } dfa += n; break; case CHR: printf("\tCHR %c\n",*dfa++); break; case ANY: printf("\tANY .\n"); break; case BOL: printf("\tBOL -\n"); break; case EOL: printf("\tEOL -\n"); break; case BOT: printf("BOT: %d\n",*dfa++); break; case EOT: printf("EOT: %d\n",*dfa++); break; case BOW: printf("BOW\n"); break; case EOW: printf("EOW\n"); break; case REF: printf("REF: %d\n",*dfa++); break; case SET: printf("\tSET ["); for (n = 0; n < MAXCHR; n++) if (ISINSET(dfa,n)) printf("%c",n); printf("]\n"); dfa += BITBLK; break; case NSET: printf("\tNSET ["); for (n = 0; n < MAXCHR; n++) if (ISINSET(dfa,n)) printf("%c",n); printf("]\n"); dfa += BITBLK; break; default: printf("bad dfa. opcode %o\n", dfa[-1]); exit(1); break; } } #endif