/************************************************************************ * This program is Copyright (C) 1986 by Jonathan Payne. JOVE is * * provided to you without charge, and with no warranty. You may give * * away copies of JOVE, including sources, provided that this notice is * * included in all the files. * ************************************************************************/ /* * Modified: Sep-88 by Tom Hageman [TRH] * fixed bug in re_dosub; some fine tuning & additions * Mar-89 [TRH] extend to fuller RE, remove alternate parameter. * Jan-91 [TRH] more optimization. * {{TODO: more robustness in RE compiler.}} */ /* * The regular expression string is compiled into a (finite nondeterministic) * automaton. The following opcodes are used: * * token opcode arguments * ----- ------ --------- * ^ AT_BOL * $ AT_EOL * \< AT_BOW * \> AT_EOW * \{alt1, ... altN>\} ALT |len1|alt| ... lenN|altN|EOP| * \( OPENP+# * \) CLOSEP+# * \# BACKREF # * . ANY * [...], [^...] ONE_OF bitmap * (endmarker) EOP * (implements loops) BACK offset * (avoid enless loops) MARK * (normal characters) NORMC |len|literal string| * (case independent) CINDC |len|literal string| * (end w/ paren count) NUMP # * (implement negation) NOT |len|subexpr| * * Each "atomic" RE (except ^,$,\<,\>) can be followed by a *, +, or ? suffix * for (0 or more), (1 or more), (0 or one) matches. (or \! to negate the * match condition {{this is expeximental!}}) * The "simple" REs have the corresponding code (STAR, PLUS, QUEST) * ORed in with their opcode; NORMC/CINDC modified in this way * have a single character argument instead of a counted string. * The complex cases "\{ ... \}[*+?]" and "\( ... \)[*+?]" are explicitly * compiled as follows: * \(\)[*+?] -> \([*+?]\) * * * -> MARK\{BACK(MARK),\} * + -> MARK\{BACK(MARK),\} * ? -> \{,\} * * where BACK(x) means: jump backward to the opcode for in the compiled * RE automaton. * * The \| (alternate) operator is compiled much the same way as the * curly brace pair, except that an alternate is ended with a NUMP opcode, * to record the actual number of \( \) parens in that alternate. * * The first RE_NFIRST characters in the buffer are reserved for start- * character optimization. */ #include "tune.h" /* to allow definition of debug flags there. */ #ifdef DEBUG /* General debug flag, turn on file-specific flag. */ # ifndef RE_DEBUG # define RE_DEBUG 1 # endif #else # if RE_DEBUG /* File-specific debug flag, turn on general flag. */ # define DEBUG # endif #endif #include "jove.h" RCS("$Id: re.c,v 14.32.0.10 1994/04/22 18:24:23 tom Exp tom $") #include "ctype.h" #define Extern public #include "re.h" #ifndef TINY private int REfirst __(( const char *_(comp_p), char *_(firsts) )); #endif private void cset_range __(( char *_(base), int _(from), int _(to) )), cset_case_ignore __(( char *_(base) )), REreset __(( void )); private char *do_comp __(( char *_(comp_base), int _(in_subalt), char **_(parens) )), *ins_alt __(( char *_(at_p), char *_(to_p) )); /* convert to upper case to achieve case equivalence -- MOVED TO re.h */ /* bit field manipulations */ #define BLKIND(c) (_Uc(c) >> 3) #define BITIND(c) ((c) & 07) #define SETMEMBER(c,s) ((s)[BLKIND(c)] |= Bit(BITIND(c))) #define ISMEMBER(c,s) ((s)[BLKIND(c)] & Bit(BITIND(c))) char compbuf[RE_SIZE], /* global default compbuf */ *rep_search ZERO, /* replace search string */ *rep_str ZERO; /* contains replacement string */ int REdirection; DEF_INT( "case-ignore-search", CaseIgnore, V_BOOL ) ZERO; DEF_INT( "wrap-search", WrapScan, V_BOOL ) ZERO; DEF_INT( "match-regular-expressions", UseRE, V_BOOL ) ZERO; /* The following are ORed in in main opcode of simple kinds. */ #define QUEST 1 /* ? match 0 or 1 of last RE. */ #define PLUS 2 /* + match 1 or more of last RE. */ #define STAR 3 /* * match any number of last RE. */ /* These values are chosen so that "x+?", "x+*", "x?*" all boil down to `x*', just as you would expect. */ /* Opcodes for compiled RE. */ #define EOP 0 /* end of pattern. */ /* note: some optimizations are aware of this being zero */ #define AT_BOL 1 /* ^ */ #define AT_EOL 2 /* $ */ #define AT_BOW 3 /* \< */ #define AT_EOW 4 /* \> */ #define BACK 5 /* backward jumps */ #define NUMP 6 /* EOP with paren count */ #define NOT 7 /* \! (experimental) */ #define NOSTR 8 /* Codes < NOSTR can't be [*+?]'d. */ #define ANYC NOSTR /* . */ #define NORMC (NOSTR+4) /* normal character */ #define CINDC (NOSTR+8) /* case independent character */ #define ONE_OF (NOSTR+12) /* [xxx] and [^xxx] */ #define SIMPLE (NOSTR+16) /* codes < SIMPLE can have [*+?] ORed in */ #define OPENP (SIMPLE-1) /* \( numbered [1..NPAR] */ #define CLOSEP (OPENP+NPAR) /* \) numbered [1..NPAR] */ #define ALT (CLOSEP+NPAR+1) /* \{ */ #define MARK (ALT+1) /* target of backward jump. */ #define BACKREF (ALT+2) /* \# in [1..NPAR] */ #define NPAR 9 /* [1..9] */ /* nice shorthands */ #define isOPENP(c) ((unsigned)((c)-OPENP-1) 0400) /* Small RE buffers don't need long format jumps. */ # define JUMP_LONG 0377 # endif #else # if (2* JUMP_LONG -1 != 2* 0 -1) # undef JUMP_LONG # define JUMP_LONG 0377 # endif #endif #if JUMP_LONG /* Calculate the number of bytes a jump (opcode + offset) will take, as a function of the distance to jump. A short jump offset will fit in a single byte; a long format jump has itself tagged as such in the byte normally used for the short offset, and encodes the offset in two additional bytes. */ # define JUMP_SIZE(len) ((len) >= JUMP_LONG ? 4 : 2) private int jump_store __(( char *_(at), int _(len), int _(backward) )); private int jump_store(at, len, backward) register char *at; register int len; int backward; { ASSERT(len > 0); if (len >= JUMP_LONG) { if (!backward) { /* Shift tail to make room for long address. */ register char *d = at + len; while (--d > at) d[2] = d[0]; } *at++ = JUMP_LONG; *at++ = (char)(len >> 8); backward += 2; } *at = (char) len; return backward; } /* Store a backward jump offset `length' in the position pointed to by `at' while advancing the pointer. */ # define B_JUMP_STORE(at, len) (at += jump_store(at, len, YES)) /* Calculate a forward jump offset from pointers `end', presumably pointing to the position to jump to, and `at', presumably pointing to the jump's offset location. It is assumed that a single byte is already allocated to hold the jump's offset. If necessary the stuff between `at' and `end' is shifted upward and the `end' pointer is updated to allocate space for a long jump offset. */ # define F_JUMP_STORE(at, end) (end += jump_store(at, end - at, NO)) #else /* Short jumps only; pretty easy... */ # define JUMP_SIZE(len) 2 # define B_JUMP_STORE(at, len) (*at=len,++at) # define F_JUMP_STORE(at, end) (*at=end-at) #endif /* Size of MARK instruction. */ #define MARK_SIZE 3 /* * REcompile: compiles a regular expression pattern in an internal code. * if `mode' <= 0, use pattern as a literal string, else do full regular * expression parsing. (this way we can temporarily switch in and out * of RE mode in macros, using the next trick: * "ESC -- (set match-RE) ... search ... ESC - (set match-RE) */ void REcompile(pattern, re, into_buf) register const char *pattern; register char *into_buf; { nparens = 0; REdirection = 0; /* !BACKWARD; - restore default. */ /* * Optimization: calculate the set of possible start characters * for this RE. The trivial case is a single character: this * character is stored in compiled[0], while compiled[1] contains * a copy of "case-ignore-search" at compile-time. The more * complex case involves a bitset of allowed characters: the * offset to this is encoded in compiled[0] and compiled[1] and * is marked by the RE_FSET flag set in compiled[1]. * No first-character optimization is present when both * compiled[0] == 0 and not (compiled[1] & RE_FSET). */ #define RE_NFIRST 2 #define RE_FSET (~0177) if (re) { char *parens[NPAR+1]; /* paren stack */ parens[0] = NULL; /* mark bottom of stack */ end_compb = into_buf + RE_SIZE - 7; /* chicken factor */ REptr = pattern; *into_buf++ = '\0'; /* clear FIRSTC and CASE_IGN */ *into_buf++ = NO; cur_compb = into_buf; #ifndef TINY # define end_p pattern end_p = #endif do_comp(into_buf, NO, &parens[1]); REreset(); /* post-process the compiled expression for start-characters */ { register char *p = into_buf; #ifndef TINY register int anchored = 0; #endif for (;;) { switch (*p++) { /* easy cases are handled here. */ case CINDC: into_buf[-1]++; /* fall through ... */ case NORMC: into_buf[-2] = p[1]; break; case AT_BOL: #ifndef TINY anchored++; #endif case AT_BOW: case AT_EOW: continue; default: if (isPAREN(p[-1])) continue; #ifndef TINY if (p[-1] == MARK) { p += 2; continue; } /* * The hard case: if there is room after * the actual RE, the bitset of possible * start characters is placed there. It * is built in a temporary variable so as * to not disturb some semi pre-compiled * expressions that use a small RE buffer. * Anchored match doesn't need it, either. */ if (anchored) break; if (end_p > into_buf + RE_SIZE - RE_NFIRST - CSETSIZE) break; { char firstset[CSETSIZE]; /* blank it out. */ p = &firstset[CSETSIZE]; do *--p = 0; while (p > firstset); if (REfirst(into_buf, p) <= 0) break; /* copy bitset to final location. */ p = (char *) end_p + CSETSIZE; end_p = &firstset[CSETSIZE]; do *--p = *--end_p; while (end_p > firstset); } /* now `p' points to firstset in RE buffer. */ if (True(CaseIgnore)) cset_case_ignore(p); /* fall through... */ case ONE_OF: case ONE_OF | PLUS: { /* store offset to start bitset */ register int offset = p - into_buf; *--into_buf = RE_FSET | (offset >> 7); *--into_buf = offset & (Bit(7)-1); } # undef end_p #endif /* TINY */ break; } break; } } } else { /* RE_LITERAL */ register int len = strlen(pattern); # define comp_p into_buf /* alias */ #if (RE_SIZE - RE_NFIRST - 1 >= MAXBYTE) if (len >= MAXBYTE) complain(TooLong); #else if (len >= RE_SIZE - RE_NFIRST - 1) complain(TooLong); #endif #ifdef OLD_CASEIGNORE if (True(CaseIgnore)) { *comp_p++ = CEquiv(*pattern); *comp_p++ = YES; *comp_p++ = CINDC; *comp_p++ = len; while (*comp_p++ = CEquiv(*pattern++)) ; } else { *comp_p++ = *pattern; *comp_p++ = NO; *comp_p++ = NORMC; *comp_p++ = len; while (*comp_p++ = *pattern++) ; } #else /* The new way of ignoring case ignores case only if the SEARCH PATTERN is lower case; uppercase characters in the search pattern must match exactly. */ *comp_p++ = *pattern; if (True(CaseIgnore)) { *comp_p++ = YES; *comp_p++ = CINDC; } else { *comp_p++ = NO; *comp_p++ = NORMC; } *comp_p++ = len; while (*comp_p++ = *pattern++) ; #endif # undef comp_p /* unalias */ } } /* compile the pattern into an internal code */ private char * do_comp(comp_base, in_subalt, parens) char *comp_base; char **parens; { register char *comp_p = comp_base; /* speed ... */ register char *last_p = NULL; /* non-NULL if [*+?] allowed */ register char *chr_cnt = NULL; /* non-NULL if collecting chars */ register int c; char **parenp = parens; register int suffix; /* static to communicate "return code" between incarnations */ static int more; more = NO; suffix = STAR; /* loop invariant. */ while (c = *REptr++) { if (comp_p > end_compb) toolong: complain(TooLong); /* handle [*+?] suffixes before `last_p' is updated. */ switch (c) { /* PRE: (suffix == STAR) */ case '?': /* suffix = QUEST; */ suffix += QUEST - PLUS; case '+': /* suffix = PLUS; */ suffix += PLUS - STAR; case '*': /* suffix = STAR; */ if (chr_cnt) last_p = chr_cnt - 1; /* Make sure we can handle it. */ if (last_p == NULL || *last_p < NOSTR) { #if NOTYET if (True(AllowSPQ)) goto defchar; #endif complain("Unexpected %c", c); } #ifndef TINY /* Collect multiple [*?+]'s. {this is an optimization which speeds up compilation and tends to make the resulting RE smaller and more efficient, especially for non-trivial cases. On the other hand, multiple flags are rarely used in practice, so why bother, really?} */ for (;;) { switch (*REptr++) { case '?': suffix |= QUEST; continue; case '+': suffix |= PLUS; /*- continue; -*/ case '*': suffix |= STAR; continue; } REptr--; break; } #endif if (chr_cnt) { /* NORMC, CINDC */ c = *--comp_p; if (suffix == PLUS) { /* compile x+ as xx*. */ ++comp_p; /* suffix = STAR; */ suffix += STAR - PLUS; } else if (--*chr_cnt == 0) { comp_p = last_p; } *comp_p = *last_p; #ifdef TINY last_p = comp_p; #endif *comp_p++ |= suffix; *comp_p++ = c; chr_cnt = NULL; } else if (*last_p < SIMPLE) { /* ANYC, ONE_OF */ if (suffix == PLUS && *last_p == ANYC) { /* * just copy previous atom, since "x+" * is really just shorthand for "xx*" */ register char *end_p = comp_p; while (last_p < end_p) *comp_p++ = *last_p++; suffix += STAR - PLUS; } *last_p |= suffix; } else { /* OPENP, ALT, BACKREF */ register int closep = 0; /* Suffixes apply to expression inside \( \). */ if (isOPENP(*last_p)) { ++last_p; closep = *--comp_p; } switch (suffix) { case QUEST: /* * Compile x? as: * ______ * / v * ALT o x EOP o EOP * \_^ */ *comp_p++ = EOP; comp_p = ins_alt(last_p, comp_p); break; case PLUS: /* * Compile x+ as: * _______ * / v * MARK x ALT o BACK o o EOP * ^_______________/ \_^ */ *comp_p++ = ALT; #if JUMP_LONG *comp_p = JUMP_SIZE(comp_p - last_p + MARK_SIZE + 2) + 1, comp_p++; #else *comp_p++ = 2 + 1; #endif *comp_p++ = BACK; B_JUMP_STORE(comp_p, comp_p - last_p + MARK_SIZE); goto insert_mark; /*- case STAR: -*/ default: /* * Compile x* as: * _________ * / v * MARK ALT o x BACK o o EOP * ^_______________/ \_^ */ *comp_p++ = BACK; B_JUMP_STORE(comp_p, comp_p - last_p + MARK_SIZE + JUMP_SIZE(comp_p - last_p + MARK_SIZE + 1)); comp_p = ins_alt(last_p, comp_p); insert_mark: { char *save = comp_p; do { --comp_p; comp_p[MARK_SIZE] = comp_p[0]; } while (comp_p > last_p); *last_p = MARK; comp_p = save + MARK_SIZE; } break; } *comp_p++ = 1; *comp_p++ = EOP; if (closep) { /* restore \( \) */ #ifdef TINY --last_p; #endif *comp_p++ = closep; } } suffix = STAR; /* restore loop invariant. */ continue; #ifdef NOT case '\\': if (*REptr == '!') { REptr++; *comp_p++ = EOP; comp_p = ins_alt(last_p, comp_p); *last_p = NOT; continue; } #endif } last_p = comp_p; switch (c) { case '\\': switch (c = *REptr++) { case '\0': complain("Premature end of pattern."); case '{': { char *save = last_p; *comp_p++ = ALT; do { last_p = comp_p++; comp_p = do_comp(comp_p, YES, parenp); F_JUMP_STORE(last_p, comp_p); } while (more); #if JUMP_LONG if (*last_p == 0) /* borrow */ --last_p[-1]; #endif --*last_p; /* i.e. point to EOP == 0 */ last_p = save; break; } case '}': if (!in_subalt) complain("Unexpected \\}."); goto outahere; case '(': if (++nparens >= NPAR) complain("Too many \\('s; max is %d.", NPAR); *parenp++ = comp_p; *comp_p++ = OPENP + nparens; last_p = NULL; break; case ')': /* Check for unmatched \) and empty \(\)'s. */ if (parenp == parens) complain("Unexpected \\)."); last_p = *--parenp; *comp_p++ = *last_p + (CLOSEP - OPENP); break; case '|': /* * [TRH] Emulate end of pattern, * EXCEPT for first occurrence */ c = '\0'; more++; if (comp_base != cur_compb) goto outahere; *comp_p++ = NUMP; *comp_p++ = nparens; comp_p = ins_alt(cur_compb, comp_p); do { nparens = 0; last_p = comp_p++; comp_p = do_comp(comp_p, NO, parenp); comp_p[-1] = NUMP; /* replace EOP */ *comp_p++ = nparens; F_JUMP_STORE(last_p, comp_p); } while (more); *comp_p++ = 0; /* end of alt list */ goto outahere; /* we are done */ case_PAR('0'): /* * [TRH] we can't allow forward ("..\4..") * or pending ("\(..\1..\)..") sub-expressions. */ { register char **pp = parenp; c -= '0'; if (c <= nparens) { /* no forward ref; check pending */ while(*--pp && c != **pp - OPENP) ; } if (c > nparens || *pp) complain("Backref \\%d not allowed here", c); *comp_p++ = BACKREF; *comp_p++ = c; break; } case '<': *comp_p++ = AT_BOW; break; case '>': *comp_p++ = AT_EOW; break; default: goto defchar; } break; case '.': *comp_p++ = ANYC; break; case '^': if (comp_p != comp_base) goto defchar; *comp_p++ = AT_BOL; break; case '$': if (*REptr && *REptr != '\\' && !(*REptr == ',' && in_subalt)) goto defchar; *comp_p++ = AT_EOL; break; case '[': { /* * [TRH] compile [xxx] and [^xxx] to same code. * we must negate the bitset in case of [^xxx] * Accept ']' as normal char if first in set, * and '-' if first or last in set. * backslash '\' has no special meaning here. */ register char *neg_p = NULL; register int c2; comp_p += 1 + CSETSIZE; if (comp_p > end_compb) goto toolong; if ((c = *REptr++) == '^') { neg_p = comp_p; /* remember... */ c = *REptr++; } /* clear bitset */ do *--comp_p = '\0'; while (comp_p > last_p); *comp_p++ = ONE_OF; do { #if NO if (c == '\\') c = *REptr++; #endif if (c == '\0') complain("Missing ]."); SETMEMBER(c, comp_p); if (*REptr == '-' && (c2 = *++REptr)) { if (c2 == ']') { REptr--; continue; } #if NO if (c2 == '\\') if ((c2 = *++REptr) == '\0') continue; /* for error mesg. */ #endif REptr++; if (c < c2) cset_range(comp_p, c, c2); } } while ((c = *REptr++) != ']'); if (True(CaseIgnore)) cset_case_ignore(comp_p); if (neg_p == NULL) { comp_p += CSETSIZE; } else { /* * [^xxx] so negate bitset. * make sure '\0' is not in set after negation. */ SETMEMBER('\0', comp_p); while (comp_p < neg_p) *comp_p++ ^= ~0; /* ...and comp_p at right spot for free */ } break; } case ',': if (in_subalt) { more++; goto outahere; } /* fall through ... */ defchar: default: if (chr_cnt == NULL) { if (True(CaseIgnore)) *comp_p++ = CINDC; else *comp_p++ = NORMC; chr_cnt = comp_p; *comp_p++ = 0; } #if (RE_SIZE >= MAXBYTE) if (*chr_cnt >= _UC_(MAXBYTE - 1)) goto toolong; #endif ++*chr_cnt; #ifdef OLD_CASEIGNORE if (True(CaseIgnore)) *comp_p++ = CEquiv(c); else #endif *comp_p++ = c; continue; } chr_cnt = NULL; } /* Kludge: remember # of parens for this pattern. */ if (cur_compb == comp_base) *comp_p++ = NUMP, *comp_p++ = nparens; else outahere: *comp_p++ = EOP; /* End of pattern, let's do some error checking. */ if (parenp != parens) complain("Missing \\)."); if (in_subalt && c == 0) /* End of pattern with \}. */ complain("Missing \\}."); return comp_p; } private char * ins_alt(at_p, to_p) char *at_p; register char *to_p; { register char *s = to_p; #if JUMP_LONG int len = to_p - at_p; #endif register char *d = to_p += JUMP_SIZE(len); while (s > at_p) /* make room for new node */ *--d = *--s; *s++ = ALT; #if JUMP_LONG if ((len) >= JUMP_LONG) { *s++ = JUMP_LONG; *s++ = len >> 8; } #endif *s = to_p - s; return to_p; } /* * union of bitset at `base' with bitset in range [from..to] (inclusive) */ private void cset_range(base, from, to) char *base; register int from; { register int mask = -Bit(BITIND(from)); /* assumes 2's complement */ register char *p = base, *q = p + BLKIND(to); p += BLKIND(from); while (p < q) { *p++ |= mask; mask = ~0; } *p |= mask & (Bit(BITIND(to)+1) - 1); } /* * make bitset case-independent * {{depends heavily on ASCII character coding, eg. 'A'-'a' is divisible by 8}} */ private void cset_case_ignore(base) char *base; { #if (('a' - 'A') % 8 == 0 && ('z' - 'a') == 25) /* i.e. ASCII */ { register char *up = base, *lp = up + BLKIND('a'), *up_end = up + BLKIND('Z'); up += BLKIND('A'); # ifdef OLD_CASEIGNORE *lp |= *up & -Bit(BITIND('A')); # endif *up++ |= *lp++ & -Bit(BITIND('a')); /* assumes 2's complement */ do { # ifdef OLD_CASEIGNORE *lp |= *up; *up++ = *lp++; # else *up++ |= *lp++; # endif } while (up < up_end); # ifdef OLD_CASEIGNORE *lp |= *up & (Bit(BITIND('Z')+1) - 1); # endif *up |= *lp & (Bit(BITIND('z')+1) - 1); } # if (BPC == 8) # define CSET_LIMIT 0200 /* to resolve equivalences for char 128..255 */ # endif #else # define CSET_LIMIT 0 #endif #ifdef CSET_LIMIT { /* this does not depend on ASCII character coding */ register int c = CSETSIZE * 8 - 1, m = 0; register char *p = base + CSETSIZE; register int ce; do { if ((m >>= 1) == 0) { --p; m = Bit(8-1); } if ((ce = CEquiv(c)) != c) { # ifdef OLD_CASEIGNORE register char *pe = base + BLKIND(ce); register int me = Bit(BITIND(ce)); if (*p & m) *pe |= me; else if (*pe & me) # else if (ISMEMBER(ce, base)) # endif *p |= m; } } while (--c >= CSET_LIMIT); } # undef CSET_LIMIT #endif } #ifndef TINY /* Build FIRST(comp_p) in cset `firstset' */ private int REfirst(comp_p, firstset) register const char *comp_p; register char *firstset; { for (;;) { switch (*comp_p++) { case NORMC: case CINDC: { register int c = comp_p[1]; SETMEMBER(c, firstset); } break; case NORMC | STAR: case NORMC | QUEST: case CINDC | STAR: case CINDC | QUEST: { register int c = *comp_p++; SETMEMBER(c, firstset); } continue; case ONE_OF: case ONE_OF | PLUS: { register const char *end_p = comp_p + CSETSIZE; do *firstset++ |= *comp_p++; while (comp_p < end_p); firstset -= CSETSIZE; } break; case ONE_OF | QUEST: case ONE_OF | STAR: { register const char *end_p = comp_p + CSETSIZE; do *firstset++ |= *comp_p++; while (comp_p < end_p); firstset -= CSETSIZE; } continue; case AT_EOL: SETMEMBER('\0', firstset); case AT_BOL: break; case ALT: { register int n, maybe = NO; while (n = _UC_(*comp_p++)) { #if JUMP_LONG if (n == JUMP_LONG) { n = (unsigned short)(*comp_p++ << 8), n |= _UC_(*comp_p++); } #endif switch(REfirst(comp_p, firstset)) { case NO: return NO; case -YES: maybe++; } comp_p += n - 1; } if (maybe) continue; } break; case NUMP: case EOP: case BACK: return -YES; /* maybe... */ case AT_BOW: case AT_EOW: case_PAR(OPENP): case_PAR(CLOSEP): continue; case MARK: comp_p += 2; continue; default: return NO; } break; } return YES; } #endif /* TINY */ private const char *pstrtlst[NPAR+1], /* index into REbuf */ *pendlst[NPAR+1], *locrater, *REbolp; /* use pstrtlst[0], pendlst[0] to record complete match */ #define loc1 pstrtlst[0] #define loc2 pendlst[0] int REbom, REeom; /* beginning and end of match */ #ifdef SMALL private int member __(( int _(c), const char *_(base) )); private int member(c, base) register int c; const char *base; { return ISMEMBER(c, base); } #else /* !SMALL i.e., BIG; trade program size for speed. */ /* !!this uses the (local) variable `c' to be (at least partially) safe!! */ # define member(chr, base) ((c = (chr), ISMEMBER(c, base))) #endif #if RE_DEBUG private int level; private void re_dump __(( const char *_(line_p), const char *_(comp_p) )); private void re_dump(line_p, comp_p) register const char *line_p; register const char *comp_p; { char textbuf[17], work[50]; register int i; pop_wind("*DEBUG*", 0, B_SCRATCH); ToLast(); sprintf(work, "\n--- garbled RE: (level %d)\nline ...", level); ins_str(work, 0); ins_str(line_p, 0); ins_str("\npattern (compiled) :", 0); textbuf[0] = textbuf[16] = '\0'; line_p = cur_compb; for (i = 0; i < RE_SIZE; i++) { if ((i & 15) == 0) { sprintf(work, "%s\n%3d %3x ", textbuf, i, i); ins_str(work, 0); } textbuf[i&15] = (isctrl(*line_p)) ? '.' : *line_p; sprintf(work, "%02x ", _UC_(*line_p++)); if (line_p == comp_p) work[2] = '*'; ins_str(work, 0); } ins_str(textbuf, 0); level = 0; } #endif private int REmatch __(( const char *_(line_base), char *_(comp_base) )); private int REmatch(line_base, comp_base) const char *line_base; char *comp_base; { register const char *line_p = line_base; register char *comp_p = comp_base; register int n; #define first_p line_base #define c n /* for member() macro */ #if RE_DEBUG level++; # undef YES # define YES (--level, 1) # undef NO # define NO (--level, 0) #endif for (;;) { switch (*comp_p++) { case NORMC: n = *comp_p++; do { if (*line_p++ != *comp_p++) return NO; } while (--n); continue; case CINDC: /* case independent comparison */ n = *comp_p++; do { #ifdef OLD_CASEIGNORE if (CEquiv(*line_p++) != *comp_p++) #else /* Optimized for most common (lower)case. */ if (*line_p++ != *comp_p++ && CEquiv(line_p[-1]) != comp_p[-1]) #endif return NO; } while (--n); continue; case NUMP: nparens = *comp_p++; /* fall into... */ case EOP: loc2 = line_p; REeom = line_p - REbolp; return YES; /* Success! */ case AT_BOL: if (line_p == REbolp && line_p != locrater) continue; return NO; case AT_EOL: if (*line_p == '\0') continue; return NO; case ANYC: if (*line_p++ != '\0') continue; return NO; case AT_BOW: if (isword(*line_p) && !(line_p > REbolp && isword(line_p[-1])) /* && line_p != locrater -- NO! */) /* consider replacement of "\<[a-z]* *" with ""; test for locrater here would skip next adjacent word, even though it should match */ continue; return NO; case AT_EOW: if (!isword(*line_p) && (line_p > REbolp && isword(line_p[-1])) /* && line_p != locrater -- NO! (see above) */) continue; return NO; case ONE_OF: if (member(*line_p++, comp_p)) { comp_p += CSETSIZE; continue; } return NO; case_PAR(OPENP): n = comp_p[-1] - OPENP; pstrtlst[n] = line_p; pendlst[n] = line_p; continue; case_PAR(CLOSEP): n = comp_p[-1] - CLOSEP; pendlst[n] = line_p; continue; case BACKREF: { register const char *end_p; register char *save_p; n = *comp_p++; save_p = comp_p; comp_p = (char *) pstrtlst[n]; end_p = pendlst[n]; while (comp_p < end_p) if (*line_p++ != *comp_p++) return NO; comp_p = save_p; continue; } case ALT: /* This is a new, backtracking (kinda) version. If rejects matching sub-expressions for which the pattern following this alternative does not match. The old code blindly accepted the first matching sub-expr, no matter what the consequences. It still returns the *first* matching sub-expr, not necessarily the *longest*. Also watch out for simple `*', `+' and `?' at the end of a sub-expr: these either match the longest possible substring, or fail--no backtracking there. (Yes this is a bug...) */ while ((n = _UC_(*comp_p++))) { #if JUMP_LONG if (n == JUMP_LONG) { n = (unsigned short)(*comp_p++ << 8), n |= _UC_(*comp_p++); } #endif --n; /* Debugging/assertion goo: */ #if JUMP_LONG # define ALT_J_L_ASSERTION (comp_p[n-3]==(char)JUMP_LONG && \ comp_p[n-4]==BACK) #else # define ALT_J_L_ASSERTION 0 #endif #define ALT_ASSERTION (comp_p[n]==EOP || comp_p[n-1]==EOP || \ (comp_p[n-2]==NUMP && \ (unsigned)comp_p[n-1]<=NPAR) || \ comp_p[n-2]==BACK || ALT_J_L_ASSERTION) /* You rang...? */ #if RE_DEBUG if (!ALT_ASSERTION) { re_dump(line_p, comp_p); complain("BUG: garbled RE (ALT offset=%d)", n); } #else ASSERT(ALT_ASSERTION); #endif switch (REmatch(line_p, comp_p)) { case NO: comp_p += n; continue; case YES: break; default: /*- case YES+YES: -*/ /* Means matched complete expression, not just sub-expr. (This is a kludge to make it work in combination with BACK.) */ return YES+YES; } comp_p += n; /* Now that we have a matching sub-expression, see if the tail matches too. */ { register char *save = comp_p; /* {{As a speed-optimization we can cache a pointer to the t(r)ailing expression, instead of walking through the list each time.}} */ while ((n = _UC_(*comp_p++))) { #if JUMP_LONG if (n == JUMP_LONG) { n = ((unsigned short) (*comp_p++ << 8)), n |= _UC_(*comp_p++); } #endif --n; #if RE_DEBUG if (!ALT_ASSERTION) { re_dump(line_p, comp_p); complain("BUG: garbled RE (ALT offset=%d)", n); } #else ASSERT(ALT_ASSERTION); #endif comp_p += n; } if (REmatch(loc2, comp_p)) return YES+YES; comp_p = save; } } return NO; case BACK: /* Backward jump, to implement `+' and `*' closure on complex subexpressions. Note that jumping back to *before* the enclosing ALT gives us (recursive) backtracking for free! (I didn't notice it myself until recently...) */ n = _UC_(*comp_p++); #if JUMP_LONG if (n == JUMP_LONG) { n = (unsigned short)(comp_p[0] << 8) | _UC_(comp_p[1]); } #endif #if RE_DEBUG if (!(n > 0 && comp_p[-n-1] != MARK)) { re_dump(line_p, comp_p - 1); complain("BUG: garbled RE (BACK offset=%d)", n); } #endif comp_p -= n; ASSERT(n > 0); /* BACK should point to a MARK opcode, so that we can check for an empty match and fail, instead of running astray in an endless loop. */ ASSERT(comp_p[-1] == MARK); #if pdp11 /* This is a code-size optimization hack. ONLY valid if sizeof(char *) == 2. */ # define MARKVAL(lp) ((unsigned short)(lp)) #else # define MARKVAL(lp) ((lp) - REbolp) #endif if (MARKVAL(line_p) <= (unsigned short)(comp_p[0] << 8) + _UC_(comp_p[1])) return NO; /* Take a shortcut and fall through... */ case MARK: /* Record line pointer for BACK empty match check. */ n = MARKVAL(line_p); *comp_p++ = (char)(n >> 8); *comp_p++ = (char)(n); continue; #ifdef NOT case NOT: n = _UC_(*comp_p++); # if JUMP_LONG if (n == JUMP_LONG) { n = (unsigned short)(*comp_p++ << 8); n |= _UC_(*comp_p++); } # endif if (REmatch(line_p, comp_p)) return NO; comp_p += --n; ASSERT(comp_p[-1]==EOP); continue; #endif case ANYC | QUEST: if (*line_p) quest: if (REmatch(line_p + 1, comp_p)) return YES; continue; case NORMC | QUEST: if (*line_p == *comp_p++) goto quest; continue; case CINDC | QUEST: #ifdef OLD_CASEIGNORE if (CEquiv(*line_p) == *comp_p++) #else if (*line_p == *comp_p++ || CEquiv(*line_p) == comp_p[-1]) #endif goto quest; continue; case ONE_OF | QUEST: if (member(*line_p, (comp_p += CSETSIZE) - CSETSIZE)) goto quest; continue; case ANYC | STAR: first_p = line_p; while (*line_p++) ; break; case NORMC | STAR: first_p = line_p; while (*line_p++ == *comp_p) ; comp_p++; break; case CINDC | STAR: first_p = line_p; #ifdef OLD_CASEIGNORE while (CEquiv(*line_p++) == *comp_p) ; #else while (*line_p++ == *comp_p || CEquiv(line_p[-1]) == *comp_p) ; #endif comp_p++; break; case ONE_OF | PLUS: if (!member(*line_p++, comp_p)) return NO; case ONE_OF | STAR: first_p = line_p; while (member(*line_p++, comp_p)) ; comp_p += CSETSIZE; break; default: #if RE_DEBUG re_dump(line_p, comp_p); #endif complain("BUG: garbled RE (0%o) @%d.", _UC_(*--comp_p), (int)(comp_p - cur_compb)); } /*switch */ { /* * only get here when STAR (or PLUS) * Do a little lookahead to avoid lots of recursive calls */ #ifndef TINY register const char *lookahead_p = comp_p; while (isPAREN(n = *lookahead_p) || n == AT_BOW || n == AT_EOW) lookahead_p++; # define comp_p lookahead_p #endif while (--line_p > first_p) { switch (*comp_p) { /* a little lookahead */ case NORMC: ++line_p; while (*--line_p != comp_p[2]) if (line_p == first_p) return NO; break; case CINDC: ++line_p; #ifdef OLD_CASEIGNORE while (CEquiv(*--line_p) != comp_p[2]) #else while (*--line_p != comp_p[2] && CEquiv(*line_p) != comp_p[2]) #endif if (line_p == first_p) return NO; break; case ONE_OF: case ONE_OF | PLUS: ++line_p; while (!member(*--line_p, comp_p + 1)) if (line_p == first_p) return NO; break; } #undef comp_p if (REmatch(line_p, comp_p)) return YES; } continue; } } /* for */ #if RE_DEBUG # undef YES # define YES 1 # undef NO # define NO 0 #endif #undef c #undef first_p /* NOTREACHED. */ } private void REreset() { register int i = NPAR+1; register const char **ps = pstrtlst, **pe = pendlst; do { if (*ps == NULL) break; *ps++ = NULL; *pe++ = NULL; } while (--i); } /* Index LINE at OFFSET, the compiled EXPR, with alternates ALTS. [*- If lbuf_okay is nonzero it's okay to use linebuf if LINE is the current line. This should save lots of time in things like paren matching in LISP mode. Saves all that copying from linebuf to REbuf. substitute() is the guy who calls re_lindex with lbuf_okay as 0, since the substitution gets placed in linebuf ... doesn't work too well when the source and destination strings are the same. I hate all these arguments! -*] [TRH] ALWAYS use linebuf for search if LINE is the current line. substitute problem is solved in re_dosub(). BTW as far as I understand it, this lbuf_okay kludge is necessary because paren matching sets up a fake curline/curchar. Therefore we cannot be sure that lcontents() delivers the 'right' pointer. So actually if lbuf_okay is TRUE, linebuf is ignored completely. Maybe it is more elegant to pass an extra start-bufpos parameter to docompiled() (with NULL as default: curline/curchar). Oct 88 [TRH], well I reorganized it -- docompiled now has an extra parameter "(Bufpos *) start". No more cheating with currpos needed anymore, so lcontents() works fine and thus no need for an lsave(). Mar 89 [TRH] get rid of alts parameter; this information is now compiled into RE. Also change type of first parameter from (Line *) to (char *) to make it more general (and changed the name to reflect this change). Add another extra parameter "(Line *) end_lp" to docompiled() to get a still cleaner interface. Note that backward search stops at the first match from the end of the string, so for more complicated expressions it does not necessarily find the longest possible match (alas, alas...). [JP] This code is cumbersome, repetetive for reasons of efficiency. Fast search is a must as far as I am concerned. */ int re_sindex(str, offset, expr) const char *str; char *expr; { register const char *s = expr; register char c, firstc = *s++; register int case_ign = *s++; #ifndef TINY register const char *firstset = NULL; if (case_ign & RE_FSET) { firstset = s + (((case_ign & ~RE_FSET) << 7) + firstc); firstc = '\0'; } #endif if (pstrtlst[1]) REreset(); expr = (char *) s; REbolp = s = str; if (expr[0] == AT_BOL) { if (offset && REdirection >= 0) return NO; #ifndef TINY if (firstc) { if (firstc != *s && (!case_ign || firstc != CEquiv(*s))) return NO; } #endif if (REmatch(s, expr)) { found: loc1 = s; REbom = s - REbolp; return YES; } return NO; } s += offset; if (REdirection < 0) { /* BACKWARD */ if (offset < 0) /* flag from BACKWARD */ s -= offset, s += strlen(s); do { if (firstc) { s++; if (!case_ign) { while (*--s != firstc) if (s == REbolp) return NO; } else { #ifdef OLD_CASEIGNORE while (CEquiv(*--s) != firstc) #else while (*--s != firstc && CEquiv(*s) != firstc) #endif if (s == REbolp) return NO; } } #ifndef TINY else if (firstset) { s++; while (!member(*--s, firstset)) if (s == REbolp) break; /* always try at BOL since sub-expr may be anchored. */ } #endif if (REmatch(s, expr)) { goto found; } } while (--s >= REbolp); } else { /* FORWARD */ #ifndef TINY if (firstset && offset == 0) /* always try at BOL since sub-expr may be anchored. */ goto f_match; #endif do { if (firstc) { if (!case_ign) { do if ((c = *s++) == '\0') return NO; while (c != firstc); } else { do if ((c = *s++) == '\0') return NO; #ifdef OLD_CASEIGNORE while (CEquiv(c) != firstc); #else while (c != firstc && CEquiv(c) != firstc); #endif } --s; } #ifndef TINY else if (firstset) { while (!member(c = *s++, firstset)) if (c == '\0') return NO; --s; } f_match: #endif if (REmatch(s, expr)) { goto found; } } while (*s++); } return NO; } int okay_wrap ZERO; /* Implicit parameter to dosearch */ Bufpos * dosearch(pattern, dir, re) const char *pattern; { register Buffer *cb = curbuf; register Line *wrap_lp = NULL; if (b_empty(cb)) return NULL; /* Can't match! There's no buffer. */ REcompile(pattern, re, compbuf); if (okay_wrap && True(WrapScan)) wrap_lp = cb->b_dot; return docompiled(dir, compbuf, (Bufpos *) 0, wrap_lp); } Bufpos * docompiled(dir, expr, start, wrap_lp) char *expr; Bufpos *start; Line *wrap_lp; { static Bufpos ret; register Line *lp; register int offset; { register Bufpos *sp; if ((sp = start) == NULL) { sp = &ret; DOTsave(sp); } lp = sp->p_line; offset = sp->p_char; } REdirection = dir; { register Line *end_lp = wrap_lp; register const char *cp = lcontents(lp); if (start) locrater = cp + offset; if (dir < 0) { /* BACKWARD */ if (!offset || !re_sindex(cp, offset - 1, expr)) { locrater = NULL; do { if (!(lp = lp->l_prev)) { if (!end_lp) return NULL; lp = curbuf->b_last; } if (lp == end_lp) { if (offset < 0) return NULL; end_lp = end_lp->l_prev; offset = -1; /* exit flag */ } } while (!re_sindex(lcontents(lp), -1, expr)); } ret.p_char = REbom; } else { /* FORWARD */ /* this avoids infinite loops at "$" ?? */ if (!cp[offset] || !re_sindex(cp, offset, expr)) { locrater = NULL; do { if (!(lp = lp->l_next)) { if (!end_lp) return NULL; lp = curbuf->b_first; } if (lp == end_lp) { if (offset < 0) return NULL; end_lp = end_lp->l_next; offset = -1; /* exit flag */ } } while (!re_sindex(lcontents(lp), 0, expr)); } ret.p_char = REeom; } locrater = NULL; } ret.p_line = lp; return &ret; } private char *insert __(( char *_(off), char *_(endp), int _(which) )); private char * insert(off, endp, which) register char *off; char *endp; { register int n; register const char *pp; n = pendlst[which] - (pp = pstrtlst[which]); while (--n >= 0 && off < endp) { *off++ = *pp++; } return off; } /* Build the actual substitution string from the replacement string and the most recent match. [TRH] add "\0" (complete match) -- fix bug in "\N" recognition. Escape conventions: \0 .. \9 - sub-expression substitutions. \\0 .. \\9 - literal \0 .. \9 \\\ - literal \ + \-escape [yes was still buggy...] Otherwise backslash is treated as a normal character. This routine is a replacement for re_dosub() */ void REsubst(buf, rep_str) char *buf; const char *rep_str; { register char *tp = buf; /* destination */ register const char *rp = rep_str; /* where the matched line is */ register int c; #define endp buf /* alias! */ endp += LBSIZE - 1; for (rp = rep_str; (c = *rp++); ) { if (tp > endp) { --tp; /* truncate line */ break; } if (c == '\\') { if (!(*rp++ == c && (isdigit(*rp) || *rp == c)) && (unsigned)(*--rp - '0') <= nparens) { tp = insert(tp, endp, *rp++ - '0'); continue; } /* [TRH] don't eat backslash if no (valid) sub-expr */ } if (c == '\r') c = '\n'; /* so we can have embedded newlines */ *tp++ = c; } *tp = '\0'; #undef endp } void putmatch(which, buf, size) register char *buf; { register char *endp = buf + size; if ((buf = insert(buf, endp, which)) >= endp) len_error(error); *buf = '\0'; } void RErecur() { char cbuf[sizeof compbuf]; register char *save_rep; register char *save_search; register int npars; npars = nparens; byte_copy(compbuf, cbuf, sizeof compbuf); save_rep = copystr(rep_str); save_search = copystr(rep_search); Recur(); set_str(&rep_search, save_search); free(save_search); set_str(&rep_str, save_rep); free(save_rep); byte_copy(cbuf, compbuf, sizeof compbuf); nparens = npars; } /* * ask for RE string and compiles it into compbuf. * if the RE is invalid, the cursor is set near the offending * character and the error message is displayed (briefly). * [14-jun-89 TRH] don't catch errors if in .joverc. * [Jul-90 TRH] adapted to changed ask(). */ #ifdef _STDARG_ # include #else # include #endif private struct { const char *def; int use_re; } re_ask_par; private int re_aux __(( int _(c) )); char * DEFVARG(re_ask, (const char *def, int use_re, const char *fmt, ...), (def, use_re, fmt, va_alist) const char *def; const char *fmt;) { va_register va_list ap; register const char *o_def = re_ask_par.def;/* make it re-entrant */ register char *ans; VA_INIT_PROPAGATE re_ask_par.use_re = use_re; va_begin(ap, fmt); ans = do_ask("\r\n", re_aux, re_ask_par.def = def, VA_PROPAGATE(fmt, ap)); va_end(ap); re_ask_par.def = o_def; return ans; } /* this is called from within ask() when the user types CR or LF */ private re_aux(c) { Savejmp savejmp; char saveline[MESG_SIZE], errmesg[MESG_SIZE]; register char *pattern = linebuf; register int err; strcpy(saveline, mesgbuf); /* may need it later, for error mesg */ if ((err = push_env(savejmp)) == 0) { if (pattern[0] == '\0') { /* empty string - use default instead */ if (re_ask_par.def == NULL) complain("No default"); /* copy it to linebuf so it will be returned by do_ask */ strcpy(pattern, re_ask_par.def); } REcompile(pattern, True(re_ask_par.use_re), compbuf); } pop_env(savejmp); if (err) { register int at; if (InJoverc) /* propagate abort */ complain((char *) 0); if (pattern[0] == '\0' || (at = REptr - pattern - 1) < 0) at = 0; curchar = at; /* set cursor near the error */ s_mess("%s [%s]", saveline, strcpy(errmesg, mesgbuf)); } return err; } /* Do we match PATTERN at OFFSET in BUF? */ private int look_re = YES; /* this is a kludge */ LookingAt(pattern, buf, offset) const char *pattern; register const char *buf; register int offset; { char comp_buf[RE_SIZE]; register char *comp = comp_buf; REcompile(pattern, look_re, comp); REbolp = buf; REbom = offset; return REmatch(loc1 = buf + offset, comp + RE_NFIRST); } look_at(pattern) const char *pattern; { register int result; look_re = NO; result = LookingAt(pattern, linebuf, curchar); look_re++; return result; } /*====================================================================== * $Log: re.c,v $ * Revision 14.32.0.10 1994/04/22 18:24:23 tom * (re_ask): use `va_register va_list'. * * Revision 14.32.0.9 1994/02/01 20:35:19 tom * (do_comp, ALT_J_L_ASSERTION): fix datatype range errors. * * Revision 14.32.0.8 1993/11/01 18:05:42 tom * (searchstr): removed; (UseRE): made non-PRIVATE; * (re_ask): changed semantics of force_re argument -> use_re. * * Revision 14.32.0.4 1993/07/31 03:24:27 tom * (MARK): new opcode, introduced to avoid endless loops in `*' and `+' * closure of complex sub-expressions (always a target of BACK jump); * (REcompile, do_comp, REfirst): add MARK support; * (REmatch): [ALT] matching sub-expr is accepted only if tail matches too, * [BACK] fail if match is empty (to avoid endless loops), * [MARK] (new) record current line pointer (for empty match check in BACK). * * Revision 14.32 1993/05/14 17:40:26 tom * some (space) optimizations for dumber compilers; various changes for new * case-ignore scheme: preserve old way in OLD_CASEIGNORE conditionals. * * Revision 14.31 1993/02/17 00:55:04 tom * preserve rep_search in RErecur(); lotsa random optimizations. * * Revision 14.30 1993/01/26 17:53:27 tom * cleanup whitespace; some random optimizations; standardize DEBUG options; * add "\!" directive; allow long offsets to ALT/BACK. * * Revision 14.28 1992/10/27 12:26:42 tom * convert to "port{ansi,defs}.h" conventions; fix REdirection botch after * failing backward search--now reset it at each REcompile. * * Revision 14.26 1992/08/26 23:56:57 tom * PRIVATE-ized some Variable defs; add RCS directives. * */