/*---------------------------------------------------------------------*\ | | | CPP -- a stand-alone C preprocessor | | Copyright (c) 1993 Hacker Ltd. Author: Scott Bigham | | | | Permission is granted to anyone to use this software for any purpose | | on any computer system, and to redistribute it freely, with the | | following restrictions: | | - No charge may be made other than reasonable charges for repro- | | duction. | | - Modified versions must be clearly marked as such. | | - The author is not responsible for any harmful consequences of | | using this software, even if they result from defects therein. | | | | token.c -- transform raw input to preprocessor tokens | \*---------------------------------------------------------------------*/ /* There are (for better or worse) three interfaces to the tokenizing mechanism, at increasing levels of abstraction. _one_token() tokenizes directly out of the input line buffer, and should generally only be used while handling preprocessor directive lines. token() pre-tokenizes an entire line of input at a time and doles it out one token at a time. Note that tokens returned by token() may or may not have undergone macro expansion; use it when you need access to pre-expanded input tokens. exp_token() fully expands each token before returning it; this is usually the best way to access input. */ #include #include #include #include #include "global.h" #include "ztype.h" #include "alloc.h" #define BASE10 1 #define BASE8 2 #define BASE16 3 #define GRANULARITY 256 extern char *next_c; static int tok_flags = 0; static TokenP pushback_list; /* mk_Token() -- allocate and initialize space for a Token */ TokenP mk_Token() { register TokenP T = alloc_Token(); T->val = T->hashval = T->flags = T->type = T->subtype = 0; T->_txt.out_of_line = T->_ws.out_of_line = NULL; T->flags |= (INLINE_TXT | INLINE_WS); T->_txt.inline[0] = T->_ws.inline[0] = '\0'; T->next = NULL; return T; } /* clear_txt() -- clear the text space of a token */ void clear_txt(T) register TokenP T; { if (!(T->flags & INLINE_TXT) && T->_txt.out_of_line) free(T->_txt.out_of_line); T->flags |= INLINE_TXT; T->_txt.inline[0] = '\0'; } /* clear_ws() -- clear the white space of a token */ void clear_ws(T) register TokenP T; { if (!(T->flags & INLINE_WS) && T->_ws.out_of_line) free(T->_ws.out_of_line); T->flags |= INLINE_WS; T->_ws.inline[0] = '\0'; } /* set_txt() -- set the text of a token to |s|, copying if necessary, and deleting the current text, if any */ void set_txt(T, s) register TokenP T; register char *s; { if (!(T->flags & INLINE_TXT) && T->_txt.out_of_line) free(T->_txt.out_of_line); if (strlen(s) <= 7) { T->flags |= INLINE_TXT; strcpy(T->_txt.inline, s); } else { T->flags &= ~INLINE_TXT; T->_txt.out_of_line = strdup(s); } } /* set_txt_n() -- set the text of a token to the first |n| characters of |s|, copying if necessary, and deleting the current text, if any */ void set_txt_n(T, s, n) register TokenP T; register char *s; int n; { if (!(T->flags & INLINE_TXT) && T->_txt.out_of_line) free(T->_txt.out_of_line); if (n <= 7) { T->flags |= INLINE_TXT; strncpy(T->_txt.inline, s, n); T->_txt.inline[n] = '\0'; } else { T->flags &= ~INLINE_TXT; T->_txt.out_of_line = mallok(n + 1); strncpy(T->_txt.out_of_line, s, n); T->_txt.out_of_line[n] = '\0'; } } /* set_ws() -- set the white space of a token to |s|, copying if necessary, and deleting the current white space, if any */ void set_ws(T, s) register TokenP T; register char *s; { if (!(T->flags & INLINE_WS) && T->_ws.out_of_line) free(T->_ws.out_of_line); if (strlen(s) <= 3) { T->flags |= INLINE_WS; strcpy(T->_ws.inline, s); } else { T->flags &= ~INLINE_WS; T->_ws.out_of_line = strdup(s); } } /* free_token() -- return an allocated Token to the free list */ void free_token(T) register TokenP T; { T->next = NULL; free_tlist(T); } /* free_tlist() -- return a list of Token's to the free list */ void free_tlist(T) register TokenP T; { register TokenP T1; for (T1 = T; T; T = T1) { T1 = T->next; clear_txt(T); clear_ws(T); #if 0 T->next = next_free_tok; next_free_tok = T; #else dealloc_Token(T); #endif } } /* copy_token() -- return a new Token that is a duplicate of the given token */ TokenP copy_token(T1) register TokenP T1; { register TokenP T2 = mk_Token(); *T2 = *T1; if (!(T1->flags & INLINE_WS)) T2->_ws.out_of_line = strdup(T1->_ws.out_of_line); if (!(T1->flags & INLINE_TXT)) T2->_txt.out_of_line = strdup(T1->_txt.out_of_line); T2->next = NULL; return T2; } /* copy_tlist() -- create a duplicate of a list of Token's */ TokenP copy_tlist(T1) register TokenP T1; { Token head; register TokenP T2 = &head; for (T2->next = NULL; T1; T1 = T1->next, T2 = T2->next) T2->next = copy_token(T1); return head.next; } /* tok_shutdown() -- free all space allocated for Token's */ void tok_shutdown() { #ifdef DEBUG /* explicitly clean up, to check for memory leaks */ #if 0 register TokenP T, T1; register int i; for (T1 = T = tok_blocks; T; T = T1) { T1 = T->next; for (i = 1; i < GRANULARITY; i++) { if (T[i].flags & IN_USE) { fprintf(stderr, "@@@ Token not freed: "); dump_token(&T[i]); fputc('\n', stderr); } clear_txt(&T[i]); clear_ws(&T[i]); } free(T); } fprintf(stderr, "%d total blocks allocated\n", num_blocks); #else /* 0 */ cleanup_Token(); #endif /* 0 */ #endif } /* push_tlist() -- "un-read" the list of Token's |T|; token() will return all of these tokens in order before reading another token from the input file */ void push_tlist(T) register TokenP T; { register TokenP t; if (!T) return; t = T; while (t->next) t = t->next; t->next = pushback_list; pushback_list = T; } /* mk_eof() -- makes and returns an EOF_ token */ static TokenP mk_eof() { register TokenP T = mk_Token(); T->type = EOF_; T->flags |= INLINE_TXT | INLINE_WS; T->_ws.inline[0] = T->_txt.inline[0] = '\0'; return T; } /* mk_stopper() -- makes and returns a STOP token. See expand_tlist() for further information. */ TokenP mk_stopper() { register TokenP T = mk_Token(); T->type = STOP; T->flags |= INLINE_TXT | INLINE_WS; T->_ws.inline[0] = T->_txt.inline[0] = '\0'; return T; } /* mk_unmarker() -- makes and returns a special token that informs the tokenizer to unmark the macro text associated with token |T|. See expand() for further information. */ TokenP mk_unmarker(T) register TokenP T; { register TokenP T1 = copy_token(T); T1->type = UNMARK; T->flags |= INLINE_TXT | INLINE_WS; T->_ws.inline[0] = T->_txt.inline[0] = '\0'; return T1; } /* mk_printable() -- makes and returns an untyped token with an arbitrary text body, for purposes of printing directly via print_token(). */ TokenP mk_printable(s) const char *s; { register TokenP T = mk_Token(); T->type = DONT_CARE; set_ws(T, " "); set_txt(T, s); return T; } /* flush_tokenizer() -- discard all Tokens pushed back by push_tlist() */ void flush_tokenizer() { free_tlist(pushback_list); pushback_list = NULL; } /* number() -- copies from |s| into the token |T| a string of characters denoting an integer or floating-point constant. Returns a pointer to the first uncopied character. */ static char *number(s, T) register char *s; register TokenP T; { int numtype = BASE10, fpflag = 0; char *t; T->type = NUMBER; if (*s == '0') { /* check for octal or hexadecimal constant */ if ((s[1] == 'x' || s[1] == 'X') && isxdigit(s[2])) { numtype = BASE16; T->flags |= UNS_VAL; } else if (is_octal(s[1])) { numtype = BASE8; T->flags |= UNS_VAL; } } T->val = strtol(s, &t, 0); s = t; if (numtype != BASE10 || is_isuff(*s)) { /* if we're not in base 10, or the next characters are integer constant suffixes, this can't be a floating-point constant */ while (is_isuff(*s)) { if (*s == 'u' || *s == 'U') T->flags |= UNS_VAL; s++; } return s; } /* check to see if the number is actually floating point */ if (*s == '.') { fpflag = 1; do s++; while (isdigit(*s)); } if (*s == 'e' || *s == 'E') { register char *u = s; u++; if (*u == '-' || *u == '+') u++; if (isdigit(*u)) { fpflag = 1; do u++; while (isdigit(*u)); s = u; } } if (fpflag) { T->type = FP_NUM; if (is_fsuff(*s)) s++; } return s; } /* char_constant() -- copy from |s| into the token |T| a string of characters denoting a character constant. We do not translate escape sequences at this point, though we might need to */ static char *char_constant(s, T) register char *s; register TokenP T; { T->type = CHAR_CON; for (; *s; s++) { if (*s == '\'') return s + 1; if (*s == '\\') s++; } error("unterminated character constant"); return s; } /* string_literal() -- copy from |s| into the token |T| a string of characters denoting a string literal. We do not translate escape sequences at this point, though we might need to */ static char *string_literal(s, T) register char *s; register TokenP T; { T->type = STR_CON; for (; *s; s++) { if (*s == '"') return s + 1; if (*s == '\\') s++; } error("unterminated string literal"); return s; } /* include_name() -- copy from |s| into the token |T| a string of characters denoting an #include file specifier enclosed in <>. |s| points to the character after the '>'. */ static char *include_name(s, T) register char *s; register TokenP T; { T->type = INC_NAM; for (; *s; s++) { if (*s == '>') return s + 1; } error("unterminated include file name"); } /* set_mode() -- set the tokenizer flags to |m| */ void set_mode(m) int m; { tok_flags = m; } /* change_mode() -- twiddle the tokenizer flags; in particular, set the flags specified in |raise| and clear the flags specified in |lower| */ void change_mode(raise, lower) int raise, lower; { tok_flags |= raise; tok_flags &= (~lower); } /* get_mode() -- return the current value of the tokenizer flags */ int get_mode() { return tok_flags; } /* xlate_token() -- determines the type of the next preprocessor token in the string pointed to by |s|. Information about the token found is placed in the Token |T|. Returns a pointer to the first character not in the token read. */ static char *xlate_token(s, T) register char *s; register TokenP T; { if (is_ctoks(*s)) { char *t; T->hashval = hash_id(s, &t); s = t; T->type = ID; return t; } else if (isdigit(*s)) return number(s, T); else switch (*s++) { case '.': T->subtype = '.'; if (*s == '.' && s[1] == '.') { s += 2; T->type = DONT_CARE; } else if (isdigit(*s)) s = number(s - 1, T); else T->type = DONT_CARE; break; case '#': if (*s == '#') { s++; T->type = TOK_CAT; } else T->type = POUND; break; case '&': T->subtype = '&'; if (*s == '&') { s++; T->type = L_AND_OP; } else if (*s == '=') { s++; T->type = DONT_CARE; } else T->type = B_AND_OP; break; case '|': T->subtype = '|'; if (*s == '|') { s++; T->type = L_OR_OP; } else if (*s == '=') { s++; T->type = DONT_CARE; } else T->type = B_OR_OP; break; case '+': T->subtype = '+'; if (*s == s[-1] || *s == '=') { s++; T->type = DONT_CARE; } else T->type = ADD_OP; break; case '~': T->type = UNARY_OP; T->subtype = '~'; break; case ',': T->type = COMMA; T->subtype = ','; break; case '(': T->type = LPAREN; T->subtype = '('; break; case ')': T->type = RPAREN; T->subtype = ')'; break; case '!': T->subtype = '!'; if (*s == '=') { s++; T->type = EQ_OP; } else T->type = UNARY_OP; break; case '=': T->subtype = '='; if (*s == '=') { s++; T->type = EQ_OP; } else T->type = DONT_CARE; break; case '*': case '/': case '%': T->subtype = s[-1]; if (*s == '=') { s++; T->type = DONT_CARE; } else T->type = MUL_OP; break; case '^': T->subtype = '^'; if (*s == '=') { s++; T->type = DONT_CARE; } else T->type = B_XOR_OP; break; case '-': T->subtype = '-'; if (*s == '-' || *s == '=' || *s == '>') { s++; T->type = DONT_CARE; } else T->type = ADD_OP; break; case '<': if (tok_flags & INCLUDE_LINE) { s = include_name(s, T); break; } /* else fall through */ case '>': T->subtype = s[-1]; T->type = REL_OP; if (*s == s[-1]) { s++; T->type = SHIFT_OP; } if (*s == '=') { s++; if (T->type == REL_OP) T->subtype = (T->subtype == '<' ? '(' : ')'); else T->type = DONT_CARE; } break; case '\'': s = char_constant(s, T); break; case '"': s = string_literal(s, T); break; case '[': case ']': case '{': case '}': case ';': case ':': case '?': T->type = DONT_CARE; break; default: T->type = UNKNOWN; } return s; } /* print_token() -- write token |T| to the output file */ void print_token(T) register TokenP T; { if (in_config_file) return; if (T->type == STOP) bugchk("STOP token in output stream?"); fputs(token_ws(T), outf); fputs(token_txt(T), outf); if (T->flags & TRAIL_SPC) fputc(' ', outf); } /* merge_tokens() -- Perform token pasting on Token's |T1| and |T2|. Returns the resulting token. */ TokenP merge_tokens(T1, T2) register TokenP T1, T2; { register TokenP T = mk_Token(); register char *t; set_ws(T, token_ws(T1)); t = mallok(strlen(token_txt(T1)) + strlen(token_txt(T2)) + 1); strcpy(t, token_txt(T1)); strcat(t, token_txt(T2)); set_txt(T, t); free(t); t = xlate_token(token_txt(T), T); if (*t != '\0') { if (w_bad_concat) warning("Invalid token \"%s\" created by concatenation", token_txt(T)); T->type = UNKNOWN; } return T; } TokenP _one_token() { register TokenP T = mk_Token(); register char *s = next_c, *t, *u; int n; t = suck_ws(s, T); if (!t || !*t) { set_txt(T, "\n"); T->type = EOL; T->subtype = '\n'; next_c = t; return T; } u = xlate_token(t, T); n = (int)(u - t); if (T->type == UNKNOWN && w_bad_chars) error("Unrecognized character 0x%02x='%c'", *t, *t); set_txt_n(T, t, n); next_c = u; return T; } void _tokenize_line() { Token head; register TokenP T = &head; head.next = NULL; do { T = T->next = _one_token(); } while (T->type != EOL); push_tlist(head.next); } TokenP token() { register TokenP T; register char *s; while (pushback_list) { T = pushback_list; pushback_list = T->next; T->next = NULL; if (T->type == UNMARK) { Macro *M; M = lookup(token_txt(T), T->hashval); if (!M) bugchk("UNMARK on non-macro token %s", token_txt(T)); if (!(M->flags & MARKED)) bugchk("UNMARK on unmarked macro %s", token_txt(T)); M->flags ^= MARKED; free_token(T); continue; } else { return T; } } /* if we get to here, the pushback list is empty, and we need to read in another line */ next_c = s = getline(); if (!s) return mk_eof(); T = _one_token(); if (T->type == EOL) { return T; } /* we need preprocessor lines in raw form, so we can't pre-tokenize */ if (T->type != POUND || get_mode() & SLURP) _tokenize_line(); return T; } TokenP exp_token() { register TokenP T = token(); register Macro *M; if (T->type == ID && !(T->flags & BLUEPAINT) && (M = lookup(token_txt(T), T->hashval))) { expand(T, M); free_token(T); return exp_token(); } else return T; } #ifdef DEBUG /* debugging routines to display tokens in internal format */ void dump_token(T) TokenP T; { static char *type_names[] = {"<<< ERROR >>>", "UNKNOWN", "DONT_CARE", "EOL", "NUMBER", "FP_NUM", "ID", "STR_CON", "CHAR_CON", "UNARY_OP", "MUL_OP", "ADD_OP", "SHIFT_OP", "REL_OP", "EQ_OP", "B_AND_OP", "B_XOR_OP", "B_OR_OP", "L_AND_OP", "L_OR_OP", "LPAREN", "RPAREN", "COMMA", "INC_NAM", "POUND", "TOK_CAT", "MACRO_ARG", "EOF_", "STOP", "UNMARK" }; fprintf(stderr, "[%s, '%c', \"%s\", \"%s\", %ld, %x]@%p", type_names[T->type], (T->subtype ? T->subtype : ' '), (token_ws(T) ? token_ws(T) : "(null)"), (token_txt(T) ? token_txt(T) : "(null)"), T->val, T->flags, (void *)T ); } void dump_tlist(T) TokenP T; { while (T) { dump_token(T); fputc('\n', stderr); T = T->next; } } void dump_pushback() { dump_tlist(pushback_list); } #endif /* DEBUG */