/********************************* stop.c ******************************** Purpose: Stop list filter finite state machine generator and driver. Provenence: Written by and unit tested by C. Fox, 1990. Changed by C. Fox, July 1991. - added ANSI C declarations - branch tested to 95% coverage Notes: This module implements a fast finite state machine generator, and a driver, for implementing stop list filters. The strlist module is a simple string array data type implementation. **/ #include #include #include #include #include "stop.h" #include "strlist.h" /******************************************************************************/ #define FALSE 0 #define TRUE 1 #define EOS '\0' /************** Character Classification ***************/ /* Tokenizing requires that ASCII be broken into character */ /* classes distinguished for tokenizing. Delimiter chars */ /* separate tokens. Digits and letters make up the body */ /* of search terms. */ typedef enum { DELIM_CH, /* whitespace, punctuation, etc. */ DIGIT_CH, /* the digits */ LETTER_CH, /* upper and lower case */ } CharClassType; static CharClassType char_class[128] = { /* ^@ */ DELIM_CH, /* ^A */ DELIM_CH, /* ^B */ DELIM_CH, /* ^C */ DELIM_CH, /* ^D */ DELIM_CH, /* ^E */ DELIM_CH, /* ^F */ DELIM_CH, /* ^G */ DELIM_CH, /* ^H */ DELIM_CH, /* ^I */ DELIM_CH, /* ^J */ DELIM_CH, /* ^K */ DELIM_CH, /* ^L */ DELIM_CH, /* ^M */ DELIM_CH, /* ^N */ DELIM_CH, /* ^O */ DELIM_CH, /* ^P */ DELIM_CH, /* ^Q */ DELIM_CH, /* ^R */ DELIM_CH, /* ^S */ DELIM_CH, /* ^T */ DELIM_CH, /* ^U */ DELIM_CH, /* ^V */ DELIM_CH, /* ^W */ DELIM_CH, /* ^X */ DELIM_CH, /* ^Y */ DELIM_CH, /* ^Z */ DELIM_CH, /* ^[ */ DELIM_CH, /* ^\ */ DELIM_CH, /* ^] */ DELIM_CH, /* ^^ */ DELIM_CH, /* ^_ */ DELIM_CH, /* */ DELIM_CH, /* ! */ DELIM_CH, /* " */ DELIM_CH, /* # */ DELIM_CH, /* $ */ DELIM_CH, /* % */ DELIM_CH, /* & */ DELIM_CH, /* ' */ DELIM_CH, /* ( */ DELIM_CH, /* ) */ DELIM_CH, /* * */ DELIM_CH, /* + */ DELIM_CH, /* , */ DELIM_CH, /* - */ DELIM_CH, /* . */ DELIM_CH, /* / */ DELIM_CH, /* 0 */ DIGIT_CH, /* 1 */ DIGIT_CH, /* 2 */ DIGIT_CH, /* 3 */ DIGIT_CH, /* 4 */ DIGIT_CH, /* 5 */ DIGIT_CH, /* 6 */ DIGIT_CH, /* 7 */ DIGIT_CH, /* 8 */ DIGIT_CH, /* 9 */ DIGIT_CH, /* : */ DELIM_CH, /* ; */ DELIM_CH, /* < */ DELIM_CH, /* = */ DELIM_CH, /* > */ DELIM_CH, /* ? */ DELIM_CH, /* @ */ DELIM_CH, /* A */ LETTER_CH, /* B */ LETTER_CH, /* C */ LETTER_CH, /* D */ LETTER_CH, /* E */ LETTER_CH, /* F */ LETTER_CH, /* G */ LETTER_CH, /* H */ LETTER_CH, /* I */ LETTER_CH, /* J */ LETTER_CH, /* K */ LETTER_CH, /* L */ LETTER_CH, /* M */ LETTER_CH, /* N */ LETTER_CH, /* O */ LETTER_CH, /* P */ LETTER_CH, /* Q */ LETTER_CH, /* R */ LETTER_CH, /* S */ LETTER_CH, /* T */ LETTER_CH, /* U */ LETTER_CH, /* V */ LETTER_CH, /* W */ LETTER_CH, /* X */ LETTER_CH, /* Y */ LETTER_CH, /* Z */ LETTER_CH, /* [ */ DELIM_CH, /* \ */ DELIM_CH, /* ] */ DELIM_CH, /* ^ */ DELIM_CH, /* _ */ DELIM_CH, /* ` */ DELIM_CH, /* a */ LETTER_CH, /* b */ LETTER_CH, /* c */ LETTER_CH, /* d */ LETTER_CH, /* e */ LETTER_CH, /* f */ LETTER_CH, /* g */ LETTER_CH, /* h */ LETTER_CH, /* i */ LETTER_CH, /* j */ LETTER_CH, /* k */ LETTER_CH, /* l */ LETTER_CH, /* m */ LETTER_CH, /* n */ LETTER_CH, /* o */ LETTER_CH, /* p */ LETTER_CH, /* q */ LETTER_CH, /* r */ LETTER_CH, /* s */ LETTER_CH, /* t */ LETTER_CH, /* u */ LETTER_CH, /* v */ LETTER_CH, /* w */ LETTER_CH, /* x */ LETTER_CH, /* y */ LETTER_CH, /* z */ LETTER_CH, /* { */ DELIM_CH, /* | */ DELIM_CH, /* } */ DELIM_CH, /* ~ */ DELIM_CH, /* ^? */ DELIM_CH, }; /************** Character Case Conversion **************/ /* Term text must be accumulated in a single case. This */ /* array is used to convert letter case but otherwise */ /* preserve characters. */ static char convert_case[128] = { /* ^@ */ 0, /* ^A */ 0, /* ^B */ 0, /* ^C */ 0, /* ^D */ 0, /* ^E */ 0, /* ^F */ 0, /* ^G */ 0, /* ^H */ 0, /* ^I */ 0, /* ^J */ 0, /* ^K */ 0, /* ^L */ 0, /* ^M */ 0, /* ^N */ 0, /* ^O */ 0, /* ^P */ 0, /* ^Q */ 0, /* ^R */ 0, /* ^S */ 0, /* ^T */ 0, /* ^U */ 0, /* ^V */ 0, /* ^W */ 0, /* ^X */ 0, /* ^Y */ 0, /* ^Z */ 0, /* ^[ */ 0, /* ^\ */ 0, /* ^] */ 0, /* ^^ */ 0, /* ^_ */ 0, /* */ ' ', /* ! */ '!', /* " */ '"', /* # */ '#', /* $ */ '$', /* % */ '%', /* & */ '&', /* ' */ '\'', /* ( */ '(', /* ) */ ')', /* * */ '*', /* + */ '+', /* , */ ',', /* - */ '-', /* . */ '.', /* / */ '/', /* 0 */ '0', /* 1 */ '1', /* 2 */ '2', /* 3 */ '3', /* 4 */ '4', /* 5 */ '5', /* 6 */ '6', /* 7 */ '7', /* 8 */ '8', /* 9 */ '9', /* : */ ':', /* ; */ ';', /* < */ '<', /* = */ '=', /* > */ '>', /* ? */ '?', /* @ */ '@', /* A */ 'a', /* B */ 'b', /* C */ 'c', /* D */ 'd', /* E */ 'e', /* F */ 'f', /* G */ 'g', /* H */ 'h', /* I */ 'i', /* J */ 'j', /* K */ 'k', /* L */ 'l', /* M */ 'm', /* N */ 'n', /* O */ 'o', /* P */ 'p', /* Q */ 'q', /* R */ 'r', /* S */ 's', /* T */ 't', /* U */ 'u', /* V */ 'v', /* W */ 'w', /* X */ 'x', /* Y */ 'y', /* Z */ 'z', /* [ */ '[', /* \ */ 92, /* ] */ ']', /* ^ */ '^', /* _ */ '_', /* ` */ '`', /* a */ 'a', /* b */ 'b', /* c */ 'c', /* d */ 'd', /* e */ 'e', /* f */ 'f', /* g */ 'g', /* h */ 'h', /* i */ 'i', /* j */ 'j', /* k */ 'k', /* l */ 'l', /* m */ 'm', /* n */ 'n', /* o */ 'o', /* p */ 'p', /* q */ 'q', /* r */ 'r', /* s */ 's', /* t */ 't', /* u */ 'u', /* v */ 'v', /* w */ 'w', /* x */ 'x', /* y */ 'y', /* z */ 'z', /* { */ '{', /* | */ '|', /* } */ '}', /* ~ */ '~', /* ^? */ 0, }; #define DEAD_STATE -1 /* used to block a DFA */ #define TABLE_INCREMENT 256 /* used to grow tables */ /************************* Hashing **************************/ /* Sets of suffixes labeling states during the DFA construction */ /* are hashed to speed searching. The hashing function uses an */ /* entire integer variable range as its hash table size; in an */ /* effort to get a good spread through this range, hash values */ /* start big, and are incremented by a lot with every new word */ /* in the list. The collision rate is low using this method */ #define HASH_START 5775863 #define HASH_INCREMENT 38873647 /************** State Label Binary Search Tree **************/ /* During DFA construction, all states must be searched by */ /* their labels to make sure that the minimum number of states */ /* are used. This operation is sped up by hashing the labels */ /* to a signature value, then storing the signatures and labels */ /* in a binary search tree. The tree is destroyed once the DFA */ /* is fully constructed. */ typedef struct TreeNode { StrList label; /* state label used as search key */ unsigned signature; /* hashed label to speed searching */ int state; /* whose label is representd by node */ struct TreeNode *left; /* left binary search subtree */ struct TreeNode *right; /* right binary search subtree */ } SearchTreeNode, *SearchTree; /********************* DFA State Table **********************/ /* The state table is an array of structures holding a state */ /* label, a count of the arcs out of the state, a pointer into */ /* the arc table for these arcs, and a final state flag. The */ /* label field is used only during machine construction. */ typedef struct { StrList label; /* for this state - used during build */ int num_arcs; /* for this state in the arc table */ int arc_offset; /* for finding arcs in the arc table */ short is_final; /* TRUE iff this is a final state */ } StateTableEntry, *StateTable; /********************** DFA Arc Table ***********************/ /* The arc table lists all transitions for all states in a DFA */ /* in compacted form. Each state's transitions are offset from */ /* the start of the table, then listed in arc label order. */ /* Transitions are found by a linear search of the sub-section */ /* of the table for a given state. */ typedef struct { char label; /* character label on an out-arrow */ int target; /* the target state for the out-arrow */ } ArcTableEntry, *ArcTable; /********************** DFA Structure ***********************/ /* A DFA is represented as a pointer to a structure holding the */ /* machine's state and transition tables, and bookkeepping */ /* counters. The tables are arrays whose space is malloc'd, */ /* then realloc'd if more space is required. Once a machine is */ /* constructed, the table space is realloc'd one last time to */ /* fit the needs of the machine exactly. */ typedef struct _DfaStruct { int num_states; /* in the DFA (and state table) */ int max_states; /* now allocated in the state table */ int num_arcs; /* in the arc table for this machine */ int max_arcs; /* now allocated in the arc table */ StateTable state_table; /* the compacted DFA state table */ ArcTable arc_table; /* the compacted DFA transition table */ SearchTree tree; /* storing state labels used in build */ } DFAStruct; /******************************************************************************/ /************************* Function Declarations **************************/ #ifdef __STDC__ static char *GetMemory( char *ptr, int num_bytes ); static void DestroyTree( SearchTree tree ); static int GetState( DFA machine, StrList label, unsigned signature ); static void AddArc( DFA machine, int state, char arc_label, StrList state_label, unsigned state_signature ); extern DFA BuildDFA( StrList words ); extern char *GetTerm( FILE *stream, DFA machine, int size, char *output ); #else static char *GetMemory(); static void DestroyTree(); static int GetState(); static void AddArc(); extern DFA BuildDFA(); extern char *GetTerm(); #endif /******************************************************************************/ /************************ Private Function Definitions ********************/ /*FN*************************************************************************** GetMemory( ptr, num_bytes ) Returns: char * -- new/expanded block of memory Purpose: Rationalize memory allocation and handle errors Plan: Part 1: Allocate memory with supplied allocation functions Part 2: Handle any errors Part 3: Return the allocated block of memory Notes: None. **/ static char * GetMemory( ptr, num_bytes ) char *ptr; /* in: expanded block; NULL if nonesuch */ int num_bytes; /* in: number of bytes to allocate */ { char *memory; /* temporary for holding results */ /* Part 1: Allocate memory with supplied allocation functions */ if ( NULL == ptr ) memory = malloc( (unsigned)num_bytes ); else memory = realloc( ptr, (unsigned)num_bytes ); /* Part 2: Handle any errors */ if ( NULL == memory ) { (void)fprintf( stderr, "malloc failure--aborting\n" ); exit(1); } /* Part 3: Return the allocated block of memory */ return( memory ); } /* GetMemory */ /*FN*************************************************************************** DestroyTree( tree ) Returns: void Purpose: Destroy a binary search tree created during machine construction Plan: Part 1: Return right away of there is no tree Part 2: Deallocate the subtrees Part 3: Deallocate the root Notes: None. **/ static void DestroyTree( tree ) SearchTree tree; /* in: search tree destroyed */ { /* Part 1: Return right away of there is no tree */ if ( NULL == tree ) return; /* Part 2: Deallocate the subtrees */ if ( NULL != tree->left ) DestroyTree( tree->left ); if ( NULL != tree->right ) DestroyTree( tree->right ); /* Part 3: Deallocate the root */ tree->left = tree->right = NULL; (void)free( (char *)tree ); } /* DestroyTree */ /*FN*************************************************************************** GetState( machine, label, signature ) Returns: int -- state with the given label Purpose: Search a machine and return the state with a given state label Plan: Part 1: Search the tree for the requested state Part 2: If not found, add the label to the tree Part 3: Return the state number Notes: This machine always returns a state with the given label because if the machine does not have a state with the given label, then one is created. **/ static int GetState( machine, label, signature ) DFA machine; /* in: DFA whose state labels are searched; StrList label; /* in: state label searched for */ unsigned signature; /* in: signature of the label requested */ { SearchTree *ptr; /* pointer to a search tree link field */ SearchTree new_node; /* for a newly added search tree node */ /* Part 1: Search the tree for the requested state */ ptr = &(machine->tree); while ( (NULL != *ptr) && ( (signature != (*ptr)->signature) || !StrListEqual(label,(*ptr)->label)) ) ptr = (signature <= (*ptr)->signature) ? &(*ptr)->left : &(*ptr)->right; /* Part 2: If not found, add the label to the tree */ if ( NULL == *ptr ) { /* create a new node and fill in its fields */ new_node = (SearchTree)GetMemory( NULL, sizeof(SearchTreeNode) ); new_node->signature = signature; new_node->label = (StrList)label; new_node->state = machine->num_states; new_node->left = new_node->right = NULL; /* allocate more states if needed, set up the new state */ if ( machine->num_states == machine->max_states ) { machine->max_states += TABLE_INCREMENT; machine->state_table = (StateTable)GetMemory( machine->state_table, machine->max_states*sizeof(StateTableEntry)); } machine->state_table[machine->num_states].label = (StrList)label; machine->num_states++; /* hook the new node into the binary search tree */ *ptr = new_node; } else StrListDestroy( label ); /* Part 3: Return the state number */ return( (*ptr)->state ); } /* GetState */ /*FN*************************************************************************** AddArc( machine, state, arc_label, state_label, state_signature ) Returns: void Purpose: Add an arc between two states in a DFA Plan: Part 1: Search for the target state among existing states Part 2: Make sure the arc table is big enough Part 3: Add the new arc Notes: None. **/ static void AddArc( machine, state, arc_label, state_label, state_signature ) DFA machine; /* in/out: machine with an arc added */ int state; /* in: with an out arc added */ char arc_label; /* in: label on the new arc */ StrList state_label; /* in: label on the target state */ unsigned state_signature; /* in: label hash signature to speed searching */ { register int target; /* destination state for the new arc */ /* Part 1: Search for the target state among existing states */ StrListSort( state_label ); target = GetState( machine, state_label, state_signature ); /* Part 2: Make sure the arc table is big enough */ if ( machine->num_arcs == machine->max_arcs ) { machine->max_arcs += TABLE_INCREMENT; machine->arc_table = (ArcTable)GetMemory( machine->arc_table, machine->max_arcs * sizeof(ArcTableEntry) ); } /* Part 3: Add the new arc */ machine->arc_table[machine->num_arcs].label = arc_label; machine->arc_table[machine->num_arcs].target = target; machine->num_arcs++; machine->state_table[state].num_arcs++; } /* AddArc */ /*FN*************************************************************************** BuildDFA( words ) Returns: DFA -- newly created finite state machine Purpose: Build a DFA to recognize a list of words Plan: Part 1: Allocate space and initialize variables Part 2: Make and label the DFA start state Part 3: Main loop - build the state and arc tables Part 4: Deallocate the binary search tree and the state labels Part 5: Reallocate the tables to squish them down Part 6: Return the newly constructed DFA Notes: None. **/ DFA BuildDFA( words ) StrList words; /* in: that the machine is built to recognize */ { DFA machine; /* local for easier access to machine */ register int state; /* current state's state number */ char arc_label; /* for the current arc when adding arcs */ char *string; /* element in a set of state labels */ char ch; /* the first character in a new string */ StrList current_label; /* set of strings labeling a state */ StrList target_label; /* labeling the arc target state */ unsigned target_signature; /* hashed label for binary search tree */ register int i; /* for looping through strings */ /* Part 1: Allocate space and initialize variables */ machine = (DFA)GetMemory( NULL, sizeof(DFAStruct) ); machine->max_states = TABLE_INCREMENT; machine->state_table = (StateTable)GetMemory(NULL, machine->max_states*sizeof(StateTableEntry)); machine->num_states = 0; machine->max_arcs = TABLE_INCREMENT; machine->arc_table = (ArcTable)GetMemory( NULL, machine->max_arcs * sizeof(ArcTableEntry) ); machine->num_arcs = 0; machine->tree = NULL; /* Part 2: Make and label the DFA start state */ StrListUnique( words ); /* sort and unique the list */ machine->state_table[0].label = words; machine->num_states = 1; /* Part 3: Main loop - build the state and arc tables */ for ( state = 0; state < machine->num_states; state++ ) { /* The current state has nothing but a label, so */ /* the first order of business is to set up some */ /* of its other major fields */ machine->state_table[state].is_final = FALSE; machine->state_table[state].arc_offset = machine->num_arcs; machine->state_table[state].num_arcs = 0; /* Add arcs to the arc table for the current state */ /* based on the state's derived set. Also set the */ /* state's final flag if the empty string is found */ /* in the suffix list */ current_label = machine->state_table[state].label; target_label = StrListCreate(); target_signature = HASH_START; arc_label = EOS; for ( i = 0; i < StrListSize(current_label); i++ ) { /* get the next string in the label and lop it */ string = StrListPeek( current_label, i ); ch = *string++; /* the empty string means mark this state as final */ if ( EOS == ch ) { machine->state_table[state].is_final = TRUE; continue; } /* make sure we have a legitimate arc_label */ if ( EOS == arc_label ) arc_label = ch; /* if the first character is new, then we must */ /* add an arc for the previous first character */ if ( ch != arc_label ) { AddArc(machine, state, arc_label, target_label, target_signature); target_label = StrListCreate(); target_signature = HASH_START; arc_label = ch; } /* add the current suffix to the target state label */ StrListAppend( target_label, string ); target_signature += (*string + 1) * HASH_INCREMENT; while ( *string ) target_signature += *string++; } /* On loop exit we have not added an arc for the */ /* last bunch of suffixes, so we must do so, as */ /* long as the last set of suffixes is not empty */ /* (which happens when the current state label */ /* is the singleton set of the empty string). */ if ( 0 < StrListSize(target_label) ) AddArc( machine, state, arc_label, target_label, target_signature ); } /* Part 4: Deallocate the binary search tree and the state labels */ DestroyTree( machine->tree ); machine->tree = NULL; for ( i = 0; i < machine->num_states; i++ ) { StrListDestroy( machine->state_table[i].label ); machine->state_table[i].label = NULL; } /* Part 5: Reallocate the tables to squish them down */ machine->state_table = (StateTable)GetMemory( machine->state_table, machine->num_states * sizeof(StateTableEntry) ); machine->arc_table = (ArcTable)GetMemory( machine->arc_table, machine->num_arcs * sizeof(ArcTableEntry) ); /* Part 6: Return the newly constructed DFA */ return( machine ); } /* BuildDFA */ /*FN*************************************************************************** GetTerm( stream, machine, size, output ) Returns: char * -- NULL if stream is exhausted, otherwise output buffer Purpose: Get the next token from an input stream, filtering stop words Plan: Part 1: Return NULL immediately if there is no input Part 2: Initialize the local variables Part 3: Main Loop: Put an unfiltered word into the output buffer Part 4: Return the output buffer Notes: This routine runs the DFA provided as the machine parameter, and collects the text of any term in the output buffer. If a stop word is recognized in this process, it is skipped. Care is also taken to be sure not to overrun the output buffer. **/ char * GetTerm( stream, machine, size, output ) FILE *stream; /* in: source of input characters */ DFA machine; /* in: finite state machine driving process */ int size; /* in: bytes in the output buffer */ char *output; /* in/out: where the next token in placed */ { char *outptr; /* for scanning through the output buffer */ int ch; /* current character during input scan */ register int state; /* current state during DFA execution */ /* Part 1: Return NULL immediately if there is no input */ if ( EOF == (ch = getc(stream)) ) return( NULL ); /* Part 2: Initialize the local variables */ outptr = output; /* Part 3: Main Loop: Put an unfiltered word into the output buffer */ do { /* scan past any leading delimiters */ while ( (EOF != ch ) && ((DELIM_CH == char_class[ch]) || (DIGIT_CH == char_class[ch])) ) ch = getc( stream ); /* start the machine in its start state */ state = 0; /* copy input to output until reaching a delimiter, and also */ /* run the DFA on the input to watch for filtered words */ while ( (EOF != ch) && (DELIM_CH != char_class[ch]) ) { if ( outptr == (output+size-1) ) { outptr = output; state = 0; } *outptr++ = convert_case[ch]; if ( DEAD_STATE != state ) { register int i; /* for scanning through arc labels */ int arc_start; /* where the arc label list starts */ int arc_end; /* where the arc label list ends */ arc_start = machine->state_table[state].arc_offset; arc_end = arc_start + machine->state_table[state].num_arcs; for ( i = arc_start; i < arc_end; i++ ) if ( convert_case[ch] == machine->arc_table[i].label ) { state = machine->arc_table[i].target; break; } if ( i == arc_end ) state = DEAD_STATE; } ch = getc( stream ); } /* start from scratch if a stop word is recognized */ if ( (DEAD_STATE != state) && machine->state_table[state].is_final ) outptr = output; /* terminate the output buffer */ *outptr = EOS; } while ( (EOF != ch) && !*output ); /* Part 4: Return the output buffer */ return( output ); } /* GetTerm */