/* PROGRAM NAME: hierarky.c PURPOSE: This program will generate a hierarchy in two ways. 1) It can simply read the parent-child links from an input file and store the links in the inverted file structure, OR 2) It can use the Rada algorithm which splits up words into different frequency groups and then builds links between them. INPUT FILES REQUIRED: (Depends on the option selected). Option 1: requires inverted file and link file. Option 2: requires inverted file. 1) inverted file: sequences of term document number weight. (multiple entries for any term should be grouped together) 2) links file: sequences of parent term child term NOTES: Filters such as stop lists and stemmers should be used before running this program. PARAMETERS TO BE SET BY USER: 1) MAXWORD: identifies the maximum size of a term 2) NUMBER_OF_LEVELS: specifies the desired number of levels in the thesaurus hierarchy to be generated, used by the Rada algorithm COMMAND LINE: (INPUT & OUTPUT FILES ARE SPECIFIED INTERACTIVELY) hierarky ****************************************************************************/ #include #include #include #define MAXWORD 20 /* maximum size of a term */ #define NUMBER_OF_LEVELS 10 /* # of levels desired in the thesaurus */ struct doclist { /* sequences of document # and weight pairs*/ int doc; /* document number */ float weight; /* term weight in document */ struct doclist *nextdoc; /* ptr. to next doclist record */ } doclistfile; struct parentlist { char term[MAXWORD]; /* parent term */ struct invert *parent; /* ptr. to parent term in inverted file */ struct parentlist *nextparent; /* ptr. to next parentlist record */ } parentfile; struct childlist { char term[MAXWORD]; /* child term */ struct invert *child; /* ptr. to child term in inverted file */ struct childlist *nextchild; /* ptr. to next childlist record */ } childfile; struct invert { /* inverted file */ char term[MAXWORD]; /* term */ struct doclist *doc; /* sequences of document # and weight */ struct parentlist *parents; /* ptr. to parent terms */ struct childlist *children; /* ptr. to child terms */ int level; /* thesaurus level based on term frequency*/ struct invert *nextterm; /* ptr. to next invert record */ } invfile; struct invert *startinv; /* ptr. to first record in inverted file */ struct invert *lastinv; /* ptr. to last record in inverted file */ struct doclist *lastdoc; /* ptr. to last document in doclist */ static char currentterm[MAXWORD]; /* tracks current term in inverted file */ static int Number_of_docs; /* total # of documents which is computed */ static struct invert *get_mem_invert(); /* these 4 functions will obtain */ static struct doclist *get_mem_doclist(); /* memory for records. The type of */ static struct parentlist *get_mem_parentlist();/* is indicated by the name of */ static struct childlist *get_mem_childlist(); /* the function */ static FILE *input; /* inverted file */ static FILE *input1; /* link file */ static FILE *output; /* holds any output */ static float cohesion(), /* compute cohesion between two terms */ total_wdf(), /* compute total frequency of term in dbse. */ get_freq_range(); static void read_invfile(), /* read in the inverted file */ read_links(), /* read in the links file */ add_link(), /* called within read_links() */ pr_invert(), /* print the inverted file */ add_invert(), /* called within read_invfile() */ write_levels(), /* initialize the levels information */ generate_Rada_hierarchy(), /* generate the Rada hierarchy */ get_term_data(); /* get basic information about terms */ struct invert *find_term(); /* searches for term in inverted file and */ /* returns its address. */ int main(argc) int argc; { char ch, fname[128]; startinv = NULL; lastinv = NULL; lastdoc = NULL; currentterm[0] = '\0'; Number_of_docs = 0; if (argc>1) { (void) printf("There is an error in the command line\n"); (void) printf("Correct usage is:\n"); (void) printf("hierarchy\n"); exit(1); } (void) printf("\nMake a selection\n"); (void) printf("To simply read links from a link file enter 1\n"); (void) printf("To use Rada's algorithm to generate links enter 2\n"); (void) printf("To quit enter 3\n"); (void) printf("Enter selection: "); ch=getchar(); switch(ch) { case '1': (void) printf("\nEnter name of inverted file: "); (void) scanf("%s",fname); if((input=fopen(fname,"r"))==NULL){ (void) printf("cannot open file %s\n",fname); exit(1); } (void) printf("Enter name of link file: "); (void) scanf("%s",fname); if((input1=fopen(fname,"r"))==NULL){ (void) printf("cannot open file %s\n",fname); exit(1); } (void) printf("Enter name of output file: "); (void) scanf("%s",fname); if((output=fopen(fname,"w"))==NULL){ (void) printf("cannot open file %s\n",fname); exit(1); } read_invfile(); (void) fprintf(output,"\nINVERTED FILE\n\n"); pr_invert(); read_links(); (void) fprintf(output,"\nINVERTED FILE WITH LINK INFORMATION\n\n"); pr_invert(); (void) fclose(input); (void) fclose(input1); (void) fclose(output); break; case '2': (void) printf("\nEnter name of inverted file: "); (void) scanf("%s",fname); if((input=fopen(fname,"r"))==NULL){ (void) printf("cannot open file %s\n",fname); exit(1); } (void) printf("Enter name of output file: "); (void) scanf("%s",fname); if((output=fopen(fname,"w"))==NULL){ (void) printf("cannot open file %s\n",fname); exit(1); } read_invfile(); (void) fprintf(output,"\nINVERTED FILE\n\n"); pr_invert(); generate_Rada_hierarchy(); (void) fprintf(output,"\nINVERTED FILE AFTER GENERATING RADA HIERARCHY\n\n"); pr_invert(); (void) fclose(input); (void) fclose(output); break; case '3': exit(0); } return(0); } /*************************************************************************** read_invfile() Returns: void Purpose: Read in the inverted file entries from the disk file **/ static void read_invfile() { int docid; /* holds current document number */ char temp[MAXWORD]; /* holds current term */ float weight; /* holds current term weight */ struct doclist *p; /* structure to store doc#-weight pair */ (void) fscanf(input,"%s%d%f",temp,&docid,&weight); /* read next line */ while (strlen(temp) > 0) /* while its found a legitimate term */ { if (!strncmp(currentterm,temp,MAXWORD)) { /* if this term has previously been entered in inverted file then */ /* only need to attach a doclist record to the same entry */ p = get_mem_doclist(); /* get memory for doclist record */ p->doc = docid; /* assign document number */ p->weight = weight; /* assign term weight */ p->nextdoc = NULL; if (lastdoc) lastdoc->nextdoc = p; /* connect p to the doclist chain for this term if it already exists */ lastdoc = p; /* set this global variable */ } else add_invert(docid,temp,weight); /* else term is a brand new term & need to make a new inverted file entry */ temp[0] = '\0'; (void) fscanf(input,"%s%d%f",temp,&docid,&weight); /* read next line */ } } /*************************************************************************** add_invert(docid,temp,weight) Returns: void Purpose: Start a new entry in the inverted file. It is called in the read_invfile function when a new term is read from the input file. **/ static void add_invert(docid,temp,weight) int docid; /* in: document number */ char temp[MAXWORD]; /* in: new index term */ float weight; /* in: index term weight */ { struct invert *p; /* p will get attached to inverted file */ p = get_mem_invert(); /* get memory for p */ (void) strncpy(p->term,temp,MAXWORD); /* copy over the term */ p->parents = NULL; /* to begin this term has no parent terms*/ p->children = NULL; /* also no child terms */ p->doc = get_mem_doclist(); /* start a doclist structure */ p->doc->doc = docid; /* assign document number */ p->doc->weight = weight; /* assign term weight */ p->doc->nextdoc = NULL; p->nextterm = NULL; if (startinv == NULL) startinv = p; /* if this is the first entry in inverted file, then update global variable */ if (lastinv) lastinv->nextterm = p; /* update ptr. to last inverted file record */ lastinv = p; lastdoc = p->doc; /* update ptr. to last document*/ (void) strncpy(currentterm,temp,MAXWORD); /* update global variable current term */ } /* to the new term just added */ /*************************************************************************** read_links() Returns: void Purpose: Read parent-child link information from a file and record links in the inverted record structure **/ static void read_links() { char parent[MAXWORD], /* tracks parent term */ child[MAXWORD]; /* tracks child term */ (void) fscanf(input1,"%s%s",parent,child); /* read input line */ while (strlen(parent) > 0) /* while a legitimate parent has been found */ { add_link(parent,child); /* this function will add the appropriate links in */ /* the inverted file */ child[0] = '\0'; parent[0] = '\0'; /* now throw out the old parent & child */ (void) fscanf(input1,"%s%s",parent,child); /* read next input line */ } } /*************************************************************************** add_link(parent,child) Returns: void. Purpose: Used within read_links. Basically, for each parent-child link specified, it adds the appropriate link information into the inverted file. Notes: If a term in the link file is not in the inverted file then the program will give a suitable message and exit. **/ static void add_link(parent,child) char parent[MAXWORD], /* in: holds the parent term */ child[MAXWORD]; /* in: holds the child term */ { struct invert *p, /* holds add. of parent term in inv. file */ *q; /* holds add. of child term in inv. file */ struct parentlist *new_parent; /* structure used to store parent info. */ struct childlist *new_child; /* structure used to store child info. */ p = find_term(parent); /* find address of parent term */ if (!p){ printf("\nPlease check the input files.\n"); printf("\nParent term %s is not in the inverted file\n",parent); exit(0);} q = find_term(child); /* find address of child term */ if (!q){ printf("\nPlease check the input files. Output may be incorrect.\n"); printf("\nChild term %s is not in the inverted file\n",child); exit(0);} /* first add parent links for given child */ new_parent = get_mem_parentlist(); /* get memory for parentlist record*/ (void) strncpy(new_parent->term,parent,MAXWORD); /* copy over parent term*/ new_parent->parent = p; /* store address of parent term in inverted file*/ if (q->parents == NULL) { /* i.e. no parents listed for given child yet */ q->parents = new_parent; /* first parent link made */ new_parent->nextparent = NULL; } else { /* at least 1 parent already listed for given child */ new_parent->nextparent = q->parents;/* attach newparent to front of list */ q->parents = new_parent; } /* next add child links for given parent */ new_child = get_mem_childlist(); /* get memory for childlist record */ (void) strncpy(new_child->term,child,MAXWORD); /* copy over child term */ new_child->child = q; /* store address of child term in inverted file */ if (p->children == NULL) { /* no children listed for given parent yet */ p->children = new_child; /* first child link made */ new_child->nextchild = NULL; } else { /* at least 1 child already listed for given parent */ new_child->nextchild = p->children; /* attach newchild to front of list */ p->children = new_child; } } /*************************************************************************** pr_invert() Returns: void Purpose: Print the inverted file. It prints each term, its associated document numbers, term-weights and parent child terms. **/ static void pr_invert() { struct invert *inv_addr; /* tracks address of current inv. file record */ struct doclist *doc_addr; /* tracks address of current doclist record */ struct parentlist *parent_addr;/* tracks address of current parentlist record */ struct childlist *child_addr; /* tracks address of current childlist record */ inv_addr = startinv; /* begin at top of inverted file*/ while (inv_addr) { /* while a legitimate term.... */ (void) fprintf(output,"TERM: %s\nPARENT TERMS: ",inv_addr->term); parent_addr = inv_addr->parents; /* find addr. of first parent */ while (parent_addr) { /* printing all parents */ (void) fprintf(output,"%s ",parent_addr->term); parent_addr = parent_addr->nextparent; /*loop through remaining parents*/ } (void) fprintf(output,"\nCHILD TERMS: "); child_addr = inv_addr->children; /* find addr. of first child */ while (child_addr) { /* printing all children */ (void) fprintf(output,"%s ",child_addr->term); child_addr = child_addr->nextchild; /*loop through remaining children */ } (void) fprintf(output,"\n\n"); (void) fprintf(output," DOCUMENT NUMBER TERM WEIGHT\n"); doc_addr = inv_addr->doc; /* find addr. of first associated doc.*/ while (doc_addr) { /* print all docs. and term weights*/ (void) fprintf(output," %-30d ",doc_addr->doc); (void) fprintf(output,"%-10.5f\n",doc_addr->weight); doc_addr = doc_addr->nextdoc; /* loop through remaining documents*/ } (void) fprintf(output,"\n"); inv_addr = inv_addr->nextterm; /* go to next inverted file entry */ } } /*************************************************************************** total_wdf(term) Returns: float Purpose: Compute total within document frequency for specified term in the database **/ static float total_wdf(term) char term[MAXWORD]; /* in: term for which total_wdf is required */ { struct invert *term_addr; /* add. of above term in inverted file */ struct doclist *doc_ad; /* tracks add. of associated doclist record */ float totalwdf; /* tracks total wdf */ totalwdf = 0.0; term_addr = find_term(term); /* obtain address of the term in inv. file */ if (term_addr) { /* if term was found */ doc_ad = term_addr->doc; /* find address of associated doclist record*/ while (doc_ad) { totalwdf = totalwdf + doc_ad->weight; /* loop through doclist records to */ doc_ad = doc_ad->nextdoc; /* compute the total weight */ } } else (void) fprintf(output,"Term %s is not in the inverted file. Could lead to problems\n",term); return(totalwdf); } /*************************************************************************** get_freq_range(minimum,maximum) Returns: float Purpose: Compute the difference between the maximum total term frequency and the minimum total term frequency observed in the inverted file. **/ static float get_freq_range(minimum, maximum) float *minimum, /* out: returns minimum totalwdf */ *maximum; /* out: returns maximum totalwdf */ { struct invert *inv_addr; float freq, max, min; inv_addr = startinv; /* begin at top of inverted file */ /* initialize min and max to equal frequency of 1st term in file */ if (inv_addr) { freq = total_wdf(inv_addr->term); min = freq; max = freq; inv_addr = inv_addr->nextterm; /* go to next term in inv. file */ } while (inv_addr) { /* while a legitimate term compare with max and min. */ freq = total_wdf(inv_addr->term); if (freq > max) max = freq; if (freq < min) min = freq; inv_addr = inv_addr->nextterm; /* go to next term in inv. file */ } *minimum = min; *maximum = max; return(max - min); /* returning the difference */ } /*************************************************************************** write_levels() Returns: void Purpose: Write the level numbers for each term into the inverted file depending on the total wdf of the term in the database and the user selected parameter NUMBER_OF_LEVELS. The level numbers are marked 0, 1, 2, ... etc. Level 0 refers to the highest frequency class, level 1 the next frequency class etc. **/ static void write_levels() { int i, /* counter through the different levels */ number; /* holds NUMBER_OF_LEVELS */ float freq, /* holds frequency of term in database */ range, /* holds diff. between highest & lowest freqs.*/ current_low, /* tracks lower frequency of current level */ current_high; /* tracks higher frequency of current level */ float high, /* highest term frequency in database */ low; /* lowest term frequency in database */ struct invert *inv_addr; /* tracks current inverted file record */ /* range holds the difference between highest & lowest totalwdf in dbse. */ range = get_freq_range(&low,&high); number = NUMBER_OF_LEVELS; /* user specified global parameter */ inv_addr = startinv; /* start with the first term in inverted file */ while(inv_addr) { /* while a legitimate term was found */ freq = total_wdf(inv_addr->term); current_low = low; for (i=(number-1);i>=0;i--) { if (i==0) current_high = high; else current_high = current_low + (range/number); /* if the term's frequency is within this narrow range, then level = i*/ if ((freq >= current_low) && (freq <= current_high)) { inv_addr->level = i; break; } current_low = current_high; /* loop through the frequency levels */ } /* ending for loop */ inv_addr = inv_addr->nextterm; /* loop through other inv. file terms */ } } /*************************************************************************** generate_Rada_hierarchy() Returns: void Purpose: Create the levelptrs data structure and generate the hierarchy according to Rada's algorithm **/ static void generate_Rada_hierarchy() { struct termlist { struct invert *term; /* pointer to term in inverted file */ int mark; /* equals 1 if term is propagated else 0 */ struct termlist *nextterm; /* pointer to next termlist record */ } termlistfile, *p, *q, *r; /* levelptrs is an array of pointers. Each slot points to the start of the chain of termlist records for that level */ struct termlist *levelptrs[NUMBER_OF_LEVELS]; int i; struct invert *inv_addr; /* tracks current term in inverted file */ float coh, max_cohesion; write_levels(); /* this routine computes and writes the level number for each */ /* term in the inverted file */ for (i=0;iterm = inv_addr; /* assign the address of term in inverted file*/ p->mark = 0; /* initially term not linked */ /* Note: this term has been assigned to a frequency level already. Now, if this */ /* is the first term read for this level then set the appropriate levelptrs entry*/ /* to point to this term */ if (levelptrs[inv_addr->level] == NULL) { levelptrs[inv_addr->level] = p; p->nextterm = NULL; } else { /* now this is not the first term encountered for this level, so simply */ /* attach it to the front of the chain */ p->nextterm = levelptrs[inv_addr->level]; levelptrs[inv_addr->level] = p; } inv_addr = inv_addr->nextterm; /* process next inverted file term */ } /* end while */ /* start with each level and compute max-cohesion with previous level */ for (i=1;iterm->term,q->term->term); if (coh > max_cohesion) max_cohesion = coh; q = q->nextterm; } /* max_cohesion for terms in p has been computed */ /* adding parent-child links and marking parents as propagated */ q = levelptrs[i-1]; while (q && max_cohesion > 0.0) { coh = cohesion(p->term->term,q->term->term); if (coh == max_cohesion) { /* this ensures multiple links possible */ add_link(q->term->term,p->term->term); /* routine adds the actual link */ q->mark = 1; /* to show that parent term has been linked */ } q = q->nextterm; } /* end while(q) */ p = p->nextterm; /* go to next term in level i */ max_cohesion = 0.0; } /* end while(p) */ /* checking all terms in level[i-1] to make sure they have propagated */ q = levelptrs[i-1]; while (q) { if (q->mark == 0){ /* i.e. term has no child in next level */ q->mark = 1; r = (struct termlist *)malloc(sizeof(termlistfile)); if (!r) { (void) fprintf(output,"\nout of memory\n"); exit(2); } r->term = q->term; r->mark = 0; /* making a copy of term as its dummy child */ r->nextterm = levelptrs[i]; /* inserting r at beginning of level i chain */ levelptrs[i] = r; } q = q->nextterm; } } /* for */ } /*************************************************************************** cohesion(term1,term2) Returns: void Purpose: Compute the cohesion between two terms **/ static float cohesion(term1,term2) char term1[MAXWORD], /* in: the two terms which are being */ term2[MAXWORD]; /* in: compared to determine cohesion */ { float l1, /* holds # of documents associated with term 1 */ l2, /* holds # of documents associated with term 2 */ common; /* holds # of documents in common */ get_term_data(term1,term2,&l1,&l2,&common); return(common/(sqrt(l1 * l2))); } /*************************************************************************** get_term_data(term1,term2,l1,l2,common) Returns: void Purpose: Given two terms, it determines the number of documents in each and in common between them. **/ static void get_term_data(term1,term2,l1,l2,common) char term1[MAXWORD], /* in: term 1 */ term2[MAXWORD]; /* in: term 2 */ float *l1, /* out: # of documents associated with term 1 */ *l2, /* out: # of documents associated with term 2 */ *common; /* out: # of documents associated with both */ { struct invert *p, *q; /* holds addresses of both terms in inv. file */ struct doclist *doc_ad1, *doc_ad2; /* tracks addresses of doclists records */ int count1, /* # of documents associated with term 1 */ count2, /* # of documents associated with term 2 */ com; /* # of documents in common */ p = find_term(term1); q = find_term(term2); /* find addresses of terms */ doc_ad1 = p->doc; /* start with doclist record for term1*/ count1 = 0; count2 = 0; com = 0; /* initialize */ /* first get length for document 1 and number of common terms */ while (doc_ad1) { count1 = count1 +1; doc_ad2 = q->doc; /* for each doc. of term1 loop through all docs. of term2 */ while (doc_ad2) { if (doc_ad1->doc == doc_ad2->doc) { /* if they are the same doc. # */ com = com +1; break; } doc_ad2 = doc_ad2->nextdoc; } doc_ad1 = doc_ad1->nextdoc; } /* now get length of document 2 */ doc_ad2 = q->doc; while (doc_ad2) { count2 = count2 + 1; doc_ad2 = doc_ad2->nextdoc; } *l1 = count1; *l2 = count2; *common = com; } /*************************************************************************** *find_term(term) Returns: address of a struct invert record Purpose: Search for a specified term in the inverted file and return address of the corresponding inverted file record. **/ static struct invert *find_term(term) char term[MAXWORD]; /* in: term to be located in inverted file */ { struct invert *inv_addr; /* tracks addr. of current rec. in inv, file */ inv_addr = startinv; /* begin at top on inv. file */ while(inv_addr) { if (!strcmp(term,inv_addr->term)) return(inv_addr); inv_addr = inv_addr->nextterm; } (void) fprintf(output,"Term %s not found\n",term); return(NULL); } /*************************************************************************** *get_mem_invert() Returns: address of a struct invert record Purpose: dynamically obtain enough memory to store 1 invert record. **/ static struct invert *get_mem_invert() { struct invert *record; record = (struct invert *)malloc(sizeof(invfile)); if (!record) { (void) fprintf(output,"\nout of memory\n"); return(NULL); } return(record); } /*************************************************************************** *get_mem_doclist() Returns: address of a struct doclist record Purpose: dynamically obtain enough memory to store 1 doclist record. **/ static struct doclist *get_mem_doclist() { struct doclist *record; record = (struct doclist *)malloc(sizeof(doclistfile)); if (!record) { (void) fprintf(output,"\nout of memory\n"); return(NULL); } return(record); } /*************************************************************************** *get_mem_parentlist() Returns: address of a struct parentlist record Purpose: dynamically obtain enough memory to store i parentlist record **/ static struct parentlist *get_mem_parentlist() { struct parentlist *record; record = (struct parentlist *)malloc(sizeof(parentfile)); if (!record) { (void) fprintf(output,"\nout of memory\n"); return(NULL); } return(record); } /*************************************************************************** *get_mem_childlist() Returns: address of a struct childlist record Purpose: dynamically obtain enough memory to store 1 childlist record **/ static struct childlist *get_mem_childlist() { struct childlist *record; record = (struct childlist *)malloc(sizeof(childfile)); if (!record) { (void) fprintf(output,"\nout of memory\n"); return(NULL); } return(record); }