/************************** Start of HUFF.C ************************* * * This is the Huffman coding module used in Chapter 3. * Compile with BITIO.C, ERRHAND.C, and either MAIN-C.C or MAIN-E.C */ #include #include #include #include #include "bitio.h" #include "errhand.h" #include "main.h" /* * The NODE structure is a node in the Huffman decoding tree. It has a * count, which is its weight in the tree, and the node numbers of its * two children. The saved_count member of the structure is only * there for debugging purposes, and can be safely taken out at any * time. It just holds the intial count for each of the symbols, since * the count member is continually being modified as the tree grows. */ typedef struct tree_node { unsigned int count; unsigned int saved_count; int child_0; int child_1; } NODE; /* * A Huffman tree is set up for decoding, not encoding. When encoding, * I first walk through the tree and build up a table of codes for * each symbol. The codes are stored in this CODE structure. */ typedef struct code { unsigned int code; int code_bits; } CODE; /* * The special EOS symbol is 256, the first available symbol after all * of the possible bytes. When decoding, reading this symbols * indicates that all of the data has been read in. */ #define END_OF_STREAM 256 /* * Local function prototypes, defined with or without ANSI prototypes. */ #ifdef __STDC__ void count_bytes( FILE *input, unsigned long *long_counts ); void scale_counts( unsigned long *long_counts, NODE *nodes ); int build_tree( NODE *nodes ); void convert_tree_to_code( NODE *nodes, CODE *codes, unsigned int code_so_far, int bits, int node ); void output_counts( BIT_FILE *output, NODE *nodes ); void input_counts( BIT_FILE *input, NODE *nodes ); void print_model( NODE *nodes, CODE *codes ); void compress_data( FILE *input, BIT_FILE *output, CODE *codes ); void expand_data( BIT_FILE *input, FILE *output, NODE *nodes, int root_node ); void print_char( int c ); #else /* __STDC__ */ void count_bytes(); void scale_counts(); int build_tree(); void convert_tree_to_code(); void output_counts(); void input_counts(); void print_model(); void compress_data(); void expand_data(); void print_char(); #endif /* __STDC__ */ /* * These two strings are used by MAIN-C.C and MAIN-E.C to print * messages of importance to the user of the program. */ char *CompressionName = "static order 0 model with Huffman coding"; char *Usage = "infile outfile [-d]\n\nSpecifying -d will dump the modeling data\n"; /* * CompressFile is the compression routine called by MAIN-C.C. It * looks for a single additional argument to be passed to it from * the command line: "-d". If a "-d" is present, it means the * user wants to see the model data dumped out for debugging * purposes. * * This routine works in a fairly straightforward manner. First, * it has to allocate storage for three different arrays of data. * Next, it counts all the bytes in the input file. The counts * are all stored in long int, so the next step is scale them down * to single byte counts in the NODE array. After the counts are * scaled, the Huffman decoding tree is built on top of the NODE * array. Another routine walks through the tree to build a table * of codes, one per symbol. Finally, when the codes are all ready, * compressing the file is a simple matter. After the file is * compressed, the storage is freed up, and the routine returns. * */ void CompressFile( input, output, argc, argv ) FILE *input; BIT_FILE *output; int argc; char *argv[]; { unsigned long *counts; NODE *nodes; CODE *codes; int root_node; counts = (unsigned long *) calloc( 256, sizeof( unsigned long ) ); if ( counts == NULL ) fatal_error( "Error allocating counts array\n" ); if ( ( nodes = (NODE *) calloc( 514, sizeof( NODE ) ) ) == NULL ) fatal_error( "Error allocating nodes array\n" ); if ( ( codes = (CODE *) calloc( 257, sizeof( CODE ) ) ) == NULL ) fatal_error( "Error allocating codes array\n" ); count_bytes( input, counts ); scale_counts( counts, nodes ); output_counts( output, nodes ); root_node = build_tree( nodes ); convert_tree_to_code( nodes, codes, 0, 0, root_node ); if ( argc > 0 && strcmp( argv[ 0 ], "-d" ) == 0 ) print_model( nodes, codes ); compress_data( input, output, codes ); free( (char *) counts ); free( (char *) nodes ); free( (char *) codes ); } /* * ExpandFile is the routine called by MAIN-E.C to expand a file that * has been compressed with order 0 Huffman coding. This routine has * a simpler job than that of the Compression routine. All it has to * do is read in the counts that have been stored in the compressed * file, then build the Huffman tree. The data can then be expanded * by reading in a bit at a time from the compressed file. Finally, * the node array is freed and the routine returns. * */ void ExpandFile( input, output, argc, argv ) BIT_FILE *input; FILE *output; int argc; char *argv[]; { NODE *nodes; int root_node; if ( ( nodes = (NODE *) calloc( 514, sizeof( NODE ) ) ) == NULL ) fatal_error( "Error allocating nodes array\n" ); input_counts( input, nodes ); root_node = build_tree( nodes ); if ( argc > 0 && strcmp( argv[ 0 ], "-d" ) == 0 ) print_model( nodes, 0 ); expand_data( input, output, nodes, root_node ); free( (char *) nodes ); } /* * In order for the compressor to build the same model, I have to store * the symbol counts in the compressed file so the expander can read * them in. In order to save space, I don't save all 256 symbols * unconditionally. The format used to store counts looks like this: * * start, stop, counts, start, stop, counts, ... 0 * * This means that I store runs of counts, until all the non-zero * counts have been stored. At this time the list is terminated by * storing a start value of 0. Note that at least 1 run of counts has * to be stored, so even if the first start value is 0, I read it in. * It also means that even in an empty file that has no counts, I have * to pass at least one count. * * In order to efficiently use this format, I have to identify runs of * non-zero counts. Because of the format used, I don't want to stop a * run because of just one or two zeros in the count stream. So I have * to sit in a loop looking for strings of three or more zero values in * a row. * * This is simple in concept, but it ends up being one of the most * complicated routines in the whole program. A routine that just * writes out 256 values without attempting to optimize would be much * simpler, but would hurt compression quite a bit on small files. * */ void output_counts( output, nodes ) BIT_FILE *output; NODE *nodes; { int first; int last; int next; int i; first = 0; while ( first < 255 && nodes[ first ].count == 0 ) first++; /* * Each time I hit the start of the loop, I assume that first is the * number for a run of non-zero values. The rest of the loop is * concerned with finding the value for last, which is the end of the * run, and the value of next, which is the start of the next run. * At the end of the loop, I assign next to first, so it starts in on * the next run. */ for ( ; first < 256 ; first = next ) { last = first + 1; for ( ; ; ) { for ( ; last < 256 ; last++ ) if ( nodes[ last ].count == 0 ) break; last--; for ( next = last + 1; next < 256 ; next++ ) if ( nodes[ next ].count != 0 ) break; if ( next > 255 ) break; if ( ( next - last ) > 3 ) break; last = next; }; /* * Here is where I output first, last, and all the counts in between. */ if ( putc( first, output->file ) != first ) fatal_error( "Error writing byte counts\n" ); if ( putc( last, output->file ) != last ) fatal_error( "Error writing byte counts\n" ); for ( i = first ; i <= last ; i++ ) { if ( putc( nodes[ i ].count, output->file ) != (int) nodes[ i ].count ) fatal_error( "Error writing byte counts\n" ); } } if ( putc( 0, output->file ) != 0 ) fatal_error( "Error writing byte counts\n" ); } /* * When expanding, I have to read in the same set of counts. This is * quite a bit easier that the process of writing them out, since no * decision making needs to be done. All I do is read in first, check * to see if I am all done, and if not, read in last and a string of * counts. */ void input_counts( input, nodes ) BIT_FILE *input; NODE *nodes; { int first; int last; int i; int c; for ( i = 0 ; i < 256 ; i++ ) nodes[ i ].count = 0; if ( ( first = getc( input->file ) ) == EOF ) fatal_error( "Error reading byte counts\n" ); if ( ( last = getc( input->file ) ) == EOF ) fatal_error( "Error reading byte counts\n" ); for ( ; ; ) { for ( i = first ; i <= last ; i++ ) if ( ( c = getc( input->file ) ) == EOF ) fatal_error( "Error reading byte counts\n" ); else nodes[ i ].count = (unsigned int) c; if ( ( first = getc( input->file ) ) == EOF ) fatal_error( "Error reading byte counts\n" ); if ( first == 0 ) break; if ( ( last = getc( input->file ) ) == EOF ) fatal_error( "Error reading byte counts\n" ); } nodes[ END_OF_STREAM ].count = 1; } /* * This routine counts the frequency of occurence of every byte in * the input file. It marks the place in the input stream where it * started, counts up all the bytes, then returns to the place where * it started. In most C implementations, the length of a file * cannot exceed an unsigned long, so this routine should always * work. */ #ifndef SEEK_SET #define SEEK_SET 0 #endif void count_bytes( input, counts ) FILE *input; unsigned long *counts; { long input_marker; int c; input_marker = ftell( input ); while ( ( c = getc( input )) != EOF ) counts[ c ]++; fseek( input, input_marker, SEEK_SET ); } /* * In order to limit the size of my Huffman codes to 16 bits, I scale * my counts down so they fit in an unsigned char, and then store them * all as initial weights in my NODE array. The only thing to be * careful of is to make sure that a node with a non-zero count doesn't * get scaled down to 0. Nodes with values of 0 don't get codes. */ void scale_counts( counts, nodes ) unsigned long *counts; NODE *nodes; { unsigned long max_count; int i; max_count = 0; for ( i = 0 ; i < 256 ; i++ ) if ( counts[ i ] > max_count ) max_count = counts[ i ]; if ( max_count == 0 ) { counts[ 0 ] = 1; max_count = 1; } max_count = max_count / 255; max_count = max_count + 1; for ( i = 0 ; i < 256 ; i++ ) { nodes[ i ].count = (unsigned int) ( counts[ i ] / max_count ); if ( nodes[ i ].count == 0 && counts[ i ] != 0 ) nodes[ i ].count = 1; } nodes[ END_OF_STREAM ].count = 1; } /* * Building the Huffman tree is fairly simple. All of the active nodes * are scanned in order to locate the two nodes with the minimum * weights. These two weights are added together and assigned to a new * node. The new node makes the two minimum nodes into its 0 child * and 1 child. The two minimum nodes are then marked as inactive. * This process repeats until their is only one node left, which is the * root node. The tree is done, and the root node is passed back * to the calling routine. * * Node 513 is used here to arbitratily provide a node with a guaranteed * maximum value. It starts off being min_1 and min_2. After all active * nodes have been scanned, I can tell if there is only one active node * left by checking to see if min_1 is still 513. */ int build_tree( nodes ) NODE *nodes; { int next_free; int i; int min_1; int min_2; nodes[ 513 ].count = 0xffff; for ( next_free = END_OF_STREAM + 1 ; ; next_free++ ) { min_1 = 513; min_2 = 513; for ( i = 0 ; i < next_free ; i++ ) if ( nodes[ i ].count != 0 ) { if ( nodes[ i ].count < nodes[ min_1 ].count ) { min_2 = min_1; min_1 = i; } else if ( nodes[ i ].count < nodes[ min_2 ].count ) min_2 = i; } if ( min_2 == 513 ) break; nodes[ next_free ].count = nodes[ min_1 ].count + nodes[ min_2 ].count; nodes[ min_1 ].saved_count = nodes[ min_1 ].count; nodes[ min_1 ].count = 0; nodes[ min_2 ].saved_count = nodes[ min_2 ].count; nodes[ min_2 ].count = 0; nodes[ next_free ].child_0 = min_1; nodes[ next_free ].child_1 = min_2; } next_free--; nodes[ next_free ].saved_count = nodes[ next_free ].count; return( next_free ); } /* * Since the Huffman tree is built as a decoding tree, there is * no simple way to get the encoding values for each symbol out of * it. This routine recursively walks through the tree, adding the * child bits to each code until it gets to a leaf. When it gets * to a leaf, it stores the code value in the CODE element, and * returns. */ void convert_tree_to_code( nodes, codes, code_so_far, bits, node ) NODE *nodes; CODE *codes; unsigned int code_so_far; int bits; int node; { if ( node <= END_OF_STREAM ) { codes[ node ].code = code_so_far; codes[ node ].code_bits = bits; return; } code_so_far <<= 1; bits++; convert_tree_to_code( nodes, codes, code_so_far, bits, nodes[ node ].child_0 ); convert_tree_to_code( nodes, codes, code_so_far | 1, bits, nodes[ node ].child_1 ); } /* * If the -d command line option is specified, this routine is called * to print out some of the model information after the tree is built. * Note that this is the only place that the saved_count NODE element * is used for anything at all, and in this case it is just for * diagnostic information. By the time I get here, and the tree has * been built, every active element will have 0 in its count. */ void print_model( nodes, codes ) NODE *nodes; CODE *codes; { int i; for ( i = 0 ; i < 513 ; i++ ) { if ( nodes[ i ].saved_count != 0 ) { printf( "node=" ); print_char( i ); printf( " count=%3d", nodes[ i ].saved_count ); printf( " child_0=" ); print_char( nodes[ i ].child_0 ); printf( " child_1=" ); print_char( nodes[ i ].child_1 ); if ( codes && i <= END_OF_STREAM ) { printf( " Huffman code=" ); FilePrintBinary( stdout, codes[ i ].code, codes[ i ].code_bits ); } printf( "\n" ); } } } /* * The print_model routine uses this function to print out node numbers. * The catch is, if it is a printable character, it gets printed out * as a character. Makes the debug output a little easier to read. */ void print_char( c ) int c; { if ( c >= 0x20 && c < 127 ) printf( "'%c'", c ); else printf( "%3d", c ); } /* * Once the tree gets built, and the CODE table is built, compressing * the data is a breeze. Each byte is read in, and its corresponding * Huffman code is sent out. */ void compress_data( input, output, codes ) FILE *input; BIT_FILE *output; CODE *codes; { int c; while ( ( c = getc( input ) ) != EOF ) OutputBits( output, (unsigned long) codes[ c ].code, codes[ c ].code_bits ); OutputBits( output, (unsigned long) codes[ END_OF_STREAM ].code, codes[ END_OF_STREAM ].code_bits ); } /* * Expanding compressed data is a little harder than the compression * phase. As each new symbol is decoded, the tree is traversed, * starting at the root node, reading a bit in, and taking either the * child_0 or child_1 path. Eventually, the tree winds down to a * leaf node, and the corresponding symbol is output. If the symbol * is the END_OF_STREAM symbol, it doesn't get written out, and * instead the whole process terminates. */ void expand_data( input, output, nodes, root_node ) BIT_FILE *input; FILE *output; NODE *nodes; int root_node; { int node; for ( ; ; ) { node = root_node; do { if ( InputBit( input ) ) node = nodes[ node ].child_1; else node = nodes[ node ].child_0; } while ( node > END_OF_STREAM ); if ( node == END_OF_STREAM ) break; if ( ( putc( node, output ) ) != node ) fatal_error( "Error trying to write expanded byte to output" ); } } /*************************** End of HUFF.C **************************/