/************************** Start of LZSS.C ************************* * * This is the LZSS module, which implements an LZ77 style compression * algorithm. As iplemented here it uses a 12 bit index into the sliding * window, and a 4 bit length, which is adjusted to reflect phrase lengths * of between 2 and 17 bytes. */ #include #include #include #include #include "bitio.h" /* * Various constants used to define the compression parameters. The * INDEX_BIT_COUNT tells how many bits we allocate to indices into the * text window. This directly determines the WINDOW_SIZE. The * LENGTH_BIT_COUNT tells how many bits we allocate for the length of * an encode phrase. This determines the size of the look ahead buffer. * The TREE_ROOT is a special node in the tree that always points to * the root node of the binary phrase tree. END_OF_STREAM is a special * index used to flag the fact that the file has been completely * encoded, and there is no more data. UNUSED is the null index for * the tree. MOD_WINDOW() is a macro used to perform arithmetic on tree * indices. * */ #define INDEX_BIT_COUNT 12 #define LENGTH_BIT_COUNT 4 #define WINDOW_SIZE ( 1 << INDEX_BIT_COUNT ) #define RAW_LOOK_AHEAD_SIZE ( 1 << LENGTH_BIT_COUNT ) #define BREAK_EVEN ( ( 1 + INDEX_BIT_COUNT + LENGTH_BIT_COUNT ) / 9 ) #define LOOK_AHEAD_SIZE ( RAW_LOOK_AHEAD_SIZE + BREAK_EVEN ) #define TREE_ROOT WINDOW_SIZE #define END_OF_STREAM 0 #define UNUSED 0 #define MOD_WINDOW( a ) ( ( a ) & ( WINDOW_SIZE - 1 ) ) char *CompressionName = "LZSS Encoder"; char *Usage = "in-file out-file\n\n"; /* * These are the two global data structures used in this program. * The window[] array is exactly that, the window of previously seen * text, as well as the current look ahead text. The tree[] structure * contains the binary tree of all of the strings in the window sorted * in order. */ unsigned char window[ WINDOW_SIZE ]; struct { int parent; int smaller_child; int larger_child; } tree[ WINDOW_SIZE + 1 ]; /* * Function prototypes for both ANSI C compilers and their K&R brethren. */ #ifdef __STDC__ void InitTree( int r ); void ContractNode( int old_node, int new_node ); void ReplaceNode( int old_node, int new_node ); int FindNextNode( int node ); void DeleteString( int p ); int AddString( int new_node, int *match_position ); void CompressFile( FILE *input, BIT_FILE *output, int argc, char *argv[] ); void ExpandFile( BIT_FILE *input, FILE *output, int argc, char *argv[] ); #else void InitTree(); void ContractNode(); void ReplaceNode(); int FindNextNode(); void DeleteString(); int AddString(); void CompressFile(); void ExpandFile(); #endif /* * Since the tree is static data, it comes up with every node * initialized to 0, which is good, since 0 is the UNUSED code. * However, to make the tree really usable, a single phrase has to be * added to the tree so it has a root node. That is done right here. */ void InitTree( r ) int r; { tree[ TREE_ROOT ].larger_child = r; tree[ r ].parent = TREE_ROOT; tree[ r ].larger_child = UNUSED; tree[ r ].smaller_child = UNUSED; } /* * This routine is used when a node is being deleted. The link to * its descendant is broken by pulling the descendant in to overlay * the existing link. */ void ContractNode( old_node, new_node ) int old_node; int new_node; { tree[ new_node ].parent = tree[ old_node ].parent; if ( tree[ tree[ old_node ].parent ].larger_child == old_node ) tree[ tree[ old_node ].parent ].larger_child = new_node; else tree[ tree[ old_node ].parent ].smaller_child = new_node; tree[ old_node ].parent = UNUSED; } /* * This routine is also used when a node is being deleted. However, * in this case, it is being replaced by a node that was not previously * in the tree. */ void ReplaceNode( old_node, new_node ) int old_node; int new_node; { int parent; parent = tree[ old_node ].parent; if ( tree[ parent ].smaller_child == old_node ) tree[ parent ].smaller_child = new_node; else tree[ parent ].larger_child = new_node; tree[ new_node ] = tree[ old_node ]; tree[ tree[ new_node ].smaller_child ].parent = new_node; tree[ tree[ new_node ].larger_child ].parent = new_node; tree[ old_node ].parent = UNUSED; } /* * This routine is used to find the next smallest node after the node * argument. It assumes that the node has a smaller child. We find * the next smallest child by going to the smaller_child node, then * going to the end of the larger_child descendant chain. */ int FindNextNode( node ) int node; { int next; next = tree[ node ].smaller_child; while ( tree[ next ].larger_child != UNUSED ) next = tree[ next ].larger_child; return( next ); } /* * This routine performs the classic binary tree deletion algorithm. * If the node to be deleted has a null link in either direction, we * just pull the non-null link up one to replace the existing link. * If both links exist, we instead delete the next link in order, which * is guaranteed to have a null link, then replace the node to be deleted * with the next link. */ void DeleteString( p ) int p; { int replacement; if ( tree[ p ].parent == UNUSED ) return; if ( tree[ p ].larger_child == UNUSED ) ContractNode( p, tree[ p ].smaller_child ); else if ( tree[ p ].smaller_child == UNUSED ) ContractNode( p, tree[ p ].larger_child ); else { replacement = FindNextNode( p ); DeleteString( replacement ); ReplaceNode( p, replacement ); } } /* * This where most of the work done by the encoder takes place. This * routine is responsible for adding the new node to the binary tree. * It also has to find the best match among all the existing nodes in * the tree, and return that to the calling routine. To make matters * even more complicated, if the new_node has a duplicate in the tree, * the old_node is deleted, for reasons of efficiency. */ int AddString( new_node, match_position ) int new_node; int *match_position; { int i; int test_node; int delta; int match_length; int *child; if ( new_node == END_OF_STREAM ) return( 0 ); test_node = tree[ TREE_ROOT ].larger_child; match_length = 0; for ( ; ; ) { for ( i = 0 ; i < LOOK_AHEAD_SIZE ; i++ ) { delta = window[ MOD_WINDOW( new_node + i ) ] - window[ MOD_WINDOW( test_node + i ) ]; if ( delta != 0 ) break; } if ( i >= match_length ) { match_length = i; *match_position = test_node; if ( match_length >= LOOK_AHEAD_SIZE ) { ReplaceNode( test_node, new_node ); return( match_length ); } } if ( delta >= 0 ) child = &tree[ test_node ].larger_child; else child = &tree[ test_node ].smaller_child; if ( *child == UNUSED ) { *child = new_node; tree[ new_node ].parent = test_node; tree[ new_node ].larger_child = UNUSED; tree[ new_node ].smaller_child = UNUSED; return( match_length ); } test_node = *child; } } /* * This is the compression routine. It has to first load up the look * ahead buffer, then go into the main compression loop. The main loop * decides whether to output a single character or an index/length * token that defines a phrase. Once the character or phrase has been * sent out, another loop has to run. The second loop reads in new * characters, deletes the strings that are overwritten by the new * character, then adds the strings that are created by the new * character. */ void CompressFile( input, output, argc, argv ) FILE *input; BIT_FILE *output; int argc; char *argv[]; { int i; int c; int look_ahead_bytes; int current_position; int replace_count; int match_length; int match_position; current_position = 1; for ( i = 0 ; i < LOOK_AHEAD_SIZE ; i++ ) { if ( ( c = getc( input ) ) == EOF ) break; window[ current_position + i ] = (unsigned char) c; } look_ahead_bytes = i; InitTree( current_position ); match_length = 0; match_position = 0; while ( look_ahead_bytes > 0 ) { if ( match_length > look_ahead_bytes ) match_length = look_ahead_bytes; if ( match_length <= BREAK_EVEN ) { replace_count = 1; OutputBit( output, 1 ); OutputBits( output, (unsigned long) window[ current_position ], 8 ); } else { OutputBit( output, 0 ); OutputBits( output, (unsigned long) match_position, INDEX_BIT_COUNT ); OutputBits( output, (unsigned long) ( match_length - ( BREAK_EVEN + 1 ) ), LENGTH_BIT_COUNT ); replace_count = match_length; } for ( i = 0 ; i < replace_count ; i++ ) { DeleteString( MOD_WINDOW( current_position + LOOK_AHEAD_SIZE ) ); if ( ( c = getc( input ) ) == EOF ) look_ahead_bytes--; else window[ MOD_WINDOW( current_position + LOOK_AHEAD_SIZE ) ] = (unsigned char) c; current_position = MOD_WINDOW( current_position + 1 ); if ( look_ahead_bytes ) match_length = AddString( current_position, &match_position ); } }; OutputBit( output, 0 ); OutputBits( output, (unsigned long) END_OF_STREAM, INDEX_BIT_COUNT ); while ( argc-- > 0 ) printf( "Unknown argument: %s\n", *argv++ ); } /* * This is the expansion routine for the LZSS algorithm. All it has * to do is read in flag bits, decide whether to read in a character or * a index/length pair, and take the appropriate action. */ void ExpandFile( input, output, argc, argv ) BIT_FILE *input; FILE *output; int argc; char *argv[]; { int i; int current_position; int c; int match_length; int match_position; current_position = 1; for ( ; ; ) { if ( InputBit( input ) ) { c = (int) InputBits( input, 8 ); putc( c, output ); window[ current_position ] = (unsigned char) c; current_position = MOD_WINDOW( current_position + 1 ); } else { match_position = (int) InputBits( input, INDEX_BIT_COUNT ); if ( match_position == END_OF_STREAM ) break; match_length = (int) InputBits( input, LENGTH_BIT_COUNT ); match_length += BREAK_EVEN; for ( i = 0 ; i <= match_length ; i++ ) { c = window[ MOD_WINDOW( match_position + i ) ]; putc( c, output ); window[ current_position ] = (unsigned char) c; current_position = MOD_WINDOW( current_position + 1 ); } } } while ( argc-- > 0 ) printf( "Unknown argument: %s\n", *argv++ ); } /************************** End of LZSS.C *************************/