The following text is sample data to test compression routines. The following text is MORE!! sample data to test compression routines. This text has a large run in it!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! {$DEFINE DEBUG} { This is a port of "Ross Data Compression" as presented by Ed Ross in "The C Users Journal". Original code from unknown source; bugs fixed and optimized by trixter@oldskool.org Method: RDC is based on LZ77 with an RLE component: If a run of bytes is found, it is coded as a run length; if a run is not found, it is coded as a length-offset pair. Matches are sped up using a 4096-entry hash table. Codes are byte-aligned. LZ77 with byte-aligned codes is a common method that has been "discovered" by many different people for different methods: LZSS, LZRWA (Ross Williams), LZO (Markus Oberhumer), and others. Performance: Due to the nature of the relatively simplistic compression function, the compression offered by RDC is not as competitive as other methods. However, it is *extremely* competitive in terms of speed -- in fact, properly optimized in assembler, RDC holds the potential to be the fastest possible decompression method available for 8088-based machines due to the additional RLE encoding and the relative speed of expanding RLE codes on 8088 using the STOSW opcode. This unit has not yet been optimized in assembler, however. } Unit RDCUnit; Interface Procedure Comp_FileToFile(Var infile, outfile: File); Procedure Decomp_FileToFile(Var infile, outfile: File); Implementation {$IFDEF DEBUG} uses support; {$ENDIF} Type PByte = ^Byte; ByteArray = Array[0..65536-8-1] Of Byte; PByteArray = ^ByteArray; PInteger = ^Integer; IntArray = Array[0..32767-1] Of Integer; PIntArray = ^IntArray; PWord = ^Word; WordArray = Array[0..32767-1] Of Word; PWordArray = ^WordArray; Const HASH_LEN = 4096; { # hash table entries } HASH_SIZE = HASH_LEN * Sizeof(word); BUFF_LEN = 32768-1; { size of disk io buffer } (* compress inbuff_len bytes of inbuff into outbuff using hash_len entries in hash_tbl. return length of outbuff, or "0 - inbuff_len" if inbuff could not be compressed. *) Function rdc_compress(ibuff : Pointer; inbuff_len : Word; obuff : Pointer; htable : Pointer) : Integer; Var inbuff : PByte Absolute ibuff; outbuff : PByte Absolute obuff; hash_tbl : PWordArray Absolute htable; in_idx : PByte; in_idxa : PByteArray absolute in_idx; inbuff_end : PByte; anchor : PByte; pat_idx : PByte; cnt : Word; gap : Word; c : Word; hash : Word; hashlen : Word; ctrl_idx : PWord; ctrl_bits : Word; ctrl_cnt : Word; out_idx : PByte; outbuff_end : PByte; foo:integer; {$IFDEF DEBUG} procedure trace(s:string); begin writeln(s,'³','in_idx: ',hex(seg(in_idx^)),':',hex(ofs(in_idx^)), ' out_idx: ',hex(seg(out_idx^)),':',hex(ofs(out_idx^)), ' anchor: ',hex(seg(anchor^)),':',hex(ofs(anchor^)), ' #:',hex(hash), ' pat_idx: ',hex(seg(pat_idx^)),':',hex(ofs(pat_idx^)) ); end; {$ENDIF} Begin in_idx := inbuff; inbuff_end := Pointer(LongInt(inbuff) + inbuff_len); ctrl_idx := Pointer(outbuff); ctrl_cnt := 0; out_idx := Pointer(longint(outbuff) + Sizeof(Word)); outbuff_end := Pointer(LongInt(outbuff) + (inbuff_len - 48)); { skip the compression for a small buffer } If inbuff_len <= 18 Then Begin Move(outbuff, inbuff, inbuff_len); rdc_compress := 0 - inbuff_len; Exit; End; { adjust # hash entries so hash algorithm can use 'and' instead of 'mod' } hashlen := HASH_LEN - 1; { scan thru inbuff } While LongInt(in_idx) < LongInt(inbuff_end) Do Begin { make room for the control bits and check for outbuff overflow } If ctrl_cnt = 16 Then Begin ctrl_idx^ := ctrl_bits; ctrl_cnt := 1; ctrl_idx := Pointer(out_idx); Inc(word(out_idx), 2); {if the output index is farther along than the end of the output buffer,} If LongInt(out_idx) > LongInt(outbuff_end) Then Begin {Move(outbuff^, inbuff^, inbuff_len);} {this annoying foo variable is to get around a bug in borland pascal} foo:=-inbuff_len; rdc_compress := foo; {return negative number to indicate no compression} Exit; End; End Else Inc(ctrl_cnt); { look for rle } anchor := in_idx; c := in_idx^; Inc(in_idx); While (LongInt(in_idx) < longint(inbuff_end)) And (in_idx^ = c) And (LongInt(in_idx) - LongInt(anchor) < (HASH_LEN + 18)) Do Inc(in_idx); { store compression code if character is repeated more than 2 times } cnt := LongInt(in_idx) - LongInt(anchor); If cnt > 2 Then Begin {$IFDEF DEBUG} trace('RLEcnt: '+inttostr(cnt)); {$ENDIF} If cnt <= 18 Then { short rle } Begin out_idx^ := cnt - 3; Inc(out_idx); out_idx^ := c; Inc(out_idx); End Else { long rle } Begin Dec(cnt, 19); out_idx^ := 16 + (cnt and $0F); Inc(out_idx); out_idx^ := cnt Shr 4; Inc(out_idx); out_idx^ := c; Inc(out_idx); End; ctrl_bits := (ctrl_bits Shl 1) Or 1; Continue; End; { look for pattern if 2 or more characters remain in the input buffer } in_idx := anchor; If (LongInt(inbuff_end) - LongInt(in_idx)) > 2 Then Begin { locate offset of possible pattern in sliding dictionary } hash := ((((in_idxa^[0] And 15) Shl 8) Or in_idxa^[1]) Xor ((in_idxa^[0] Shr 4) Or (in_idxa^[2] Shl 4))) And hashlen; {$IFDEF DEBUG} trace('bfssh '); {$ENDIF} pat_idx := in_idx; Word(pat_idx) := hash_tbl^[hash]; hash_tbl^[hash] := Word(in_idx); {$IFDEF DEBUG} trace('afssh '); {$ENDIF} { compare characters if we're within 4098 bytes } gap := LongInt(in_idx) - LongInt(pat_idx); If (gap <= HASH_LEN + 2) Then Begin {$IFDEF DEBUG} trace('inPAT '); {$ENDIF} While (LongInt(in_idx) < LongInt(inbuff_end)) And (LongInt(pat_idx) < LongInt(anchor)) And (pat_idx^ = in_idx^) And (LongInt(in_idx) - LongInt(anchor) < 271) Do Begin Inc(in_idx); Inc(pat_idx); End; { store pattern if it is more than 2 characters } cnt := LongInt(in_idx) - LongInt(anchor); If cnt > 2 Then Begin {$IFDEF DEBUG} trace('storPAT len: '+inttostr(cnt)); {$ENDIF} Dec(gap, 3); If cnt <= 15 Then { short pattern } Begin {$IFDEF DEBUG} trace('bfspt'); {$ENDIF} out_idx^ := (cnt Shl 4) + (gap And $0F); Inc(out_idx); out_idx^ := gap Shr 4; Inc(out_idx); {$IFDEF DEBUG} trace('afspt'); {$ENDIF} End Else Begin { long pattern } {$IFDEF DEBUG} trace('bflpt'); {$ENDIF} out_idx^ := 32 + (gap And $0F); Inc(out_idx); out_idx^ := gap Shr 4; Inc(out_idx); out_idx^ := cnt - 16; Inc(out_idx); {$IFDEF DEBUG} trace('aflpt'); {$ENDIF} End; ctrl_bits := (ctrl_bits Shl 1) Or 1; Continue; End; End; End; { can't compress this character so copy it to outbuff } {$IFDEF DEBUG} trace('bfoutlit'); {$ENDIF} out_idx^ := c; Inc(out_idx); Inc(anchor); in_idx := anchor; ctrl_bits := ctrl_bits Shl 1; {$IFDEF DEBUG} trace('afoutlit'); {$ENDIF} End; { save last load of control bits } {$IFDEF DEBUG} trace('bfstorectrlbits'); {$ENDIF} ctrl_bits := ctrl_bits Shl (16 - ctrl_cnt); ctrl_idx^ := ctrl_bits; {$IFDEF DEBUG} trace('afstorectrlbits'); {$ENDIF} { and return size of compressed buffer } rdc_compress := LongInt(out_idx) - LongInt(outbuff); End; (* decompress inbuff_len bytes of inbuff into outbuff. return length of outbuff. *) Function RDC_Decompress(inbuff : PByte; inbuff_len : Word; outbuff : PByte) : Integer; Var ctrl_bits : Word; ctrl_mask : Word; inbuff_idx : PByte; outbuff_idx : PByte; inbuff_end : PByte; cmd, cnt : Word; ofs, len : Word; outbuff_src : PByte; Begin ctrl_mask := 0; inbuff_idx := inbuff; outbuff_idx := outbuff; inbuff_end := Pointer(LongInt(inbuff) + inbuff_len); { process each item in inbuff } While LongInt(inbuff_idx) < LongInt(inbuff_end) Do Begin { get new load of control bits if needed } ctrl_mask := ctrl_mask Shr 1; If ctrl_mask = 0 Then Begin ctrl_bits := PWord(inbuff_idx)^; Inc(inbuff_idx, 2); ctrl_mask := $8000; End; { just copy this char if control bit is zero } If (ctrl_bits And ctrl_mask) = 0 Then Begin outbuff_idx^ := inbuff_idx^; Inc(outbuff_idx); Inc(inbuff_idx); Continue; End; { undo the compression code } cmd := (inbuff_idx^ Shr 4) And $0F; cnt := inbuff_idx^ And $0F; Inc(inbuff_idx); Case cmd Of 0 : { short rle } Begin Inc(cnt, 3); FillChar(outbuff_idx^, cnt, inbuff_idx^); Inc(inbuff_idx); Inc(outbuff_idx, cnt); End; 1 : { long rle } Begin Inc(cnt, inbuff_idx^ Shl 4); Inc(inbuff_idx); Inc(cnt, 19); FillChar(outbuff_idx^, cnt, inbuff_idx^); Inc(inbuff_idx); Inc(outbuff_idx, cnt); End; 2 : { long pattern } Begin ofs := cnt + 3; Inc(ofs, inbuff_idx^ Shl 4); Inc(inbuff_idx); cnt := inbuff_idx^; Inc(inbuff_idx); Inc(cnt, 16); outbuff_src := Pointer(LongInt(outbuff_idx) - ofs); Move(outbuff_src^, outbuff_idx^, cnt); Inc(outbuff_idx, cnt); End; Else { short pattern} Begin ofs := cnt + 3; Inc(ofs, inbuff_idx^ Shl 4); Inc(inbuff_idx); outbuff_src := Pointer(LongInt(outbuff_idx) - ofs); Move(outbuff_src^, outbuff_idx^, cmd); Inc(outbuff_idx, cmd); End; End; End; { return length of decompressed buffer } RDC_Decompress := LongInt(outbuff_idx) - LongInt(outbuff); End; Procedure Comp_FileToFile(Var infile, outfile: File); const bufsize=buff_len; Var code : Integer; bytes_read : Integer; compress_len : Integer; HashPtr : PWordArray; inputbuffer, outputbuffer : PByteArray; incounter,outcounter:longint; Begin Getmem(HashPtr, HASH_SIZE); {*2 is to help debugging a pointer problem} Getmem(inputbuffer, BUFF_LEN); Getmem(outputbuffer, BUFF_LEN); { read infile bufsize bytes at a time } bytes_read := bufsize; incounter:=0; outcounter:=0; While bytes_read = bufsize Do Begin Fillchar(hashPtr^, HASH_SIZE, #0); Blockread(infile, inputbuffer^, bufsize, bytes_read); if bytes_read=0 then break; inc(incounter,bytes_read); { compress this load of bytes } compress_len := RDC_Compress(PByte(inputbuffer), bytes_read, PByte(outputbuffer), HashPtr); { write length of compressed buffer } Blockwrite(outfile, compress_len, 2, code); { check for negative length indicating the buffer could not be compressed } If compress_len < 0 Then begin compress_len := bytes_read; writeln('BUFFER COULD NOT BE COMPRESSED!'); end; inc(outcounter,compress_len); { write the buffer } if (compress_len=bytes_read) then Blockwrite(outfile, inputbuffer^, compress_len, code) else Blockwrite(outfile, outputbuffer^, compress_len, code); { we're done if less than full buffer was read } writeln('Input pos: ',incounter:8,' Output pos: ',outcounter:8,' Ratio: ',(outcounter / incounter)*100:2:1); End; { add trailer to indicate End of File } compress_len := 0; Blockwrite(outfile, compress_len, 2, code); If (code <> 2) then writeln('Error writing trailer.'+#13+#10); Freemem(HashPtr, HASH_SIZE); Freemem(inputbuffer, BUFF_LEN); Freemem(outputbuffer, BUFF_LEN); End; Procedure Decomp_FileToFile(Var infile, outfile: File); Var code : Integer; block_len : Integer; decomp_len : Integer; HashPtr : PWordArray; inputbuffer, outputbuffer : PByteArray; Begin Getmem(inputbuffer, BUFF_LEN); Getmem(outputbuffer, BUFF_LEN); { read infile BUFF_LEN bytes at a time } block_len := 1; While block_len <> 0 do Begin Blockread(infile, block_len, 2, code); If (code <> 2) then writeln('Can''t read block length.'+#13+#10); { check for End-of-file flag } If block_len <> 0 Then Begin If (block_len < 0) Then { copy uncompressed chars } Begin decomp_len := 0 - block_len; Blockread(infile, outputbuffer^, decomp_len, code); If (code <> decomp_len) then writeln('Can''t read uncompressed block.'+#13+#10); End Else { decompress this buffer } Begin Blockread(infile, inputbuffer^, block_len, code); If (code <> block_len) then writeln('Can''t read compressed block.'+#13+#10); decomp_len := RDC_Decompress(PByte(inputbuffer), block_len, PByte(outputbuffer)); End; { and write this buffer outfile } Blockwrite(outfile, outputbuffer^, decomp_len, code); if (code <> decomp_len) then writeln('Error writing uncompressed data.'+#13+#10); End; End; Freemem(inputbuffer, BUFF_LEN); Freemem(outputbuffer, BUFF_LEN); End; END.