A simple implementation of the LZ77 algorithm

On the web you can find many sites describing different compression algorithms. But you canīt find complete implementations that easy. When I read something about different algorithms, I decided to inplement the LZ77 myself.

Note: The following programs are not performance tuned. Compressing a 25kB text file with them takes about 4s on my AMD Barton 3000+. The reason is, that the search of matching strings is done by brute force not with BTrees or hash tables (BTW: Some implementations about them are patented.). The intention was to keep it simple.
Important: Iīve tested the whole compression and decompression somewhat but donīt use it for any data you donīt want to loose. I wonīt be responsible for _ANY_ data loss caused by this source!

The header file used by the other source files:
lz77.h
/***************************************
 * Copyright (C)2004 by Wiesner Thomas *
 ***************************************/

// Window size. Max. is 4096. Bigger gives better conpression
// but takes longer
#define WNDSIZE   4096

#define VERSION "Generic LZ77 compression suite V0.1"

// The structure for the sliding windows
struct lz77wnd {
   unsigned char *dat;  // The data
   unsigned int len;    // Buffer length
   unsigned int pos;    // Position (for internal handling)
};

// The structure for the flag byte handling
struct fgbuf {
   unsigned char bytes[24];   // Max. number is all are triples is 24
   unsigned char fbyte;       // Flagbyte
   unsigned char cnt;         // Number of bytes in the buffer
   unsigned char fcnt;        // Flag count (for internal handling)
};

unsigned char is_bit_set(unsigned char byte, unsigned char bit_nr);

void triple_to_adr(unsigned char *triple, int *offset, int *cnt);
void adr_to_triple(unsigned char *triple, short offset, short cnt);

void inline clear_fgbuf(struct fgbuf *buf);
void inline add_byte_fgbuf(struct fgbuf *buf, unsigned char byte);
void add_ocp_fgbuf(struct fgbuf *buf, int offset, int cnt);
int flush_fgbuf(struct fgbuf *buf, FILE *fp);
int fill_fgbuf(struct fgbuf *buf, FILE *fp);

int inline get_wnd_byte(struct lz77wnd wnd, unsigned int off);
int inline put_wnd_byte(struct lz77wnd *wnd, unsigned int next_byte);
int wnd_init(struct lz77wnd *wnd);
void wnd_free(struct lz77wnd *wnd);
long find_match(struct lz77wnd buf, struct lz77wnd str, unsigned int len);
int shift_wnd(struct lz77wnd *wnd);

int pack(FILE *ifp, FILE *ofp);
int unpack(FILE *ifp, FILE *ofp);

The main source file which contains all necessary routines:
lz77.c
/***************************************
 * Copyright (C)2004 by Wiesner Thomas *
 ***************************************/

#include <stdio.h>
#include <stdlib.h>
#include "lz77.h"

/* FILE FORMAT:
 * 
 * .---.---.---.---.
 * | F | D | D | D | ...
 * '---'---'---'---'
 *
 * A Flagbyte, which determines wheter the next bytes are literal bytes
 * or offset/count pairs.
 * A 0-Bit in the Flagbyte indicates that the corresponding byte is a literal
 * byte, a 1-Bit that this are 2 bytes containing and offset/count pair
 *
 * Example (Hex numbers):
 *   9a 85 96 00 fb 5a 0d ff 01 62 6e 70 2c 22 20 6f
 *   The first byte (0x9a) is the flag byte. The binary value is:
 *   01011001  (Read with LSB first)
 *   So 0x85, 0x96, 0x00 contains a offset/count pair
 *      0xfb is a literal value
 *      0x5a is a literal value
 *      0x0d, 0xff, 0x01 contains a offset/count pair
 *      0x62, 0x6e, 0x70 contains a offset/count pair
 *      0x70 is a literal value
 *      0x2c, 0x22, 0c20 contains a offset/count pair
 *      0x6f is a literal value
 */

// Checks wheter a bit is set in a byte
unsigned char is_bit_set(unsigned char byte, unsigned char bit_nr)
{
   if(((1 << bit_nr) & byte) > 0)
      return 1;
   return 0;
}

/* Converts a byte triple into address and offset
 * The bytes are:
 *        Byte 0            Byte 1            Byte 2
 *   .-.-.-.-.-.-.-.-. .-.-.-.-.-.-.-.-. .-.-.-.-.-.-.-.-.
 *   '-'-'-'-'-'-'-'-' '-'-'-'-'-'-'-'-' '-'-'-'-'-'-'-'-'
 *   \___________ _________ __/ \___________ ____________/
 *               V                          V
 *            Offset                      Count
 * Thus each 12 Bit
 */
void triple_to_adr(unsigned char *triple, int *offset, int *cnt)
{
   unsigned char low_nib, high_nib;

   *offset = triple[0];

   high_nib = triple[1] >> 4;
   low_nib = triple[1] & 0x0f;

   *offset <<= 4;
   *offset += high_nib;

   *cnt = low_nib << 8;
   *cnt += triple[2];
}

// Converts a address to a triple
void adr_to_triple(unsigned char *triple, short offset, short cnt)
{
   triple[0] = offset >> 4;

   triple[1] = offset & 0x000f;  // Assign the lowest 4 Bit
   triple[1] <<= 4;
   triple[1] += cnt >> 8;

   triple[2] = (cnt & 0x00ff);   // Assign the lowert byte
}


// Clear the IO buffer
void inline clear_fgbuf(struct fgbuf *buf)
{
   buf->fbyte = 0;
   buf->cnt = 0;
   buf->fcnt = 0;
}

// Adds a byte to the IO buffer
void inline add_byte_fgbuf(struct fgbuf *buf, unsigned char byte)
{
   buf->bytes[buf->cnt] = byte;
   buf->cnt++;
   buf->fcnt++;
}

// Adds a offset/count pair to the buffer
void add_ocp_fgbuf(struct fgbuf *buf, int offset, int cnt)
{
   adr_to_triple(&(buf->bytes[buf->cnt]), offset, cnt);
   buf->cnt += 3;
   buf->fbyte |= (1 << buf->fcnt);  // Update falg byte
   buf->fcnt++;
//printf("ocp: %d %d\n", offset, cnt);
}

// Flush the IO buffer to the file
int flush_fgbuf(struct fgbuf *buf, FILE *fp)
{
   if(fwrite(&buf->fbyte, 1, 1, fp) != 1) {  // Write the flag byte
      puts("flush_fgbuf: Error on writing flag byte\n");
      return -1;
   }
   if(fwrite(buf->bytes, buf->cnt, 1, fp) != 1) {  // Write the data
      puts("flush_fgbuf: Error on writing data block\n");
      return -1;
   }
   clear_fgbuf(buf);
   return 0;
}

// Fills an empty IO buffer with filedata
int fill_fgbuf(struct fgbuf *buf, FILE *fp)
{
   int i, c;

   clear_fgbuf(buf);
   if(fread(&buf->fbyte, 1, 1, fp) != 1) {
      return -2;
   }

   for(i = 0; i < 8; i++) {   // Calculate the number of bytes
      if(is_bit_set(buf->fbyte, i))
         buf->cnt += 3;    // 3 if it is a offset/count pair
      else
         buf->cnt++;       // 1 if it is a literal value
   }

   for(i = 0; i < buf->cnt && (c = fgetc(fp)) != EOF; i++) {
      buf->bytes[i] = c;
   }

   return i;
}

// Read a byte from the window
int inline get_wnd_byte(struct lz77wnd wnd, unsigned int off)
{
   if(off >= wnd.len) { // Wrong offset
      puts("get_wnd_byte: Offset too large\n");
      return -1;
   }
   if(wnd.pos + off < wnd.len)   // Do we have to wrap around the end?
      return wnd.dat[wnd.pos + off];
   return wnd.dat[wnd.pos + off - wnd.len];
}

// Write next byte into window. Returns -1 or the byte which got shifted out
int inline put_wnd_byte(struct lz77wnd *wnd, unsigned int next_byte)
{
   int ret;

   if(wnd->len < WNDSIZE) {   // If max. len not reached, add byte at the end
      wnd->dat[wnd->len] = next_byte;
      wnd->len++;
      ret = -1;
   } else { // otherwise overwrite first byte
      ret = wnd->dat[wnd->pos];
      wnd->dat[wnd->pos] = next_byte;
   }
   wnd->pos++;
   if(wnd->pos >= WNDSIZE)
      wnd->pos = 0;
   return ret;
}

// Moves the window read position without adding bytes
// (Only for handling the last bytes in buffer if EOF occured!)
int shift_wnd(struct lz77wnd *wnd)
{
/* if(wnd->pos+1 < wnd->len) {
      wnd->pos++;
      return wnd->dat[wnd->pos-1];
   }
   else  // No more bytes
      return -1;*/

   if(wnd->pos+1 == wnd->len)
      return -1;
   if(wnd->pos+1 > wnd->len)
      wnd->pos = 0;
   wnd->pos++;
   return wnd->dat[wnd->pos-1];
}

// Initialize window
int wnd_init(struct lz77wnd *wnd)
{
   wnd->pos = 0;
   wnd->len = 0;
   wnd->dat = (unsigned char *) malloc(sizeof (unsigned char) * WNDSIZE);
   if(wnd->dat == NULL) {
      puts("wnd_init: Unable to allocate memory\n");
      return -1;
   }
   return 0;
}

void wnd_free(struct lz77wnd *wnd)
{
   wnd->pos = 0;
   wnd->len = 0;
   free(wnd->dat);
}

// Finds a string in the buffer buf. Returns offset
long find_match(struct lz77wnd buf, struct lz77wnd str, unsigned int len)
{
   long buf_off;
   long l;
   unsigned int matches;

   for(buf_off = 0; buf_off+len <= buf.len; buf_off++) {  // In every buf pos.
      matches = 0;
      for(l = 0; l < len; l++) { // Look for the string
         if(get_wnd_byte(buf, l+buf_off) == get_wnd_byte(str, l))
            matches++;
      }
      if(matches == len) {
         return buf_off;
      }
   }
   return -1;
}

/*void putwnd(struct lz77wnd wnd)
{
   int i;
   for(i = 0; i < wnd.len; i++)
      putchar(get_wnd_byte(wnd, i));
   putchar('\n');
}*/

// Compress a file ifp, output file ofp
int pack(FILE *ifp, FILE *ofp)
{
   struct lz77wnd lahead, search;
   struct fgbuf iobuf;

   long l, i;
   int c;
   unsigned int maxlen, findsize, written;
   unsigned int bytes_left = 4096;  // 4096 is very important
   long maxlenoff = 0, off;
   unsigned char eof = 0;  // Wheter we had EOF
   unsigned char done = 0;

   if(wnd_init(&lahead) < 0 || wnd_init(&search) < 0)
      return -1;

   clear_fgbuf(&iobuf);

   // Fill the look ahead buffer
   for(l = 0, c = 0; l < WNDSIZE && (c = fgetc(ifp)) != EOF ; l++) {
      put_wnd_byte(&lahead, c);
   }
   if(c == EOF) {
      eof = 1;
      bytes_left = lahead.len;
      lahead.len = WNDSIZE;   // Otherwise the bytes don't get shifted out
      lahead.pos = 0;
   }

   // The compression loop
   while(!done) {

      // Find the biggest match
      maxlen = 0;
      off = 0;
      for(findsize = 4; findsize <= WNDSIZE && off >= 0 && findsize < bytes_left; findsize++) {
         off = find_match(search, lahead, findsize);
         if(findsize > maxlen && off >= 0) {
            maxlen = findsize;
            maxlenoff = off;
         }
      }

      // Is the match long enough to achieve compression?
      if(maxlen > 3) {
         // Add the offset/count pair to the output buffer. -1 beacuse 0
         // doesn't occur, but 4096 does
         add_ocp_fgbuf(&iobuf, maxlenoff, maxlen-1);
         written = maxlen;
      } else { // No. So output a literal value
         // Add the byte to the output buffer
         add_byte_fgbuf(&iobuf, get_wnd_byte(lahead, 0));
         written = 1;   // One byte written
      }

      // Is the output buffer full?
      if(iobuf.fcnt >= 8) {
         flush_fgbuf(&iobuf, ofp);  // Flush the buffer to the ofp
      }

      // Move the look ahead buffer
      for(i = 0; i < written; i++) {
         if(!eof) {  // No, we didn't have EOF
            c = fgetc(ifp);   // Read next byte
            if(c != EOF) { // Did we have EOF now?
               c = put_wnd_byte(&lahead, c); // Refill look ahead buffer
               if(c < 0)   // Should not happen!
                  puts("pack: Strange error 0. Maybe program bug?\n");
               put_wnd_byte(&search, c);  // Move the next byte into search bf.
            } else {
               eof = 1;
               bytes_left = lahead.len;
            }
         }
         if(eof) {   // We had already EOF
            c = put_wnd_byte(&lahead, 0);
            if(bytes_left <= 1) {   // we ran out of input data!
               done = 1;
               break;   // Compression done
            }
            put_wnd_byte(&search, c); // Move next byte into search buffer
            bytes_left--;
         }
      }
   }
   if(iobuf.cnt != 0)
      flush_fgbuf(&iobuf, ofp);

   wnd_free(&search);
   wnd_free(&lahead);
   return 0;
}

// Decompress a file ifp, output file ofp
int unpack(FILE *ifp, FILE *ofp)
{
   struct lz77wnd search;
   struct fgbuf iobuf;
   unsigned char *copybuf;

   char read;
   int c;
   unsigned int off, len;
   long cnt;

   copybuf = (unsigned char *) malloc(sizeof (unsigned char) * WNDSIZE);
   if(copybuf == NULL) {
      puts("unpack: Error on allocating memory\n");
      return -1;
   }

   if(wnd_init(&search) < 0)
      return -1;

   clear_fgbuf(&iobuf);

   // Decompression loop
   do {
      if((read = fill_fgbuf(&iobuf, ifp)) < 0)
         return -1;
      for(iobuf.fcnt = 0, iobuf.cnt = 0; iobuf.fcnt < 8 && iobuf.cnt < read; iobuf.fcnt++) {
         if(!is_bit_set(iobuf.fbyte, iobuf.fcnt)) {   // Is it a literal value?
            fputc(iobuf.bytes[iobuf.cnt], ofp);
            put_wnd_byte(&search, iobuf.bytes[iobuf.cnt]);  // Update buffer
            iobuf.cnt++;
         } else {
            triple_to_adr(&(iobuf.bytes[iobuf.cnt]), &off, &len);
            len++;   // Because len 0 doesn't occur, but 4096 does
            // Copy the bytes to output
            for(cnt = 0; cnt < len; cnt++) {
               c = get_wnd_byte(search, off+cnt);
               copybuf[cnt] = c;
               fputc(c, ofp);
               if(c < 0)
                  puts("Strange error 0. Maybe a program bug.\n");
            }
            for(cnt = 0; cnt < len; cnt++) {
               put_wnd_byte(&search, copybuf[cnt]);   // Update buffer
            }
            iobuf.cnt += 3;
         }
      }
   } while(read > 0);

   wnd_free(&search);
   free(copybuf);
   return 0;
}

Addidionally I wrote a short compression main program:
lz77pack.c
/***************************************
 * Copyright (C)2004 by Wiesner Thomas *
 ***************************************/

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include "lz77.h"

int main(int argc, char *argv[])
{
   FILE *ifp, *ofp;
   unsigned char *buf;

   if(argc != 2) {
      printf("Syntax: %s file\n", argv[0]);
      return -1;
   }

   if((ifp = fopen(argv[1], "rb")) == NULL) {
      printf("Unable to open input file\n");
      return -1;
   }

   buf = (unsigned char *) malloc(sizeof (unsigned char) * (strlen(argv[1])+4));
   if(buf == NULL) {
      puts("Unable to allocate memory\n");
      fclose(ifp);
      return -1;
   }

   strcpy(buf, argv[1]);
   strcat(buf, ".77");

   if((ofp = fopen(buf, "wb")) == NULL) {
      printf("Unable to open output file %s\n", buf);
      fclose(ifp);
      free(buf);
      return -1;
   }

   pack(ifp, ofp);

   fclose(ifp);
   fclose(ofp);
   free(buf);
   unlink(argv[1]);

   return 0;
}

And a short decompression main program:
lz77unpack.c
/***************************************
 * Copyright (C)2004 by Wiesner Thomas *
 ***************************************/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include "lz77.h"

void strip_ext(unsigned char *str)
{
   int i;
   for(i = strlen(str)-1; str[i] != '.' && i >= 0; i--)
      ;
   if(i > 0)
      str[i] = '\0';
}

int main(int argc, char *argv[])
{
   FILE *ifp, *ofp;
   unsigned char *buf;

   if(argc != 2) {
      printf("Syntax: %s file\n", argv[0]);
      return -1;
   }

   if((ifp = fopen(argv[1], "rb")) == NULL) {
      printf("Unable to open input file\n");
      return -1;
   }

   buf = (unsigned char *) malloc(sizeof (unsigned char) * (strlen(argv[1])+1));
   if(buf == NULL) {
      puts("Unable to allocate memory\n");
      fclose(ifp);
      return -1;
   }

   strcpy(buf, argv[1]);
   if(buf[strlen(buf)-1] != '7' || buf[strlen(buf)-2] != '7' || buf[strlen(buf)-3] != '.') {
      printf("File extension is not .77\n");
      fclose(ifp);
      free(buf);
      return -1;
   }
   strip_ext(buf);

   if((ofp = fopen(buf, "wb")) == NULL) {
      printf("Unable to open output file %s\n", buf);
      fclose(ifp);
      free(buf);
      return -1;
   }

   unpack(ifp, ofp);

   fclose(ifp);
   fclose(ofp);
   free(buf);
   unlink(argv[1]);
   return 0;
}