On the web you can find many sites describing different compression algorithms.
But you canīt find complete implementations that easy.
When I read something about different algorithms, I decided to inplement the LZ77 myself.
Note: The following programs are not performance tuned. Compressing a 25kB text file
with them takes about 4s on my AMD Barton 3000+. The reason is, that the search of matching
strings is done by brute force not with BTrees or hash tables (BTW: Some implementations about them
are patented.). The intention was to keep it simple.
Important: Iīve tested the whole compression and decompression
somewhat but donīt use it for any data you donīt want to loose. I wonīt be responsible for _ANY_
data loss caused by this source!
The header file used by the other source files:
| lz77.h |
/***************************************
* Copyright (C)2004 by Wiesner Thomas *
***************************************/
// Window size. Max. is 4096. Bigger gives better conpression
// but takes longer
#define WNDSIZE 4096
#define VERSION "Generic LZ77 compression suite V0.1"
// The structure for the sliding windows
struct lz77wnd {
unsigned char *dat; // The data
unsigned int len; // Buffer length
unsigned int pos; // Position (for internal handling)
};
// The structure for the flag byte handling
struct fgbuf {
unsigned char bytes[24]; // Max. number is all are triples is 24
unsigned char fbyte; // Flagbyte
unsigned char cnt; // Number of bytes in the buffer
unsigned char fcnt; // Flag count (for internal handling)
};
unsigned char is_bit_set(unsigned char byte, unsigned char bit_nr);
void triple_to_adr(unsigned char *triple, int *offset, int *cnt);
void adr_to_triple(unsigned char *triple, short offset, short cnt);
void inline clear_fgbuf(struct fgbuf *buf);
void inline add_byte_fgbuf(struct fgbuf *buf, unsigned char byte);
void add_ocp_fgbuf(struct fgbuf *buf, int offset, int cnt);
int flush_fgbuf(struct fgbuf *buf, FILE *fp);
int fill_fgbuf(struct fgbuf *buf, FILE *fp);
int inline get_wnd_byte(struct lz77wnd wnd, unsigned int off);
int inline put_wnd_byte(struct lz77wnd *wnd, unsigned int next_byte);
int wnd_init(struct lz77wnd *wnd);
void wnd_free(struct lz77wnd *wnd);
long find_match(struct lz77wnd buf, struct lz77wnd str, unsigned int len);
int shift_wnd(struct lz77wnd *wnd);
int pack(FILE *ifp, FILE *ofp);
int unpack(FILE *ifp, FILE *ofp);
|
The main source file which contains all necessary routines:
| lz77.c |
/***************************************
* Copyright (C)2004 by Wiesner Thomas *
***************************************/
#include <stdio.h>
#include <stdlib.h>
#include "lz77.h"
/* FILE FORMAT:
*
* .---.---.---.---.
* | F | D | D | D | ...
* '---'---'---'---'
*
* A Flagbyte, which determines wheter the next bytes are literal bytes
* or offset/count pairs.
* A 0-Bit in the Flagbyte indicates that the corresponding byte is a literal
* byte, a 1-Bit that this are 2 bytes containing and offset/count pair
*
* Example (Hex numbers):
* 9a 85 96 00 fb 5a 0d ff 01 62 6e 70 2c 22 20 6f
* The first byte (0x9a) is the flag byte. The binary value is:
* 01011001 (Read with LSB first)
* So 0x85, 0x96, 0x00 contains a offset/count pair
* 0xfb is a literal value
* 0x5a is a literal value
* 0x0d, 0xff, 0x01 contains a offset/count pair
* 0x62, 0x6e, 0x70 contains a offset/count pair
* 0x70 is a literal value
* 0x2c, 0x22, 0c20 contains a offset/count pair
* 0x6f is a literal value
*/
// Checks wheter a bit is set in a byte
unsigned char is_bit_set(unsigned char byte, unsigned char bit_nr)
{
if(((1 << bit_nr) & byte) > 0)
return 1;
return 0;
}
/* Converts a byte triple into address and offset
* The bytes are:
* Byte 0 Byte 1 Byte 2
* .-.-.-.-.-.-.-.-. .-.-.-.-.-.-.-.-. .-.-.-.-.-.-.-.-.
* '-'-'-'-'-'-'-'-' '-'-'-'-'-'-'-'-' '-'-'-'-'-'-'-'-'
* \___________ _________ __/ \___________ ____________/
* V V
* Offset Count
* Thus each 12 Bit
*/
void triple_to_adr(unsigned char *triple, int *offset, int *cnt)
{
unsigned char low_nib, high_nib;
*offset = triple[0];
high_nib = triple[1] >> 4;
low_nib = triple[1] & 0x0f;
*offset <<= 4;
*offset += high_nib;
*cnt = low_nib << 8;
*cnt += triple[2];
}
// Converts a address to a triple
void adr_to_triple(unsigned char *triple, short offset, short cnt)
{
triple[0] = offset >> 4;
triple[1] = offset & 0x000f; // Assign the lowest 4 Bit
triple[1] <<= 4;
triple[1] += cnt >> 8;
triple[2] = (cnt & 0x00ff); // Assign the lowert byte
}
// Clear the IO buffer
void inline clear_fgbuf(struct fgbuf *buf)
{
buf->fbyte = 0;
buf->cnt = 0;
buf->fcnt = 0;
}
// Adds a byte to the IO buffer
void inline add_byte_fgbuf(struct fgbuf *buf, unsigned char byte)
{
buf->bytes[buf->cnt] = byte;
buf->cnt++;
buf->fcnt++;
}
// Adds a offset/count pair to the buffer
void add_ocp_fgbuf(struct fgbuf *buf, int offset, int cnt)
{
adr_to_triple(&(buf->bytes[buf->cnt]), offset, cnt);
buf->cnt += 3;
buf->fbyte |= (1 << buf->fcnt); // Update falg byte
buf->fcnt++;
//printf("ocp: %d %d\n", offset, cnt);
}
// Flush the IO buffer to the file
int flush_fgbuf(struct fgbuf *buf, FILE *fp)
{
if(fwrite(&buf->fbyte, 1, 1, fp) != 1) { // Write the flag byte
puts("flush_fgbuf: Error on writing flag byte\n");
return -1;
}
if(fwrite(buf->bytes, buf->cnt, 1, fp) != 1) { // Write the data
puts("flush_fgbuf: Error on writing data block\n");
return -1;
}
clear_fgbuf(buf);
return 0;
}
// Fills an empty IO buffer with filedata
int fill_fgbuf(struct fgbuf *buf, FILE *fp)
{
int i, c;
clear_fgbuf(buf);
if(fread(&buf->fbyte, 1, 1, fp) != 1) {
return -2;
}
for(i = 0; i < 8; i++) { // Calculate the number of bytes
if(is_bit_set(buf->fbyte, i))
buf->cnt += 3; // 3 if it is a offset/count pair
else
buf->cnt++; // 1 if it is a literal value
}
for(i = 0; i < buf->cnt && (c = fgetc(fp)) != EOF; i++) {
buf->bytes[i] = c;
}
return i;
}
// Read a byte from the window
int inline get_wnd_byte(struct lz77wnd wnd, unsigned int off)
{
if(off >= wnd.len) { // Wrong offset
puts("get_wnd_byte: Offset too large\n");
return -1;
}
if(wnd.pos + off < wnd.len) // Do we have to wrap around the end?
return wnd.dat[wnd.pos + off];
return wnd.dat[wnd.pos + off - wnd.len];
}
// Write next byte into window. Returns -1 or the byte which got shifted out
int inline put_wnd_byte(struct lz77wnd *wnd, unsigned int next_byte)
{
int ret;
if(wnd->len < WNDSIZE) { // If max. len not reached, add byte at the end
wnd->dat[wnd->len] = next_byte;
wnd->len++;
ret = -1;
} else { // otherwise overwrite first byte
ret = wnd->dat[wnd->pos];
wnd->dat[wnd->pos] = next_byte;
}
wnd->pos++;
if(wnd->pos >= WNDSIZE)
wnd->pos = 0;
return ret;
}
// Moves the window read position without adding bytes
// (Only for handling the last bytes in buffer if EOF occured!)
int shift_wnd(struct lz77wnd *wnd)
{
/* if(wnd->pos+1 < wnd->len) {
wnd->pos++;
return wnd->dat[wnd->pos-1];
}
else // No more bytes
return -1;*/
if(wnd->pos+1 == wnd->len)
return -1;
if(wnd->pos+1 > wnd->len)
wnd->pos = 0;
wnd->pos++;
return wnd->dat[wnd->pos-1];
}
// Initialize window
int wnd_init(struct lz77wnd *wnd)
{
wnd->pos = 0;
wnd->len = 0;
wnd->dat = (unsigned char *) malloc(sizeof (unsigned char) * WNDSIZE);
if(wnd->dat == NULL) {
puts("wnd_init: Unable to allocate memory\n");
return -1;
}
return 0;
}
void wnd_free(struct lz77wnd *wnd)
{
wnd->pos = 0;
wnd->len = 0;
free(wnd->dat);
}
// Finds a string in the buffer buf. Returns offset
long find_match(struct lz77wnd buf, struct lz77wnd str, unsigned int len)
{
long buf_off;
long l;
unsigned int matches;
for(buf_off = 0; buf_off+len <= buf.len; buf_off++) { // In every buf pos.
matches = 0;
for(l = 0; l < len; l++) { // Look for the string
if(get_wnd_byte(buf, l+buf_off) == get_wnd_byte(str, l))
matches++;
}
if(matches == len) {
return buf_off;
}
}
return -1;
}
/*void putwnd(struct lz77wnd wnd)
{
int i;
for(i = 0; i < wnd.len; i++)
putchar(get_wnd_byte(wnd, i));
putchar('\n');
}*/
// Compress a file ifp, output file ofp
int pack(FILE *ifp, FILE *ofp)
{
struct lz77wnd lahead, search;
struct fgbuf iobuf;
long l, i;
int c;
unsigned int maxlen, findsize, written;
unsigned int bytes_left = 4096; // 4096 is very important
long maxlenoff = 0, off;
unsigned char eof = 0; // Wheter we had EOF
unsigned char done = 0;
if(wnd_init(&lahead) < 0 || wnd_init(&search) < 0)
return -1;
clear_fgbuf(&iobuf);
// Fill the look ahead buffer
for(l = 0, c = 0; l < WNDSIZE && (c = fgetc(ifp)) != EOF ; l++) {
put_wnd_byte(&lahead, c);
}
if(c == EOF) {
eof = 1;
bytes_left = lahead.len;
lahead.len = WNDSIZE; // Otherwise the bytes don't get shifted out
lahead.pos = 0;
}
// The compression loop
while(!done) {
// Find the biggest match
maxlen = 0;
off = 0;
for(findsize = 4; findsize <= WNDSIZE && off >= 0 && findsize < bytes_left; findsize++) {
off = find_match(search, lahead, findsize);
if(findsize > maxlen && off >= 0) {
maxlen = findsize;
maxlenoff = off;
}
}
// Is the match long enough to achieve compression?
if(maxlen > 3) {
// Add the offset/count pair to the output buffer. -1 beacuse 0
// doesn't occur, but 4096 does
add_ocp_fgbuf(&iobuf, maxlenoff, maxlen-1);
written = maxlen;
} else { // No. So output a literal value
// Add the byte to the output buffer
add_byte_fgbuf(&iobuf, get_wnd_byte(lahead, 0));
written = 1; // One byte written
}
// Is the output buffer full?
if(iobuf.fcnt >= 8) {
flush_fgbuf(&iobuf, ofp); // Flush the buffer to the ofp
}
// Move the look ahead buffer
for(i = 0; i < written; i++) {
if(!eof) { // No, we didn't have EOF
c = fgetc(ifp); // Read next byte
if(c != EOF) { // Did we have EOF now?
c = put_wnd_byte(&lahead, c); // Refill look ahead buffer
if(c < 0) // Should not happen!
puts("pack: Strange error 0. Maybe program bug?\n");
put_wnd_byte(&search, c); // Move the next byte into search bf.
} else {
eof = 1;
bytes_left = lahead.len;
}
}
if(eof) { // We had already EOF
c = put_wnd_byte(&lahead, 0);
if(bytes_left <= 1) { // we ran out of input data!
done = 1;
break; // Compression done
}
put_wnd_byte(&search, c); // Move next byte into search buffer
bytes_left--;
}
}
}
if(iobuf.cnt != 0)
flush_fgbuf(&iobuf, ofp);
wnd_free(&search);
wnd_free(&lahead);
return 0;
}
// Decompress a file ifp, output file ofp
int unpack(FILE *ifp, FILE *ofp)
{
struct lz77wnd search;
struct fgbuf iobuf;
unsigned char *copybuf;
char read;
int c;
unsigned int off, len;
long cnt;
copybuf = (unsigned char *) malloc(sizeof (unsigned char) * WNDSIZE);
if(copybuf == NULL) {
puts("unpack: Error on allocating memory\n");
return -1;
}
if(wnd_init(&search) < 0)
return -1;
clear_fgbuf(&iobuf);
// Decompression loop
do {
if((read = fill_fgbuf(&iobuf, ifp)) < 0)
return -1;
for(iobuf.fcnt = 0, iobuf.cnt = 0; iobuf.fcnt < 8 && iobuf.cnt < read; iobuf.fcnt++) {
if(!is_bit_set(iobuf.fbyte, iobuf.fcnt)) { // Is it a literal value?
fputc(iobuf.bytes[iobuf.cnt], ofp);
put_wnd_byte(&search, iobuf.bytes[iobuf.cnt]); // Update buffer
iobuf.cnt++;
} else {
triple_to_adr(&(iobuf.bytes[iobuf.cnt]), &off, &len);
len++; // Because len 0 doesn't occur, but 4096 does
// Copy the bytes to output
for(cnt = 0; cnt < len; cnt++) {
c = get_wnd_byte(search, off+cnt);
copybuf[cnt] = c;
fputc(c, ofp);
if(c < 0)
puts("Strange error 0. Maybe a program bug.\n");
}
for(cnt = 0; cnt < len; cnt++) {
put_wnd_byte(&search, copybuf[cnt]); // Update buffer
}
iobuf.cnt += 3;
}
}
} while(read > 0);
wnd_free(&search);
free(copybuf);
return 0;
}
|
Addidionally I wrote a short compression main program:
| lz77pack.c |
/***************************************
* Copyright (C)2004 by Wiesner Thomas *
***************************************/
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include "lz77.h"
int main(int argc, char *argv[])
{
FILE *ifp, *ofp;
unsigned char *buf;
if(argc != 2) {
printf("Syntax: %s file\n", argv[0]);
return -1;
}
if((ifp = fopen(argv[1], "rb")) == NULL) {
printf("Unable to open input file\n");
return -1;
}
buf = (unsigned char *) malloc(sizeof (unsigned char) * (strlen(argv[1])+4));
if(buf == NULL) {
puts("Unable to allocate memory\n");
fclose(ifp);
return -1;
}
strcpy(buf, argv[1]);
strcat(buf, ".77");
if((ofp = fopen(buf, "wb")) == NULL) {
printf("Unable to open output file %s\n", buf);
fclose(ifp);
free(buf);
return -1;
}
pack(ifp, ofp);
fclose(ifp);
fclose(ofp);
free(buf);
unlink(argv[1]);
return 0;
}
|
And a short decompression main program:
| lz77unpack.c |
/***************************************
* Copyright (C)2004 by Wiesner Thomas *
***************************************/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include "lz77.h"
void strip_ext(unsigned char *str)
{
int i;
for(i = strlen(str)-1; str[i] != '.' && i >= 0; i--)
;
if(i > 0)
str[i] = '\0';
}
int main(int argc, char *argv[])
{
FILE *ifp, *ofp;
unsigned char *buf;
if(argc != 2) {
printf("Syntax: %s file\n", argv[0]);
return -1;
}
if((ifp = fopen(argv[1], "rb")) == NULL) {
printf("Unable to open input file\n");
return -1;
}
buf = (unsigned char *) malloc(sizeof (unsigned char) * (strlen(argv[1])+1));
if(buf == NULL) {
puts("Unable to allocate memory\n");
fclose(ifp);
return -1;
}
strcpy(buf, argv[1]);
if(buf[strlen(buf)-1] != '7' || buf[strlen(buf)-2] != '7' || buf[strlen(buf)-3] != '.') {
printf("File extension is not .77\n");
fclose(ifp);
free(buf);
return -1;
}
strip_ext(buf);
if((ofp = fopen(buf, "wb")) == NULL) {
printf("Unable to open output file %s\n", buf);
fclose(ifp);
free(buf);
return -1;
}
unpack(ifp, ofp);
fclose(ifp);
fclose(ofp);
free(buf);
unlink(argv[1]);
return 0;
}
|