src/libflate/deflate.c - plan9 - Git at Google

 #include <u.h>
 #include <libc.h>
 #include <flate.h>

 typedef struct Chain	Chain;
 typedef struct Chains	Chains;
 typedef struct Dyncode	Dyncode;
 typedef struct Huff	Huff;
 typedef struct LZblock	LZblock;
 typedef struct LZstate	LZstate;

 enum
 {
 	/*
 	 * deflate format paramaters
 	 */
 	DeflateUnc	= 0,			/* uncompressed block */
 	DeflateFix	= 1,			/* fixed huffman codes */
 	DeflateDyn	= 2,			/* dynamic huffman codes */

 	DeflateEob	= 256,			/* end of block code in lit/len book */
 	DeflateMaxBlock	= 64*1024-1,		/* maximum size of uncompressed block */

 	DeflateMaxExp	= 10,			/* maximum expansion for a block */

 	LenStart	= 257,			/* start of length codes in litlen */
 	Nlitlen		= 288,			/* number of litlen codes */
 	Noff		= 30,			/* number of offset codes */
 	Nclen		= 19,			/* number of codelen codes */

 	MaxOff		= 32*1024,
 	MinMatch	= 3,			/* shortest match possible */
 	MaxMatch	= 258,			/* longest match possible */

 	/*
 	 * huffman code paramaters
 	 */
 	MaxLeaf		= Nlitlen,
 	MaxHuffBits	= 16,			/* max bits in a huffman code */
 	ChainMem	= 2 * (MaxHuffBits - 1) * MaxHuffBits,

 	/*
 	 * coding of the lz parse
 	 */
 	LenFlag		= 1 << 3,
 	LenShift	= 4,			/* leaves enough space for MinMatchMaxOff */
 	MaxLitRun	= LenFlag - 1,

 	/*
 	 * internal lz paramaters
 	 */
 	DeflateOut	= 4096,			/* output buffer size */
 	BlockSize	= 8192,			/* attempted input read quanta */
 	DeflateBlock	= DeflateMaxBlock & ~(BlockSize - 1),
 	MinMatchMaxOff	= 4096,			/* max profitable offset for small match;
 						 * assumes 8 bits for len, 5+10 for offset
 						 * DONT CHANGE WITHOUT CHANGING LZPARSE CONSTANTS
 						 */
 	HistSlop	= 512,			/* must be at lead MaxMatch */
 	HistBlock	= 64*1024,
 	HistSize	= HistBlock + HistSlop,

 	HashLog		= 13,
 	HashSize	= 1<<HashLog,

 	MaxOffCode	= 256,			/* biggest offset looked up in direct table */

 	EstLitBits	= 8,
 	EstLenBits	= 4,
 	EstOffBits	= 5
 };

 /*
  * knuth vol. 3 multiplicative hashing
  * each byte x chosen according to rules
  * 1/4 < x < 3/10, 1/3 x < < 3/7, 4/7 < x < 2/3, 7/10 < x < 3/4
  * with reasonable spread between the bytes & their complements
  *
  * the 3 byte value appears to be as almost good as the 4 byte value,
  * and might be faster on some machines
  */
 /*
 #define hashit(c)	((u32int)((c) * 0x6b43a9) >> (24 - HashLog))
 */
 #define hashit(c)	((u32int)(((c) & 0xffffff) * 0x6b43a9b5) >> (32 - HashLog))

 /*
  * lempel-ziv style compression state
  */
 struct LZstate
 {
 	uchar	hist[HistSize];
 	ulong	pos;				/* current location in history buffer */
 	ulong	avail;				/* data available after pos */
 	int	eof;
 	ushort	hash[HashSize];			/* hash chains */
 	ushort	nexts[MaxOff];
 	int	now;				/* pos in hash chains */
 	int	dot;				/* dawn of time in history */
 	int	prevlen;			/* lazy matching state */
 	int	prevoff;
 	int	maxcheck;			/* compressor tuning */

 	uchar	obuf[DeflateOut];
 	uchar	*out;				/* current position in the output buffer */
 	uchar	*eout;
 	ulong	bits;				/* bit shift register */
 	int	nbits;
 	int	rbad;				/* got an error reading the buffer */
 	int	wbad;				/* got an error writing the buffer */
 	int	(*w)(void*, void*, int);
 	void	*wr;

 	ulong	totr;				/* total input size */
 	ulong	totw;				/* total output size */
 	int	debug;
 };

 struct LZblock
 {
 	ushort	parse[DeflateMaxBlock / 2 + 1];
 	int	lastv;				/* value being constucted for parse */
 	ulong	litlencount[Nlitlen];
 	ulong	offcount[Noff];
 	ushort	*eparse;			/* limit for parse table */
 	int	bytes;				/* consumed from the input */
 	int	excost;				/* cost of encoding extra len & off bits */
 };

 /*
  * huffman code table
  */
 struct Huff
 {
 	short	bits;				/* length of the code */
 	ushort	encode;				/* the code */
 };

 /*
  * encoding of dynamic huffman trees
  */
 struct Dyncode
 {
 	int	nlit;
 	int	noff;
 	int	nclen;
 	int	ncode;
 	Huff	codetab[Nclen];
 	uchar	codes[Nlitlen+Noff];
 	uchar	codeaux[Nlitlen+Noff];
 };

 static	int	deflateb(LZstate *lz, LZblock *lzb, void *rr, int (*r)(void*, void*, int));
 static	int	lzcomp(LZstate*, LZblock*, uchar*, ushort*, int finish);
 static	void	wrblock(LZstate*, int, ushort*, ushort*, Huff*, Huff*);
 static	int	bitcost(Huff*, ulong*, int);
 static	int	huffcodes(Dyncode*, Huff*, Huff*);
 static	void	wrdyncode(LZstate*, Dyncode*);
 static	void	lzput(LZstate*, ulong bits, int nbits);
 static	void	lzflushbits(LZstate*);
 static	void	lzflush(LZstate *lz);
 static	void	lzwrite(LZstate *lz, void *buf, int n);

 static	int	hufftabinit(Huff*, int, ulong*, int);
 static	int	mkgzprecode(Huff*, ulong *, int, int);

 static	int	mkprecode(Huff*, ulong *, int, int, ulong*);
 static	void	nextchain(Chains*, int);
 static	void	leafsort(ulong*, ushort*, int, int);

 /* conversion from len to code word */
 static int lencode[MaxMatch];

 /*
  * conversion from off to code word
  * off <= MaxOffCode ? offcode[off] : bigoffcode[off >> 7]
 */
 static int offcode[MaxOffCode];
 static int bigoffcode[256];

 /* litlen code words LenStart-285 extra bits */
 static int litlenbase[Nlitlen-LenStart];
 static int litlenextra[Nlitlen-LenStart] =
 {
 /* 257 */	0, 0, 0,
 /* 260 */	0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
 /* 270 */	2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
 /* 280 */	4, 5, 5, 5, 5, 0, 0, 0
 };

 /* offset code word extra bits */
 static int offbase[Noff];
 static int offextra[] =
 {
 	0,  0,  0,  0,  1,  1,  2,  2,  3,  3,
 	4,  4,  5,  5,  6,  6,  7,  7,  8,  8,
 	9,  9,  10, 10, 11, 11, 12, 12, 13, 13,
 	0,  0,
 };

 /* order code lengths */
 static int clenorder[Nclen] =
 {
         16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
 };

 /* static huffman tables */
 static	Huff	litlentab[Nlitlen];
 static	Huff	offtab[Noff];
 static	Huff	hofftab[Noff];

 /* bit reversal for brain dead endian swap in huffman codes */
 static	uchar	revtab[256];
 static	ulong	nlits;
 static	ulong	nmatches;

 int
 deflateinit(void)
 {
 	ulong bitcount[MaxHuffBits];
 	int i, j, ci, n;

 	/* byte reverse table */
 	for(i=0; i<256; i++)
 		for(j=0; j<8; j++)
 			if(i & (1<<j))
 				revtab[i] |= 0x80 >> j;

 	/* static Litlen bit lengths */
 	for(i=0; i<144; i++)
 		litlentab[i].bits = 8;
 	for(i=144; i<256; i++)
 		litlentab[i].bits = 9;
 	for(i=256; i<280; i++)
 		litlentab[i].bits = 7;
 	for(i=280; i<Nlitlen; i++)
 		litlentab[i].bits = 8;

 	memset(bitcount, 0, sizeof(bitcount));
 	bitcount[8] += 144 - 0;
 	bitcount[9] += 256 - 144;
 	bitcount[7] += 280 - 256;
 	bitcount[8] += Nlitlen - 280;

 	if(!hufftabinit(litlentab, Nlitlen, bitcount, 9))
 		return FlateInternal;

 	/* static offset bit lengths */
 	for(i = 0; i < Noff; i++)
 		offtab[i].bits = 5;

 	memset(bitcount, 0, sizeof(bitcount));
 	bitcount[5] = Noff;

 	if(!hufftabinit(offtab, Noff, bitcount, 5))
 		return FlateInternal;

 	bitcount[0] = 0;
 	bitcount[1] = 0;
 	if(!mkgzprecode(hofftab, bitcount, 2, MaxHuffBits))
 		return FlateInternal;

 	/* conversion tables for lens & offs to codes */
 	ci = 0;
 	for(i = LenStart; i < 286; i++){
 		n = ci + (1 << litlenextra[i - LenStart]);
 		litlenbase[i - LenStart] = ci;
 		for(; ci < n; ci++)
 			lencode[ci] = i;
 	}
 	/* patch up special case for len MaxMatch */
 	lencode[MaxMatch-MinMatch] = 285;
 	litlenbase[285-LenStart] = MaxMatch-MinMatch;

 	ci = 0;
 	for(i = 0; i < 16; i++){
 		n = ci + (1 << offextra[i]);
 		offbase[i] = ci;
 		for(; ci < n; ci++)
 			offcode[ci] = i;
 	}

 	ci = ci >> 7;
 	for(; i < 30; i++){
 		n = ci + (1 << (offextra[i] - 7));
 		offbase[i] = ci << 7;
 		for(; ci < n; ci++)
 			bigoffcode[ci] = i;
 	}
 	return FlateOk;
 }

 static void
 deflatereset(LZstate *lz, int level, int debug)
 {
 	memset(lz->nexts, 0, sizeof lz->nexts);
 	memset(lz->hash, 0, sizeof lz->hash);
 	lz->totr = 0;
 	lz->totw = 0;
 	lz->pos = 0;
 	lz->avail = 0;
 	lz->out = lz->obuf;
 	lz->eout = &lz->obuf[DeflateOut];
 	lz->prevlen = MinMatch - 1;
 	lz->prevoff = 0;
 	lz->now = MaxOff + 1;
 	lz->dot = lz->now;
 	lz->bits = 0;
 	lz->nbits = 0;
 	lz->maxcheck = (1 << level);
 	lz->maxcheck -= lz->maxcheck >> 2;
 	if(lz->maxcheck < 2)
 		lz->maxcheck = 2;
 	else if(lz->maxcheck > 1024)
 		lz->maxcheck = 1024;

 	lz->debug = debug;
 }

 int
 deflate(void *wr, int (*w)(void*, void*, int), void *rr, int (*r)(void*, void*, int), int level, int debug)
 {
 	LZstate *lz;
 	LZblock *lzb;
 	int ok;

 	lz = malloc(sizeof *lz + sizeof *lzb);
 	if(lz == nil)
 		return FlateNoMem;
 	lzb = (LZblock*)&lz[1];

 	deflatereset(lz, level, debug);
 	lz->w = w;
 	lz->wr = wr;
 	lz->wbad = 0;
 	lz->rbad = 0;
 	lz->eof = 0;
 	ok = FlateOk;
 	while(!lz->eof || lz->avail){
 		ok = deflateb(lz, lzb, rr, r);
 		if(ok != FlateOk)
 			break;
 	}
 	if(ok == FlateOk && lz->rbad)
 		ok = FlateInputFail;
 	if(ok == FlateOk && lz->wbad)
 		ok = FlateOutputFail;
 	free(lz);
 	return ok;
 }

 static int
 deflateb(LZstate *lz, LZblock *lzb, void *rr, int (*r)(void*, void*, int))
 {
 	Dyncode dyncode, hdyncode;
 	Huff dlitlentab[Nlitlen], dofftab[Noff], hlitlentab[Nlitlen];
 	ulong litcount[Nlitlen];
 	long nunc, ndyn, nfix, nhuff;
 	uchar *slop, *hslop;
 	ulong ep;
 	int i, n, m, mm, nslop;

 	memset(lzb->litlencount, 0, sizeof lzb->litlencount);
 	memset(lzb->offcount, 0, sizeof lzb->offcount);
 	lzb->litlencount[DeflateEob]++;

 	lzb->bytes = 0;
 	lzb->eparse = lzb->parse;
 	lzb->lastv = 0;
 	lzb->excost = 0;

 	slop = &lz->hist[lz->pos];
 	n = lz->avail;
 	while(n < DeflateBlock && (!lz->eof || lz->avail)){
 		/*
 		 * fill the buffer as much as possible,
 		 * while leaving room for MaxOff history behind lz->pos,
 		 * and not reading more than we can handle.
 		 *
 		 * make sure we read at least HistSlop bytes.
 		 */
 		if(!lz->eof){
 			ep = lz->pos + lz->avail;
 			if(ep >= HistBlock)
 				ep -= HistBlock;
 			m = HistBlock - MaxOff - lz->avail;
 			if(m > HistBlock - n)
 				m = HistBlock - n;
 			if(m > (HistBlock + HistSlop) - ep)
 				m = (HistBlock + HistSlop) - ep;
 			if(m & ~(BlockSize - 1))
 				m &= ~(BlockSize - 1);

 			/*
 			 * be nice to the caller: stop reads that are too small.
 			 * can only get here when we've already filled the buffer some
 			 */
 			if(m < HistSlop){
 				if(!m || !lzb->bytes)
 					return FlateInternal;
 				break;
 			}

 			mm = (*r)(rr, &lz->hist[ep], m);
 			if(mm > 0){
 				/*
 				 * wrap data to end if we're read it from the beginning
 				 * this way, we don't have to wrap searches.
 				 *
 				 * wrap reads past the end to the beginning.
 				 * this way, we can guarantee minimum size reads.
 				 */
 				if(ep < HistSlop)
 					memmove(&lz->hist[ep + HistBlock], &lz->hist[ep], HistSlop - ep);
 				else if(ep + mm > HistBlock)
 					memmove(&lz->hist[0], &lz->hist[HistBlock], ep + mm - HistBlock);

 				lz->totr += mm;
 				n += mm;
 				lz->avail += mm;
 			}else{
 				if(mm < 0)
 					lz->rbad = 1;
 				lz->eof = 1;
 			}
 		}
 		ep = lz->pos + lz->avail;
 		if(ep > HistSize)
 			ep = HistSize;
 		if(lzb->bytes + ep - lz->pos > DeflateMaxBlock)
 			ep = lz->pos + DeflateMaxBlock - lzb->bytes;
 		m = lzcomp(lz, lzb, &lz->hist[ep], lzb->eparse, lz->eof);
 		lzb->bytes += m;
 		lz->pos = (lz->pos + m) & (HistBlock - 1);
 		lz->avail -= m;
 	}
 	if(lzb->lastv)
 		*lzb->eparse++ = lzb->lastv;
 	if(lzb->eparse > lzb->parse + nelem(lzb->parse))
 		return FlateInternal;
 	nunc = lzb->bytes;

 	if(!mkgzprecode(dlitlentab, lzb->litlencount, Nlitlen, MaxHuffBits)
 	|| !mkgzprecode(dofftab, lzb->offcount, Noff, MaxHuffBits))
 		return FlateInternal;

 	ndyn = huffcodes(&dyncode, dlitlentab, dofftab);
 	if(ndyn < 0)
 		return FlateInternal;
 	ndyn += bitcost(dlitlentab, lzb->litlencount, Nlitlen)
 		+ bitcost(dofftab, lzb->offcount, Noff)
 		+ lzb->excost;

 	memset(litcount, 0, sizeof litcount);

 	nslop = nunc;
 	if(nslop > &lz->hist[HistSize] - slop)
 		nslop = &lz->hist[HistSize] - slop;

 	for(i = 0; i < nslop; i++)
 		litcount[slop[i]]++;
 	hslop = &lz->hist[HistSlop - nslop];
 	for(; i < nunc; i++)
 		litcount[hslop[i]]++;
 	litcount[DeflateEob]++;

 	if(!mkgzprecode(hlitlentab, litcount, Nlitlen, MaxHuffBits))
 		return FlateInternal;
 	nhuff = huffcodes(&hdyncode, hlitlentab, hofftab);
 	if(nhuff < 0)
 		return FlateInternal;
 	nhuff += bitcost(hlitlentab, litcount, Nlitlen);

 	nfix = bitcost(litlentab, lzb->litlencount, Nlitlen)
 		+ bitcost(offtab, lzb->offcount, Noff)
 		+ lzb->excost;

 	lzput(lz, lz->eof && !lz->avail, 1);

 	if(lz->debug){
 		fprint(2, "block: bytes=%lud entries=%ld extra bits=%d\n\tuncompressed=%lud fixed=%lud dynamic=%lud huffman=%lud\n",
 			nunc, lzb->eparse - lzb->parse, lzb->excost, (nunc + 4) * 8, nfix, ndyn, nhuff);
 		fprint(2, "\tnlit=%lud matches=%lud eof=%d\n", nlits, nmatches, lz->eof && !lz->avail);
 	}

 	if((nunc + 4) * 8 < ndyn && (nunc + 4) * 8 < nfix && (nunc + 4) * 8 < nhuff){
 		lzput(lz, DeflateUnc, 2);
 		lzflushbits(lz);

 		lzput(lz, nunc & 0xff, 8);
 		lzput(lz, (nunc >> 8) & 0xff, 8);
 		lzput(lz, ~nunc & 0xff, 8);
 		lzput(lz, (~nunc >> 8) & 0xff, 8);
 		lzflush(lz);

 		lzwrite(lz, slop, nslop);
 		lzwrite(lz, &lz->hist[HistSlop], nunc - nslop);
 	}else if(ndyn < nfix && ndyn < nhuff){
 		lzput(lz, DeflateDyn, 2);

 		wrdyncode(lz, &dyncode);
 		wrblock(lz, slop - lz->hist, lzb->parse, lzb->eparse, dlitlentab, dofftab);
 		lzput(lz, dlitlentab[DeflateEob].encode, dlitlentab[DeflateEob].bits);
 	}else if(nhuff < nfix){
 		lzput(lz, DeflateDyn, 2);

 		wrdyncode(lz, &hdyncode);

 		m = 0;
 		for(i = nunc; i > MaxLitRun; i -= MaxLitRun)
 			lzb->parse[m++] = MaxLitRun;
 		lzb->parse[m++] = i;

 		wrblock(lz, slop - lz->hist, lzb->parse, lzb->parse + m, hlitlentab, hofftab);
 		lzput(lz, hlitlentab[DeflateEob].encode, hlitlentab[DeflateEob].bits);
 	}else{
 		lzput(lz, DeflateFix, 2);

 		wrblock(lz, slop - lz->hist, lzb->parse, lzb->eparse, litlentab, offtab);
 		lzput(lz, litlentab[DeflateEob].encode, litlentab[DeflateEob].bits);
 	}

 	if(lz->eof && !lz->avail){
 		lzflushbits(lz);
 		lzflush(lz);
 	}
 	return FlateOk;
 }

 static void
 lzwrite(LZstate *lz, void *buf, int n)
 {
 	int nw;

 	if(n && lz->w){
 		nw = (*lz->w)(lz->wr, buf, n);
 		if(nw != n){
 			lz->w = 0;
 			lz->wbad = 1;
 		}else
 			lz->totw += n;
 	}
 }

 static void
 lzflush(LZstate *lz)
 {
 	lzwrite(lz, lz->obuf, lz->out - lz->obuf);
 	lz->out = lz->obuf;
 }

 static void
 lzput(LZstate *lz, ulong bits, int nbits)
 {
 	bits = (bits << lz->nbits) | lz->bits;
 	for(nbits += lz->nbits; nbits >= 8; nbits -= 8){
 		*lz->out++ = bits;
 		if(lz->out == lz->eout)
 			lzflush(lz);
 		bits >>= 8;
 	}
 	lz->bits = bits;
 	lz->nbits = nbits;
 }

 static void
 lzflushbits(LZstate *lz)
 {
 	if(lz->nbits)
 		lzput(lz, 0, 8 - (lz->nbits & 7));
 }

 /*
  * write out a block of n samples,
  * given lz encoding and counts for huffman tables
  */
 static void
 wrblock(LZstate *out, int litoff, ushort *soff, ushort *eoff, Huff *litlentab, Huff *offtab)
 {
 	ushort *off;
 	int i, run, offset, lit, len, c;

 	if(out->debug > 2){
 		for(off = soff; off < eoff; ){
 			offset = *off++;
 			run = offset & MaxLitRun;
 			if(run){
 				for(i = 0; i < run; i++){
 					lit = out->hist[litoff & (HistBlock - 1)];
 					litoff++;
 					fprint(2, "\tlit %.2ux %c\n", lit, lit);
 				}
 				if(!(offset & LenFlag))
 					continue;
 				len = offset >> LenShift;
 				offset = *off++;
 			}else if(offset & LenFlag){
 				len = offset >> LenShift;
 				offset = *off++;
 			}else{
 				len = 0;
 				offset >>= LenShift;
 			}
 			litoff += len + MinMatch;
 			fprint(2, "\t<%d, %d>\n", offset + 1, len + MinMatch);
 		}
 	}

 	for(off = soff; off < eoff; ){
 		offset = *off++;
 		run = offset & MaxLitRun;
 		if(run){
 			for(i = 0; i < run; i++){
 				lit = out->hist[litoff & (HistBlock - 1)];
 				litoff++;
 				lzput(out, litlentab[lit].encode, litlentab[lit].bits);
 			}
 			if(!(offset & LenFlag))
 				continue;
 			len = offset >> LenShift;
 			offset = *off++;
 		}else if(offset & LenFlag){
 			len = offset >> LenShift;
 			offset = *off++;
 		}else{
 			len = 0;
 			offset >>= LenShift;
 		}
 		litoff += len + MinMatch;
 		c = lencode[len];
 		lzput(out, litlentab[c].encode, litlentab[c].bits);
 		c -= LenStart;
 		if(litlenextra[c])
 			lzput(out, len - litlenbase[c], litlenextra[c]);

 		if(offset < MaxOffCode)
 			c = offcode[offset];
 		else
 			c = bigoffcode[offset >> 7];
 		lzput(out, offtab[c].encode, offtab[c].bits);
 		if(offextra[c])
 			lzput(out, offset - offbase[c], offextra[c]);
 	}
 }

 /*
  * look for the longest, closest string which matches
  * the next prefix.  the clever part here is looking for
  * a string 1 longer than the previous best match.
  *
  * follows the recommendation of limiting number of chains
  * which are checked.  this appears to be the best heuristic.
  */
 static int
 lzmatch(int now, int then, uchar *p, uchar *es, ushort *nexts, uchar *hist, int runlen, int check, int *m)
 {
 	uchar *s, *t;
 	int ml, off, last;

 	ml = check;
 	if(runlen >= 8)
 		check >>= 2;
 	*m = 0;
 	if(p + runlen >= es)
 		return runlen;
 	last = 0;
 	for(; check-- > 0; then = nexts[then & (MaxOff-1)]){
 		off = (ushort)(now - then);
 		if(off <= last || off > MaxOff)
 			break;
 		s = p + runlen;
 		t = hist + (((p - hist) - off) & (HistBlock-1));
 		t += runlen;
 		for(; s >= p; s--){
 			if(*s != *t)
 				goto matchloop;
 			t--;
 		}

 		/*
 		 * we have a new best match.
 		 * extend it to it's maximum length
 		 */
 		t += runlen + 2;
 		s += runlen + 2;
 		for(; s < es; s++){
 			if(*s != *t)
 				break;
 			t++;
 		}
 		runlen = s - p;
 		*m = off - 1;
 		if(s == es || runlen > ml)
 			break;
 matchloop:;
 		last = off;
 	}
 	return runlen;
 }

 static int
 lzcomp(LZstate *lz, LZblock *lzb, uchar *ep, ushort *parse, int finish)
 {
 	ulong cont, excost, *litlencount, *offcount;
 	uchar *p, *q, *s, *es;
 	ushort *nexts, *hash;
 	int v, i, h, runlen, n, now, then, m, prevlen, prevoff, maxdefer;

 	litlencount = lzb->litlencount;
 	offcount = lzb->offcount;
 	nexts = lz->nexts;
 	hash = lz->hash;
 	now = lz->now;

 	p = &lz->hist[lz->pos];
 	if(lz->prevlen != MinMatch - 1)
 		p++;

 	/*
 	 * hash in the links for any hanging link positions,
 	 * and calculate the hash for the current position.
 	 */
 	n = MinMatch;
 	if(n > ep - p)
 		n = ep - p;
 	cont = 0;
 	for(i = 0; i < n - 1; i++){
 		m = now - ((MinMatch-1) - i);
 		if(m < lz->dot)
 			continue;
 		s = lz->hist + (((p - lz->hist) - (now - m)) & (HistBlock-1));

 		cont = (s[0] << 16) | (s[1] << 8) | s[2];
 		h = hashit(cont);
 		prevoff = 0;
 		for(then = hash[h]; ; then = nexts[then & (MaxOff-1)]){
 			v = (ushort)(now - then);
 			if(v <= prevoff || v >= (MinMatch-1) - i)
 				break;
 			prevoff = v;
 		}
 		if(then == (ushort)m)
 			continue;
 		nexts[m & (MaxOff-1)] = hash[h];
 		hash[h] = m;
 	}
 	for(i = 0; i < n; i++)
 		cont = (cont << 8) | p[i];

 	/*
 	 * now must point to the index in the nexts array
 	 * corresponding to p's position in the history
 	 */
 	prevlen = lz->prevlen;
 	prevoff = lz->prevoff;
 	maxdefer = lz->maxcheck >> 2;
 	excost = 0;
 	v = lzb->lastv;
 	for(;;){
 		es = p + MaxMatch;
 		if(es > ep){
 			if(!finish || p >= ep)
 				break;
 			es = ep;
 		}

 		h = hashit(cont);
 		runlen = lzmatch(now, hash[h], p, es, nexts, lz->hist, prevlen, lz->maxcheck, &m);

 		/*
 		 * back out of small matches too far in the past
 		 */
 		if(runlen == MinMatch && m >= MinMatchMaxOff){
 			runlen = MinMatch - 1;
 			m = 0;
 		}

 		/*
 		 * record the encoding and increment counts for huffman trees
 		 * if we get a match, defer selecting it until we check for
 		 * a longer match at the next position.
 		 */
 		if(prevlen >= runlen && prevlen != MinMatch - 1){
 			/*
 			 * old match at least as good; use that one
 			 */
 			n = prevlen - MinMatch;
 			if(v || n){
 				*parse++ = v | LenFlag | (n << LenShift);
 				*parse++ = prevoff;
 			}else
 				*parse++ = prevoff << LenShift;
 			v = 0;

 			n = lencode[n];
 			litlencount[n]++;
 			excost += litlenextra[n - LenStart];

 			if(prevoff < MaxOffCode)
 				n = offcode[prevoff];
 			else
 				n = bigoffcode[prevoff >> 7];
 			offcount[n]++;
 			excost += offextra[n];

 			runlen = prevlen - 1;
 			prevlen = MinMatch - 1;
 			nmatches++;
 		}else if(runlen == MinMatch - 1){
 			/*
 			 * no match; just put out the literal
 			 */
 			if(++v == MaxLitRun){
 				*parse++ = v;
 				v = 0;
 			}
 			litlencount[*p]++;
 			nlits++;
 			runlen = 1;
 		}else{
 			if(prevlen != MinMatch - 1){
 				/*
 				 * longer match now. output previous literal,
 				 * update current match, and try again
 				 */
 				if(++v == MaxLitRun){
 					*parse++ = v;
 					v = 0;
 				}
 				litlencount[p[-1]]++;
 				nlits++;
 			}

 			prevoff = m;

 			if(runlen < maxdefer){
 				prevlen = runlen;
 				runlen = 1;
 			}else{
 				n = runlen - MinMatch;
 				if(v || n){
 					*parse++ = v | LenFlag | (n << LenShift);
 					*parse++ = prevoff;
 				}else
 					*parse++ = prevoff << LenShift;
 				v = 0;

 				n = lencode[n];
 				litlencount[n]++;
 				excost += litlenextra[n - LenStart];

 				if(prevoff < MaxOffCode)
 					n = offcode[prevoff];
 				else
 					n = bigoffcode[prevoff >> 7];
 				offcount[n]++;
 				excost += offextra[n];

 				prevlen = MinMatch - 1;
 				nmatches++;
 			}
 		}

 		/*
 		 * update the hash for the newly matched data
 		 * this is constructed so the link for the old
 		 * match in this position must be at the end of a chain,
 		 * and will expire when this match is added, ie it will
 		 * never be examined by the match loop.
 		 * add to the hash chain only if we have the real hash data.
 		 */
 		for(q = p + runlen; p != q; p++){
 			if(p + MinMatch <= ep){
 				h = hashit(cont);
 				nexts[now & (MaxOff-1)] = hash[h];
 				hash[h] = now;
 				if(p + MinMatch < ep)
 					cont = (cont << 8) | p[MinMatch];
 			}
 			now++;
 		}
 	}

 	/*
 	 * we can just store away the lazy state and
 	 * pick it up next time.  the last block will have finish set
 	 * so we won't have any pending matches
 	 * however, we need to correct for how much we've encoded
 	 */
 	if(prevlen != MinMatch - 1)
 		p--;

 	lzb->excost += excost;
 	lzb->eparse = parse;
 	lzb->lastv = v;

 	lz->now = now;
 	lz->prevlen = prevlen;
 	lz->prevoff = prevoff;

 	return p - &lz->hist[lz->pos];
 }

 /*
  * make up the dynamic code tables, and return the number of bits
  * needed to transmit them.
  */
 static int
 huffcodes(Dyncode *dc, Huff *littab, Huff *offtab)
 {
 	Huff *codetab;
 	uchar *codes, *codeaux;
 	ulong codecount[Nclen], excost;
 	int i, n, m, v, c, nlit, noff, ncode, nclen;

 	codetab = dc->codetab;
 	codes = dc->codes;
 	codeaux = dc->codeaux;

 	/*
 	 * trim the sizes of the tables
 	 */
 	for(nlit = Nlitlen; nlit > 257 && littab[nlit-1].bits == 0; nlit--)
 		;
 	for(noff = Noff; noff > 1 && offtab[noff-1].bits == 0; noff--)
 		;

 	/*
 	 * make the code-length code
 	 */
 	for(i = 0; i < nlit; i++)
 		codes[i] = littab[i].bits;
 	for(i = 0; i < noff; i++)
 		codes[i + nlit] = offtab[i].bits;

 	/*
 	 * run-length compress the code-length code
 	 */
 	excost = 0;
 	c = 0;
 	ncode = nlit+noff;
 	for(i = 0; i < ncode; ){
 		n = i + 1;
 		v = codes[i];
 		while(n < ncode && v == codes[n])
 			n++;
 		n -= i;
 		i += n;
 		if(v == 0){
 			while(n >= 11){
 				m = n;
 				if(m > 138)
 					m = 138;
 				codes[c] = 18;
 				codeaux[c++] = m - 11;
 				n -= m;
 				excost += 7;
 			}
 			if(n >= 3){
 				codes[c] = 17;
 				codeaux[c++] = n - 3;
 				n = 0;
 				excost += 3;
 			}
 		}
 		while(n--){
 			codes[c++] = v;
 			while(n >= 3){
 				m = n;
 				if(m > 6)
 					m = 6;
 				codes[c] = 16;
 				codeaux[c++] = m - 3;
 				n -= m;
 				excost += 3;
 			}
 		}
 	}

 	memset(codecount, 0, sizeof codecount);
 	for(i = 0; i < c; i++)
 		codecount[codes[i]]++;
 	if(!mkgzprecode(codetab, codecount, Nclen, 8))
 		return -1;

 	for(nclen = Nclen; nclen > 4 && codetab[clenorder[nclen-1]].bits == 0; nclen--)
 		;

 	dc->nlit = nlit;
 	dc->noff = noff;
 	dc->nclen = nclen;
 	dc->ncode = c;

 	return 5 + 5 + 4 + nclen * 3 + bitcost(codetab, codecount, Nclen) + excost;
 }

 static void
 wrdyncode(LZstate *out, Dyncode *dc)
 {
 	Huff *codetab;
 	uchar *codes, *codeaux;
 	int i, v, c;

 	/*
 	 * write out header, then code length code lengths,
 	 * and code lengths
 	 */
 	lzput(out, dc->nlit-257, 5);
 	lzput(out, dc->noff-1, 5);
 	lzput(out, dc->nclen-4, 4);

 	codetab = dc->codetab;
 	for(i = 0; i < dc->nclen; i++)
 		lzput(out, codetab[clenorder[i]].bits, 3);

 	codes = dc->codes;
 	codeaux = dc->codeaux;
 	c = dc->ncode;
 	for(i = 0; i < c; i++){
 		v = codes[i];
 		lzput(out, codetab[v].encode, codetab[v].bits);
 		if(v >= 16){
 			if(v == 16)
 				lzput(out, codeaux[i], 2);
 			else if(v == 17)
 				lzput(out, codeaux[i], 3);
 			else /* v == 18 */
 				lzput(out, codeaux[i], 7);
 		}
 	}
 }

 static int
 bitcost(Huff *tab, ulong *count, int n)
 {
 	ulong tot;
 	int i;

 	tot = 0;
 	for(i = 0; i < n; i++)
 		tot += count[i] * tab[i].bits;
 	return tot;
 }

 static int
 mkgzprecode(Huff *tab, ulong *count, int n, int maxbits)
 {
 	ulong bitcount[MaxHuffBits];
 	int i, nbits;

 	nbits = mkprecode(tab, count, n, maxbits, bitcount);
 	for(i = 0; i < n; i++){
 		if(tab[i].bits == -1)
 			tab[i].bits = 0;
 		else if(tab[i].bits == 0){
 			if(nbits != 0 || bitcount[0] != 1)
 				return 0;
 			bitcount[1] = 1;
 			bitcount[0] = 0;
 			nbits = 1;
 			tab[i].bits = 1;
 		}
 	}
 	if(bitcount[0] != 0)
 		return 0;
 	return hufftabinit(tab, n, bitcount, nbits);
 }

 static int
 hufftabinit(Huff *tab, int n, ulong *bitcount, int nbits)
 {
 	ulong code, nc[MaxHuffBits];
 	int i, bits;

 	code = 0;
 	for(bits = 1; bits <= nbits; bits++){
 		code = (code + bitcount[bits-1]) << 1;
 		nc[bits] = code;
 	}

 	for(i = 0; i < n; i++){
 		bits = tab[i].bits;
 		if(bits){
 			code = nc[bits]++ << (16 - bits);
 			if(code & ~0xffff)
 				return 0;
 			tab[i].encode = revtab[code >> 8] | (revtab[code & 0xff] << 8);
 		}
 	}
 	return 1;
 }


 /*
  * this should be in a library
  */
 struct Chain
 {
 	ulong	count;				/* occurances of everything in the chain */
 	ushort	leaf;				/* leaves to the left of chain, or leaf value */
 	char	col;				/* ref count for collecting unused chains */
 	char	gen;				/* need to generate chains for next lower level */
 	Chain	*up;				/* Chain up in the lists */
 };

 struct Chains
 {
 	Chain	*lists[(MaxHuffBits - 1) * 2];
 	ulong	leafcount[MaxLeaf];		/* sorted list of leaf counts */
 	ushort	leafmap[MaxLeaf];		/* map to actual leaf number */
 	int	nleaf;				/* number of leaves */
 	Chain	chains[ChainMem];
 	Chain	*echains;
 	Chain	*free;
 	char	col;
 	int	nlists;
 };

 /*
  * fast, low space overhead algorithm for max depth huffman type codes
  *
  * J. Katajainen, A. Moffat and A. Turpin, "A fast and space-economical
  * algorithm for length-limited coding," Proc. Intl. Symp. on Algorithms
  * and Computation, Cairns, Australia, Dec. 1995, Lecture Notes in Computer
  * Science, Vol 1004, J. Staples, P. Eades, N. Katoh, and A. Moffat, eds.,
  * pp 12-21, Springer Verlag, New York, 1995.
  */
 static int
 mkprecode(Huff *tab, ulong *count, int n, int maxbits, ulong *bitcount)
 {
 	Chains cs;
 	Chain *c;
 	int i, m, em, bits;

 	memset(&cs, 0, sizeof cs);

 	/*
 	 * set up the sorted list of leaves
 	 */
 	m = 0;
 	for(i = 0; i < n; i++){
 		tab[i].bits = -1;
 		tab[i].encode = 0;
 		if(count[i] != 0){
 			cs.leafcount[m] = count[i];
 			cs.leafmap[m] = i;
 			m++;
 		}
 	}
 	if(m < 2){
 		if(m != 0){
 			tab[cs.leafmap[0]].bits = 0;
 			bitcount[0] = 1;
 		}else
 			bitcount[0] = 0;
 		return 0;
 	}
 	cs.nleaf = m;
 	leafsort(cs.leafcount, cs.leafmap, 0, m);

 	for(i = 0; i < m; i++)
 		cs.leafcount[i] = count[cs.leafmap[i]];

 	/*
 	 * set up free list
 	 */
 	cs.free = &cs.chains[2];
 	cs.echains = &cs.chains[ChainMem];
 	cs.col = 1;

 	/*
 	 * initialize chains for each list
 	 */
 	c = &cs.chains[0];
 	c->count = cs.leafcount[0];
 	c->leaf = 1;
 	c->col = cs.col;
 	c->up = nil;
 	c->gen = 0;
 	cs.chains[1] = cs.chains[0];
 	cs.chains[1].leaf = 2;
 	cs.chains[1].count = cs.leafcount[1];
 	for(i = 0; i < maxbits-1; i++){
 		cs.lists[i * 2] = &cs.chains[0];
 		cs.lists[i * 2 + 1] = &cs.chains[1];
 	}

 	cs.nlists = 2 * (maxbits - 1);
 	m = 2 * m - 2;
 	for(i = 2; i < m; i++)
 		nextchain(&cs, cs.nlists - 2);

 	bits = 0;
 	bitcount[0] = cs.nleaf;
 	for(c = cs.lists[cs.nlists - 1]; c != nil; c = c->up){
 		m = c->leaf;
 		bitcount[bits++] -= m;
 		bitcount[bits] = m;
 	}
 	m = 0;
 	for(i = bits; i >= 0; i--)
 		for(em = m + bitcount[i]; m < em; m++)
 			tab[cs.leafmap[m]].bits = i;

 	return bits;
 }

 /*
  * calculate the next chain on the list
  * we can always toss out the old chain
  */
 static void
 nextchain(Chains *cs, int list)
 {
 	Chain *c, *oc;
 	int i, nleaf, sumc;

 	oc = cs->lists[list + 1];
 	cs->lists[list] = oc;
 	if(oc == nil)
 		return;

 	/*
 	 * make sure we have all chains needed to make sumc
 	 * note it is possible to generate only one of these,
 	 * use twice that value for sumc, and then generate
 	 * the second if that preliminary sumc would be chosen.
 	 * however, this appears to be slower on current tests
 	 */
 	if(oc->gen){
 		nextchain(cs, list - 2);
 		nextchain(cs, list - 2);
 		oc->gen = 0;
 	}

 	/*
 	 * pick up the chain we're going to add;
 	 * collect unused chains no free ones are left
 	 */
 	for(c = cs->free; ; c++){
 		if(c >= cs->echains){
 			cs->col++;
 			for(i = 0; i < cs->nlists; i++)
 				for(c = cs->lists[i]; c != nil; c = c->up)
 					c->col = cs->col;
 			c = cs->chains;
 		}
 		if(c->col != cs->col)
 			break;
 	}

 	/*
 	 * pick the cheapest of
 	 * 1) the next package from the previous list
 	 * 2) the next leaf
 	 */
 	nleaf = oc->leaf;
 	sumc = 0;
 	if(list > 0 && cs->lists[list-1] != nil)
 		sumc = cs->lists[list-2]->count + cs->lists[list-1]->count;
 	if(sumc != 0 && (nleaf >= cs->nleaf || cs->leafcount[nleaf] > sumc)){
 		c->count = sumc;
 		c->leaf = oc->leaf;
 		c->up = cs->lists[list-1];
 		c->gen = 1;
 	}else if(nleaf >= cs->nleaf){
 		cs->lists[list + 1] = nil;
 		return;
 	}else{
 		c->leaf = nleaf + 1;
 		c->count = cs->leafcount[nleaf];
 		c->up = oc->up;
 		c->gen = 0;
 	}
 	cs->free = c + 1;

 	cs->lists[list + 1] = c;
 	c->col = cs->col;
 }

 static int
 pivot(ulong *c, int a, int n)
 {
 	int j, pi, pj, pk;

 	j = n/6;
 	pi = a + j;	/* 1/6 */
 	j += j;
 	pj = pi + j;	/* 1/2 */
 	pk = pj + j;	/* 5/6 */
 	if(c[pi] < c[pj]){
 		if(c[pi] < c[pk]){
 			if(c[pj] < c[pk])
 				return pj;
 			return pk;
 		}
 		return pi;
 	}
 	if(c[pj] < c[pk]){
 		if(c[pi] < c[pk])
 			return pi;
 		return pk;
 	}
 	return pj;
 }

 static	void
 leafsort(ulong *leafcount, ushort *leafmap, int a, int n)
 {
 	ulong t;
 	int j, pi, pj, pn;

 	while(n > 1){
 		if(n > 10){
 			pi = pivot(leafcount, a, n);
 		}else
 			pi = a + (n>>1);

 		t = leafcount[pi];
 		leafcount[pi] = leafcount[a];
 		leafcount[a] = t;
 		t = leafmap[pi];
 		leafmap[pi] = leafmap[a];
 		leafmap[a] = t;
 		pi = a;
 		pn = a + n;
 		pj = pn;
 		for(;;){
 			do
 				pi++;
 			while(pi < pn && (leafcount[pi] < leafcount[a] || leafcount[pi] == leafcount[a] && leafmap[pi] > leafmap[a]));
 			do
 				pj--;
 			while(pj > a && (leafcount[pj] > leafcount[a] || leafcount[pj] == leafcount[a] && leafmap[pj] < leafmap[a]));
 			if(pj < pi)
 				break;
 			t = leafcount[pi];
 			leafcount[pi] = leafcount[pj];
 			leafcount[pj] = t;
 			t = leafmap[pi];
 			leafmap[pi] = leafmap[pj];
 			leafmap[pj] = t;
 		}
 		t = leafcount[a];
 		leafcount[a] = leafcount[pj];
 		leafcount[pj] = t;
 		t = leafmap[a];
 		leafmap[a] = leafmap[pj];
 		leafmap[pj] = t;
 		j = pj - a;

 		n = n-j-1;
 		if(j >= n){
 			leafsort(leafcount, leafmap, a, j);
 			a += j+1;
 		}else{
 			leafsort(leafcount, leafmap, a + (j+1), n);
 			n = j;
 		}
 	}
 }