xref: /illumos-gate/usr/src/cmd/spell/huff.c (revision 0d8b5334)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
57c478bd9Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
67c478bd9Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
77c478bd9Sstevel@tonic-gate  * with the License.
87c478bd9Sstevel@tonic-gate  *
97c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
107c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
117c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
127c478bd9Sstevel@tonic-gate  * and limitations under the License.
137c478bd9Sstevel@tonic-gate  *
147c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
157c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
167c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
177c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
187c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
197c478bd9Sstevel@tonic-gate  *
207c478bd9Sstevel@tonic-gate  * CDDL HEADER END
217c478bd9Sstevel@tonic-gate  */
227c478bd9Sstevel@tonic-gate /*
23*0d8b5334Sceastha  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
247c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
257c478bd9Sstevel@tonic-gate  */
267c478bd9Sstevel@tonic-gate 
277c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
287c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
297c478bd9Sstevel@tonic-gate 
307c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
317c478bd9Sstevel@tonic-gate 
327c478bd9Sstevel@tonic-gate 
337c478bd9Sstevel@tonic-gate #include <unistd.h>
347c478bd9Sstevel@tonic-gate #include <stdlib.h>
357c478bd9Sstevel@tonic-gate #include <stdio.h>
367c478bd9Sstevel@tonic-gate 
377c478bd9Sstevel@tonic-gate #define	BYTE 8
387c478bd9Sstevel@tonic-gate #define	QW 1		/* width of bas-q digit in bits */
397c478bd9Sstevel@tonic-gate 
407c478bd9Sstevel@tonic-gate /*
417c478bd9Sstevel@tonic-gate  * this stuff should be local and hidden; it was made
427c478bd9Sstevel@tonic-gate  * accessible outside for dirty reasons: 20% faster spell
437c478bd9Sstevel@tonic-gate  */
447c478bd9Sstevel@tonic-gate #include "huff.h"
457c478bd9Sstevel@tonic-gate struct huff huffcode;
467c478bd9Sstevel@tonic-gate 
477c478bd9Sstevel@tonic-gate /*
487c478bd9Sstevel@tonic-gate  * Infinite Huffman code
497c478bd9Sstevel@tonic-gate  *
507c478bd9Sstevel@tonic-gate  * Let the messages be exponentially distributed with ratio r:
517c478bd9Sstevel@tonic-gate  * 	P {message k} = r^k*(1-r),	k = 0, 1, ...
527c478bd9Sstevel@tonic-gate  * Let the messages be coded in base q, and suppose
537c478bd9Sstevel@tonic-gate  * 	r^n = 1/q
547c478bd9Sstevel@tonic-gate  * If each decade(base q) contains n codes, then
557c478bd9Sstevel@tonic-gate  * the messages assigned to each decade will be q times
567c478bd9Sstevel@tonic-gate  * as probable as the next. Moreover the code for the tail of
577c478bd9Sstevel@tonic-gate  * the distribution after truncating one decade should look
587c478bd9Sstevel@tonic-gate  * just like the original, but longer by one leading digit q-1.
597c478bd9Sstevel@tonic-gate  * 	q(z+n) = z + (q-1)q^w
607c478bd9Sstevel@tonic-gate  * where z is first code of decade, w is width of code, in shortest
617c478bd9Sstevel@tonic-gate  * full decade. Examples, base 2:
627c478bd9Sstevel@tonic-gate  * 	r^1 = 1/2	r^5 = 1/2
637c478bd9Sstevel@tonic-gate  * 	0		0110
647c478bd9Sstevel@tonic-gate  * 	10		0111
657c478bd9Sstevel@tonic-gate  * 	110		1000
667c478bd9Sstevel@tonic-gate  * 	1110		1001
677c478bd9Sstevel@tonic-gate  * 	...		1010
687c478bd9Sstevel@tonic-gate  * 			10110
697c478bd9Sstevel@tonic-gate  * 	w = 1, z = 0		w = 4, z = 0110
707c478bd9Sstevel@tonic-gate  * Rewriting slightly
717c478bd9Sstevel@tonic-gate  * 	(q-1)z + q*n = (q-1)q^w
727c478bd9Sstevel@tonic-gate  * whence z is a multiple of q and n is a multiple of q-1. Let
737c478bd9Sstevel@tonic-gate  * 	z = cq, n = d(q-1)
747c478bd9Sstevel@tonic-gate  * We pick w to be the least integer such that
757c478bd9Sstevel@tonic-gate  * 	d = n/(q-1) <= q^(w-1)
767c478bd9Sstevel@tonic-gate  * Then solve for c
777c478bd9Sstevel@tonic-gate  * 	c = q^(w-1) - d
787c478bd9Sstevel@tonic-gate  * If c is not zero, the first decade may be preceded by
797c478bd9Sstevel@tonic-gate  * even shorter (w-1)-digit codes 0, 1, ..., c-1. Thus
807c478bd9Sstevel@tonic-gate  * the example code with r^5 = 1/2 becomes
817c478bd9Sstevel@tonic-gate  * 	000
827c478bd9Sstevel@tonic-gate  * 	001
837c478bd9Sstevel@tonic-gate  * 	010
847c478bd9Sstevel@tonic-gate  * 	0110
857c478bd9Sstevel@tonic-gate  * 	0111
867c478bd9Sstevel@tonic-gate  * 	1000
877c478bd9Sstevel@tonic-gate  * 	1001
887c478bd9Sstevel@tonic-gate  * 	1010
897c478bd9Sstevel@tonic-gate  * 	10110
907c478bd9Sstevel@tonic-gate  * 	...
917c478bd9Sstevel@tonic-gate  * 	110110
927c478bd9Sstevel@tonic-gate  * 	...
937c478bd9Sstevel@tonic-gate  * The expected number of base-q digits in a codeword is then
947c478bd9Sstevel@tonic-gate  *	w - 1 + r^c/(1-r^n)
957c478bd9Sstevel@tonic-gate  * The present routines require q to be a power of 2
967c478bd9Sstevel@tonic-gate  */
977c478bd9Sstevel@tonic-gate /*
987c478bd9Sstevel@tonic-gate  * There is a lot of hanky-panky with left justification against
997c478bd9Sstevel@tonic-gate  * sign instead of simple left justification because
1007c478bd9Sstevel@tonic-gate  * unsigned long is not available
1017c478bd9Sstevel@tonic-gate  */
1027c478bd9Sstevel@tonic-gate #define	L (BYTE*(sizeof (long))-1)	/* length of signless long */
1037c478bd9Sstevel@tonic-gate #define	MASK (~((unsigned long)1<<L))	/* mask out sign */
1047c478bd9Sstevel@tonic-gate 
1057c478bd9Sstevel@tonic-gate /*
1067c478bd9Sstevel@tonic-gate  * decode the prefix of word y (which is left justified against sign)
1077c478bd9Sstevel@tonic-gate  * place mesage number into place pointed to by kp
1087c478bd9Sstevel@tonic-gate  * return length (in bits) of decoded prefix or 0 if code is out of
1097c478bd9Sstevel@tonic-gate  * range
1107c478bd9Sstevel@tonic-gate  */
1117c478bd9Sstevel@tonic-gate int
1127c478bd9Sstevel@tonic-gate decode(long y, long *pk)
1137c478bd9Sstevel@tonic-gate {
114*0d8b5334Sceastha 	int l;
1157c478bd9Sstevel@tonic-gate 	long v;
1167c478bd9Sstevel@tonic-gate 	if (y < cs) {
1177c478bd9Sstevel@tonic-gate 		*pk = y >> (long)(L+QW-w);
1187c478bd9Sstevel@tonic-gate 		return (w-QW);
1197c478bd9Sstevel@tonic-gate 	}
1207c478bd9Sstevel@tonic-gate 	for (l = w, v = v0; y >= qcs;
1217c478bd9Sstevel@tonic-gate 	    y = ((unsigned long)y << QW) & MASK, v += n)
1227c478bd9Sstevel@tonic-gate 		if ((l += QW) > L)
1237c478bd9Sstevel@tonic-gate 			return (0);
1247c478bd9Sstevel@tonic-gate 	*pk = v + (y>>(long)(L-w));
1257c478bd9Sstevel@tonic-gate 	return (l);
1267c478bd9Sstevel@tonic-gate }
1277c478bd9Sstevel@tonic-gate 
1287c478bd9Sstevel@tonic-gate /*
1297c478bd9Sstevel@tonic-gate  * encode message k and put result (right justified) into
1307c478bd9Sstevel@tonic-gate  * place pointed to by py.
1317c478bd9Sstevel@tonic-gate  * return length (in bits) of result,
1327c478bd9Sstevel@tonic-gate  * or 0 if code is too long
1337c478bd9Sstevel@tonic-gate  */
1347c478bd9Sstevel@tonic-gate 
1357c478bd9Sstevel@tonic-gate int
1367c478bd9Sstevel@tonic-gate encode(long k, long *py)
1377c478bd9Sstevel@tonic-gate {
138*0d8b5334Sceastha 	int l;
1397c478bd9Sstevel@tonic-gate 	long y;
1407c478bd9Sstevel@tonic-gate 	if (k < c) {
1417c478bd9Sstevel@tonic-gate 		*py = k;
1427c478bd9Sstevel@tonic-gate 		return (w-QW);
1437c478bd9Sstevel@tonic-gate 	}
1447c478bd9Sstevel@tonic-gate 	for (k -= c, y = 1, l = w; k >= n; k -= n, y <<= QW)
1457c478bd9Sstevel@tonic-gate 		if ((l += QW) > L)
1467c478bd9Sstevel@tonic-gate 			return (0);
1477c478bd9Sstevel@tonic-gate 	*py = ((y-1)<<w) + cq + k;
1487c478bd9Sstevel@tonic-gate 	return (l);
1497c478bd9Sstevel@tonic-gate }
1507c478bd9Sstevel@tonic-gate 
1517c478bd9Sstevel@tonic-gate 
1527c478bd9Sstevel@tonic-gate /*
1537c478bd9Sstevel@tonic-gate  * Initialization code, given expected value of k
1547c478bd9Sstevel@tonic-gate  * E(k) = r/(1-r) = a
1557c478bd9Sstevel@tonic-gate  * and given base width b
1567c478bd9Sstevel@tonic-gate  * return expected length of coded messages
1577c478bd9Sstevel@tonic-gate  */
1587c478bd9Sstevel@tonic-gate static struct qlog {
1597c478bd9Sstevel@tonic-gate 	long p;
1607c478bd9Sstevel@tonic-gate 	double u;
1617c478bd9Sstevel@tonic-gate } z;
1627c478bd9Sstevel@tonic-gate 
1637c478bd9Sstevel@tonic-gate static struct qlog
1647c478bd9Sstevel@tonic-gate qlog(double x, double y, long p, double u)	/* find smallest p so x^p<=y */
1657c478bd9Sstevel@tonic-gate {
1667c478bd9Sstevel@tonic-gate 
1677c478bd9Sstevel@tonic-gate 	if (u/x <= y) {
1687c478bd9Sstevel@tonic-gate 		z.p = 0;
1697c478bd9Sstevel@tonic-gate 		z.u = 1;
1707c478bd9Sstevel@tonic-gate 	} else {
1717c478bd9Sstevel@tonic-gate 		z = qlog(x, y, p+p, u*u);
1727c478bd9Sstevel@tonic-gate 		if (u*z.u/x > y) {
1737c478bd9Sstevel@tonic-gate 			z.p += p;
1747c478bd9Sstevel@tonic-gate 			z.u *= u;
1757c478bd9Sstevel@tonic-gate 		}
1767c478bd9Sstevel@tonic-gate 	}
1777c478bd9Sstevel@tonic-gate 	return (z);
1787c478bd9Sstevel@tonic-gate }
1797c478bd9Sstevel@tonic-gate 
1807c478bd9Sstevel@tonic-gate double
1817c478bd9Sstevel@tonic-gate huff(float a)
1827c478bd9Sstevel@tonic-gate {
183*0d8b5334Sceastha 	int i, q;
1847c478bd9Sstevel@tonic-gate 	long d, j;
1857c478bd9Sstevel@tonic-gate 	double r = a/(1.0 + a);
1867c478bd9Sstevel@tonic-gate 	double rc, rq;
1877c478bd9Sstevel@tonic-gate 
1887c478bd9Sstevel@tonic-gate 	for (i = 0, q = 1, rq = r; i < QW; i++, q *= 2, rq *= rq)
1897c478bd9Sstevel@tonic-gate 		continue;
1907c478bd9Sstevel@tonic-gate 	rq /= r;	/* rq = r^(q-1) */
1917c478bd9Sstevel@tonic-gate 	(void) qlog(rq, 1./q, 1L, rq);
1927c478bd9Sstevel@tonic-gate 	d = z.p;
1937c478bd9Sstevel@tonic-gate 	n = d*(q-1);
1947c478bd9Sstevel@tonic-gate 	if (n != d * (q - 1))
1957c478bd9Sstevel@tonic-gate 		abort();	/* time to make n long */
1967c478bd9Sstevel@tonic-gate 	for (w = QW, j = 1; j < d; w += QW, j *= q)
1977c478bd9Sstevel@tonic-gate 		continue;
1987c478bd9Sstevel@tonic-gate 	c = j - d;
1997c478bd9Sstevel@tonic-gate 	cq = c*q;
2007c478bd9Sstevel@tonic-gate 	cs = cq<<(L-w);
2017c478bd9Sstevel@tonic-gate 	qcs = (((long)(q-1)<<w) + cq) << (L-QW-w);
2027c478bd9Sstevel@tonic-gate 	v0 = c - cq;
2037c478bd9Sstevel@tonic-gate 	for (i = 0, rc = 1; i < c; i++, rc *= r)	/* rc = r^c */
2047c478bd9Sstevel@tonic-gate 		continue;
2057c478bd9Sstevel@tonic-gate 	return (w + QW*(rc/(1-z.u) - 1));
2067c478bd9Sstevel@tonic-gate }
2077c478bd9Sstevel@tonic-gate 
2087c478bd9Sstevel@tonic-gate void
2097c478bd9Sstevel@tonic-gate whuff(void)
2107c478bd9Sstevel@tonic-gate {
2117c478bd9Sstevel@tonic-gate 	(void) fwrite((char *) & huffcode, sizeof (huffcode), 1, stdout);
2127c478bd9Sstevel@tonic-gate }
2137c478bd9Sstevel@tonic-gate 
2147c478bd9Sstevel@tonic-gate int
2157c478bd9Sstevel@tonic-gate rhuff(FILE *f)
2167c478bd9Sstevel@tonic-gate {
2177c478bd9Sstevel@tonic-gate 	return (read(fileno(f), (char *)&huffcode, sizeof (huffcode)) ==
2187c478bd9Sstevel@tonic-gate 	    sizeof (huffcode));
2197c478bd9Sstevel@tonic-gate }
220