xref: /illumos-gate/usr/src/cmd/spell/huff.c (revision 7c478bd9)
1*7c478bd9Sstevel@tonic-gate /*
2*7c478bd9Sstevel@tonic-gate  * CDDL HEADER START
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*7c478bd9Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*7c478bd9Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*7c478bd9Sstevel@tonic-gate  * with the License.
8*7c478bd9Sstevel@tonic-gate  *
9*7c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*7c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*7c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*7c478bd9Sstevel@tonic-gate  * and limitations under the License.
13*7c478bd9Sstevel@tonic-gate  *
14*7c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*7c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*7c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*7c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*7c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*7c478bd9Sstevel@tonic-gate  *
20*7c478bd9Sstevel@tonic-gate  * CDDL HEADER END
21*7c478bd9Sstevel@tonic-gate  */
22*7c478bd9Sstevel@tonic-gate /*
23*7c478bd9Sstevel@tonic-gate  * Copyright 1994 Sun Microsystems, Inc.  All rights reserved.
24*7c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
25*7c478bd9Sstevel@tonic-gate  */
26*7c478bd9Sstevel@tonic-gate 
27*7c478bd9Sstevel@tonic-gate /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28*7c478bd9Sstevel@tonic-gate /*	  All Rights Reserved  	*/
29*7c478bd9Sstevel@tonic-gate 
30*7c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
31*7c478bd9Sstevel@tonic-gate 
32*7c478bd9Sstevel@tonic-gate 
33*7c478bd9Sstevel@tonic-gate #include <unistd.h>
34*7c478bd9Sstevel@tonic-gate #include <stdlib.h>
35*7c478bd9Sstevel@tonic-gate #include <stdio.h>
36*7c478bd9Sstevel@tonic-gate 
37*7c478bd9Sstevel@tonic-gate #define	BYTE 8
38*7c478bd9Sstevel@tonic-gate #define	QW 1		/* width of bas-q digit in bits */
39*7c478bd9Sstevel@tonic-gate 
40*7c478bd9Sstevel@tonic-gate /*
41*7c478bd9Sstevel@tonic-gate  * this stuff should be local and hidden; it was made
42*7c478bd9Sstevel@tonic-gate  * accessible outside for dirty reasons: 20% faster spell
43*7c478bd9Sstevel@tonic-gate  */
44*7c478bd9Sstevel@tonic-gate #include "huff.h"
45*7c478bd9Sstevel@tonic-gate struct huff huffcode;
46*7c478bd9Sstevel@tonic-gate 
47*7c478bd9Sstevel@tonic-gate /*
48*7c478bd9Sstevel@tonic-gate  * Infinite Huffman code
49*7c478bd9Sstevel@tonic-gate  *
50*7c478bd9Sstevel@tonic-gate  * Let the messages be exponentially distributed with ratio r:
51*7c478bd9Sstevel@tonic-gate  * 	P {message k} = r^k*(1-r),	k = 0, 1, ...
52*7c478bd9Sstevel@tonic-gate  * Let the messages be coded in base q, and suppose
53*7c478bd9Sstevel@tonic-gate  * 	r^n = 1/q
54*7c478bd9Sstevel@tonic-gate  * If each decade(base q) contains n codes, then
55*7c478bd9Sstevel@tonic-gate  * the messages assigned to each decade will be q times
56*7c478bd9Sstevel@tonic-gate  * as probable as the next. Moreover the code for the tail of
57*7c478bd9Sstevel@tonic-gate  * the distribution after truncating one decade should look
58*7c478bd9Sstevel@tonic-gate  * just like the original, but longer by one leading digit q-1.
59*7c478bd9Sstevel@tonic-gate  * 	q(z+n) = z + (q-1)q^w
60*7c478bd9Sstevel@tonic-gate  * where z is first code of decade, w is width of code, in shortest
61*7c478bd9Sstevel@tonic-gate  * full decade. Examples, base 2:
62*7c478bd9Sstevel@tonic-gate  * 	r^1 = 1/2	r^5 = 1/2
63*7c478bd9Sstevel@tonic-gate  * 	0		0110
64*7c478bd9Sstevel@tonic-gate  * 	10		0111
65*7c478bd9Sstevel@tonic-gate  * 	110		1000
66*7c478bd9Sstevel@tonic-gate  * 	1110		1001
67*7c478bd9Sstevel@tonic-gate  * 	...		1010
68*7c478bd9Sstevel@tonic-gate  * 			10110
69*7c478bd9Sstevel@tonic-gate  * 	w = 1, z = 0		w = 4, z = 0110
70*7c478bd9Sstevel@tonic-gate  * Rewriting slightly
71*7c478bd9Sstevel@tonic-gate  * 	(q-1)z + q*n = (q-1)q^w
72*7c478bd9Sstevel@tonic-gate  * whence z is a multiple of q and n is a multiple of q-1. Let
73*7c478bd9Sstevel@tonic-gate  * 	z = cq, n = d(q-1)
74*7c478bd9Sstevel@tonic-gate  * We pick w to be the least integer such that
75*7c478bd9Sstevel@tonic-gate  * 	d = n/(q-1) <= q^(w-1)
76*7c478bd9Sstevel@tonic-gate  * Then solve for c
77*7c478bd9Sstevel@tonic-gate  * 	c = q^(w-1) - d
78*7c478bd9Sstevel@tonic-gate  * If c is not zero, the first decade may be preceded by
79*7c478bd9Sstevel@tonic-gate  * even shorter (w-1)-digit codes 0, 1, ..., c-1. Thus
80*7c478bd9Sstevel@tonic-gate  * the example code with r^5 = 1/2 becomes
81*7c478bd9Sstevel@tonic-gate  * 	000
82*7c478bd9Sstevel@tonic-gate  * 	001
83*7c478bd9Sstevel@tonic-gate  * 	010
84*7c478bd9Sstevel@tonic-gate  * 	0110
85*7c478bd9Sstevel@tonic-gate  * 	0111
86*7c478bd9Sstevel@tonic-gate  * 	1000
87*7c478bd9Sstevel@tonic-gate  * 	1001
88*7c478bd9Sstevel@tonic-gate  * 	1010
89*7c478bd9Sstevel@tonic-gate  * 	10110
90*7c478bd9Sstevel@tonic-gate  * 	...
91*7c478bd9Sstevel@tonic-gate  * 	110110
92*7c478bd9Sstevel@tonic-gate  * 	...
93*7c478bd9Sstevel@tonic-gate  * The expected number of base-q digits in a codeword is then
94*7c478bd9Sstevel@tonic-gate  *	w - 1 + r^c/(1-r^n)
95*7c478bd9Sstevel@tonic-gate  * The present routines require q to be a power of 2
96*7c478bd9Sstevel@tonic-gate  */
97*7c478bd9Sstevel@tonic-gate /*
98*7c478bd9Sstevel@tonic-gate  * There is a lot of hanky-panky with left justification against
99*7c478bd9Sstevel@tonic-gate  * sign instead of simple left justification because
100*7c478bd9Sstevel@tonic-gate  * unsigned long is not available
101*7c478bd9Sstevel@tonic-gate  */
102*7c478bd9Sstevel@tonic-gate #define	L (BYTE*(sizeof (long))-1)	/* length of signless long */
103*7c478bd9Sstevel@tonic-gate #define	MASK (~((unsigned long)1<<L))	/* mask out sign */
104*7c478bd9Sstevel@tonic-gate 
105*7c478bd9Sstevel@tonic-gate /*
106*7c478bd9Sstevel@tonic-gate  * decode the prefix of word y (which is left justified against sign)
107*7c478bd9Sstevel@tonic-gate  * place mesage number into place pointed to by kp
108*7c478bd9Sstevel@tonic-gate  * return length (in bits) of decoded prefix or 0 if code is out of
109*7c478bd9Sstevel@tonic-gate  * range
110*7c478bd9Sstevel@tonic-gate  */
111*7c478bd9Sstevel@tonic-gate int
112*7c478bd9Sstevel@tonic-gate decode(long y, long *pk)
113*7c478bd9Sstevel@tonic-gate {
114*7c478bd9Sstevel@tonic-gate 	register l;
115*7c478bd9Sstevel@tonic-gate 	long v;
116*7c478bd9Sstevel@tonic-gate 	if (y < cs) {
117*7c478bd9Sstevel@tonic-gate 		*pk = y >> (long)(L+QW-w);
118*7c478bd9Sstevel@tonic-gate 		return (w-QW);
119*7c478bd9Sstevel@tonic-gate 	}
120*7c478bd9Sstevel@tonic-gate 	for (l = w, v = v0; y >= qcs;
121*7c478bd9Sstevel@tonic-gate 	    y = ((unsigned long)y << QW) & MASK, v += n)
122*7c478bd9Sstevel@tonic-gate 		if ((l += QW) > L)
123*7c478bd9Sstevel@tonic-gate 			return (0);
124*7c478bd9Sstevel@tonic-gate 	*pk = v + (y>>(long)(L-w));
125*7c478bd9Sstevel@tonic-gate 	return (l);
126*7c478bd9Sstevel@tonic-gate }
127*7c478bd9Sstevel@tonic-gate 
128*7c478bd9Sstevel@tonic-gate /*
129*7c478bd9Sstevel@tonic-gate  * encode message k and put result (right justified) into
130*7c478bd9Sstevel@tonic-gate  * place pointed to by py.
131*7c478bd9Sstevel@tonic-gate  * return length (in bits) of result,
132*7c478bd9Sstevel@tonic-gate  * or 0 if code is too long
133*7c478bd9Sstevel@tonic-gate  */
134*7c478bd9Sstevel@tonic-gate 
135*7c478bd9Sstevel@tonic-gate int
136*7c478bd9Sstevel@tonic-gate encode(long k, long *py)
137*7c478bd9Sstevel@tonic-gate {
138*7c478bd9Sstevel@tonic-gate 	register l;
139*7c478bd9Sstevel@tonic-gate 	long y;
140*7c478bd9Sstevel@tonic-gate 	if (k < c) {
141*7c478bd9Sstevel@tonic-gate 		*py = k;
142*7c478bd9Sstevel@tonic-gate 		return (w-QW);
143*7c478bd9Sstevel@tonic-gate 	}
144*7c478bd9Sstevel@tonic-gate 	for (k -= c, y = 1, l = w; k >= n; k -= n, y <<= QW)
145*7c478bd9Sstevel@tonic-gate 		if ((l += QW) > L)
146*7c478bd9Sstevel@tonic-gate 			return (0);
147*7c478bd9Sstevel@tonic-gate 	*py = ((y-1)<<w) + cq + k;
148*7c478bd9Sstevel@tonic-gate 	return (l);
149*7c478bd9Sstevel@tonic-gate }
150*7c478bd9Sstevel@tonic-gate 
151*7c478bd9Sstevel@tonic-gate 
152*7c478bd9Sstevel@tonic-gate /*
153*7c478bd9Sstevel@tonic-gate  * Initialization code, given expected value of k
154*7c478bd9Sstevel@tonic-gate  * E(k) = r/(1-r) = a
155*7c478bd9Sstevel@tonic-gate  * and given base width b
156*7c478bd9Sstevel@tonic-gate  * return expected length of coded messages
157*7c478bd9Sstevel@tonic-gate  */
158*7c478bd9Sstevel@tonic-gate static struct qlog {
159*7c478bd9Sstevel@tonic-gate 	long p;
160*7c478bd9Sstevel@tonic-gate 	double u;
161*7c478bd9Sstevel@tonic-gate } z;
162*7c478bd9Sstevel@tonic-gate 
163*7c478bd9Sstevel@tonic-gate static struct qlog
164*7c478bd9Sstevel@tonic-gate qlog(double x, double y, long p, double u)	/* find smallest p so x^p<=y */
165*7c478bd9Sstevel@tonic-gate {
166*7c478bd9Sstevel@tonic-gate 
167*7c478bd9Sstevel@tonic-gate 	if (u/x <= y) {
168*7c478bd9Sstevel@tonic-gate 		z.p = 0;
169*7c478bd9Sstevel@tonic-gate 		z.u = 1;
170*7c478bd9Sstevel@tonic-gate 	} else {
171*7c478bd9Sstevel@tonic-gate 		z = qlog(x, y, p+p, u*u);
172*7c478bd9Sstevel@tonic-gate 		if (u*z.u/x > y) {
173*7c478bd9Sstevel@tonic-gate 			z.p += p;
174*7c478bd9Sstevel@tonic-gate 			z.u *= u;
175*7c478bd9Sstevel@tonic-gate 		}
176*7c478bd9Sstevel@tonic-gate 	}
177*7c478bd9Sstevel@tonic-gate 	return (z);
178*7c478bd9Sstevel@tonic-gate }
179*7c478bd9Sstevel@tonic-gate 
180*7c478bd9Sstevel@tonic-gate double
181*7c478bd9Sstevel@tonic-gate huff(float a)
182*7c478bd9Sstevel@tonic-gate {
183*7c478bd9Sstevel@tonic-gate 	register i, q;
184*7c478bd9Sstevel@tonic-gate 	long d, j;
185*7c478bd9Sstevel@tonic-gate 	double r = a/(1.0 + a);
186*7c478bd9Sstevel@tonic-gate 	double rc, rq;
187*7c478bd9Sstevel@tonic-gate 
188*7c478bd9Sstevel@tonic-gate 	for (i = 0, q = 1, rq = r; i < QW; i++, q *= 2, rq *= rq)
189*7c478bd9Sstevel@tonic-gate 		continue;
190*7c478bd9Sstevel@tonic-gate 	rq /= r;	/* rq = r^(q-1) */
191*7c478bd9Sstevel@tonic-gate 	(void) qlog(rq, 1./q, 1L, rq);
192*7c478bd9Sstevel@tonic-gate 	d = z.p;
193*7c478bd9Sstevel@tonic-gate 	n = d*(q-1);
194*7c478bd9Sstevel@tonic-gate 	if (n != d * (q - 1))
195*7c478bd9Sstevel@tonic-gate 		abort();	/* time to make n long */
196*7c478bd9Sstevel@tonic-gate 	for (w = QW, j = 1; j < d; w += QW, j *= q)
197*7c478bd9Sstevel@tonic-gate 		continue;
198*7c478bd9Sstevel@tonic-gate 	c = j - d;
199*7c478bd9Sstevel@tonic-gate 	cq = c*q;
200*7c478bd9Sstevel@tonic-gate 	cs = cq<<(L-w);
201*7c478bd9Sstevel@tonic-gate 	qcs = (((long)(q-1)<<w) + cq) << (L-QW-w);
202*7c478bd9Sstevel@tonic-gate 	v0 = c - cq;
203*7c478bd9Sstevel@tonic-gate 	for (i = 0, rc = 1; i < c; i++, rc *= r)	/* rc = r^c */
204*7c478bd9Sstevel@tonic-gate 		continue;
205*7c478bd9Sstevel@tonic-gate 	return (w + QW*(rc/(1-z.u) - 1));
206*7c478bd9Sstevel@tonic-gate }
207*7c478bd9Sstevel@tonic-gate 
208*7c478bd9Sstevel@tonic-gate void
209*7c478bd9Sstevel@tonic-gate whuff(void)
210*7c478bd9Sstevel@tonic-gate {
211*7c478bd9Sstevel@tonic-gate 	(void) fwrite((char *) & huffcode, sizeof (huffcode), 1, stdout);
212*7c478bd9Sstevel@tonic-gate }
213*7c478bd9Sstevel@tonic-gate 
214*7c478bd9Sstevel@tonic-gate int
215*7c478bd9Sstevel@tonic-gate rhuff(FILE *f)
216*7c478bd9Sstevel@tonic-gate {
217*7c478bd9Sstevel@tonic-gate 	return (read(fileno(f), (char *)&huffcode, sizeof (huffcode)) ==
218*7c478bd9Sstevel@tonic-gate 	    sizeof (huffcode));
219*7c478bd9Sstevel@tonic-gate }
220