/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
21
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
29
30#pragma weak fmal = __fmal
31
32#include "libm.h"
33#include "fma.h"
34#include "fenv_inlines.h"
35
36#if defined(__sparc)
37
/*
 * Double-precision constants for the SPARC implementation, written as
 * raw IEEE 754 bit patterns.  Each initializer gives the high-order
 * 32-bit word first and the low-order word second, which is the order
 * the words occupy in a double on this big-endian branch.  The macros
 * below give each table slot a symbolic name.
 */
static const union {
	unsigned i[2];
	double d;
} C[] = {
	{ 0x3fe00000u, 0 },	/* 2^-1 */
	{ 0x40000000u, 0 },	/* 2^1 */
	{ 0x3ef00000u, 0 },	/* 2^-16 */
	{ 0x3e700000u, 0 },	/* 2^-24 */
	{ 0x41300000u, 0 },	/* 2^20 */
	{ 0x3e300000u, 0 },	/* 2^-28 */
	{ 0x3b300000u, 0 },	/* 2^-76 */
	{ 0x38300000u, 0 },	/* 2^-124 */
	{ 0x42300000u, 0 },	/* 2^36 */
	{ 0x3df00000u, 0 },	/* 2^-32 */
	{ 0x7fe00000u, 0 },	/* 2^1023 */
	{ 0x00100000u, 0 },	/* 2^-1022, smallest normal double */
	{ 0x00100001u, 0 },	/* 2^-1022 * (1 + 2^-20) */
	{ 0, 0 },		/* +0 */
	{ 0x7ff00000u, 0 },	/* +infinity */
	{ 0x7ff00001u, 0 }	/* NaN with the quiet bit clear (signaling) */
};

#define	half	C[0].d
#define	two	C[1].d
#define	twom16	C[2].d
#define	twom24	C[3].d
#define	two20	C[4].d
#define	twom28	C[5].d
#define	twom76	C[6].d
#define	twom124	C[7].d
#define	two36	C[8].d
#define	twom32	C[9].d
#define	huge	C[10].d
#define	tiny	C[11].d
#define	tiny2	C[12].d
#define	zero	C[13].d
#define	inf	C[14].d
#define	snan	C[15].d

/*
 * FSR image that __fmal installs with __fenv_setfsr32() before its
 * multiword arithmetic; only the two top bits are set, which __fmal's
 * own commentary describes as round-to-negative-infinity mode.
 */
static const unsigned int fsr_rm = 0xc0000000u;
78
79/*
80 * fmal for SPARC: 128-bit quad precision, big-endian
81 */
82long double
83__fmal(long double x, long double y, long double z) {
84	union {
85		unsigned int i[4];
86		long double q;
87	} xx, yy, zz;
88	union {
89		unsigned int i[2];
90		double d;
91	} u;
92	double dx[5], dy[5], dxy[9], c, s;
93	unsigned int xy0, xy1, xy2, xy3, xy4, xy5, xy6, xy7;
94	unsigned int z0, z1, z2, z3, z4, z5, z6, z7;
95	unsigned int rm, sticky;
96	unsigned int fsr;
97	int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
98	int cx, cy, cz;
99	volatile double	dummy;
100
101	/* extract the high order words of the arguments */
102	xx.q = x;
103	yy.q = y;
104	zz.q = z;
105	hx = xx.i[0] & ~0x80000000;
106	hy = yy.i[0] & ~0x80000000;
107	hz = zz.i[0] & ~0x80000000;
108
109	/*
110	 * distinguish zero, finite nonzero, infinite, and quiet nan
111	 * arguments; raise invalid and return for signaling nans
112	 */
113	if (hx >= 0x7fff0000) {
114		if ((hx & 0xffff) | xx.i[1] | xx.i[2] | xx.i[3]) {
115			if (!(hx & 0x8000)) {
116				/* signaling nan, raise invalid */
117				dummy = snan;
118				dummy += snan;
119				xx.i[0] |= 0x8000;
120				return (xx.q);
121			}
122			cx = 3;	/* quiet nan */
123		} else
124			cx = 2;	/* inf */
125	} else if (hx == 0) {
126		cx = (xx.i[1] | xx.i[2] | xx.i[3]) ? 1 : 0;
127				/* subnormal or zero */
128	} else
129		cx = 1;		/* finite nonzero */
130
131	if (hy >= 0x7fff0000) {
132		if ((hy & 0xffff) | yy.i[1] | yy.i[2] | yy.i[3]) {
133			if (!(hy & 0x8000)) {
134				dummy = snan;
135				dummy += snan;
136				yy.i[0] |= 0x8000;
137				return (yy.q);
138			}
139			cy = 3;
140		} else
141			cy = 2;
142	} else if (hy == 0) {
143		cy = (yy.i[1] | yy.i[2] | yy.i[3]) ? 1 : 0;
144	} else
145		cy = 1;
146
147	if (hz >= 0x7fff0000) {
148		if ((hz & 0xffff) | zz.i[1] | zz.i[2] | zz.i[3]) {
149			if (!(hz & 0x8000)) {
150				dummy = snan;
151				dummy += snan;
152				zz.i[0] |= 0x8000;
153				return (zz.q);
154			}
155			cz = 3;
156		} else
157			cz = 2;
158	} else if (hz == 0) {
159		cz = (zz.i[1] | zz.i[2] | zz.i[3]) ? 1 : 0;
160	} else
161		cz = 1;
162
163	/* get the fsr and clear current exceptions */
164	__fenv_getfsr32(&fsr);
165	fsr &= ~FSR_CEXC;
166
167	/* handle all other zero, inf, and nan cases */
168	if (cx != 1 || cy != 1 || cz != 1) {
169		/* if x or y is a quiet nan, return it */
170		if (cx == 3) {
171			__fenv_setfsr32(&fsr);
172			return (x);
173		}
174		if (cy == 3) {
175			__fenv_setfsr32(&fsr);
176			return (y);
177		}
178
179		/* if x*y is 0*inf, raise invalid and return the default nan */
180		if ((cx == 0 && cy == 2) || (cx == 2 && cy == 0)) {
181			dummy = zero;
182			dummy *= inf;
183			zz.i[0] = 0x7fffffff;
184			zz.i[1] = zz.i[2] = zz.i[3] = 0xffffffff;
185			return (zz.q);
186		}
187
188		/* if z is a quiet nan, return it */
189		if (cz == 3) {
190			__fenv_setfsr32(&fsr);
191			return (z);
192		}
193
194		/*
195		 * now none of x, y, or z is nan; handle cases where x or y
196		 * is inf
197		 */
198		if (cx == 2 || cy == 2) {
199			/*
200			 * if z is also inf, either we have inf-inf or
201			 * the result is the same as z depending on signs
202			 */
203			if (cz == 2) {
204				if ((int) ((xx.i[0] ^ yy.i[0]) ^ zz.i[0]) < 0) {
205					dummy = inf;
206					dummy -= inf;
207					zz.i[0] = 0x7fffffff;
208					zz.i[1] = zz.i[2] = zz.i[3] =
209						0xffffffff;
210					return (zz.q);
211				}
212				__fenv_setfsr32(&fsr);
213				return (z);
214			}
215
216			/* otherwise the result is inf with appropriate sign */
217			zz.i[0] = ((xx.i[0] ^ yy.i[0]) & 0x80000000) |
218				0x7fff0000;
219			zz.i[1] = zz.i[2] = zz.i[3] = 0;
220			__fenv_setfsr32(&fsr);
221			return (zz.q);
222		}
223
224		/* if z is inf, return it */
225		if (cz == 2) {
226			__fenv_setfsr32(&fsr);
227			return (z);
228		}
229
230		/*
231		 * now x, y, and z are all finite; handle cases where x or y
232		 * is zero
233		 */
234		if (cx == 0 || cy == 0) {
235			/* either we have 0-0 or the result is the same as z */
236			if (cz == 0 && (int) ((xx.i[0] ^ yy.i[0]) ^ zz.i[0]) <
237				0) {
238				zz.i[0] = (fsr >> 30) == FSR_RM ? 0x80000000 :
239					0;
240				__fenv_setfsr32(&fsr);
241				return (zz.q);
242			}
243			__fenv_setfsr32(&fsr);
244			return (z);
245		}
246
247		/* if we get here, x and y are nonzero finite, z must be zero */
248		return (x * y);
249	}
250
251	/*
252	 * now x, y, and z are all finite and nonzero; set round-to-
253	 * negative-infinity mode
254	 */
255	__fenv_setfsr32(&fsr_rm);
256
257	/*
258	 * get the signs and exponents and normalize the significands
259	 * of x and y
260	 */
261	sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
262	ex = hx >> 16;
263	hx &= 0xffff;
264	if (!ex) {
265		if (hx | (xx.i[1] & 0xfffe0000)) {
266			ex = 1;
267		} else if (xx.i[1] | (xx.i[2] & 0xfffe0000)) {
268			hx = xx.i[1];
269			xx.i[1] = xx.i[2];
270			xx.i[2] = xx.i[3];
271			xx.i[3] = 0;
272			ex = -31;
273		} else if (xx.i[2] | (xx.i[3] & 0xfffe0000)) {
274			hx = xx.i[2];
275			xx.i[1] = xx.i[3];
276			xx.i[2] = xx.i[3] = 0;
277			ex = -63;
278		} else {
279			hx = xx.i[3];
280			xx.i[1] = xx.i[2] = xx.i[3] = 0;
281			ex = -95;
282		}
283		while ((hx & 0x10000) == 0) {
284			hx = (hx << 1) | (xx.i[1] >> 31);
285			xx.i[1] = (xx.i[1] << 1) | (xx.i[2] >> 31);
286			xx.i[2] = (xx.i[2] << 1) | (xx.i[3] >> 31);
287			xx.i[3] <<= 1;
288			ex--;
289		}
290	} else
291		hx |= 0x10000;
292	ey = hy >> 16;
293	hy &= 0xffff;
294	if (!ey) {
295		if (hy | (yy.i[1] & 0xfffe0000)) {
296			ey = 1;
297		} else if (yy.i[1] | (yy.i[2] & 0xfffe0000)) {
298			hy = yy.i[1];
299			yy.i[1] = yy.i[2];
300			yy.i[2] = yy.i[3];
301			yy.i[3] = 0;
302			ey = -31;
303		} else if (yy.i[2] | (yy.i[3] & 0xfffe0000)) {
304			hy = yy.i[2];
305			yy.i[1] = yy.i[3];
306			yy.i[2] = yy.i[3] = 0;
307			ey = -63;
308		} else {
309			hy = yy.i[3];
310			yy.i[1] = yy.i[2] = yy.i[3] = 0;
311			ey = -95;
312		}
313		while ((hy & 0x10000) == 0) {
314			hy = (hy << 1) | (yy.i[1] >> 31);
315			yy.i[1] = (yy.i[1] << 1) | (yy.i[2] >> 31);
316			yy.i[2] = (yy.i[2] << 1) | (yy.i[3] >> 31);
317			yy.i[3] <<= 1;
318			ey--;
319		}
320	} else
321		hy |= 0x10000;
322	exy = ex + ey - 0x3fff;
323
324	/* convert the significands of x and y to doubles */
325	c = twom16;
326	dx[0] = (double) ((int) hx) * c;
327	dy[0] = (double) ((int) hy) * c;
328
329	c *= twom24;
330	dx[1] = (double) ((int) (xx.i[1] >> 8)) * c;
331	dy[1] = (double) ((int) (yy.i[1] >> 8)) * c;
332
333	c *= twom24;
334	dx[2] = (double) ((int) (((xx.i[1] << 16) | (xx.i[2] >> 16)) &
335	    0xffffff)) * c;
336	dy[2] = (double) ((int) (((yy.i[1] << 16) | (yy.i[2] >> 16)) &
337	    0xffffff)) * c;
338
339	c *= twom24;
340	dx[3] = (double) ((int) (((xx.i[2] << 8) | (xx.i[3] >> 24)) &
341	    0xffffff)) * c;
342	dy[3] = (double) ((int) (((yy.i[2] << 8) | (yy.i[3] >> 24)) &
343	    0xffffff)) * c;
344
345	c *= twom24;
346	dx[4] = (double) ((int) (xx.i[3] & 0xffffff)) * c;
347	dy[4] = (double) ((int) (yy.i[3] & 0xffffff)) * c;
348
349	/* form the "digits" of the product */
350	dxy[0] = dx[0] * dy[0];
351	dxy[1] = dx[0] * dy[1] + dx[1] * dy[0];
352	dxy[2] = dx[0] * dy[2] + dx[1] * dy[1] + dx[2] * dy[0];
353	dxy[3] = dx[0] * dy[3] + dx[1] * dy[2] + dx[2] * dy[1] +
354	    dx[3] * dy[0];
355	dxy[4] = dx[0] * dy[4] + dx[1] * dy[3] + dx[2] * dy[2] +
356	    dx[3] * dy[1] + dx[4] * dy[0];
357	dxy[5] = dx[1] * dy[4] + dx[2] * dy[3] + dx[3] * dy[2] +
358	    dx[4] * dy[1];
359	dxy[6] = dx[2] * dy[4] + dx[3] * dy[3] + dx[4] * dy[2];
360	dxy[7] = dx[3] * dy[4] + dx[4] * dy[3];
361	dxy[8] = dx[4] * dy[4];
362
363	/* split odd-numbered terms and combine into even-numbered terms */
364	c = (dxy[1] + two20) - two20;
365	dxy[0] += c;
366	dxy[1] -= c;
367	c = (dxy[3] + twom28) - twom28;
368	dxy[2] += c + dxy[1];
369	dxy[3] -= c;
370	c = (dxy[5] + twom76) - twom76;
371	dxy[4] += c + dxy[3];
372	dxy[5] -= c;
373	c = (dxy[7] + twom124) - twom124;
374	dxy[6] += c + dxy[5];
375	dxy[8] += (dxy[7] - c);
376
377	/* propagate carries, adjusting the exponent if need be */
378	dxy[7] = dxy[6] + dxy[8];
379	dxy[5] = dxy[4] + dxy[7];
380	dxy[3] = dxy[2] + dxy[5];
381	dxy[1] = dxy[0] + dxy[3];
382	if (dxy[1] >= two) {
383		dxy[0] *= half;
384		dxy[1] *= half;
385		dxy[2] *= half;
386		dxy[3] *= half;
387		dxy[4] *= half;
388		dxy[5] *= half;
389		dxy[6] *= half;
390		dxy[7] *= half;
391		dxy[8] *= half;
392		exy++;
393	}
394
395	/* extract the significand of x*y */
396	s = two36;
397	u.d = c = dxy[1] + s;
398	xy0 = u.i[1];
399	c -= s;
400	dxy[1] -= c;
401	dxy[0] -= c;
402
403	s *= twom32;
404	u.d = c = dxy[1] + s;
405	xy1 = u.i[1];
406	c -= s;
407	dxy[2] += (dxy[0] - c);
408	dxy[3] = dxy[2] + dxy[5];
409
410	s *= twom32;
411	u.d = c = dxy[3] + s;
412	xy2 = u.i[1];
413	c -= s;
414	dxy[4] += (dxy[2] - c);
415	dxy[5] = dxy[4] + dxy[7];
416
417	s *= twom32;
418	u.d = c = dxy[5] + s;
419	xy3 = u.i[1];
420	c -= s;
421	dxy[4] -= c;
422	dxy[5] = dxy[4] + dxy[7];
423
424	s *= twom32;
425	u.d = c = dxy[5] + s;
426	xy4 = u.i[1];
427	c -= s;
428	dxy[6] += (dxy[4] - c);
429	dxy[7] = dxy[6] + dxy[8];
430
431	s *= twom32;
432	u.d = c = dxy[7] + s;
433	xy5 = u.i[1];
434	c -= s;
435	dxy[8] += (dxy[6] - c);
436
437	s *= twom32;
438	u.d = c = dxy[8] + s;
439	xy6 = u.i[1];
440	c -= s;
441	dxy[8] -= c;
442
443	s *= twom32;
444	u.d = c = dxy[8] + s;
445	xy7 = u.i[1];
446
447	/* extract the sign, exponent, and significand of z */
448	sz = zz.i[0] & 0x80000000;
449	ez = hz >> 16;
450	z0 = hz & 0xffff;
451	if (!ez) {
452		if (z0 | (zz.i[1] & 0xfffe0000)) {
453			z1 = zz.i[1];
454			z2 = zz.i[2];
455			z3 = zz.i[3];
456			ez = 1;
457		} else if (zz.i[1] | (zz.i[2] & 0xfffe0000)) {
458			z0 = zz.i[1];
459			z1 = zz.i[2];
460			z2 = zz.i[3];
461			z3 = 0;
462			ez = -31;
463		} else if (zz.i[2] | (zz.i[3] & 0xfffe0000)) {
464			z0 = zz.i[2];
465			z1 = zz.i[3];
466			z2 = z3 = 0;
467			ez = -63;
468		} else {
469			z0 = zz.i[3];
470			z1 = z2 = z3 = 0;
471			ez = -95;
472		}
473		while ((z0 & 0x10000) == 0) {
474			z0 = (z0 << 1) | (z1 >> 31);
475			z1 = (z1 << 1) | (z2 >> 31);
476			z2 = (z2 << 1) | (z3 >> 31);
477			z3 <<= 1;
478			ez--;
479		}
480	} else {
481		z0 |= 0x10000;
482		z1 = zz.i[1];
483		z2 = zz.i[2];
484		z3 = zz.i[3];
485	}
486	z4 = z5 = z6 = z7 = 0;
487
488	/*
489	 * now x*y is represented by sxy, exy, and xy[0-7], and z is
490	 * represented likewise; swap if need be so |xy| <= |z|
491	 */
492	if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 && (xy1 > z1 ||
493		(xy1 == z1 && (xy2 > z2 || (xy2 == z2 && (xy3 > z3 ||
494		(xy3 == z3 && (xy4 | xy5 | xy6 | xy7) != 0)))))))))) {
495		e = sxy; sxy = sz; sz = e;
496		e = exy; exy = ez; ez = e;
497		e = xy0; xy0 = z0; z0 = e;
498		e = xy1; xy1 = z1; z1 = e;
499		e = xy2; xy2 = z2; z2 = e;
500		e = xy3; xy3 = z3; z3 = e;
501		z4 = xy4; xy4 = 0;
502		z5 = xy5; xy5 = 0;
503		z6 = xy6; xy6 = 0;
504		z7 = xy7; xy7 = 0;
505	}
506
507	/* shift the significand of xy keeping a sticky bit */
508	e = ez - exy;
509	if (e > 236) {
510		xy0 = xy1 = xy2 = xy3 = xy4 = xy5 = xy6 = 0;
511		xy7 = 1;
512	} else if (e >= 224) {
513		sticky = xy7 | xy6 | xy5 | xy4 | xy3 | xy2 | xy1 |
514			((xy0 << 1) << (255 - e));
515		xy7 = xy0 >> (e - 224);
516		if (sticky)
517			xy7 |= 1;
518		xy0 = xy1 = xy2 = xy3 = xy4 = xy5 = xy6 = 0;
519	} else if (e >= 192) {
520		sticky = xy7 | xy6 | xy5 | xy4 | xy3 | xy2 |
521			((xy1 << 1) << (223 - e));
522		xy7 = (xy1 >> (e - 192)) | ((xy0 << 1) << (223 - e));
523		if (sticky)
524			xy7 |= 1;
525		xy6 = xy0 >> (e - 192);
526		xy0 = xy1 = xy2 = xy3 = xy4 = xy5 = 0;
527	} else if (e >= 160) {
528		sticky = xy7 | xy6 | xy5 | xy4 | xy3 |
529			((xy2 << 1) << (191 - e));
530		xy7 = (xy2 >> (e - 160)) | ((xy1 << 1) << (191 - e));
531		if (sticky)
532			xy7 |= 1;
533		xy6 = (xy1 >> (e - 160)) | ((xy0 << 1) << (191 - e));
534		xy5 = xy0 >> (e - 160);
535		xy0 = xy1 = xy2 = xy3 = xy4 = 0;
536	} else if (e >= 128) {
537		sticky = xy7 | xy6 | xy5 | xy4 | ((xy3 << 1) << (159 - e));
538		xy7 = (xy3 >> (e - 128)) | ((xy2 << 1) << (159 - e));
539		if (sticky)
540			xy7 |= 1;
541		xy6 = (xy2 >> (e - 128)) | ((xy1 << 1) << (159 - e));
542		xy5 = (xy1 >> (e - 128)) | ((xy0 << 1) << (159 - e));
543		xy4 = xy0 >> (e - 128);
544		xy0 = xy1 = xy2 = xy3 = 0;
545	} else if (e >= 96) {
546		sticky = xy7 | xy6 | xy5 | ((xy4 << 1) << (127 - e));
547		xy7 = (xy4 >> (e - 96)) | ((xy3 << 1) << (127 - e));
548		if (sticky)
549			xy7 |= 1;
550		xy6 = (xy3 >> (e - 96)) | ((xy2 << 1) << (127 - e));
551		xy5 = (xy2 >> (e - 96)) | ((xy1 << 1) << (127 - e));
552		xy4 = (xy1 >> (e - 96)) | ((xy0 << 1) << (127 - e));
553		xy3 = xy0 >> (e - 96);
554		xy0 = xy1 = xy2 = 0;
555	} else if (e >= 64) {
556		sticky = xy7 | xy6 | ((xy5 << 1) << (95 - e));
557		xy7 = (xy5 >> (e - 64)) | ((xy4 << 1) << (95 - e));
558		if (sticky)
559			xy7 |= 1;
560		xy6 = (xy4 >> (e - 64)) | ((xy3 << 1) << (95 - e));
561		xy5 = (xy3 >> (e - 64)) | ((xy2 << 1) << (95 - e));
562		xy4 = (xy2 >> (e - 64)) | ((xy1 << 1) << (95 - e));
563		xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
564		xy2 = xy0 >> (e - 64);
565		xy0 = xy1 = 0;
566	} else if (e >= 32) {
567		sticky = xy7 | ((xy6 << 1) << (63 - e));
568		xy7 = (xy6 >> (e - 32)) | ((xy5 << 1) << (63 - e));
569		if (sticky)
570			xy7 |= 1;
571		xy6 = (xy5 >> (e - 32)) | ((xy4 << 1) << (63 - e));
572		xy5 = (xy4 >> (e - 32)) | ((xy3 << 1) << (63 - e));
573		xy4 = (xy3 >> (e - 32)) | ((xy2 << 1) << (63 - e));
574		xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
575		xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
576		xy1 = xy0 >> (e - 32);
577		xy0 = 0;
578	} else if (e) {
579		sticky = (xy7 << 1) << (31 - e);
580		xy7 = (xy7 >> e) | ((xy6 << 1) << (31 - e));
581		if (sticky)
582			xy7 |= 1;
583		xy6 = (xy6 >> e) | ((xy5 << 1) << (31 - e));
584		xy5 = (xy5 >> e) | ((xy4 << 1) << (31 - e));
585		xy4 = (xy4 >> e) | ((xy3 << 1) << (31 - e));
586		xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
587		xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
588		xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
589		xy0 >>= e;
590	}
591
592	/* if this is a magnitude subtract, negate the significand of xy */
593	if (sxy ^ sz) {
594		xy0 = ~xy0;
595		xy1 = ~xy1;
596		xy2 = ~xy2;
597		xy3 = ~xy3;
598		xy4 = ~xy4;
599		xy5 = ~xy5;
600		xy6 = ~xy6;
601		xy7 = -xy7;
602		if (xy7 == 0)
603			if (++xy6 == 0)
604				if (++xy5 == 0)
605					if (++xy4 == 0)
606						if (++xy3 == 0)
607							if (++xy2 == 0)
608								if (++xy1 == 0)
609									xy0++;
610	}
611
612	/* add, propagating carries */
613	z7 += xy7;
614	e = (z7 < xy7);
615	z6 += xy6;
616	if (e) {
617		z6++;
618		e = (z6 <= xy6);
619	} else
620		e = (z6 < xy6);
621	z5 += xy5;
622	if (e) {
623		z5++;
624		e = (z5 <= xy5);
625	} else
626		e = (z5 < xy5);
627	z4 += xy4;
628	if (e) {
629		z4++;
630		e = (z4 <= xy4);
631	} else
632		e = (z4 < xy4);
633	z3 += xy3;
634	if (e) {
635		z3++;
636		e = (z3 <= xy3);
637	} else
638		e = (z3 < xy3);
639	z2 += xy2;
640	if (e) {
641		z2++;
642		e = (z2 <= xy2);
643	} else
644		e = (z2 < xy2);
645	z1 += xy1;
646	if (e) {
647		z1++;
648		e = (z1 <= xy1);
649	} else
650		e = (z1 < xy1);
651	z0 += xy0;
652	if (e)
653		z0++;
654
655	/* postnormalize and collect rounding information into z4 */
656	if (ez < 1) {
657		/* result is tiny; shift right until exponent is within range */
658		e = 1 - ez;
659		if (e > 116) {
660			z4 = 1; /* result can't be exactly zero */
661			z0 = z1 = z2 = z3 = 0;
662		} else if (e >= 96) {
663			sticky = z7 | z6 | z5 | z4 | z3 | z2 |
664				((z1 << 1) << (127 - e));
665			z4 = (z1 >> (e - 96)) | ((z0 << 1) << (127 - e));
666			if (sticky)
667				z4 |= 1;
668			z3 = z0 >> (e - 96);
669			z0 = z1 = z2 = 0;
670		} else if (e >= 64) {
671			sticky = z7 | z6 | z5 | z4 | z3 |
672				((z2 << 1) << (95 - e));
673			z4 = (z2 >> (e - 64)) | ((z1 << 1) << (95 - e));
674			if (sticky)
675				z4 |= 1;
676			z3 = (z1 >> (e - 64)) | ((z0 << 1) << (95 - e));
677			z2 = z0 >> (e - 64);
678			z0 = z1 = 0;
679		} else if (e >= 32) {
680			sticky = z7 | z6 | z5 | z4 | ((z3 << 1) << (63 - e));
681			z4 = (z3 >> (e - 32)) | ((z2 << 1) << (63 - e));
682			if (sticky)
683				z4 |= 1;
684			z3 = (z2 >> (e - 32)) | ((z1 << 1) << (63 - e));
685			z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
686			z1 = z0 >> (e - 32);
687			z0 = 0;
688		} else {
689			sticky = z7 | z6 | z5 | (z4 << 1) << (31 - e);
690			z4 = (z4 >> e) | ((z3 << 1) << (31 - e));
691			if (sticky)
692				z4 |= 1;
693			z3 = (z3 >> e) | ((z2 << 1) << (31 - e));
694			z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
695			z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
696			z0 >>= e;
697		}
698		ez = 1;
699	} else if (z0 >= 0x20000) {
700		/* carry out; shift right by one */
701		sticky = (z4 & 1) | z5 | z6 | z7;
702		z4 = (z4 >> 1) | (z3 << 31);
703		if (sticky)
704			z4 |= 1;
705		z3 = (z3 >> 1) | (z2 << 31);
706		z2 = (z2 >> 1) | (z1 << 31);
707		z1 = (z1 >> 1) | (z0 << 31);
708		z0 >>= 1;
709		ez++;
710	} else {
711		if (z0 < 0x10000 && (z0 | z1 | z2 | z3 | z4 | z5 | z6 | z7)
712			!= 0) {
713			/*
714			 * borrow/cancellation; shift left as much as
715			 * exponent allows
716			 */
717			while (!(z0 | (z1 & 0xfffe0000)) && ez >= 33) {
718				z0 = z1;
719				z1 = z2;
720				z2 = z3;
721				z3 = z4;
722				z4 = z5;
723				z5 = z6;
724				z6 = z7;
725				z7 = 0;
726				ez -= 32;
727			}
728			while (z0 < 0x10000 && ez > 1) {
729				z0 = (z0 << 1) | (z1 >> 31);
730				z1 = (z1 << 1) | (z2 >> 31);
731				z2 = (z2 << 1) | (z3 >> 31);
732				z3 = (z3 << 1) | (z4 >> 31);
733				z4 = (z4 << 1) | (z5 >> 31);
734				z5 = (z5 << 1) | (z6 >> 31);
735				z6 = (z6 << 1) | (z7 >> 31);
736				z7 <<= 1;
737				ez--;
738			}
739		}
740		if (z5 | z6 | z7)
741			z4 |= 1;
742	}
743
744	/* get the rounding mode */
745	rm = fsr >> 30;
746
747	/* strip off the integer bit, if there is one */
748	ibit = z0 & 0x10000;
749	if (ibit)
750		z0 -= 0x10000;
751	else {
752		ez = 0;
753		if (!(z0 | z1 | z2 | z3 | z4)) { /* exact zero */
754			zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
755			zz.i[1] = zz.i[2] = zz.i[3] = 0;
756			__fenv_setfsr32(&fsr);
757			return (zz.q);
758		}
759	}
760
761	/*
762	 * flip the sense of directed roundings if the result is negative;
763	 * the logic below applies to a positive result
764	 */
765	if (sz)
766		rm ^= rm >> 1;
767
768	/* round and raise exceptions */
769	if (z4) {
770		fsr |= FSR_NXC;
771
772		/* decide whether to round the fraction up */
773		if (rm == FSR_RP || (rm == FSR_RN && (z4 > 0x80000000u ||
774			(z4 == 0x80000000u && (z3 & 1))))) {
775			/* round up and renormalize if necessary */
776			if (++z3 == 0)
777				if (++z2 == 0)
778					if (++z1 == 0)
779						if (++z0 == 0x10000) {
780							z0 = 0;
781							ez++;
782						}
783		}
784	}
785
786	/* check for under/overflow */
787	if (ez >= 0x7fff) {
788		if (rm == FSR_RN || rm == FSR_RP) {
789			zz.i[0] = sz | 0x7fff0000;
790			zz.i[1] = zz.i[2] = zz.i[3] = 0;
791		} else {
792			zz.i[0] = sz | 0x7ffeffff;
793			zz.i[1] = zz.i[2] = zz.i[3] = 0xffffffff;
794		}
795		fsr |= FSR_OFC | FSR_NXC;
796	} else {
797		zz.i[0] = sz | (ez << 16) | z0;
798		zz.i[1] = z1;
799		zz.i[2] = z2;
800		zz.i[3] = z3;
801
802		/*
803		 * !ibit => exact result was tiny before rounding,
804		 * z4 nonzero => result delivered is inexact
805		 */
806		if (!ibit) {
807			if (z4)
808				fsr |= FSR_UFC | FSR_NXC;
809			else if (fsr & FSR_UFM)
810				fsr |= FSR_UFC;
811		}
812	}
813
814	/* restore the fsr and emulate exceptions as needed */
815	if ((fsr & FSR_CEXC) & (fsr >> 23)) {
816		__fenv_setfsr32(&fsr);
817		if (fsr & FSR_OFC) {
818			dummy = huge;
819			dummy *= huge;
820		} else if (fsr & FSR_UFC) {
821			dummy = tiny;
822			if (fsr & FSR_NXC)
823				dummy *= tiny;
824			else
825				dummy -= tiny2;
826		} else {
827			dummy = huge;
828			dummy += tiny;
829		}
830	} else {
831		fsr |= (fsr & 0x1f) << 5;
832		__fenv_setfsr32(&fsr);
833	}
834	return (zz.q);
835}
836
837#elif defined(__x86)
838
/*
 * Double-precision constants for the x86 implementation, written as
 * raw IEEE 754 bit patterns.  Each initializer gives the low-order
 * 32-bit word first and the high-order word second, which is the order
 * the words occupy in a double on this little-endian branch.  The
 * macros below give each table slot a symbolic name.
 */
static const union {
	unsigned i[2];
	double d;
} C[] = {
	{ 0, 0x3fe00000u },	/* 2^-1 */
	{ 0, 0x40000000u },	/* 2^1 */
	{ 0, 0x3df00000u },	/* 2^-32 */
	{ 0, 0x3bf00000u },	/* 2^-64 */
	{ 0, 0x41f00000u },	/* 2^32 */
	{ 0, 0x43e00000u },	/* 2^63 */
	{ 0, 0x7fe00000u },	/* 2^1023 */
	{ 0, 0x00100000u },	/* 2^-1022, smallest normal double */
	{ 0, 0x00100001u }	/* 2^-1022 * (1 + 2^-20) */
};

#define	half	C[0].d
#define	two	C[1].d
#define	twom32	C[2].d
#define	twom64	C[3].d
#define	two32	C[4].d
#define	two63	C[5].d
#define	huge	C[6].d
#define	tiny	C[7].d
#define	tiny2	C[8].d

/*
 * NI is the number of 32-bit words in the union that __fmal overlays
 * on a long double: four when __amd64 is defined, three otherwise.
 */
#if defined(__amd64)
#define	NI	4
#else
#define	NI	3
#endif
869
870/*
871 * fmal for x86: 80-bit extended double precision, little-endian
872 */
873long double
874__fmal(long double x, long double y, long double z) {
875	union {
876		unsigned i[NI];
877		long double e;
878	} xx, yy, zz;
879	long double xhi, yhi, xlo, ylo, t;
880	unsigned xy0, xy1, xy2, xy3, xy4, z0, z1, z2, z3, z4;
881	unsigned oldcwsw, cwsw, rm, sticky, carry;
882	int ex, ey, ez, exy, sxy, sz, e, tinyafter;
883	volatile double	dummy;
884
885	/* extract the exponents of the arguments */
886	xx.e = x;
887	yy.e = y;
888	zz.e = z;
889	ex = xx.i[2] & 0x7fff;
890	ey = yy.i[2] & 0x7fff;
891	ez = zz.i[2] & 0x7fff;
892
893	/* dispense with inf, nan, and zero cases */
894	if (ex == 0x7fff || ey == 0x7fff || (ex | xx.i[1] | xx.i[0]) == 0 ||
895		(ey | yy.i[1] | yy.i[0]) == 0)	/* x or y is inf, nan, or 0 */
896		return (x * y + z);
897
898	if (ez == 0x7fff)			/* z is inf or nan */
899		return (x + z);	/* avoid spurious under/overflow in x * y */
900
901	if ((ez | zz.i[1] | zz.i[0]) == 0)	/* z is zero */
902		/*
903		 * x * y isn't zero but could underflow to zero,
904		 * so don't add z, lest we perturb the sign
905		 */
906		return (x * y);
907
908	/*
909	 * now x, y, and z are all finite and nonzero; extract signs and
910	 * normalize the significands (this will raise the denormal operand
911	 * exception if need be)
912	 */
913	sxy = (xx.i[2] ^ yy.i[2]) & 0x8000;
914	sz = zz.i[2] & 0x8000;
915	if (!ex) {
916		xx.e = x * two63;
917		ex = (xx.i[2] & 0x7fff) - 63;
918	}
919	if (!ey) {
920		yy.e = y * two63;
921		ey = (yy.i[2] & 0x7fff) - 63;
922	}
923	if (!ez) {
924		zz.e = z * two63;
925		ez = (zz.i[2] & 0x7fff) - 63;
926	}
927
928	/*
929	 * save the control and status words, mask all exceptions, and
930	 * set rounding to 64-bit precision and toward-zero
931	 */
932	__fenv_getcwsw(&oldcwsw);
933	cwsw = (oldcwsw & 0xf0c0ffff) | 0x0f3f0000;
934	__fenv_setcwsw(&cwsw);
935
936	/* multiply x*y to 128 bits */
937	exy = ex + ey - 0x3fff;
938	xx.i[2] = 0x3fff;
939	yy.i[2] = 0x3fff;
940	x = xx.e;
941	y = yy.e;
942	xhi = ((x + twom32) + two32) - two32;
943	yhi = ((y + twom32) + two32) - two32;
944	xlo = x - xhi;
945	ylo = y - yhi;
946	x *= y;
947	y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
948	if (x >= two) {
949		x *= half;
950		y *= half;
951		exy++;
952	}
953
954	/* extract the significands */
955	xx.e = x;
956	xy0 = xx.i[1];
957	xy1 = xx.i[0];
958	yy.e = t = y + twom32;
959	xy2 = yy.i[0];
960	yy.e = (y - (t - twom32)) + twom64;
961	xy3 = yy.i[0];
962	xy4 = 0;
963	z0 = zz.i[1];
964	z1 = zz.i[0];
965	z2 = z3 = z4 = 0;
966
967	/*
968	 * now x*y is represented by sxy, exy, and xy[0-4], and z is
969	 * represented likewise; swap if need be so |xy| <= |z|
970	 */
971	if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
972		(xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
973		e = sxy; sxy = sz; sz = e;
974		e = exy; exy = ez; ez = e;
975		e = xy0; xy0 = z0; z0 = e;
976		e = xy1; xy1 = z1; z1 = e;
977		z2 = xy2; xy2 = 0;
978		z3 = xy3; xy3 = 0;
979	}
980
981	/* shift the significand of xy keeping a sticky bit */
982	e = ez - exy;
983	if (e > 130) {
984		xy0 = xy1 = xy2 = xy3 = 0;
985		xy4 = 1;
986	} else if (e >= 128) {
987		sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (159 - e));
988		xy4 = xy0 >> (e - 128);
989		if (sticky)
990			xy4 |= 1;
991		xy0 = xy1 = xy2 = xy3 = 0;
992	} else if (e >= 96) {
993		sticky = xy3 | xy2 | ((xy1 << 1) << (127 - e));
994		xy4 = (xy1 >> (e - 96)) | ((xy0 << 1) << (127 - e));
995		if (sticky)
996			xy4 |= 1;
997		xy3 = xy0 >> (e - 96);
998		xy0 = xy1 = xy2 = 0;
999	} else if (e >= 64) {
1000		sticky = xy3 | ((xy2 << 1) << (95 - e));
1001		xy4 = (xy2 >> (e - 64)) | ((xy1 << 1) << (95 - e));
1002		if (sticky)
1003			xy4 |= 1;
1004		xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
1005		xy2 = xy0 >> (e - 64);
1006		xy0 = xy1 = 0;
1007	} else if (e >= 32) {
1008		sticky = (xy3 << 1) << (63 - e);
1009		xy4 = (xy3 >> (e - 32)) | ((xy2 << 1) << (63 - e));
1010		if (sticky)
1011			xy4 |= 1;
1012		xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
1013		xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
1014		xy1 = xy0 >> (e - 32);
1015		xy0 = 0;
1016	} else if (e) {
1017		xy4 = (xy3 << 1) << (31 - e);
1018		xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
1019		xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
1020		xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
1021		xy0 >>= e;
1022	}
1023
1024	/* if this is a magnitude subtract, negate the significand of xy */
1025	if (sxy ^ sz) {
1026		xy0 = ~xy0;
1027		xy1 = ~xy1;
1028		xy2 = ~xy2;
1029		xy3 = ~xy3;
1030		xy4 = -xy4;
1031		if (xy4 == 0)
1032			if (++xy3 == 0)
1033				if (++xy2 == 0)
1034					if (++xy1 == 0)
1035						xy0++;
1036	}
1037
1038	/* add, propagating carries */
1039	z4 += xy4;
1040	carry = (z4 < xy4);
1041	z3 += xy3;
1042	if (carry) {
1043		z3++;
1044		carry = (z3 <= xy3);
1045	} else
1046		carry = (z3 < xy3);
1047	z2 += xy2;
1048	if (carry) {
1049		z2++;
1050		carry = (z2 <= xy2);
1051	} else
1052		carry = (z2 < xy2);
1053	z1 += xy1;
1054	if (carry) {
1055		z1++;
1056		carry = (z1 <= xy1);
1057	} else
1058		carry = (z1 < xy1);
1059	z0 += xy0;
1060	if (carry) {
1061		z0++;
1062		carry = (z0 <= xy0);
1063	} else
1064		carry = (z0 < xy0);
1065
1066	/* for a magnitude subtract, ignore the last carry out */
1067	if (sxy ^ sz)
1068		carry = 0;
1069
1070	/* postnormalize and collect rounding information into z2 */
1071	if (ez < 1) {
1072		/* result is tiny; shift right until exponent is within range */
1073		e = 1 - ez;
1074		if (e > 67) {
1075			z2 = 1;	/* result can't be exactly zero */
1076			z0 = z1 = 0;
1077		} else if (e >= 64) {
1078			sticky = z4 | z3 | z2 | z1 | ((z0 << 1) << (95 - e));
1079			z2 = (z0 >> (e - 64)) | ((carry << 1) << (95 - e));
1080			if (sticky)
1081				z2 |= 1;
1082			z1 = carry >> (e - 64);
1083			z0 = 0;
1084		} else if (e >= 32) {
1085			sticky = z4 | z3 | z2 | ((z1 << 1) << (63 - e));
1086			z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
1087			if (sticky)
1088				z2 |= 1;
1089			z1 = (z0 >> (e - 32)) | ((carry << 1) << (63 - e));
1090			z0 = carry >> (e - 32);
1091		} else {
1092			sticky = z4 | z3 | (z2 << 1) << (31 - e);
1093			z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
1094			if (sticky)
1095				z2 |= 1;
1096			z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
1097			z0 = (z0 >> e) | ((carry << 1) << (31 - e));
1098		}
1099		ez = 1;
1100	} else if (carry) {
1101		/* carry out; shift right by one */
1102		sticky = (z2 & 1) | z3 | z4;
1103		z2 = (z2 >> 1) | (z1 << 31);
1104		if (sticky)
1105			z2 |= 1;
1106		z1 = (z1 >> 1) | (z0 << 31);
1107		z0 = (z0 >> 1) | 0x80000000;
1108		ez++;
1109	} else {
1110		if (z0 < 0x80000000u && (z0 | z1 | z2 | z3 | z4) != 0) {
1111			/*
1112			 * borrow/cancellation; shift left as much as
1113			 * exponent allows
1114			 */
1115			while (!z0 && ez >= 33) {
1116				z0 = z1;
1117				z1 = z2;
1118				z2 = z3;
1119				z3 = z4;
1120				z4 = 0;
1121				ez -= 32;
1122			}
1123			while (z0 < 0x80000000u && ez > 1) {
1124				z0 = (z0 << 1) | (z1 >> 31);
1125				z1 = (z1 << 1) | (z2 >> 31);
1126				z2 = (z2 << 1) | (z3 >> 31);
1127				z3 = (z3 << 1) | (z4 >> 31);
1128				z4 <<= 1;
1129				ez--;
1130			}
1131		}
1132		if (z3 | z4)
1133			z2 |= 1;
1134	}
1135
1136	/* get the rounding mode */
1137	rm = oldcwsw & 0x0c000000;
1138
1139	/* adjust exponent if result is subnormal */
1140	tinyafter = 0;
1141	if (!(z0 & 0x80000000)) {
1142		ez = 0;
1143		tinyafter = 1;
1144		if (!(z0 | z1 | z2)) { /* exact zero */
1145			zz.i[2] = rm == FCW_RM ? 0x8000 : 0;
1146			zz.i[1] = zz.i[0] = 0;
1147			__fenv_setcwsw(&oldcwsw);
1148			return (zz.e);
1149		}
1150	}
1151
1152	/*
1153	 * flip the sense of directed roundings if the result is negative;
1154	 * the logic below applies to a positive result
1155	 */
1156	if (sz && (rm == FCW_RM || rm == FCW_RP))
1157		rm = (FCW_RM + FCW_RP) - rm;
1158
1159	/* round */
1160	if (z2) {
1161		if (rm == FCW_RP || (rm == FCW_RN && (z2 > 0x80000000u ||
1162			(z2 == 0x80000000u && (z1 & 1))))) {
1163			/* round up and renormalize if necessary */
1164			if (++z1 == 0) {
1165				if (++z0 == 0) {
1166					z0 = 0x80000000;
1167					ez++;
1168				} else if (z0 == 0x80000000) {
1169					/* rounded up to smallest normal */
1170					ez = 1;
1171					if ((rm == FCW_RP && z2 >
1172						0x80000000u) || (rm == FCW_RN &&
1173						z2 >= 0xc0000000u))
1174						/*
1175						 * would have rounded up to
1176						 * smallest normal even with
1177						 * unbounded range
1178						 */
1179						tinyafter = 0;
1180				}
1181			}
1182		}
1183	}
1184
1185	/* restore the control and status words, check for over/underflow */
1186	__fenv_setcwsw(&oldcwsw);
1187	if (ez >= 0x7fff) {
1188		if (rm == FCW_RN || rm == FCW_RP) {
1189			zz.i[2] = sz | 0x7fff;
1190			zz.i[1] = 0x80000000;
1191			zz.i[0] = 0;
1192		} else {
1193			zz.i[2] = sz | 0x7ffe;
1194			zz.i[1] = 0xffffffff;
1195			zz.i[0] = 0xffffffff;
1196		}
1197		dummy = huge;
1198		dummy *= huge;
1199	} else {
1200		zz.i[2] = sz | ez;
1201		zz.i[1] = z0;
1202		zz.i[0] = z1;
1203
1204		/*
1205		 * tinyafter => result rounded w/ unbounded range would be tiny,
1206		 * z2 nonzero => result delivered is inexact
1207		 */
1208		if (tinyafter) {
1209			dummy = tiny;
1210			if (z2)
1211				dummy *= tiny;
1212			else
1213				dummy -= tiny2;
1214		} else if (z2) {
1215			dummy = huge;
1216			dummy += tiny;
1217		}
1218	}
1219
1220	return (zz.e);
1221}
1222
1223#else
1224#error Unknown architecture
1225#endif
1226