xref: /illumos-gate/usr/src/lib/libm/common/m9x/fmaf.c (revision 1ec68d33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24  */
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #pragma weak fmaf = __fmaf
31 
32 #include "libm.h"
33 #include "fma.h"
34 #include "fenv_inlines.h"
35 
36 #if defined(__sparc)
37 
38 /*
39  * fmaf for SPARC: 32-bit single precision, big-endian
40  */
41 float
__fmaf(float x,float y,float z)42 __fmaf(float x, float y, float z) {
43 	union {
44 		unsigned i[2];
45 		double d;
46 	} xy, zz;
47 	unsigned u, s;
48 	int exy, ez;
49 
50 	/*
51 	 * the following operations can only raise the invalid exception,
52 	 * and then only if either x*y is of the form Inf*0 or one of x,
53 	 * y, or z is a signaling NaN
54 	 */
55 	xy.d = (double) x * y;
56 	zz.d = (double) z;
57 
58 	/*
59 	 * if the sum xy + z will be exact, just compute it and cast the
60 	 * result to float
61 	 */
62 	exy = (xy.i[0] >> 20) & 0x7ff;
63 	ez = (zz.i[0] >> 20) & 0x7ff;
64 	if ((ez - exy <= 4 && exy - ez <= 28) || exy == 0x7ff || exy == 0 ||
65 		ez == 0x7ff || ez == 0) {
66 		return ((float) (xy.d + zz.d));
67 	}
68 
69 	/*
70 	 * collapse the tail of the smaller summand into a "sticky bit"
71 	 * so that the sum can be computed without error
72 	 */
73 	if (ez > exy) {
74 		if (ez - exy < 31) {
75 			u = xy.i[1];
76 			s = 2 << (ez - exy);
77 			if (u & (s - 1))
78 				u |= s;
79 			xy.i[1] = u & ~(s - 1);
80 		} else if (ez - exy < 51) {
81 			u = xy.i[0];
82 			s = 1 << (ez - exy - 31);
83 			if ((u & (s - 1)) | xy.i[1])
84 				u |= s;
85 			xy.i[0] = u & ~(s - 1);
86 			xy.i[1] = 0;
87 		} else {
88 			/* collapse all of xy into a single bit */
89 			xy.i[0] = (xy.i[0] & 0x80000000) | ((ez - 51) << 20);
90 			xy.i[1] = 0;
91 		}
92 	} else {
93 		if (exy - ez < 31) {
94 			u = zz.i[1];
95 			s = 2 << (exy - ez);
96 			if (u & (s - 1))
97 				u |= s;
98 			zz.i[1] = u & ~(s - 1);
99 		} else if (exy - ez < 51) {
100 			u = zz.i[0];
101 			s = 1 << (exy - ez - 31);
102 			if ((u & (s - 1)) | zz.i[1])
103 				u |= s;
104 			zz.i[0] = u & ~(s - 1);
105 			zz.i[1] = 0;
106 		} else {
107 			/* collapse all of zz into a single bit */
108 			zz.i[0] = (zz.i[0] & 0x80000000) | ((exy - 51) << 20);
109 			zz.i[1] = 0;
110 		}
111 	}
112 
113 	return ((float) (xy.d + zz.d));
114 }
115 
116 #elif defined(__x86)
117 
118 #if defined(__amd64)
119 #define	NI	4
120 #else
121 #define	NI	3
122 #endif
123 
124 /*
125  * fmaf for x86: 32-bit single precision, little-endian
126  */
127 float
__fmaf(float x,float y,float z)128 __fmaf(float x, float y, float z) {
129 	union {
130 		unsigned i[NI];
131 		long double e;
132 	} xy, zz;
133 	unsigned u, s, cwsw, oldcwsw;
134 	int exy, ez;
135 
136 	/* set rounding precision to 64 bits */
137 	__fenv_getcwsw(&oldcwsw);
138 	cwsw = (oldcwsw & 0xfcffffff) | 0x03000000;
139 	__fenv_setcwsw(&cwsw);
140 
141 	/*
142 	 * the following operations can only raise the invalid exception,
143 	 * and then only if either x*y is of the form Inf*0 or one of x,
144 	 * y, or z is a signaling NaN
145 	 */
146 	xy.e = (long double) x * y;
147 	zz.e = (long double) z;
148 
149 	/*
150 	 * if the sum xy + z will be exact, just compute it and cast the
151 	 * result to float
152 	 */
153 	exy = xy.i[2] & 0x7fff;
154 	ez = zz.i[2] & 0x7fff;
155 	if ((ez - exy <= 15 && exy - ez <= 39) || exy == 0x7fff || exy == 0 ||
156 		ez == 0x7fff || ez == 0) {
157 		goto cont;
158 	}
159 
160 	/*
161 	 * collapse the tail of the smaller summand into a "sticky bit"
162 	 * so that the sum can be computed without error
163 	 */
164 	if (ez > exy) {
165 		if (ez - exy < 31) {
166 			u = xy.i[0];
167 			s = 2 << (ez - exy);
168 			if (u & (s - 1))
169 				u |= s;
170 			xy.i[0] = u & ~(s - 1);
171 		} else if (ez - exy < 62) {
172 			u = xy.i[1];
173 			s = 1 << (ez - exy - 31);
174 			if ((u & (s - 1)) | xy.i[0])
175 				u |= s;
176 			xy.i[1] = u & ~(s - 1);
177 			xy.i[0] = 0;
178 		} else {
179 			/* collapse all of xy into a single bit */
180 			xy.i[0] = 0;
181 			xy.i[1] = 0x80000000;
182 			xy.i[2] = (xy.i[2] & 0x8000) | (ez - 62);
183 		}
184 	} else {
185 		if (exy - ez < 62) {
186 			u = zz.i[1];
187 			s = 1 << (exy - ez - 31);
188 			if ((u & (s - 1)) | zz.i[0])
189 				u |= s;
190 			zz.i[1] = u & ~(s - 1);
191 			zz.i[0] = 0;
192 		} else {
193 			/* collapse all of zz into a single bit */
194 			zz.i[0] = 0;
195 			zz.i[1] = 0x80000000;
196 			zz.i[2] = (zz.i[2] & 0x8000) | (exy - 62);
197 		}
198 	}
199 
200 cont:
201 	xy.e += zz.e;
202 
203 	/* restore the rounding precision */
204 	__fenv_getcwsw(&cwsw);
205 	cwsw = (cwsw & 0xfcffffff) | (oldcwsw & 0x03000000);
206 	__fenv_setcwsw(&cwsw);
207 
208 	return ((float) xy.e);
209 }
210 
211 #if 0
212 /*
213  * another fmaf for x86: assumes return value will be left in
214  * long double (80-bit double extended) precision
215  */
216 long double
217 __fmaf(float x, float y, float z) {
218 	/*
219 	 * Note: This implementation assumes the rounding precision mode
220 	 * is set to the default, rounding to 64 bit precision.  If this
221 	 * routine must work in non-default rounding precision modes, do
222 	 * the following instead:
223 	 *
224 	 *   long double t;
225 	 *
226 	 *   <set rp mode to round to 64 bit precision>
227 	 *   t = x * y;
228 	 *   <restore rp mode>
229 	 *   return t + z;
230 	 *
231 	 * Note that the code to change rounding precision must not alter
232 	 * the exception masks or flags, since the product x * y may raise
233 	 * an invalid operation exception.
234 	 */
235 	return ((long double) x * y + z);
236 }
237 #endif
238 
239 #else
240 #error Unknown architecture
241 #endif
242