1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29/*
30 * __vsinf: single precision vector sin
31 *
32 * Algorithm:
33 *
34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
36 * z*C2))), where z = x*x, all evaluated in double precision.
37 *
38 * Accuracy:
39 *
40 * The largest error is less than 0.6 ulps.
41 */
42
43#include <sys/isa_defs.h>
44
45#ifdef _LITTLE_ENDIAN
46#define	HI(x)	*(1+(int *)&x)
47#define	LO(x)	*(unsigned *)&x
48#else
49#define	HI(x)	*(int *)&x
50#define	LO(x)	*(1+(unsigned *)&x)
51#endif
52
53#ifdef __RESTRICT
54#define	restrict _Restrict
55#else
56#define	restrict
57#endif
58
59extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
60
61static const double C[] = {
62	-1.66666552424430847168e-01,	/* 2^ -3 * -1.5555460000000 */
63	8.33219196647405624390e-03,	/* 2^ -7 *  1.11077E0000000 */
64	-1.95187909412197768688e-04,	/* 2^-13 * -1.9956B60000000 */
65	1.0,
66	-0.5,
67	4.16666455566883087158e-02,	/* 2^ -5 *  1.55554A0000000 */
68	-1.38873036485165357590e-03,	/* 2^-10 * -1.6C0C1E0000000 */
69	2.44309903791872784495e-05,	/* 2^-16 *  1.99E24E0000000 */
70	0.636619772367581343075535,	/* 2^ -1  * 1.45F306DC9C883 */
71	6755399441055744.0,		/* 2^ 52  * 1.8000000000000 */
72	1.570796326734125614166,	/* 2^  0  * 1.921FB54400000 */
73	6.077100506506192601475e-11,	/* 2^-34  * 1.0B4611A626331 */
74};
75
76#define	S0	C[0]
77#define	S1	C[1]
78#define	S2	C[2]
79#define	one	C[3]
80#define	mhalf	C[4]
81#define	C0	C[5]
82#define	C1	C[6]
83#define	C2	C[7]
84#define	invpio2	C[8]
85#define	c3two51	C[9]
86#define	pio2_1  C[10]
87#define	pio2_t	C[11]
88
89#define	PREPROCESS(N, index, label)					\
90	hx = *(int *)x;							\
91	ix = hx & 0x7fffffff;						\
92	t = *x;								\
93	x += stridex;							\
94	if (ix <= 0x3f490fdb) { /* |x| < pi/4 */			\
95		if (ix == 0) {						\
96			y[index] = t;					\
97			goto label;					\
98		}							\
99		y##N = (double)t;					\
100		n##N = 0;						\
101	} else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */		\
102		y##N = (double)t;					\
103		medium = 1;						\
104	} else {							\
105		if (ix >= 0x7f800000) { /* inf or nan */		\
106			y[index] = t / t;				\
107			goto label;					\
108		}							\
109		z##N = y##N = (double)t;				\
110		hx = HI(y##N);						\
111		n##N = ((hx >> 20) & 0x7ff) - 1046;			\
112		HI(z##N) = (hx & 0xfffff) | 0x41600000;			\
113		n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0);	\
114		if (hx < 0) {						\
115			y##N = -y##N;					\
116			n##N = -n##N;					\
117		}							\
118		z##N = y##N * y##N;					\
119		if (n##N & 1) { /* compute cos y */			\
120			f##N = (float)(one + z##N * (mhalf + z##N *	\
121			    (C0 + z##N * (C1 + z##N * C2))));		\
122		} else { /* compute sin y */				\
123			f##N = (float)(y##N + y##N * z##N * (S0 +	\
124			    z##N * (S1 + z##N * S2)));			\
125		}							\
126		y[index] = (n##N & 2)? -f##N : f##N;			\
127		goto label;						\
128	}
129
130#define	PROCESS(N)							\
131	if (medium) {							\
132		z##N = y##N * invpio2 + c3two51;			\
133		n##N = LO(z##N);					\
134		z##N -= c3two51;					\
135		y##N = (y##N - z##N * pio2_1) - z##N * pio2_t;		\
136	}								\
137	z##N = y##N * y##N;						\
138	if (n##N & 1) { /* compute cos y */				\
139		f##N = (float)(one + z##N * (mhalf + z##N * (C0 +	\
140		    z##N * (C1 + z##N * C2))));				\
141	} else { /* compute sin y */					\
142		f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 +	\
143		    z##N * S2)));					\
144	}								\
145	*y = (n##N & 2)? -f##N : f##N;					\
146	y += stridey
147
148void
149__vsinf(int n, float *restrict x, int stridex, float *restrict y,
150    int stridey)
151{
152	double		y0, y1, y2, y3;
153	double		z0, z1, z2, z3;
154	float		f0, f1, f2, f3, t;
155	int		n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
156
157	y -= stridey;
158
159	for (;;) {
160begin:
161		y += stridey;
162
163		if (--n < 0)
164			break;
165
166		medium = 0;
167		PREPROCESS(0, 0, begin);
168
169		if (--n < 0)
170			goto process1;
171
172		PREPROCESS(1, stridey, process1);
173
174		if (--n < 0)
175			goto process2;
176
177		PREPROCESS(2, (stridey << 1), process2);
178
179		if (--n < 0)
180			goto process3;
181
182		PREPROCESS(3, (stridey << 1) + stridey, process3);
183
184		if (medium) {
185			z0 = y0 * invpio2 + c3two51;
186			z1 = y1 * invpio2 + c3two51;
187			z2 = y2 * invpio2 + c3two51;
188			z3 = y3 * invpio2 + c3two51;
189
190			n0 = LO(z0);
191			n1 = LO(z1);
192			n2 = LO(z2);
193			n3 = LO(z3);
194
195			z0 -= c3two51;
196			z1 -= c3two51;
197			z2 -= c3two51;
198			z3 -= c3two51;
199
200			y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
201			y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
202			y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
203			y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
204		}
205
206		z0 = y0 * y0;
207		z1 = y1 * y1;
208		z2 = y2 * y2;
209		z3 = y3 * y3;
210
211		hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) |
212		    ((n3 & 1) << 3);
213		switch (hx) {
214		case 0:
215			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
216			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
217			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
218			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
219			break;
220
221		case 1:
222			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
223			    z0 * (C1 + z0 * C2))));
224			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
225			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
226			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
227			break;
228
229		case 2:
230			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
231			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
232			    z1 * (C1 + z1 * C2))));
233			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
234			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
235			break;
236
237		case 3:
238			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
239			    z0 * (C1 + z0 * C2))));
240			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
241			    z1 * (C1 + z1 * C2))));
242			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
243			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
244			break;
245
246		case 4:
247			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
248			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
249			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
250			    z2 * (C1 + z2 * C2))));
251			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
252			break;
253
254		case 5:
255			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
256			    z0 * (C1 + z0 * C2))));
257			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
258			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
259			    z2 * (C1 + z2 * C2))));
260			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
261			break;
262
263		case 6:
264			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
265			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
266			    z1 * (C1 + z1 * C2))));
267			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
268			    z2 * (C1 + z2 * C2))));
269			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
270			break;
271
272		case 7:
273			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
274			    z0 * (C1 + z0 * C2))));
275			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
276			    z1 * (C1 + z1 * C2))));
277			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
278			    z2 * (C1 + z2 * C2))));
279			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
280			break;
281
282		case 8:
283			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
284			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
285			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
286			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
287			    z3 * (C1 + z3 * C2))));
288			break;
289
290		case 9:
291			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
292			    z0 * (C1 + z0 * C2))));
293			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
294			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
295			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
296			    z3 * (C1 + z3 * C2))));
297			break;
298
299		case 10:
300			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
301			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
302			    z1 * (C1 + z1 * C2))));
303			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
304			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
305			    z3 * (C1 + z3 * C2))));
306			break;
307
308		case 11:
309			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
310			    z0 * (C1 + z0 * C2))));
311			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
312			    z1 * (C1 + z1 * C2))));
313			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
314			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
315			    z3 * (C1 + z3 * C2))));
316			break;
317
318		case 12:
319			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
320			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
321			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
322			    z2 * (C1 + z2 * C2))));
323			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
324			    z3 * (C1 + z3 * C2))));
325			break;
326
327		case 13:
328			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
329			    z0 * (C1 + z0 * C2))));
330			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
331			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
332			    z2 * (C1 + z2 * C2))));
333			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
334			    z3 * (C1 + z3 * C2))));
335			break;
336
337		case 14:
338			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
339			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
340			    z1 * (C1 + z1 * C2))));
341			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
342			    z2 * (C1 + z2 * C2))));
343			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
344			    z3 * (C1 + z3 * C2))));
345			break;
346
347		default:
348			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
349			    z0 * (C1 + z0 * C2))));
350			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
351			    z1 * (C1 + z1 * C2))));
352			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
353			    z2 * (C1 + z2 * C2))));
354			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
355			    z3 * (C1 + z3 * C2))));
356		}
357
358		*y = (n0 & 2)? -f0 : f0;
359		y += stridey;
360		*y = (n1 & 2)? -f1 : f1;
361		y += stridey;
362		*y = (n2 & 2)? -f2 : f2;
363		y += stridey;
364		*y = (n3 & 2)? -f3 : f3;
365		continue;
366
367process1:
368		PROCESS(0);
369		continue;
370
371process2:
372		PROCESS(0);
373		PROCESS(1);
374		continue;
375
376process3:
377		PROCESS(0);
378		PROCESS(1);
379		PROCESS(2);
380	}
381}
382