1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29/*
30 * __vcosf: single precision vector cos
31 *
32 * Algorithm:
33 *
34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
36 * z*C2))), where z = x*x, all evaluated in double precision.
37 *
38 * Accuracy:
39 *
40 * The largest error is less than 0.6 ulps.
41 */
42
43#include <sys/isa_defs.h>
44
45#ifdef _LITTLE_ENDIAN
46#define	HI(x)	*(1+(int *)&x)
47#define	LO(x)	*(unsigned *)&x
48#else
49#define	HI(x)	*(int *)&x
50#define	LO(x)	*(1+(unsigned *)&x)
51#endif
52
53#ifdef __RESTRICT
54#define	restrict _Restrict
55#else
56#define	restrict
57#endif
58
59extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
60
61static const double C[] = {
62	-1.66666552424430847168e-01,	/* 2^ -3 * -1.5555460000000 */
63	8.33219196647405624390e-03,	/* 2^ -7 *  1.11077E0000000 */
64	-1.95187909412197768688e-04,	/* 2^-13 * -1.9956B60000000 */
65	1.0,
66	-0.5,
67	4.16666455566883087158e-02,	/* 2^ -5 *  1.55554A0000000 */
68	-1.38873036485165357590e-03,	/* 2^-10 * -1.6C0C1E0000000 */
69	2.44309903791872784495e-05,	/* 2^-16 *  1.99E24E0000000 */
70	0.636619772367581343075535,	/* 2^ -1  * 1.45F306DC9C883 */
71	6755399441055744.0,		/* 2^ 52  * 1.8000000000000 */
72	1.570796326734125614166,	/* 2^  0  * 1.921FB54400000 */
73	6.077100506506192601475e-11,	/* 2^-34  * 1.0B4611A626331 */
74};
75
76#define	S0	C[0]
77#define	S1	C[1]
78#define	S2	C[2]
79#define	one	C[3]
80#define	mhalf	C[4]
81#define	C0	C[5]
82#define	C1	C[6]
83#define	C2	C[7]
84#define	invpio2	C[8]
85#define	c3two51	C[9]
86#define	pio2_1  C[10]
87#define	pio2_t	C[11]
88
89#define	PREPROCESS(N, index, label)					\
90	hx = *(int *)x;							\
91	ix = hx & 0x7fffffff;						\
92	t = *x;								\
93	x += stridex;							\
94	if (ix <= 0x3f490fdb) { /* |x| < pi/4 */			\
95		if (ix == 0) {						\
96			y[index] = one;					\
97			goto label;					\
98		}							\
99		y##N = (double)t;					\
100		n##N = 1;						\
101	} else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */		\
102		y##N = (double)t;					\
103		medium = 1;						\
104	} else {							\
105		if (ix >= 0x7f800000) { /* inf or nan */		\
106			y[index] = t / t;				\
107			goto label;					\
108		}							\
109		z##N = y##N = (double)t;				\
110		hx = HI(y##N);						\
111		n##N = ((hx >> 20) & 0x7ff) - 1046;			\
112		HI(z##N) = (hx & 0xfffff) | 0x41600000;			\
113		n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0) + 1;	\
114		z##N = y##N * y##N;					\
115		if (n##N & 1) { /* compute cos y */			\
116			f##N = (float)(one + z##N * (mhalf + z##N *	\
117			    (C0 + z##N * (C1 + z##N * C2))));		\
118		} else { /* compute sin y */				\
119			f##N = (float)(y##N + y##N * z##N * (S0 +	\
120			    z##N * (S1 + z##N * S2)));			\
121		}							\
122		y[index] = (n##N & 2)? -f##N : f##N;			\
123		goto label;						\
124	}
125
126#define	PROCESS(N)							\
127	if (medium) {							\
128		z##N = y##N * invpio2 + c3two51;			\
129		n##N = LO(z##N) + 1;					\
130		z##N -= c3two51;					\
131		y##N = (y##N - z##N * pio2_1) - z##N * pio2_t;		\
132	}								\
133	z##N = y##N * y##N;						\
134	if (n##N & 1) { /* compute cos y */				\
135		f##N = (float)(one + z##N * (mhalf + z##N * (C0 +	\
136		    z##N * (C1 + z##N * C2))));				\
137	} else { /* compute sin y */					\
138		f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 +	\
139		    z##N * S2)));					\
140	}								\
141	*y = (n##N & 2)? -f##N : f##N;					\
142	y += stridey
143
144void
145__vcosf(int n, float *restrict x, int stridex, float *restrict y,
146    int stridey)
147{
148	double		y0, y1, y2, y3;
149	double		z0, z1, z2, z3;
150	float		f0, f1, f2, f3, t;
151	int		n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
152
153	y -= stridey;
154
155	for (;;) {
156begin:
157		y += stridey;
158
159		if (--n < 0)
160			break;
161
162		medium = 0;
163		PREPROCESS(0, 0, begin);
164
165		if (--n < 0)
166			goto process1;
167
168		PREPROCESS(1, stridey, process1);
169
170		if (--n < 0)
171			goto process2;
172
173		PREPROCESS(2, (stridey << 1), process2);
174
175		if (--n < 0)
176			goto process3;
177
178		PREPROCESS(3, (stridey << 1) + stridey, process3);
179
180		if (medium) {
181			z0 = y0 * invpio2 + c3two51;
182			z1 = y1 * invpio2 + c3two51;
183			z2 = y2 * invpio2 + c3two51;
184			z3 = y3 * invpio2 + c3two51;
185
186			n0 = LO(z0) + 1;
187			n1 = LO(z1) + 1;
188			n2 = LO(z2) + 1;
189			n3 = LO(z3) + 1;
190
191			z0 -= c3two51;
192			z1 -= c3two51;
193			z2 -= c3two51;
194			z3 -= c3two51;
195
196			y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
197			y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
198			y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
199			y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
200		}
201
202		z0 = y0 * y0;
203		z1 = y1 * y1;
204		z2 = y2 * y2;
205		z3 = y3 * y3;
206
207		hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) |
208		    ((n3 & 1) << 3);
209		switch (hx) {
210		case 0:
211			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
212			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
213			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
214			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
215			break;
216
217		case 1:
218			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
219			    z0 * (C1 + z0 * C2))));
220			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
221			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
222			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
223			break;
224
225		case 2:
226			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
227			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
228			    z1 * (C1 + z1 * C2))));
229			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
230			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
231			break;
232
233		case 3:
234			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
235			    z0 * (C1 + z0 * C2))));
236			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
237			    z1 * (C1 + z1 * C2))));
238			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
239			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
240			break;
241
242		case 4:
243			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
244			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
245			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
246			    z2 * (C1 + z2 * C2))));
247			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
248			break;
249
250		case 5:
251			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
252			    z0 * (C1 + z0 * C2))));
253			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
254			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
255			    z2 * (C1 + z2 * C2))));
256			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
257			break;
258
259		case 6:
260			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
261			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
262			    z1 * (C1 + z1 * C2))));
263			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
264			    z2 * (C1 + z2 * C2))));
265			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
266			break;
267
268		case 7:
269			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
270			    z0 * (C1 + z0 * C2))));
271			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
272			    z1 * (C1 + z1 * C2))));
273			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
274			    z2 * (C1 + z2 * C2))));
275			f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
276			break;
277
278		case 8:
279			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
280			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
281			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
282			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
283			    z3 * (C1 + z3 * C2))));
284			break;
285
286		case 9:
287			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
288			    z0 * (C1 + z0 * C2))));
289			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
290			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
291			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
292			    z3 * (C1 + z3 * C2))));
293			break;
294
295		case 10:
296			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
297			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
298			    z1 * (C1 + z1 * C2))));
299			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
300			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
301			    z3 * (C1 + z3 * C2))));
302			break;
303
304		case 11:
305			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
306			    z0 * (C1 + z0 * C2))));
307			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
308			    z1 * (C1 + z1 * C2))));
309			f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
310			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
311			    z3 * (C1 + z3 * C2))));
312			break;
313
314		case 12:
315			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
316			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
317			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
318			    z2 * (C1 + z2 * C2))));
319			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
320			    z3 * (C1 + z3 * C2))));
321			break;
322
323		case 13:
324			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
325			    z0 * (C1 + z0 * C2))));
326			f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
327			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
328			    z2 * (C1 + z2 * C2))));
329			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
330			    z3 * (C1 + z3 * C2))));
331			break;
332
333		case 14:
334			f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
335			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
336			    z1 * (C1 + z1 * C2))));
337			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
338			    z2 * (C1 + z2 * C2))));
339			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
340			    z3 * (C1 + z3 * C2))));
341			break;
342
343		default:
344			f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
345			    z0 * (C1 + z0 * C2))));
346			f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
347			    z1 * (C1 + z1 * C2))));
348			f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
349			    z2 * (C1 + z2 * C2))));
350			f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
351			    z3 * (C1 + z3 * C2))));
352		}
353
354		*y = (n0 & 2)? -f0 : f0;
355		y += stridey;
356		*y = (n1 & 2)? -f1 : f1;
357		y += stridey;
358		*y = (n2 & 2)? -f2 : f2;
359		y += stridey;
360		*y = (n3 & 2)? -f3 : f3;
361		continue;
362
363process1:
364		PROCESS(0);
365		continue;
366
367process2:
368		PROCESS(0);
369		PROCESS(1);
370		continue;
371
372process3:
373		PROCESS(0);
374		PROCESS(1);
375		PROCESS(2);
376	}
377}
378