1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vatan.S"
30
31#include "libm.h"
32
33	RO_DATA
34
35! following is the C version of the ATAN algorithm
36!  #include <math.h>
37!  #include <stdio.h>
38!  double jkatan(double *x)
39!  {
40!    double  f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
41!    int index, sign, intf, intz;
42!    extern const double __vlibm_TBL_atan1[];
43!    long *pf = (long *) &f, *pz = (long *) &z;
44!
45!  /*    Power series  atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7
46!   *    Error =  -3.08254E-18   On the interval  |x| < 1/64 */
47!
48!  /* define dummy names for readability.  Use parray to help compiler optimize loads */
49!  #define p3    parray[0]
50!  #define p2    parray[1]
51!  #define p1    parray[2]
52!  #define soffset      3
53!
54!    static const double parray[] = {
55!     -1.428029046844299722E-01,		/* p[3]		*/
56!      1.999999917247000615E-01, 		/* p[2]		*/
57!     -3.333333333329292858E-01, 		/* p[1]		*/
58!      1.0, 				/* not used for p[0], though		*/
59!     -1.0,				/* used to flip sign of answer 		*/
60!    };
61!
62!    f        = *x;				/* fetch argument		*/
63!    intf     = pf[0];				/* grab upper half		*/
64!    sign     = intf & 0x80000000;			/* sign of argument		*/
65!    intf    ^= sign;				/* abs(upper argument)		*/
66!    sign     = (unsigned) sign >> 31;		/* sign bit = 0 or 1		*/
67!    pf[0]    = intf;
68!
69!    if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */
70!    {
71!      if(  (intf > 0x7ff00000) ||
72!        ((intf == 0x7ff00000) &&  (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/
73!      if( intf < 0x3e300000 ) 			/* avoid underflow for small arg */
74!      {
75!        dummy = 1.0e37 + f;
76!        dummy = dummy;
77!        return (*x);
78!      }
79!      if( intf > 0x43600000 )			/* avoid underflow for big arg  */
80!      {
81!        index = 2;
82!        f     = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low   */
83!        f     = parray[soffset + sign] * f;	/* put sign bit on ans		*/
84!        return (f);
85!      }
86!    }
87!
88!    index    = 0;				/* points to 0,0 in table	*/
89!    if (intf > 0x40500000)			/* if(|x| > 64               	*/
90!    { f = -1.0/f;
91!      index  = 2; 				/* point to pi/2 upper, lower	*/
92!    }
93!    else if( intf >= 0x3f900000 )		/* if |x| >= (1/64)... 		*/
94!    {
95!      intz   = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper	*/
96!      pz[0]  = intz;				/* store as a double (z)	*/
97!      pz[1]  = 0;				/* ...lower			*/
98!      f      = (f - z)/(1.0 + f*z); 		/* get reduced argument		*/
99!      index  = (intz - 0x3f900000) >> 15;	/* (index >> 16) << 1)		*/
100!      index += 4;				/* skip over 0,0,pi/2,pi/2	*/
101!    }
102!    conup    = __vlibm_TBL_atan1[index];	/* upper table 			*/
103!    conlo    = __vlibm_TBL_atan1[index+1];	/* lower table 			*/
104!    tmp      = f*f;
105!    poly     = (f*tmp)*((p3*tmp + p2)*tmp + p1);
106!    ansu     = conup + f;     			/* compute atan(f)  upper 	*/
107!    ansl     = (((conup - ansu) + f) + poly) + conlo;
108!    ans      = ansu + ansl;
109!    ans      = parray[soffset + sign] * ans;
110!    return ans;
111!  }
112
113/* 8 bytes = 1 double f.p. word */
114#define WSIZE	8
115
116	.align	32			!align with full D-cache line
117.COEFFS:
118	.double	0r-1.428029046844299722E-01 	!p[3]
119        .double 0r1.999999917247000615E-01 	!p[2]
120        .double 0r-3.333333333329292858E-01	!p[1]
121        .double 0r-1.0, 			!constant -1.0
122	.word   0x00008000,0x0			!for fp rounding of reduced arg
123	.word   0x7fff0000,0x0			!for fp truncation
124	.word	0x47900000,0			!a number close to 1.0E37
125	.word   0x80000000,0x0			!mask for fp sign bit
126	.word   0x3f800000,0x0			!1.0/128.0 dummy "safe" argument
127	.type	.COEFFS,#object
128
129	ENTRY(__vatan)
130	save	%sp,-SA(MINFRAME)-16,%sp
131	PIC_SETUP(g5)
132	PIC_SET(g5,__vlibm_TBL_atan1,o4)
133	PIC_SET(g5,.COEFFS,o0)
134/*
135   __vatan(int n, double *x, int stridex, double *y, stridey)
136   computes y(i) = atan( x(i) ), for 1=1,n.  Stridex, stridey
137   are the distance between x and y elements
138
139	%i0    n
140	%i1    address of x
141	%i2    stride x
142	%i3    address of y
143	%i4    stride y
144*/
145	cmp	%i0,0			!if n <=0,
146	ble,pn	%icc,.RETURN		!....then do nothing
147	sll	%i2,3,%i2		!convert stride to byte count
148	sll	%i4,3,%i4		!convert stride to byte count
149
150/*  pre-load constants before beginning main loop */
151
152	ldd	[%o0],%f58		!load p[3]
153	mov	2,%i5			!argcount = 3
154
155	ldd	[%o0+WSIZE],%f60	!load p[2]
156	add	%fp,STACK_BIAS-8,%l1		!yaddr1 = &dummy
157	fzero   %f18			!ansu1 = 0
158
159	ldd	[%o0+2*WSIZE],%f62	!load p[1]
160	add	%fp,STACK_BIAS-8,%l2		!yaddr2 = &dummy
161	fzero   %f12			!(poly1) = 0
162
163	ldd 	[%o0+3*WSIZE],%f56	!-1.0
164	fzero   %f14			!tmp1 = 0
165
166	ldd     [%o0+4*WSIZE],%f52	!load rounding mask
167	fzero   %f16			!conup1 = 0
168
169	ldd     [%o0+5*WSIZE],%f54	!load truncation mask
170	fzero   %f36			!f1 = 0
171
172	ldd	[%o0+6*WSIZE],%f50	!1.0e37
173	fzero   %f38			!f2 = 0
174
175	ldd	[%o0+7*WSIZE],%f32	!mask for sign bit
176
177	ldd	[%o4+2*WSIZE],%f46	!pi/2 upper
178	ldd	[%o4+(2*WSIZE+8)],%f48	!pi/2 lower
179	sethi	%hi(0x40500000),%l6	!64.0
180	sethi	%hi(0x3f900000),%l7	!1/64.0
181	mov     0,%l4			!index1 = 0
182	mov     0,%l5			!index2 = 0
183
184.MAINLOOP:
185
186    /*--------------------------------------------------------------------------*/
187    /*--------------------------------------------------------------------------*/
188    /*--------------------------------------------------------------------------*/
189
190.LOOP0:
191        deccc   %i0                     !--n
192	bneg	1f
193        mov	%i1,%o5			!xuse = x (delay slot)
194
195	ba	2f
196	nop				!delay slot
1971:
198	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
199	dec	%i5			!argcount--
2002:
201	sethi	%hi(0x80000000),%o7	!mask for sign bit
202/*2 */  sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
203	ld	[%o5],%o0		!intf = pf[0] = f upper
204	ldd     [%o4+%l5],%f26		!conup2 = __vlibm_TBL_atan1[index2]
205
206	sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
207/*4 */  andn	%o0,%o7,%o0		!intf = fabs(intf)
208	ldd 	[%o5],%f34		!f = *x into f34
209
210	sub	%o1,%o0,%o1		!(-) if intf > big
211/*6 */  sub	%o0,%o2,%o2		!(-) if intf < small
212	fand 	%f34,%f32,%f40		!sign0 = sign bit
213            fmuld   %f38,%f38,%f24      !tmp2= f2*f2
214
215/*7 */  orcc	%o1,%o2,%g0		!(-) if either true
216 	bneg,pn  %icc,.SPECIAL0		!if (-) goto special cases below
217	fabsd	%f34,%f34		!abs(f)  (delay slot)
218	!----------------------
219
220
221	sethi   %hi(0x8000),%o7		!rounding bit
222/*8 */  fpadd32 %f34,%f52,%f0		!intf + 0x00008000 (again)
223            faddd   %f26,%f38,%f28      !ansu2 = conup2 + f2
224
225 	add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
226/*9*/   fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
227            fmuld   %f58,%f24,%f22      !p[3]*tmp2
228
229/*10 */ sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
230	fmuld	%f34,%f0,%f10		!f*z
231	fsubd	%f34,%f0,%f20		!f - z
232          add     %o4,%l4,%l4           !base addr + index1
233          fmuld   %f14,%f12,%f12        !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
234          faddd   %f16,%f36,%f16        !(conup1 - ansu1) + f1
235
236/*12 */ and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
237            faddd   %f22,%f60,%f22      !p[3]*tmp2 + p[2]
238          ldd     [%l4+WSIZE],%f14      !conlo1 = __vlibm_TBL_atan1[index+1]
239
240/*13 */ sub	%o0,%l7,%o2		!intz - 0x3f900000
241	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
242          faddd   %f16,%f12,%f12        !((conup1 - ansu1) + f1) + poly1
243
244	cmp	%o0,%l6			!(|f| > 64)
245        ble	.ELSE0			!if(|f| > 64) then
246/*15 */ sra	  %o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
247	  mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
248          ba	  .ENDIF0		!continue
249/*16 */   fdivd	  %f56,%f34,%f34	!f = -1.0/f (delay slot)
250        .ELSE0:				!else f( |x| >= (1/64))
251	cmp	%o0,%l7			!if intf >= 1/64
252	bl  	.ENDIF0			!if( |x| >= (1/64) ) then...
253	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
254          add	  %o3,4,%o1		!index = index + 4
255/*16 */   fdivd	  %f20,%f10,%f34	!f = (f - z)/(1.0 + f*z), reduced argument
256	.ENDIF0:
257
258/*17*/  sll	%o1,3,%l3		!index0 = index
259	mov	%i3,%l0			!yaddr0 = address of y
260          faddd   %f12,%f14,%f12        !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
261            fmuld   %f22,%f24,%f22      !(p3*tmp2 + p2)*tmp2
262            fsubd   %f26,%f28,%f26      !conup2 - ansu2
263
264/*20*/  add	%i1,%i2,%i1		!x     += stridex
265        add	%i3,%i4,%i3		!y     += stridey
266          faddd   %f18,%f12,%f36        !ans1 = ansu1 + ansl1
267            fmuld   %f38,%f24,%f24      !f*tmp2
268            faddd   %f22,%f62,%f22      !(p3*tmp2 + p2)*tmp2 + p1
269
270/*23*/    for     %f36,%f42,%f36        !sign(ans1) = sign of argument
271          std     %f36,[%l1]            !*yaddr1 = ans1
272            add     %o4,%l5,%l5         !base addr + index2
273            fmuld   %f24,%f22,%f22      !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
274            faddd   %f26,%f38,%f26      !(conup2 - ansu2) + f2
275	cmp	%i5,0			!if argcount =0, we are done
276	be	.RETURN
277	  nop
278
279    /*--------------------------------------------------------------------------*/
280    /*--------------------------------------------------------------------------*/
281    /*--------------------------------------------------------------------------*/
282
283.LOOP1:
284/*25*/  deccc   %i0                     !--n
285        bneg    1f
286        mov     %i1,%o5                 !xuse = x (delay slot)
287        ba      2f
288        nop                             !delay slot
2891:
290	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
291        dec     %i5                     !argcount--
2922:
293
294/*26*/  sethi	%hi(0x80000000),%o7	!mask for sign bit
295	sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
296	ld	[%o5],%o0		!intf = pf[0] = f upper
297
298/*28*/  sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
299	andn	%o0,%o7,%o0		!intf = fabs(intf)
300	ldd 	[%o5],%f36		!f = *x into f36
301
302/*30*/  sub	%o1,%o0,%o1		!(-) if intf > big
303	sub	%o0,%o2,%o2		!(-) if intf < small
304	fand 	%f36,%f32,%f42		!sign1 = sign bit
305
306/*31*/  orcc	%o1,%o2,%g0		!(-) if either true
307 	bneg,pn  %icc,.SPECIAL1		!if (-) goto special cases below
308	fabsd	%f36,%f36		!abs(f)  (delay slot)
309	!----------------------
310
311/*32*/  fpadd32 %f36,%f52,%f0		!intf + 0x00008000 (again)
312          ldd     [%l5+WSIZE],%f24      !conlo2 = __vlibm_TBL_atan1[index2+1]
313
314/*33*/  fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
315        sethi   %hi(0x8000),%o7		!rounding bit
316          faddd   %f26,%f22,%f22        !((conup2 - ansu2) + f2) + poly2
317
318/*34*/  add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
319	sethi   %hi(0x7fff0000),%o7	!mask for rounding argument
320	fmuld	%f36,%f0,%f10		!f*z
321	fsubd	%f36,%f0,%f20		!f - z
322
323/*35*/  and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
324          faddd   %f22,%f24,%f22        !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2
325
326/*37*/  sub	%o0,%l7,%o2		!intz - 0x3f900000
327	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
328      ldd     [%o4+%l3],%f6             !conup0 = __vlibm_TBL_atan1[index0]
329
330        cmp	%o0,%l6			!(|f| > 64)
331	ble	.ELSE1			!if(|f| > 64) then
332/*38*/  sra	%o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
333          mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
334	  ba	  .ENDIF1		!continue
335/*40*/    fdivd	  %f56,%f36,%f36	!f = -1.0/f (delay slot)
336        .ELSE1:				!else f( |x| >= (1/64))
337	cmp	%o0,%l7			!if intf >= 1/64
338	bl  	.ENDIF1			!if( |x| >= (1/64) ) then...
339	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
340          add	  %o3,4,%o1		!index = index + 4
341/*40*/    fdivd	  %f20,%f10,%f36	!f = (f - z)/(1.0 + f*z), reduced argument
342	.ENDIF1:
343
344/*41*/sll     %o1,3,%l4		 	!index1 = index
345      mov     %i3,%l1			!yaddr1 = address of y
346      fmuld   %f34,%f34,%f4             !tmp0= f0*f0
347          faddd   %f28,%f22,%f38        !ans2 = ansu2 + ansl2
348
349/*44*/add	%i1,%i2,%i1		!x     += stridex
350      add	%i3,%i4,%i3		!y     += stridey
351    fmuld   %f58,%f4,%f2                !p[3]*tmp0
352    faddd   %f6,%f34,%f8                !ansu0 = conup0 + f0
353          for     %f38,%f44,%f38        !sign(ans2) = sign of argument
354          std     %f38,[%l2]            !*yaddr2 = ans2
355          cmp	%i5,0			!if argcount =0, we are done
356	  be	.RETURN
357	  nop
358
359    /*--------------------------------------------------------------------------*/
360    /*--------------------------------------------------------------------------*/
361    /*--------------------------------------------------------------------------*/
362
363.LOOP2:
364/*46*/  deccc   %i0                     !--n
365        bneg    1f
366        mov     %i1,%o5                 !xuse = x (delay slot)
367        ba      2f
368        nop                             !delay slot
3691:
370	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
371        dec     %i5                     !argcount--
3722:
373
374/*47*/  sethi	%hi(0x80000000),%o7	!mask for sign bit
375	sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
376	ld	[%o5],%o0		!intf = pf[0] = f upper
377
378/*49*/  sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
379	andn	%o0,%o7,%o0		!intf = fabs(intf)
380	ldd 	[%o5],%f38		!f = *x into f38
381
382/*51*/  sub	%o1,%o0,%o1		!(-) if intf > big
383	sub	%o0,%o2,%o2		!(-) if intf < small
384	fand 	%f38,%f32,%f44		!sign2 = sign bit
385
386/*52*/  orcc	%o1,%o2,%g0		!(-) if either true
387 	bneg,pn  %icc,.SPECIAL2		!if (-) goto special cases below
388	fabsd	%f38,%f38		!abs(f)  (delay slot)
389	!----------------------
390
391/*53*/  fpadd32 %f38,%f52,%f0		!intf + 0x00008000 (again)
392      faddd   %f2,%f60,%f2              !p[3]*tmp0 + p[2]
393
394/*54*/  sethi   %hi(0x8000),%o7		!rounding bit
395	fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
396
397/*55*/  add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
398	sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
399	fmuld	%f38,%f0,%f10		!f*z
400	fsubd	%f38,%f0,%f20		!f - z
401
402/*56*/  and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
403      fmuld   %f2,%f4,%f2               !(p3*tmp0 + p2)*tmp0
404      fsubd   %f6,%f8,%f6               !conup0 - ansu0
405
406/*58*/  sub	%o0,%l7,%o2		!intz - 0x3f900000
407	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
408            ldd     [%o4+%l4],%f16      !conup1 = __vlibm_TBL_atan1[index1]
409
410	cmp	%o0,%l6			!(|f| > 64)
411	ble	.ELSE2			!if(|f| > 64) then
412/*60*/  sra	%o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
413	  mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
414	  ba	  .ENDIF2		!continue
415/*61*/    fdivd	  %f56,%f38,%f38	!f = -1.0/f (delay slot)
416        .ELSE2:				!else f( |x| >= (1/64))
417	cmp	%o0,%l7			!if intf >= 1/64
418	bl  	.ENDIF2			!if( |x| >= (1/64) ) then...
419	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
420          add	  %o3,4,%o1		!index = index + 4
421/*61*/    fdivd	  %f20,%f10,%f38	!f = (f - z)/(1.0 + f*z), reduced argument
422	.ENDIF2:
423
424
425/*62*/  sll	%o1,3,%l5		!index2 = index
426	mov	%i3,%l2			!yaddr2 = address of y
427      fmuld   %f34,%f4,%f4              !f0*tmp0
428      faddd   %f2,%f62,%f2              !(p3*tmp0 + p2)*tmp0 + p1
429            fmuld   %f36,%f36,%f14      !tmp1= f1*f1
430
431/*65*/add     %o4,%l3,%l3               !base addr + index0
432      fmuld   %f4,%f2,%f2               !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
433      faddd   %f6,%f34,%f6              !(conup0 - ansu0) + f0
434            fmuld   %f58,%f14,%f12      !p[3]*tmp1
435            faddd   %f16,%f36,%f18      !ansu1 = conup1 + f1
436      ldd     [%l3+WSIZE],%f4           !conlo0 = __vlibm_TBL_atan1[index0+1]
437
438/*68*/  add	%i1,%i2,%i1		!x     += stridex
439	add	%i3,%i4,%i3		!y     += stridey
440      faddd   %f6,%f2,%f2               !((conup0 - ansu0) + f0) + poly0
441            faddd   %f12,%f60,%f12      !p[3]*tmp1 + p[2]
442
443/*71*/faddd   %f2,%f4,%f2               !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
444            fmuld   %f12,%f14,%f12      !(p3*tmp1 + p2)*tmp1
445            fsubd   %f16,%f18,%f16      !conup1 - ansu1
446
447/*74*/faddd   %f8,%f2,%f34              !ans0 = ansu0 + ansl0
448          fmuld   %f36,%f14,%f14        !f1*tmp1
449          faddd   %f12,%f62,%f12        !(p3*tmp1 + p2)*tmp1 + p1
450
451/*77*/  for     %f34,%f40,%f34          !sign(ans0) = sign of argument
452        std     %f34,[%l0]              !*yaddr0 = ans, always gets stored (delay slot)
453        cmp	%i5,0			!if argcount =0, we are done
454	bg	.MAINLOOP
455	  nop
456
457    /*--------------------------------------------------------------------------*/
458    /*--------------------------------------------------------------------------*/
459    /*--------------------------------------------------------------------------*/
460
461.RETURN:
462	ret
463	restore	%g0,%g0,%g0
464
465    /*--------------------------------------------------------------------------*/
466    /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/
467    /*--------------------------------------------------------------------------*/
468
469/* at this point
470   %i1     x address
471   %o0     intf
472   %o2     intf - 0x3e300000
473   %f34,36,38     f0,f1,f2
474   %f40,42,44     sign0,sign1,sign2
475*/
476
477 	.align	32				!align on I-cache boundary
478.SPECIAL0:
479	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
480	bpos	1f  				!if >=...continue
481	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
482	  ba	  3f
483	  faddd	  %f34,%f50,%f30		!dummy op just to generate exception (delay slot)
4841:
485	ld	[%o5+4],%o5			!load x lower word
486	sllx	%o0,32,%o0			!left justify intf
487	sllx	%g1,32,%g1		!left justify Inf
488	or	%o0,%o5,%o0			!merge in lower intf
489	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
490	ble,pt	%xcc,2f				!pass thru if NaN
491	  nop
492	  fmuld	  %f34,%f34,%f34		!...... (x*x) trigger invalid exception
493	  ba      3f
494	  nop
4952:
496	faddd	%f46,%f48,%f34			!ans = pi/2 upper + pi/2 lower
4973:
498	add	%i1,%i2,%i1			!x += stridex
499	for     %f34,%f40,%f34			!sign(ans) = sign of argument
500	std	%f34,[%i3]			!*y = ans
501	ba	.LOOP0				!keep looping
502	add	%i3,%i4,%i3			!y += stridey (delay slot)
503
504    /*--------------------------------------------------------------------------*/
505    /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/
506    /*--------------------------------------------------------------------------*/
507
508 	.align	32				!align on I-cache boundary
509.SPECIAL1:
510	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
511	bpos	1f  				!if >=...continue
512	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
513	  ba	  3f
514	  faddd	  %f36,%f50,%f30		!dummy op just to generate exception (delay slot)
5151:
516	ld	[%o5+4],%o5			!load x lower word
517	sllx	%o0,32,%o0			!left justify intf
518	sllx	%g1,32,%g1		!left justify Inf
519	or	%o0,%o5,%o0			!merge in lower intf
520	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
521	ble,pt	%xcc,2f				!pass thru if NaN
522	  nop
523	  fmuld	  %f36,%f36,%f36		!...... (x*x) trigger invalid exception
524	  ba      3f
525	  nop
5262:
527	faddd	%f46,%f48,%f36			!ans = pi/2 upper + pi/2 lower
5283:
529	add	%i1,%i2,%i1			!x += stridex
530	for     %f36,%f42,%f36			!sign(ans) = sign of argument
531	std	%f36,[%i3]			!*y = ans
532	ba      .LOOP1 				!keep looping
533	add	%i3,%i4,%i3			!y += stridey (delay slot)
534
535    /*--------------------------------------------------------------------------*/
536    /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/
537    /*--------------------------------------------------------------------------*/
538
539 	.align	32				!align on I-cache boundary
540.SPECIAL2:
541	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
542	bpos	1f  				!if >=...continue
543	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
544	  ba	  3f
545	  faddd	  %f38,%f50,%f30		!dummy op just to generate exception (delay slot)
5461:
547	ld	[%o5+4],%o5			!load x lower word
548	sllx	%o0,32,%o0			!left justify intf
549	sllx	%g1,32,%g1		!left justify Inf
550	or	%o0,%o5,%o0			!merge in lower intf
551	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
552	ble,pt	%xcc,2f				!pass thru if NaN
553	  nop
554	  fmuld	  %f38,%f38,%f38		!...... (x*x) trigger invalid exception
555	  ba      3f
556	  nop
5572:
558	faddd	%f46,%f48,%f38			!ans = pi/2 upper + pi/2 lower
5593:
560	add	%i1,%i2,%i1			!x += stridex
561	for     %f38,%f44,%f38			!sign(ans) = sign of argument
562	std	%f38,[%i3]			!*y = ans
563	ba      .LOOP2 				!keep looping
564	add	%i3,%i4,%i3			!y += stridey
565
566    /*--------------------------------------------------------------------------*/
567    /*--------------------------------------------------------------------------*/
568    /*--------------------------------------------------------------------------*/
569
570	SET_SIZE(__vatan)
571
572!	.ident	"03-20-96 Sparc V9 3-way-unrolled version"
573