125c28e83SPiotr Jasiukajtis/*
225c28e83SPiotr Jasiukajtis * CDDL HEADER START
325c28e83SPiotr Jasiukajtis *
425c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the
525c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License").
625c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License.
725c28e83SPiotr Jasiukajtis *
825c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
925c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing.
1025c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions
1125c28e83SPiotr Jasiukajtis * and limitations under the License.
1225c28e83SPiotr Jasiukajtis *
1325c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each
1425c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1525c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the
1625c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying
1725c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner]
1825c28e83SPiotr Jasiukajtis *
1925c28e83SPiotr Jasiukajtis * CDDL HEADER END
2025c28e83SPiotr Jasiukajtis */
2125c28e83SPiotr Jasiukajtis/*
2225c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
2325c28e83SPiotr Jasiukajtis */
2425c28e83SPiotr Jasiukajtis/*
2525c28e83SPiotr Jasiukajtis * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
2625c28e83SPiotr Jasiukajtis * Use is subject to license terms.
2725c28e83SPiotr Jasiukajtis */
2825c28e83SPiotr Jasiukajtis
2925c28e83SPiotr Jasiukajtis	.file	"__vatan.S"
3025c28e83SPiotr Jasiukajtis
3125c28e83SPiotr Jasiukajtis#include "libm.h"
3225c28e83SPiotr Jasiukajtis
3325c28e83SPiotr Jasiukajtis	RO_DATA
3425c28e83SPiotr Jasiukajtis
3525c28e83SPiotr Jasiukajtis! following is the C version of the ATAN algorithm
3625c28e83SPiotr Jasiukajtis!  #include <math.h>
3725c28e83SPiotr Jasiukajtis!  #include <stdio.h>
3825c28e83SPiotr Jasiukajtis!  double jkatan(double *x)
3925c28e83SPiotr Jasiukajtis!  {
4025c28e83SPiotr Jasiukajtis!    double  f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
4125c28e83SPiotr Jasiukajtis!    int index, sign, intf, intz;
4225c28e83SPiotr Jasiukajtis!    extern const double __vlibm_TBL_atan1[];
4325c28e83SPiotr Jasiukajtis!    long *pf = (long *) &f, *pz = (long *) &z;
44*55fea89dSDan Cross!
4525c28e83SPiotr Jasiukajtis!  /*    Power series  atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7
4625c28e83SPiotr Jasiukajtis!   *    Error =  -3.08254E-18   On the interval  |x| < 1/64 */
47*55fea89dSDan Cross!
4825c28e83SPiotr Jasiukajtis!  /* define dummy names for readability.  Use parray to help compiler optimize loads */
4925c28e83SPiotr Jasiukajtis!  #define p3    parray[0]
5025c28e83SPiotr Jasiukajtis!  #define p2    parray[1]
5125c28e83SPiotr Jasiukajtis!  #define p1    parray[2]
52*55fea89dSDan Cross!  #define soffset      3
53*55fea89dSDan Cross!
54*55fea89dSDan Cross!    static const double parray[] = {
5525c28e83SPiotr Jasiukajtis!     -1.428029046844299722E-01,		/* p[3]		*/
5625c28e83SPiotr Jasiukajtis!      1.999999917247000615E-01, 		/* p[2]		*/
5725c28e83SPiotr Jasiukajtis!     -3.333333333329292858E-01, 		/* p[1]		*/
5825c28e83SPiotr Jasiukajtis!      1.0, 				/* not used for p[0], though		*/
5925c28e83SPiotr Jasiukajtis!     -1.0,				/* used to flip sign of answer 		*/
6025c28e83SPiotr Jasiukajtis!    };
61*55fea89dSDan Cross!
6225c28e83SPiotr Jasiukajtis!    f        = *x;				/* fetch argument		*/
6325c28e83SPiotr Jasiukajtis!    intf     = pf[0];				/* grab upper half		*/
6425c28e83SPiotr Jasiukajtis!    sign     = intf & 0x80000000;			/* sign of argument		*/
6525c28e83SPiotr Jasiukajtis!    intf    ^= sign;				/* abs(upper argument)		*/
6625c28e83SPiotr Jasiukajtis!    sign     = (unsigned) sign >> 31;		/* sign bit = 0 or 1		*/
6725c28e83SPiotr Jasiukajtis!    pf[0]    = intf;
68*55fea89dSDan Cross!
6925c28e83SPiotr Jasiukajtis!    if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */
7025c28e83SPiotr Jasiukajtis!    {
71*55fea89dSDan Cross!      if(  (intf > 0x7ff00000) ||
7225c28e83SPiotr Jasiukajtis!        ((intf == 0x7ff00000) &&  (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/
7325c28e83SPiotr Jasiukajtis!      if( intf < 0x3e300000 ) 			/* avoid underflow for small arg */
7425c28e83SPiotr Jasiukajtis!      {
7525c28e83SPiotr Jasiukajtis!        dummy = 1.0e37 + f;
7625c28e83SPiotr Jasiukajtis!        dummy = dummy;
7725c28e83SPiotr Jasiukajtis!        return (*x);
7825c28e83SPiotr Jasiukajtis!      }
7925c28e83SPiotr Jasiukajtis!      if( intf > 0x43600000 )			/* avoid underflow for big arg  */
8025c28e83SPiotr Jasiukajtis!      {
8125c28e83SPiotr Jasiukajtis!        index = 2;
8225c28e83SPiotr Jasiukajtis!        f     = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low   */
8325c28e83SPiotr Jasiukajtis!        f     = parray[soffset + sign] * f;	/* put sign bit on ans		*/
8425c28e83SPiotr Jasiukajtis!        return (f);
8525c28e83SPiotr Jasiukajtis!      }
8625c28e83SPiotr Jasiukajtis!    }
87*55fea89dSDan Cross!
8825c28e83SPiotr Jasiukajtis!    index    = 0;				/* points to 0,0 in table	*/
8925c28e83SPiotr Jasiukajtis!    if (intf > 0x40500000)			/* if(|x| > 64               	*/
9025c28e83SPiotr Jasiukajtis!    { f = -1.0/f;
9125c28e83SPiotr Jasiukajtis!      index  = 2; 				/* point to pi/2 upper, lower	*/
9225c28e83SPiotr Jasiukajtis!    }
9325c28e83SPiotr Jasiukajtis!    else if( intf >= 0x3f900000 )		/* if |x| >= (1/64)... 		*/
9425c28e83SPiotr Jasiukajtis!    {
9525c28e83SPiotr Jasiukajtis!      intz   = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper	*/
9625c28e83SPiotr Jasiukajtis!      pz[0]  = intz;				/* store as a double (z)	*/
9725c28e83SPiotr Jasiukajtis!      pz[1]  = 0;				/* ...lower			*/
9825c28e83SPiotr Jasiukajtis!      f      = (f - z)/(1.0 + f*z); 		/* get reduced argument		*/
9925c28e83SPiotr Jasiukajtis!      index  = (intz - 0x3f900000) >> 15;	/* (index >> 16) << 1)		*/
10025c28e83SPiotr Jasiukajtis!      index += 4;				/* skip over 0,0,pi/2,pi/2	*/
10125c28e83SPiotr Jasiukajtis!    }
10225c28e83SPiotr Jasiukajtis!    conup    = __vlibm_TBL_atan1[index];	/* upper table 			*/
10325c28e83SPiotr Jasiukajtis!    conlo    = __vlibm_TBL_atan1[index+1];	/* lower table 			*/
10425c28e83SPiotr Jasiukajtis!    tmp      = f*f;
10525c28e83SPiotr Jasiukajtis!    poly     = (f*tmp)*((p3*tmp + p2)*tmp + p1);
10625c28e83SPiotr Jasiukajtis!    ansu     = conup + f;     			/* compute atan(f)  upper 	*/
10725c28e83SPiotr Jasiukajtis!    ansl     = (((conup - ansu) + f) + poly) + conlo;
10825c28e83SPiotr Jasiukajtis!    ans      = ansu + ansl;
10925c28e83SPiotr Jasiukajtis!    ans      = parray[soffset + sign] * ans;
11025c28e83SPiotr Jasiukajtis!    return ans;
11125c28e83SPiotr Jasiukajtis!  }
11225c28e83SPiotr Jasiukajtis
11325c28e83SPiotr Jasiukajtis/* 8 bytes = 1 double f.p. word */
11425c28e83SPiotr Jasiukajtis#define WSIZE	8
11525c28e83SPiotr Jasiukajtis
11625c28e83SPiotr Jasiukajtis	.align	32			!align with full D-cache line
11725c28e83SPiotr Jasiukajtis.COEFFS:
11825c28e83SPiotr Jasiukajtis	.double	0r-1.428029046844299722E-01 	!p[3]
11925c28e83SPiotr Jasiukajtis        .double 0r1.999999917247000615E-01 	!p[2]
12025c28e83SPiotr Jasiukajtis        .double 0r-3.333333333329292858E-01	!p[1]
12125c28e83SPiotr Jasiukajtis        .double 0r-1.0, 			!constant -1.0
12225c28e83SPiotr Jasiukajtis	.word   0x00008000,0x0			!for fp rounding of reduced arg
12325c28e83SPiotr Jasiukajtis	.word   0x7fff0000,0x0			!for fp truncation
12425c28e83SPiotr Jasiukajtis	.word	0x47900000,0			!a number close to 1.0E37
12525c28e83SPiotr Jasiukajtis	.word   0x80000000,0x0			!mask for fp sign bit
12625c28e83SPiotr Jasiukajtis	.word   0x3f800000,0x0			!1.0/128.0 dummy "safe" argument
12725c28e83SPiotr Jasiukajtis	.type	.COEFFS,#object
12825c28e83SPiotr Jasiukajtis
12925c28e83SPiotr Jasiukajtis	ENTRY(__vatan)
13025c28e83SPiotr Jasiukajtis	save	%sp,-SA(MINFRAME)-16,%sp
13125c28e83SPiotr Jasiukajtis	PIC_SETUP(g5)
13225c28e83SPiotr Jasiukajtis	PIC_SET(g5,__vlibm_TBL_atan1,o4)
13325c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS,o0)
13425c28e83SPiotr Jasiukajtis/*
13525c28e83SPiotr Jasiukajtis   __vatan(int n, double *x, int stridex, double *y, stridey)
13625c28e83SPiotr Jasiukajtis   computes y(i) = atan( x(i) ), for 1=1,n.  Stridex, stridey
137*55fea89dSDan Cross   are the distance between x and y elements
13825c28e83SPiotr Jasiukajtis
13925c28e83SPiotr Jasiukajtis	%i0    n
14025c28e83SPiotr Jasiukajtis	%i1    address of x
14125c28e83SPiotr Jasiukajtis	%i2    stride x
14225c28e83SPiotr Jasiukajtis	%i3    address of y
14325c28e83SPiotr Jasiukajtis	%i4    stride y
14425c28e83SPiotr Jasiukajtis*/
14525c28e83SPiotr Jasiukajtis	cmp	%i0,0			!if n <=0,
14625c28e83SPiotr Jasiukajtis	ble,pn	%icc,.RETURN		!....then do nothing
14725c28e83SPiotr Jasiukajtis	sll	%i2,3,%i2		!convert stride to byte count
14825c28e83SPiotr Jasiukajtis	sll	%i4,3,%i4		!convert stride to byte count
14925c28e83SPiotr Jasiukajtis
15025c28e83SPiotr Jasiukajtis/*  pre-load constants before beginning main loop */
15125c28e83SPiotr Jasiukajtis
15225c28e83SPiotr Jasiukajtis	ldd	[%o0],%f58		!load p[3]
153*55fea89dSDan Cross	mov	2,%i5			!argcount = 3
15425c28e83SPiotr Jasiukajtis
15525c28e83SPiotr Jasiukajtis	ldd	[%o0+WSIZE],%f60	!load p[2]
15625c28e83SPiotr Jasiukajtis	add	%fp,STACK_BIAS-8,%l1		!yaddr1 = &dummy
15725c28e83SPiotr Jasiukajtis	fzero   %f18			!ansu1 = 0
15825c28e83SPiotr Jasiukajtis
15925c28e83SPiotr Jasiukajtis	ldd	[%o0+2*WSIZE],%f62	!load p[1]
16025c28e83SPiotr Jasiukajtis	add	%fp,STACK_BIAS-8,%l2		!yaddr2 = &dummy
16125c28e83SPiotr Jasiukajtis	fzero   %f12			!(poly1) = 0
16225c28e83SPiotr Jasiukajtis
163*55fea89dSDan Cross	ldd 	[%o0+3*WSIZE],%f56	!-1.0
16425c28e83SPiotr Jasiukajtis	fzero   %f14			!tmp1 = 0
16525c28e83SPiotr Jasiukajtis
16625c28e83SPiotr Jasiukajtis	ldd     [%o0+4*WSIZE],%f52	!load rounding mask
16725c28e83SPiotr Jasiukajtis	fzero   %f16			!conup1 = 0
16825c28e83SPiotr Jasiukajtis
16925c28e83SPiotr Jasiukajtis	ldd     [%o0+5*WSIZE],%f54	!load truncation mask
17025c28e83SPiotr Jasiukajtis	fzero   %f36			!f1 = 0
17125c28e83SPiotr Jasiukajtis
17225c28e83SPiotr Jasiukajtis	ldd	[%o0+6*WSIZE],%f50	!1.0e37
17325c28e83SPiotr Jasiukajtis	fzero   %f38			!f2 = 0
17425c28e83SPiotr Jasiukajtis
17525c28e83SPiotr Jasiukajtis	ldd	[%o0+7*WSIZE],%f32	!mask for sign bit
17625c28e83SPiotr Jasiukajtis
17725c28e83SPiotr Jasiukajtis	ldd	[%o4+2*WSIZE],%f46	!pi/2 upper
178*55fea89dSDan Cross	ldd	[%o4+(2*WSIZE+8)],%f48	!pi/2 lower
17925c28e83SPiotr Jasiukajtis	sethi	%hi(0x40500000),%l6	!64.0
18025c28e83SPiotr Jasiukajtis	sethi	%hi(0x3f900000),%l7	!1/64.0
18125c28e83SPiotr Jasiukajtis	mov     0,%l4			!index1 = 0
18225c28e83SPiotr Jasiukajtis	mov     0,%l5			!index2 = 0
18325c28e83SPiotr Jasiukajtis
18425c28e83SPiotr Jasiukajtis.MAINLOOP:
18525c28e83SPiotr Jasiukajtis
18625c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
18725c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
18825c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
18925c28e83SPiotr Jasiukajtis
19025c28e83SPiotr Jasiukajtis.LOOP0:
19125c28e83SPiotr Jasiukajtis        deccc   %i0                     !--n
19225c28e83SPiotr Jasiukajtis	bneg	1f
19325c28e83SPiotr Jasiukajtis        mov	%i1,%o5			!xuse = x (delay slot)
19425c28e83SPiotr Jasiukajtis
19525c28e83SPiotr Jasiukajtis	ba	2f
19625c28e83SPiotr Jasiukajtis	nop				!delay slot
19725c28e83SPiotr Jasiukajtis1:
19825c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
19925c28e83SPiotr Jasiukajtis	dec	%i5			!argcount--
20025c28e83SPiotr Jasiukajtis2:
20125c28e83SPiotr Jasiukajtis	sethi	%hi(0x80000000),%o7	!mask for sign bit
20225c28e83SPiotr Jasiukajtis/*2 */  sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
20325c28e83SPiotr Jasiukajtis	ld	[%o5],%o0		!intf = pf[0] = f upper
20425c28e83SPiotr Jasiukajtis	ldd     [%o4+%l5],%f26		!conup2 = __vlibm_TBL_atan1[index2]
20525c28e83SPiotr Jasiukajtis
20625c28e83SPiotr Jasiukajtis	sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
20725c28e83SPiotr Jasiukajtis/*4 */  andn	%o0,%o7,%o0		!intf = fabs(intf)
20825c28e83SPiotr Jasiukajtis	ldd 	[%o5],%f34		!f = *x into f34
20925c28e83SPiotr Jasiukajtis
21025c28e83SPiotr Jasiukajtis	sub	%o1,%o0,%o1		!(-) if intf > big
21125c28e83SPiotr Jasiukajtis/*6 */  sub	%o0,%o2,%o2		!(-) if intf < small
212*55fea89dSDan Cross	fand 	%f34,%f32,%f40		!sign0 = sign bit
21325c28e83SPiotr Jasiukajtis            fmuld   %f38,%f38,%f24      !tmp2= f2*f2
21425c28e83SPiotr Jasiukajtis
21525c28e83SPiotr Jasiukajtis/*7 */  orcc	%o1,%o2,%g0		!(-) if either true
21625c28e83SPiotr Jasiukajtis 	bneg,pn  %icc,.SPECIAL0		!if (-) goto special cases below
21725c28e83SPiotr Jasiukajtis	fabsd	%f34,%f34		!abs(f)  (delay slot)
21825c28e83SPiotr Jasiukajtis	!----------------------
21925c28e83SPiotr Jasiukajtis
22025c28e83SPiotr Jasiukajtis
22125c28e83SPiotr Jasiukajtis	sethi   %hi(0x8000),%o7		!rounding bit
22225c28e83SPiotr Jasiukajtis/*8 */  fpadd32 %f34,%f52,%f0		!intf + 0x00008000 (again)
22325c28e83SPiotr Jasiukajtis            faddd   %f26,%f38,%f28      !ansu2 = conup2 + f2
22425c28e83SPiotr Jasiukajtis
22525c28e83SPiotr Jasiukajtis 	add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
22625c28e83SPiotr Jasiukajtis/*9*/   fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
22725c28e83SPiotr Jasiukajtis            fmuld   %f58,%f24,%f22      !p[3]*tmp2
22825c28e83SPiotr Jasiukajtis
22925c28e83SPiotr Jasiukajtis/*10 */ sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
23025c28e83SPiotr Jasiukajtis	fmuld	%f34,%f0,%f10		!f*z
23125c28e83SPiotr Jasiukajtis	fsubd	%f34,%f0,%f20		!f - z
23225c28e83SPiotr Jasiukajtis          add     %o4,%l4,%l4           !base addr + index1
23325c28e83SPiotr Jasiukajtis          fmuld   %f14,%f12,%f12        !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
23425c28e83SPiotr Jasiukajtis          faddd   %f16,%f36,%f16        !(conup1 - ansu1) + f1
23525c28e83SPiotr Jasiukajtis
23625c28e83SPiotr Jasiukajtis/*12 */ and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
23725c28e83SPiotr Jasiukajtis            faddd   %f22,%f60,%f22      !p[3]*tmp2 + p[2]
23825c28e83SPiotr Jasiukajtis          ldd     [%l4+WSIZE],%f14      !conlo1 = __vlibm_TBL_atan1[index+1]
23925c28e83SPiotr Jasiukajtis
24025c28e83SPiotr Jasiukajtis/*13 */ sub	%o0,%l7,%o2		!intz - 0x3f900000
24125c28e83SPiotr Jasiukajtis	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
24225c28e83SPiotr Jasiukajtis          faddd   %f16,%f12,%f12        !((conup1 - ansu1) + f1) + poly1
24325c28e83SPiotr Jasiukajtis
24425c28e83SPiotr Jasiukajtis	cmp	%o0,%l6			!(|f| > 64)
24525c28e83SPiotr Jasiukajtis        ble	.ELSE0			!if(|f| > 64) then
24625c28e83SPiotr Jasiukajtis/*15 */ sra	  %o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
24725c28e83SPiotr Jasiukajtis	  mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
24825c28e83SPiotr Jasiukajtis          ba	  .ENDIF0		!continue
24925c28e83SPiotr Jasiukajtis/*16 */   fdivd	  %f56,%f34,%f34	!f = -1.0/f (delay slot)
25025c28e83SPiotr Jasiukajtis        .ELSE0:				!else f( |x| >= (1/64))
25125c28e83SPiotr Jasiukajtis	cmp	%o0,%l7			!if intf >= 1/64
25225c28e83SPiotr Jasiukajtis	bl  	.ENDIF0			!if( |x| >= (1/64) ) then...
25325c28e83SPiotr Jasiukajtis	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
25425c28e83SPiotr Jasiukajtis          add	  %o3,4,%o1		!index = index + 4
25525c28e83SPiotr Jasiukajtis/*16 */   fdivd	  %f20,%f10,%f34	!f = (f - z)/(1.0 + f*z), reduced argument
25625c28e83SPiotr Jasiukajtis	.ENDIF0:
25725c28e83SPiotr Jasiukajtis
25825c28e83SPiotr Jasiukajtis/*17*/  sll	%o1,3,%l3		!index0 = index
25925c28e83SPiotr Jasiukajtis	mov	%i3,%l0			!yaddr0 = address of y
26025c28e83SPiotr Jasiukajtis          faddd   %f12,%f14,%f12        !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
26125c28e83SPiotr Jasiukajtis            fmuld   %f22,%f24,%f22      !(p3*tmp2 + p2)*tmp2
26225c28e83SPiotr Jasiukajtis            fsubd   %f26,%f28,%f26      !conup2 - ansu2
26325c28e83SPiotr Jasiukajtis
26425c28e83SPiotr Jasiukajtis/*20*/  add	%i1,%i2,%i1		!x     += stridex
26525c28e83SPiotr Jasiukajtis        add	%i3,%i4,%i3		!y     += stridey
26625c28e83SPiotr Jasiukajtis          faddd   %f18,%f12,%f36        !ans1 = ansu1 + ansl1
26725c28e83SPiotr Jasiukajtis            fmuld   %f38,%f24,%f24      !f*tmp2
26825c28e83SPiotr Jasiukajtis            faddd   %f22,%f62,%f22      !(p3*tmp2 + p2)*tmp2 + p1
26925c28e83SPiotr Jasiukajtis
27025c28e83SPiotr Jasiukajtis/*23*/    for     %f36,%f42,%f36        !sign(ans1) = sign of argument
27125c28e83SPiotr Jasiukajtis          std     %f36,[%l1]            !*yaddr1 = ans1
27225c28e83SPiotr Jasiukajtis            add     %o4,%l5,%l5         !base addr + index2
27325c28e83SPiotr Jasiukajtis            fmuld   %f24,%f22,%f22      !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
27425c28e83SPiotr Jasiukajtis            faddd   %f26,%f38,%f26      !(conup2 - ansu2) + f2
27525c28e83SPiotr Jasiukajtis	cmp	%i5,0			!if argcount =0, we are done
27625c28e83SPiotr Jasiukajtis	be	.RETURN
27725c28e83SPiotr Jasiukajtis	  nop
27825c28e83SPiotr Jasiukajtis
27925c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
28025c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
28125c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
28225c28e83SPiotr Jasiukajtis
28325c28e83SPiotr Jasiukajtis.LOOP1:
28425c28e83SPiotr Jasiukajtis/*25*/  deccc   %i0                     !--n
28525c28e83SPiotr Jasiukajtis        bneg    1f
28625c28e83SPiotr Jasiukajtis        mov     %i1,%o5                 !xuse = x (delay slot)
28725c28e83SPiotr Jasiukajtis        ba      2f
28825c28e83SPiotr Jasiukajtis        nop                             !delay slot
28925c28e83SPiotr Jasiukajtis1:
29025c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
29125c28e83SPiotr Jasiukajtis        dec     %i5                     !argcount--
29225c28e83SPiotr Jasiukajtis2:
29325c28e83SPiotr Jasiukajtis
29425c28e83SPiotr Jasiukajtis/*26*/  sethi	%hi(0x80000000),%o7	!mask for sign bit
29525c28e83SPiotr Jasiukajtis	sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
29625c28e83SPiotr Jasiukajtis	ld	[%o5],%o0		!intf = pf[0] = f upper
29725c28e83SPiotr Jasiukajtis
29825c28e83SPiotr Jasiukajtis/*28*/  sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
29925c28e83SPiotr Jasiukajtis	andn	%o0,%o7,%o0		!intf = fabs(intf)
30025c28e83SPiotr Jasiukajtis	ldd 	[%o5],%f36		!f = *x into f36
30125c28e83SPiotr Jasiukajtis
30225c28e83SPiotr Jasiukajtis/*30*/  sub	%o1,%o0,%o1		!(-) if intf > big
30325c28e83SPiotr Jasiukajtis	sub	%o0,%o2,%o2		!(-) if intf < small
304*55fea89dSDan Cross	fand 	%f36,%f32,%f42		!sign1 = sign bit
30525c28e83SPiotr Jasiukajtis
30625c28e83SPiotr Jasiukajtis/*31*/  orcc	%o1,%o2,%g0		!(-) if either true
30725c28e83SPiotr Jasiukajtis 	bneg,pn  %icc,.SPECIAL1		!if (-) goto special cases below
30825c28e83SPiotr Jasiukajtis	fabsd	%f36,%f36		!abs(f)  (delay slot)
30925c28e83SPiotr Jasiukajtis	!----------------------
31025c28e83SPiotr Jasiukajtis
31125c28e83SPiotr Jasiukajtis/*32*/  fpadd32 %f36,%f52,%f0		!intf + 0x00008000 (again)
31225c28e83SPiotr Jasiukajtis          ldd     [%l5+WSIZE],%f24      !conlo2 = __vlibm_TBL_atan1[index2+1]
31325c28e83SPiotr Jasiukajtis
31425c28e83SPiotr Jasiukajtis/*33*/  fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
31525c28e83SPiotr Jasiukajtis        sethi   %hi(0x8000),%o7		!rounding bit
31625c28e83SPiotr Jasiukajtis          faddd   %f26,%f22,%f22        !((conup2 - ansu2) + f2) + poly2
31725c28e83SPiotr Jasiukajtis
31825c28e83SPiotr Jasiukajtis/*34*/  add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
31925c28e83SPiotr Jasiukajtis	sethi   %hi(0x7fff0000),%o7	!mask for rounding argument
32025c28e83SPiotr Jasiukajtis	fmuld	%f36,%f0,%f10		!f*z
32125c28e83SPiotr Jasiukajtis	fsubd	%f36,%f0,%f20		!f - z
32225c28e83SPiotr Jasiukajtis
32325c28e83SPiotr Jasiukajtis/*35*/  and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
32425c28e83SPiotr Jasiukajtis          faddd   %f22,%f24,%f22        !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2
32525c28e83SPiotr Jasiukajtis
32625c28e83SPiotr Jasiukajtis/*37*/  sub	%o0,%l7,%o2		!intz - 0x3f900000
32725c28e83SPiotr Jasiukajtis	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
32825c28e83SPiotr Jasiukajtis      ldd     [%o4+%l3],%f6             !conup0 = __vlibm_TBL_atan1[index0]
32925c28e83SPiotr Jasiukajtis
33025c28e83SPiotr Jasiukajtis        cmp	%o0,%l6			!(|f| > 64)
33125c28e83SPiotr Jasiukajtis	ble	.ELSE1			!if(|f| > 64) then
33225c28e83SPiotr Jasiukajtis/*38*/  sra	%o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
33325c28e83SPiotr Jasiukajtis          mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
33425c28e83SPiotr Jasiukajtis	  ba	  .ENDIF1		!continue
33525c28e83SPiotr Jasiukajtis/*40*/    fdivd	  %f56,%f36,%f36	!f = -1.0/f (delay slot)
33625c28e83SPiotr Jasiukajtis        .ELSE1:				!else f( |x| >= (1/64))
33725c28e83SPiotr Jasiukajtis	cmp	%o0,%l7			!if intf >= 1/64
33825c28e83SPiotr Jasiukajtis	bl  	.ENDIF1			!if( |x| >= (1/64) ) then...
33925c28e83SPiotr Jasiukajtis	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
34025c28e83SPiotr Jasiukajtis          add	  %o3,4,%o1		!index = index + 4
34125c28e83SPiotr Jasiukajtis/*40*/    fdivd	  %f20,%f10,%f36	!f = (f - z)/(1.0 + f*z), reduced argument
34225c28e83SPiotr Jasiukajtis	.ENDIF1:
34325c28e83SPiotr Jasiukajtis
34425c28e83SPiotr Jasiukajtis/*41*/sll     %o1,3,%l4		 	!index1 = index
34525c28e83SPiotr Jasiukajtis      mov     %i3,%l1			!yaddr1 = address of y
34625c28e83SPiotr Jasiukajtis      fmuld   %f34,%f34,%f4             !tmp0= f0*f0
34725c28e83SPiotr Jasiukajtis          faddd   %f28,%f22,%f38        !ans2 = ansu2 + ansl2
348*55fea89dSDan Cross
34925c28e83SPiotr Jasiukajtis/*44*/add	%i1,%i2,%i1		!x     += stridex
35025c28e83SPiotr Jasiukajtis      add	%i3,%i4,%i3		!y     += stridey
35125c28e83SPiotr Jasiukajtis    fmuld   %f58,%f4,%f2                !p[3]*tmp0
35225c28e83SPiotr Jasiukajtis    faddd   %f6,%f34,%f8                !ansu0 = conup0 + f0
35325c28e83SPiotr Jasiukajtis          for     %f38,%f44,%f38        !sign(ans2) = sign of argument
35425c28e83SPiotr Jasiukajtis          std     %f38,[%l2]            !*yaddr2 = ans2
35525c28e83SPiotr Jasiukajtis          cmp	%i5,0			!if argcount =0, we are done
35625c28e83SPiotr Jasiukajtis	  be	.RETURN
35725c28e83SPiotr Jasiukajtis	  nop
35825c28e83SPiotr Jasiukajtis
35925c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
36025c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
36125c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
36225c28e83SPiotr Jasiukajtis
36325c28e83SPiotr Jasiukajtis.LOOP2:
36425c28e83SPiotr Jasiukajtis/*46*/  deccc   %i0                     !--n
36525c28e83SPiotr Jasiukajtis        bneg    1f
36625c28e83SPiotr Jasiukajtis        mov     %i1,%o5                 !xuse = x (delay slot)
36725c28e83SPiotr Jasiukajtis        ba      2f
36825c28e83SPiotr Jasiukajtis        nop                             !delay slot
36925c28e83SPiotr Jasiukajtis1:
37025c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
37125c28e83SPiotr Jasiukajtis        dec     %i5                     !argcount--
37225c28e83SPiotr Jasiukajtis2:
37325c28e83SPiotr Jasiukajtis
37425c28e83SPiotr Jasiukajtis/*47*/  sethi	%hi(0x80000000),%o7	!mask for sign bit
37525c28e83SPiotr Jasiukajtis	sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
37625c28e83SPiotr Jasiukajtis	ld	[%o5],%o0		!intf = pf[0] = f upper
37725c28e83SPiotr Jasiukajtis
37825c28e83SPiotr Jasiukajtis/*49*/  sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
37925c28e83SPiotr Jasiukajtis	andn	%o0,%o7,%o0		!intf = fabs(intf)
38025c28e83SPiotr Jasiukajtis	ldd 	[%o5],%f38		!f = *x into f38
38125c28e83SPiotr Jasiukajtis
38225c28e83SPiotr Jasiukajtis/*51*/  sub	%o1,%o0,%o1		!(-) if intf > big
38325c28e83SPiotr Jasiukajtis	sub	%o0,%o2,%o2		!(-) if intf < small
384*55fea89dSDan Cross	fand 	%f38,%f32,%f44		!sign2 = sign bit
38525c28e83SPiotr Jasiukajtis
38625c28e83SPiotr Jasiukajtis/*52*/  orcc	%o1,%o2,%g0		!(-) if either true
38725c28e83SPiotr Jasiukajtis 	bneg,pn  %icc,.SPECIAL2		!if (-) goto special cases below
38825c28e83SPiotr Jasiukajtis	fabsd	%f38,%f38		!abs(f)  (delay slot)
38925c28e83SPiotr Jasiukajtis	!----------------------
39025c28e83SPiotr Jasiukajtis
39125c28e83SPiotr Jasiukajtis/*53*/  fpadd32 %f38,%f52,%f0		!intf + 0x00008000 (again)
39225c28e83SPiotr Jasiukajtis      faddd   %f2,%f60,%f2              !p[3]*tmp0 + p[2]
39325c28e83SPiotr Jasiukajtis
39425c28e83SPiotr Jasiukajtis/*54*/  sethi   %hi(0x8000),%o7		!rounding bit
39525c28e83SPiotr Jasiukajtis	fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
39625c28e83SPiotr Jasiukajtis
39725c28e83SPiotr Jasiukajtis/*55*/  add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
39825c28e83SPiotr Jasiukajtis	sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
39925c28e83SPiotr Jasiukajtis	fmuld	%f38,%f0,%f10		!f*z
40025c28e83SPiotr Jasiukajtis	fsubd	%f38,%f0,%f20		!f - z
40125c28e83SPiotr Jasiukajtis
40225c28e83SPiotr Jasiukajtis/*56*/  and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
40325c28e83SPiotr Jasiukajtis      fmuld   %f2,%f4,%f2               !(p3*tmp0 + p2)*tmp0
40425c28e83SPiotr Jasiukajtis      fsubd   %f6,%f8,%f6               !conup0 - ansu0
40525c28e83SPiotr Jasiukajtis
40625c28e83SPiotr Jasiukajtis/*58*/  sub	%o0,%l7,%o2		!intz - 0x3f900000
40725c28e83SPiotr Jasiukajtis	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
40825c28e83SPiotr Jasiukajtis            ldd     [%o4+%l4],%f16      !conup1 = __vlibm_TBL_atan1[index1]
40925c28e83SPiotr Jasiukajtis
41025c28e83SPiotr Jasiukajtis	cmp	%o0,%l6			!(|f| > 64)
41125c28e83SPiotr Jasiukajtis	ble	.ELSE2			!if(|f| > 64) then
41225c28e83SPiotr Jasiukajtis/*60*/  sra	%o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
41325c28e83SPiotr Jasiukajtis	  mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
41425c28e83SPiotr Jasiukajtis	  ba	  .ENDIF2		!continue
41525c28e83SPiotr Jasiukajtis/*61*/    fdivd	  %f56,%f38,%f38	!f = -1.0/f (delay slot)
41625c28e83SPiotr Jasiukajtis        .ELSE2:				!else f( |x| >= (1/64))
41725c28e83SPiotr Jasiukajtis	cmp	%o0,%l7			!if intf >= 1/64
41825c28e83SPiotr Jasiukajtis	bl  	.ENDIF2			!if( |x| >= (1/64) ) then...
41925c28e83SPiotr Jasiukajtis	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
42025c28e83SPiotr Jasiukajtis          add	  %o3,4,%o1		!index = index + 4
42125c28e83SPiotr Jasiukajtis/*61*/    fdivd	  %f20,%f10,%f38	!f = (f - z)/(1.0 + f*z), reduced argument
42225c28e83SPiotr Jasiukajtis	.ENDIF2:
42325c28e83SPiotr Jasiukajtis
424*55fea89dSDan Cross
42525c28e83SPiotr Jasiukajtis/*62*/  sll	%o1,3,%l5		!index2 = index
42625c28e83SPiotr Jasiukajtis	mov	%i3,%l2			!yaddr2 = address of y
42725c28e83SPiotr Jasiukajtis      fmuld   %f34,%f4,%f4              !f0*tmp0
42825c28e83SPiotr Jasiukajtis      faddd   %f2,%f62,%f2              !(p3*tmp0 + p2)*tmp0 + p1
42925c28e83SPiotr Jasiukajtis            fmuld   %f36,%f36,%f14      !tmp1= f1*f1
43025c28e83SPiotr Jasiukajtis
43125c28e83SPiotr Jasiukajtis/*65*/add     %o4,%l3,%l3               !base addr + index0
43225c28e83SPiotr Jasiukajtis      fmuld   %f4,%f2,%f2               !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
43325c28e83SPiotr Jasiukajtis      faddd   %f6,%f34,%f6              !(conup0 - ansu0) + f0
43425c28e83SPiotr Jasiukajtis            fmuld   %f58,%f14,%f12      !p[3]*tmp1
43525c28e83SPiotr Jasiukajtis            faddd   %f16,%f36,%f18      !ansu1 = conup1 + f1
43625c28e83SPiotr Jasiukajtis      ldd     [%l3+WSIZE],%f4           !conlo0 = __vlibm_TBL_atan1[index0+1]
43725c28e83SPiotr Jasiukajtis
43825c28e83SPiotr Jasiukajtis/*68*/  add	%i1,%i2,%i1		!x     += stridex
43925c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3		!y     += stridey
44025c28e83SPiotr Jasiukajtis      faddd   %f6,%f2,%f2               !((conup0 - ansu0) + f0) + poly0
44125c28e83SPiotr Jasiukajtis            faddd   %f12,%f60,%f12      !p[3]*tmp1 + p[2]
44225c28e83SPiotr Jasiukajtis
44325c28e83SPiotr Jasiukajtis/*71*/faddd   %f2,%f4,%f2               !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
44425c28e83SPiotr Jasiukajtis            fmuld   %f12,%f14,%f12      !(p3*tmp1 + p2)*tmp1
44525c28e83SPiotr Jasiukajtis            fsubd   %f16,%f18,%f16      !conup1 - ansu1
44625c28e83SPiotr Jasiukajtis
44725c28e83SPiotr Jasiukajtis/*74*/faddd   %f8,%f2,%f34              !ans0 = ansu0 + ansl0
44825c28e83SPiotr Jasiukajtis          fmuld   %f36,%f14,%f14        !f1*tmp1
44925c28e83SPiotr Jasiukajtis          faddd   %f12,%f62,%f12        !(p3*tmp1 + p2)*tmp1 + p1
45025c28e83SPiotr Jasiukajtis
45125c28e83SPiotr Jasiukajtis/*77*/  for     %f34,%f40,%f34          !sign(ans0) = sign of argument
45225c28e83SPiotr Jasiukajtis        std     %f34,[%l0]              !*yaddr0 = ans, always gets stored (delay slot)
45325c28e83SPiotr Jasiukajtis        cmp	%i5,0			!if argcount =0, we are done
45425c28e83SPiotr Jasiukajtis	bg	.MAINLOOP
45525c28e83SPiotr Jasiukajtis	  nop
45625c28e83SPiotr Jasiukajtis
45725c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
45825c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
45925c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
46025c28e83SPiotr Jasiukajtis
46125c28e83SPiotr Jasiukajtis.RETURN:
46225c28e83SPiotr Jasiukajtis	ret
46325c28e83SPiotr Jasiukajtis	restore	%g0,%g0,%g0
46425c28e83SPiotr Jasiukajtis
46525c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
46625c28e83SPiotr Jasiukajtis    /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/
46725c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
46825c28e83SPiotr Jasiukajtis
46925c28e83SPiotr Jasiukajtis/* at this point
47025c28e83SPiotr Jasiukajtis   %i1     x address
47125c28e83SPiotr Jasiukajtis   %o0     intf
47225c28e83SPiotr Jasiukajtis   %o2     intf - 0x3e300000
47325c28e83SPiotr Jasiukajtis   %f34,36,38     f0,f1,f2
47425c28e83SPiotr Jasiukajtis   %f40,42,44     sign0,sign1,sign2
47525c28e83SPiotr Jasiukajtis*/
47625c28e83SPiotr Jasiukajtis
47725c28e83SPiotr Jasiukajtis 	.align	32				!align on I-cache boundary
47825c28e83SPiotr Jasiukajtis.SPECIAL0:
47925c28e83SPiotr Jasiukajtis	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
48025c28e83SPiotr Jasiukajtis	bpos	1f  				!if >=...continue
48125c28e83SPiotr Jasiukajtis	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
48225c28e83SPiotr Jasiukajtis	  ba	  3f
48325c28e83SPiotr Jasiukajtis	  faddd	  %f34,%f50,%f30		!dummy op just to generate exception (delay slot)
48425c28e83SPiotr Jasiukajtis1:
485*55fea89dSDan Cross	ld	[%o5+4],%o5			!load x lower word
48625c28e83SPiotr Jasiukajtis	sllx	%o0,32,%o0			!left justify intf
48725c28e83SPiotr Jasiukajtis	sllx	%g1,32,%g1		!left justify Inf
48825c28e83SPiotr Jasiukajtis	or	%o0,%o5,%o0			!merge in lower intf
48925c28e83SPiotr Jasiukajtis	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
49025c28e83SPiotr Jasiukajtis	ble,pt	%xcc,2f				!pass thru if NaN
49125c28e83SPiotr Jasiukajtis	  nop
49225c28e83SPiotr Jasiukajtis	  fmuld	  %f34,%f34,%f34		!...... (x*x) trigger invalid exception
49325c28e83SPiotr Jasiukajtis	  ba      3f
49425c28e83SPiotr Jasiukajtis	  nop
49525c28e83SPiotr Jasiukajtis2:
496*55fea89dSDan Cross	faddd	%f46,%f48,%f34			!ans = pi/2 upper + pi/2 lower
497*55fea89dSDan Cross3:
49825c28e83SPiotr Jasiukajtis	add	%i1,%i2,%i1			!x += stridex
49925c28e83SPiotr Jasiukajtis	for     %f34,%f40,%f34			!sign(ans) = sign of argument
50025c28e83SPiotr Jasiukajtis	std	%f34,[%i3]			!*y = ans
50125c28e83SPiotr Jasiukajtis	ba	.LOOP0				!keep looping
50225c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3			!y += stridey (delay slot)
50325c28e83SPiotr Jasiukajtis
50425c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
50525c28e83SPiotr Jasiukajtis    /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/
50625c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
50725c28e83SPiotr Jasiukajtis
50825c28e83SPiotr Jasiukajtis 	.align	32				!align on I-cache boundary
50925c28e83SPiotr Jasiukajtis.SPECIAL1:
51025c28e83SPiotr Jasiukajtis	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
51125c28e83SPiotr Jasiukajtis	bpos	1f  				!if >=...continue
51225c28e83SPiotr Jasiukajtis	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
51325c28e83SPiotr Jasiukajtis	  ba	  3f
51425c28e83SPiotr Jasiukajtis	  faddd	  %f36,%f50,%f30		!dummy op just to generate exception (delay slot)
51525c28e83SPiotr Jasiukajtis1:
516*55fea89dSDan Cross	ld	[%o5+4],%o5			!load x lower word
51725c28e83SPiotr Jasiukajtis	sllx	%o0,32,%o0			!left justify intf
51825c28e83SPiotr Jasiukajtis	sllx	%g1,32,%g1		!left justify Inf
51925c28e83SPiotr Jasiukajtis	or	%o0,%o5,%o0			!merge in lower intf
52025c28e83SPiotr Jasiukajtis	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
52125c28e83SPiotr Jasiukajtis	ble,pt	%xcc,2f				!pass thru if NaN
52225c28e83SPiotr Jasiukajtis	  nop
52325c28e83SPiotr Jasiukajtis	  fmuld	  %f36,%f36,%f36		!...... (x*x) trigger invalid exception
52425c28e83SPiotr Jasiukajtis	  ba      3f
52525c28e83SPiotr Jasiukajtis	  nop
52625c28e83SPiotr Jasiukajtis2:
527*55fea89dSDan Cross	faddd	%f46,%f48,%f36			!ans = pi/2 upper + pi/2 lower
528*55fea89dSDan Cross3:
52925c28e83SPiotr Jasiukajtis	add	%i1,%i2,%i1			!x += stridex
53025c28e83SPiotr Jasiukajtis	for     %f36,%f42,%f36			!sign(ans) = sign of argument
53125c28e83SPiotr Jasiukajtis	std	%f36,[%i3]			!*y = ans
53225c28e83SPiotr Jasiukajtis	ba      .LOOP1 				!keep looping
53325c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3			!y += stridey (delay slot)
53425c28e83SPiotr Jasiukajtis
53525c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
53625c28e83SPiotr Jasiukajtis    /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/
53725c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
53825c28e83SPiotr Jasiukajtis
53925c28e83SPiotr Jasiukajtis 	.align	32				!align on I-cache boundary
54025c28e83SPiotr Jasiukajtis.SPECIAL2:
54125c28e83SPiotr Jasiukajtis	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
54225c28e83SPiotr Jasiukajtis	bpos	1f  				!if >=...continue
54325c28e83SPiotr Jasiukajtis	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
54425c28e83SPiotr Jasiukajtis	  ba	  3f
54525c28e83SPiotr Jasiukajtis	  faddd	  %f38,%f50,%f30		!dummy op just to generate exception (delay slot)
54625c28e83SPiotr Jasiukajtis1:
547*55fea89dSDan Cross	ld	[%o5+4],%o5			!load x lower word
54825c28e83SPiotr Jasiukajtis	sllx	%o0,32,%o0			!left justify intf
54925c28e83SPiotr Jasiukajtis	sllx	%g1,32,%g1		!left justify Inf
55025c28e83SPiotr Jasiukajtis	or	%o0,%o5,%o0			!merge in lower intf
55125c28e83SPiotr Jasiukajtis	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
55225c28e83SPiotr Jasiukajtis	ble,pt	%xcc,2f				!pass thru if NaN
55325c28e83SPiotr Jasiukajtis	  nop
55425c28e83SPiotr Jasiukajtis	  fmuld	  %f38,%f38,%f38		!...... (x*x) trigger invalid exception
55525c28e83SPiotr Jasiukajtis	  ba      3f
55625c28e83SPiotr Jasiukajtis	  nop
55725c28e83SPiotr Jasiukajtis2:
558*55fea89dSDan Cross	faddd	%f46,%f48,%f38			!ans = pi/2 upper + pi/2 lower
559*55fea89dSDan Cross3:
56025c28e83SPiotr Jasiukajtis	add	%i1,%i2,%i1			!x += stridex
56125c28e83SPiotr Jasiukajtis	for     %f38,%f44,%f38			!sign(ans) = sign of argument
56225c28e83SPiotr Jasiukajtis	std	%f38,[%i3]			!*y = ans
56325c28e83SPiotr Jasiukajtis	ba      .LOOP2 				!keep looping
56425c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3			!y += stridey
56525c28e83SPiotr Jasiukajtis
56625c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
56725c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
56825c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
56925c28e83SPiotr Jasiukajtis
57025c28e83SPiotr Jasiukajtis	SET_SIZE(__vatan)
57125c28e83SPiotr Jasiukajtis
57225c28e83SPiotr Jasiukajtis!	.ident	"03-20-96 Sparc V9 3-way-unrolled version"
573