/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vcos.S"

#include "libm.h"

/*
 * Read-only constant pool for __vcos.  Each .word pair is one 8-byte
 * entry; entries are addressed as [base + offset] through the offset
 * macros defined after the table, so the ORDER of the entries must stay
 * in step with those macros.  The comment on each entry names the offset
 * macro that selects it.
 */
	RO_DATA
	.align	64
constants:
	.word	0x3ec718e3,0xa6972785	/* p4 */
	.word	0x3ef9fd39,0x94293940	/* q4 */
	.word	0xbf2a019f,0x75ee4be1	/* p3 */
	.word	0xbf56c16b,0xba552569	/* q3 */
	.word	0x3f811111,0x1108c703	/* p2 */
	.word	0x3fa55555,0x554f5b35	/* q2 */
	.word	0xbfc55555,0x555554d0	/* p1 */
	.word	0xbfdfffff,0xffffff85	/* q1 */
	.word	0x3ff00000,0x00000000	/* one */
	.word	0xbfc55555,0x5551fc28	/* pp1 */
	.word	0x3f811107,0x62eacc9d	/* pp2 */
	.word	0xbfdfffff,0xffff6328	/* qq1 */
	.word	0x3fa55551,0x5f7acf0c	/* qq2 */
	.word	0x3fe45f30,0x6dc9c883	/* invpio2 */
	.word	0x43380000,0x00000000	/* round */
	.word	0x3ff921fb,0x54400000	/* pio2_1 */
	.word	0x3dd0b461,0x1a600000	/* pio2_2 */
	.word	0x3ba3198a,0x2e000000	/* pio2_3 */
	.word	0x397b839a,0x252049c1	/* pio2_3t */
	.word	0x80000000,0x00004000	/* f30val: loaded as the %f30/%f31 pair */
	/*
	 * The next three entries are mask, thresh, and the entry at thresh+8
	 * (which has no offset macro of its own).  The code also reads single
	 * words at thresh-4 and thresh+4 relative addresses, so the two words
	 * of each pair must not be swapped or separated.
	 */
	.word	0xffff8000,0x00000000	! N.B.: low-order words used
	.word	0x3fc90000,0x80000000	! for sign bit hacking; see
	.word	0x3fc40000,0x00000000	! references to "thresh" below

/*
 * Byte offsets into the constants table above, one per 8-byte entry,
 * listed in table order.
 */
#define p4		0x0
#define q4		0x08
#define p3		0x10
#define q3		0x18
#define p2		0x20
#define q2		0x28
#define p1		0x30
#define q1		0x38
#define one		0x40
#define pp1		0x48
#define pp2		0x50
#define qq1		0x58
#define qq2		0x60
#define invpio2		0x68
#define round		0x70
#define pio2_1		0x78
#define pio2_2		0x80
#define pio2_3		0x88
#define pio2_3t		0x90
#define f30val		0x98
#define mask		0xa0
#define thresh		0xa8

! local storage indices

/*
 * Each of these is used as an offset from %fp (for example [%fp+nsave])
 * to reach a scratch slot in the frame allocated at function entry.
 * xsave/ysave/nsave/sxsave/sysave receive the incoming arguments
 * (x, y, n, stridex, stridey) in the function prologue.
 */
#define xsave		STACK_BIAS-0x8
#define ysave		STACK_BIAS-0x10
#define nsave		STACK_BIAS-0x14
#define sxsave		STACK_BIAS-0x18
#define sysave		STACK_BIAS-0x1c
#define biguns		STACK_BIAS-0x20
#define n2		STACK_BIAS-0x24
#define n1		STACK_BIAS-0x28
#define n0		STACK_BIAS-0x2c
#define x2_1		STACK_BIAS-0x40
#define x1_1		STACK_BIAS-0x50
#define x0_1		STACK_BIAS-0x60
#define y2_0		STACK_BIAS-0x70
#define y1_0		STACK_BIAS-0x80
#define y0_0		STACK_BIAS-0x90

! sizeof temp storage - must be a multiple of 16 for V9

/*
 * tmps is subtracted from %sp, together with SA(MINFRAME), by the save
 * instruction at function entry.  It matches the deepest slot above
 * (y0_0 at STACK_BIAS-0x90).
 */
#define tmps		0x90

!--------------------------------------------------------------------
! define pipes for easier reading

/*
 * Pure naming aids for floating-point registers: the Pn_ prefix groups
 * the registers as P0 = %f0-%f9, P1 = %f10-%f19, P2 = %f20-%f29.
 * Each macro expands to exactly the register in its name.
 */
#define P0_f0		%f0
#define P0_f1		%f1
#define P0_f2		%f2
#define P0_f3		%f3
#define P0_f4		%f4
#define P0_f5		%f5
#define P0_f6		%f6
#define P0_f7		%f7
#define P0_f8		%f8
#define P0_f9		%f9

#define P1_f10		%f10
#define P1_f11		%f11
#define P1_f12		%f12
#define P1_f13		%f13
#define P1_f14		%f14
#define P1_f15		%f15
#define P1_f16		%f16
#define P1_f17		%f17
#define P1_f18		%f18
#define P1_f19		%f19

#define P2_f20		%f20
#define P2_f21		%f21
#define P2_f22		%f22
#define P2_f23		%f23
#define P2_f24		%f24
#define P2_f25		%f25
#define P2_f26		%f26
#define P2_f27		%f27
#define P2_f28		%f28
#define P2_f29		%f29

! define __vlibm_TBL_sincos_hi & lo for easy reading

/*
 * The function prologue places the addresses of __vlibm_TBL_sincos_hi
 * and __vlibm_TBL_sincos_lo in %l3 and %l4 respectively.
 */
#define SC_HI		%l3
#define SC_LO		%l4

! define constants for easy reading

/*
 * Registers that the function prologue fills from the q1..q4 entries of
 * the constants table.
 */
#define C_q1		%f46
#define C_q2		%f48
#define C_q3		%f50
#define C_q4		%f52

! one ( 1 ) uno eins echi un

/*
 * C_ONE is the double register loaded from the "one" table entry;
 * C_ONE_LO names %f55, the odd-numbered single register of that pair.
 */
#define C_ONE		%f54
#define C_ONE_LO	%f55

!
masks #define MSK_SIGN %i5 #define MSK_BIT31 %f30 #define MSK_BIT13 %f31 #define MSK_BITSHI17 %f44 ! constants for pp and qq #define C_pp1 %f56 #define C_pp2 %f58 #define C_qq1 %f60 #define C_qq2 %f62 ! sign mask #define C_signM %i5 #define LIM_l5 %l5 #define LIM_l6 %l6 ! when in pri range, using value as transition from poly to table. ! for Medium range,change use of %l6 and use to keep track of biguns. #define LIM_l7 %l7 !-------------------------------------------------------------------- ENTRY(__vcos) save %sp,-SA(MINFRAME)-tmps,%sp PIC_SETUP(g5) PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) PIC_SET(g5,constants,o0) mov %o0,%g1 wr %g0,0x82,%asi ! set %asi for non-faulting loads ! ========== primary range ========== ! register use ! i0 n ! i1 x ! i2 stridex ! i3 y ! i4 stridey ! i5 0x80000000 ! l0 hx0 ! l1 hx1 ! l2 hx2 ! l3 __vlibm_TBL_sincos_hi ! l4 __vlibm_TBL_sincos_lo ! l5 0x3fc40000 ! l6 0x3e400000 ! l7 0x3fe921fb ! the following are 64-bit registers in both V8+ and V9 ! g1 scratch ! g5 ! o0 py0 ! o1 py1 ! o2 py2 ! o3 oy0 ! o4 oy1 ! o5 oy2 ! o7 scratch ! f0 x0 ! f2 ! f4 ! f6 ! f8 scratch for table base ! f9 signbit0 ! f10 x1 ! f12 ! f14 ! f16 ! f18 scratch for table base ! f19 signbit1 ! f20 x2 ! f22 ! f24 ! f26 ! f28 scratch for table base ! f29 signbit2 ! f30 0x80000000 ! f31 0x4000 ! f32 ! f34 ! f36 ! f38 ! f40 ! f42 ! f44 0xffff800000000000 ! f46 p1 ! f48 p2 ! f50 p3 ! f52 p4 ! f54 one ! f56 pp1 ! f58 pp2 ! f60 qq1 ! f62 qq2 #ifdef __sparcv9 stx %i1,[%fp+xsave] ! save arguments stx %i3,[%fp+ysave] #else st %i1,[%fp+xsave] ! save arguments st %i3,[%fp+ysave] #endif st %i0,[%fp+nsave] st %i2,[%fp+sxsave] st %i4,[%fp+sysave] sethi %hi(0x80000000),MSK_SIGN ! 
load/set up constants sethi %hi(0x3fc40000),LIM_l5 sethi %hi(0x3e400000),LIM_l6 sethi %hi(0x3fe921fb),LIM_l7 or LIM_l7,%lo(0x3fe921fb),LIM_l7 ldd [%g1+f30val],MSK_BIT31 ldd [%g1+mask],MSK_BITSHI17 ldd [%g1+q1],C_q1 ldd [%g1+q2],C_q2 ldd [%g1+q3],C_q3 ldd [%g1+q4],C_q4 ldd [%g1+one],C_ONE ldd [%g1+pp1],C_pp1 ldd [%g1+pp2],C_pp2 ldd [%g1+qq1],C_qq1 ldd [%g1+qq2],C_qq2 sll %i2,3,%i2 ! scale strides sll %i4,3,%i4 add %fp,x0_1,%o3 ! precondition loop add %fp,x0_1,%o4 add %fp,x0_1,%o5 ld [%i1],%l0 ! hx = *x ld [%i1],P0_f0 ld [%i1+4],P0_f1 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 add %i1,%i2,%i1 ! x += stridex ba,pt %icc,.loop0 !delay slot nop .align 32 .loop0: lda [%i1]%asi,%l1 ! preload next argument sub %l0,LIM_l6,%g1 sub LIM_l7,%l0,%o7 fands P0_f0,MSK_BIT31,P0_f9 ! save signbit lda [%i1]%asi,P1_f10 orcc %o7,%g1,%g0 mov %i3,%o0 ! py0 = y bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb ! delay slot lda [%i1+4]%asi,P1_f11 addcc %i0,-1,%i0 add %i3,%i4,%i3 ! y += stridey ble,pn %icc,.endloop1 ! delay slot andn %l1,MSK_SIGN,%l1 add %i1,%i2,%i1 ! x += stridex fabsd P0_f0,P0_f0 fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only .loop1: lda [%i1]%asi,%l2 ! preload next argument sub %l1,LIM_l6,%g1 sub LIM_l7,%l1,%o7 fands P1_f10,MSK_BIT31,P1_f19 ! save signbit lda [%i1]%asi,P2_f20 orcc %o7,%g1,%g0 mov %i3,%o1 ! py1 = y bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb ! delay slot lda [%i1+4]%asi,P2_f21 addcc %i0,-1,%i0 add %i3,%i4,%i3 ! y += stridey ble,pn %icc,.endloop2 ! delay slot andn %l2,MSK_SIGN,%l2 add %i1,%i2,%i1 ! x += stridex fabsd P1_f10,P1_f10 fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only .loop2: st P0_f6,[%o3] sub %l2,LIM_l6,%g1 sub LIM_l7,%l2,%o7 fands P2_f20,MSK_BIT31,P2_f29 ! save signbit st P0_f7,[%o3+4] orcc %g1,%o7,%g0 mov %i3,%o2 ! py2 = y bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb ! delay slot add %i3,%i4,%i3 ! y += stridey cmp %l0,LIM_l5 fabsd P2_f20,P2_f20 bl,pn %icc,.case4 ! 
delay slot st P1_f16,[%o4] cmp %l1,LIM_l5 fpadd32s P0_f0,MSK_BIT13,P0_f8 bl,pn %icc,.case2 ! delay slot st P1_f17,[%o4+4] cmp %l2,LIM_l5 fpadd32s P1_f10,MSK_BIT13,P1_f18 bl,pn %icc,.case1 ! delay slot st P2_f26,[%o5] mov %o0,%o3 sethi %hi(0x3fc3c000),%o7 fpadd32s P2_f20,MSK_BIT13,P2_f28 st P2_f27,[%o5+4] fand P0_f8,MSK_BITSHI17,P0_f2 mov %o1,%o4 fand P1_f18,MSK_BITSHI17,P1_f12 mov %o2,%o5 sub %l0,%o7,%l0 fand P2_f28,MSK_BITSHI17,P2_f22 sub %l1,%o7,%l1 sub %l2,%o7,%l2 fsubd P0_f0,P0_f2,P0_f0 srl %l0,10,%l0 add SC_HI,8,%g1;add SC_LO,8,%o7 fsubd P1_f10,P1_f12,P1_f10 srl %l1,10,%l1 fsubd P2_f20,P2_f22,P2_f20 srl %l2,10,%l2 fmuld P0_f0,P0_f0,P0_f2 andn %l0,0x1f,%l0 fmuld P1_f10,P1_f10,P1_f12 andn %l1,0x1f,%l1 fmuld P2_f20,P2_f20,P2_f22 andn %l2,0x1f,%l2 fmuld P0_f2,C_pp2,P0_f6 ldd [%g1+%l0],%f32 fmuld P1_f12,C_pp2,P1_f16 ldd [%g1+%l1],%f36 fmuld P2_f22,C_pp2,P2_f26 ldd [%g1+%l2],%f40 faddd P0_f6,C_pp1,P0_f6 fmuld P0_f2,C_qq2,P0_f4 ldd [SC_HI+%l0],%f34 faddd P1_f16,C_pp1,P1_f16 fmuld P1_f12,C_qq2,P1_f14 ldd [SC_HI+%l1],%f38 faddd P2_f26,C_pp1,P2_f26 fmuld P2_f22,C_qq2,P2_f24 ldd [SC_HI+%l2],%f42 fmuld P0_f2,P0_f6,P0_f6 faddd P0_f4,C_qq1,P0_f4 fmuld P1_f12,P1_f16,P1_f16 faddd P1_f14,C_qq1,P1_f14 fmuld P2_f22,P2_f26,P2_f26 faddd P2_f24,C_qq1,P2_f24 faddd P0_f6,C_ONE,P0_f6 fmuld P0_f2,P0_f4,P0_f4 faddd P1_f16,C_ONE,P1_f16 fmuld P1_f12,P1_f14,P1_f14 faddd P2_f26,C_ONE,P2_f26 fmuld P2_f22,P2_f24,P2_f24 fmuld P0_f0,P0_f6,P0_f6 ldd [%o7+%l0],P0_f2 fmuld P1_f10,P1_f16,P1_f16 ldd [%o7+%l1],P1_f12 fmuld P2_f20,P2_f26,P2_f26 ldd [%o7+%l2],P2_f22 fmuld P0_f4,%f32,P0_f4 lda [%i1]%asi,%l0 ! preload next argument fmuld P1_f14,%f36,P1_f14 lda [%i1]%asi,P0_f0 fmuld P2_f24,%f40,P2_f24 lda [%i1+4]%asi,P0_f1 fmuld P0_f6,%f34,P0_f6 add %i1,%i2,%i1 ! 
x += stridex fmuld P1_f16,%f38,P1_f16 fmuld P2_f26,%f42,P2_f26 fsubd P0_f6,P0_f4,P0_f6 fsubd P1_f16,P1_f14,P1_f16 fsubd P2_f26,P2_f24,P2_f26 fsubd P0_f2,P0_f6,P0_f6 fsubd P1_f12,P1_f16,P1_f16 fsubd P2_f22,P2_f26,P2_f26 faddd P0_f6,%f32,P0_f6 faddd P1_f16,%f36,P1_f16 faddd P2_f26,%f40,P2_f26 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 addcc %i0,-1,%i0 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P2_f26,P2_f29,P2_f26 ba,pt %icc,.endloop0 ! delay slot nop .align 32 .case1: st P2_f27,[%o5+4] sethi %hi(0x3fc3c000),%o7 fand P0_f8,MSK_BITSHI17,P0_f2 sub %l0,%o7,%l0 sub %l1,%o7,%l1 add SC_HI,8,%g1;add SC_LO,8,%o7 fand P1_f18,MSK_BITSHI17,P1_f12 fmuld P2_f20,P2_f20,P2_f22 fsubd P0_f0,P0_f2,P0_f0 srl %l0,10,%l0 mov %o0,%o3 fsubd P1_f10,P1_f12,P1_f10 srl %l1,10,%l1 mov %o1,%o4 fmuld P2_f22,C_q4,P2_f24 mov %o2,%o5 fmuld P0_f0,P0_f0,P0_f2 andn %l0,0x1f,%l0 fmuld P1_f10,P1_f10,P1_f12 andn %l1,0x1f,%l1 faddd P2_f24,C_q3,P2_f24 fmuld P0_f2,C_pp2,P0_f6 ldd [%g1+%l0],%f32 fmuld P1_f12,C_pp2,P1_f16 ldd [%g1+%l1],%f36 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f6,C_pp1,P0_f6 fmuld P0_f2,C_qq2,P0_f4 ldd [SC_HI+%l0],%f34 faddd P1_f16,C_pp1,P1_f16 fmuld P1_f12,C_qq2,P1_f14 ldd [SC_HI+%l1],%f38 faddd P2_f24,C_q2,P2_f24 fmuld P0_f2,P0_f6,P0_f6 faddd P0_f4,C_qq1,P0_f4 fmuld P1_f12,P1_f16,P1_f16 faddd P1_f14,C_qq1,P1_f14 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f6,C_ONE,P0_f6 fmuld P0_f2,P0_f4,P0_f4 faddd P1_f16,C_ONE,P1_f16 fmuld P1_f12,P1_f14,P1_f14 faddd P2_f24,C_q1,P2_f24 fmuld P0_f0,P0_f6,P0_f6 ldd [%o7+%l0],P0_f2 fmuld P1_f10,P1_f16,P1_f16 ldd [%o7+%l1],P1_f12 fmuld P0_f4,%f32,P0_f4 lda [%i1]%asi,%l0 ! preload next argument fmuld P1_f14,%f36,P1_f14 lda [%i1]%asi,P0_f0 fmuld P0_f6,%f34,P0_f6 lda [%i1+4]%asi,P0_f1 fmuld P1_f16,%f38,P1_f16 add %i1,%i2,%i1 ! 
x += stridex fmuld P2_f22,P2_f24,P2_f24 fsubd P0_f6,P0_f4,P0_f6 fsubd P1_f16,P1_f14,P1_f16 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 fsubd P0_f2,P0_f6,P0_f6 fsubd P1_f12,P1_f16,P1_f16 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 faddd P0_f6,%f32,P0_f6 faddd P1_f16,%f36,P1_f16 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 addcc %i0,-1,%i0 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P1_f16,P1_f19,P1_f16 ba,pt %icc,.endloop0 ! delay slot nop .align 32 .case2: st P2_f26,[%o5] cmp %l2,LIM_l5 fpadd32s P2_f20,MSK_BIT13,P2_f28 bl,pn %icc,.case3 ! delay slot st P2_f27,[%o5+4] sethi %hi(0x3fc3c000),%o7 fand P0_f8,MSK_BITSHI17,P0_f2 sub %l0,%o7,%l0 sub %l2,%o7,%l2 add SC_HI,8,%g1;add SC_LO,8,%o7 fand P2_f28,MSK_BITSHI17,P2_f22 fmuld P1_f10,P1_f10,P1_f12 fsubd P0_f0,P0_f2,P0_f0 srl %l0,10,%l0 mov %o0,%o3 fsubd P2_f20,P2_f22,P2_f20 srl %l2,10,%l2 mov %o2,%o5 fmuld P1_f12,C_q4,P1_f14 mov %o1,%o4 fmuld P0_f0,P0_f0,P0_f2 andn %l0,0x1f,%l0 fmuld P2_f20,P2_f20,P2_f22 andn %l2,0x1f,%l2 faddd P1_f14,C_q3,P1_f14 fmuld P0_f2,C_pp2,P0_f6 ldd [%g1+%l0],%f32 fmuld P2_f22,C_pp2,P2_f26 ldd [%g1+%l2],%f40 fmuld P1_f12,P1_f14,P1_f14 faddd P0_f6,C_pp1,P0_f6 fmuld P0_f2,C_qq2,P0_f4 ldd [SC_HI+%l0],%f34 faddd P2_f26,C_pp1,P2_f26 fmuld P2_f22,C_qq2,P2_f24 ldd [SC_HI+%l2],%f42 faddd P1_f14,C_q2,P1_f14 fmuld P0_f2,P0_f6,P0_f6 faddd P0_f4,C_qq1,P0_f4 fmuld P2_f22,P2_f26,P2_f26 faddd P2_f24,C_qq1,P2_f24 fmuld P1_f12,P1_f14,P1_f14 faddd P0_f6,C_ONE,P0_f6 fmuld P0_f2,P0_f4,P0_f4 faddd P2_f26,C_ONE,P2_f26 fmuld P2_f22,P2_f24,P2_f24 faddd P1_f14,C_q1,P1_f14 fmuld P0_f0,P0_f6,P0_f6 ldd [%o7+%l0],P0_f2 fmuld P2_f20,P2_f26,P2_f26 ldd [%o7+%l2],P2_f22 fmuld P0_f4,%f32,P0_f4 lda [%i1]%asi,%l0 ! preload next argument fmuld P2_f24,%f40,P2_f24 lda [%i1]%asi,P0_f0 fmuld P0_f6,%f34,P0_f6 lda [%i1+4]%asi,P0_f1 fmuld P2_f26,%f42,P2_f26 add %i1,%i2,%i1 ! 
x += stridex fmuld P1_f12,P1_f14,P1_f14 fsubd P0_f6,P0_f4,P0_f6 fsubd P2_f26,P2_f24,P2_f26 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 fsubd P0_f2,P0_f6,P0_f6 fsubd P2_f22,P2_f26,P2_f26 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 faddd P0_f6,%f32,P0_f6 faddd P2_f26,%f40,P2_f26 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 addcc %i0,-1,%i0 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P2_f26,P2_f29,P2_f26 ba,pt %icc,.endloop0 ! delay slot nop .align 32 .case3: sethi %hi(0x3fc3c000),%o7 fand P0_f8,MSK_BITSHI17,P0_f2 fmuld P1_f10,P1_f10,P1_f12 sub %l0,%o7,%l0 add SC_HI,8,%g1;add SC_LO,8,%o7 fmuld P2_f20,P2_f20,P2_f22 fsubd P0_f0,P0_f2,P0_f0 srl %l0,10,%l0 mov %o0,%o3 fmuld P1_f12,C_q4,P1_f14 mov %o1,%o4 fmuld P2_f22,C_q4,P2_f24 mov %o2,%o5 fmuld P0_f0,P0_f0,P0_f2 andn %l0,0x1f,%l0 faddd P1_f14,C_q3,P1_f14 faddd P2_f24,C_q3,P2_f24 fmuld P0_f2,C_pp2,P0_f6 ldd [%g1+%l0],%f32 fmuld P1_f12,P1_f14,P1_f14 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f6,C_pp1,P0_f6 fmuld P0_f2,C_qq2,P0_f4 ldd [SC_HI+%l0],%f34 faddd P1_f14,C_q2,P1_f14 faddd P2_f24,C_q2,P2_f24 fmuld P0_f2,P0_f6,P0_f6 faddd P0_f4,C_qq1,P0_f4 fmuld P1_f12,P1_f14,P1_f14 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f6,C_ONE,P0_f6 fmuld P0_f2,P0_f4,P0_f4 faddd P1_f14,C_q1,P1_f14 faddd P2_f24,C_q1,P2_f24 fmuld P0_f0,P0_f6,P0_f6 ldd [%o7+%l0],P0_f2 fmuld P0_f4,%f32,P0_f4 lda [%i1]%asi,%l0 ! preload next argument fmuld P1_f12,P1_f14,P1_f14 lda [%i1]%asi,P0_f0 fmuld P0_f6,%f34,P0_f6 lda [%i1+4]%asi,P0_f1 fmuld P2_f22,P2_f24,P2_f24 add %i1,%i2,%i1 ! x += stridex !!(vsin)fmuld P1_f10,P1_f14,P1_f14 fsubd P0_f6,P0_f4,P0_f6 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 fsubd P0_f2,P0_f6,P0_f6 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 andn %l0,MSK_SIGN,%l0 ! 
hx &= ~0x80000000 faddd P0_f6,%f32,P0_f6 addcc %i0,-1,%i0 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P0_f6,P0_f9,P0_f6 ba,pt %icc,.endloop0 ! delay slot nop .align 32 .case4: st P1_f17,[%o4+4] cmp %l1,LIM_l5 fpadd32s P1_f10,MSK_BIT13,P1_f18 bl,pn %icc,.case6 ! delay slot st P2_f26,[%o5] cmp %l2,LIM_l5 fpadd32s P2_f20,MSK_BIT13,P2_f28 bl,pn %icc,.case5 ! delay slot st P2_f27,[%o5+4] sethi %hi(0x3fc3c000),%o7 fand P1_f18,MSK_BITSHI17,P1_f12 sub %l1,%o7,%l1 sub %l2,%o7,%l2 add SC_HI,8,%g1;add SC_LO,8,%o7 fand P2_f28,MSK_BITSHI17,P2_f22 fmuld P0_f0,P0_f0,P0_f2 fsubd P1_f10,P1_f12,P1_f10 srl %l1,10,%l1 mov %o1,%o4 fsubd P2_f20,P2_f22,P2_f20 srl %l2,10,%l2 mov %o2,%o5 fmovd P0_f0,P0_f6 !ID for processing fmuld P0_f2,C_q4,P0_f4 mov %o0,%o3 fmuld P1_f10,P1_f10,P1_f12 andn %l1,0x1f,%l1 fmuld P2_f20,P2_f20,P2_f22 andn %l2,0x1f,%l2 faddd P0_f4,C_q3,P0_f4 fmuld P1_f12,C_pp2,P1_f16 ldd [%g1+%l1],%f36 fmuld P2_f22,C_pp2,P2_f26 ldd [%g1+%l2],%f40 fmuld P0_f2,P0_f4,P0_f4 faddd P1_f16,C_pp1,P1_f16 fmuld P1_f12,C_qq2,P1_f14 ldd [SC_HI+%l1],%f38 faddd P2_f26,C_pp1,P2_f26 fmuld P2_f22,C_qq2,P2_f24 ldd [SC_HI+%l2],%f42 faddd P0_f4,C_q2,P0_f4 fmuld P1_f12,P1_f16,P1_f16 faddd P1_f14,C_qq1,P1_f14 fmuld P2_f22,P2_f26,P2_f26 faddd P2_f24,C_qq1,P2_f24 fmuld P0_f2,P0_f4,P0_f4 faddd P1_f16,C_ONE,P1_f16 fmuld P1_f12,P1_f14,P1_f14 faddd P2_f26,C_ONE,P2_f26 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f4,C_q1,P0_f4 fmuld P1_f10,P1_f16,P1_f16 ldd [%o7+%l1],P1_f12 fmuld P2_f20,P2_f26,P2_f26 ldd [%o7+%l2],P2_f22 fmuld P1_f14,%f36,P1_f14 lda [%i1]%asi,%l0 ! preload next argument fmuld P2_f24,%f40,P2_f24 lda [%i1]%asi,P0_f0 fmuld P1_f16,%f38,P1_f16 lda [%i1+4]%asi,P0_f1 fmuld P2_f26,%f42,P2_f26 add %i1,%i2,%i1 ! x += stridex fmuld P0_f2,P0_f4,P0_f4 fsubd P1_f16,P1_f14,P1_f16 fsubd P2_f26,P2_f24,P2_f26 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 fsubd P1_f12,P1_f16,P1_f16 fsubd P2_f22,P2_f26,P2_f26 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! 
faddd then spaces for processing faddd P1_f16,%f36,P1_f16 faddd P2_f26,%f40,P2_f26 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 addcc %i0,-1,%i0 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P2_f26,P2_f29,P2_f26 ba,pt %icc,.endloop0 ! delay slot nop .align 32 .case5: sethi %hi(0x3fc3c000),%o7 fand P1_f18,MSK_BITSHI17,P1_f12 fmuld P0_f0,P0_f0,P0_f2 sub %l1,%o7,%l1 add SC_HI,8,%g1;add SC_LO,8,%o7 fmuld P2_f20,P2_f20,P2_f22 fsubd P1_f10,P1_f12,P1_f10 srl %l1,10,%l1 mov %o1,%o4 fmovd P0_f0,P0_f6 !ID for processing fmuld P0_f2,C_q4,P0_f4 mov %o0,%o3 fmuld P2_f22,C_q4,P2_f24 mov %o2,%o5 fmuld P1_f10,P1_f10,P1_f12 andn %l1,0x1f,%l1 faddd P0_f4,C_q3,P0_f4 faddd P2_f24,C_q3,P2_f24 fmuld P1_f12,C_pp2,P1_f16 ldd [%g1+%l1],%f36 fmuld P0_f2,P0_f4,P0_f4 fmuld P2_f22,P2_f24,P2_f24 faddd P1_f16,C_pp1,P1_f16 fmuld P1_f12,C_qq2,P1_f14 ldd [SC_HI+%l1],%f38 faddd P0_f4,C_q2,P0_f4 faddd P2_f24,C_q2,P2_f24 fmuld P1_f12,P1_f16,P1_f16 faddd P1_f14,C_qq1,P1_f14 fmuld P0_f2,P0_f4,P0_f4 fmuld P2_f22,P2_f24,P2_f24 faddd P1_f16,C_ONE,P1_f16 fmuld P1_f12,P1_f14,P1_f14 faddd P0_f4,C_q1,P0_f4 faddd P2_f24,C_q1,P2_f24 fmuld P1_f10,P1_f16,P1_f16 ldd [%o7+%l1],P1_f12 fmuld P1_f14,%f36,P1_f14 lda [%i1]%asi,%l0 ! preload next argument fmuld P0_f2,P0_f4,P0_f4 lda [%i1]%asi,P0_f0 fmuld P1_f16,%f38,P1_f16 lda [%i1+4]%asi,P0_f1 fmuld P2_f22,P2_f24,P2_f24 add %i1,%i2,%i1 ! x += stridex !!(vsin)fmuld P0_f6,P0_f4,P0_f4 fsubd P1_f16,P1_f14,P1_f16 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing fsubd P1_f12,P1_f16,P1_f16 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 faddd P1_f16,%f36,P1_f16 addcc %i0,-1,%i0 nop !!(vsin) fors P2_f26,P2_f29,P2_f26 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P1_f16,P1_f19,P1_f16 ba,pt %icc,.endloop0 ! 
delay slot nop .align 32 .case6: st P2_f27,[%o5+4] cmp %l2,LIM_l5 fpadd32s P2_f20,MSK_BIT13,P2_f28 bl,pn %icc,.case7 ! delay slot sethi %hi(0x3fc3c000),%o7 fand P2_f28,MSK_BITSHI17,P2_f22 fmuld P0_f0,P0_f0,P0_f2 sub %l2,%o7,%l2 add SC_HI,8,%g1;add SC_LO,8,%o7 fmuld P1_f10,P1_f10,P1_f12 fsubd P2_f20,P2_f22,P2_f20 srl %l2,10,%l2 mov %o2,%o5 fmovd P0_f0,P0_f6 !ID for processing fmuld P0_f2,C_q4,P0_f4 mov %o0,%o3 fmuld P1_f12,C_q4,P1_f14 mov %o1,%o4 fmuld P2_f20,P2_f20,P2_f22 andn %l2,0x1f,%l2 faddd P0_f4,C_q3,P0_f4 faddd P1_f14,C_q3,P1_f14 fmuld P2_f22,C_pp2,P2_f26 ldd [%g1+%l2],%f40 fmuld P0_f2,P0_f4,P0_f4 fmuld P1_f12,P1_f14,P1_f14 faddd P2_f26,C_pp1,P2_f26 fmuld P2_f22,C_qq2,P2_f24 ldd [SC_HI+%l2],%f42 faddd P0_f4,C_q2,P0_f4 faddd P1_f14,C_q2,P1_f14 fmuld P2_f22,P2_f26,P2_f26 faddd P2_f24,C_qq1,P2_f24 fmuld P0_f2,P0_f4,P0_f4 fmuld P1_f12,P1_f14,P1_f14 faddd P2_f26,C_ONE,P2_f26 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f4,C_q1,P0_f4 faddd P1_f14,C_q1,P1_f14 fmuld P2_f20,P2_f26,P2_f26 ldd [%o7+%l2],P2_f22 fmuld P2_f24,%f40,P2_f24 lda [%i1]%asi,%l0 ! preload next argument fmuld P0_f2,P0_f4,P0_f4 lda [%i1]%asi,P0_f0 fmuld P2_f26,%f42,P2_f26 lda [%i1+4]%asi,P0_f1 fmuld P1_f12,P1_f14,P1_f14 add %i1,%i2,%i1 ! x += stridex !!(vsin)fmuld P0_f6,P0_f4,P0_f4 fsubd P2_f26,P2_f24,P2_f26 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing fsubd P2_f22,P2_f26,P2_f26 faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 faddd P2_f26,%f40,P2_f26 addcc %i0,-1,%i0 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P2_f26,P2_f29,P2_f26 ba,pt %icc,.endloop0 ! delay slot nop .align 32 .case7: fmuld P0_f0,P0_f0,P0_f2 fmovd P0_f0,P0_f6 !ID for processing mov %o0,%o3 fmuld P1_f10,P1_f10,P1_f12 mov %o1,%o4 fmuld P2_f20,P2_f20,P2_f22 mov %o2,%o5 fmuld P0_f2,C_q4,P0_f4 lda [%i1]%asi,%l0 ! 
preload next argument fmuld P1_f12,C_q4,P1_f14 lda [%i1]%asi,P0_f0 fmuld P2_f22,C_q4,P2_f24 lda [%i1+4]%asi,P0_f1 faddd P0_f4,C_q3,P0_f4 add %i1,%i2,%i1 ! x += stridex faddd P1_f14,C_q3,P1_f14 faddd P2_f24,C_q3,P2_f24 fmuld P0_f2,P0_f4,P0_f4 fmuld P1_f12,P1_f14,P1_f14 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f4,C_q2,P0_f4 faddd P1_f14,C_q2,P1_f14 faddd P2_f24,C_q2,P2_f24 fmuld P0_f2,P0_f4,P0_f4 fmuld P1_f12,P1_f14,P1_f14 fmuld P2_f22,P2_f24,P2_f24 faddd P0_f4,C_q1,P0_f4 faddd P1_f14,C_q1,P1_f14 faddd P2_f24,C_q1,P2_f24 fmuld P0_f2,P0_f4,P0_f4 fmuld P1_f12,P1_f14,P1_f14 fmuld P2_f22,P2_f24,P2_f24 !!(vsin)fmuld P0_f6,P0_f4,P0_f4 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 !!(vsin)fmuld P2_f20,P2_f24,P2_f24 faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 nop !!(vsin) fors P0_f6,P0_f9,P0_f6 addcc %i0,-1,%i0 nop !!(vsin) fors P1_f16,P1_f19,P1_f16 bg,pt %icc,.loop0 ! delay slot nop !!(vsin) fors P2_f26,P2_f29,P2_f26 ba,pt %icc,.endloop0 ! delay slot nop .align 32 .endloop2: cmp %l1,LIM_l5 bl,pn %icc,1f ! delay slot fabsd P1_f10,P1_f10 sethi %hi(0x3fc3c000),%o7 fpadd32s P1_f10,MSK_BIT13,P1_f18 fand P1_f18,MSK_BITSHI17,P1_f12 sub %l1,%o7,%l1 add SC_HI,8,%g1;add SC_LO,8,%o7 fsubd P1_f10,P1_f12,P1_f10 srl %l1,10,%l1 fmuld P1_f10,P1_f10,P1_f12 andn %l1,0x1f,%l1 fmuld P1_f12,C_pp2,P2_f20 ldd [%g1+%l1],%f36 faddd P2_f20,C_pp1,P2_f20 fmuld P1_f12,C_qq2,P1_f14 ldd [SC_HI+%l1],%f38 fmuld P1_f12,P2_f20,P2_f20 faddd P1_f14,C_qq1,P1_f14 faddd P2_f20,C_ONE,P2_f20 fmuld P1_f12,P1_f14,P1_f14 fmuld P1_f10,P2_f20,P2_f20 ldd [%o7+%l1],P1_f12 fmuld P1_f14,%f36,P1_f14 fmuld P2_f20,%f38,P2_f20 fsubd P2_f20,P1_f14,P2_f20 fsubd P1_f12,P2_f20,P2_f20 ba,pt %icc,2f ! 
delay slot faddd P2_f20,%f36,P2_f20 1: fmuld P1_f10,P1_f10,P1_f12 fmuld P1_f12,C_q4,P1_f14 faddd P1_f14,C_q3,P1_f14 fmuld P1_f12,P1_f14,P1_f14 faddd P1_f14,C_q2,P1_f14 fmuld P1_f12,P1_f14,P1_f14 faddd P1_f14,C_q1,P1_f14 fmuld P1_f12,P1_f14,P1_f14 !!(vsin)fmuld P1_f10,P1_f14,P1_f14 faddd C_ONE,P1_f14,P2_f20 !!(vsin)faddd P1_f10,P1_f14,P2_f20 2: nop !!(vsin) fors P2_f20,P1_f19,P2_f20 st P2_f20,[%o1] st P2_f21,[%o1+4] .endloop1: cmp %l0,LIM_l5 bl,pn %icc,1f ! delay slot fabsd P0_f0,P0_f0 sethi %hi(0x3fc3c000),%o7 fpadd32s P0_f0,MSK_BIT13,P0_f8 fand P0_f8,MSK_BITSHI17,P0_f2 sub %l0,%o7,%l0 add SC_HI,8,%g1;add SC_LO,8,%o7 fsubd P0_f0,P0_f2,P0_f0 srl %l0,10,%l0 fmuld P0_f0,P0_f0,P0_f2 andn %l0,0x1f,%l0 fmuld P0_f2,C_pp2,P2_f20 ldd [%g1+%l0],%f32 faddd P2_f20,C_pp1,P2_f20 fmuld P0_f2,C_qq2,P0_f4 ldd [SC_HI+%l0],%f34 fmuld P0_f2,P2_f20,P2_f20 faddd P0_f4,C_qq1,P0_f4 faddd P2_f20,C_ONE,P2_f20 fmuld P0_f2,P0_f4,P0_f4 fmuld P0_f0,P2_f20,P2_f20 ldd [%o7+%l0],P0_f2 fmuld P0_f4,%f32,P0_f4 fmuld P2_f20,%f34,P2_f20 fsubd P2_f20,P0_f4,P2_f20 fsubd P0_f2,P2_f20,P2_f20 ba,pt %icc,2f ! delay slot faddd P2_f20,%f32,P2_f20 1: fmuld P0_f0,P0_f0,P0_f2 fmuld P0_f2,C_q4,P0_f4 faddd P0_f4,C_q3,P0_f4 fmuld P0_f2,P0_f4,P0_f4 faddd P0_f4,C_q2,P0_f4 fmuld P0_f2,P0_f4,P0_f4 faddd P0_f4,C_q1,P0_f4 fmuld P0_f2,P0_f4,P0_f4 !!(vsin)fmuld P0_f0,P0_f4,P0_f4 faddd C_ONE,P0_f4,P2_f20 !!(vsin)faddd P0_f0,P0_f4,P2_f20 2: nop !!(vsin) fors P2_f20,P0_f9,P2_f20 st P2_f20,[%o0] st P2_f21,[%o0+4] .endloop0: st P0_f6,[%o3] st P0_f7,[%o3+4] st P1_f16,[%o4] st P1_f17,[%o4+4] st P2_f26,[%o5] st P2_f27,[%o5+4] ! return. finished off with only primary range arguments ret restore .align 32 .range0: cmp %l0,LIM_l6 bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. ! delay slot, annulled if branch not taken mov 0x1,LIM_l6 ! set biguns flag or fdtoi P0_f0,P0_f2; fmovd C_ONE,P0_f0 ; st P0_f0,[%o0] ! *y = *x with inexact if x nonzero st P0_f1,[%o0+4] !nop ! 
(vsin) fdtoi P0_f0,P0_f2 addcc %i0,-1,%i0 ble,pn %icc,.endloop0 ! delay slot, harmless if branch taken add %i3,%i4,%i3 ! y += stridey andn %l1,MSK_SIGN,%l0 ! hx &= ~0x80000000 fmovd P1_f10,P0_f0 ba,pt %icc,.loop0 ! delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .range1: cmp %l1,LIM_l6 bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. ! delay slot, annulled if branch not taken mov 0x2,LIM_l6 ! set biguns flag or fdtoi P1_f10,P1_f12; fmovd C_ONE,P1_f10 ; st P1_f10,[%o1] ! *y = *x with inexact if x nonzero st P1_f11,[%o1+4] !nop ! (vsin) fdtoi P1_f10,P1_f12 addcc %i0,-1,%i0 ble,pn %icc,.endloop1 ! delay slot, harmless if branch taken add %i3,%i4,%i3 ! y += stridey andn %l2,MSK_SIGN,%l1 ! hx &= ~0x80000000 fmovd P2_f20,P1_f10 ba,pt %icc,.loop1 ! delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .range2: cmp %l2,LIM_l6 bg,a,pt %icc,.MEDIUM ! brance to Medium range on big arg. ! delay slot, annulled if branch not taken mov 0x3,LIM_l6 ! set biguns flag or fdtoi P2_f20,P2_f22; fmovd C_ONE,P2_f20 ; st P2_f20,[%o2] ! *y = *x with inexact if x nonzero st P2_f21,[%o2+4] nop ! (vsin) fdtoi P2_f20,P2_f22 1: addcc %i0,-1,%i0 ble,pn %icc,.endloop2 ! delay slot nop ld [%i1],%l2 ld [%i1],P2_f20 ld [%i1+4],P2_f21 andn %l2,MSK_SIGN,%l2 ! hx &= ~0x80000000 ba,pt %icc,.loop2 ! delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .MEDIUM: ! ========== medium range ========== ! register use ! i0 n ! i1 x ! i2 stridex ! i3 y ! i4 stridey ! i5 0x80000000 ! l0 hx0 ! l1 hx1 ! l2 hx2 ! l3 __vlibm_TBL_sincos_hi ! l4 __vlibm_TBL_sincos_lo ! l5 constants ! l6 biguns stored here : still called LIM_l6 ! l7 0x413921fb ! the following are 64-bit registers in both V8+ and V9 ! g1 scratch ! g5 ! o0 py0 ! o1 py1 ! o2 py2 ! o3 n0 ! o4 n1 ! o5 n2 ! o7 scratch ! f0 x0 ! f2 n0,y0 ! f4 ! f6 ! f8 scratch for table base ! f9 signbit0 ! f10 x1 ! f12 n1,y1 ! f14 ! f16 ! f18 scratch for table base ! f19 signbit1 ! f20 x2 ! f22 n2,y2 ! f24 ! f26 ! f28 scratch for table base ! f29 signbit2 ! 
f30 0x80000000 ! f31 0x4000 ! f32 ! f34 ! f36 ! f38 ! f40 invpio2 ! f42 round ! f44 0xffff800000000000 ! f46 pio2_1 ! f48 pio2_2 ! f50 pio2_3 ! f52 pio2_3t ! f54 one ! f56 pp1 ! f58 pp2 ! f60 qq1 ! f62 qq2 PIC_SET(g5,constants,l5) ! %o3,%o4,%o5 need to be stored st P0_f6,[%o3] sethi %hi(0x413921fb),%l7 st P0_f7,[%o3+4] or %l7,%lo(0x413921fb),%l7 st P1_f16,[%o4] st P1_f17,[%o4+4] st P2_f26,[%o5] st P2_f27,[%o5+4] ldd [%l5+invpio2],%f40 ldd [%l5+round],%f42 ldd [%l5+pio2_1],%f46 ldd [%l5+pio2_2],%f48 ldd [%l5+pio2_3],%f50 ldd [%l5+pio2_3t],%f52 std %f54,[%fp+x0_1+8] ! set up stack data std %f54,[%fp+x1_1+8] std %f54,[%fp+x2_1+8] stx %g0,[%fp+y0_0+8] stx %g0,[%fp+y1_0+8] stx %g0,[%fp+y2_0+8] ! branched here in the middle of the array. Need to adjust ! for the members of the triple that were selected in the primary ! loop. ! no adjustment since all three selected here subcc LIM_l6,0x1,%g0 ! continue in LOOP0? bz,a %icc,.LOOP0 mov 0x0,LIM_l6 ! delay slot set biguns=0 ! ajust 1st triple since 2d and 3d done here subcc LIM_l6,0x2,%g0 ! continue in LOOP1? fmuld %f0,%f40,%f2 ! adj LOOP0 bz,a %icc,.LOOP1 mov 0x0,LIM_l6 ! delay slot set biguns=0 ! ajust 1st and 2d triple since 3d done here subcc LIM_l6,0x3,%g0 ! continue in LOOP2? !done fmuld %f0,%f40,%f2 ! adj LOOP0 sub %i3,%i4,%i3 ! adjust to not double increment fmuld %f10,%f40,%f12 ! adj LOOP1 faddd %f2,%f42,%f2 ! adj LOOP1 bz,a %icc,.LOOP2 mov 0x0,LIM_l6 ! delay slot set biguns=0 ba .LOOP0 nop ! -- 16 byte aligned .align 32 .LOOP0: lda [%i1]%asi,%l1 ! preload next argument mov %i3,%o0 ! py0 = y lda [%i1]%asi,%f10 cmp %l0,%l7 add %i3,%i4,%i3 ! y += stridey bg,pn %icc,.BIG0 ! if hx > 0x413921fb ! delay slot lda [%i1+4]%asi,%f11 addcc %i0,-1,%i0 add %i1,%i2,%i1 ! x += stridex ble,pn %icc,.ENDLOOP1 ! delay slot andn %l1,%i5,%l1 nop fmuld %f0,%f40,%f2 fabsd %f54,%f54 ! a nop for alignment only .LOOP1: lda [%i1]%asi,%l2 ! preload next argument mov %i3,%o1 ! py1 = y lda [%i1]%asi,%f20 cmp %l1,%l7 add %i3,%i4,%i3 ! 
y += stridey bg,pn %icc,.BIG1 ! if hx > 0x413921fb ! delay slot lda [%i1+4]%asi,%f21 addcc %i0,-1,%i0 add %i1,%i2,%i1 ! x += stridex ble,pn %icc,.ENDLOOP2 ! delay slot andn %l2,%i5,%l2 nop fmuld %f10,%f40,%f12 faddd %f2,%f42,%f2 .LOOP2: st %f3,[%fp+n0] mov %i3,%o2 ! py2 = y cmp %l2,%l7 add %i3,%i4,%i3 ! y += stridey fmuld %f20,%f40,%f22 bg,pn %icc,.BIG2 ! if hx > 0x413921fb ! delay slot add %l5,thresh+4,%o7 faddd %f12,%f42,%f12 st %f13,[%fp+n1] ! - add %l5,thresh,%g1 faddd %f22,%f42,%f22 st %f23,[%fp+n2] fsubd %f2,%f42,%f2 ! n fsubd %f12,%f42,%f12 ! n fsubd %f22,%f42,%f22 ! n fmuld %f2,%f46,%f4 fmuld %f12,%f46,%f14 fmuld %f22,%f46,%f24 fsubd %f0,%f4,%f4 fmuld %f2,%f48,%f6 fsubd %f10,%f14,%f14 fmuld %f12,%f48,%f16 fsubd %f20,%f24,%f24 fmuld %f22,%f48,%f26 fsubd %f4,%f6,%f0 ld [%fp+n0],%o3 ; add %o3,1,%o3 fsubd %f14,%f16,%f10 ld [%fp+n1],%o4 ; add %o4,1,%o4 fsubd %f24,%f26,%f20 ld [%fp+n2],%o5 ; add %o5,1,%o5 fsubd %f4,%f0,%f32 and %o3,1,%o3 fsubd %f14,%f10,%f34 and %o4,1,%o4 fsubd %f24,%f20,%f36 and %o5,1,%o5 fsubd %f32,%f6,%f32 fmuld %f2,%f50,%f8 sll %o3,3,%o3 fsubd %f34,%f16,%f34 fmuld %f12,%f50,%f18 sll %o4,3,%o4 fsubd %f36,%f26,%f36 fmuld %f22,%f50,%f28 sll %o5,3,%o5 fsubd %f8,%f32,%f8 ld [%g1+%o3],%f6 fsubd %f18,%f34,%f18 ld [%g1+%o4],%f16 fsubd %f28,%f36,%f28 ld [%g1+%o5],%f26 fsubd %f0,%f8,%f4 fsubd %f10,%f18,%f14 fsubd %f20,%f28,%f24 fsubd %f0,%f4,%f32 fsubd %f10,%f14,%f34 fsubd %f20,%f24,%f36 fsubd %f32,%f8,%f32 fmuld %f2,%f52,%f2 fsubd %f34,%f18,%f34 fmuld %f12,%f52,%f12 fsubd %f36,%f28,%f36 fmuld %f22,%f52,%f22 fsubd %f2,%f32,%f2 ld [%o7+%o3],%f8 fsubd %f12,%f34,%f12 ld [%o7+%o4],%f18 fsubd %f22,%f36,%f22 ld [%o7+%o5],%f28 fsubd %f4,%f2,%f0 ! x fsubd %f14,%f12,%f10 ! x fsubd %f24,%f22,%f20 ! x fsubd %f4,%f0,%f4 fsubd %f14,%f10,%f14 fsubd %f24,%f20,%f24 fands %f0,%f30,%f9 ! save signbit fands %f10,%f30,%f19 ! save signbit fands %f20,%f30,%f29 ! 
save signbit fabsd %f0,%f0 std %f0,[%fp+x0_1] fabsd %f10,%f10 std %f10,[%fp+x1_1] fabsd %f20,%f20 std %f20,[%fp+x2_1] fsubd %f4,%f2,%f2 ! y fsubd %f14,%f12,%f12 ! y fsubd %f24,%f22,%f22 ! y fcmpgt32 %f6,%f0,%l0 fcmpgt32 %f16,%f10,%l1 fcmpgt32 %f26,%f20,%l2 ! -- 16 byte aligned fxors %f2,%f9,%f2 fxors %f12,%f19,%f12 fxors %f22,%f29,%f22 fands %f9,%f8,%f9 ! if (n & 1) clear sign bit andcc %l0,2,%g0 bne,pn %icc,.CASE4 ! delay slot fands %f19,%f18,%f19 ! if (n & 1) clear sign bit andcc %l1,2,%g0 bne,pn %icc,.CASE2 ! delay slot fands %f29,%f28,%f29 ! if (n & 1) clear sign bit andcc %l2,2,%g0 bne,pn %icc,.CASE1 ! delay slot fpadd32s %f0,%f31,%f8 sethi %hi(0x3fc3c000),%o7 ld [%fp+x0_1],%l0 fpadd32s %f10,%f31,%f18 add %l3,8,%g1 ld [%fp+x1_1],%l1 fpadd32s %f20,%f31,%f28 ld [%fp+x2_1],%l2 fand %f8,%f44,%f4 sub %l0,%o7,%l0 fand %f18,%f44,%f14 sub %l1,%o7,%l1 fand %f28,%f44,%f24 sub %l2,%o7,%l2 fsubd %f0,%f4,%f0 srl %l0,10,%l0 fsubd %f10,%f14,%f10 srl %l1,10,%l1 fsubd %f20,%f24,%f20 srl %l2,10,%l2 faddd %f0,%f2,%f0 andn %l0,0x1f,%l0 faddd %f10,%f12,%f10 andn %l1,0x1f,%l1 faddd %f20,%f22,%f20 andn %l2,0x1f,%l2 fmuld %f0,%f0,%f2 add %l0,%o3,%l0 fmuld %f10,%f10,%f12 add %l1,%o4,%l1 fmuld %f20,%f20,%f22 add %l2,%o5,%l2 fmuld %f2,%f58,%f6 ldd [%l3+%l0],%f32 fmuld %f12,%f58,%f16 ldd [%l3+%l1],%f34 fmuld %f22,%f58,%f26 ldd [%l3+%l2],%f36 faddd %f6,%f56,%f6 fmuld %f2,%f62,%f4 faddd %f16,%f56,%f16 fmuld %f12,%f62,%f14 faddd %f26,%f56,%f26 fmuld %f22,%f62,%f24 fmuld %f2,%f6,%f6 faddd %f4,%f60,%f4 fmuld %f12,%f16,%f16 faddd %f14,%f60,%f14 fmuld %f22,%f26,%f26 faddd %f24,%f60,%f24 faddd %f6,%f54,%f6 fmuld %f2,%f4,%f4 faddd %f16,%f54,%f16 fmuld %f12,%f14,%f14 faddd %f26,%f54,%f26 fmuld %f22,%f24,%f24 fmuld %f0,%f6,%f6 ldd [%g1+%l0],%f2 fmuld %f10,%f16,%f16 ldd [%g1+%l1],%f12 fmuld %f20,%f26,%f26 ldd [%g1+%l2],%f22 fmuld %f4,%f32,%f4 ldd [%l4+%l0],%f0 fmuld %f14,%f34,%f14 ldd [%l4+%l1],%f10 fmuld %f24,%f36,%f24 ldd [%l4+%l2],%f20 fmuld %f6,%f2,%f6 fmuld %f16,%f12,%f16 fmuld %f26,%f22,%f26 
faddd %f6,%f4,%f6 faddd %f16,%f14,%f16 faddd %f26,%f24,%f26 faddd %f6,%f0,%f6 faddd %f16,%f10,%f16 faddd %f26,%f20,%f26 faddd %f6,%f32,%f6 faddd %f16,%f34,%f16 faddd %f26,%f36,%f26 .FIXSIGN: ld [%fp+n0],%o3 ; add %o3,1,%o3 add %l5,thresh-4,%g1 ld [%fp+n1],%o4 ; add %o4,1,%o4 ld [%fp+n2],%o5 ; add %o5,1,%o5 and %o3,2,%o3 sll %o3,2,%o3 and %o4,2,%o4 lda [%i1]%asi,%l0 ! preload next argument sll %o4,2,%o4 and %o5,2,%o5 ld [%g1+%o3],%f8 sll %o5,2,%o5 ld [%g1+%o4],%f18 ld [%g1+%o5],%f28 fxors %f9,%f8,%f9 lda [%i1]%asi,%f0 fxors %f29,%f28,%f29 lda [%i1+4]%asi,%f1 fxors %f19,%f18,%f19 fors %f6,%f9,%f6 ! tack on sign add %i1,%i2,%i1 ! x += stridex st %f6,[%o0] fors %f26,%f29,%f26 ! tack on sign st %f7,[%o0+4] fors %f16,%f19,%f16 ! tack on sign st %f26,[%o2] st %f27,[%o2+4] addcc %i0,-1,%i0 st %f16,[%o1] andn %l0,%i5,%l0 ! hx &= ~0x80000000 bg,pt %icc,.LOOP0 ! delay slot st %f17,[%o1+4] ba,pt %icc,.ENDLOOP0 ! delay slot nop .align 32 .CASE1: fpadd32s %f10,%f31,%f18 sethi %hi(0x3fc3c000),%o7 ld [%fp+x0_1],%l0 fand %f8,%f44,%f4 add %l3,8,%g1 ld [%fp+x1_1],%l1 fand %f18,%f44,%f14 sub %l0,%o7,%l0 fsubd %f0,%f4,%f0 srl %l0,10,%l0 sub %l1,%o7,%l1 fsubd %f10,%f14,%f10 srl %l1,10,%l1 fmuld %f20,%f20,%f20 ldd [%l5+%o5],%f36 add %l5,%o5,%l2 faddd %f0,%f2,%f0 andn %l0,0x1f,%l0 faddd %f10,%f12,%f10 andn %l1,0x1f,%l1 fmuld %f20,%f36,%f24 ldd [%l2+0x10],%f26 add %fp,%o5,%o5 fmuld %f0,%f0,%f2 add %l0,%o3,%l0 fmuld %f10,%f10,%f12 add %l1,%o4,%l1 faddd %f24,%f26,%f24 ldd [%l2+0x20],%f36 fmuld %f2,%f58,%f6 ldd [%l3+%l0],%f32 fmuld %f12,%f58,%f16 ldd [%l3+%l1],%f34 fmuld %f20,%f24,%f24 ldd [%l2+0x30],%f26 faddd %f6,%f56,%f6 fmuld %f2,%f62,%f4 faddd %f16,%f56,%f16 fmuld %f12,%f62,%f14 faddd %f24,%f36,%f24 ldd [%o5+x2_1],%f36 fmuld %f2,%f6,%f6 faddd %f4,%f60,%f4 fmuld %f12,%f16,%f16 faddd %f14,%f60,%f14 fmuld %f20,%f24,%f24 faddd %f6,%f54,%f6 fmuld %f2,%f4,%f4 ldd [%g1+%l0],%f2 faddd %f16,%f54,%f16 fmuld %f12,%f14,%f14 ldd [%g1+%l1],%f12 faddd %f24,%f26,%f24 fmuld %f0,%f6,%f6 ldd [%l4+%l0],%f0 
fmuld %f10,%f16,%f16 ldd [%l4+%l1],%f10 fmuld %f4,%f32,%f4 std %f22,[%fp+y2_0] fmuld %f14,%f34,%f14 fmuld %f6,%f2,%f6 fmuld %f16,%f12,%f16 fmuld %f20,%f24,%f24 faddd %f6,%f4,%f6 faddd %f16,%f14,%f16 fmuld %f36,%f24,%f24 ldd [%o5+y2_0],%f22 faddd %f6,%f0,%f6 faddd %f16,%f10,%f16 faddd %f24,%f22,%f24 faddd %f6,%f32,%f6 faddd %f16,%f34,%f16 ba,pt %icc,.FIXSIGN ! delay slot faddd %f36,%f24,%f26 .align 32 .CASE2: fpadd32s %f0,%f31,%f8 ld [%fp+x0_1],%l0 andcc %l2,2,%g0 bne,pn %icc,.CASE3 ! delay slot sethi %hi(0x3fc3c000),%o7 fpadd32s %f20,%f31,%f28 ld [%fp+x2_1],%l2 fand %f8,%f44,%f4 sub %l0,%o7,%l0 add %l3,8,%g1 fand %f28,%f44,%f24 sub %l2,%o7,%l2 fsubd %f0,%f4,%f0 srl %l0,10,%l0 fsubd %f20,%f24,%f20 srl %l2,10,%l2 fmuld %f10,%f10,%f10 ldd [%l5+%o4],%f34 add %l5,%o4,%l1 faddd %f0,%f2,%f0 andn %l0,0x1f,%l0 faddd %f20,%f22,%f20 andn %l2,0x1f,%l2 fmuld %f10,%f34,%f14 ldd [%l1+0x10],%f16 add %fp,%o4,%o4 fmuld %f0,%f0,%f2 add %l0,%o3,%l0 fmuld %f20,%f20,%f22 add %l2,%o5,%l2 faddd %f14,%f16,%f14 ldd [%l1+0x20],%f34 fmuld %f2,%f58,%f6 ldd [%l3+%l0],%f32 fmuld %f22,%f58,%f26 ldd [%l3+%l2],%f36 fmuld %f10,%f14,%f14 ldd [%l1+0x30],%f16 faddd %f6,%f56,%f6 fmuld %f2,%f62,%f4 faddd %f26,%f56,%f26 fmuld %f22,%f62,%f24 faddd %f14,%f34,%f14 ldd [%o4+x1_1],%f34 fmuld %f2,%f6,%f6 faddd %f4,%f60,%f4 fmuld %f22,%f26,%f26 faddd %f24,%f60,%f24 fmuld %f10,%f14,%f14 faddd %f6,%f54,%f6 fmuld %f2,%f4,%f4 ldd [%g1+%l0],%f2 faddd %f26,%f54,%f26 fmuld %f22,%f24,%f24 ldd [%g1+%l2],%f22 faddd %f14,%f16,%f14 fmuld %f0,%f6,%f6 ldd [%l4+%l0],%f0 fmuld %f20,%f26,%f26 ldd [%l4+%l2],%f20 fmuld %f4,%f32,%f4 std %f12,[%fp+y1_0] fmuld %f24,%f36,%f24 fmuld %f6,%f2,%f6 fmuld %f26,%f22,%f26 fmuld %f10,%f14,%f14 faddd %f6,%f4,%f6 faddd %f26,%f24,%f26 fmuld %f34,%f14,%f14 ldd [%o4+y1_0],%f12 faddd %f6,%f0,%f6 faddd %f26,%f20,%f26 faddd %f14,%f12,%f14 faddd %f6,%f32,%f6 faddd %f26,%f36,%f26 ba,pt %icc,.FIXSIGN ! 
delay slot faddd %f34,%f14,%f16 .align 32 .CASE3: fand %f8,%f44,%f4 add %l3,8,%g1 sub %l0,%o7,%l0 fmuld %f10,%f10,%f10 ldd [%l5+%o4],%f34 add %l5,%o4,%l1 fsubd %f0,%f4,%f0 srl %l0,10,%l0 fmuld %f20,%f20,%f20 ldd [%l5+%o5],%f36 add %l5,%o5,%l2 fmuld %f10,%f34,%f14 ldd [%l1+0x10],%f16 add %fp,%o4,%o4 faddd %f0,%f2,%f0 andn %l0,0x1f,%l0 fmuld %f20,%f36,%f24 ldd [%l2+0x10],%f26 add %fp,%o5,%o5 faddd %f14,%f16,%f14 ldd [%l1+0x20],%f34 fmuld %f0,%f0,%f2 add %l0,%o3,%l0 faddd %f24,%f26,%f24 ldd [%l2+0x20],%f36 fmuld %f10,%f14,%f14 ldd [%l1+0x30],%f16 fmuld %f2,%f58,%f6 ldd [%l3+%l0],%f32 fmuld %f20,%f24,%f24 ldd [%l2+0x30],%f26 faddd %f14,%f34,%f14 ldd [%o4+x1_1],%f34 faddd %f6,%f56,%f6 fmuld %f2,%f62,%f4 faddd %f24,%f36,%f24 ldd [%o5+x2_1],%f36 fmuld %f10,%f14,%f14 std %f12,[%fp+y1_0] fmuld %f2,%f6,%f6 faddd %f4,%f60,%f4 fmuld %f20,%f24,%f24 std %f22,[%fp+y2_0] faddd %f14,%f16,%f14 faddd %f6,%f54,%f6 fmuld %f2,%f4,%f4 ldd [%g1+%l0],%f2 faddd %f24,%f26,%f24 fmuld %f10,%f14,%f14 fmuld %f0,%f6,%f6 ldd [%l4+%l0],%f0 fmuld %f4,%f32,%f4 fmuld %f20,%f24,%f24 fmuld %f6,%f2,%f6 fmuld %f34,%f14,%f14 ldd [%o4+y1_0],%f12 fmuld %f36,%f24,%f24 ldd [%o5+y2_0],%f22 faddd %f6,%f4,%f6 faddd %f14,%f12,%f14 faddd %f24,%f22,%f24 faddd %f6,%f0,%f6 faddd %f34,%f14,%f16 faddd %f36,%f24,%f26 ba,pt %icc,.FIXSIGN ! delay slot faddd %f6,%f32,%f6 .align 32 .CASE4: fands %f29,%f28,%f29 ! if (n & 1) clear sign bit sethi %hi(0x3fc3c000),%o7 andcc %l1,2,%g0 bne,pn %icc,.CASE6 ! delay slot andcc %l2,2,%g0 fpadd32s %f10,%f31,%f18 ld [%fp+x1_1],%l1 bne,pn %icc,.CASE5 ! 
delay slot add %l3,8,%g1 ld [%fp+x2_1],%l2 fpadd32s %f20,%f31,%f28 fand %f18,%f44,%f14 sub %l1,%o7,%l1 fand %f28,%f44,%f24 sub %l2,%o7,%l2 fsubd %f10,%f14,%f10 srl %l1,10,%l1 fsubd %f20,%f24,%f20 srl %l2,10,%l2 fmuld %f0,%f0,%f0 ldd [%l5+%o3],%f32 add %l5,%o3,%l0 faddd %f10,%f12,%f10 andn %l1,0x1f,%l1 faddd %f20,%f22,%f20 andn %l2,0x1f,%l2 fmuld %f0,%f32,%f4 ldd [%l0+0x10],%f6 add %fp,%o3,%o3 fmuld %f10,%f10,%f12 add %l1,%o4,%l1 fmuld %f20,%f20,%f22 add %l2,%o5,%l2 faddd %f4,%f6,%f4 ldd [%l0+0x20],%f32 fmuld %f12,%f58,%f16 ldd [%l3+%l1],%f34 fmuld %f22,%f58,%f26 ldd [%l3+%l2],%f36 fmuld %f0,%f4,%f4 ldd [%l0+0x30],%f6 faddd %f16,%f56,%f16 fmuld %f12,%f62,%f14 faddd %f26,%f56,%f26 fmuld %f22,%f62,%f24 faddd %f4,%f32,%f4 ldd [%o3+x0_1],%f32 fmuld %f12,%f16,%f16 faddd %f14,%f60,%f14 fmuld %f22,%f26,%f26 faddd %f24,%f60,%f24 fmuld %f0,%f4,%f4 faddd %f16,%f54,%f16 fmuld %f12,%f14,%f14 ldd [%g1+%l1],%f12 faddd %f26,%f54,%f26 fmuld %f22,%f24,%f24 ldd [%g1+%l2],%f22 faddd %f4,%f6,%f4 fmuld %f10,%f16,%f16 ldd [%l4+%l1],%f10 fmuld %f20,%f26,%f26 ldd [%l4+%l2],%f20 fmuld %f14,%f34,%f14 std %f2,[%fp+y0_0] fmuld %f24,%f36,%f24 fmuld %f0,%f4,%f4 fmuld %f16,%f12,%f16 fmuld %f26,%f22,%f26 fmuld %f32,%f4,%f4 ldd [%o3+y0_0],%f2 faddd %f16,%f14,%f16 faddd %f26,%f24,%f26 faddd %f4,%f2,%f4 faddd %f16,%f10,%f16 faddd %f26,%f20,%f26 faddd %f32,%f4,%f6 faddd %f16,%f34,%f16 ba,pt %icc,.FIXSIGN ! 
delay slot faddd %f26,%f36,%f26 .align 32 .CASE5: fand %f18,%f44,%f14 sub %l1,%o7,%l1 fmuld %f0,%f0,%f0 ldd [%l5+%o3],%f32 add %l5,%o3,%l0 fsubd %f10,%f14,%f10 srl %l1,10,%l1 fmuld %f20,%f20,%f20 ldd [%l5+%o5],%f36 add %l5,%o5,%l2 fmuld %f0,%f32,%f4 ldd [%l0+0x10],%f6 add %fp,%o3,%o3 faddd %f10,%f12,%f10 andn %l1,0x1f,%l1 fmuld %f20,%f36,%f24 ldd [%l2+0x10],%f26 add %fp,%o5,%o5 faddd %f4,%f6,%f4 ldd [%l0+0x20],%f32 fmuld %f10,%f10,%f12 add %l1,%o4,%l1 faddd %f24,%f26,%f24 ldd [%l2+0x20],%f36 fmuld %f0,%f4,%f4 ldd [%l0+0x30],%f6 fmuld %f12,%f58,%f16 ldd [%l3+%l1],%f34 fmuld %f20,%f24,%f24 ldd [%l2+0x30],%f26 faddd %f4,%f32,%f4 ldd [%o3+x0_1],%f32 faddd %f16,%f56,%f16 fmuld %f12,%f62,%f14 faddd %f24,%f36,%f24 ldd [%o5+x2_1],%f36 fmuld %f0,%f4,%f4 std %f2,[%fp+y0_0] fmuld %f12,%f16,%f16 faddd %f14,%f60,%f14 fmuld %f20,%f24,%f24 std %f22,[%fp+y2_0] faddd %f4,%f6,%f4 faddd %f16,%f54,%f16 fmuld %f12,%f14,%f14 ldd [%g1+%l1],%f12 faddd %f24,%f26,%f24 fmuld %f0,%f4,%f4 fmuld %f10,%f16,%f16 ldd [%l4+%l1],%f10 fmuld %f14,%f34,%f14 fmuld %f20,%f24,%f24 fmuld %f16,%f12,%f16 fmuld %f32,%f4,%f4 ldd [%o3+y0_0],%f2 fmuld %f36,%f24,%f24 ldd [%o5+y2_0],%f22 faddd %f16,%f14,%f16 faddd %f4,%f2,%f4 faddd %f24,%f22,%f24 faddd %f16,%f10,%f16 faddd %f32,%f4,%f6 faddd %f36,%f24,%f26 ba,pt %icc,.FIXSIGN ! delay slot faddd %f16,%f34,%f16 .align 32 .CASE6: ld [%fp+x2_1],%l2 add %l3,8,%g1 bne,pn %icc,.CASE7 ! 
delay slot fpadd32s %f20,%f31,%f28 fand %f28,%f44,%f24 ldd [%l5+%o3],%f32 add %l5,%o3,%l0 fmuld %f0,%f0,%f0 sub %l2,%o7,%l2 fsubd %f20,%f24,%f20 srl %l2,10,%l2 fmuld %f10,%f10,%f10 ldd [%l5+%o4],%f34 add %l5,%o4,%l1 fmuld %f0,%f32,%f4 ldd [%l0+0x10],%f6 add %fp,%o3,%o3 faddd %f20,%f22,%f20 andn %l2,0x1f,%l2 fmuld %f10,%f34,%f14 ldd [%l1+0x10],%f16 add %fp,%o4,%o4 faddd %f4,%f6,%f4 ldd [%l0+0x20],%f32 fmuld %f20,%f20,%f22 add %l2,%o5,%l2 faddd %f14,%f16,%f14 ldd [%l1+0x20],%f34 fmuld %f0,%f4,%f4 ldd [%l0+0x30],%f6 fmuld %f22,%f58,%f26 ldd [%l3+%l2],%f36 fmuld %f10,%f14,%f14 ldd [%l1+0x30],%f16 faddd %f4,%f32,%f4 ldd [%o3+x0_1],%f32 faddd %f26,%f56,%f26 fmuld %f22,%f62,%f24 faddd %f14,%f34,%f14 ldd [%o4+x1_1],%f34 fmuld %f0,%f4,%f4 std %f2,[%fp+y0_0] fmuld %f22,%f26,%f26 faddd %f24,%f60,%f24 fmuld %f10,%f14,%f14 std %f12,[%fp+y1_0] faddd %f4,%f6,%f4 faddd %f26,%f54,%f26 fmuld %f22,%f24,%f24 ldd [%g1+%l2],%f22 faddd %f14,%f16,%f14 fmuld %f0,%f4,%f4 fmuld %f20,%f26,%f26 ldd [%l4+%l2],%f20 fmuld %f24,%f36,%f24 fmuld %f10,%f14,%f14 fmuld %f26,%f22,%f26 fmuld %f32,%f4,%f4 ldd [%o3+y0_0],%f2 fmuld %f34,%f14,%f14 ldd [%o4+y1_0],%f12 faddd %f26,%f24,%f26 faddd %f4,%f2,%f4 faddd %f14,%f12,%f14 faddd %f26,%f20,%f26 faddd %f32,%f4,%f6 faddd %f34,%f14,%f16 ba,pt %icc,.FIXSIGN ! 
delay slot faddd %f26,%f36,%f26 .align 32 .CASE7: fmuld %f0,%f0,%f0 ldd [%l5+%o3],%f32 add %l5,%o3,%l0 fmuld %f10,%f10,%f10 ldd [%l5+%o4],%f34 add %l5,%o4,%l1 fmuld %f20,%f20,%f20 ldd [%l5+%o5],%f36 add %l5,%o5,%l2 fmuld %f0,%f32,%f4 ldd [%l0+0x10],%f6 add %fp,%o3,%o3 fmuld %f10,%f34,%f14 ldd [%l1+0x10],%f16 add %fp,%o4,%o4 fmuld %f20,%f36,%f24 ldd [%l2+0x10],%f26 add %fp,%o5,%o5 faddd %f4,%f6,%f4 ldd [%l0+0x20],%f32 faddd %f14,%f16,%f14 ldd [%l1+0x20],%f34 faddd %f24,%f26,%f24 ldd [%l2+0x20],%f36 fmuld %f0,%f4,%f4 ldd [%l0+0x30],%f6 fmuld %f10,%f14,%f14 ldd [%l1+0x30],%f16 fmuld %f20,%f24,%f24 ldd [%l2+0x30],%f26 faddd %f4,%f32,%f4 ldd [%o3+x0_1],%f32 faddd %f14,%f34,%f14 ldd [%o4+x1_1],%f34 faddd %f24,%f36,%f24 ldd [%o5+x2_1],%f36 fmuld %f0,%f4,%f4 std %f2,[%fp+y0_0] fmuld %f10,%f14,%f14 std %f12,[%fp+y1_0] fmuld %f20,%f24,%f24 std %f22,[%fp+y2_0] faddd %f4,%f6,%f4 faddd %f14,%f16,%f14 faddd %f24,%f26,%f24 fmuld %f0,%f4,%f4 fmuld %f10,%f14,%f14 fmuld %f20,%f24,%f24 fmuld %f32,%f4,%f4 ldd [%o3+y0_0],%f2 fmuld %f34,%f14,%f14 ldd [%o4+y1_0],%f12 fmuld %f36,%f24,%f24 ldd [%o5+y2_0],%f22 faddd %f4,%f2,%f4 faddd %f14,%f12,%f14 faddd %f24,%f22,%f24 faddd %f32,%f4,%f6 faddd %f34,%f14,%f16 ba,pt %icc,.FIXSIGN ! delay slot faddd %f36,%f24,%f26 .align 32 .ENDLOOP2: fmuld %f10,%f40,%f12 add %l5,thresh,%g1 faddd %f12,%f42,%f12 st %f13,[%fp+n1] fsubd %f12,%f42,%f12 ! n fmuld %f12,%f46,%f14 fsubd %f10,%f14,%f14 fmuld %f12,%f48,%f16 fsubd %f14,%f16,%f10 ld [%fp+n1],%o4 ; add %o4,1,%o4 fsubd %f14,%f10,%f34 and %o4,1,%o4 fsubd %f34,%f16,%f34 fmuld %f12,%f50,%f18 sll %o4,3,%o4 fsubd %f18,%f34,%f18 ld [%g1+%o4],%f16 fsubd %f10,%f18,%f14 fsubd %f10,%f14,%f34 add %l5,thresh+4,%o7 fsubd %f34,%f18,%f34 fmuld %f12,%f52,%f12 fsubd %f12,%f34,%f12 ld [%o7+%o4],%f18 fsubd %f14,%f12,%f10 ! x fsubd %f14,%f10,%f14 fands %f10,%f30,%f19 ! save signbit fabsd %f10,%f10 std %f10,[%fp+x1_1] fsubd %f14,%f12,%f12 ! y fcmpgt32 %f16,%f10,%l1 fxors %f12,%f19,%f12 fands %f19,%f18,%f19 ! 
if (n & 1) clear sign bit andcc %l1,2,%g0 bne,pn %icc,1f ! delay slot nop fpadd32s %f10,%f31,%f18 ld [%fp+x1_1],%l1 fand %f18,%f44,%f14 sethi %hi(0x3fc3c000),%o7 add %l3,8,%g1 fsubd %f10,%f14,%f10 sub %l1,%o7,%l1 srl %l1,10,%l1 faddd %f10,%f12,%f10 andn %l1,0x1f,%l1 fmuld %f10,%f10,%f12 add %l1,%o4,%l1 fmuld %f12,%f58,%f16 ldd [%l3+%l1],%f34 faddd %f16,%f56,%f16 fmuld %f12,%f62,%f14 fmuld %f12,%f16,%f16 faddd %f14,%f60,%f14 faddd %f16,%f54,%f16 fmuld %f12,%f14,%f14 ldd [%g1+%l1],%f12 fmuld %f10,%f16,%f16 ldd [%l4+%l1],%f10 fmuld %f14,%f34,%f14 fmuld %f16,%f12,%f16 faddd %f16,%f14,%f16 faddd %f16,%f10,%f16 ba,pt %icc,2f faddd %f16,%f34,%f16 1: fmuld %f10,%f10,%f10 ldd [%l5+%o4],%f34 add %l5,%o4,%l1 fmuld %f10,%f34,%f14 ldd [%l1+0x10],%f16 add %fp,%o4,%o4 faddd %f14,%f16,%f14 ldd [%l1+0x20],%f34 fmuld %f10,%f14,%f14 ldd [%l1+0x30],%f16 faddd %f14,%f34,%f14 ldd [%o4+x1_1],%f34 fmuld %f10,%f14,%f14 std %f12,[%fp+y1_0] faddd %f14,%f16,%f14 fmuld %f10,%f14,%f14 fmuld %f34,%f14,%f14 ldd [%o4+y1_0],%f12 faddd %f14,%f12,%f14 faddd %f34,%f14,%f16 2: add %l5,thresh-4,%g1 ld [%fp+n1],%o4 ; add %o4,1,%o4 and %o4,2,%o4 sll %o4,2,%o4 ld [%g1+%o4],%f18 fxors %f19,%f18,%f19 fors %f16,%f19,%f16 ! tack on sign st %f16,[%o1] st %f17,[%o1+4] .ENDLOOP1: fmuld %f0,%f40,%f2 add %l5,thresh,%g1 faddd %f2,%f42,%f2 st %f3,[%fp+n0] fsubd %f2,%f42,%f2 ! n fmuld %f2,%f46,%f4 fsubd %f0,%f4,%f4 fmuld %f2,%f48,%f6 fsubd %f4,%f6,%f0 ld [%fp+n0],%o3 ; add %o3,1,%o3 fsubd %f4,%f0,%f32 and %o3,1,%o3 fsubd %f32,%f6,%f32 fmuld %f2,%f50,%f8 sll %o3,3,%o3 fsubd %f8,%f32,%f8 ld [%g1+%o3],%f6 fsubd %f0,%f8,%f4 fsubd %f0,%f4,%f32 add %l5,thresh+4,%o7 fsubd %f32,%f8,%f32 fmuld %f2,%f52,%f2 fsubd %f2,%f32,%f2 ld [%o7+%o3],%f8 fsubd %f4,%f2,%f0 ! x fsubd %f4,%f0,%f4 fands %f0,%f30,%f9 ! save signbit fabsd %f0,%f0 std %f0,[%fp+x0_1] fsubd %f4,%f2,%f2 ! y fcmpgt32 %f6,%f0,%l0 fxors %f2,%f9,%f2 fands %f9,%f8,%f9 ! if (n & 1) clear sign bit andcc %l0,2,%g0 bne,pn %icc,1f ! 
delay slot nop fpadd32s %f0,%f31,%f8 ld [%fp+x0_1],%l0 fand %f8,%f44,%f4 sethi %hi(0x3fc3c000),%o7 add %l3,8,%g1 fsubd %f0,%f4,%f0 sub %l0,%o7,%l0 srl %l0,10,%l0 faddd %f0,%f2,%f0 andn %l0,0x1f,%l0 fmuld %f0,%f0,%f2 add %l0,%o3,%l0 fmuld %f2,%f58,%f6 ldd [%l3+%l0],%f32 faddd %f6,%f56,%f6 fmuld %f2,%f62,%f4 fmuld %f2,%f6,%f6 faddd %f4,%f60,%f4 faddd %f6,%f54,%f6 fmuld %f2,%f4,%f4 ldd [%g1+%l0],%f2 fmuld %f0,%f6,%f6 ldd [%l4+%l0],%f0 fmuld %f4,%f32,%f4 fmuld %f6,%f2,%f6 faddd %f6,%f4,%f6 faddd %f6,%f0,%f6 ba,pt %icc,2f faddd %f6,%f32,%f6 1: fmuld %f0,%f0,%f0 ldd [%l5+%o3],%f32 add %l5,%o3,%l0 fmuld %f0,%f32,%f4 ldd [%l0+0x10],%f6 add %fp,%o3,%o3 faddd %f4,%f6,%f4 ldd [%l0+0x20],%f32 fmuld %f0,%f4,%f4 ldd [%l0+0x30],%f6 faddd %f4,%f32,%f4 ldd [%o3+x0_1],%f32 fmuld %f0,%f4,%f4 std %f2,[%fp+y0_0] faddd %f4,%f6,%f4 fmuld %f0,%f4,%f4 fmuld %f32,%f4,%f4 ldd [%o3+y0_0],%f2 faddd %f4,%f2,%f4 faddd %f32,%f4,%f6 2: add %l5,thresh-4,%g1 ld [%fp+n0],%o3 ; add %o3,1,%o3 and %o3,2,%o3 sll %o3,2,%o3 ld [%g1+%o3],%f8 fxors %f9,%f8,%f9 fors %f6,%f9,%f6 ! tack on sign st %f6,[%o0] st %f7,[%o0+4] .ENDLOOP0: ! check for huge arguments remaining tst LIM_l6 be,pt %icc,.exit ! delay slot nop ! ========== huge range (use C code) ========== #ifdef __sparcv9 ldx [%fp+xsave],%o1 ldx [%fp+ysave],%o3 #else ld [%fp+xsave],%o1 ld [%fp+ysave],%o3 #endif ld [%fp+nsave],%o0 ld [%fp+sxsave],%o2 ld [%fp+sysave],%o4 sra %o2,0,%o2 ! sign-extend for V9 sra %o4,0,%o4 call __vlibm_vcos_big mov %l7,%o5 ! delay slot .exit: ret restore .align 32 .SKIP0: addcc %i0,-1,%i0 ble,pn %icc,.ENDLOOP0 ! delay slot, harmless if branch taken add %i3,%i4,%i3 ! y += stridey andn %l1,%i5,%l0 ! hx &= ~0x80000000 fmovs %f10,%f0 ld [%i1+4],%f1 ba,pt %icc,.LOOP0 ! delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .SKIP1: addcc %i0,-1,%i0 ble,pn %icc,.ENDLOOP1 ! delay slot, harmless if branch taken add %i3,%i4,%i3 ! y += stridey andn %l2,%i5,%l1 ! hx &= ~0x80000000 fmovs %f20,%f10 ld [%i1+4],%f11 ba,pt %icc,.LOOP1 ! 
delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .SKIP2: addcc %i0,-1,%i0 ble,pn %icc,.ENDLOOP2 ! delay slot, harmless if branch taken add %i3,%i4,%i3 ! y += stridey ld [%i1],%l2 ld [%i1],%f20 ld [%i1+4],%f21 andn %l2,%i5,%l2 ! hx &= ~0x80000000 ba,pt %icc,.LOOP2 ! delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .BIG0: sethi %hi(0x7ff00000),%o7 cmp %l0,%o7 bl,a,pt %icc,1f ! if hx < 0x7ff00000 ! delay slot, annulled if branch not taken mov %l7,LIM_l6 ! set biguns flag or fsubd %f0,%f0,%f0 ! y = x - x st %f0,[%o0] st %f1,[%o0+4] 1: addcc %i0,-1,%i0 ble,pn %icc,.ENDLOOP0 ! delay slot, harmless if branch taken andn %l1,%i5,%l0 ! hx &= ~0x80000000 fmovd %f10,%f0 ba,pt %icc,.LOOP0 ! delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .BIG1: sethi %hi(0x7ff00000),%o7 cmp %l1,%o7 bl,a,pt %icc,1f ! if hx < 0x7ff00000 ! delay slot, annulled if branch not taken mov %l7,LIM_l6 ! set biguns flag or fsubd %f10,%f10,%f10 ! y = x - x st %f10,[%o1] st %f11,[%o1+4] 1: addcc %i0,-1,%i0 ble,pn %icc,.ENDLOOP1 ! delay slot, harmless if branch taken andn %l2,%i5,%l1 ! hx &= ~0x80000000 fmovd %f20,%f10 ba,pt %icc,.LOOP1 ! delay slot add %i1,%i2,%i1 ! x += stridex .align 32 .BIG2: sethi %hi(0x7ff00000),%o7 cmp %l2,%o7 bl,a,pt %icc,1f ! if hx < 0x7ff00000 ! delay slot, annulled if branch not taken mov %l7,LIM_l6 ! set biguns flag or fsubd %f20,%f20,%f20 ! y = x - x st %f20,[%o2] st %f21,[%o2+4] 1: addcc %i0,-1,%i0 ble,pn %icc,.ENDLOOP2 ! delay slot nop ld [%i1],%l2 ld [%i1],%f20 ld [%i1+4],%f21 andn %l2,%i5,%l2 ! hx &= ~0x80000000 ba,pt %icc,.LOOP2 ! delay slot add %i1,%i2,%i1 ! x += stridex SET_SIZE(__vcos)