/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsqrtf_ultra3.S"

#include "libm.h"
	.weak	__vsqrtf
	.type	__vsqrtf,#function
	__vsqrtf = __vsqrtf_ultra3

	RO_DATA
	.align	64

! Constant pool, loaded once at entry as five doubles (hi word, lo word).
! K1 and K2 are the coefficients of the quadratic used below:
!   res = (K2 * xx + K1) * xx + 1.0
! DC0 masks the 52-bit fraction of a double, DC1 is the bit pattern of 1.0,
! DC2 keeps sign, exponent and the top 8 fraction bits.
.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! K1 = 5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! K2 = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000

! Register assignments that stay fixed for the whole routine.
#define DC0		%f6
#define DC1		%f4
#define DC2		%f2
#define K2		%f38
#define K1		%f36
#define TBL		%l2
#define stridex		%l3
#define stridey		%l4
#define _0x1ff0		%l5
#define counter		%l6
#define _0x00800000	%l7
#define _0x7f800000	%o0

! Stack temporaries, addressed relative to %fp.
!   tmp_px       input pointer at which the next pass restarts
!   tmp_counter  number of elements left for the next pass
!   tmp0..tmp4   64-bit integer exponent words (lexp0), stored with stx and
!                reloaded with ldd as the double scale factor dtmp1
#define tmp_px		STACK_BIAS-0x40
#define tmp_counter	STACK_BIAS-0x38
#define tmp0		STACK_BIAS-0x30
#define tmp1		STACK_BIAS-0x28
#define tmp2		STACK_BIAS-0x20
#define tmp3		STACK_BIAS-0x18
#define tmp4		STACK_BIAS-0x10

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x40

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!
!  x0 = *px;
!  ax = *(int*)px;
!  px += stridex;
!
!  if( ax >= 0x7f800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!  if( ax < 0x00800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!
!  db0 = (double)x0;
!  iexp0 = ax >> 24;
!  iexp0 += 0x3c0;
!  lexp0 = (long long)iexp0 << 52;
!
!  db0 = vis_fand(db0,DC0);
!  db0 = vis_for(db0,DC1);
!  hi0 = vis_fand(db0,DC2);
!
!  ax >>= 11;
!  si0 = ax & 0x1ff0;
!  dtmp0 = ((double*)((char*)TBL + si0))[0];
!  xx0 = (db0 - hi0);
!  xx0 *= dtmp0;
!  dtmp0 = ((double*)((char*)TBL + si0))[1];
!  res0 = K2 * xx0;
!  res0 += K1;
!  res0 *= xx0;
!  res0 += DC1;
!  res0 = dtmp0 * res0;
!  dtmp1 = *((double*)&lexp0);
!  res0 *= dtmp1;
!  fres0 = (float)res0;
!  *py = fres0;
!  py += stridey;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

! Implementation notes
!
! Incoming arguments, as used by the entry code below:
!   %i0  element count (stored as a 32-bit int in tmp_counter)
!   %i1  px, pointer to the first input float
!   %i2  input stride in floats (shifted left by 2 to get bytes)
!   %i3  py, pointer to the first output float
!   %i4  output stride in floats (shifted left by 2 to get bytes)
! NOTE(review): this matches a C prototype of the form
!   void __vsqrtf(int n, float *x, int stridex, float *y, int stridey)
! but the prototype itself is not visible in this file - confirm.
!
! Each table entry selected by si0 is a pair of doubles: entry [0] scales
! the reduced argument (db0 - hi0) and entry [1] scales the polynomial
! result.  NOTE(review): presumably [0] is 1/hi0 and [1] is sqrt(hi0), with
! the exponent parity folded into the index; the table is defined elsewhere
! (__vlibm_TBL_sqrtf) - confirm there.
!
! The per-instruction tags "(k_s)" name the element slot k (0..4) and how
! many stages behind the newest load that element is; five elements are in
! flight at once and their instructions are interleaved by hand.
!
! All input loads use lda with ASI 0x82.  NOTE(review): in SPARC V9 this is
! the primary no-fault ASI, which would make the loads that run ahead of
! the element count safe - confirm against the architecture manual.
!
! Output pointer: py lives in %g5 everywhere except inside the main loop,
! where %g5 is reused for an exponent and py is carried in %i3.

	ENTRY(__vsqrtf_ultra3)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o2)
	PIC_SET(l7,__vlibm_TBL_sqrtf,l2)

	st	%i0,[%fp+tmp_counter]
	sll	%i2,2,stridex
	or	%g0,0xff8,%l5

	stx	%i1,[%fp+tmp_px]
	sll	%l5,1,_0x1ff0

	ldd	[%o2],K1
	sll	%i4,2,stridey

	ldd	[%o2+8],K2
	or	%g0,%i3,%g5

	ldd	[%o2+16],DC0
	sethi	%hi(0x7f800000),%o0

	ldd	[%o2+24],DC1
	sethi	%hi(0x00800000),%l7

	ldd	[%o2+32],DC2

! Start of a pass: pick up the saved element count and input pointer, and
! clear the saved count so that a pass which is not cut short by an
! .updateN stub leaves nothing pending.
.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	st	%g0,[%fp+tmp_counter]

! Examine the first element of the pass.  If it is outside
! [0x00800000, 0x7f800000) it is handled on its own in .spec, which comes
! back here for the following element.
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit

	lda	[%i1]0x82,%o2		! (2_0) ax = *(int*)px;

	or	%g0,%i1,%o7
	lda	[%i1]0x82,%f25		! (2_0) x0 = *px;

	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.spec		! (2_0) if( ax >= 0x7f800000 )
	nop

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.spec		! (2_0) if( ax < 0x00800000 )
	nop

! Pipeline fill.  The code from here to .main_loop starts work on the first
! eight elements without storing any result.  Every element loaded after
! the first is range checked; a failing element diverts to an .updateN
! stub which rejoins at the matching .contN label.

	fstod	%f25,%f56		! (2_0) db0 = (double)x0;

	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;

	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update0		! (3_0) if( ax >= 0x7f800000 )
	nop
.cont0:
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;

	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update1		! (3_0) if( ax < 0x00800000 )
	nop
.cont1:
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;

	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	add	%o4,960,%i0		! (3_0) iexp0 += 0x3c0;

	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update2		! (4_1) if( ax >= 0x7f800000 )
	nop
.cont2:
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
	sllx	%i0,52,%g1		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g1,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update3		! (4_1) if( ax < 0x00800000 )
	nop
.cont3:
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;

	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update4		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont4:
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update5		! (0_0) if( ax < 0x00800000 )
	nop
.cont5:
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;

	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	add	%o4,stridex,%i1		! px += stridex

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update6		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont6:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update7		! (1_0) if( ax < 0x00800000 )
	nop
.cont7:
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;

	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex

	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update8		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont8:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update9		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	! Not part of the delay slot: the update path rejoins at .cont9 and
	! therefore skips this conversion.
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont9:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex

	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update10		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont10:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update11		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont11:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	! Move py from %g5 into %i3 (%g5 is about to hold an exponent), then
	! choose between the steady-state loop (5 or more elements left in
	! this pass) and the drain code.
	or	%g0,%g5,%i3
	cmp	counter,5
	bl,pn	%icc,.tail
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	ba	.main_loop
	sub	counter,5,counter	! counter

! Steady state: each trip finishes and stores five results (at py through
! py + 4*stridey), advances py by 5*stridey, and starts five new elements.
	.align	16
.main_loop:
	fmuld	K2,%f30,%f60		! (1_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update12		! (4_1) if( ax >= 0x7f800000 )
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
.cont12:
	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	sllx	%g5,52,%g5		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g5,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update13		! (4_1) if( ax < 0x00800000 )
	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
.cont13:
	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
	faddd	%f60,K1,%f32		! (1_1) res0 += K1;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	add	%i1,stridex,%o4		! px += stridex

	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update14		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont14:
	fmuld	%f32,%f30,%f48		! (1_1) res0 *= xx0;
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update15		! (0_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
.cont15:
	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	add	%o3,stridey,%g5		! py += stridey
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	st	%f19,[%o3]		! (3_2) *py = fres0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
	add	%o4,stridex,%i1		! px += stridex
	ldd	[%i4+8],%f60		! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f48,DC1,%f58		! (1_1) res0 += DC1;

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update16		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont16:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	fmuld	%f60,%f58,%f44		! (1_1) res0 = dtmp0 * res0;
	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update17		! (1_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp4],%f34		! (1_1) dtmp1 = *((double*)&lexp0);
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
.cont17:
	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f44,%f34,%f44		! (1_1) res0 *= dtmp1;
	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex

	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update18		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont18:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update19		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont19:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	add	%g5,stridey,%g1		! py += stridey
	st	%f27,[%g5]		! (0_1) *py = fres0;
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex

	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update20		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont20:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f8		! (1_1) fres0 = (float)res0;

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update21		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont21:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	st	%f8,[stridey+%g5]	! (1_1) *py = fres0;
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	add	%g1,stridey,%i3		! py += stridey
	subcc	counter,5,counter	! counter
	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	bpos,pt	%icc,.main_loop
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	add	counter,5,counter	! undo the last subtraction: 0..4 remain

! Drain: finish and store up to four elements that are already in flight.
! Each step first decrements the count and returns to .begin once it goes
! negative; the annulled delay slot then hands the current py back to %g5
! where that is still needed.
.tail:
	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%i3,%g5

	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;

	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);

	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%o3,%g5

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;

	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);

	add	%o3,stridey,%g5		! py += stridey

	st	%f19,[%o3]		! (3_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;

	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	st	%f27,[%g5]		! (0_1) *py = fres0;

	ba	.begin
	add	%g5,stridey,%g5

! Out-of-range first element (ax >= 0x7f800000 or ax < 0x00800000, signed
! compare): take the square root with the fsqrts instruction, store it,
! step both pointers, and look at the next element.
	.align	16
.spec:
	fsqrts	%f25,%f25
	sub	counter,1,counter
	add	%i1,stridex,%i1
	st	%f25,[%g5]
	ba	.begin1
	add	%g5,stridey,%g5

! .update0 - .update21
!
! Reached when an element loaded ahead of the current one fails a range
! check.  Every stub follows the same pattern, with N the number of
! elements of this pass that precede the failing one:
!   - if counter <= N the failing element lies beyond the end of this
!     pass, so nothing is recorded;
!   - the delay slot of that test always zeroes the loaded x0;
!   - otherwise the address of the failing element goes to tmp_px, the
!     count from that element on (counter - N) goes to tmp_counter, the
!     integer copy of the element is replaced (0x7f800000 in the stubs
!     for the ">=" test so that the "<" test that follows does not fire,
!     0 in the stubs for the "<" test), and counter is cut to N.
! The pass then ends after N elements, .begin reloads the saved state, and
! the failing element becomes the first one and is handled by .spec.
! Even-numbered stubs serve the ">= 0x7f800000" test, odd-numbered ones the
! "< 0x00800000" test.  .update0 - .update11 belong to the pipeline fill,
! .update12 - .update21 to the main loop (whose counter has already been
! reduced by 5, hence the smaller constants for the same slot).
	.align	16
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	or	%g0,1,counter

	.align	16
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
.update10:
	cmp	counter,6
	ble	.cont10
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

	.align	16
.update11:
	cmp	counter,6
	ble	.cont11
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,6,counter

	.align	16
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
.update16:
	cmp	counter,4
	ble	.cont16
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
.update19:
	cmp	counter,5
	ble	.cont19
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
.update20:
	cmp	counter,6
	ble	.cont20
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

	.align	16
.update21:
	cmp	counter,6
	ble	.cont21
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont21
	or	%g0,6,counter

! No elements left: pop the register window and return.
.exit:
	ret
	restore
	SET_SIZE(__vsqrtf_ultra3)