/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
*/

	.file	"__vsqrtf.S"

#include "libm.h"

/*
 * __vsqrtf: single-precision vector square root.
 *
 * Stores sqrt(x[i * stridex]) into y[i * stridey] for 0 <= i < n.
 *
 * Presumed C prototype (the declaring header is not visible from this
 * file; confirm against the caller):
 *	void __vsqrtf(int n, float *x, int stridex, float *y, int stridey);
 *
 * In:	%edi = n	element count (signed); nothing is stored if n <= 0
 *	%rsi = x	source pointer
 *	%edx = stridex	source stride in floats (signed)
 *	%rcx = y	destination pointer
 *	%r8d = stridey	destination stride in floats (signed)
 * Out:	none
 * Uses:	%edi, %rsi, %rcx, %rdx, %r8, %r9, %r10, %xmm0-%xmm3, flags
 *
 * Groups of four elements go through sqrtps; the remaining 0-3 elements
 * (or everything, when n < 4) go through sqrtss one at a time.
 */
	ENTRY(__vsqrtf)
	pushq	%rbp
	movq	%rsp,%rbp

/ Widen the 32-bit strides to 64 bits, then turn them from element
/ counts into byte offsets (4 bytes per float).
	movslq	%edx,%rdx
	movslq	%r8d,%r8
	leaq	(,%rdx,4),%rdx		/ rdx = stridex in bytes
	leaq	(,%r8,4),%r8		/ r8  = stridey in bytes

	cmpl	$4,%edi			/ fewer than four elements?
	jl	.remainder		/ then only the scalar path runs

/ The contiguous fast path needs both byte strides to equal one float.
	cmpq	$4,%rdx
	jne	.strided_loop
	cmpq	$4,%r8
	jne	.strided_loop

/ Unit-stride case: each pass consumes and produces four adjacent floats.
	leaq	(,%rdx,4),%r9		/ r9  = source advance per pass
	leaq	(,%r8,4),%r10		/ r10 = destination advance per pass

	.align	16
.unit_loop:
	movups	(%rsi),%xmm0		/ unaligned load of four inputs
	sqrtps	%xmm0,%xmm0
	movups	%xmm0,(%rcx)		/ unaligned store of four results
	addq	%r9,%rsi
	addq	%r10,%rcx
	subl	$4,%edi
	cmpl	$4,%edi			/ at least four elements left?
	jge	.unit_loop

/ Scalar clean-up; here edi holds the number of elements still to do.
.remainder:
	testl	%edi,%edi
	jle	.epilogue

.remainder_loop:
	movss	(%rsi),%xmm0
	sqrtss	%xmm0,%xmm0
	movss	%xmm0,(%rcx)
	addq	%rdx,%rsi
	addq	%r8,%rcx
	decl	%edi
	jg	.remainder_loop

.epilogue:
	leave
	ret

/ General-stride case: gather four inputs, take one packed square root,
/ then scatter the four results.  All four loads are issued before the
/ first store of the pass.
	.align	16
.strided_loop:
	movss	(%rsi),%xmm0		/ xmm0: 0 0 0 x0
	addq	%rdx,%rsi
	movss	(%rsi),%xmm1		/ xmm1: 0 0 0 x1
	addq	%rdx,%rsi
	movss	(%rsi),%xmm2		/ xmm2: 0 0 0 x2
	addq	%rdx,%rsi
	movss	(%rsi),%xmm3		/ xmm3: 0 0 0 x3
	addq	%rdx,%rsi

	unpcklps %xmm1,%xmm0		/ xmm0: 0  0  x1 x0
	unpcklps %xmm3,%xmm2		/ xmm2: 0  0  x3 x2
	movlhps	%xmm2,%xmm0		/ xmm0: x3 x2 x1 x0

	sqrtps	%xmm0,%xmm0		/ xmm0: y3 y2 y1 y0

/ Bring each result down to lane 0 of its own register for movss.
	movaps	%xmm0,%xmm1
	movaps	%xmm0,%xmm3
	movhlps	%xmm0,%xmm2		/ xmm2: 0  0  y3 y2
	shufps	$0x55,%xmm1,%xmm1	/ xmm1: y1 y1 y1 y1
	shufps	$0xff,%xmm3,%xmm3	/ xmm3: y3 y3 y3 y3

	movss	%xmm0,(%rcx)		/ y0
	addq	%r8,%rcx
	movss	%xmm1,(%rcx)		/ y1
	addq	%r8,%rcx
	movss	%xmm2,(%rcx)		/ y2
	addq	%r8,%rcx
	movss	%xmm3,(%rcx)		/ y3
	addq	%r8,%rcx

	subl	$4,%edi
	cmpl	$4,%edi			/ at least four elements left?
	jge	.strided_loop

	jmp	.remainder

	SET_SIZE(__vsqrtf)