1*25c28e83SPiotr Jasiukajtis/*
2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START
3*25c28e83SPiotr Jasiukajtis *
4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the
5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License").
6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License.
7*25c28e83SPiotr Jasiukajtis *
8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing.
10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions
11*25c28e83SPiotr Jasiukajtis * and limitations under the License.
12*25c28e83SPiotr Jasiukajtis *
13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each
14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the
16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying
17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner]
18*25c28e83SPiotr Jasiukajtis *
19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END
20*25c28e83SPiotr Jasiukajtis */
21*25c28e83SPiotr Jasiukajtis/*
22*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23*25c28e83SPiotr Jasiukajtis */
24*25c28e83SPiotr Jasiukajtis/*
25*25c28e83SPiotr Jasiukajtis * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26*25c28e83SPiotr Jasiukajtis * Use is subject to license terms.
27*25c28e83SPiotr Jasiukajtis */
28*25c28e83SPiotr Jasiukajtis
29*25c28e83SPiotr Jasiukajtis	.file	"__vsqrtf.S"
30*25c28e83SPiotr Jasiukajtis
31*25c28e83SPiotr Jasiukajtis#include "libm.h"
32*25c28e83SPiotr Jasiukajtis
/*
 * __vsqrtf: single-precision vector square root.
 *
 * For i = 0 .. n-1, stores sqrt(x[i * stridex]) into y[i * stridey].
 * Both strides are counted in 4-byte float elements; they are
 * sign-extended to 64 bits and scaled to byte offsets on entry.
 * Nothing is loaded or stored when n <= 0.
 *
 * Elements are processed four at a time with sqrtps while at least
 * four remain; whatever is left over (including the whole input when
 * n < 4) is handled one element at a time with sqrtss.  Within a group
 * of four, all four inputs are loaded before the first result is
 * stored, and results are stored in increasing index order.
 *
 * On entry:
 *   %edi = n
 *   %rsi = x
 *   %edx = stridex
 *   %rcx = y
 *   %r8d = stridey
 *
 * Modifies %rdi, %rsi, %rdx, %rcx, %r8, %xmm0-%xmm3 and the flags.
 * No return value is produced.
 */
	ENTRY(__vsqrtf)
	push	%rbp			/ conventional frame, no stack locals
	movq	%rsp,%rbp

	movslq	%r8d,%r8		/ widen the int strides to 64 bits ...
	movslq	%edx,%rdx
	shlq	$2,%r8			/ ... and turn them into byte offsets
	shlq	$2,%rdx

	cmpl	$4,%edi			/ fewer than four elements:
	jl	.Ltail			/   scalar loop only

	cmpq	$4,%rdx			/ the packed load/store loop needs
	jne	.Lstrided		/   both x and y to be contiguous
	cmpq	$4,%r8
	jne	.Lstrided

/ contiguous case: four floats (16 bytes) per iteration, unaligned access

	.align	16
.Lpacked:
	movups	(%rsi),%xmm0		/ xmm0:  x3  x2  x1  x0
	addq	$16,%rsi
	sqrtps	%xmm0,%xmm0		/ xmm0:  y3  y2  y1  y0
	movups	%xmm0,(%rcx)
	addq	$16,%rcx
	subl	$4,%edi
	cmpl	$4,%edi
	jge	.Lpacked		/ repeat while n >= 4

/ leftovers: here n < 4, and n may be zero or negative

.Ltail:
	testl	%edi,%edi
	jle	.Lreturn

.Ltail_next:
	movss	(%rsi),%xmm0
	addq	%rdx,%rsi
	sqrtss	%xmm0,%xmm0
	movss	%xmm0,(%rcx)
	addq	%r8,%rcx
	decl	%edi
	jg	.Ltail_next		/ repeat while n > 0

.Lreturn:
	leave
	ret

/ strided case: gather four inputs, one packed sqrt, scatter four results

	.align	16
.Lstrided:
	movss	(%rsi),%xmm0		/ xmm0:   0   0   0  x0
	addq	%rdx,%rsi
	movss	(%rsi),%xmm1		/ xmm1:   0   0   0  x1
	addq	%rdx,%rsi
	movss	(%rsi),%xmm2		/ xmm2:   0   0   0  x2
	addq	%rdx,%rsi
	movss	(%rsi),%xmm3		/ xmm3:   0   0   0  x3
	addq	%rdx,%rsi

	unpcklps %xmm1,%xmm0		/ xmm0:   0   0  x1  x0
	unpcklps %xmm3,%xmm2		/ xmm2:   0   0  x3  x2
	movlhps	%xmm2,%xmm0		/ xmm0:  x3  x2  x1  x0

	sqrtps	%xmm0,%xmm0		/ xmm0:  y3  y2  y1  y0

	/ move y1, y2 and y3 down to lane 0 of xmm1, xmm2 and xmm3
	movaps	%xmm0,%xmm1		/ xmm1:  y3  y2  y1  y0
	shufps	$0xf5,%xmm0,%xmm1	/ xmm1:  y3  y3  y1  y1
	movhlps	%xmm0,%xmm2		/ xmm2:   0   0  y3  y2
	movhlps	%xmm1,%xmm3		/ xmm3:   0   0  y3  y3

	movss	%xmm0,(%rcx)		/ y0
	addq	%r8,%rcx
	movss	%xmm1,(%rcx)		/ y1
	addq	%r8,%rcx
	movss	%xmm2,(%rcx)		/ y2
	addq	%r8,%rcx
	movss	%xmm3,(%rcx)		/ y3
	addq	%r8,%rcx

	subl	$4,%edi
	cmpl	$4,%edi
	jge	.Lstrided		/ repeat while n >= 4

	jmp	.Ltail			/ finish the last n mod 4 elements

	SET_SIZE(__vsqrtf)
129