1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23 */
24/*
25 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26 * Use is subject to license terms.
27 */
28
29	.file	"__vsqrtf.S"
30
31#include "libm.h"
32
33	ENTRY(__vsqrtf)
34	push	%rbp
35	movq	%rsp,%rbp
36
37/ on entry:
38/   %edi = n
39/   %rsi = x
40/   %edx = stridex
41/   %rcx = y
42/   %r8d = stridey
43
44	movslq	%edx,%rdx		/ sign extend and scale strides
45	shlq	$2,%rdx
46	movslq	%r8d,%r8
47	shlq	$2,%r8
48
49	cmpl	$4,%edi
50	jl	.finish
51
52	cmpq	$4,%rdx
53	jne	.nonunit
54	cmpq	$4,%r8
55	jne	.nonunit
56
57/ unit-stride case
58	movq	%rdx,%r9
59	shlq	$2,%r9
60	movq	%r8,%r10
61	shlq	$2,%r10
62
63	.align	16
64.loop:
65	movups	(%rsi),%xmm0
66	addq	%r9,%rsi
67	sqrtps	%xmm0,%xmm0
68	movups	%xmm0,(%rcx)
69	addq	%r10,%rcx
70	subl	$4,%edi
71	cmpl	$4,%edi
72	jge	.loop
73
74.finish:
75	testl	%edi,%edi
76	jle	.done
77
78.finish_loop:
79	movss	(%rsi),%xmm0
80	addq	%rdx,%rsi
81	sqrtss	%xmm0,%xmm0
82	movss	%xmm0,(%rcx)
83	addq	%r8,%rcx
84	decl	%edi
85	jg	.finish_loop
86
87.done:
88	leave
89	ret
90
91	.align	16
92.nonunit:
93	movss	(%rsi),%xmm0
94	addq	%rdx,%rsi
95	movss	(%rsi),%xmm1
96	addq	%rdx,%rsi
97	movss	(%rsi),%xmm2
98	addq	%rdx,%rsi
99	movss	(%rsi),%xmm3
100	addq	%rdx,%rsi
101
102	movlhps	%xmm1,%xmm0		/ xmm0:   0  x1   0  x0
103	movlhps	%xmm3,%xmm2		/ xmm2:   0  x3   0  x2
104	shufps	$0x88,%xmm2,%xmm0	/ xmm0:  x3  x2  x1  x0
105
106	sqrtps	%xmm0,%xmm0		/ xmm0:  y3  y2  y1  y0
107
108	movaps	%xmm0,%xmm1		/ xmm1:  y3  y2  y1  y0
109	shufps	$0xf5,%xmm0,%xmm1	/ xmm1:  y3  y3  y1  y1
110	movhlps	%xmm0,%xmm2		/ xmm2:   0  x3  y3  y2
111	movhlps	%xmm1,%xmm3		/ xmm3:   0   0  y3  y3
112
113	movss	%xmm0,(%rcx)
114	addq	%r8,%rcx
115	movss	%xmm1,(%rcx)
116	addq	%r8,%rcx
117	movss	%xmm2,(%rcx)
118	addq	%r8,%rcx
119	movss	%xmm3,(%rcx)
120	addq	%r8,%rcx
121
122	subl	$4,%edi
123	cmpl	$4,%edi
124	jge	.nonunit
125
126	jmp	.finish
127
128	SET_SIZE(__vsqrtf)
129