/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *	str[n]cpy - copy [n] chars from second operand into first operand
 */
#include "SYS.h"
#include "proc64_id.h"

/*
 * LABEL(s) forms a label name by appending s to the ".strcpy" prefix.
 * The empty comment between the two pieces is the traditional-cpp way of
 * joining tokens, so this relies on the file being preprocessed in
 * traditional mode.
 */
#define LABEL(s) .strcpy/**/s

/*
 * Entry points and common prologue.
 *
 *	strcpy(char *dst, const char *src)
 *	strncpy(char *dst, const char *src, size_t n)	(USE_AS_STRNCPY)
 *
 * Register usage established here:
 *	%rdi	destination pointer
 *	%rsi	source pointer, rounded down to a 16-byte boundary
 *	%rdx	strncpy count on entry, scratch afterwards
 *	%r8	strncpy only: remaining count
 *	%rax	original destination, kept as the return value
 *	%r9	offset of the source within its 16-byte block
 *	%r10	16 - offset: max src bytes remaining in current dqword
 *	%xmm0	zero comparand for the null-byte checks
 *
 * The prologue examines the first (up to) 32 source bytes for a null byte,
 * copies the first 16 bytes with unaligned 8-byte moves, aligns the
 * destination and then dispatches on the relative src/dest alignment.
 */
#ifdef USE_AS_STRNCPY
	ENTRY(strncpy)
	/*
	 * The count is a 64-bit value: test all of %rdx so that a non-zero
	 * count whose low 32 bits happen to be zero is not mistaken for 0.
	 */
	test	%rdx, %rdx
	jz	LABEL(strncpy_exitz)
	mov	%rdx, %r8
#else
	ENTRY(strcpy)				/* (char *, const char *) */
	xor	%rdx, %rdx
#endif
	mov	%esi, %ecx
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
	and	$0xf, %rcx			/* rcx = src offset in its dqword */
	mov	%rdi, %rax			/* save destination address for return value */


	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
	pmovmskb %xmm0, %edx
	shr	%cl, %edx			/* adjust for offset from 16byte boundary */
	test	%edx, %edx			/* edx will be 0 if chars are non-null */
	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
#ifdef USE_AS_STRNCPY
	/*
	 * Check if the count is satisfied in first 16 bytes examined.
	 * %r11 = count + src offset - 16, which is <= 0 when the count does
	 * not reach past the dqword that was just examined.
	 */
	lea	-16(%r8, %rcx), %r11
	cmp	$0, %r11
	jle	LABEL(less16bytes)
#endif
	mov	%rcx, %r9			/* rsi alignment offset */
	or	%edi, %ecx
	and	$0xf, %ecx
	lea	-16(%r9), %r10			/* lea leaves the flags of the and intact */
	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */

	neg	%r10				/* max src bytes remaining in current dqword */

	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */

#ifdef USE_AS_STRNCPY
	/*
	 * If strncpy count <= 16 go to exit case
	 */
	sub	$16, %r8
	jbe	LABEL(less32bytes_strncpy_truncation)
#endif
	/*
	 * At least 16 bytes to copy to destination string. Move them now.
	 * Don't worry about alignment.
	 */
	mov	(%rsi, %r9), %rdx
	mov	%rdx, (%rdi)
	mov	8(%rsi, %r9), %rdx
	mov	%rdx, 8(%rdi)

	/*
	 * so far destination rdi may be aligned by 16, re-calculate rsi and
	 * jump to corresponding src/dest relative offset case.
	 * 	rcx is offset of rsi
	 * 	rdx is offset of rdi
	 */
	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
	mov	%rax, %rdx			/* rax contains original rdi */
	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
#ifdef USE_AS_STRNCPY
	/*
	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
	 * (ie store twice) if destination was unaligned. Compensate here.
	 */
	add	%rdx, %r8			/* compensate for overlap */
#endif

	add	$16, %rdi			/* next 16 bytes for dest */

	/*
	 * align src to 16-byte boundary. Could be up or down depending on
	 * whether src offset - dest offset > 0 (up) or
	 *  src offset - dest offset < 0 (down).
	 */
	sub	%rdx, %r9			/* src offset - dest offset */

	lea	16(%r9, %rsi), %rsi
	mov	%esi, %ecx			/* for new src offset */
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */

	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
	jz	LABEL(ashr_0)

#ifdef USE_AS_STRNCPY
	xor	%edx, %edx			/* In case unaligned_exit is taken */
#endif
	/*
	 * Jump to case corresponding to source/dest string relative offsets
	 * Index = (16 + (src offset - dest offset)) % 16
	 *
	 * unaligned_table holds 32-bit signed offsets relative to the start
	 * of the table itself; the selected entry is sign-extended and added
	 * back to the table address to form the branch target.
	 */
	lea	-16(%rcx), %r10
	mov	%rcx, %r9
	neg	%r10				/* max src bytes remaining in current dqword */
	lea	LABEL(unaligned_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx

/*
 * ashr_0 handles the following cases:
 * 	src alignment offset = dest alignment offset
 *
 * %rsi and %rdi are both 16-byte aligned on entry, so aligned loads and
 * stores (movdqa) are used throughout. %rcx is zero on entry (both jumps
 * to this label follow an "and $0xf, %ecx" that produced zero) and serves
 * as the running index of the unrolled loop. %xmm0 stays all-zero for as
 * long as no null byte has been found, which lets it be reused as the
 * pcmpeqb comparand without being cleared again. For strncpy, %r8 holds
 * the remaining count and is reduced by 16 before every 16-byte store.
 */
	.p2align 5
LABEL(ashr_0):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi), %xmm1		/* fetch 16 bytes from src string */
	movdqa	%xmm1, (%rdi)		/* store 16 bytes into dest string */
	add	$16, %rsi
	add	$16, %rdi
	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for a null */
	pmovmskb %xmm0, %edx

	test	%edx, %edx		/* edx will be 0 if chars are non-null */
	jnz	LABEL(aligned_16bytes)	/* exit tail */

	/*
	 * Main loop, unrolled four times: copy the dqword that was already
	 * checked, advance the index, then check the next dqword for a null.
	 */
LABEL(ashr_0_loop):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jz	LABEL(ashr_0_loop)
	jmp	LABEL(aligned_exit)


/*
 * ashr_15 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 15
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass checks the aligned source dqword at 16(%rsi, %rcx) for a null
 * byte and, if none is found, stores to (%rdi, %rcx) the 16 bytes made of
 * the top 1 byte of (%rsi, %rcx) followed by the low 15 bytes of
 * 16(%rsi, %rcx). %xmm0 stays all-zero while no null byte has been found,
 * so it is reused as the pcmpeqb comparand without being cleared. For
 * strncpy, %r8 is the remaining count and %r10 the number of source bytes
 * left in the current dqword. .memops_method selects between the SSSE3
 * (palignr) and the SSE2 (shift and or) way of merging the two dqwords.
 */
	.p2align 4
LABEL(ashr_15):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_15_use_sse2)

	.p2align 4
LABEL(ashr_15_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/*
	 * palignr $15, (%rsi, %rcx), %xmm3
	 * hand-encoded (presumably for assemblers without SSSE3 mnemonics)
	 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $15, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_ssse3)

	.p2align 4
LABEL(ashr_15_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$15, %xmm2		/* keep top 1 byte of the lower dqword */
	pslldq	$1, %xmm3		/* make room for it below the upper one */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$15, %xmm2
	pslldq	$1, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_sse2)


/*
 * ashr_14 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 14
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass checks the aligned source dqword at 16(%rsi, %rcx) for a null
 * byte and, if none is found, stores to (%rdi, %rcx) the 16 bytes made of
 * the top 2 bytes of (%rsi, %rcx) followed by the low 14 bytes of
 * 16(%rsi, %rcx). %xmm0 stays all-zero while no null byte has been found,
 * so it is reused as the pcmpeqb comparand without being cleared. For
 * strncpy, %r8 is the remaining count and %r10 the number of source bytes
 * left in the current dqword. .memops_method selects between the SSSE3
 * (palignr) and the SSE2 (shift and or) way of merging the two dqwords.
 */
	.p2align 4
LABEL(ashr_14):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_14_use_sse2)

	.p2align 4
LABEL(ashr_14_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/*
	 * palignr $14, (%rsi, %rcx), %xmm3
	 * hand-encoded (presumably for assemblers without SSSE3 mnemonics)
	 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $14, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_ssse3)

	.p2align 4
LABEL(ashr_14_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$14, %xmm2		/* keep top 2 bytes of the lower dqword */
	pslldq	$2, %xmm3		/* make room for them below the upper one */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$14, %xmm2
	pslldq	$2, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_sse2)


/*
 * ashr_13 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 13
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass checks the aligned source dqword at 16(%rsi, %rcx) for a null
 * byte and, if none is found, stores to (%rdi, %rcx) the 16 bytes made of
 * the top 3 bytes of (%rsi, %rcx) followed by the low 13 bytes of
 * 16(%rsi, %rcx). %xmm0 stays all-zero while no null byte has been found,
 * so it is reused as the pcmpeqb comparand without being cleared. For
 * strncpy, %r8 is the remaining count and %r10 the number of source bytes
 * left in the current dqword. .memops_method selects between the SSSE3
 * (palignr) and the SSE2 (shift and or) way of merging the two dqwords.
 */
	.p2align 4
LABEL(ashr_13):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_13_use_sse2)

	.p2align 4
LABEL(ashr_13_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/*
	 * palignr $13, (%rsi, %rcx), %xmm3
	 * hand-encoded (presumably for assemblers without SSSE3 mnemonics)
	 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $13, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_ssse3)

	.p2align 4
LABEL(ashr_13_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$13, %xmm2		/* keep top 3 bytes of the lower dqword */
	pslldq	$3, %xmm3		/* make room for them below the upper one */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$13, %xmm2
	pslldq	$3, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_sse2)


/*
 * ashr_12 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 12
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass checks the aligned source dqword at 16(%rsi, %rcx) for a null
 * byte and, if none is found, stores to (%rdi, %rcx) the 16 bytes made of
 * the top 4 bytes of (%rsi, %rcx) followed by the low 12 bytes of
 * 16(%rsi, %rcx). %xmm0 stays all-zero while no null byte has been found,
 * so it is reused as the pcmpeqb comparand without being cleared. For
 * strncpy, %r8 is the remaining count and %r10 the number of source bytes
 * left in the current dqword. .memops_method selects between the SSSE3
 * (palignr) and the SSE2 (shift and or) way of merging the two dqwords.
 */
	.p2align 4
LABEL(ashr_12):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_12_use_sse2)

	.p2align 4
LABEL(ashr_12_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/*
	 * palignr $12, (%rsi, %rcx), %xmm3
	 * hand-encoded (presumably for assemblers without SSSE3 mnemonics)
	 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $12, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_ssse3)

	.p2align 4
LABEL(ashr_12_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$12, %xmm2		/* keep top 4 bytes of the lower dqword */
	pslldq	$4, %xmm3		/* make room for them below the upper one */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$12, %xmm2
	pslldq	$4, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_sse2)


/*
 * ashr_11 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 11
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass checks the aligned source dqword at 16(%rsi, %rcx) for a null
 * byte and, if none is found, stores to (%rdi, %rcx) the 16 bytes made of
 * the top 5 bytes of (%rsi, %rcx) followed by the low 11 bytes of
 * 16(%rsi, %rcx). %xmm0 stays all-zero while no null byte has been found,
 * so it is reused as the pcmpeqb comparand without being cleared. For
 * strncpy, %r8 is the remaining count and %r10 the number of source bytes
 * left in the current dqword. .memops_method selects between the SSSE3
 * (palignr) and the SSE2 (shift and or) way of merging the two dqwords.
 */
	.p2align 4
LABEL(ashr_11):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_11_use_sse2)

	.p2align 4
LABEL(ashr_11_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/*
	 * palignr $11, (%rsi, %rcx), %xmm3
	 * hand-encoded (presumably for assemblers without SSSE3 mnemonics)
	 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* palignr $11, (%rsi, %rcx), %xmm3 */
	.byte	0x66, 0x0F, 0x3A, 0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_ssse3)

	.p2align 4
LABEL(ashr_11_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$11, %xmm2		/* keep top 5 bytes of the lower dqword */
	pslldq	$5, %xmm3		/* make room for them below the upper one */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$11, %xmm2
	pslldq	$5, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_sse2)


/*
 * ashr_10 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 10
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass reads the two adjacent 16-byte source blocks at (%rsi, %rcx) and
 * 16(%rsi, %rcx), builds one 16-byte destination block out of the last 6
 * bytes of the first and the first 10 bytes of the second, and stores it to
 * (%rdi, %rcx).  The loop body is unrolled twice.
 *
 * Register use inside this block:
 *	%rcx	byte index shared by source and destination, stepped by 16
 *	%xmm0	null-byte compare register; it is expected to be all-zero on
 *		entry (set up outside this block - confirm against the caller)
 *		and is all-zero again whenever a compare falls through
 *	%edx	byte mask produced by pmovmskb
 *	%xmm2, %xmm3	scratch for the realigned data
 *	%r8, %r10	strncpy only; %r8 is loaded from the count argument at
 *		strncpy entry and reduced by 16 per block here, %r10 is set
 *		up by code outside this block
 *
 * All loads and stores use movdqa, so (%rsi, %rcx) and (%rdi, %rcx) must be
 * 16-byte aligned; that alignment is arranged before control reaches here.
 */
	.p2align 4
LABEL(ashr_10):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave if %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_10_use_sse2)

	/*
	 * SSSE3 variant: a single palignr performs the 10-byte realignment.
	 */
	.p2align 4
LABEL(ashr_10_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* next source block */
	pcmpeqb	%xmm3, %xmm0				/* flag bytes equal to %xmm0 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* a byte matched: finish elsewhere */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
#endif

	/*
	 * The bytes below are the hand-assembled encoding of the instruction
	 * shown on the next line (66 0F 3A 0F /r ib; ModRM 0x1c, SIB 0x0e,
	 * imm8 0x0a).
	 */
	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)			/* store realigned block */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled copy of the sequence above */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_ssse3)

	/*
	 * SSE2 variant: the same realignment built from two byte shifts and
	 * an OR (10 + 6 = 16).
	 */
	.p2align 4
LABEL(ashr_10_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* flag bytes equal to %xmm0 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source block */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source block */

	psrldq	$10, %xmm2				/* lower bytes 10..15 -> 0..5 */
	pslldq	$6, %xmm3				/* upper bytes 0..9 -> 6..15 */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled copy of the sequence above */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$10, %xmm2
	pslldq	$6, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_sse2)

/*
 * ashr_9 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 9
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass reads the two adjacent 16-byte source blocks at (%rsi, %rcx) and
 * 16(%rsi, %rcx), builds one 16-byte destination block out of the last 7
 * bytes of the first and the first 9 bytes of the second, and stores it to
 * (%rdi, %rcx).  The loop body is unrolled twice.
 *
 * Register use inside this block:
 *	%rcx	byte index shared by source and destination, stepped by 16
 *	%xmm0	null-byte compare register; it is expected to be all-zero on
 *		entry (set up outside this block - confirm against the caller)
 *		and is all-zero again whenever a compare falls through
 *	%edx	byte mask produced by pmovmskb
 *	%xmm2, %xmm3	scratch for the realigned data
 *	%r8, %r10	strncpy only; %r8 is loaded from the count argument at
 *		strncpy entry and reduced by 16 per block here, %r10 is set
 *		up by code outside this block
 *
 * All loads and stores use movdqa, so (%rsi, %rcx) and (%rdi, %rcx) must be
 * 16-byte aligned; that alignment is arranged before control reaches here.
 */
	.p2align 4
LABEL(ashr_9):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave if %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_9_use_sse2)

	/*
	 * SSSE3 variant: a single palignr performs the 9-byte realignment.
	 */
	.p2align 4
LABEL(ashr_9_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* next source block */
	pcmpeqb	%xmm3, %xmm0				/* flag bytes equal to %xmm0 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* a byte matched: finish elsewhere */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
#endif

	/*
	 * The bytes below are the hand-assembled encoding of the instruction
	 * shown on the next line (66 0F 3A 0F /r ib; ModRM 0x1c, SIB 0x0e,
	 * imm8 0x09).
	 */
	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)			/* store realigned block */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled copy of the sequence above */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_ssse3)

	/*
	 * SSE2 variant: the same realignment built from two byte shifts and
	 * an OR (9 + 7 = 16).
	 */
	.p2align 4
LABEL(ashr_9_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* flag bytes equal to %xmm0 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source block */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source block */

	psrldq	$9, %xmm2				/* lower bytes 9..15 -> 0..6 */
	pslldq	$7, %xmm3				/* upper bytes 0..8 -> 7..15 */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled copy of the sequence above */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$9, %xmm2
	pslldq	$7, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_sse2)
996533d3a4Edward Gillett
997533d3a4Edward Gillett
/*
 * ashr_8 handles the following cases:
 * 	(16 + (src offset - dest offset)) % 16 = 8
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each pass reads the two adjacent 16-byte source blocks at (%rsi, %rcx) and
 * 16(%rsi, %rcx), builds one 16-byte destination block out of the last 8
 * bytes of the first and the first 8 bytes of the second, and stores it to
 * (%rdi, %rcx).  The loop body is unrolled twice.
 *
 * Register use inside this block:
 *	%rcx	byte index shared by source and destination, stepped by 16
 *	%xmm0	null-byte compare register; it is expected to be all-zero on
 *		entry (set up outside this block - confirm against the caller)
 *		and is all-zero again whenever a compare falls through
 *	%edx	byte mask produced by pmovmskb
 *	%xmm2, %xmm3	scratch for the realigned data
 *	%r8, %r10	strncpy only; %r8 is loaded from the count argument at
 *		strncpy entry and reduced by 16 per block here, %r10 is set
 *		up by code outside this block
 *
 * All loads and stores use movdqa, so (%rsi, %rcx) and (%rdi, %rcx) must be
 * 16-byte aligned; that alignment is arranged before control reaches here.
 */
	.p2align 4
LABEL(ashr_8):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8				/* leave if %r8 <= %r10 (unsigned) */
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_8_use_sse2)

	/*
	 * SSSE3 variant: a single palignr performs the 8-byte realignment.
	 */
	.p2align 4
LABEL(ashr_8_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3			/* next source block */
	pcmpeqb	%xmm3, %xmm0				/* flag bytes equal to %xmm0 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)			/* a byte matched: finish elsewhere */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)	/* count was <= 16 */
#endif

	/*
	 * The bytes below are the hand-assembled encoding of the instruction
	 * shown on the next line (66 0F 3A 0F /r ib; ModRM 0x1c, SIB 0x0e,
	 * imm8 0x08).
	 */
	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)			/* store realigned block */
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled copy of the sequence above */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb %xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_ssse3)

	/*
	 * SSE2 variant: the same realignment built from two byte shifts and
	 * an OR (8 + 8 = 16).
	 */
	.p2align 4
LABEL(ashr_8_use_sse2):
	pcmpeqb 16(%rsi, %rcx), %xmm0			/* flag bytes equal to %xmm0 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3			/* upper source block */
	movdqa	(%rsi, %rcx), %xmm2			/* lower source block */

	psrldq	$8, %xmm2				/* lower bytes 8..15 -> 0..7 */
	pslldq	$8, %xmm3				/* upper bytes 0..7 -> 8..15 */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* second unrolled copy of the sequence above */
	pcmpeqb 16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$8, %xmm2
	pslldq	$8, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_sse2)
1109533d3a4Edward Gillett
1110533d3a4Edward Gillett/*
1111533d3a4Edward Gillett * ashr_7 handles the following cases:
1112533d3a4Edward Gillett * 	(16 + (src offset - dest offset)) % 16 = 7
1113533d3a4Edward Gillett *
1114533d3a4Edward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache
1115533d3a4Edward Gillett * bank, there is no null byte.
1116533d3a4Edward Gillett */
1117533d3a4Edward Gillett	.p2align 4
1118533d3a4Edward GillettLABEL(ashr_7):
1119533d3a4Edward Gillett	xor	%ecx, %ecx				/* clear index */
11207c478bdstevel@tonic-gate#ifdef USE_AS_STRNCPY
1121533d3a4Edward Gillett	cmp	%r10, %r8
1122533d3a4Edward Gillett	jbe	LABEL(unaligned_exit)
1124533d3a4Edward Gillett	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
1125533d3a4Edward Gillett	jz	LABEL(ashr_7_use_sse2)
1127533d3a4Edward Gillett	.p2align 4
1128533d3a4Edward GillettLABEL(ashr_7_use_ssse3):
1129533d3a4Edward Gillett	movdqa	16(%rsi, %rcx), %xmm3
1130533d3a4Edward Gillett	pcmpeqb	%xmm3, %xmm0
1131533d3a4Edward Gillett	pmovmskb %xmm0, %edx
1132533d3a4Edward Gillett	test	%edx, %edx
1133533d3a4Edward Gillett	jnz	LABEL(unaligned_exit)
1134533d3a4Edward Gillett#ifdef USE_AS_STRNCPY
1135533d3a4Edward Gillett	sub	$16, %r8
1136533d3a4Edward Gillett 	jbe	LABEL(strncpy_truncation_unaligned)
1137533d3a4Edward Gillett#endif
1138533d3a4Edward Gillett
1139533d3a4Edward Gillett	#palignr $7, (%rsi, %rcx), %xmm3
1140533d3a4Edward Gillett	.byte	0x66, 0x0F, 0x3A ,0x0F
1141533d3a4Edward Gillett	.byte	0x1c, 0x0e, 0x07
1142533d3a4Edward Gillett
1143533d3a4Edward Gillett	movdqa	%xmm3, (%rdi, %rcx)
1144533d3a4Edward Gillett	add	$16, %rcx
1145533d3a4Edward Gillett
1146533d3a4Edward Gillett#ifdef USE_AS_STRNCPY
1147533d3a4Edward Gillett	cmp	%r10, %r8
1148533d3a4Edward Gillett