xref: /illumos-gate/usr/src/lib/libc/amd64/gen/strcmp.S (revision 5d9d9091)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009, Intel Corporation
24 * All rights reserved.
25 */
26
27/*
28 *	str[n]cmp - compare the characters of two strings
29 */
30
31#include "SYS.h"
32#include "proc64_id.h"
33
/* LABEL(s) pastes `s' onto the `.strcmp' prefix to form this file's label names. */
34#define LABEL(s) .strcmp##s
35
36#ifdef USE_AS_STRNCMP
37	/*
	 * UPDATE_STRNCMP_COUNTER (strncmp build only) computes the new
	 * remaining count %r11 - (16 - %rcx) in %r9 and, unless it leaves
	 * through strcmp_exitz, stores it back into %r11. At its use sites
	 * %rcx is the offset inside the first 16-byte block, so 16 - %rcx is
	 * the number of bytes that block contributed to the comparison.
	 *
38	 * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
39	 * if the new counter > the old one or is 0.
	 *
	 * Clobbers %r9 and the flags. In the strcmp build the macro expands
	 * to nothing.
40	 */
41#define UPDATE_STRNCMP_COUNTER				\
42	/* calculate left number to compare */		\
43	lea	-16(%rcx, %r11), %r9;			\
44	cmp	%r9, %r11;				\
45	jb	LABEL(strcmp_exitz);			\
46	test	%r9, %r9;				\
47	je	LABEL(strcmp_exitz);			\
48	mov	%r9, %r11
49#else
50#define UPDATE_STRNCMP_COUNTER
51#endif
52
53	/*
54	 * This implementation uses SSE to compare up to 16 bytes at a time.
	 *
	 * Fast path: when the offset of each pointer inside its 64-byte cache
	 * line is at most 0x30, the first 16 bytes of both strings are loaded
	 * unaligned and compared. A mismatch or null char goes to less16bytes;
	 * otherwise both pointers advance by 16 and execution falls through
	 * into crosscache. The low 4 bits of %ecx/%eax (the 16-byte offsets
	 * of %rsi/%rdi) are still valid there because adding 16 does not
	 * change them.
55	 */
56#ifdef USE_AS_STRNCMP
57	ENTRY(strncmp)
	/* %rdx is the length limit; a limit of 0 leaves through strcmp_exitz */
58	test	%rdx, %rdx
59	je	LABEL(strcmp_exitz)
	/* %r11 holds the remaining byte count from here on */
60	mov	%rdx, %r11
61#else
62	ENTRY(strcmp)			/* (const char *, const char *) */
63#endif
64	mov	%esi, %ecx
65	mov	%edi, %eax
66	and	$0x3f, %rcx		/* rsi alignment in cache line */
67	and	$0x3f, %rax		/* rdi alignment in cache line */
68	cmp	$0x30, %ecx
69	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
70	cmp	$0x30, %eax
71	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
	/* load 16 bytes of each string as two 8-byte halves */
72	movlpd	(%rdi), %xmm1
73	movlpd	(%rsi), %xmm2
74	movhpd	8(%rdi), %xmm1
75	movhpd	8(%rsi), %xmm2
76	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
77	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
78	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
	/*
	 * After the subtraction a byte of %xmm1 keeps its top bit set only
	 * where the two strings hold the same non-null char, so the mask in
	 * %edx is 0xffff exactly when all 16 bytes match with no null char.
	 */
79	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
80	pmovmskb %xmm1, %edx
81	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
82	jnz	LABEL(less16bytes)	/* If not, found mismatch or null char */
83#ifdef USE_AS_STRNCMP
84	sub	$16, %r11
85	jbe	LABEL(strcmp_exitz)	/* finish comparison */
86#endif
87	add	$16, %rsi		/* prepare to search next 16 bytes */
88	add	$16, %rdi		/* prepare to search next 16 bytes */
89
90	/*
91	 * Determine rdi and rsi string offsets from 16-byte alignment.
92	 * Use relative offset difference between the two to determine which case
93	 * below to use.
	 *
	 * Both pointers are rounded down to a 16-byte boundary and their
	 * offsets inside that block are kept in %ecx (rsi) and %eax (rdi).
	 * Equal offsets go to ashr_0. Otherwise the pointers and offsets are
	 * exchanged when needed so that %ecx holds the larger offset, and
	 * %r8d is set to 0xffff if that exchange happened (0 if not). The
	 * offset difference %rcx - %rax then indexes unaligned_table, whose
	 * 32-bit entries are added to the table address to form the jump
	 * target.
	 *
	 * %edx is left holding 0xffff for the first-block mask test in the
	 * ashr_* code.
94	 */
95	.p2align 4
96LABEL(crosscache):
97	and	$0xfffffffffffffff0, %rsi	/* force %rsi to be 16 byte aligned */
98	and	$0xfffffffffffffff0, %rdi	/* force %rdi to be 16 byte aligned */
99	mov	$0xffff, %edx			/* for equivalent offset */
100	xor	%r8d, %r8d
101	and	$0xf, %ecx			/* offset of rsi */
102	and	$0xf, %eax			/* offset of rdi */
103	cmp	%eax, %ecx
104	je	LABEL(ashr_0)			/* both strings have the same alignment */
105	ja	LABEL(bigger)
106	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
107	xchg	%ecx, %eax
108	xchg	%rsi, %rdi
109LABEL(bigger):
	/* %r9 = larger offset - smaller offset, the index into the table */
110	mov	%rcx, %r9
111	sub	%rax, %r9
112	lea	LABEL(unaligned_table)(%rip), %r10
	/* sign-extend the 32-bit table entry and add it to the table address */
113	movslq	(%r10, %r9, 4), %r9
114	lea	(%r10, %r9), %r10
115	jmp	*%r10				/* jump to corresponding case */
116
117/*
118 * ashr_0 handles the following cases:
119 * 	str1 offset = str2 offset
 *
 * Entered from crosscache with %rsi and %rdi rounded down to 16-byte
 * boundaries, %ecx = the common offset inside the block and %edx = 0xffff.
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. The
 * remaining blocks are compared with aligned loads in loop_ashr_0, indexed
 * by %rcx, and a mismatch or null char leaves through exit.
120 */
121	.p2align 4
122LABEL(ashr_0):
123	movdqa	(%rsi), %xmm1
124	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
125	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
126	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
127	psubb	%xmm0, %xmm1			/* packed sub of comparison results */
128	pmovmskb %xmm1, %r9d
129	shr	%cl, %edx			/* adjust 0xffff for offset */
130	shr	%cl, %r9d			/* adjust for 16-byte offset */
131	sub	%r9d, %edx
132	/*
133	 * The result is zero only if the 16 - %rcx bytes from the starting
134	 * offset to the end of the block match and contain no null char.
135	 */
136	jne	LABEL(less32bytes)		/* mismatch or null char */
137	UPDATE_STRNCMP_COUNTER
138	mov	$16, %rcx
139	mov	$16, %r9
140	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
141
142	/*
143	 * Now both strings are aligned at 16-byte boundary. Loop over strings
144	 * checking 32-bytes per iteration.
	 *
	 * %xmm0 is zero on entry to each half of the loop body: the only way
	 * past the jnz is a full match with no null char, which leaves the
	 * null-char compare result in %xmm0 all zero.
145	 */
146	.p2align 4
147LABEL(loop_ashr_0):
148	movdqa	(%rsi, %rcx), %xmm1
149	movdqa	(%rdi, %rcx), %xmm2
150
151	pcmpeqb	%xmm1, %xmm0
152	pcmpeqb	%xmm2, %xmm1
153	psubb	%xmm0, %xmm1
154	pmovmskb %xmm1, %edx
155	sub	$0xffff, %edx
156	jnz	LABEL(exit)		/* mismatch or null char seen */
157
158#ifdef USE_AS_STRNCMP
159	sub	$16, %r11
160	jbe	LABEL(strcmp_exitz)
161#endif
162	add	$16, %rcx
	/* second 16-byte block of this iteration */
163	movdqa	(%rsi, %rcx), %xmm1
164	movdqa	(%rdi, %rcx), %xmm2
165
166	pcmpeqb	%xmm1, %xmm0
167	pcmpeqb	%xmm2, %xmm1
168	psubb	%xmm0, %xmm1
169	pmovmskb %xmm1, %edx
170	sub	$0xffff, %edx
171	jnz	LABEL(exit)
172#ifdef USE_AS_STRNCMP
173	sub	$16, %r11
174	jbe	LABEL(strcmp_exitz)
175#endif
176	add	$16, %rcx
177	jmp	LABEL(loop_ashr_0)
178
179/*
180 * ashr_1 handles the following cases:
181 * 	abs(str1 offset - str2 offset) = 15
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 1 byte (%xmm3) or'ed with the current block shifted left by 15 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
182 */
183	.p2align 4
184LABEL(ashr_1):
185	pxor	%xmm0, %xmm0
186	movdqa	(%rdi), %xmm2
187	movdqa	(%rsi), %xmm1
188	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
189	pslldq	$15, %xmm2		/* shift first string to align with second */
190	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
191	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
192	pmovmskb %xmm2, %r9d
193	shr	%cl, %edx		/* adjust 0xffff for offset */
194	shr	%cl, %r9d		/* adjust for 16-byte offset */
195	sub	%r9d, %edx
196	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
197	movdqa	(%rdi), %xmm3
198	UPDATE_STRNCMP_COUNTER
199
200	pxor	%xmm0, %xmm0
201	mov	$16, %rcx		/* index for loads */
202	mov	$1, %r9d		/* rdi bytes already examined. Used in exit code */
203	/*
204	 * Setup %r10 value allows us to detect crossing a page boundary.
205	 * When %r10 goes positive we are crossing a page boundary and
206	 * need to do a nibble.
207	 */
208	lea	1(%rdi), %r10
209	and	$0xfff, %r10		/* offset into 4K page */
210	sub	$0x1000, %r10		/* subtract 4K pagesize */
211	movdqa	%xmm3, %xmm4
212
213	.p2align 4
214LABEL(loop_ashr_1):
215	add	$16, %r10
216	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
217
218LABEL(gobble_ashr_1):
219	movdqa	(%rsi, %rcx), %xmm1
220	movdqa	(%rdi, %rcx), %xmm2
221	movdqa	%xmm2, %xmm4		 /* store for next cycle */
222
223	psrldq	$1, %xmm3
224	pslldq	$15, %xmm2
225	por	%xmm3, %xmm2		/* merge into one 16byte value */
226
227	pcmpeqb	%xmm1, %xmm0
228	pcmpeqb	%xmm2, %xmm1
229	psubb	%xmm0, %xmm1
230	pmovmskb %xmm1, %edx
231	sub	$0xffff, %edx
232	jnz	LABEL(exit)
233
234#ifdef USE_AS_STRNCMP
235	sub	$16, %r11
236	jbe	LABEL(strcmp_exitz)
237#endif
238	add	$16, %rcx
239	movdqa	%xmm4, %xmm3
240
241	add	$16, %r10
242	jg	LABEL(nibble_ashr_1)	/* cross page boundary */
243
244	movdqa	(%rsi, %rcx), %xmm1
245	movdqa	(%rdi, %rcx), %xmm2
246	movdqa	%xmm2, %xmm4		/* store for next cycle */
247
248	psrldq	$1, %xmm3
249	pslldq 	$15, %xmm2
250	por	%xmm3, %xmm2		/* merge into one 16byte value */
251
252	pcmpeqb	%xmm1, %xmm0
253	pcmpeqb	%xmm2, %xmm1
254	psubb	%xmm0, %xmm1
255	pmovmskb %xmm1, %edx
256	sub	$0xffff, %edx
257	jnz	LABEL(exit)
258
259#ifdef USE_AS_STRNCMP
260	sub	$16, %r11
261	jbe	LABEL(strcmp_exitz)
262#endif
263	add	$16, %rcx
264	movdqa	%xmm4, %xmm3
265	jmp	LABEL(loop_ashr_1)
266
267	/*
268	 * Nibble avoids loads across page boundary. This is to avoid a potential
269	 * access into unmapped memory.
	 *
	 * Only the 15 bytes of the %rdi block still held in %xmm4 are compared
	 * with the start of the next %rsi block; the mask is 0x7fff when they
	 * all match with no null char. Only then does gobble_ashr_1 load the
	 * %rdi block that starts the next 4K page.
270	 */
271	.p2align 4
272LABEL(nibble_ashr_1):
273	psrldq	$1, %xmm4
274	movdqa	(%rsi, %rcx), %xmm1
275	pcmpeqb	%xmm1, %xmm0
276	pcmpeqb	%xmm4, %xmm1
277	psubb	%xmm0, %xmm1
278	pmovmskb %xmm1, %edx
279	sub	$0x7fff, %edx
280	jnz	LABEL(exit)
281#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 15 bytes just matched */
282	cmp	$15, %r11
283	jbe	LABEL(strcmp_exitz)
284#endif
	/* byte 15 of the %rsi block may have set %xmm0; clear it again */
285	pxor	%xmm0, %xmm0
286	sub	$0x1000, %r10		/* subtract 4K from %r10 */
287	jmp	LABEL(gobble_ashr_1)
288
289/*
290 * ashr_2 handles the following cases:
291 * 	abs(str1 offset - str2 offset) = 14
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 2 bytes (%xmm3) or'ed with the current block shifted left by 14 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
292 */
293	.p2align 4
294LABEL(ashr_2):
295	pxor	%xmm0, %xmm0
296	movdqa	(%rdi), %xmm2
297	movdqa	(%rsi), %xmm1
298	pcmpeqb	%xmm1, %xmm0
299	pslldq	$14, %xmm2
300	pcmpeqb	%xmm1, %xmm2
301	psubb	%xmm0, %xmm2
302	pmovmskb %xmm2, %r9d
303	shr	%cl, %edx
304	shr	%cl, %r9d
305	sub	%r9d, %edx
306	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
307	movdqa	(%rdi), %xmm3
308	UPDATE_STRNCMP_COUNTER
309
310	pxor	%xmm0, %xmm0
311	mov	$16, %rcx	/* index for loads */
312	mov	$2, %r9d	/* rdi bytes already examined. Used in exit code */
313	/*
314	 * Setup %r10 value allows us to detect crossing a page boundary.
315	 * When %r10 goes positive we are crossing a page boundary and
316	 * need to do a nibble.
317	 */
318	lea	2(%rdi), %r10
319	and	$0xfff, %r10	/* offset into 4K page */
320	sub	$0x1000, %r10	/* subtract 4K pagesize */
321	movdqa	%xmm3, %xmm4
322
323	.p2align 4
324LABEL(loop_ashr_2):
325	add	$16, %r10
326	jg	LABEL(nibble_ashr_2)
327
328LABEL(gobble_ashr_2):
329	movdqa	(%rsi, %rcx), %xmm1
330	movdqa	(%rdi, %rcx), %xmm2
331	movdqa	%xmm2, %xmm4
332
333	psrldq	$2, %xmm3
334	pslldq	$14, %xmm2
335	por	%xmm3, %xmm2
336
337	pcmpeqb	%xmm1, %xmm0
338	pcmpeqb	%xmm2, %xmm1
339	psubb	%xmm0, %xmm1
340	pmovmskb %xmm1, %edx
341	sub	$0xffff, %edx
342	jnz	LABEL(exit)
343
344#ifdef USE_AS_STRNCMP
345	sub	$16, %r11
346	jbe	LABEL(strcmp_exitz)
347#endif
348
349	add	$16, %rcx
350	movdqa	%xmm4, %xmm3
351
352	add	$16, %r10
353	jg	LABEL(nibble_ashr_2)	/* cross page boundary */
354
355	movdqa	(%rsi, %rcx), %xmm1
356	movdqa	(%rdi, %rcx), %xmm2
357	movdqa	%xmm2, %xmm4
358
359	psrldq	$2, %xmm3
360	pslldq 	$14, %xmm2
361	por	%xmm3, %xmm2
362
363	pcmpeqb	%xmm1, %xmm0
364	pcmpeqb	%xmm2, %xmm1
365	psubb	%xmm0, %xmm1
366	pmovmskb %xmm1, %edx
367	sub	$0xffff, %edx
368	jnz	LABEL(exit)
369
370#ifdef USE_AS_STRNCMP
371	sub	$16, %r11
372	jbe	LABEL(strcmp_exitz)
373#endif
374
375	add	$16, %rcx
376	movdqa	%xmm4, %xmm3
377	jmp	LABEL(loop_ashr_2)
378
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 14
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x3fff when they all match with no
	 * null char. Only then does gobble_ashr_2 load from the next page.
	 */
379	.p2align 4
380LABEL(nibble_ashr_2):
381	psrldq	$2, %xmm4
382	movdqa	(%rsi, %rcx), %xmm1
383	pcmpeqb	%xmm1, %xmm0
384	pcmpeqb	%xmm4, %xmm1
385	psubb	%xmm0, %xmm1
386	pmovmskb %xmm1, %edx
387	sub	$0x3fff, %edx
388	jnz	LABEL(exit)
389#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 14 bytes just matched */
390	cmp	$14, %r11
391	jbe	LABEL(strcmp_exitz)
392#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
393	pxor	%xmm0, %xmm0
394	sub	$0x1000, %r10		/* subtract 4K from %r10 */
395	jmp	LABEL(gobble_ashr_2)
396
397/*
398 * ashr_3 handles the following cases:
399 * 	abs(str1 offset - str2 offset) = 13
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 3 bytes (%xmm3) or'ed with the current block shifted left by 13 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
400 */
401	.p2align 4
402LABEL(ashr_3):
403	pxor	%xmm0, %xmm0
404	movdqa	(%rdi), %xmm2
405	movdqa	(%rsi), %xmm1
406	pcmpeqb	%xmm1, %xmm0
407	pslldq	$13, %xmm2
408	pcmpeqb	%xmm1, %xmm2
409	psubb	%xmm0, %xmm2
410	pmovmskb %xmm2, %r9d
411	shr	%cl, %edx
412	shr	%cl, %r9d
413	sub	%r9d, %edx
414	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
415	movdqa	(%rdi), %xmm3
416
417	UPDATE_STRNCMP_COUNTER
418
419	pxor	%xmm0, %xmm0
420	mov	$16, %rcx	/* index for loads */
421	mov	$3, %r9d	/* rdi bytes already examined. Used in exit code */
422	/*
423	 * Setup %r10 value allows us to detect crossing a page boundary.
424	 * When %r10 goes positive we are crossing a page boundary and
425	 * need to do a nibble.
426	 */
427	lea	3(%rdi), %r10
428	and	$0xfff, %r10	/* offset into 4K page */
429	sub	$0x1000, %r10	/* subtract 4K pagesize */
430	movdqa	%xmm3, %xmm4
431
432	.p2align 4
433LABEL(loop_ashr_3):
434	add	$16, %r10
435	jg	LABEL(nibble_ashr_3)
436
437LABEL(gobble_ashr_3):
438	movdqa	(%rsi, %rcx), %xmm1
439	movdqa	(%rdi, %rcx), %xmm2
440	movdqa	%xmm2, %xmm4
441
442	psrldq	$3, %xmm3
443	pslldq	$13, %xmm2
444	por	%xmm3, %xmm2
445
446	pcmpeqb	%xmm1, %xmm0
447	pcmpeqb	%xmm2, %xmm1
448	psubb	%xmm0, %xmm1
449	pmovmskb %xmm1, %edx
450	sub	$0xffff, %edx
451	jnz	LABEL(exit)
452
453#ifdef USE_AS_STRNCMP
454	sub	$16, %r11
455	jbe	LABEL(strcmp_exitz)
456#endif
457
458	add	$16, %rcx
459	movdqa	%xmm4, %xmm3
460
461	add	$16, %r10
462	jg	LABEL(nibble_ashr_3)	/* cross page boundary */
463
464	movdqa	(%rsi, %rcx), %xmm1
465	movdqa	(%rdi, %rcx), %xmm2
466	movdqa	%xmm2, %xmm4
467
468	psrldq	$3, %xmm3
469	pslldq 	$13, %xmm2
470	por	%xmm3, %xmm2
471
472	pcmpeqb	%xmm1, %xmm0
473	pcmpeqb	%xmm2, %xmm1
474	psubb	%xmm0, %xmm1
475	pmovmskb %xmm1, %edx
476	sub	$0xffff, %edx
477	jnz	LABEL(exit)
478
479#ifdef USE_AS_STRNCMP
480	sub	$16, %r11
481	jbe	LABEL(strcmp_exitz)
482#endif
483
484	add	$16, %rcx
485	movdqa	%xmm4, %xmm3
486	jmp	LABEL(loop_ashr_3)
487
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 13
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x1fff when they all match with no
	 * null char. Only then does gobble_ashr_3 load from the next page.
	 */
488	.p2align 4
489LABEL(nibble_ashr_3):
490	psrldq	$3, %xmm4
491	movdqa	(%rsi, %rcx), %xmm1
492	pcmpeqb	%xmm1, %xmm0
493	pcmpeqb	%xmm4, %xmm1
494	psubb	%xmm0, %xmm1
495	pmovmskb %xmm1, %edx
496	sub	$0x1fff, %edx
497	jnz	LABEL(exit)
498#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 13 bytes just matched */
499	cmp	$13, %r11
500	jbe	LABEL(strcmp_exitz)
501#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
502	pxor	%xmm0, %xmm0
503	sub	$0x1000, %r10		/* subtract 4K from %r10 */
504	jmp	LABEL(gobble_ashr_3)
505
506/*
507 * ashr_4 handles the following cases:
508 * 	abs(str1 offset - str2 offset) = 12
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 4 bytes (%xmm3) or'ed with the current block shifted left by 12 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
509 */
510	.p2align 4
511LABEL(ashr_4):
512	pxor	%xmm0, %xmm0
513	movdqa	(%rdi), %xmm2
514	movdqa	(%rsi), %xmm1
515	pcmpeqb	%xmm1, %xmm0
516	pslldq	$12, %xmm2
517	pcmpeqb	%xmm1, %xmm2
518	psubb	%xmm0, %xmm2
519	pmovmskb %xmm2, %r9d
520	shr	%cl, %edx
521	shr	%cl, %r9d
522	sub	%r9d, %edx
523	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
524	movdqa	(%rdi), %xmm3
525
526	UPDATE_STRNCMP_COUNTER
527
528	pxor	%xmm0, %xmm0
529	mov	$16, %rcx	/* index for loads */
530	mov	$4, %r9d	/* rdi bytes already examined. Used in exit code */
531	/*
532	 * Setup %r10 value allows us to detect crossing a page boundary.
533	 * When %r10 goes positive we are crossing a page boundary and
534	 * need to do a nibble.
535	 */
536	lea	4(%rdi), %r10
537	and	$0xfff, %r10	/* offset into 4K page */
538	sub	$0x1000, %r10	/* subtract 4K pagesize */
539	movdqa	%xmm3, %xmm4
540
541	.p2align 4
542LABEL(loop_ashr_4):
543	add	$16, %r10
544	jg	LABEL(nibble_ashr_4)
545
546LABEL(gobble_ashr_4):
547	movdqa	(%rsi, %rcx), %xmm1
548	movdqa	(%rdi, %rcx), %xmm2
549	movdqa	%xmm2, %xmm4
550
551	psrldq	$4, %xmm3
552	pslldq	$12, %xmm2
553	por	%xmm3, %xmm2
554
555	pcmpeqb	%xmm1, %xmm0
556	pcmpeqb	%xmm2, %xmm1
557	psubb	%xmm0, %xmm1
558	pmovmskb %xmm1, %edx
559	sub	$0xffff, %edx
560	jnz	LABEL(exit)
561
562#ifdef USE_AS_STRNCMP
563	sub	$16, %r11
564	jbe	LABEL(strcmp_exitz)
565#endif
566
567	add	$16, %rcx
568	movdqa	%xmm4, %xmm3
569
570	add	$16, %r10
571	jg	LABEL(nibble_ashr_4)	/* cross page boundary */
572
573	movdqa	(%rsi, %rcx), %xmm1
574	movdqa	(%rdi, %rcx), %xmm2
575	movdqa	%xmm2, %xmm4
576
577	psrldq	$4, %xmm3
578	pslldq 	$12, %xmm2
579	por	%xmm3, %xmm2
580
581	pcmpeqb	%xmm1, %xmm0
582	pcmpeqb	%xmm2, %xmm1
583	psubb	%xmm0, %xmm1
584	pmovmskb %xmm1, %edx
585	sub	$0xffff, %edx
586	jnz	LABEL(exit)
587
588#ifdef USE_AS_STRNCMP
589	sub	$16, %r11
590	jbe	LABEL(strcmp_exitz)
591#endif
592
593	add	$16, %rcx
594	movdqa	%xmm4, %xmm3
595	jmp	LABEL(loop_ashr_4)
596
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 12
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x0fff when they all match with no
	 * null char. Only then does gobble_ashr_4 load from the next page.
	 */
597	.p2align 4
598LABEL(nibble_ashr_4):
599	psrldq	$4, %xmm4
600	movdqa	(%rsi, %rcx), %xmm1
601	pcmpeqb	%xmm1, %xmm0
602	pcmpeqb	%xmm4, %xmm1
603	psubb	%xmm0, %xmm1
604	pmovmskb %xmm1, %edx
605	sub	$0x0fff, %edx
606	jnz	LABEL(exit)
607#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 12 bytes just matched */
608	cmp	$12, %r11
609	jbe	LABEL(strcmp_exitz)
610#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
611	pxor	%xmm0, %xmm0
612	sub	$0x1000, %r10		/* subtract 4K from %r10 */
613	jmp	LABEL(gobble_ashr_4)
614
615/*
616 * ashr_5 handles the following cases:
617 * 	abs(str1 offset - str2 offset) = 11
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 5 bytes (%xmm3) or'ed with the current block shifted left by 11 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
618 */
619	.p2align 4
620LABEL(ashr_5):
621	pxor	%xmm0, %xmm0
622	movdqa	(%rdi), %xmm2
623	movdqa	(%rsi), %xmm1
624	pcmpeqb	%xmm1, %xmm0
625	pslldq	$11, %xmm2
626	pcmpeqb	%xmm1, %xmm2
627	psubb	%xmm0, %xmm2
628	pmovmskb %xmm2, %r9d
629	shr	%cl, %edx
630	shr	%cl, %r9d
631	sub	%r9d, %edx
632	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
633	movdqa	(%rdi), %xmm3
634
635	UPDATE_STRNCMP_COUNTER
636
637	pxor	%xmm0, %xmm0
638	mov	$16, %rcx	/* index for loads */
639	mov	$5, %r9d	/* rdi bytes already examined. Used in exit code */
640	/*
641	 * Setup %r10 value allows us to detect crossing a page boundary.
642	 * When %r10 goes positive we are crossing a page boundary and
643	 * need to do a nibble.
644	 */
645	lea	5(%rdi), %r10
646	and	$0xfff, %r10	/* offset into 4K page */
647	sub	$0x1000, %r10	/* subtract 4K pagesize */
648	movdqa	%xmm3, %xmm4
649
650	.p2align 4
651LABEL(loop_ashr_5):
652	add	$16, %r10
653	jg	LABEL(nibble_ashr_5)
654
655LABEL(gobble_ashr_5):
656	movdqa	(%rsi, %rcx), %xmm1
657	movdqa	(%rdi, %rcx), %xmm2
658	movdqa	%xmm2, %xmm4
659
660	psrldq	$5, %xmm3
661	pslldq	$11, %xmm2
662	por	%xmm3, %xmm2
663
664	pcmpeqb	%xmm1, %xmm0
665	pcmpeqb	%xmm2, %xmm1
666	psubb	%xmm0, %xmm1
667	pmovmskb %xmm1, %edx
668	sub	$0xffff, %edx
669	jnz	LABEL(exit)
670
671#ifdef USE_AS_STRNCMP
672	sub	$16, %r11
673	jbe	LABEL(strcmp_exitz)
674#endif
675
676	add	$16, %rcx
677	movdqa	%xmm4, %xmm3
678
679	add	$16, %r10
680	jg	LABEL(nibble_ashr_5)	/* cross page boundary */
681
682	movdqa	(%rsi, %rcx), %xmm1
683	movdqa	(%rdi, %rcx), %xmm2
684	movdqa	%xmm2, %xmm4
685
686	psrldq	$5, %xmm3
687	pslldq 	$11, %xmm2
688	por	%xmm3, %xmm2
689
690	pcmpeqb	%xmm1, %xmm0
691	pcmpeqb	%xmm2, %xmm1
692	psubb	%xmm0, %xmm1
693	pmovmskb %xmm1, %edx
694	sub	$0xffff, %edx
695	jnz	LABEL(exit)
696
697#ifdef USE_AS_STRNCMP
698	sub	$16, %r11
699	jbe	LABEL(strcmp_exitz)
700#endif
701
702	add	$16, %rcx
703	movdqa	%xmm4, %xmm3
704	jmp	LABEL(loop_ashr_5)
705
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 11
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x07ff when they all match with no
	 * null char. Only then does gobble_ashr_5 load from the next page.
	 */
706	.p2align 4
707LABEL(nibble_ashr_5):
708	psrldq	$5, %xmm4
709	movdqa	(%rsi, %rcx), %xmm1
710	pcmpeqb	%xmm1, %xmm0
711	pcmpeqb	%xmm4, %xmm1
712	psubb	%xmm0, %xmm1
713	pmovmskb %xmm1, %edx
714	sub	$0x07ff, %edx
715	jnz	LABEL(exit)
716#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 11 bytes just matched */
717	cmp	$11, %r11
718	jbe	LABEL(strcmp_exitz)
719#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
720 	pxor	%xmm0, %xmm0
721	sub	$0x1000, %r10		/* subtract 4K from %r10 */
722	jmp	LABEL(gobble_ashr_5)
723
724/*
725 * ashr_6 handles the following cases:
726 * 	abs(str1 offset - str2 offset) = 10
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 6 bytes (%xmm3) or'ed with the current block shifted left by 10 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
727 */
728	.p2align 4
729LABEL(ashr_6):
730	pxor	%xmm0, %xmm0
731	movdqa	(%rdi), %xmm2
732	movdqa	(%rsi), %xmm1
733	pcmpeqb	%xmm1, %xmm0
734	pslldq	$10, %xmm2
735	pcmpeqb	%xmm1, %xmm2
736	psubb	%xmm0, %xmm2
737	pmovmskb %xmm2, %r9d
738	shr	%cl, %edx
739	shr	%cl, %r9d
740	sub	%r9d, %edx
741	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
742	movdqa	(%rdi), %xmm3
743
744	UPDATE_STRNCMP_COUNTER
745
746	pxor	%xmm0, %xmm0
747	mov	$16, %rcx	/* index for loads */
748	mov	$6, %r9d	/* rdi bytes already examined. Used in exit code */
749	/*
750	 * Setup %r10 value allows us to detect crossing a page boundary.
751	 * When %r10 goes positive we are crossing a page boundary and
752	 * need to do a nibble.
753	 */
754	lea	6(%rdi), %r10
755	and	$0xfff, %r10	/* offset into 4K page */
756	sub	$0x1000, %r10	/* subtract 4K pagesize */
757	movdqa	%xmm3, %xmm4
758
759	.p2align 4
760LABEL(loop_ashr_6):
761	add	$16, %r10
762	jg	LABEL(nibble_ashr_6)
763
764LABEL(gobble_ashr_6):
765	movdqa	(%rsi, %rcx), %xmm1
766	movdqa	(%rdi, %rcx), %xmm2
767	movdqa	%xmm2, %xmm4
768
769	psrldq	$6, %xmm3
770	pslldq	$10, %xmm2
771	por	%xmm3, %xmm2
772
773	pcmpeqb	%xmm1, %xmm0
774	pcmpeqb	%xmm2, %xmm1
775	psubb	%xmm0, %xmm1
776	pmovmskb %xmm1, %edx
777	sub	$0xffff, %edx
778	jnz	LABEL(exit)
779
780#ifdef USE_AS_STRNCMP
781	sub	$16, %r11
782	jbe	LABEL(strcmp_exitz)
783#endif
784
785	add	$16, %rcx
786	movdqa	%xmm4, %xmm3
787
788	add	$16, %r10
789	jg	LABEL(nibble_ashr_6)	/* cross page boundary */
790
791	movdqa	(%rsi, %rcx), %xmm1
792	movdqa	(%rdi, %rcx), %xmm2
793	movdqa	%xmm2, %xmm4
794
795	psrldq	$6, %xmm3
796	pslldq 	$10, %xmm2
797	por	%xmm3, %xmm2
798
799	pcmpeqb	%xmm1, %xmm0
800	pcmpeqb	%xmm2, %xmm1
801	psubb	%xmm0, %xmm1
802	pmovmskb %xmm1, %edx
803	sub	$0xffff, %edx
804	jnz	LABEL(exit)
805
806#ifdef USE_AS_STRNCMP
807	sub	$16, %r11
808	jbe	LABEL(strcmp_exitz)
809#endif
810
811	add	$16, %rcx
812	movdqa	%xmm4, %xmm3
813	jmp	LABEL(loop_ashr_6)
814
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 10
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x03ff when they all match with no
	 * null char. Only then does gobble_ashr_6 load from the next page.
	 */
815	.p2align 4
816LABEL(nibble_ashr_6):
817	psrldq	$6, %xmm4
818	movdqa	(%rsi, %rcx), %xmm1
819	pcmpeqb	%xmm1, %xmm0
820	pcmpeqb	%xmm4, %xmm1
821	psubb	%xmm0, %xmm1
822	pmovmskb %xmm1, %edx
823	sub	$0x03ff, %edx
824	jnz	LABEL(exit)
825#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 10 bytes just matched */
826	cmp	$10, %r11
827	jbe	LABEL(strcmp_exitz)
828#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
829 	pxor	%xmm0, %xmm0
830	sub	$0x1000, %r10		/* subtract 4K from %r10 */
831	jmp	LABEL(gobble_ashr_6)
832
833/*
834 * ashr_7 handles the following cases:
835 * 	abs(str1 offset - str2 offset) = 9
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 7 bytes (%xmm3) or'ed with the current block shifted left by 9 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
836 */
837	.p2align 4
838LABEL(ashr_7):
839	pxor	%xmm0, %xmm0
840	movdqa	(%rdi), %xmm2
841	movdqa	(%rsi), %xmm1
842	pcmpeqb	%xmm1, %xmm0
843	pslldq	$9, %xmm2
844	pcmpeqb	%xmm1, %xmm2
845	psubb	%xmm0, %xmm2
846	pmovmskb %xmm2, %r9d
847	shr	%cl, %edx
848	shr	%cl, %r9d
849	sub	%r9d, %edx
850	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
851	movdqa	(%rdi), %xmm3
852
853	UPDATE_STRNCMP_COUNTER
854
855	pxor	%xmm0, %xmm0
856	mov	$16, %rcx	/* index for loads */
857	mov	$7, %r9d	/* rdi bytes already examined. Used in exit code */
858	/*
859	 * Setup %r10 value allows us to detect crossing a page boundary.
860	 * When %r10 goes positive we are crossing a page boundary and
861	 * need to do a nibble.
862	 */
863	lea	7(%rdi), %r10
864	and	$0xfff, %r10	/* offset into 4K page */
865	sub	$0x1000, %r10	/* subtract 4K pagesize */
866	movdqa	%xmm3, %xmm4
867
868	.p2align 4
869LABEL(loop_ashr_7):
870	add	$16, %r10
871	jg	LABEL(nibble_ashr_7)
872
873LABEL(gobble_ashr_7):
874	movdqa	(%rsi, %rcx), %xmm1
875	movdqa	(%rdi, %rcx), %xmm2
876	movdqa	%xmm2, %xmm4
877
878	psrldq	$7, %xmm3
879	pslldq	$9, %xmm2
880	por	%xmm3, %xmm2
881
882	pcmpeqb	%xmm1, %xmm0
883	pcmpeqb	%xmm2, %xmm1
884	psubb	%xmm0, %xmm1
885	pmovmskb %xmm1, %edx
886	sub	$0xffff, %edx
887	jnz	LABEL(exit)
888
889#ifdef USE_AS_STRNCMP
890	sub	$16, %r11
891	jbe	LABEL(strcmp_exitz)
892#endif
893
894	add	$16, %rcx
895	movdqa	%xmm4, %xmm3
896
897	add	$16, %r10
898	jg	LABEL(nibble_ashr_7)	/* cross page boundary */
899
900	movdqa	(%rsi, %rcx), %xmm1
901	movdqa	(%rdi, %rcx), %xmm2
902	movdqa	%xmm2, %xmm4
903
904	psrldq	$7, %xmm3
905	pslldq 	$9, %xmm2
906	por	%xmm3, %xmm2
907
908	pcmpeqb	%xmm1, %xmm0
909	pcmpeqb	%xmm2, %xmm1
910	psubb	%xmm0, %xmm1
911	pmovmskb %xmm1, %edx
912	sub	$0xffff, %edx
913	jnz	LABEL(exit)
914
915#ifdef USE_AS_STRNCMP
916	sub	$16, %r11
917	jbe	LABEL(strcmp_exitz)
918#endif
919
920	add	$16, %rcx
921	movdqa	%xmm4, %xmm3
922	jmp	LABEL(loop_ashr_7)
923
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 9
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x01ff when they all match with no
	 * null char. Only then does gobble_ashr_7 load from the next page.
	 */
924	.p2align 4
925LABEL(nibble_ashr_7):
926	psrldq	$7, %xmm4
927	movdqa	(%rsi, %rcx), %xmm1
928	pcmpeqb	%xmm1, %xmm0
929	pcmpeqb	%xmm4, %xmm1
930	psubb	%xmm0, %xmm1
931	pmovmskb %xmm1, %edx
932	sub	$0x01ff, %edx
933	jnz	LABEL(exit)
934#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 9 bytes just matched */
935	cmp	$9, %r11
936	jbe	LABEL(strcmp_exitz)
937#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
938 	pxor	%xmm0, %xmm0
939	sub	$0x1000, %r10		/* subtract 4K from %r10 */
940	jmp	LABEL(gobble_ashr_7)
941
942/*
943 * ashr_8 handles the following cases:
944 * 	abs(str1 offset - str2 offset) = 8
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 8 bytes (%xmm3) or'ed with the current block shifted left by 8 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
945 */
946	.p2align 4
947LABEL(ashr_8):
948	pxor	%xmm0, %xmm0
949	movdqa	(%rdi), %xmm2
950	movdqa	(%rsi), %xmm1
951	pcmpeqb	%xmm1, %xmm0
952	pslldq	$8, %xmm2
953	pcmpeqb	%xmm1, %xmm2
954	psubb	%xmm0, %xmm2
955	pmovmskb %xmm2, %r9d
956	shr	%cl, %edx
957	shr	%cl, %r9d
958	sub	%r9d, %edx
959	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
960	movdqa	(%rdi), %xmm3
961
962	UPDATE_STRNCMP_COUNTER
963
964	pxor	%xmm0, %xmm0
965	mov	$16, %rcx	/* index for loads */
966	mov	$8, %r9d	/* rdi bytes already examined. Used in exit code */
967	/*
968	 * Setup %r10 value allows us to detect crossing a page boundary.
969	 * When %r10 goes positive we are crossing a page boundary and
970	 * need to do a nibble.
971	 */
972	lea	8(%rdi), %r10
973	and	$0xfff, %r10	/* offset into 4K page */
974	sub	$0x1000, %r10	/* subtract 4K pagesize */
975	movdqa	%xmm3, %xmm4
976
977	.p2align 4
978LABEL(loop_ashr_8):
979	add	$16, %r10
980	jg	LABEL(nibble_ashr_8)
981
982LABEL(gobble_ashr_8):
983	movdqa	(%rsi, %rcx), %xmm1
984	movdqa	(%rdi, %rcx), %xmm2
985	movdqa	%xmm2, %xmm4
986
987	psrldq	$8, %xmm3
988	pslldq	$8, %xmm2
989	por	%xmm3, %xmm2
990
991	pcmpeqb	%xmm1, %xmm0
992	pcmpeqb	%xmm2, %xmm1
993	psubb	%xmm0, %xmm1
994	pmovmskb %xmm1, %edx
995	sub	$0xffff, %edx
996	jnz	LABEL(exit)
997
998#ifdef USE_AS_STRNCMP
999	sub	$16, %r11
1000	jbe	LABEL(strcmp_exitz)
1001#endif
1002
1003	add	$16, %rcx
1004	movdqa	%xmm4, %xmm3
1005
1006	add	$16, %r10
1007	jg	LABEL(nibble_ashr_8)	/* cross page boundary */
1008
1009	movdqa	(%rsi, %rcx), %xmm1
1010	movdqa	(%rdi, %rcx), %xmm2
1011	movdqa	%xmm2, %xmm4
1012
1013	psrldq	$8, %xmm3
1014	pslldq 	$8, %xmm2
1015	por	%xmm3, %xmm2
1016
1017	pcmpeqb	%xmm1, %xmm0
1018	pcmpeqb	%xmm2, %xmm1
1019	psubb	%xmm0, %xmm1
1020	pmovmskb %xmm1, %edx
1021	sub	$0xffff, %edx
1022	jnz	LABEL(exit)
1023
1024#ifdef USE_AS_STRNCMP
1025	sub	$16, %r11
1026	jbe	LABEL(strcmp_exitz)
1027#endif
1028
1029	add	$16, %rcx
1030	movdqa	%xmm4, %xmm3
1031	jmp	LABEL(loop_ashr_8)
1032
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 8
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x00ff when they all match with no
	 * null char. Only then does gobble_ashr_8 load from the next page.
	 */
1033	.p2align 4
1034LABEL(nibble_ashr_8):
1035	psrldq	$8, %xmm4
1036	movdqa	(%rsi, %rcx), %xmm1
1037	pcmpeqb	%xmm1, %xmm0
1038	pcmpeqb	%xmm4, %xmm1
1039	psubb	%xmm0, %xmm1
1040	pmovmskb %xmm1, %edx
1041	sub	$0x00ff, %edx
1042	jnz	LABEL(exit)
1043#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 8 bytes just matched */
1044	cmp	$8, %r11
1045	jbe	LABEL(strcmp_exitz)
1046#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
1047 	pxor	%xmm0, %xmm0
1048	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1049	jmp	LABEL(gobble_ashr_8)
1050
1051/*
1052 * ashr_9 handles the following cases:
1053 * 	abs(str1 offset - str2 offset) = 7
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 9 bytes (%xmm3) or'ed with the current block shifted left by 7 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
1054 */
1055	.p2align 4
1056LABEL(ashr_9):
1057	pxor	%xmm0, %xmm0
1058	movdqa	(%rdi), %xmm2
1059	movdqa	(%rsi), %xmm1
1060	pcmpeqb	%xmm1, %xmm0
1061	pslldq	$7, %xmm2
1062	pcmpeqb	%xmm1, %xmm2
1063	psubb	%xmm0, %xmm2
1064	pmovmskb %xmm2, %r9d
1065	shr	%cl, %edx
1066	shr	%cl, %r9d
1067	sub	%r9d, %edx
1068	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
1069	movdqa	(%rdi), %xmm3
1070
1071	UPDATE_STRNCMP_COUNTER
1072
1073	pxor	%xmm0, %xmm0
1074	mov	$16, %rcx	/* index for loads */
1075	mov	$9, %r9d	/* rdi bytes already examined. Used in exit code */
1076	/*
1077	 * Setup %r10 value allows us to detect crossing a page boundary.
1078	 * When %r10 goes positive we are crossing a page boundary and
1079	 * need to do a nibble.
1080	 */
1081	lea	9(%rdi), %r10
1082	and	$0xfff, %r10	/* offset into 4K page */
1083	sub	$0x1000, %r10	/* subtract 4K pagesize */
1084	movdqa	%xmm3, %xmm4
1085
1086	.p2align 4
1087LABEL(loop_ashr_9):
1088	add	$16, %r10
1089	jg	LABEL(nibble_ashr_9)
1090
1091LABEL(gobble_ashr_9):
1092	movdqa	(%rsi, %rcx), %xmm1
1093	movdqa	(%rdi, %rcx), %xmm2
1094	movdqa	%xmm2, %xmm4
1095
1096	psrldq	$9, %xmm3
1097	pslldq	$7, %xmm2
1098	por	%xmm3, %xmm2
1099
1100	pcmpeqb	%xmm1, %xmm0
1101	pcmpeqb	%xmm2, %xmm1
1102	psubb	%xmm0, %xmm1
1103	pmovmskb %xmm1, %edx
1104	sub	$0xffff, %edx
1105	jnz	LABEL(exit)
1106
1107#ifdef USE_AS_STRNCMP
1108	sub	$16, %r11
1109	jbe	LABEL(strcmp_exitz)
1110#endif
1111
1112	add	$16, %rcx
1113	movdqa	%xmm4, %xmm3
1114
1115	add	$16, %r10
1116	jg	LABEL(nibble_ashr_9)	/* cross page boundary */
1117
1118	movdqa	(%rsi, %rcx), %xmm1
1119	movdqa	(%rdi, %rcx), %xmm2
1120	movdqa	%xmm2, %xmm4
1121
1122	psrldq	$9, %xmm3
1123	pslldq 	$7, %xmm2
1124	por	%xmm3, %xmm2
1125
1126	pcmpeqb	%xmm1, %xmm0
1127	pcmpeqb	%xmm2, %xmm1
1128	psubb	%xmm0, %xmm1
1129	pmovmskb %xmm1, %edx
1130	sub	$0xffff, %edx
1131	jnz	LABEL(exit)
1132
1133#ifdef USE_AS_STRNCMP
1134	sub	$16, %r11
1135	jbe	LABEL(strcmp_exitz)
1136#endif
1137
1138	add	$16, %rcx
1139	movdqa	%xmm4, %xmm3		/* store for next cycle */
1140	jmp	LABEL(loop_ashr_9)
1141
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 7
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x007f when they all match with no
	 * null char. Only then does gobble_ashr_9 load from the next page.
	 */
1142	.p2align 4
1143LABEL(nibble_ashr_9):
1144	psrldq	$9, %xmm4
1145	movdqa	(%rsi, %rcx), %xmm1
1146	pcmpeqb	%xmm1, %xmm0
1147	pcmpeqb	%xmm4, %xmm1
1148	psubb	%xmm0, %xmm1
1149	pmovmskb %xmm1, %edx
1150	sub	$0x007f, %edx
1151	jnz	LABEL(exit)
1152#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 7 bytes just matched */
1153	cmp	$7, %r11
1154	jbe	LABEL(strcmp_exitz)
1155#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
1156 	pxor	%xmm0, %xmm0
1157	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1158	jmp	LABEL(gobble_ashr_9)
1159
1160/*
1161 * ashr_10 handles the following cases:
1162 * 	abs(str1 offset - str2 offset) = 6
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 10 bytes (%xmm3) or'ed with the current block shifted left by 6 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
1163 */
1164	.p2align 4
1165LABEL(ashr_10):
1166	pxor	%xmm0, %xmm0
1167	movdqa	(%rdi), %xmm2
1168	movdqa	(%rsi), %xmm1
1169	pcmpeqb	%xmm1, %xmm0
1170	pslldq	$6, %xmm2
1171	pcmpeqb	%xmm1, %xmm2
1172	psubb	%xmm0, %xmm2
1173	pmovmskb %xmm2, %r9d
1174	shr	%cl, %edx
1175	shr	%cl, %r9d
1176	sub	%r9d, %edx
1177	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
1178	movdqa	(%rdi), %xmm3
1179
1180	UPDATE_STRNCMP_COUNTER
1181
1182	pxor	%xmm0, %xmm0
1183	mov	$16, %rcx	/* index for loads */
1184	mov	$10, %r9d	/* rdi bytes already examined. Used in exit code */
1185	/*
1186	 * Setup %r10 value allows us to detect crossing a page boundary.
1187	 * When %r10 goes positive we are crossing a page boundary and
1188	 * need to do a nibble.
1189	 */
1190	lea	10(%rdi), %r10
1191	and	$0xfff, %r10	/* offset into 4K page */
1192	sub	$0x1000, %r10	/* subtract 4K pagesize */
1193	movdqa	%xmm3, %xmm4
1194
1195	.p2align 4
1196LABEL(loop_ashr_10):
1197	add	$16, %r10
1198	jg	LABEL(nibble_ashr_10)
1199
1200LABEL(gobble_ashr_10):
1201	movdqa	(%rsi, %rcx), %xmm1
1202	movdqa	(%rdi, %rcx), %xmm2
1203	movdqa	%xmm2, %xmm4
1204
1205	psrldq	$10, %xmm3
1206	pslldq	$6, %xmm2
1207	por	%xmm3, %xmm2
1208
1209	pcmpeqb	%xmm1, %xmm0
1210	pcmpeqb	%xmm2, %xmm1
1211	psubb	%xmm0, %xmm1
1212	pmovmskb %xmm1, %edx
1213	sub	$0xffff, %edx
1214	jnz	LABEL(exit)
1215
1216#ifdef USE_AS_STRNCMP
1217	sub	$16, %r11
1218	jbe	LABEL(strcmp_exitz)
1219#endif
1220
1221	add	$16, %rcx
1222	movdqa	%xmm4, %xmm3
1223
1224	add	$16, %r10
1225	jg	LABEL(nibble_ashr_10)	/* cross page boundary */
1226
1227	movdqa	(%rsi, %rcx), %xmm1
1228	movdqa	(%rdi, %rcx), %xmm2
1229	movdqa	%xmm2, %xmm4
1230
1231	psrldq	$10, %xmm3
1232	pslldq 	$6, %xmm2
1233	por	%xmm3, %xmm2
1234
1235	pcmpeqb	%xmm1, %xmm0
1236	pcmpeqb	%xmm2, %xmm1
1237	psubb	%xmm0, %xmm1
1238	pmovmskb %xmm1, %edx
1239	sub	$0xffff, %edx
1240	jnz	LABEL(exit)
1241
1242#ifdef USE_AS_STRNCMP
1243	sub	$16, %r11
1244	jbe	LABEL(strcmp_exitz)
1245#endif
1246
1247	add	$16, %rcx
1248	movdqa	%xmm4, %xmm3
1249	jmp	LABEL(loop_ashr_10)
1250
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 6
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x003f when they all match with no
	 * null char. Only then does gobble_ashr_10 load from the next page.
	 */
1251	.p2align 4
1252LABEL(nibble_ashr_10):
1253	psrldq	$10, %xmm4
1254	movdqa	(%rsi, %rcx), %xmm1
1255	pcmpeqb	%xmm1, %xmm0
1256	pcmpeqb	%xmm4, %xmm1
1257	psubb	%xmm0, %xmm1
1258	pmovmskb %xmm1, %edx
1259	sub	$0x003f, %edx
1260	jnz	LABEL(exit)
1261#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 6 bytes just matched */
1262	cmp	$6, %r11
1263	jbe	LABEL(strcmp_exitz)
1264#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
1265 	pxor	%xmm0, %xmm0
1266	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1267	jmp	LABEL(gobble_ashr_10)
1268
1269/*
1270 * ashr_11 handles the following cases:
1271 * 	abs(str1 offset - str2 offset) = 5
 *
 * Presumably reached through the unaligned_table jump in LABEL(bigger), so
 * %rsi/%rdi are rounded down to 16-byte boundaries, %ecx is the larger
 * offset and %edx is 0xffff.
 *
 * The first block is checked only from offset %cl upward (both masks are
 * shifted right by %cl); a difference there goes to less32bytes. After
 * that, each aligned 16-byte block of %rsi is compared with 16 bytes
 * spliced from two adjacent %rdi blocks: the previous block shifted right
 * by 11 bytes (%xmm3) or'ed with the current block shifted left by 5 bytes
 * (%xmm2). %xmm4 keeps the unshifted current %rdi block for the next
 * round. The loop body is unrolled twice.
1272 */
1273	.p2align 4
1274LABEL(ashr_11):
1275	pxor	%xmm0, %xmm0
1276	movdqa	(%rdi), %xmm2
1277	movdqa	(%rsi), %xmm1
1278	pcmpeqb	%xmm1, %xmm0
1279	pslldq	$5, %xmm2
1280	pcmpeqb	%xmm1, %xmm2
1281	psubb	%xmm0, %xmm2
1282	pmovmskb %xmm2, %r9d
1283	shr	%cl, %edx
1284	shr	%cl, %r9d
1285	sub	%r9d, %edx
1286	jnz	LABEL(less32bytes)
	/* first block matched: keep it in %xmm3 as the previous %rdi block */
1287	movdqa	(%rdi), %xmm3
1288
1289	UPDATE_STRNCMP_COUNTER
1290
1291	pxor	%xmm0, %xmm0
1292	mov	$16, %rcx	/* index for loads */
1293	mov	$11, %r9d	/* rdi bytes already examined. Used in exit code */
1294	/*
1295	 * Setup %r10 value allows us to detect crossing a page boundary.
1296	 * When %r10 goes positive we are crossing a page boundary and
1297	 * need to do a nibble.
1298	 */
1299	lea	11(%rdi), %r10
1300	and	$0xfff, %r10	/* offset into 4K page */
1301	sub	$0x1000, %r10	/* subtract 4K pagesize */
1302	movdqa	%xmm3, %xmm4
1303
1304	.p2align 4
1305LABEL(loop_ashr_11):
1306	add	$16, %r10
1307	jg	LABEL(nibble_ashr_11)
1308
1309LABEL(gobble_ashr_11):
1310	movdqa	(%rsi, %rcx), %xmm1
1311	movdqa	(%rdi, %rcx), %xmm2
1312	movdqa	%xmm2, %xmm4
1313
1314	psrldq	$11, %xmm3
1315	pslldq	$5, %xmm2
1316	por	%xmm3, %xmm2
1317
1318	pcmpeqb	%xmm1, %xmm0
1319	pcmpeqb	%xmm2, %xmm1
1320	psubb	%xmm0, %xmm1
1321	pmovmskb %xmm1, %edx
1322	sub	$0xffff, %edx
1323	jnz	LABEL(exit)
1324
1325#ifdef USE_AS_STRNCMP
1326	sub	$16, %r11
1327	jbe	LABEL(strcmp_exitz)
1328#endif
1329
1330	add	$16, %rcx
1331	movdqa	%xmm4, %xmm3
1332
1333	add	$16, %r10
1334	jg	LABEL(nibble_ashr_11)	/* cross page boundary */
1335
1336	movdqa	(%rsi, %rcx), %xmm1
1337	movdqa	(%rdi, %rcx), %xmm2
1338	movdqa	%xmm2, %xmm4
1339
1340	psrldq	$11, %xmm3
1341	pslldq 	$5, %xmm2
1342	por	%xmm3, %xmm2
1343
1344	pcmpeqb	%xmm1, %xmm0
1345	pcmpeqb	%xmm2, %xmm1
1346	psubb	%xmm0, %xmm1
1347	pmovmskb %xmm1, %edx
1348	sub	$0xffff, %edx
1349	jnz	LABEL(exit)
1350
1351#ifdef USE_AS_STRNCMP
1352	sub	$16, %r11
1353	jbe	LABEL(strcmp_exitz)
1354#endif
1355
1356	add	$16, %rcx
1357	movdqa	%xmm4, %xmm3
1358	jmp	LABEL(loop_ashr_11)
1359
	/*
	 * The next %rdi load would start a new 4K page. Compare only the 5
	 * bytes of the %rdi block still held in %xmm4 with the start of the
	 * next %rsi block; the mask is 0x001f when they all match with no
	 * null char. Only then does gobble_ashr_11 load from the next page.
	 */
1360	.p2align 4
1361LABEL(nibble_ashr_11):
1362	psrldq	$11, %xmm4
1363	movdqa	(%rsi, %rcx), %xmm1
1364	pcmpeqb	%xmm1, %xmm0
1365	pcmpeqb	%xmm4, %xmm1
1366	psubb	%xmm0, %xmm1
1367	pmovmskb %xmm1, %edx
1368	sub	$0x001f, %edx
1369	jnz	LABEL(exit)
1370#ifdef USE_AS_STRNCMP
	/* the remaining count ends within the 5 bytes just matched */
1371	cmp	$5, %r11
1372	jbe	LABEL(strcmp_exitz)
1373#endif
	/* the uncompared top bytes of the %rsi block may have set %xmm0 */
1374 	pxor	%xmm0, %xmm0
1375	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1376	jmp	LABEL(gobble_ashr_11)
1377
1378/*
1379 * ashr_12 handles the following cases:
1380 * 	abs(str1 offset - str2 offset) = 4
 *
 * The first 16-byte block of %rdi is shifted up by 4 bytes and compared
 * with the first block of %rsi; %cl and %edx are prepared by code before
 * this label.  If that check passes, the loop below compares 16 bytes
 * per step (unrolled twice), building each %rdi operand from two
 * neighbouring blocks: (previous >> 12 bytes) | (next << 4 bytes).
 * nibble_ashr_12 is taken whenever %r10 turns positive.
1381 */
1382	.p2align 4
1383LABEL(ashr_12):
1384	pxor	%xmm0, %xmm0		/* xmm0 = 0 for the NUL test */
1385	movdqa	(%rdi), %xmm2
1386	movdqa	(%rsi), %xmm1
1387	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1388	pslldq	$4, %xmm2		/* shift %rdi bytes up by 4, zero-filling the low 4 */
1389	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
1390	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-NUL bytes */
1391	pmovmskb %xmm2, %r9d
1392	shr	%cl, %edx		/* %cl and %edx are set before this label */
1393	shr	%cl, %r9d
1394	sub	%r9d, %edx
1395	jnz	LABEL(less32bytes)	/* shifted masks differ */
1396	movdqa	(%rdi), %xmm3		/* becomes the "previous" %rdi block of the loop */
1397
1398	UPDATE_STRNCMP_COUNTER		/* strncmp only; expands to nothing for strcmp */
1399
1400	pxor	%xmm0, %xmm0
1401	mov	$16, %rcx	/* index for loads */
1402	mov	$12, %r9d	/* rdi bytes already examined. Used in exit code */
1403	/*
1404	 * Setting up %r10 this way lets us detect crossing a page boundary.
1405	 * When %r10 goes positive we are crossing a page boundary and
1406	 * need to do a nibble.
1407	 */
1408	lea	12(%rdi), %r10
1409	and	$0xfff, %r10	/* offset into 4K page */
1410	sub	$0x1000, %r10	/* subtract 4K pagesize */
1411	movdqa	%xmm3, %xmm4		/* copy that nibble_ashr_12 shifts */
1412
1413	.p2align 4
1414LABEL(loop_ashr_12):
1415	add	$16, %r10
1416	jg	LABEL(nibble_ashr_12)	/* %r10 > 0: take the page-boundary path */
1417
1418LABEL(gobble_ashr_12):
1419	movdqa	(%rsi, %rcx), %xmm1
1420	movdqa	(%rdi, %rcx), %xmm2
1421	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next step */
1422
1423	psrldq	$12, %xmm3		/* top 4 bytes of the previous %rdi block */
1424	pslldq	$4, %xmm2		/* low 12 bytes of the new %rdi block */
1425	por	%xmm3, %xmm2		/* merged operand compared against %xmm1 */
1426
1427	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1428	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
1429	psubb	%xmm0, %xmm1		/* top bit survives only for equal, non-NUL bytes */
1430	pmovmskb %xmm1, %edx
1431	sub	$0xffff, %edx		/* zero iff all 16 bytes are equal and non-NUL */
1432	jnz	LABEL(exit)
1433
1434#ifdef USE_AS_STRNCMP
1435	sub	$16, %r11
1436	jbe	LABEL(strcmp_exitz)	/* no more than 16 bytes were left to compare */
1437#endif
1438
1439	add	$16, %rcx
1440	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */
1441
1442	add	$16, %r10
1443	jg	LABEL(nibble_ashr_12)	/* cross page boundary */
1444
	/* Second unrolled copy; xmm0 is still all zero because no NUL was seen. */
1445	movdqa	(%rsi, %rcx), %xmm1
1446	movdqa	(%rdi, %rcx), %xmm2
1447	movdqa	%xmm2, %xmm4
1448
1449	psrldq	$12, %xmm3
1450	pslldq 	$4, %xmm2
1451	por	%xmm3, %xmm2
1452
1453	pcmpeqb	%xmm1, %xmm0
1454	pcmpeqb	%xmm2, %xmm1
1455	psubb	%xmm0, %xmm1
1456	pmovmskb %xmm1, %edx
1457	sub	$0xffff, %edx
1458	jnz	LABEL(exit)
1459
1460#ifdef USE_AS_STRNCMP
1461	sub	$16, %r11
1462	jbe	LABEL(strcmp_exitz)
1463#endif
1464
1465	add	$16, %rcx
1466	movdqa	%xmm4, %xmm3
1467	jmp	LABEL(loop_ashr_12)
1468
1469	.p2align 4
1470LABEL(nibble_ashr_12):
1471	psrldq	$12, %xmm4		/* last 4 bytes of the current %rdi block, zeros above */
1472	movdqa	(%rsi, %rcx), %xmm1
1473	pcmpeqb	%xmm1, %xmm0
1474	pcmpeqb	%xmm4, %xmm1
1475	psubb	%xmm0, %xmm1
1476	pmovmskb %xmm1, %edx
1477	sub	$0x000f, %edx		/* zero iff those 4 bytes are equal and non-NUL */
1478	jnz	LABEL(exit)
1479#ifdef USE_AS_STRNCMP
1480	cmp	$4, %r11
1481	jbe	LABEL(strcmp_exitz)	/* no more than 4 bytes left to compare */
1482#endif
1483 	pxor	%xmm0, %xmm0		/* clear the NUL mask before rejoining the loop */
1484	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1485	jmp	LABEL(gobble_ashr_12)
1486
1487/*
1488 * ashr_13 handles the following cases:
1489 * 	abs(str1 offset - str2 offset) = 3
 *
 * The first 16-byte block of %rdi is shifted up by 3 bytes and compared
 * with the first block of %rsi; %cl and %edx are prepared by code before
 * this label.  If that check passes, the loop below compares 16 bytes
 * per step (unrolled twice), building each %rdi operand from two
 * neighbouring blocks: (previous >> 13 bytes) | (next << 3 bytes).
 * nibble_ashr_13 is taken whenever %r10 turns positive.
1490 */
1491	.p2align 4
1492LABEL(ashr_13):
1493	pxor	%xmm0, %xmm0		/* xmm0 = 0 for the NUL test */
1494	movdqa	(%rdi), %xmm2
1495	movdqa	(%rsi), %xmm1
1496	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1497	pslldq	$3, %xmm2		/* shift %rdi bytes up by 3, zero-filling the low 3 */
1498	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
1499	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-NUL bytes */
1500	pmovmskb %xmm2, %r9d
1501	shr	%cl, %edx		/* %cl and %edx are set before this label */
1502	shr	%cl, %r9d
1503	sub	%r9d, %edx
1504	jnz	LABEL(less32bytes)	/* shifted masks differ */
1505	movdqa	(%rdi), %xmm3		/* becomes the "previous" %rdi block of the loop */
1506
1507	UPDATE_STRNCMP_COUNTER		/* strncmp only; expands to nothing for strcmp */
1508
1509	pxor	%xmm0, %xmm0
1510	mov	$16, %rcx	/* index for loads */
1511	mov	$13, %r9d	/* rdi bytes already examined. Used in exit code */
1512	/*
1513	 * Setting up %r10 this way lets us detect crossing a page boundary.
1514	 * When %r10 goes positive we are crossing a page boundary and
1515	 * need to do a nibble.
1516	 */
1517	lea	13(%rdi), %r10
1518	and	$0xfff, %r10	/* offset into 4K page */
1519	sub	$0x1000, %r10	/* subtract 4K pagesize */
1520	movdqa	%xmm3, %xmm4		/* copy that nibble_ashr_13 shifts */
1521
1522	.p2align 4
1523LABEL(loop_ashr_13):
1524	add	$16, %r10
1525	jg	LABEL(nibble_ashr_13)	/* %r10 > 0: take the page-boundary path */
1526
1527LABEL(gobble_ashr_13):
1528	movdqa	(%rsi, %rcx), %xmm1
1529	movdqa	(%rdi, %rcx), %xmm2
1530	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next step */
1531
1532	psrldq	$13, %xmm3		/* top 3 bytes of the previous %rdi block */
1533	pslldq	$3, %xmm2		/* low 13 bytes of the new %rdi block */
1534	por	%xmm3, %xmm2		/* merged operand compared against %xmm1 */
1535
1536	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1537	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
1538	psubb	%xmm0, %xmm1		/* top bit survives only for equal, non-NUL bytes */
1539	pmovmskb %xmm1, %edx
1540	sub	$0xffff, %edx		/* zero iff all 16 bytes are equal and non-NUL */
1541	jnz	LABEL(exit)
1542
1543#ifdef USE_AS_STRNCMP
1544	sub	$16, %r11
1545	jbe	LABEL(strcmp_exitz)	/* no more than 16 bytes were left to compare */
1546#endif
1547
1548	add	$16, %rcx
1549	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */
1550
1551	add	$16, %r10
1552	jg	LABEL(nibble_ashr_13)	/* cross page boundary */
1553
	/* Second unrolled copy; xmm0 is still all zero because no NUL was seen. */
1554	movdqa	(%rsi, %rcx), %xmm1
1555	movdqa	(%rdi, %rcx), %xmm2
1556	movdqa	%xmm2, %xmm4
1557
1558	psrldq	$13, %xmm3
1559	pslldq 	$3, %xmm2
1560	por	%xmm3, %xmm2
1561
1562	pcmpeqb	%xmm1, %xmm0
1563	pcmpeqb	%xmm2, %xmm1
1564	psubb	%xmm0, %xmm1
1565	pmovmskb %xmm1, %edx
1566	sub	$0xffff, %edx
1567	jnz	LABEL(exit)
1568
1569#ifdef USE_AS_STRNCMP
1570	sub	$16, %r11
1571	jbe	LABEL(strcmp_exitz)
1572#endif
1573
1574	add	$16, %rcx
1575	movdqa	%xmm4, %xmm3
1576	jmp	LABEL(loop_ashr_13)
1577
1578	.p2align 4
1579LABEL(nibble_ashr_13):
1580	psrldq	$13, %xmm4		/* last 3 bytes of the current %rdi block, zeros above */
1581	movdqa	(%rsi, %rcx), %xmm1
1582	pcmpeqb	%xmm1, %xmm0
1583	pcmpeqb	%xmm4, %xmm1
1584	psubb	%xmm0, %xmm1
1585	pmovmskb %xmm1, %edx
1586	sub	$0x0007, %edx		/* zero iff those 3 bytes are equal and non-NUL */
1587	jnz	LABEL(exit)
1588#ifdef USE_AS_STRNCMP
1589	cmp	$3, %r11
1590	jbe	LABEL(strcmp_exitz)	/* no more than 3 bytes left to compare */
1591#endif
1592 	pxor	%xmm0, %xmm0		/* clear the NUL mask before rejoining the loop */
1593	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1594	jmp	LABEL(gobble_ashr_13)
1595
1596/*
1597 * ashr_14 handles the following cases:
1598 * 	abs(str1 offset - str2 offset) = 2
 *
 * The first 16-byte block of %rdi is shifted up by 2 bytes and compared
 * with the first block of %rsi; %cl and %edx are prepared by code before
 * this label.  If that check passes, the loop below compares 16 bytes
 * per step (unrolled twice), building each %rdi operand from two
 * neighbouring blocks: (previous >> 14 bytes) | (next << 2 bytes).
 * nibble_ashr_14 is taken whenever %r10 turns positive.
1599 */
1600	.p2align 4
1601LABEL(ashr_14):
1602	pxor	%xmm0, %xmm0		/* xmm0 = 0 for the NUL test */
1603	movdqa	(%rdi), %xmm2
1604	movdqa	(%rsi), %xmm1
1605	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1606	pslldq  $2, %xmm2		/* shift %rdi bytes up by 2, zero-filling the low 2 */
1607	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
1608	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-NUL bytes */
1609	pmovmskb %xmm2, %r9d
1610	shr	%cl, %edx		/* %cl and %edx are set before this label */
1611	shr	%cl, %r9d
1612	sub	%r9d, %edx
1613	jnz	LABEL(less32bytes)	/* shifted masks differ */
1614	movdqa	(%rdi), %xmm3		/* becomes the "previous" %rdi block of the loop */
1615
1616	UPDATE_STRNCMP_COUNTER		/* strncmp only; expands to nothing for strcmp */
1617
1618	pxor	%xmm0, %xmm0
1619	mov	$16, %rcx	/* index for loads */
1620	mov	$14, %r9d	/* rdi bytes already examined. Used in exit code */
1621	/*
1622	 * Setting up %r10 this way lets us detect crossing a page boundary.
1623	 * When %r10 goes positive we are crossing a page boundary and
1624	 * need to do a nibble.
1625	 */
1626	lea	14(%rdi), %r10
1627	and	$0xfff, %r10	/* offset into 4K page */
1628	sub	$0x1000, %r10	/* subtract 4K pagesize */
1629	movdqa	%xmm3, %xmm4		/* copy that nibble_ashr_14 shifts */
1630
1631	.p2align 4
1632LABEL(loop_ashr_14):
1633	add	$16, %r10
1634	jg	LABEL(nibble_ashr_14)	/* %r10 > 0: take the page-boundary path */
1635
1636LABEL(gobble_ashr_14):
1637	movdqa	(%rsi, %rcx), %xmm1
1638	movdqa	(%rdi, %rcx), %xmm2
1639	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next step */
1640
1641	psrldq	$14, %xmm3		/* top 2 bytes of the previous %rdi block */
1642	pslldq	$2, %xmm2		/* low 14 bytes of the new %rdi block */
1643	por	%xmm3, %xmm2		/* merged operand compared against %xmm1 */
1644
1645	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1646	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
1647	psubb	%xmm0, %xmm1		/* top bit survives only for equal, non-NUL bytes */
1648	pmovmskb %xmm1, %edx
1649	sub	$0xffff, %edx		/* zero iff all 16 bytes are equal and non-NUL */
1650	jnz	LABEL(exit)
1651
1652#ifdef USE_AS_STRNCMP
1653	sub	$16, %r11
1654	jbe	LABEL(strcmp_exitz)	/* no more than 16 bytes were left to compare */
1655#endif
1656
1657	add	$16, %rcx
1658	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */
1659
1660	add	$16, %r10
1661	jg	LABEL(nibble_ashr_14)	/* cross page boundary */
1662
	/* Second unrolled copy; xmm0 is still all zero because no NUL was seen. */
1663	movdqa	(%rsi, %rcx), %xmm1
1664	movdqa	(%rdi, %rcx), %xmm2
1665	movdqa	%xmm2, %xmm4
1666
1667	psrldq	$14, %xmm3
1668	pslldq 	$2, %xmm2
1669	por	%xmm3, %xmm2
1670
1671	pcmpeqb	%xmm1, %xmm0
1672	pcmpeqb	%xmm2, %xmm1
1673	psubb	%xmm0, %xmm1
1674	pmovmskb %xmm1, %edx
1675	sub	$0xffff, %edx
1676	jnz	LABEL(exit)
1677
1678#ifdef USE_AS_STRNCMP
1679	sub	$16, %r11
1680	jbe	LABEL(strcmp_exitz)
1681#endif
1682
1683	add	$16, %rcx
1684	movdqa	%xmm4, %xmm3
1685	jmp	LABEL(loop_ashr_14)
1686
1687	.p2align 4
1688LABEL(nibble_ashr_14):
1689	psrldq	$14, %xmm4		/* last 2 bytes of the current %rdi block, zeros above */
1690	movdqa	(%rsi, %rcx), %xmm1
1691	pcmpeqb	%xmm1, %xmm0
1692	pcmpeqb	%xmm4, %xmm1
1693	psubb	%xmm0, %xmm1
1694	pmovmskb %xmm1, %edx
1695	sub	$0x0003, %edx		/* zero iff those 2 bytes are equal and non-NUL */
1696	jnz	LABEL(exit)
1697#ifdef USE_AS_STRNCMP
1698	cmp	$2, %r11
1699	jbe	LABEL(strcmp_exitz)	/* no more than 2 bytes left to compare */
1700#endif
1701 	pxor	%xmm0, %xmm0		/* clear the NUL mask before rejoining the loop */
1702	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1703	jmp	LABEL(gobble_ashr_14)
1704
1705/*
1706 * ashr_15 handles the following cases:
1707 * 	abs(str1 offset - str2 offset) = 1
 *
 * The first 16-byte block of %rdi is shifted up by 1 byte and compared
 * with the first block of %rsi; %cl and %edx are prepared by code before
 * this label.  If that check passes, the loop below compares 16 bytes
 * per step (unrolled twice), building each %rdi operand from two
 * neighbouring blocks: (previous >> 15 bytes) | (next << 1 byte).
 * nibble_ashr_15 is taken whenever %r10 turns positive.
1708 */
1709	.p2align 4
1710LABEL(ashr_15):
1711	pxor	%xmm0, %xmm0		/* xmm0 = 0 for the NUL test */
1712	movdqa	(%rdi), %xmm2
1713	movdqa	(%rsi), %xmm1
1714	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1715	pslldq	$1, %xmm2		/* shift %rdi bytes up by 1, zero-filling the low byte */
1716	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
1717	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-NUL bytes */
1718	pmovmskb %xmm2, %r9d
1719	shr	%cl, %edx		/* %cl and %edx are set before this label */
1720	shr	%cl, %r9d
1721	sub	%r9d, %edx
1722	jnz	LABEL(less32bytes)	/* shifted masks differ */
1723
1724	movdqa	(%rdi), %xmm3		/* becomes the "previous" %rdi block of the loop */
1725
1726	UPDATE_STRNCMP_COUNTER		/* strncmp only; expands to nothing for strcmp */
1727
1728	pxor	%xmm0, %xmm0
1729	mov	$16, %rcx	/* index for loads */
1730	mov	$15, %r9d	/* rdi bytes already examined. Used in exit code */
1731	/*
1732	 * Setting up %r10 this way lets us detect crossing a page boundary.
1733	 * When %r10 goes positive we are crossing a page boundary and
1734	 * need to do a nibble.
1735	 */
1736	lea	15(%rdi), %r10
1737	and	$0xfff, %r10	/* offset into 4K page */
1738	sub	$0x1000, %r10	/* subtract 4K pagesize */
1739	movdqa	%xmm3, %xmm4		/* copy that nibble_ashr_15 shifts */
1740
1741	.p2align 4
1742LABEL(loop_ashr_15):
1743	add	$16, %r10
1744	jg	LABEL(nibble_ashr_15)	/* %r10 > 0: take the page-boundary path */
1745
1746LABEL(gobble_ashr_15):
1747	movdqa	(%rsi, %rcx), %xmm1
1748	movdqa	(%rdi, %rcx), %xmm2
1749	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next step */
1750
1751	psrldq	$15, %xmm3		/* top byte of the previous %rdi block */
1752	pslldq	$1, %xmm2		/* low 15 bytes of the new %rdi block */
1753	por	%xmm3, %xmm2		/* merged operand compared against %xmm1 */
1754
1755	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
1756	pcmpeqb	%xmm2, %xmm1		/* 0xff where the bytes are equal */
1757	psubb	%xmm0, %xmm1		/* top bit survives only for equal, non-NUL bytes */
1758	pmovmskb %xmm1, %edx
1759	sub	$0xffff, %edx		/* zero iff all 16 bytes are equal and non-NUL */
1760	jnz	LABEL(exit)
1761
1762#ifdef USE_AS_STRNCMP
1763	sub	$16, %r11
1764	jbe	LABEL(strcmp_exitz)	/* no more than 16 bytes were left to compare */
1765#endif
1766
1767	add	$16, %rcx
1768	movdqa	%xmm4, %xmm3		/* new block becomes the previous block */
1769
1770	add	$16, %r10
1771	jg	LABEL(nibble_ashr_15)	/* cross page boundary */
1772
	/* Second unrolled copy; xmm0 is still all zero because no NUL was seen. */
1773	movdqa	(%rsi, %rcx), %xmm1
1774	movdqa	(%rdi, %rcx), %xmm2
1775	movdqa	%xmm2, %xmm4
1776
1777	psrldq	$15, %xmm3
1778	pslldq 	$1, %xmm2
1779	por	%xmm3, %xmm2
1780
1781	pcmpeqb	%xmm1, %xmm0
1782	pcmpeqb	%xmm2, %xmm1
1783	psubb	%xmm0, %xmm1
1784	pmovmskb %xmm1, %edx
1785	sub	$0xffff, %edx
1786	jnz	LABEL(exit)
1787
1788#ifdef USE_AS_STRNCMP
1789	sub	$16, %r11
1790	jbe	LABEL(strcmp_exitz)
1791#endif
1792
1793	add	$16, %rcx
1794	movdqa	%xmm4, %xmm3
1795	jmp	LABEL(loop_ashr_15)
1796
1797	.p2align 4
1798LABEL(nibble_ashr_15):
1799	psrldq	$15, %xmm4		/* last byte of the current %rdi block, zeros above */
1800	movdqa	(%rsi, %rcx), %xmm1
1801	pcmpeqb	%xmm1, %xmm0
1802	pcmpeqb	%xmm4, %xmm1
1803	psubb	%xmm0, %xmm1
1804	pmovmskb %xmm1, %edx
1805	sub	$0x0001, %edx		/* zero iff that byte is equal and non-NUL */
1806	jnz	LABEL(exit)
1807#ifdef USE_AS_STRNCMP
1808	cmp	$1, %r11
1809	jbe	LABEL(strcmp_exitz)	/* no more than 1 byte left to compare */
1810#endif
1811 	pxor	%xmm0, %xmm0		/* clear the NUL mask before rejoining the loop */
1812	sub	$0x1000, %r10		/* subtract 4K from %r10 */
1813	jmp	LABEL(gobble_ashr_15)
1814
/*
 * Common exit code.  LABEL(exit) is reached from the ashr_N loops with
 * %r9 holding that variant's byte count and %rcx the current load index.
 * LABEL(less32bytes) expects %rax to have been set by earlier code (not
 * visible in this section).  Both pointers are advanced, swapped back
 * when %r8d is non-zero, and the byte at the index of the lowest set bit
 * of %rdx is used to form the return value.
 */
1815	.p2align 4
1816LABEL(exit):
1817	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
1818LABEL(less32bytes):
1819	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
1820	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
1821	test	%r8d, %r8d
1822	jz	LABEL(ret)
1823	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */
1824
1825	.p2align 4
1826LABEL(ret):
1827LABEL(less16bytes):
1828	/*
1829	 * Check to see if BSF is fast on this processor. If not, use a different
1830	 * exit tail.
1831	 */
1832	testl	$USE_BSF,.memops_method(%rip)
1833	jz	LABEL(AMD_exit)
1834	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
1835
1836#ifdef USE_AS_STRNCMP
1837	sub	%rdx, %r11
1838	jbe	LABEL(strcmp_exitz)	/* that index is at or beyond the remaining count */
1839#endif
1840	xor	%ecx, %ecx		/* clear %ecx */
1841	xor	%eax, %eax		/* clear %eax */
1842
1843	movb	(%rsi, %rdx), %cl
1844	movb	(%rdi, %rdx), %al
1845
1846	sub	%ecx, %eax		/* return value: zero-extended byte difference */
1847	ret
1848
1849#ifdef USE_AS_STRNCMP
	/* strncmp-only exit: target of the `jbe LABEL(strcmp_exitz)` count checks. */
1850LABEL(strcmp_exitz):
1851	xor	%eax, %eax		/* return 0 */
1852	ret
1853#endif
1854
1855	/*
1856	 * This exit tail does not use the bsf instruction.
	 *
	 * It finds the lowest set bit of %dl with a chain of single-bit tests
	 * and jumps to the matching ByteN tail.  When %dl is zero the search
	 * continues with %dh in next_8_bytes.
1857	 */
1858	.p2align 4
1859LABEL(AMD_exit):
1860	test	%dl, %dl
1861	jz	LABEL(next_8_bytes)	/* nothing flagged in bits 0-7 */
1862
1863	test	$0x01, %dl
1864	jnz	LABEL(Byte0)
1865
1866	test	$0x02, %dl
1867	jnz	LABEL(Byte1)
1868
1869	test	$0x04, %dl
1870	jnz	LABEL(Byte2)
1871
1872	test	$0x08, %dl
1873	jnz	LABEL(Byte3)
1874
1875	test	$0x10, %dl
1876	jnz	LABEL(Byte4)
1877
1878	test	$0x20, %dl
1879	jnz	LABEL(Byte5)
1880
1881	test	$0x40, %dl
1882	jnz	LABEL(Byte6)
1883
	/* %dl is non-zero and bits 0-6 are clear, so bit 7 is set: use byte 7. */
1884#ifdef USE_AS_STRNCMP
1885	sub	$7, %r11
1886	jbe	LABEL(strcmp_exitz)	/* %r11 <= 7: return 0 */
1887#endif
1888	movzx	7(%rsi), %ecx
1889	movzx	7(%rdi), %eax
1890
1891	sub	%ecx, %eax
1892	ret
1893
/* Tail for index 0: return the difference of the bytes at (%rdi) and (%rsi). */
1894	.p2align 4
1895LABEL(Byte0):
1896	/*
1897	 * never need to handle byte 0 for strncmp
1898#ifdef USE_AS_STRNCMP
1899	sub	$0, %r11
1900	jbe	LABEL(strcmp_exitz)
1901#endif
1902	*/
1903	movzx	(%rsi), %ecx
1904	movzx	(%rdi), %eax
1905
1906	sub	%ecx, %eax
1907	ret
1908
/* Tail for index 1: return the difference of the bytes at 1(%rdi) and 1(%rsi). */
1909	.p2align 4
1910LABEL(Byte1):
1911
1912#ifdef USE_AS_STRNCMP
1913	sub	$1, %r11
1914	jbe	LABEL(strcmp_exitz)	/* %r11 <= 1: return 0 */
1915#endif
1916	movzx	1(%rsi), %ecx
1917	movzx	1(%rdi), %eax
1918
1919	sub	%ecx, %eax
1920	ret
1921
/* Tail for index 2: return the difference of the bytes at 2(%rdi) and 2(%rsi). */
1922	.p2align 4
1923LABEL(Byte2):
1924
1925#ifdef USE_AS_STRNCMP
1926	sub	$2, %r11
1927	jbe	LABEL(strcmp_exitz)	/* %r11 <= 2: return 0 */
1928#endif
1929	movzx	2(%rsi), %ecx
1930	movzx	2(%rdi), %eax
1931
1932	sub	%ecx, %eax
1933	ret
1934
/* Tail for index 3: return the difference of the bytes at 3(%rdi) and 3(%rsi). */
1935	.p2align 4
1936LABEL(Byte3):
1937
1938#ifdef USE_AS_STRNCMP
1939	sub	$3, %r11
1940	jbe	LABEL(strcmp_exitz)	/* %r11 <= 3: return 0 */
1941#endif
1942	movzx	3(%rsi), %ecx
1943	movzx	3(%rdi), %eax
1944
1945	sub	%ecx, %eax
1946	ret
1947
/* Tail for index 4: return the difference of the bytes at 4(%rdi) and 4(%rsi). */
1948	.p2align 4
1949LABEL(Byte4):
1950
1951#ifdef USE_AS_STRNCMP
1952	sub	$4, %r11
1953	jbe	LABEL(strcmp_exitz)	/* %r11 <= 4: return 0 */
1954#endif
1955	movzx	4(%rsi), %ecx
1956	movzx	4(%rdi), %eax
1957
1958	sub	%ecx, %eax
1959	ret
1960
/* Tail for index 5: return the difference of the bytes at 5(%rdi) and 5(%rsi). */
1961	.p2align 4
1962LABEL(Byte5):
1963
1964#ifdef USE_AS_STRNCMP
1965	sub	$5, %r11
1966	jbe	LABEL(strcmp_exitz)	/* %r11 <= 5: return 0 */
1967#endif
1968	movzx	5(%rsi), %ecx
1969	movzx	5(%rdi), %eax
1970
1971	sub	%ecx, %eax
1972	ret
1973
/* Tail for index 6: return the difference of the bytes at 6(%rdi) and 6(%rsi). */
1974	.p2align 4
1975LABEL(Byte6):
1976
1977#ifdef USE_AS_STRNCMP
1978	sub	$6, %r11
1979	jbe	LABEL(strcmp_exitz)	/* %r11 <= 6: return 0 */
1980#endif
1981	movzx	6(%rsi), %ecx
1982	movzx	6(%rdi), %eax
1983
1984	sub	%ecx, %eax
1985	ret
1986
/*
 * Second half of the non-BSF exit tail, reached from AMD_exit when %dl is
 * zero.  Both pointers are advanced by 8 so the ByteN tails can be reused,
 * then the bits of %dh are tested from lowest to highest.
 */
1987	.p2align 4
1988LABEL(next_8_bytes):
1989	add	$8, %rdi
1990	add	$8, %rsi
1991#ifdef USE_AS_STRNCMP
1992	sub	$8, %r11
1993	jbe	LABEL(strcmp_exitz)	/* %r11 <= 8: return 0 */
1994#endif
1995	test	$0x01, %dh
1996	jnz	LABEL(Byte0)
1997
1998	test	$0x02, %dh
1999	jnz	LABEL(Byte1)
2000
2001	test	$0x04, %dh
2002	jnz	LABEL(Byte2)
2003
2004	test	$0x08, %dh
2005	jnz	LABEL(Byte3)
2006
2007	test	$0x10, %dh
2008	jnz	LABEL(Byte4)
2009
2010	test	$0x20, %dh
2011	jnz	LABEL(Byte5)
2012
2013	test	$0x40, %dh
2014	jnz	LABEL(Byte6)
2015
	/* Bit 7 of %dh is not tested: byte 7 is used when bits 0-6 are clear. */
2016#ifdef USE_AS_STRNCMP
2017	sub	$7, %r11
2018	jbe	LABEL(strcmp_exitz)	/* %r11 <= 7: return 0 */
2019#endif
2020	movzx	7(%rsi), %ecx
2021	movzx	7(%rdi), %eax
2022
2023	sub	%ecx, %eax
2024	ret
2025
/*
 * Table of 16 32-bit entries in .rodata.  Each entry is the offset of an
 * ashr_N label relative to the table itself: entry 0 is ashr_0 and
 * entries 1..15 are ashr_15 down to ashr_1.  The code that indexes the
 * table is not part of this section of the file.
 */
2026	.pushsection .rodata
2027	.p2align 4
2028LABEL(unaligned_table):
2029	.int	LABEL(ashr_0) - LABEL(unaligned_table)
2030	.int	LABEL(ashr_15) - LABEL(unaligned_table)
2031	.int	LABEL(ashr_14) - LABEL(unaligned_table)
2032	.int	LABEL(ashr_13) - LABEL(unaligned_table)
2033	.int	LABEL(ashr_12) - LABEL(unaligned_table)
2034	.int	LABEL(ashr_11) - LABEL(unaligned_table)
2035	.int	LABEL(ashr_10) - LABEL(unaligned_table)
2036	.int	LABEL(ashr_9) - LABEL(unaligned_table)
2037	.int	LABEL(ashr_8) - LABEL(unaligned_table)
2038	.int	LABEL(ashr_7) - LABEL(unaligned_table)
2039	.int	LABEL(ashr_6) - LABEL(unaligned_table)
2040	.int	LABEL(ashr_5) - LABEL(unaligned_table)
2041	.int	LABEL(ashr_4) - LABEL(unaligned_table)
2042	.int	LABEL(ashr_3) - LABEL(unaligned_table)
2043	.int	LABEL(ashr_2) - LABEL(unaligned_table)
2044	.int	LABEL(ashr_1) - LABEL(unaligned_table)
2045	.popsection
/* Closes the function opened by the matching ENTRY() at the top of the file. */
2046#ifdef USE_AS_STRNCMP
2047	SET_SIZE(strncmp)		/* (const char *, const char *, size_t) */
2048#else
2049	SET_SIZE(strcmp)		/* (const char *, const char *) */
2050#endif
2051