xref: /illumos-gate/usr/src/lib/libc/amd64/gen/memcmp.S (revision 5d9d9091)
1/*
2 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
3 * Use is subject to license terms.
4 */
5
6/*
7 * Copyright (c) 2002 Advanced Micro Devices, Inc.
8 *
9 * All rights reserved.
10 *
11 * Redistribution and  use in source and binary  forms, with or
12 * without  modification,  are   permitted  provided  that  the
13 * following conditions are met:
14 *
15 * + Redistributions  of source  code  must  retain  the  above
16 *   copyright  notice,   this  list  of   conditions  and  the
17 *   following disclaimer.
18 *
19 * + Redistributions  in binary  form must reproduce  the above
20 *   copyright  notice,   this  list  of   conditions  and  the
21 *   following  disclaimer in  the  documentation and/or  other
22 *   materials provided with the distribution.
23 *
24 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
25 *   names  of  its contributors  may  be  used  to endorse  or
26 *   promote  products  derived   from  this  software  without
27 *   specific prior written permission.
28 *
29 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
30 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
31 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
32 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
33 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
34 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
35 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
36 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
37 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
38 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
39 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
40 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
41 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
42 * POSSIBILITY OF SUCH DAMAGE.
43 *
44 * It is  licensee's responsibility  to comply with  any export
45 * regulations applicable in licensee's jurisdiction.
46 */
47
48	.file	"memcmp.s"
49
50#include <sys/asm_linkage.h>
51
52	ANSI_PRAGMA_WEAK(memcmp,function)
53
54#include "SYS.h"
55#include "cache.h"
56
57#define LABEL(s) .memcmp##s
58
/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * AT&T syntax.  Register usage throughout this routine:
 *   In:     %rdi = s1, %rsi = s2, %rdx = n (bytes still to compare)
 *   Out:    %eax = 0 if all n bytes match, otherwise the difference
 *           (zero-extended byte at s1) - (zero-extended byte at s2)
 *           taken at the first offset where the buffers differ
 *   Clobb:  %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags
 *   Stack:  not used
 *
 * Strategy, selected by n:
 *   n < 8            byte loop                           (1loop)
 *   8 <= n < 32      8-byte loop, byte tail              (8loop)
 *   32 <= n <= 2048  32-byte loop, 8-byte/byte tail      (32loop)
 *   n > 2048         byte loop until %rsi is 8-byte aligned, then
 *                    a 64-byte loop over at most .amd64cache1half
 *                    bytes, a prefetching 64-byte loop over at most
 *                    .amd64cache2half bytes, and a prefetching
 *                    128-byte loop for whatever is left.
 *
 * The wide loops only detect THAT a chunk differs.  On a mismatch they
 * branch to the next narrower tier without advancing the pointers or
 * the count; that tier rescans the chunk, and the byte loop finally
 * produces the return value.  Because of this, LABEL(32) and LABEL(8)
 * can be entered with the full (possibly >= 4 GiB) remaining length in
 * %rdx, so their loop counters must be computed in 64 bits.
 */
	ENTRY(memcmp)                 /* (const void *, const void*, size_t) */

LABEL(try1):
        cmp     $8, %rdx
        jae     LABEL(1after)

LABEL(1):                                /* 1-byte */
        test    %rdx, %rdx
        mov     $0, %eax                 /* mov, not xor: keep ZF from test */
        jz      LABEL(exit)              /* nothing left: return 0 */

LABEL(1loop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax               /* eax = *s1 - *s2 (as unsigned bytes) */
        jnz     LABEL(exit)              /* differ: eax is the return value */

        dec     %rdx

        lea     1 (%rdi), %rdi           /* lea keeps the flags set by dec */
        lea     1 (%rsi), %rsi

        jnz     LABEL(1loop)             /* falls out with eax == 0 */

LABEL(exit):
        rep                              /* "rep ret": two-byte return form */
        ret

        .p2align 4

LABEL(1after):

LABEL(8try):
        cmp     $32, %rdx
        jae     LABEL(8after)

LABEL(8):                        /* 8-byte */
        /*
         * 64-bit count: this label is also reached from the 32-byte
         * loop, which in turn can be reached with a length >= 4 GiB.
         */
        mov     %rdx, %rcx
        shr     $3, %rcx                 /* rcx = number of whole 8-byte words */
        jz      LABEL(1)

        .p2align 4

LABEL(8loop):
        mov     (%rsi), %rax
        cmp     (%rdi), %rax
        jne     LABEL(1)                 /* let the byte loop find the byte */

        sub     $8, %rdx
        dec     %rcx

        lea     8 (%rsi), %rsi           /* lea keeps the flags set by dec */
        lea     8 (%rdi), %rdi

        jnz     LABEL(8loop)

LABEL(8skip):
        and     $7, %edx                 /* rdx < 8 here; sets ZF if no tail */
        jnz     LABEL(1)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(8after):

LABEL(32try):
        cmp     $2048, %rdx
        ja      LABEL(32after)

LABEL(32):                               /* 32-byte */
        /*
         * 64-bit count: the 64-byte, prefetching and 128-byte loops
         * branch here on a mismatch with the full remaining length.
         */
        mov     %rdx, %rcx
        shr     $5, %rcx                 /* rcx = number of whole 32-byte chunks */
        jz      LABEL(8)

        .p2align 4

LABEL(32loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10

        or      %rax,  %r8               /* any non-zero difference => differ */
        or       %r9, %r10
        or       %r8, %r10
        jnz     LABEL(8)

        sub     $32, %rdx
        dec     %rcx

        lea     32 (%rsi), %rsi          /* lea keeps the flags set by dec */
        lea     32 (%rdi), %rdi

        jnz     LABEL(32loop)

LABEL(32skip):
        and     $31, %edx                /* rdx < 32 here; sets ZF if no tail */
        jnz     LABEL(8)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(32after):

	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
        mov     %esi, %r8d      /* align by source */

        and     $7, %r8d
        jz      LABEL(srcafter)  /* not unaligned */

LABEL(src):                      /* align */
        lea     -8 (%r8, %rdx), %rdx     /* rdx -= (8 - misalignment) */
        sub     $8, %r8d                 /* r8d = -(bytes to compare here) */


LABEL(srcloop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax
        jnz     LABEL(exit)

        inc     %r8d                     /* counts up towards zero */

        lea     1 (%rdi), %rdi           /* lea keeps the flags set by inc */
        lea     1 (%rsi), %rsi

        jnz     LABEL(srcloop)

        .p2align 4

LABEL(srcafter):

LABEL(64try):
        mov     _sref_(.amd64cache1half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx               /* rcx = min(.amd64cache1half, rdx) */

LABEL(64):                               /* 64-byte */
        shr     $6, %rcx                 /* rcx = number of 64-byte chunks */
        jz      LABEL(32)

        .p2align 4

LABEL(64loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        or      %r8,  %rax

        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz     LABEL(32)                /* differ within bytes 0..31 */

        mov     32 (%rsi), %rax
        mov     40 (%rsi),  %r8
        sub     32 (%rdi), %rax
        sub     40 (%rdi),  %r8
        or      %r8,  %rax

        mov     48 (%rsi),  %r9
        mov     56 (%rsi), %r10
        sub     48 (%rdi),  %r9
        sub     56 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz    	LABEL(32)                /* differ within bytes 32..63 */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(64loop)

LABEL(64skip):
        cmp     $2048, %rdx
        ja     LABEL(64after)

        test    %edx, %edx               /* rdx <= 2048, so edx holds it all */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):                              /* 64-byte prefetching */
        mov     _sref_(.amd64cache2half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx               /* rcx = min(.amd64cache2half, rdx) */

        shr     $6, %rcx                 /* rcx = number of 64-byte chunks */
        jz      LABEL(preskip)

        /* First iteration, peeled out in front of the aligned loop. */
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        /*
         * If the peeled iteration was the only one, do not enter the
         * loop: it would decrement rcx past zero and keep running.
         */
        jz      LABEL(preskip)

        .p2align 4

LABEL(preloop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(preloop)


LABEL(preskip):
        cmp     $2048, %rdx
        ja      LABEL(preafter)

        test    %edx, %edx               /* rdx <= 2048, so edx holds it all */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):                              /* 128-byte */
        mov     %rdx, %rcx
        shr     $7, %rcx                 /* rcx = number of 128-byte chunks */
        jz      LABEL(128skip)

        .p2align 4

LABEL(128loop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        sub        (%rdi), %rax
        sub      8 (%rdi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi), %r9
        sub     24 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov     32 (%rsi), %r8
        mov     40 (%rsi), %r9
        sub     32 (%rdi), %r8
        sub     40 (%rdi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)                /* differ within bytes 0..63 */

        prefetchnta 576 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 576 (%rdi)	/* 3DNow: use prefetch */

        mov      64 (%rsi), %rax
        mov      72 (%rsi), %r8
        sub      64 (%rdi), %rax
        sub      72 (%rdi), %r8
        mov      80 (%rsi), %r9
        mov      88 (%rsi), %r10
        sub      80 (%rdi), %r9
        sub      88 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov      96 (%rsi), %r8
        mov     104 (%rsi), %r9
        sub      96 (%rdi), %r8
        sub     104 (%rdi), %r9
        mov     112 (%rsi), %r10
        mov     120 (%rsi), %r11
        sub     112 (%rdi), %r10
        sub     120 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)                /* differ within bytes 64..127 */

        sub     $128, %rdx
        dec     %rcx

        lea     128 (%rsi), %rsi         /* lea keeps the flags set by dec */
        lea     128 (%rdi), %rdi

        jnz     LABEL(128loop)

LABEL(128skip):
        and     $127, %edx               /* rdx < 128 here; sets ZF if no tail */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

	SET_SIZE(memcmp)
457