xref: /illumos-gate/usr/src/lib/libc/amd64/gen/memcpy.s (revision 4e5b757f)
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and  use in source and binary  forms, with or
 * without  modification,  are   permitted  provided  that  the
 * following conditions are met:
 *
 * + Redistributions  of source  code  must  retain  the  above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following disclaimer.
 *
 * + Redistributions  in binary  form must reproduce  the above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following  disclaimer in  the  documentation and/or  other
 *   materials provided with the distribution.
 *
 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
 *   names  of  its contributors  may  be  used  to endorse  or
 *   promote  products  derived   from  this  software  without
 *   specific prior written permission.
 *
 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
 * CONTRIBUTORS "AS IS" AND  ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is  licensee's responsibility  to comply with  any export
 * regulations applicable in licensee's jurisdiction.
 */
47
	.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	/* export memmove and memcpy as weak symbols */
	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "SYS.h"
#include "cache.h"	/* presumably declares the .amd64cache* values
			   referenced via _sref_() below — TODO confirm */

	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)

/* build a memcpy-scoped local label, e.g. LABEL(8loop) -> .memcpy8loop */
#define LABEL(s) .memcpy/**/s
63
	ENTRY(memmove)		/* (void *s1, void *s2, size_t n) */
	cmpq	%rsi,%rdi	/ if (source addr > dest addr)
	leaq	-1(%rsi,%rdx),%r9	/ %r9 = last source byte (s2 + n - 1);
				/ also consumed by .CopyLeft below
	jle	.CopyRight	/ dest <= src: forward copy is always safe
	cmpq	%r9,%rdi	/ dest > src: does dest start within
	jle	.CopyLeft	/ [src, src+n-1]?  overlap => copy backwards
	jmp	.CopyRight	/ disjoint regions: forward copy
71
/*
 * memcpy(void *dst, const void *src, size_t n)
 *
 * Forward (low-to-high) copy; also the memmove path when the regions do
 * not overlap or when dst <= src.  Strategy by size:
 *	n < 16		bit-decomposed tail copy (LABEL(1)..LABEL(1d))
 *	n < 32		8-byte loop (LABEL(8))
 *	n <= threshold	32-byte unrolled loop (LABEL(32)); threshold is
 *			512 for unaligned, 4096 for 8-aligned operands
 *	larger		align dst, rep movsq up to half the L1 size
 *			(LABEL(fast)), then a 64-byte prefetching loop up
 *			to half the L2 size (LABEL(pre)), then a 128-byte
 *			non-temporal movnti loop (LABEL(NT))
 * Returns the original dst; it is kept in %rax, and pushed on the stack
 * on the larger paths while %rax is needed as scratch.
 */
	ENTRY(memcpy)                        /* (void *, const void*, size_t) */

.CopyRight:
LABEL(1try):
        cmp     $16, %rdx
        mov     %rdi, %rax		/* return value = original dst */
        jae     LABEL(1after)

        .p2align 4

LABEL(1):				/* 1-byte */
        test    $1, %dl			/* count bit 0: copy one byte */
        jz      LABEL(1a)

        mov     (%rsi), %cl
        mov     %cl, (%rdi)

	dec	%dl
	lea	1 (%rsi), %rsi		/* lea advances the pointers without */
	lea	1 (%rdi), %rdi		/* disturbing ZF set by dec above */
	jz	LABEL(exit)

        .p2align 4,, 4

LABEL(1a):
        test    $2, %dl			/* count bit 1: copy two bytes */
        jz      LABEL(1b)

        mov     (%rsi), %cx
        mov     %cx, (%rdi)

	sub	$2, %dl
	lea	2 (%rsi), %rsi
	lea	2 (%rdi), %rdi
	jz	LABEL(exit)

        .p2align 4,, 4

LABEL(1b):
        test    $4, %dl			/* count bit 2: copy four bytes */
        jz      LABEL(1c)

        mov     (%rsi), %ecx
        mov     %ecx, (%rdi)

	/* only bits 2 and 3 can still be set here, so the count update
	   and early exit are unnecessary: fall through to the 8-byte test */
/*	sub	$4, %dl */
	lea	4 (%rsi), %rsi
	lea	4 (%rdi), %rdi
/*	jz	LABEL(exit) */

        .p2align 4,, 4

LABEL(1c):
        test    $8, %dl			/* count bit 3: copy eight bytes */
        jz      LABEL(1d)

        mov     (%rsi), %rcx
        mov     %rcx, (%rdi)

/*	sub	$8, %dl */
/*	lea	8 (%rsi), %rsi */
/*	lea	8 (%rdi), %rdi */
/*	jz	LABEL(exit) */

        .p2align 4

LABEL(1d):

LABEL(exit):
        rep				/* two-byte "rep ret": avoids the AMD */
        ret				/* branch-target penalty of a bare ret */
143
        .p2align 4

LABEL(1after):			/* n >= 16: park the return value (dst) on */
        push    %rax		/* the stack; %rax becomes scratch from here */

LABEL(8try):
        cmp     $32, %rdx
        jae     LABEL(8after)

LABEL(8):                        /* 8-byte */
        mov     %edx, %ecx	/* %ecx = number of 8-byte chunks */
        shr     $3, %ecx
        jz      LABEL(8skip)

        .p2align 4

LABEL(8loop):
        dec     %ecx

        mov     (%rsi), %rax
        mov     %rax, (%rdi)

        lea     8 (%rsi), %rsi	/* lea advances the pointers without */
        lea     8 (%rdi), %rdi	/* disturbing ZF set by dec above */

        jnz     LABEL(8loop)

LABEL(8skip):
        and     $7, %edx	/* 0..7 bytes remain */
        pop     %rax		/* restore return value for LABEL(1) / ret */
        jnz     LABEL(1)

        rep			/* two-byte "rep ret" (see LABEL(exit)) */
        ret
178
        .p2align 4

LABEL(8after):

LABEL(32try):
	mov	$512, %r8d		/* size for unaligned data */
	mov	$4096, %r9d		/* size for aligned data */
	test	$7, %esi		/* check if either source.. */
	cmovz	%r9, %r8
	test	$7, %edi		/* .. or destination is aligned */
	cmovz	%r9, %r8

        cmp     %r8, %rdx		/* larger copies take the */
        ja	LABEL(32after)		/* cache-aware paths below */

LABEL(32):				/* 32-byte */
        mov     %edx, %ecx
        shr     $5, %ecx		/* %ecx = number of 32-byte chunks */
        jz      LABEL(32skip)

        .p2align 4

LABEL(32loop):				/* unrolled twice: 32 bytes per half */
        dec     %ecx

        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10

        mov     %rax,    (%rdi)
        mov      %r8,  8 (%rdi)
        mov      %r9, 16 (%rdi)
        mov     %r10, 24 (%rdi)

        lea     32 (%rsi), %rsi		/* lea advances the pointers without */
        lea     32 (%rdi), %rdi		/* disturbing ZF set by dec above */

        jz      LABEL(32skip)

        dec     %ecx

        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10

        mov     %rax,    (%rdi)
        mov      %r8,  8 (%rdi)
        mov      %r9, 16 (%rdi)
        mov     %r10, 24 (%rdi)

        lea     32 (%rsi), %rsi
        lea     32 (%rdi), %rdi

        jnz     LABEL(32loop)

        .p2align 4

LABEL(32skip):
        and     $31, %edx		/* 0..31 bytes remain */
        jnz     LABEL(8)		/* (return value still on the stack) */

        pop     %rax			/* restore return value (dst) */
        ret
244
        .p2align 4

LABEL(32after):

	/* 3DNow: use prefetch */
	prefetchnta _sref_(.amd64cache1) /* improves test further ahead on B0 */

LABEL(aligntry):
        mov     %edi, %r8d      	/* align by destination */

        and	$7, %r8d		/* %r8d = dst misalignment (0..7) */
        jz      LABEL(alignafter)  	/* not unaligned */

LABEL(align):                      	/* align */
        lea     -8 (%r8, %rdx), %rdx	/* n -= (8 - misalignment) */
        sub     $8, %r8d		/* %r8d = -(bytes to copy), counts up */

        .p2align 4

LABEL(alignloop):			/* byte copy until dst is 8-aligned */
        inc     %r8d

        mov     (%rsi), %al
        mov     %al, (%rdi)

        lea     1 (%rsi), %rsi		/* lea advances the pointers without */
        lea     1 (%rdi), %rdi		/* disturbing ZF set by inc above */

        jnz     LABEL(alignloop)

        .p2align 4

LABEL(alignafter):
        mov     _sref_(.amd64cache1half), %r11	/* %r11 = min(n, half L1) */
        cmp     %rdx, %r11
        cmova   %rdx, %r11

LABEL(fast):
	mov	%r11, %rcx
	and	$-8, %r11	/* %r11 = bytes rep movsq will actually copy */
	shr	$3, %rcx	/* %rcx = quadword count */
/*	jz	LABEL(fastskip) */

	rep				/* good ol' MOVS */
	movsq

LABEL(fastskip):
	sub	%r11, %rdx	/* bytes still to copy */
	test	$-8, %rdx	/* 8 or more left: big-copy paths below */
	jnz	LABEL(fastafterlater)

	and	$7, %edx	/* 0..7 left: byte tail */
	pop	%rax		/* restore return value (dst) for LABEL(1)/ret */
	jnz	LABEL(1)

	rep
	ret
302
        .p2align 4

/*
 * NOTE(review): nothing visible in this file branches to LABEL(64try) or
 * LABEL(64) — LABEL(fastskip) jumps straight to LABEL(fastafterlater).
 * This 64-byte loop appears superseded by the rep-movsq LABEL(fast) path
 * above; confirm before relying on it or removing it.
 */
LABEL(64try):
        mov     _sref_(.amd64cache1half), %r11	/* %r11 = min(n, half L1) */
        cmp     %rdx, %r11
        cmova   %rdx, %r11

LABEL(64):                               /* 64-byte */
        mov     %r11, %rcx
        and     $-64, %r11		/* %r11 = bytes the loop will copy */
        shr     $6, %rcx		/* %rcx = number of 64-byte chunks */
        jz      LABEL(64skip)

        .p2align 4

LABEL(64loop):				/* unrolled twice: 64 bytes per half */
        dec     %ecx

        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10

        mov     %rax,    (%rdi)
        mov      %r8,  8 (%rdi)
        mov      %r9, 16 (%rdi)
        mov     %r10, 24 (%rdi)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r8
        mov     48 (%rsi), %r9
        mov     56 (%rsi), %r10

        mov     %rax, 32 (%rdi)
        mov      %r8, 40 (%rdi)
        mov      %r9, 48 (%rdi)
        mov     %r10, 56 (%rdi)

        lea     64 (%rsi), %rsi		/* lea advances the pointers without */
        lea     64 (%rdi), %rdi		/* disturbing ZF set by dec above */

        jz      LABEL(64skip)

        dec     %ecx

        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10

        mov     %rax,    (%rdi)
        mov      %r8,  8 (%rdi)
        mov      %r9, 16 (%rdi)
        mov     %r10, 24 (%rdi)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r8
        mov     48 (%rsi), %r9
        mov     56 (%rsi), %r10

        mov     %rax, 32 (%rdi)
        mov      %r8, 40 (%rdi)
        mov      %r9, 48 (%rdi)
        mov     %r10, 56 (%rdi)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        jnz     LABEL(64loop)

        .p2align 4

LABEL(64skip):
        sub     %r11, %rdx		/* bytes still to copy */
        test    $-64, %rdx		/* >= 64 left: prefetching path */
        jnz     LABEL(64after)

        and     $63, %edx		/* 0..63 left: finish via 32-path */
        jnz     LABEL(32)

        pop     %rax			/* restore return value (dst) */
        ret
385
        .p2align 4

LABEL(64after):

LABEL(fastafterlater):		/* >= 8 bytes still to copy after LABEL(fast) */

LABEL(pretry):
        mov     _sref_(.amd64cache2half), %r8	/* %r8 = min(n, half L2) */
        cmp     %rdx, %r8
        cmova   %rdx, %r8

LABEL(pre):                              /* 64-byte prefetching */
        mov     %r8, %rcx
        and     $-64, %r8		/* %r8 = bytes the loop will copy */
        shr     $6, %rcx		/* %rcx = number of 64-byte chunks */
        jz      LABEL(preskip)

        push    %r14			/* callee-saved registers needed for */
        push    %r13			/* the 8-quadword block copy below */
        push    %r12
        push    %rbx

        .p2align 4

LABEL(preloop):				/* unrolled twice: 64 bytes per half */
        dec     %rcx

        mov        (%rsi), %rax
        mov      8 (%rsi), %rbx
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        mov     32 (%rsi), %r11
        mov     40 (%rsi), %r12
        mov     48 (%rsi), %r13
        mov     56 (%rsi), %r14

        prefetchnta  0 + 896 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 64 + 896 (%rsi)	/* 3DNow: use prefetch */

        mov     %rax,    (%rdi)
        mov     %rbx,  8 (%rdi)
        mov      %r9, 16 (%rdi)
        mov     %r10, 24 (%rdi)
        mov     %r11, 32 (%rdi)
        mov     %r12, 40 (%rdi)
        mov     %r13, 48 (%rdi)
        mov     %r14, 56 (%rdi)

        lea     64 (%rsi), %rsi		/* lea advances the pointers without */
        lea     64 (%rdi), %rdi		/* disturbing ZF set by dec above */

        jz      LABEL(preskipa)

        dec     %rcx

        mov        (%rsi), %rax
        mov      8 (%rsi), %rbx
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        mov     32 (%rsi), %r11
        mov     40 (%rsi), %r12
        mov     48 (%rsi), %r13
        mov     56 (%rsi), %r14

        mov     %rax,    (%rdi)
        mov     %rbx,  8 (%rdi)
        mov      %r9, 16 (%rdi)
        mov     %r10, 24 (%rdi)
        mov     %r11, 32 (%rdi)
        mov     %r12, 40 (%rdi)
        mov     %r13, 48 (%rdi)
        mov     %r14, 56 (%rdi)

        prefetchnta -64 + 896 (%rdi)	/* 3DNow: use prefetchw */
        prefetchnta   0 + 896 (%rdi)	/* 3DNow: use prefetchw */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        jnz     LABEL(preloop)

LABEL(preskipa):
        pop     %rbx			/* restore callee-saved registers */
        pop     %r12
        pop     %r13
        pop     %r14


LABEL(preskip):
        sub     %r8, %rdx		/* bytes still to copy */
        test    $-64, %rdx		/* >= 64 left: non-temporal path */
        jnz     LABEL(preafter)

        and     $63, %edx		/* 0..63 left: finish via 32-path */
        jnz     LABEL(32)

        pop     %rax			/* restore return value (dst) */
        ret
484
        .p2align 4

LABEL(preafter):

LABEL(NTtry):

LABEL(NT):                               /* non-temporal, 128 bytes/iteration */
        mov     %rdx, %rcx
        shr     $7, %rcx		/* %rcx = number of 128-byte chunks */
        jz      LABEL(NTskip)

        push    %r14			/* callee-saved scratch for the */
        push    %r13			/* 8-quadword block moves below */
        push    %r12

       .p2align 4

LABEL(NTloop):
        prefetchnta 768 (%rsi)		/* prefetching NT here is not so good on B0 and C0 MP systems */
        prefetchnta 832 (%rsi)

        dec     %rcx

        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        mov     32 (%rsi), %r11
        mov     40 (%rsi), %r12
        mov     48 (%rsi), %r13
        mov     56 (%rsi), %r14

        movnti  %rax,    (%rdi)		/* non-temporal stores bypass the */
        movnti   %r8,  8 (%rdi)		/* cache for these one-shot writes */
        movnti   %r9, 16 (%rdi)
        movnti  %r10, 24 (%rdi)
        movnti  %r11, 32 (%rdi)
        movnti  %r12, 40 (%rdi)
        movnti  %r13, 48 (%rdi)
        movnti  %r14, 56 (%rdi)

        mov      64 (%rsi), %rax
        mov      72 (%rsi), %r8
        mov      80 (%rsi), %r9
        mov      88 (%rsi), %r10
        mov      96 (%rsi), %r11
        mov     104 (%rsi), %r12
        mov     112 (%rsi), %r13
        mov     120 (%rsi), %r14

        movnti  %rax,  64 (%rdi)
        movnti   %r8,  72 (%rdi)
        movnti   %r9,  80 (%rdi)
        movnti  %r10,  88 (%rdi)
        movnti  %r11,  96 (%rdi)
        movnti  %r12, 104 (%rdi)
        movnti  %r13, 112 (%rdi)
        movnti  %r14, 120 (%rdi)

        lea     128 (%rsi), %rsi
        lea     128 (%rdi), %rdi

        jnz     LABEL(NTloop)

        mfence				/* order the weakly-ordered movnti */
					/* stores before returning */

        pop     %r12
        pop     %r13
        pop     %r14

LABEL(NTskip):
        and     $127, %edx		/* 0..127 bytes remain */
        jnz     LABEL(32)

        pop     %rax			/* restore return value (dst) */
        ret

	SET_SIZE(memcpy)                   /* (void *, const void*, size_t) */
563
/*
 * Backward (high-to-low) copy for memmove when dst overlaps the tail of
 * src (dst > src and dst <= src + n - 1).  On entry %r9 holds the last
 * source byte address (src + n - 1), set up by the memmove entry above.
 * Uses the direction flag (std) with string moves; cld before returning.
 */
.CopyLeft:
	movq	%rdi,%rax		/ set up return value
	movq	$7,%r8			/ heavily used constant
	movq	%rdx,%rcx		/ put len into %rcx for rep
	std				/ reverse direction bit (RtoL)
	cmpq	$24,%rcx		/ if (size < 24)
	ja	.BigCopyLeft		/ {
	movq	%r9,%rsi		/     src = src + size - 1
	leaq	-1(%rcx,%rdi),%rdi	/     dst = dst + size - 1
	rep;	smovb			/    do the byte copy
	cld				/    reset direction flag to LtoR
	ret				/  return(dba);
.BigCopyLeft:				/ } else {
	xchgq	%r9,%rcx		/ %rcx = src + size - 1, %r9 = size
	movq	%rcx,%rsi		/ align source w/byte copy
	leaq	-1(%r9,%rdi),%rdi	/ dst = dst + size - 1
	andq	%r8,%rcx		/ %rcx = src-end misalignment (0..7)
	jz	.SkipAlignLeft
	addq	$1, %rcx		/ we need to insure that future
	subq	%rcx,%r9		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movq	%r9,%rcx		/ %rcx = remaining length
	subq	%r8,%rsi		/ point at base of topmost quadword
	shrq	$3,%rcx			/ do 8 byte copy RtoL
	subq	%r8,%rdi
	rep;	smovq
	andq	%r8,%r9		/ do 1 byte copy whats left
	jz	.CleanupReturnLeft
	movq	%r9,%rcx
	addq	%r8,%rsi		/ rep; smovq decremented %rdi, %rsi
	addq	%r8,%rdi		/ by eight after each copy;
					/ adding 7 restores the pointers to
					/ the byte just below the last
					/ quadword copied, which is where
					/ they are expected to be for the
					/ single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	ret				/ return(dba);
	SET_SIZE(memmove)
605