279a70fc3bSMark J. Nelson	.file	"memcpy.s"
297c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
317c478bd9Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memmove,function)
327c478bd9Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memcpy,function)
/
/ void *memmove(void *dst, const void *src, size_t n)
/
/ Arguments are read from the stack: dst at 4(%esp), src at 8(%esp) and
/ n at 12(%esp) on entry.  %esi and %edi are pushed here and are popped
/ again by whichever copy path completes the request.  %eax is loaded
/ with dst, the return value.
/
/ Only the copy direction is decided here:
/	n == 0				-> .Return, nothing to copy
/	dst <= src			-> .memcpy_post, forward copy
/	dst >  src + n - 1		-> .memcpy_post, forward copy
/	src <  dst <= src + n - 1	-> .CopyLeft, backward copy
/
	ENTRY(memmove)
	movl	0+12(%esp),%ecx		/ %ecx = n, read before the pushes
	pushl	%esi			/ save %esi and %edi
	pushl	%edi
	movl	8+4(%esp),%edi		/ %edi = dst, past 8 saved bytes
	movl	8+8(%esp),%esi		/ %esi = src
	movl	%edi,%eax		/ %eax = dst, the return value
	testl	%ecx,%ecx
	jz	.Return			/ n == 0

	cmpl	%esi,%edi		/ compare dst with src
	leal	-1(%esi,%ecx),%edx	/ %edx = src + n - 1, flags untouched
	jbe	.memcpy_post		/ jump if dst <= src
	cmpl	%edx,%edi
	jbe	.CopyLeft		/ jump if dst <= src + n - 1
	jmp	.memcpy_post		/ buffers do not overlap
/
/ void *memcpy(void *dst, const void *src, size_t n)
/
/ Forward copy, returns dst in %eax.  memmove joins at .memcpy_post with
/ the register state that the prologue below sets up:
/	%edi = dst, %esi = src, %ecx = n, %eax = dst,
/	caller %esi and %edi saved on the stack.
/
/ n <= 63: rep smovl for the 4 byte words, then up to three single
/ byte moves (.movew).
/ n >= 64: dst is first brought to a 16 byte boundary with rep smovb,
/ then 64 bytes are moved per iteration through %xmm0-%xmm3.  Four
/ loops exist, chosen by source alignment (movaps or movups loads) and
/ by size (above 65535 bytes the stores are movntps and are followed by
/ a fence).  Bytes left over after the 64 byte blocks go back through
/ .movew.  If aligning dst would leave fewer than 64 bytes, the whole
/ request is handled by .movew instead.
/
	ENTRY(memcpy)
	pushl	%esi
	pushl	%edi

	movl	8+4(%esp),%edi	/ %edi = dest address
	movl	%edi, %eax	/ %eax = return value
	movl	8+8(%esp),%esi	/ %esi = source address
	movl	8+12(%esp),%ecx	/ %ecx = byte count
				/ %edx scratch register
				/ %eax is reused as scratch on the SSE path
.memcpy_post:
	nop			/ kept from the original: said to help,
				/ reason unknown
				/ note:	cld is perf death on P4
	cmpl	$63,%ecx
	ja	.move_sse	/ not worth doing sse for less

.movew:
	movl	%ecx,%edx	/ save byte cnt
	shrl	$2,%ecx		/ %ecx = number of words to move
	rep ; smovl		/ move the words

	andl	$0x3,%edx	/ %edx = number of bytes left to move
	jz	.Return		/ %edx <= 3, so just unroll the loop

	movb	(%esi), %cl
	movb	%cl, (%edi)
	decl	%edx
	jz	.Return
	movb	1(%esi), %cl
	movb	%cl, 1(%edi)
	decl	%edx
	jz	.Return
	movb	2(%esi), %cl
	movb	%cl, 2(%edi)

.Return:
	popl	%edi		/ restore register variables
	popl	%esi
	ret

.move_sse:
	/
	/ time to 16 byte align destination
	/
	andl	$15, %eax	/ %eax = dst & 15
	jnz	.sse_unaligned	/ jmp if dest is unaligned
.sse:				/ dest is aligned, check source
	movl	%ecx, %edx	/ get byte count
	shrl	$6, %edx	/ number of 64 byte blocks to move
	testl	$15, %esi
	jnz	.sse_da		/ go to slow loop if source is unaligned
	cmpl	$65535, %ecx
	ja	.sse_sa_nt_loop	/ more than 65535 bytes: non-temporal stores

	/
	/ source and dest both aligned: aligned loads, aligned stores
	/
.sse_sa_loop:
	prefetcht0 568(%esi)	/ prefetch source
	prefetcht0 568(%edi)	/ prefetch destination
	movaps	0(%esi), %xmm0	/ copy 64 bytes at a time
	movaps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_loop

.sse_cleanup:
	andl	$63, %ecx	/ compute remaining bytes
	movl	8+4(%esp), %eax	/ setup return value, movl keeps the flags
	jz	.Return		/ tests the andl result
	jmp	.movew

	/
	/ source and dest both aligned, large copy:
	/ aligned loads, non-temporal stores
	/
	.align 16
.sse_sa_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movaps	(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movaps	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movaps	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movaps	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_sa_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ Make certain that destination buffer becomes aligned
	/
.sse_unaligned:
	neg	%eax		/ subtract from 16 and get destination
	andl	$15, %eax	/ aligned on a 16 byte boundary
	movl	%ecx, %edx	/ saved count
	subl	%eax, %ecx	/ subtract from byte count
	cmpl	$64, %ecx	/ after aligning, will we still have 64 bytes?
	cmovb	%edx, %ecx	/ if not, restore original byte count,
	cmovb	8+4(%esp), %eax	/ and restore return value,
	jb	.movew		/ and do a non-SSE move.
	xchg	%ecx, %eax	/ flip for copy
	rep ; smovb		/ move the bytes
	xchg	%ecx, %eax	/ flip back, %ecx = bytes still to copy
	jmp	.sse

	.align 16
.sse_da:
	cmpl	$65535, %ecx
	jbe	.sse_da_loop	/ 65535 bytes or fewer: ordinary stores

	/
	/ source unaligned, large copy:
	/ unaligned loads, non-temporal stores
	/
.sse_da_nt_loop:
	prefetchnta 16384(%esi)	/ prefetch source & copy 64 byte at a time
	movups	0(%esi), %xmm0
	movntps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movntps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movntps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movntps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_nt_loop
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	jmp	.sse_cleanup

	/
	/ source unaligned: unaligned loads, aligned stores
	/
	.align	16
.sse_da_loop:
	prefetcht0 568(%esi)	/ prefetch source
	prefetcht0 568(%edi)	/ prefetch destination
	movups	0(%esi), %xmm0	/ copy 64 bytes at a time
	movaps	%xmm0, 0(%edi)
	movups	16(%esi), %xmm1
	movaps	%xmm1, 16(%edi)
	movups	32(%esi), %xmm2
	movaps	%xmm2, 32(%edi)
	movups	48(%esi), %xmm3
	movaps	%xmm3, 48(%edi)
	addl	$64, %esi
	addl	$64, %edi
	decl	%edx
	jnz	.sse_da_loop
	jmp	.sse_cleanup

	SET_SIZE(memcpy)
/ .CopyLeft handles the memmove case where we must perform the copy backwards,
/ because of overlap between src and dst. This is not particularly optimized.
/
/ State on entry, as left by the memmove prologue:
/	%edi = dst, %esi = src, %ecx = n, %edx = src + n - 1,
/	caller %esi and %edi saved on the stack.
/ The direction flag is set for the duration of the copy and cleared
/ again before returning.  The return value is reloaded from the dst
/ argument once the saved registers have been popped.

.CopyLeft:
	movl	$3,%eax			/ heavily used constant
	std				/ reverse direction bit (RtoL)
	cmpl	$12,%ecx		/ if (size <= 12)
	ja	.BigCopyLeft		/ {
	movl	%edx,%esi		/     src = src + size - 1
	leal	-1(%ecx,%edi),%edi	/     dst = dst + size - 1
	rep;	smovb			/    do the byte copy
	cld				/    reset direction flag to LtoR
	popl	%edi			/  }
	popl	%esi			/  restore registers
	movl	4(%esp),%eax		/  set up return value
	ret				/  return(dba);
.BigCopyLeft:				/ } else {
	xchgl	%edx,%ecx		/ %ecx = src + size - 1, %edx = size
	movl	%ecx,%esi		/ align source w/byte copy
	leal	-1(%edx,%edi),%edi	/ dst = dst + size - 1
	andl	%eax,%ecx		/ %ecx = low two bits of last src address
	jz	.SkipAlignLeft
	/ NOTE(review): with low bits 00 the byte copy is skipped and the
	/ word copy below then reads from addresses ending in binary 01.
	/ Behaviour kept as found.
	addl	$1, %ecx		/ we need to insure that future
	subl	%ecx,%edx		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movl	%edx,%ecx		/ %ecx = bytes still to copy
	subl	%eax,%esi		/ point at the start of the last word
	shrl	$2,%ecx			/ do 4 byte copy RtoL
	subl	%eax,%edi
	rep;	smovl
	andl	%eax,%edx		/ do 1 byte copy whats left
	jz	.CleanupReturnLeft
	movl	%edx,%ecx
	addl	%eax,%esi		/ rep; smovl instruction will decrement
	addl	%eax,%edi		/ %edi, %esi by four after each copy
					/ adding 3 will restore pointers to byte
					/ before last double word copied
					/ which is where they are expected to
					/ be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	popl	%edi
	popl	%esi			/ restore registers
	movl	4(%esp),%eax		/ set up return value
	ret				/ return(dba);
	SET_SIZE(memmove)