/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

279a70fc3bSMark J. Nelson	.file	"memset.s"
287c478bd9Sstevel@tonic-gate
297c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h>
307c478bd9Sstevel@tonic-gate
317c478bd9Sstevel@tonic-gate	ANSI_PRAGMA_WEAK(memset,function)
327c478bd9Sstevel@tonic-gate
337c478bd9Sstevel@tonic-gate	ENTRY(memset)
347c478bd9Sstevel@tonic-gate	pushl	%edi		/ save register variable
357c478bd9Sstevel@tonic-gate	movl	8(%esp),%edi	/ %edi = string address
367c478bd9Sstevel@tonic-gate	movl	12(%esp),%eax	/ %al = byte to duplicate
377c478bd9Sstevel@tonic-gate	movl	16(%esp),%ecx	/ %ecx = number of copies
387c478bd9Sstevel@tonic-gate
397c478bd9Sstevel@tonic-gate	/ For all basic blocks in this routine, maintain the following
407c478bd9Sstevel@tonic-gate	/ entry conditions:	%eax each byte is set to desired byte.
417c478bd9Sstevel@tonic-gate	/			NOTE: .byteset doesn't require this
427c478bd9Sstevel@tonic-gate	/			%ecx contains # bytes to set
437c478bd9Sstevel@tonic-gate	/			%edi contain address to set
447c478bd9Sstevel@tonic-gate
457c478bd9Sstevel@tonic-gate	cld			/ make sure we go the right way...
467c478bd9Sstevel@tonic-gate	cmpl	$20,%ecx	/ strings with fewer than 20 chars should be byte set
47*55fea89dSDan Cross	jbe	.byteset
487c478bd9Sstevel@tonic-gate
497c478bd9Sstevel@tonic-gate	andl	$0xff, %eax	/ trim anything above low byte
507c478bd9Sstevel@tonic-gate	imul	$0x01010101, %eax	/ extend low byte to each byte
51*55fea89dSDan Cross
527c478bd9Sstevel@tonic-gate	cmpl	$256, %ecx	/ smaller areas don't benefit from alignment
537c478bd9Sstevel@tonic-gate	jbe	.wordset
547c478bd9Sstevel@tonic-gate
557c478bd9Sstevel@tonic-gate	cmpl	$511, %ecx	/ areas smaller than this should be wordset
56*55fea89dSDan Cross	jbe	.check_wordset
577c478bd9Sstevel@tonic-gate
587c478bd9Sstevel@tonic-gate	/
597c478bd9Sstevel@tonic-gate	/ prep work for sse temporal and non-temporal
607c478bd9Sstevel@tonic-gate	/
617c478bd9Sstevel@tonic-gate
627c478bd9Sstevel@tonic-gate	pushl	%ebx		/ more registers are needed
637c478bd9Sstevel@tonic-gate	pushl	%esi		/ for alignment work
647c478bd9Sstevel@tonic-gate
657c478bd9Sstevel@tonic-gate	/
667c478bd9Sstevel@tonic-gate	/ align address to 64 byte boundaries.
677c478bd9Sstevel@tonic-gate	/
687c478bd9Sstevel@tonic-gate
697c478bd9Sstevel@tonic-gate	movl	%ecx, %ebx	/ save byte count
707c478bd9Sstevel@tonic-gate	movl	%edi, %esi	/ esi is scratch register
717c478bd9Sstevel@tonic-gate	andl	$63, %esi	/ bytes to align to 64 byte align addr
72*55fea89dSDan Cross	neg	%esi		/ compute count of bytes
737c478bd9Sstevel@tonic-gate	addl	$64, %esi	/ needed to align
747c478bd9Sstevel@tonic-gate	andl	$63, %esi	/ to 64 byte align addr
757c478bd9Sstevel@tonic-gate	jz	.sse_aligned	/ skip alignment if not needed
767c478bd9Sstevel@tonic-gate	subl	%esi, %ebx	/ ebx contains remainder of bytes to set
777c478bd9Sstevel@tonic-gate	movl	%esi, %ecx	/ alignment bytes
787c478bd9Sstevel@tonic-gate	shrl	$2,%ecx		/ %ecx = number of words to set
797c478bd9Sstevel@tonic-gate	rep; sstol
807c478bd9Sstevel@tonic-gate	movl	%esi,%ecx
817c478bd9Sstevel@tonic-gate	andl	$3,%ecx		/ %ecx = number of bytes left
827c478bd9Sstevel@tonic-gate	rep; sstob
837c478bd9Sstevel@tonic-gate	movl	%ebx, %ecx	/ remainder to be set
847c478bd9Sstevel@tonic-gate
857c478bd9Sstevel@tonic-gate.sse_aligned:
86*55fea89dSDan Cross
877c478bd9Sstevel@tonic-gate	shr	$6, %ecx	/ number of 64 byte blocks to set
887c478bd9Sstevel@tonic-gate
897c478bd9Sstevel@tonic-gate	/
907c478bd9Sstevel@tonic-gate	/ load xmm0 with bytes to be set
917c478bd9Sstevel@tonic-gate	/
927c478bd9Sstevel@tonic-gate	subl	$16,%esp	/ give ourselves some working room on the stack
937c478bd9Sstevel@tonic-gate	movl	%eax,(%esp)	/ copy eax into each of 4 bytes
947c478bd9Sstevel@tonic-gate	movl	%eax,4(%esp)	/ avoid pushl since it causes more interlocking
957c478bd9Sstevel@tonic-gate	movl	%eax,8(%esp)	/
967c478bd9Sstevel@tonic-gate	movl	%eax,12(%esp)	/
977c478bd9Sstevel@tonic-gate	movups	(%esp), %xmm0	/ unaligned load from stack into xmm0
987c478bd9Sstevel@tonic-gate	addl	$16,%esp	/ restore stack position
99*55fea89dSDan Cross
1007c478bd9Sstevel@tonic-gate	cmpl	$262143, %ebx	/ blocks smaller than this allocate in the cache
1017c478bd9Sstevel@tonic-gate	jbe	.sse_loop
1027c478bd9Sstevel@tonic-gate	jmp	.sse_nt_loop	/ branch across alignment nops
103*55fea89dSDan Cross
1047c478bd9Sstevel@tonic-gate	.align 16
1057c478bd9Sstevel@tonic-gate
106*55fea89dSDan Cross.sse_nt_loop:
1077c478bd9Sstevel@tonic-gate	movntps %xmm0, (%edi)	/ block non-temporal store
1087c478bd9Sstevel@tonic-gate	movntps %xmm0, 16(%edi)	/ use sse rather than sse2
1097c478bd9Sstevel@tonic-gate	movntps %xmm0, 32(%edi)	/ so we work more places
1107c478bd9Sstevel@tonic-gate	movntps %xmm0, 48(%edi)	/
1117c478bd9Sstevel@tonic-gate
1127c478bd9Sstevel@tonic-gate	addl	$64, %edi	/ increment dest address
1137c478bd9Sstevel@tonic-gate	dec	%ecx		/ dec count of blocks
1147c478bd9Sstevel@tonic-gate	jnz	.sse_nt_loop	/ jump if not done
1157c478bd9Sstevel@tonic-gate
1167c478bd9Sstevel@tonic-gate	andl	$63, %ebx	/ remainder of bytes to copy
1177c478bd9Sstevel@tonic-gate	movl	%ebx, %ecx	/ ecx contains remainer of bytes to set
1187c478bd9Sstevel@tonic-gate	popl	%esi		/ restore stack config
1197c478bd9Sstevel@tonic-gate	popl	%ebx		/
1207c478bd9Sstevel@tonic-gate#if defined(_SSE2_INSN)
1217c478bd9Sstevel@tonic-gate	mfence
1227c478bd9Sstevel@tonic-gate#elif defined(_SSE_INSN)
1237c478bd9Sstevel@tonic-gate	sfence
1247c478bd9Sstevel@tonic-gate#else
1257c478bd9Sstevel@tonic-gate#error "Must have either SSE or SSE2"
1267c478bd9Sstevel@tonic-gate#endif
1277c478bd9Sstevel@tonic-gate	cmpl	$20, %ecx	/ compare and jump accordingly
1287c478bd9Sstevel@tonic-gate	jbe	.byteset
129*55fea89dSDan Cross	jmp	.wordset
1307c478bd9Sstevel@tonic-gate
1317c478bd9Sstevel@tonic-gate	.align 16
1327c478bd9Sstevel@tonic-gate.sse_loop:
1337c478bd9Sstevel@tonic-gate 	movaps %xmm0, (%edi)	/ block copy w/ SSE
1347c478bd9Sstevel@tonic-gate	movaps %xmm0, 16(%edi)
1357c478bd9Sstevel@tonic-gate	movaps %xmm0, 32(%edi)
1367c478bd9Sstevel@tonic-gate	movaps %xmm0, 48(%edi)
1377c478bd9Sstevel@tonic-gate
1387c478bd9Sstevel@tonic-gate	addl	$64, %edi	/ increment addr
1397c478bd9Sstevel@tonic-gate	dec	%ecx		/ dec count of blocks
1407c478bd9Sstevel@tonic-gate	jnz	.sse_loop	/ jump if not done
1417c478bd9Sstevel@tonic-gate
1427c478bd9Sstevel@tonic-gate	andl	$63, %ebx	/ remainder of bytes to copy
1437c478bd9Sstevel@tonic-gate	movl	%ebx, %ecx	/ in %ecx as normal
1447c478bd9Sstevel@tonic-gate	popl	%esi		/ restore stack config
1457c478bd9Sstevel@tonic-gate	popl	%ebx		/
146*55fea89dSDan Cross	cmpl	$20, %ecx
1477c478bd9Sstevel@tonic-gate	jbe	.byteset
1487c478bd9Sstevel@tonic-gate	jmp	.wordset
1497c478bd9Sstevel@tonic-gate
1507c478bd9Sstevel@tonic-gate.check_wordset:
1517c478bd9Sstevel@tonic-gate	movl	%edi, %edx	/ save current store ptr
1527c478bd9Sstevel@tonic-gate	andl	$7, %edi	/ check alignment
1537c478bd9Sstevel@tonic-gate	movl	%edx,%edi	/ %edi = string address
154*55fea89dSDan Cross	jz	.wordset	/ all ok
1557c478bd9Sstevel@tonic-gate
156*55fea89dSDan Cross
157*55fea89dSDan Cross.align_wordset:
1587c478bd9Sstevel@tonic-gate	pushl	%ebx		/ more registers are needed
159*55fea89dSDan Cross	pushl	%esi
1607c478bd9Sstevel@tonic-gate
1617c478bd9Sstevel@tonic-gate	movl	%ecx, %ebx
1627c478bd9Sstevel@tonic-gate	movl	%edi, %esi
1637c478bd9Sstevel@tonic-gate	andl	$7, %esi
1647c478bd9Sstevel@tonic-gate	neg	%esi
1657c478bd9Sstevel@tonic-gate	addl	$8, %esi
1667c478bd9Sstevel@tonic-gate	andl	$7, %esi
1677c478bd9Sstevel@tonic-gate	subl	%esi, %ebx	/ ebx contains remainder of bytes to copy
1687c478bd9Sstevel@tonic-gate	movl	%esi, %ecx
169*55fea89dSDan Cross	rep; sstob
1707c478bd9Sstevel@tonic-gate	movl	%ebx, %ecx
1717c478bd9Sstevel@tonic-gate	popl	%esi		/ restore stack config
1727c478bd9Sstevel@tonic-gate	popl	%ebx		/
1737c478bd9Sstevel@tonic-gate
1747c478bd9Sstevel@tonic-gate.wordset:
1757c478bd9Sstevel@tonic-gate	movl	%ecx, %edx	/ save cont
1767c478bd9Sstevel@tonic-gate	shrl	$2,%ecx		/ %ecx = number of words to set
1777c478bd9Sstevel@tonic-gate	rep; sstol
1787c478bd9Sstevel@tonic-gate	movl	%edx,%ecx
1797c478bd9Sstevel@tonic-gate	andl	$3,%ecx		/ %ecx = number of bytes left
1807c478bd9Sstevel@tonic-gate
1817c478bd9Sstevel@tonic-gate.byteset:
1827c478bd9Sstevel@tonic-gate	rep; sstob
1837c478bd9Sstevel@tonic-gate	movl	8(%esp),%eax	/ return string address
1847c478bd9Sstevel@tonic-gate	popl	%edi		/ restore register variable
1857c478bd9Sstevel@tonic-gate	ret
1867c478bd9Sstevel@tonic-gate	SET_SIZE(memset)
187