1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27	.file	"memset.s"
28
29#include <sys/asm_linkage.h>
30
31	ANSI_PRAGMA_WEAK(memset,function)
32
33	ENTRY(memset)
34	pushl	%edi		/ save register variable
35	movl	8(%esp),%edi	/ %edi = string address
36	movl	12(%esp),%eax	/ %al = byte to duplicate
37	movl	16(%esp),%ecx	/ %ecx = number of copies
38
39	/ For all basic blocks in this routine, maintain the following
40	/ entry conditions:	%eax each byte is set to desired byte.
41	/			NOTE: .byteset doesn't require this
42	/			%ecx contains # bytes to set
43	/			%edi contain address to set
44
45	cld			/ make sure we go the right way...
46	cmpl	$20,%ecx	/ strings with fewer than 20 chars should be byte set
47	jbe	.byteset
48
49	andl	$0xff, %eax	/ trim anything above low byte
50	imul	$0x01010101, %eax	/ extend low byte to each byte
51
52	cmpl	$256, %ecx	/ smaller areas don't benefit from alignment
53	jbe	.wordset
54
55	cmpl	$511, %ecx	/ areas smaller than this should be wordset
56	jbe	.check_wordset
57
58	/
59	/ prep work for sse temporal and non-temporal
60	/
61
62	pushl	%ebx		/ more registers are needed
63	pushl	%esi		/ for alignment work
64
65	/
66	/ align address to 64 byte boundaries.
67	/
68
69	movl	%ecx, %ebx	/ save byte count
70	movl	%edi, %esi	/ esi is scratch register
71	andl	$63, %esi	/ bytes to align to 64 byte align addr
72	neg	%esi		/ compute count of bytes
73	addl	$64, %esi	/ needed to align
74	andl	$63, %esi	/ to 64 byte align addr
75	jz	.sse_aligned	/ skip alignment if not needed
76	subl	%esi, %ebx	/ ebx contains remainder of bytes to set
77	movl	%esi, %ecx	/ alignment bytes
78	shrl	$2,%ecx		/ %ecx = number of words to set
79	rep; sstol
80	movl	%esi,%ecx
81	andl	$3,%ecx		/ %ecx = number of bytes left
82	rep; sstob
83	movl	%ebx, %ecx	/ remainder to be set
84
85.sse_aligned:
86
87	shr	$6, %ecx	/ number of 64 byte blocks to set
88
89	/
90	/ load xmm0 with bytes to be set
91	/
92	subl	$16,%esp	/ give ourselves some working room on the stack
93	movl	%eax,(%esp)	/ copy eax into each of 4 bytes
94	movl	%eax,4(%esp)	/ avoid pushl since it causes more interlocking
95	movl	%eax,8(%esp)	/
96	movl	%eax,12(%esp)	/
97	movups	(%esp), %xmm0	/ unaligned load from stack into xmm0
98	addl	$16,%esp	/ restore stack position
99
100	cmpl	$262143, %ebx	/ blocks smaller than this allocate in the cache
101	jbe	.sse_loop
102	jmp	.sse_nt_loop	/ branch across alignment nops
103
104	.align 16
105
106.sse_nt_loop:
107	movntps %xmm0, (%edi)	/ block non-temporal store
108	movntps %xmm0, 16(%edi)	/ use sse rather than sse2
109	movntps %xmm0, 32(%edi)	/ so we work more places
110	movntps %xmm0, 48(%edi)	/
111
112	addl	$64, %edi	/ increment dest address
113	dec	%ecx		/ dec count of blocks
114	jnz	.sse_nt_loop	/ jump if not done
115
116	andl	$63, %ebx	/ remainder of bytes to copy
117	movl	%ebx, %ecx	/ ecx contains remainer of bytes to set
118	popl	%esi		/ restore stack config
119	popl	%ebx		/
120#if defined(_SSE2_INSN)
121	mfence
122#elif defined(_SSE_INSN)
123	sfence
124#else
125#error "Must have either SSE or SSE2"
126#endif
127	cmpl	$20, %ecx	/ compare and jump accordingly
128	jbe	.byteset
129	jmp	.wordset
130
131	.align 16
132.sse_loop:
133 	movaps %xmm0, (%edi)	/ block copy w/ SSE
134	movaps %xmm0, 16(%edi)
135	movaps %xmm0, 32(%edi)
136	movaps %xmm0, 48(%edi)
137
138	addl	$64, %edi	/ increment addr
139	dec	%ecx		/ dec count of blocks
140	jnz	.sse_loop	/ jump if not done
141
142	andl	$63, %ebx	/ remainder of bytes to copy
143	movl	%ebx, %ecx	/ in %ecx as normal
144	popl	%esi		/ restore stack config
145	popl	%ebx		/
146	cmpl	$20, %ecx
147	jbe	.byteset
148	jmp	.wordset
149
150.check_wordset:
151	movl	%edi, %edx	/ save current store ptr
152	andl	$7, %edi	/ check alignment
153	movl	%edx,%edi	/ %edi = string address
154	jz	.wordset	/ all ok
155
156
157.align_wordset:
158	pushl	%ebx		/ more registers are needed
159	pushl	%esi
160
161	movl	%ecx, %ebx
162	movl	%edi, %esi
163	andl	$7, %esi
164	neg	%esi
165	addl	$8, %esi
166	andl	$7, %esi
167	subl	%esi, %ebx	/ ebx contains remainder of bytes to copy
168	movl	%esi, %ecx
169	rep; sstob
170	movl	%ebx, %ecx
171	popl	%esi		/ restore stack config
172	popl	%ebx		/
173
174.wordset:
175	movl	%ecx, %edx	/ save cont
176	shrl	$2,%ecx		/ %ecx = number of words to set
177	rep; sstol
178	movl	%edx,%ecx
179	andl	$3,%ecx		/ %ecx = number of bytes left
180
181.byteset:
182	rep; sstob
183	movl	8(%esp),%eax	/ return string address
184	popl	%edi		/ restore register variable
185	ret
186	SET_SIZE(memset)
187