1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26	.file	"memcpy.s"
27
28/*
29 * memcpy(s1, s2, len)
30 *
31 * Copy s2 to s1, always copy n bytes.
32 * Note: this C code does not work for overlapped copies.
33 *       Memmove() and bcopy() do.
34 *
35 * Fast assembler language version of the following C-program for memcpy
36 * which represents the `standard' for the C-library.
37 *
38 *	void *
39 *	memcpy(void *s, const void *s0, size_t n)
40 *	{
41 *		if (n != 0) {
42 *	   	    char *s1 = s;
43 *		    const char *s2 = s0;
44 *		    do {
45 *			*s1++ = *s2++;
46 *		    } while (--n != 0);
47 *		}
48 *		return (s);
49 *	}
50 */
51
52#include <sys/asm_linkage.h>
53#include <sys/sun4asi.h>
54#include <sys/trap.h>
55
56#define	ICACHE_LINE_SIZE	64
57#define	BLOCK_SIZE		64
58#define	FPRS_FEF		0x4
59
60#define	ALIGNED8_FPCOPY_THRESHOLD	1024
61#define	ALIGNED4_FPCOPY_THRESHOLD	1024
62#define	BST_THRESHOLD			65536
63
64#define	SHORTCOPY	3
65#define	SMALL_MAX	64
66#define	MEDIUM_MAX	255
67#define	MED_WMAX	256	/* max copy for medium word-aligned case */
68
69#define	N_READS_STRONG	20
70#define	N_WRITES_STRONG	22
71
72
73	ANSI_PRAGMA_WEAK(memmove,function)
74	ANSI_PRAGMA_WEAK(memcpy,function)
75
76	ENTRY(memmove)
77	prefetch [%o1], N_READS_STRONG
78	prefetch [%o0], N_WRITES_STRONG
79	cmp	%o1, %o0	! if from address is >= to use forward copy
80	bgeu	%ncc, .forcpy	! else use backward if ...
81	sub	%o0, %o1, %o4	! get difference of two addresses
82	cmp	%o2, %o4	! compare size and difference of addresses
83	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
84	nop
85
86	!
87	! an overlapped copy that must be done "backwards"
88	!
89.ovbc:
90	mov	%o0, %g1		! save dest address for return val
91	add     %o1, %o2, %o1           ! get to end of source space
92	add     %o0, %o2, %o0           ! get to end of destination space
93
94	cmp	%o2, 64
95	bgeu,pn	%ncc, .dbalign
96	nop
97	cmp	%o2, 4
98	blt,pn	%ncc, .byte
99	sub	%o2, 3, %o2
100.byte4loop:
101	ldub	[%o1-1], %o3		! load last byte
102	stb	%o3, [%o0-1]		! store last byte
103	sub	%o1, 4, %o1
104	ldub	[%o1+2], %o3		! load 2nd from last byte
105	stb	%o3, [%o0-2]		! store 2nd from last byte
106	sub	%o0, 4, %o0
107	ldub	[%o1+1], %o3		! load 3rd from last byte
108	stb	%o3, [%o0+1]		! store 3rd from last byte
109	subcc	%o2, 4, %o2
110	ldub	[%o1], %o3		! load 4th from last byte
111	bgu,pt	%ncc, .byte4loop
112	stb	%o3, [%o0]		! store 4th from last byte
113.byte:
114	addcc	%o2, 3, %o2
115	bz,pt	%ncc, .exit
116.byteloop:
117	dec	%o1			! decrement src address
118	ldub	[%o1], %o3		! read a byte
119	dec	%o0			! decrement dst address
120	deccc	%o2			! decrement count
121	bgu,pt	%ncc, .byteloop		! loop until done
122	stb	%o3, [%o0]		! write byte
123.exit:
124	retl
125	mov	%g1, %o0
126
127	.align	16
128.dbalign:
129	prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
130	prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
131	andcc   %o0, 7, %o5		! bytes till DST 8 byte aligned
132	bz,pt	%ncc, .dbmed
133	sub	%o2, %o5, %o2		! update count
134.dbalign1:
135	dec	%o1			! decrement src address
136	ldub	[%o1], %o3		! read a byte
137	dec	%o0			! decrement dst address
138	deccc	%o5			! decrement count
139	bgu,pt	%ncc, .dbalign1		! loop until done
140	stb	%o3, [%o0]		! store a byte
141
142! check for src long word alignment
143.dbmed:
144	andcc	%o1, 7, %g0		! chk src long word alignment
145	bnz,pn	%ncc, .dbbck
146	nop
147!
148! Following code is for overlapping copies where src and dest
149! are long word aligned
150!
151!
152! For SPARC64-VI, prefetch is effective for both integer and fp register
153! operations. There are no benefits in using the fp registers for
154! aligned data copying.
155
156.dbmedl32enter:
157	subcc	%o2, 31, %o2		! adjust length to allow cc test
158					! for end of loop
159	ble,pt  %ncc, .dbmedl31		! skip big loop if less than 32
160	nop
161.dbmedl32:
162	ldx	[%o1-8], %o4		! load
163	prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
164	subcc	%o2, 32, %o2		! decrement length count
165	stx	%o4, [%o0-8]		! and store
166	prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
167	ldx	[%o1-16], %o3		! a block of 32 bytes
168	sub	%o1, 32, %o1		! decrease src ptr by 32
169	stx	%o3, [%o0-16]
170	ldx	[%o1+8], %o4
171	sub	%o0, 32, %o0		! decrease dst ptr by 32
172	stx	%o4, [%o0+8]
173	ldx	[%o1], %o3
174	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
175	stx	%o3, [%o0]
176.dbmedl31:
177	addcc	%o2, 16, %o2		! adjust remaining count
178	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
179	nop				!
180	ldx	[%o1-8], %o4		! load and store 16 bytes
181	sub	%o1, 16, %o1		! decrease src ptr by 16
182	stx	%o4, [%o0-8]		!
183	sub	%o2, 16, %o2		! decrease count by 16
184	ldx	[%o1], %o3		!
185	sub	%o0, 16, %o0		! decrease dst ptr by 16
186	stx	%o3, [%o0]
187.dbmedl15:
188	addcc	%o2, 15, %o2		! restore count
189	bz,pt	%ncc, .dbexit		! exit if finished
190	nop
191	cmp	%o2, 8
192	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
193	nop
194	ldx	[%o1-8], %o4		! load 8 bytes
195	sub	%o1, 8, %o1		! decrease src ptr by 8
196	stx	%o4, [%o0-8]		! and store 8 bytes
197	subcc	%o2, 8, %o2		! decrease count by 8
198	bnz	%ncc, .dbremain		! exit if finished
199	sub	%o0, 8, %o0		! decrease dst ptr by 8
200	retl
201	mov	%g1, %o0
202
203!
204! Following code is for overlapping copies where src and dest
205! are not long word aligned
206!
207	.align	16
208.dbbck:
209	rd	%fprs, %o3		! o3 = fprs
210
211	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
212	! So set it anyway, without checking.
213	wr	%g0, 0x4, %fprs		! fprs.fef = 1
214
215	alignaddr %o1, %g0, %o5		! align src
216	ldd	[%o5], %d0		! get first 8 byte block
217	andn	%o2, 7, %o4		! prepare src ptr for finishup code
218	cmp	%o2, 32
219	blt,pn	%ncc, .dbmv8
220	sub	%o1, %o4, %o1		!
221	cmp	%o2, 4095		! check for short memmoves
222	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
223.dbmv64:
224	ldd	[%o5-8], %d2		! load 8 bytes
225	ldd	[%o5-16], %d4		! load 8 bytes
226	sub	%o5, 64, %o5		!
227	ldd	[%o5+40], %d6		! load 8 bytes
228	sub	%o0, 64, %o0		!
229	ldd	[%o5+32], %d8		! load 8 bytes
230	sub	%o2, 64, %o2		! 64 less bytes to copy
231	ldd	[%o5+24], %d18		! load 8 bytes
232	cmp	%o2, 64			! do we have < 64 bytes remaining
233	ldd	[%o5+16], %d28		! load 8 bytes
234	ldd	[%o5+8], %d30		! load 8 bytes
235	faligndata %d2, %d0, %d10	! extract 8 bytes out
236	prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
237	ldd	[%o5], %d0		! load 8 bytes
238	std	%d10, [%o0+56]		! store the current 8 bytes
239	faligndata %d4, %d2, %d12	! extract 8 bytes out
240	prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
241	std	%d12, [%o0+48]		! store the current 8 bytes
242	faligndata %d6, %d4, %d14	! extract 8 bytes out
243	std	%d14, [%o0+40]		! store the current 8 bytes
244	faligndata %d8, %d6, %d16	! extract 8 bytes out
245	std	%d16, [%o0+32]		! store the current 8 bytes
246	faligndata %d18, %d8, %d20	! extract 8 bytes out
247	std	%d20, [%o0+24]		! store the current 8 bytes
248	faligndata %d28, %d18, %d22	! extract 8 bytes out
249	std	%d22, [%o0+16]		! store the current 8 bytes
250	faligndata %d30, %d28, %d24	! extract 8 bytes out
251	std	%d24, [%o0+8]		! store the current 8 bytes
252	faligndata %d0, %d30, %d26	! extract 8 bytes out
253	bgeu,pt	%ncc, .dbmv64
254	std	%d26, [%o0]		! store the current 8 bytes
255
256	cmp	%o2, 32
257	blt,pn	%ncc, .dbmvx
258	nop
259.dbmv32:
260	ldd	[%o5-8], %d2		! load 8 bytes
261.dbmv32enter:
262	ldd	[%o5-16], %d4		! load 8 bytes
263	sub	%o5, 32, %o5		!
264	ldd	[%o5+8], %d6		! load 8 bytes
265	sub	%o0, 32, %o0		!
266	faligndata %d2, %d0, %d10	! extract 8 bytes out
267	ldd	[%o5], %d0		! load 8 bytes
268	sub	%o2,32, %o2		! 32 less bytes to copy
269	std	%d10, [%o0+24]		! store the current 8 bytes
270	cmp	%o2, 32			! do we have < 32 bytes remaining
271	faligndata %d4, %d2, %d12	! extract 8 bytes out
272	std	%d12, [%o0+16]		! store the current 8 bytes
273	faligndata %d6, %d4, %d14	! extract 8 bytes out
274	std	%d14, [%o0+8]		! store the current 8 bytes
275	faligndata %d0, %d6, %d16	! extract 8 bytes out
276	bgeu,pt	%ncc, .dbmv32
277	std	%d16, [%o0]		! store the current 8 bytes
278.dbmvx:
279	cmp	%o2, 8			! do we have < 8 bytes remaining
280	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
281	nop
282.dbmv8:
283	ldd	[%o5-8], %d2
284	sub	%o0, 8, %o0		! since we are at the end
285					! when we first enter the loop
286	sub	%o2, 8, %o2		! 8 less bytes to copy
287	sub	%o5, 8, %o5
288	cmp	%o2, 8			! do we have < 8 bytes remaining
289	faligndata %d2, %d0, %d8	! extract 8 bytes out
290	std	%d8, [%o0]		! store the current 8 bytes
291	bgeu,pt	%ncc, .dbmv8
292	fmovd	%d2, %d0
293.dbmvfinish:
294	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
295	tst	%o2
296	bz,pt	%ncc, .dbexit
297	wr	%o3, %g0, %fprs		! fprs = o3   restore fprs
298
299.dbremain:
300	cmp	%o2, 4
301	blt,pn	%ncc, .dbbyte
302	nop
303	ldub	[%o1-1], %o3		! load last byte
304	stb	%o3, [%o0-1]		! store last byte
305	sub	%o1, 4, %o1
306	ldub	[%o1+2], %o3		! load 2nd from last byte
307	stb	%o3, [%o0-2]		! store 2nd from last byte
308	sub	%o0, 4, %o0
309	ldub	[%o1+1], %o3		! load 3rd from last byte
310	stb	%o3, [%o0+1]		! store 3rd from last byte
311	subcc	%o2, 4, %o2
312	ldub	[%o1], %o3		! load 4th from last byte
313	stb	%o3, [%o0]		! store 4th from last byte
314	bz,pt	%ncc, .dbexit
315.dbbyte:
316	dec	%o1			! decrement src address
317	ldub	[%o1], %o3		! read a byte
318	dec	%o0			! decrement dst address
319	deccc	%o2			! decrement count
320	bgu,pt	%ncc, .dbbyte		! loop until done
321	stb	%o3, [%o0]		! write byte
322.dbexit:
323	retl
324	mov	%g1, %o0
325	SET_SIZE(memmove)
326
327
328	.align ICACHE_LINE_SIZE
329	ENTRY(memcpy)
330					! adjust instruction alignment
331	nop				! Do not remove, these nops affect
332	nop				! icache alignment and performance
333.forcpy:
334	prefetch [%o1], N_READS_STRONG
335	prefetch [%o0], N_WRITES_STRONG
336	cmp	%o2, SMALL_MAX		! check for not small case
337	bgu,pn	%ncc, .medium		! go to larger cases
338	mov	%o0, %g1		! save %o0
339	cmp	%o2, SHORTCOPY		! check for really short case
340	ble,pt	%ncc, .smallleft	!
341	or	%o0, %o1, %o3		! prepare alignment check
342	andcc	%o3, 0x3, %g0		! test for alignment
343	bz,pt	%ncc, .smallword	! branch to word aligned case
344	sub	%o2, 3, %o2		! adjust count to allow cc zero test
345.smallnotalign4:
346	ldub	[%o1], %o3		! read byte
347	subcc	%o2, 4, %o2		! reduce count by 4
348	stb	%o3, [%o0]		! write byte
349	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
350	add	%o1, 4, %o1		! advance SRC by 4
351	stb	%o3, [%o0+1]
352	ldub	[%o1-2], %o3
353	add	%o0, 4, %o0		! advance DST by 4
354	stb	%o3, [%o0-2]
355	ldub	[%o1-1], %o3
356	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
357	stb	%o3, [%o0-1]
358	add	%o2, 3, %o2		! restore count
359.smallleft:
360	tst	%o2
361	bz,pt	%ncc, .smallexit
362	nop
363.smallleft3:				! 1, 2, or 3 bytes remain
364	ldub	[%o1], %o3		! load one byte
365	deccc	%o2			! reduce count for cc test
366	bz,pt	%ncc, .smallexit
367	stb	%o3, [%o0]		! store one byte
368	ldub	[%o1+1], %o3		! load second byte
369	deccc	%o2
370	bz,pt	%ncc, .smallexit
371	stb	%o3, [%o0+1]		! store second byte
372	ldub	[%o1+2], %o3		! load third byte
373	stb	%o3, [%o0+2]		! store third byte
374	retl
375	mov	%g1, %o0		! restore %o0
376
377	.align	16
378	nop				! affects loop icache alignment
379.smallwords:
380	lduw	[%o1], %o3		! read word
381.smallwordx:
382	subcc	%o2, 8, %o2		! update count
383	stw	%o3, [%o0]		! write word
384	add	%o1, 8, %o1		! update SRC
385	lduw	[%o1-4], %o3		! read word
386	add	%o0, 8, %o0		! update DST
387	bgu,pt	%ncc, .smallwords	! loop until done
388	stw	%o3, [%o0-4]		! write word
389	addcc	%o2, 7, %o2		! restore count
390	bz,pt	%ncc, .smallexit	! check for completion
391	nop
392	cmp	%o2, 4			! check for 4 or more bytes left
393	blt	.smallleft3		! if not, go to finish up
394	nop
395	lduw	[%o1], %o3
396	add	%o1, 4, %o1
397	subcc	%o2, 4, %o2
398	stw	%o3, [%o0]
399	add	%o0, 4, %o0
400	bnz,pt	%ncc, .smallleft3
401	nop
402	retl
403	mov	%g1, %o0		! restore %o0
404
405.smallword:
406	subcc	%o2, 4, %o2		! update count
407	bgu,pt	%ncc, .smallwordx
408	lduw	[%o1], %o3		! read word
409	addcc	%o2, 3, %o2		! restore count
410	bz,pt	%ncc, .smallexit
411	stw	%o3, [%o0]		! write word
412	deccc	%o2			! reduce count for cc test
413	ldub	[%o1+4], %o3		! load one byte
414	bz,pt	%ncc, .smallexit
415	stb	%o3, [%o0+4]		! store one byte
416	ldub	[%o1+5], %o3		! load second byte
417	deccc	%o2
418	bz,pt	%ncc, .smallexit
419	stb	%o3, [%o0+5]		! store second byte
420	ldub	[%o1+6], %o3		! load third byte
421	stb	%o3, [%o0+6]		! store third byte
422.smallexit:
423	retl
424	mov	%g1, %o0		! restore %o0
425	.align 16
426.medium:
427	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
428	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
429	neg	%o0, %o5
430	neg	%o1, %o3
431	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
432	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
433
434	bz	%ncc, 2f
435	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
436				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned
437
438	sub	%o2, %o5, %o2	! update count
439
4401:
441	ldub	[%o1], %o4
442	deccc	%o5
443	inc	%o1
444	stb	%o4, [%o0]
445	bgu,pt	%ncc, 1b
446	inc	%o0
447
448	! Now DST is 8-byte aligned.  o0, o1, o2 are current.
449
4502:
451	andcc	%o1, 0x3, %g0		! test alignment
452	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
453	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
454					! if src, dst not aligned
455	prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write
456
457/*
458 * Handle all cases where src and dest are aligned on word
459 * or long word boundaries.  Use unrolled loops for better
460 * performance.  This option wins over standard large data
461 * move when source and destination is in cache for medium
462 * to short data moves.
463 */
464	andcc	%o1, 0x7, %g0		! test word alignment
465	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
466	bz,pt	%ncc, .medlword		! branch to long word aligned case
467	prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
468	cmp	%o2, ALIGNED4_FPCOPY_THRESHOLD	! limit to store buffer size
469	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
470	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
471	subcc	%o2, 15, %o2		! adjust length to allow cc test
472	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
473					! for end of loop
474	ble,pt	%ncc, .medw15		! skip big loop if less than 16
475	  .empty
476.medw16:
477	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
478	ld	[%o1], %o4		! load
479	subcc	%o2, 16, %o2		! decrement length count
480	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
481	stw	%o4, [%o0]		! and store
482	ld	[%o1+4], %o3		! a block of 16 bytes
483	add	%o1, 16, %o1		! increase src ptr by 16
484	stw	%o3, [%o0+4]
485	ld	[%o1-8], %o4
486	add	%o0, 16, %o0		! increase dst ptr by 16
487	stw	%o4, [%o0-8]
488	ld	[%o1-4], %o3
489	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
490	stw	%o3, [%o0-4]
491.medw15:
492	addcc	%o2, 15, %o2		! restore count
493	bz,pt	%ncc, .medwexit		! exit if finished
494	nop
495	cmp	%o2, 8
496	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
497	nop				!
498	ld	[%o1], %o4		! load 4 bytes
499	subcc	%o2, 8, %o2		! decrease count by 8
500	stw	%o4, [%o0]		! and store 4 bytes
501	add	%o1, 8, %o1		! increase src ptr by 8
502	ld	[%o1-4], %o3		! load 4 bytes
503	add	%o0, 8, %o0		! increase dst ptr by 8
504	stw	%o3, [%o0-4]		! and store 4 bytes
505	bz	%ncc, .medwexit		! exit if finished
506	nop
507.medw7:					! count is ge 1, less than 8
508	cmp	%o2, 3			! check for 4 bytes left
509	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
510	nop				!
511	ld	[%o1], %o4		! load 4 bytes
512	sub	%o2, 4, %o2		! decrease count by 4
513	add	%o1, 4, %o1		! increase src ptr by 4
514	stw	%o4, [%o0]		! and store 4 bytes
515	add	%o0, 4, %o0		! increase dst ptr by 4
516	tst	%o2			! check for zero bytes left
517	bz	%ncc, .medwexit		! exit if finished
518	nop
519.medw3:					! count is known to be 1, 2, or 3
520	deccc	%o2			! reduce count by one
521	ldub	[%o1], %o3		! load one byte
522	bz,pt	%ncc, .medwexit		! exit if last byte
523	stb	%o3, [%o0]		! store one byte
524	ldub	[%o1+1], %o3		! load second byte
525	deccc	%o2			! reduce count by one
526	bz,pt	%ncc, .medwexit		! exit if last byte
527	stb	%o3, [%o0+1]		! store second byte
528	ldub	[%o1+2], %o3		! load third byte
529	stb	%o3, [%o0+2]		! store third byte
530.medwexit:
531	retl
532	mov	%g1, %o0		! restore %o0
533
534/*
535 * Special case for handling when src and dest are both long word aligned
536 * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
537 * bytes.
538 */
539
540	.align 16
541	nop
542.medlword:				! long word aligned
543					! length > ALIGNED8_FPCOPY_THRESHOLD
544	cmp	%o2, ALIGNED8_FPCOPY_THRESHOLD
545	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
546	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
547	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
548	subcc	%o2, 31, %o2		! adjust length to allow cc test
549					! for end of loop
550	ble,pt	%ncc, .medl31		! skip big loop if less than 32
551	  .empty
552.medl32:
553	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
554	ldx	[%o1], %o4		! load
555	subcc	%o2, 32, %o2		! decrement length count
556	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_read
557	stx	%o4, [%o0]		! and store
558	ldx	[%o1+8], %o3		! a block of 32 bytes
559	add	%o1, 32, %o1		! increase src ptr by 32
560	stx	%o3, [%o0+8]
561	ldx	[%o1-16], %o4
562	add	%o0, 32, %o0		! increase dst ptr by 32
563	stx	%o4, [%o0-16]
564	ldx	[%o1-8], %o3
565	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
566	stx	%o3, [%o0-8]
567.medl31:
568	addcc	%o2, 16, %o2		! adjust remaining count
569	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
570	nop				!
571	ldx	[%o1], %o4		! load and store 16 bytes
572	add	%o1, 16, %o1		! increase src ptr by 16
573	stx	%o4, [%o0]		!
574	sub	%o2, 16, %o2		! decrease count by 16
575	ldx	[%o1-8], %o3		!
576	add	%o0, 16, %o0		! increase dst ptr by 16
577	stx	%o3, [%o0-8]
578.medl15:
579	addcc	%o2, 15, %o2		! restore count
580	bz,pt	%ncc, .medwexit		! exit if finished
581	nop
582	cmp	%o2, 8
583	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
584	nop
585	ldx	[%o1], %o4		! load 8 bytes
586	add	%o1, 8, %o1		! increase src ptr by 8
587	stx	%o4, [%o0]		! and store 8 bytes
588	subcc	%o2, 8, %o2		! decrease count by 8
589	bz	%ncc, .medwexit		! exit if finished
590	add	%o0, 8, %o0		! increase dst ptr by 8
591	ba	.medw7
592	nop
593
594	.align 16
595	nop
596	nop
597	nop
598.mediumsetup:
599	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
600	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
601.mediumrejoin:
602	rd	%fprs, %o4		! check for unused FPU
603
604	add	%o1, 8, %o1		! prepare to round SRC upward
605
606	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
607	or	%o5, 0x67f, %o5
608
609	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
610	bz,a	%ncc, 3f
611	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
6123:
613	cmp	%o2, MEDIUM_MAX
614	bmask	%o5, %g0, %g0
615
616	! Compute o5 (number of bytes that need copying using the main loop).
617	! First, compute for the medium case.
618	! Then, if large case, o5 is replaced by count for block alignment.
619	! Be careful not to read past end of SRC
620	! Currently, o2 is the actual count remaining
621	!            o3 is how much sooner we'll cross the alignment boundary
622	!                in SRC compared to in DST
623	!
624	! Examples:  Let # denote bytes that should not be accessed
625	!            Let x denote a byte already copied to align DST
626	!            Let . and - denote bytes not yet copied
627	!            Let | denote double alignment boundaries
628	!
629	!            DST:  ######xx|........|--------|..######   o2 = 18
630	!                          o0
631	!
632	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
633	!                          o1
634	!
635	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
636	!                                   o1
637	!
638	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
639	!                                   o1
640
641	or	%g0, -8, %o5
642	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1
643
644	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
645	add	%o5, %o2, %o5
646	add	%o5, %o3, %o5
647
648	bleu	%ncc, 4f
649	andn	%o5, 7, %o5		! 8 byte aligned count
650	neg	%o0, %o5		! 'large' case
651	and	%o5, BLOCK_SIZE-1, %o5  ! bytes till DST block aligned
6524:
653	brgez,a	%o3, .beginmedloop
654	ldd	[%o1-8], %d0
655
656	add	%o1, %o3, %o1		! back up o1
6575:
658	ldda	[%o1]ASI_FL8_P, %d2
659	inc	%o1
660	andcc	%o1, 7, %g0
661	bnz	%ncc, 5b
662	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
663
664.beginmedloop:
665	tst	%o5
666	bz	%ncc, .endmedloop
667	sub	%o2, %o5, %o2		! update count for later
668
669	! Main loop to write out doubles.  Note: o5 & 7 == 0
670
671	ldd	[%o1], %d2
672	subcc	%o5, 8, %o5		! update local count
673	bz,pn	%ncc, 1f
674	add	%o1, 8, %o1		! update SRC
675
676.medloop:
677	faligndata %d0, %d2, %d4
678	ldd	[%o1], %d0
679	subcc	%o5, 8, %o5		! update local count
680	add	%o1, 16, %o1		! update SRC
681	std	%d4, [%o0]
682	bz,pn	%ncc, 2f
683	faligndata %d2, %d0, %d6
684	ldd	[%o1 - 8], %d2
685	subcc	%o5, 8, %o5		! update local count
686	std	%d6, [%o0 + 8]
687	bnz,pt	%ncc, .medloop
688	add	%o0, 16, %o0		! update DST
689
6901:
691	faligndata %d0, %d2, %d4
692	fmovd	%d2, %d0
693	std	%d4, [%o0]
694	ba	.endmedloop
695	add	%o0, 8, %o0
696
6972:
698	std	%d6, [%o0 + 8]
699	sub	%o1, 8, %o1
700	add	%o0, 16, %o0
701
702
703.endmedloop:
704	! Currently, o1 is pointing to the next double-aligned byte in SRC
705	! The 8 bytes starting at [o1-8] are available in d0
706	! At least one, and possibly all, of these need to be written.
707
708	cmp	%o2, BLOCK_SIZE
709	bgu	%ncc, .large		! otherwise, less than 16 bytes left
710
711#if 0
712
713	/* This code will use partial stores.  */
714
715	mov	%g0, %o5
716	and	%o3, 7, %o3		! Number of bytes needed to completely
717					! fill %d0 with good (unwritten) data.
718
719	subcc	%o2, 8, %o2		! update count (maybe too much)
720	movl	%ncc, %o2, %o5
721	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
722	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)
723
724	bz	%ncc, 2f
725	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
726
7271:
728	deccc	%o5
729	ldda	[%o1]ASI_FL8_P, %d2
730	inc	%o1
731	bgu	%ncc, 1b
732	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
733
7342:
735	not	%o3
736	faligndata %d0, %d0, %d0	! shift bytes to the left
737	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
738	edge8n	%g0, %o3, %o5
739	stda	%d0, [%o0]%o5, ASI_PST8_P
740	brlez	%o2, .mediumexit
741	add	%o0, %o3, %o0		! update DST to last stored byte
7423:
743	inc	%o0
744	deccc	%o2
745	ldub	[%o1], %o3
746	stb	%o3, [%o0]
747	bgu	%ncc, 3b
748	inc	%o1
749
750#else
751
752	andcc	%o3, 7, %o5		! Number of bytes needed to completely
753					! fill %d0 with good (unwritten) data.
754	bz	%ncc, 2f
755	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
756	cmp	%o2, 8
757	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
758	add	%o1, %o3, %o1 		! Back up %o1
759
7601:
761	deccc	%o5
762	ldda	[%o1]ASI_FL8_P, %d2
763	inc	%o1
764	bgu	%ncc, 1b
765	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
766
7672:
768	subcc	%o2, 8, %o2
769	std	%d0, [%o0]
770	bz	%ncc, .mediumexit
771	add	%o0, 8, %o0
7723:
773	ldub	[%o1], %o3
774	deccc	%o2
775	inc	%o1
776	stb	%o3, [%o0]
777	bgu	%ncc, 3b
778	inc	%o0
779#endif
780
781.mediumexit:
782        wr	%o4, %g0, %fprs		! fprs = o4   restore fprs
783	retl
784        mov	%g1, %o0
785
786
787	.align ICACHE_LINE_SIZE
788.large:
789
790	! %o0 I/O DST is 64-byte aligned
791	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
792	! %d0 I/O already loaded with SRC data from [%o1-8]
793	! %o2 I/O count (number of bytes that need to be written)
794	! %o3 I   Not written.  If zero, then SRC is double aligned.
795	! %o4 I   Not written.  Holds fprs.
796	! %o5   O The number of doubles that remain to be written.
797
798	! Load the rest of the current block
799	! Recall that %o1 is further into SRC than %o0 is into DST
800
801	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
802	prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read
803
804	set	BST_THRESHOLD, %o5
805	cmp	%o2, %o5
806	bgu,pn	%icc, .xlarge
807	prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read
808
809	ldd	[%o1], %f2
810	ldd	[%o1 + 0x8], %f4
811	faligndata %f0, %f2, %f32
812	ldd	[%o1 + 0x10], %f6
813	faligndata %f2, %f4, %f34
814	ldd	[%o1 + 0x18], %f8
815	faligndata %f4, %f6, %f36
816	ldd	[%o1 + 0x20], %f10
817	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
818	faligndata %f6, %f8, %f38
819	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
820	ldd	[%o1 + 0x28], %f12
821	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed lter)
822	faligndata %f8, %f10, %f40
823	ldd	[%o1 + 0x30], %f14
824	faligndata %f10, %f12, %f42
825	ldd	[%o1 + 0x38], %f0
826	sub	%o2, BLOCK_SIZE, %o2	! update count
827	add	%o1, BLOCK_SIZE, %o1	! update SRC
828
829	! Main loop.  Write previous block.  Load rest of current block.
830	! Some bytes will be loaded that won't yet be written.
8311:
832	ldd	[%o1], %f2
833	faligndata %f12, %f14, %f44
834	ldd	[%o1 + 0x8], %f4
835	faligndata %f14, %f0, %f46
836	std	%f32, [%o0]
837	std	%f34, [%o0+8]
838	std	%f36, [%o0+16]
839	std	%f38, [%o0+24]
840	std	%f40, [%o0+32]
841	std	%f42, [%o0+40]
842	std	%f44, [%o0+48]
843	std	%f46, [%o0+56]
844	sub	%o2, BLOCK_SIZE, %o2		! update count
845	prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
846	add	%o0, BLOCK_SIZE, %o0		! update DST
847	ldd	[%o1 + 0x10], %f6
848	faligndata %f0, %f2, %f32
849	ldd	[%o1 + 0x18], %f8
850	faligndata %f2, %f4, %f34
851	ldd	[%o1 + 0x20], %f10
852	faligndata %f4, %f6, %f36
853	ldd	[%o1 + 0x28], %f12
854	faligndata %f6, %f8, %f38
855	ldd	[%o1 + 0x30], %f14
856	faligndata %f8, %f10, %f40
857	ldd	[%o1 + 0x38], %f0
858	faligndata %f10, %f12, %f42
859	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
860	cmp	%o2, BLOCK_SIZE + 8
861	prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
862	bgu,pt	%ncc, 1b
863	add	%o1, BLOCK_SIZE, %o1	! update SRC
864	faligndata %f12, %f14, %f44
865	faligndata %f14, %f0, %f46
866	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
867	cmp	%o2, BLOCK_SIZE
868	bne	%ncc, 2f		! exactly 1 block remaining?
869	add	%o0, BLOCK_SIZE, %o0	! update DST
870	brz,a	%o3, 3f			! is SRC double aligned?
871	ldd	[%o1], %f2
872
8732:
874	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
875	add	%o5, %o3, %o5
876
877	membar	#StoreLoad|#StoreStore
878
879	ba	.beginmedloop
880	andn	%o5, 7, %o5		! 8 byte aligned count
881
882
883	! This is when there is exactly 1 block remaining and SRC is aligned
8843:
885	ldd	[%o1 + 0x8], %f4
886	ldd	[%o1 + 0x10], %f6
887	fsrc1	%f0, %f32
888	ldd	[%o1 + 0x18], %f8
889	fsrc1	%f2, %f34
890	ldd	[%o1 + 0x20], %f10
891	fsrc1	%f4, %f36
892	ldd	[%o1 + 0x28], %f12
893	fsrc1	%f6, %f38
894	ldd	[%o1 + 0x30], %f14
895	fsrc1	%f8, %f40
896	fsrc1	%f10, %f42
897	fsrc1	%f12, %f44
898	fsrc1	%f14, %f46
899	stda	%f32, [%o0]ASI_BLK_P
900	membar	#StoreLoad|#StoreStore
901	wr	%o4, 0, %fprs
902	retl
903	mov	%g1, %o0
904
905
906	.align 16
907	! two nops here causes loop starting at 1f below to be
908	! on a cache line boundary, improving performance
909	nop
910	nop
911.xlarge:
912	! %o0 I/O DST is 64-byte aligned
913	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
914	! %d0 I/O already loaded with SRC data from [%o1-8]
915	! %o2 I/O count (number of bytes that need to be written)
916	! %o3 I   Not written.  If zero, then SRC is double aligned.
917	! %o4 I   Not written.  Holds fprs.
918	! %o5   O The number of doubles that remain to be written.
919
920	! Load the rest of the current block
921	! Recall that %o1 is further into SRC than %o0 is into DST
922
923	ldd	[%o1], %f2
924	ldd	[%o1 + 0x8], %f4
925	faligndata %f0, %f2, %f32
926	ldd	[%o1 + 0x10], %f6
927	faligndata %f2, %f4, %f34
928	ldd	[%o1 + 0x18], %f8
929	faligndata %f4, %f6, %f36
930	ldd	[%o1 + 0x20], %f10
931	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
932	faligndata %f6, %f8, %f38
933	ldd	[%o1 + 0x28], %f12
934	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
935	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
936	faligndata %f8, %f10, %f40
937	ldd	[%o1 + 0x30], %f14
938	faligndata %f10, %f12, %f42
939	ldd	[%o1 + 0x38], %f0
940	prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
941	sub	%o2, BLOCK_SIZE, %o2	! update count
942	add	%o1, BLOCK_SIZE, %o1	! update SRC
943
944	! This point is 32-byte aligned since 24 instructions appear since
945	! the previous alignment directive.
946
947
948	! Main loop.  Write previous block.  Load rest of current block.
949	! Some bytes will be loaded that won't yet be written.
9501:
951	ldd	[%o1], %f2
952	faligndata %f12, %f14, %f44
953	ldd	[%o1 + 0x8], %f4
954	faligndata %f14, %f0, %f46
955	stda	%f32, [%o0]ASI_BLK_P
956	sub	%o2, BLOCK_SIZE, %o2		! update count
957	ldd	[%o1 + 0x10], %f6
958	faligndata %f0, %f2, %f32
959	ldd	[%o1 + 0x18], %f8
960	faligndata %f2, %f4, %f34
961	ldd	[%o1 + 0x20], %f10
962	faligndata %f4, %f6, %f36
963	ldd	[%o1 + 0x28], %f12
964	faligndata %f6, %f8, %f38
965	ldd	[%o1 + 0x30], %f14
966	prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
967	faligndata %f8, %f10, %f40
968	ldd	[%o1 + 0x38], %f0
969	faligndata %f10, %f12, %f42
970	prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
971	add	%o0, BLOCK_SIZE, %o0		! update DST
972	cmp	%o2, BLOCK_SIZE + 8
973	! second prefetch important to correct for occasional dropped
974	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
975	bgu,pt	%ncc, 1b
976	add	%o1, BLOCK_SIZE, %o1		! update SRC
977
978	faligndata %f12, %f14, %f44
979	faligndata %f14, %f0, %f46
980	stda	%f32, [%o0]ASI_BLK_P		! store 64 bytes, bypass cache
981	cmp	%o2, BLOCK_SIZE
982	bne	%ncc, 2f		! exactly 1 block remaining?
983	add	%o0, BLOCK_SIZE, %o0	! update DST
984	brz,a	%o3, 3f			! is SRC double aligned?
985	ldd	[%o1], %f2
986
9872:
988	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
989	add	%o5, %o3, %o5
990
991	membar	#StoreLoad|#StoreStore
992
993	ba	.beginmedloop
994	andn	%o5, 7, %o5		! 8 byte aligned count
995
996
997	! This is when there is exactly 1 block remaining and SRC is aligned
9983:
999	ldd	[%o1 + 0x8], %f4
1000	ldd	[%o1 + 0x10], %f6
1001	fsrc1	%f0, %f32
1002	ldd	[%o1 + 0x18], %f8
1003	fsrc1	%f2, %f34
1004	ldd	[%o1 + 0x20], %f10
1005	fsrc1	%f4, %f36
1006	ldd	[%o1 + 0x28], %f12
1007	fsrc1	%f6, %f38
1008	ldd	[%o1 + 0x30], %f14
1009	fsrc1	%f8, %f40
1010	fsrc1	%f10, %f42
1011	fsrc1	%f12, %f44
1012	fsrc1	%f14, %f46
1013	stda	%f32, [%o0]ASI_BLK_P
1014	membar	#StoreLoad|#StoreStore
1015	wr	%o4, 0, %fprs
1016	retl
1017	mov	%g1, %o0
1018
1019	SET_SIZE(memcpy)
1020