1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25
26#include <sys/param.h>
27#include <sys/errno.h>
28#include <sys/asm_linkage.h>
29#include <sys/vtrace.h>
30#include <sys/machthread.h>
31#include <sys/clock.h>
32#include <sys/asi.h>
33#include <sys/fsr.h>
34#include <sys/privregs.h>
35#include <sys/machasi.h>
36#include <sys/niagaraasi.h>
37
38#if !defined(lint)
39#include "assym.h"
40#endif	/* lint */
41
42
43/*
44 * Pseudo-code to aid in understanding the control flow of the
45 * bcopy/kcopy routine.
46 *
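47 *	! WARNING : <Register usage convention>
48 *	! In kcopy(), %o5 holds the previous error handler and a flag,
49 *	! LOFAULT_SET, in its low bits. In bcopy(), %o5 is null.
50 *	! %o5 is not available for any other use.
 *	! (This matches the NIAGARA_IMPL kcopy. In the !NIAGARA_IMPL code
 *	! the flag sense is reversed: bcopy ORs LOFAULT_SET into %o5 when
 *	! it installs a handler and kcopy leaves the flag clear.)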
51 *
52 * On entry:
53 *	! Determine whether to use the FP register version or
54 *	! the leaf routine version depending on the size of the copy.
55 *	! Set up error handling accordingly.
56 *	! The transition point depends on FP_COPY
57 *	! For both versions %o5 is reserved
58 *
59 * kcopy():
60 *	if(length > FP_COPY)
61 *		go to regular_kcopy
62 *
63 *	! Setup_leaf_rtn_error_handler
64 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
65 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
66 *	curthread->t_lofault = .sm_copyerr;
67 *	goto small_bcopy();
68 *
69 * regular_kcopy:
70 *	save_registers()
71 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
72 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
73 *	curthread->t_lofault = .copyerr;
74 *	goto do_copy();
75 *
76 * bcopy():
77 *	if(length > FP_COPY)
78 *		go to regular_bcopy
79 *
80 *	! Setup_leaf_rtn_error_handler
81 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
82 *	curthread->t_lofault = .sm_copyerr;
83 *	goto small_bcopy();
84 *
85 * regular_bcopy:
86 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
87 *	curthread->t_lofault = .copyerr;
88 *	goto do_copy();
89 *
90 * small_bcopy:
91 *	! handle copies smaller than FP_COPY
92 *	restore t_lofault handler
93 *	exit
94 *
95 * do_copy:
96 *	! handle copies larger than FP_COPY
97 *	save fp_regs
98 * 	blockcopy;
99 *	restore fp_regs
100 *	restore t_lofault handler if came from kcopy();
101 *
102 *
103 * In leaf lofault handler:
104 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
105 *	return (errno)
106 *
107 * In lofault handler:
108 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
109 *	restore fp_regs
110 *	return (errno)
111 *
112 *
113 *
114 * For all of bcopy/copyin/copyout the copy logic is specialized according
115 * to how the src and dst is aligned and how much data needs to be moved.
116 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
117 *
118 * N2/RF Flow :
119 *
120 * if (count <= FP_COPY) {  (584 bytes)
121 *   set small fault handler (no register window save/restore)
122 *   if count <= SHORTCOPY  (7 bytes)
123 *	copy bytes; go to short_exit
124 *   else
125 *   determine dst alignment, move minimum bytes/halfwords to
126 *   get dst aligned on long word boundary
127 *     if( src is on long word boundary ) {
128 * medlong:					   src/dst aligned on 8 bytes
129 *	 copy with ldx/stx in 4-way unrolled loop;
130 *       copy final 0-31 bytes; go to short_exit
131 *     } else {					src/dst not aligned on 8 bytes
132 *     if src is word aligned, ld/st words in 32-byte chunks
133 *     if src is half word aligned, ld half, ld word, ld half; pack
134 *		into long word, store long words in 32-byte chunks
135 *     if src is byte aligned, ld byte,half,word parts;  pack into long
136 *	   word, store long words in 32-byte chunks
137 *     move final 0-31 bytes according to src alignment;  go to short_exit
138 * short_exit:
139 *     restore trap handler if needed, retl
140 * else {					   More than FP_COPY bytes
141 *     set fault handler
142 *     disable kernel preemption
143 *     save registers, save FP registers if in use
144 *     move bytes to align destination register on long word boundary
145 *     if(src is on long word boundary) {	   src/dst aligned on 8 bytes
146 *       align dst on 64 byte boundary;  use 8-way test for each of 8 possible
147 *       src alignments relative to a 64 byte boundary to select the
148 *       16-way unrolled loop (128 bytes) to use for
149 *       block load, fmovd, block-init-store, block-store, fmovd operations
150 *       then go to remain_stuff.
151 * remain_stuff: move remaining bytes. go to long_exit
152 *     } else {
153 *       setup alignaddr for faligndata instructions
154 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
155 *       src alignments to nearest long word relative to 64 byte boundary to
156 *       select the 8-way unrolled loop (64 bytes) to use for
157 *       block load, falign, fmovd, block-store loop
158 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
159 *       goto unalign_done.
160 * unalign_done:
161 *       move remaining bytes for unaligned cases. go to long_exit
162 * long_exit:
163 *       restore %gsr, FP regs (either from stack or set to zero),
164 *       restore trap handler, check for kernel preemption request,
165 *       handle if needed, ret.
166 * }
167 *
168 * Other platforms include hw_bcopy_limit_[1248] to control the exact
169 * point where the FP register code is used. On those platforms, the
170 * FP register code did not leave data in L2 cache, potentially affecting
171 * performance more than the gain/loss from the algorithm difference.
172 * For N2/RF, block store places data in the L2 cache, so use or non-use
173 * of the FP registers has no effect on L2 cache behavior.
174 * The cost for testing hw_bcopy_limit_* according to different
175 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
176 * were not used. That cost was judged too high relative to the benefits,
177 * so the hw_bcopy_limit option is omitted from this code.
178 */
179
180/*
181 * Less than or equal to this number of bytes we will always copy byte-for-byte
182 */
183#define	SMALL_LIMIT	7
184
185/*
186 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
187 * handler was set
188 */
189#define	LOFAULT_SET 2
190
191/*
192 * This define is to align data for the unaligned source cases.
193 * The data1, data2 and data3 are merged into data1 and data2.
194 * The data3 is preserved for next merge.
 *
 * Result:
 *	data1 = (data1 << lshift) | (data2 >> rshift)
 *	data2 = (data2 << lshift) | (data3 >> rshift)
 *
 * All shifts are 64-bit (sllx/srlx).  tmp is a scratch register and is
 * clobbered.
 * NOTE(review): callers presumably pass rshift == 64 - lshift -- confirm.
195 */
196#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
197	sllx	data1, lshift, data1				;\
198	srlx	data2, rshift, tmp				;\
199	or	data1, tmp, data1				;\
200	sllx	data2, lshift, data2				;\
201	srlx	data3, rshift, tmp				;\
202	or	data2, tmp, data2
203/*
204 * This macro is to align the data. Basically it merges
205 * data1 and data2 to form double word.
 *
 * Result:
 *	data1 = (data1 << lshift) | (data2 >> rshift)
 *
 * data2 is left unchanged; tmp is a scratch register and is clobbered.
206 */
207#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
208	sllx	data1, lshift, data1				;\
209	srlx	data2, rshift, tmp				;\
210	or	data1, tmp, data1
211
212#if !defined(NIAGARA_IMPL)
213/*
214 * Flags set in the lower bits of the t_lofault address:
215 * FPUSED_FLAG: The FP registers were in use and must be restored
216 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
217 * COPY_FLAGS: Both of the above
218 *
219 * Other flags:
220 * KPREEMPT_FLAG: kpreempt needs to be called
221 */
222#define	FPUSED_FLAG	1
223#define	LOFAULT_SET	2
224#define	COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)
225#define	KPREEMPT_FLAG	4
226
/*
 * ALIGN_OFF_m_n: one faligndata pass over a sliding window of source
 * double registers.  Each macro issues eight faligndata instructions and
 * always writes %d48 through %d62.  The first source pair is (%d0, %d2)
 * for ALIGN_OFF_1_7 and moves up by one double register (8 bytes) for
 * each successive macro, so ALIGN_OFF_56_63 reads %d14 through %d30.
 * The macros take no arguments.
 * NOTE(review): the m_n suffix looks like the source byte offset within a
 * 64-byte block that selects the macro -- confirm against the callers.
 */
227#define	ALIGN_OFF_1_7			\
228	faligndata %d0, %d2, %d48	;\
229	faligndata %d2, %d4, %d50	;\
230	faligndata %d4, %d6, %d52	;\
231	faligndata %d6, %d8, %d54	;\
232	faligndata %d8, %d10, %d56	;\
233	faligndata %d10, %d12, %d58	;\
234	faligndata %d12, %d14, %d60	;\
235	faligndata %d14, %d16, %d62
236
237#define	ALIGN_OFF_8_15			\
238	faligndata %d2, %d4, %d48	;\
239	faligndata %d4, %d6, %d50	;\
240	faligndata %d6, %d8, %d52	;\
241	faligndata %d8, %d10, %d54	;\
242	faligndata %d10, %d12, %d56	;\
243	faligndata %d12, %d14, %d58	;\
244	faligndata %d14, %d16, %d60	;\
245	faligndata %d16, %d18, %d62
246
247#define	ALIGN_OFF_16_23			\
248	faligndata %d4, %d6, %d48	;\
249	faligndata %d6, %d8, %d50	;\
250	faligndata %d8, %d10, %d52	;\
251	faligndata %d10, %d12, %d54	;\
252	faligndata %d12, %d14, %d56	;\
253	faligndata %d14, %d16, %d58	;\
254	faligndata %d16, %d18, %d60	;\
255	faligndata %d18, %d20, %d62
256
257#define	ALIGN_OFF_24_31			\
258	faligndata %d6, %d8, %d48	;\
259	faligndata %d8, %d10, %d50	;\
260	faligndata %d10, %d12, %d52	;\
261	faligndata %d12, %d14, %d54	;\
262	faligndata %d14, %d16, %d56	;\
263	faligndata %d16, %d18, %d58	;\
264	faligndata %d18, %d20, %d60	;\
265	faligndata %d20, %d22, %d62
266
267#define	ALIGN_OFF_32_39			\
268	faligndata %d8, %d10, %d48	;\
269	faligndata %d10, %d12, %d50	;\
270	faligndata %d12, %d14, %d52	;\
271	faligndata %d14, %d16, %d54	;\
272	faligndata %d16, %d18, %d56	;\
273	faligndata %d18, %d20, %d58	;\
274	faligndata %d20, %d22, %d60	;\
275	faligndata %d22, %d24, %d62
276
277#define	ALIGN_OFF_40_47			\
278	faligndata %d10, %d12, %d48	;\
279	faligndata %d12, %d14, %d50	;\
280	faligndata %d14, %d16, %d52	;\
281	faligndata %d16, %d18, %d54	;\
282	faligndata %d18, %d20, %d56	;\
283	faligndata %d20, %d22, %d58	;\
284	faligndata %d22, %d24, %d60	;\
285	faligndata %d24, %d26, %d62
286
287#define	ALIGN_OFF_48_55			\
288	faligndata %d12, %d14, %d48	;\
289	faligndata %d14, %d16, %d50	;\
290	faligndata %d16, %d18, %d52	;\
291	faligndata %d18, %d20, %d54	;\
292	faligndata %d20, %d22, %d56	;\
293	faligndata %d22, %d24, %d58	;\
294	faligndata %d24, %d26, %d60	;\
295	faligndata %d26, %d28, %d62
296
297#define	ALIGN_OFF_56_63			\
298	faligndata %d14, %d16, %d48	;\
299	faligndata %d16, %d18, %d50	;\
300	faligndata %d18, %d20, %d52	;\
301	faligndata %d20, %d22, %d54	;\
302	faligndata %d22, %d24, %d56	;\
303	faligndata %d24, %d26, %d58	;\
304	faligndata %d26, %d28, %d60	;\
305	faligndata %d28, %d30, %d62
306
307/*
308 * FP_COPY indicates the minimum number of bytes needed
309 * to justify using FP/VIS-accelerated memory operations.
310 * The FPBLK code assumes a minimum number of bytes are available
311 * to be moved on entry.  Check that code carefully before
312 * reducing FP_COPY below 256.
313 */
314#define FP_COPY			584
315#define SHORTCOPY		7
316#define ASI_STBI_P		ASI_BLK_INIT_ST_QUAD_LDD_P
317#define ASI_STBI_AIUS		ASI_BLK_INIT_QUAD_LDD_AIUS
318#define CACHE_LINE		64
319#define	VIS_BLOCKSIZE		64
320
321/*
322 * Size of stack frame in order to accommodate a 64-byte aligned
323 * floating-point register save area and 2 64-bit temp locations.
324 * All copy functions use three quadrants of fp registers; to assure a
325 * block-aligned three block buffer in which to save we must reserve
326 * four blocks on stack.
327 *
328 *    _______________________________________ <-- %fp + STACK_BIAS
329 *    | We may need to preserve 3 quadrants |
330 *    | of fp regs, but since we do so with |
331 *    | BST/BLD we need room in which to    |
332 *    | align to VIS_BLOCKSIZE bytes.  So   |
333 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
334 *    |-------------------------------------|
335 *    | 8 bytes to save %fprs		    | <--  - SAVED_FPRS_OFFSET
336 *    |-------------------------------------|
337 *    | 8 bytes to save %gsr		    | <--  - SAVED_GSR_OFFSET
338 *    ---------------------------------------
339 */
340#define HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
341#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)
342#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)
343#define SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
344#define SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
345
346/*
347 * In FP copies if we do not have preserved data to restore over
348 * the fp regs we used then we must zero those regs to avoid
349 * exposing portions of the data to later threads (data security).
 *
 * FZERO clears %f0 and %f2 with fzero and then derives every other zero
 * from those two registers (0 + 0 via faddd, 0 * 0 via fmuld), covering
 * %f4-%f30 and %f48-%f62.  These are the same three register groups that
 * BST_FP_TOSTACK/BLD_FP_FROMSTACK save and restore (the blocks starting
 * at %f0, %f16 and %f48); %f32-%f46 are not touched.
 * The macro takes no arguments.
350 */
351#define	FZERO				\
352	fzero	%f0			;\
353	fzero	%f2			;\
354	faddd	%f0, %f2, %f4		;\
355	fmuld	%f0, %f2, %f6		;\
356	faddd	%f0, %f2, %f8		;\
357	fmuld	%f0, %f2, %f10		;\
358	faddd	%f0, %f2, %f12		;\
359	fmuld	%f0, %f2, %f14		;\
360	faddd	%f0, %f2, %f16		;\
361	fmuld	%f0, %f2, %f18		;\
362	faddd	%f0, %f2, %f20		;\
363	fmuld	%f0, %f2, %f22		;\
364	faddd	%f0, %f2, %f24		;\
365	fmuld	%f0, %f2, %f26		;\
366	faddd	%f0, %f2, %f28		;\
367	fmuld	%f0, %f2, %f30		;\
368	faddd	%f0, %f2, %f48		;\
369	fmuld	%f0, %f2, %f50		;\
370	faddd	%f0, %f2, %f52		;\
371	fmuld	%f0, %f2, %f54		;\
372	faddd	%f0, %f2, %f56		;\
373	fmuld	%f0, %f2, %f58		;\
374	faddd	%f0, %f2, %f60		;\
375	fmuld	%f0, %f2, %f62
376
377#if !defined(lint)
378
379/*
380 * Macros to save and restore fp registers to/from the stack.
381 * Used to save and restore in-use fp registers when we want to use FP.
 *
 * Both macros compute the same block-aligned save area address in tmp1:
 *	tmp1 = (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -VIS_BLOCKSIZE
 * and then move three register blocks, starting at %f0, %f16 and %f48,
 * to (BST_FP_TOSTACK) or from (BLD_FP_FROMSTACK) tmp1,
 * tmp1 + VIS_BLOCKSIZE and tmp1 + 2 * VIS_BLOCKSIZE through ASI_BLK_P.
 *
 * tmp1 is clobbered (it is left pointing at the third block) and each
 * macro ends with a membar #Sync.
382 */
383#define BST_FP_TOSTACK(tmp1)					\
384	/* membar #Sync	*/					;\
385	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
386	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
387	stda	%f0, [tmp1]ASI_BLK_P				;\
388	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
389	stda	%f16, [tmp1]ASI_BLK_P				;\
390	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
391	stda	%f48, [tmp1]ASI_BLK_P				;\
392	membar	#Sync
393
394#define	BLD_FP_FROMSTACK(tmp1)					\
395	/* membar #Sync - provided at copy completion */	;\
396	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
397	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
398	ldda	[tmp1]ASI_BLK_P, %f0				;\
399	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
400	ldda	[tmp1]ASI_BLK_P, %f16				;\
401	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
402	ldda	[tmp1]ASI_BLK_P, %f48				;\
403	membar	#Sync
404#endif	/* lint */
405
406#endif	/* NIAGARA_IMPL */
407/*
408 * Copy a block of storage, returning an error code if `from' or
409 * `to' takes a kernel pagefault which cannot be resolved.
410 * Returns errno value on pagefault error, 0 if all ok
 *
 * Register usage on entry (see the lint prototype below):
 *	%o0 = from, %o1 = to, %o2 = count
 *
 * !NIAGARA_IMPL: copies of at most FP_COPY bytes stay in the caller's
 * register window and share .sm_do_copy with bcopy; larger copies take
 * a new window and go to .do_copy.  In both cases the previous t_lofault
 * value is kept in %o5 with LOFAULT_SET clear, which is how the fault
 * handlers below tell a kcopy fault (return the errno from %g1) from a
 * bcopy fault (jump to the previous handler).
 *
 * NIAGARA_IMPL: always takes a new window, ORs LOFAULT_SET into the
 * saved handler in %o5 and goes to .do_copy.
411 */
412
413#if defined(lint)
414
415/* ARGSUSED */
416int
417kcopy(const void *from, void *to, size_t count)
418{ return(0); }
419
420#else	/* lint */
421
422	.seg	".text"
423	.align	4
424
425	ENTRY(kcopy)
426#if !defined(NIAGARA_IMPL)
427	cmp	%o2, FP_COPY			! check for small copy/leaf case
428	bgt,pt	%ncc, .kcopy_more		! count > FP_COPY: FP copy path
429	nop
430.kcopy_small:					! setup error handler
431	sethi	%hi(.sm_copyerr), %o4
432	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
433	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
434	! Note that we carefully do *not* flag the setting of
435	! t_lofault.
436	membar	#Sync				! sync error barrier
437	b	.sm_do_copy			! common code
438	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
439
440
441.kcopy_more:
	! Large copy: take a new register window with room for the FP
	! register save area (HWCOPYFRAMESIZE).
442	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
443	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
444	or	%l7, %lo(.copyerr), %l7
445	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
446	! Note that we carefully do *not* flag the setting of
447	! t_lofault.
448	membar	#Sync				! sync error barrier
449	b	.do_copy			! common code
450	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
451
452/*
453 * We got here because of a fault during a small kcopy, or during a small
454 * bcopy if a fault handler existed when bcopy was called.
455 * No floating point registers are used by the small copies.
456 * Small copies are from a leaf routine
457 * Errno value is in %g1.
458 */
459.sm_copyerr:
460	! The kcopy will always set a t_lofault handler. If it fires,
461	! we're expected to just return the error code and not to
462	! invoke any existing error handler. As far as bcopy is concerned,
463	! we only set t_lofault if there was an existing lofault handler.
464	! In that case we're expected to invoke the previously existing
465	! handler after resetting the t_lofault value.
466	btst	LOFAULT_SET, %o5		! flag set => came from bcopy
467	membar	#Sync				! sync error barrier
468	andn	%o5, LOFAULT_SET, %o5		! clear fault flag
469	bnz,pn	%ncc, 3f			! tests the btst result above
470	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
471	retl					! kcopy: leaf return
472	mov	%g1, %o0			! return the errno value
4733:
474	! We're here via bcopy. There must have been an error handler
475	! in place otherwise we would have died a nasty death already.
476	jmp	%o5				! goto real handler
477	mov	%g0, %o0			! with %o0 = 0
478/*
479 *  end of .sm_copyerr
480 */
481
482/*
483 * We got here because of a fault during kcopy, or during bcopy if a
484 * fault handler existed when bcopy was called.
485 * stack and fp registers need to be restored
486 * Errno value is in %g1.
 * NOTE(review): this handler reads %l5 (written to %gsr) and %g5 (written
 * to %fprs); presumably .do_copy saved those registers there -- confirm.
487 */
488.copyerr:
	! Point t_lofault at .copyerr2 first, so that a fault while the
	! FP state is being restored ends in the panic below.
489	sethi	%hi(.copyerr2), %l1
490	or	%l1, %lo(.copyerr2), %l1
491	membar	#Sync				! sync error barrier
492	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
493	btst	FPUSED_FLAG, %o5
494	bz,pt	%xcc, 1f		! FP regs not used: nothing to restore
495	and	%o5, LOFAULT_SET, %l1	! copy flag to %l1
496
497	membar	#Sync				! sync error barrier
498	wr	%l5, 0, %gsr		! restore %gsr
499	btst	FPRS_FEF, %g5		! FEF clear: no saved regs, zero them
500	bz,pt	%icc, 4f
501	nop
502	! restore fpregs from stack
503	BLD_FP_FROMSTACK(%o2)
504	ba,pt	%ncc, 2f
505	wr	%g5, 0, %fprs		! restore fprs
5064:
507	FZERO
508	wr	%g5, 0, %fprs		! restore fprs
5092:
	! Threads with an lwp (T_LWP non-null) skip the t_preempt
	! bookkeeping below.
	! NOTE(review): presumably this mirrors .do_copy raising t_preempt
	! only for threads without an lwp -- confirm.
510	ldn	[THREAD_REG + T_LWP], %o2
511	brnz,pt	%o2, 1f
512	nop
513
	! Drop t_preempt by one; the decremented value is stored back in
	! the delay slot whether or not the branch is taken.
514	ldsb	[THREAD_REG + T_PREEMPT], %l0
515	deccc	%l0
516	bnz,pn	%ncc, 1f		! still non-zero: skip the check
517	stb	%l0, [THREAD_REG + T_PREEMPT]
518
519	! Check for a kernel preemption request
520	ldn	[THREAD_REG + T_CPU], %l0
521	ldub	[%l0 + CPU_KPRUNRUN], %l0
522	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
523	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
524
525	! The kcopy will always set a t_lofault handler. If it fires,
526	! we're expected to just return the error code and not to
527	! invoke any existing error handler. As far as bcopy is concerned,
528	! we only set t_lofault if there was an existing lofault handler.
529	! In that case we're expected to invoke the previously existing
530	! handler after resetting the t_lofault value.
5311:
532	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
533	membar	#Sync				! sync error barrier
534	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
535
536	! call kpreempt if necessary
537	btst	KPREEMPT_FLAG, %l1
538	bz,pt	%icc, 2f
539	nop
540	call	kpreempt
541	rdpr	%pil, %o0	! pass %pil
5422:
543	btst	LOFAULT_SET, %l1	! flag set => came from bcopy
544	bnz,pn	%ncc, 3f
545	nop
546	ret				! kcopy: return the errno value
547	restore	%g1, 0, %o0
5483:
549	! We're here via bcopy. There must have been an error handler
550	! in place otherwise we would have died a nasty death already.
551	jmp	%o5				! goto real handler
552	restore	%g0, 0, %o0			! dispose of copy window
553
554/*
555 * We got here because of a fault in .copyerr.  We can't safely restore fp
556 * state, so we panic.
557 */
558fp_panic_msg:
559	.asciz	"Unable to restore fp state after copy operation"
560
561	.align	4
562.copyerr2:
563	set	fp_panic_msg, %o0
564	call	panic
565	nop
566/*
567 *  end of .copyerr
568 */
569
570#else	/* NIAGARA_IMPL */
	! NIAGARA_IMPL kcopy: always take a new window and flag the saved
	! handler with LOFAULT_SET.
571	save	%sp, -SA(MINFRAME), %sp
572	set	.copyerr, %l7			! copyerr is lofault value
573	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
574	or	%o5, LOFAULT_SET, %o5
575	membar	#Sync				! sync error barrier
576	b	.do_copy			! common code
577	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
578
579/*
580 * We got here because of a fault during kcopy.
581 * Errno value is in %g1.
582 */
583.copyerr:
584	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
585	! into %o5 to indicate it has set t_lofault handler. Need to clear
586	! LOFAULT_SET flag before restoring the error handler.
587	andn	%o5, LOFAULT_SET, %o5
588	membar	#Sync				! sync error barrier
589	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
590	ret
591	restore	%g1, 0, %o0			! return the errno value
592#endif	/* NIAGARA_IMPL */
593
594	SET_SIZE(kcopy)
595#endif	/* lint */
596
597
598/*
599 * Copy a block of storage - must not overlap (from + len <= to).
600 */
601#if defined(lint)
602
603/* ARGSUSED */
604void
605bcopy(const void *from, void *to, size_t count)
606{}
607
608#else	/* lint */
609
610	ENTRY(bcopy)
611#if !defined(NIAGARA_IMPL)
612	cmp	%o2, FP_COPY			! check for small copy/leaf case
613	bgt,pt	%ncc, .bcopy_more		!
614	nop
615.bcopy_small:					! setup error handler
616	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
617	tst	%o5
618	bz,pt	%icc, .sm_do_copy
619	sethi	%hi(.sm_copyerr), %o4
620	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
621	membar	#Sync				! sync error barrier
622	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
623	or	%o5, LOFAULT_SET, %o5		! Error should trampoline
624.sm_do_copy:
625	mov	%o0, %g1		! save %o0
626	cmp	%o2, SHORTCOPY		! make sure there is enough to align
627	ble,pt	%ncc, .bc_smallest
628	andcc	%o1, 0x7, %o3		! is dest long aligned
629	bnz,pn	%ncc, .bc_align
630	andcc	%o1, 1, %o3		! is dest byte aligned
631
632! Destination is long word aligned
633.bc_al_src:
634	andcc	%o0, 7, %o3
635	brnz,pt	%o3, .bc_src_dst_unal8
636	nop
637/*
638 * Special case for handling when src and dest are both long word aligned
639 * and total data to move is less than FP_COPY bytes
640 * Also handles finish up for large block moves, so may be less than 32 bytes
641 */
642.bc_medlong:
643	subcc	%o2, 31, %o2		! adjust length to allow cc test
644	ble,pt	%ncc, .bc_medl31
645	nop
646.bc_medl32:
647	ldx	[%o0], %o4		! move 32 bytes
648	subcc	%o2, 32, %o2		! decrement length count by 32
649	stx	%o4, [%o1]
650	ldx	[%o0+8], %o4
651	stx	%o4, [%o1+8]
652	ldx	[%o0+16], %o4
653	add	%o0, 32, %o0		! increase src ptr by 32
654	stx	%o4, [%o1+16]
655	ldx	[%o0-8], %o4
656	add	%o1, 32, %o1		! increase dst ptr by 32
657	bgu,pt	%ncc, .bc_medl32	! repeat if at least 32 bytes left
658	stx	%o4, [%o1-8]
659.bc_medl31:
660	addcc	%o2, 24, %o2		! adjust count to be off by 7
661	ble,pt	%ncc, .bc_medl7		! skip if 7 or fewer bytes left
662	nop
663.bc_medl8:
664	ldx	[%o0], %o4		! move 8 bytes
665	add	%o0, 8, %o0		! increase src ptr by 8
666	subcc	%o2, 8, %o2		! decrease count by 8
667	add	%o1, 8, %o1		! increase dst ptr by 8
668	bgu,pt	%ncc, .bc_medl8
669	stx	%o4, [%o1-8]
670.bc_medl7:
671	addcc	%o2, 7, %o2		! finish adjustment of remaining count
672	bnz,pt	%ncc, .bc_small4	! do final bytes if not finished
673
674.bc_smallx:				! finish up and exit
675	tst	%o5
676	bz,pt	%ncc, .bc_sm_done
677	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
678	membar	#Sync			! sync error barrier
679	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
680.bc_sm_done:
681	retl
682	mov	%g0, %o0
683
684.bc_small4:
685	cmp	%o2, 4
686	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
687	nop				!
688	ld	[%o0], %o4		! move 4 bytes
689	add	%o0, 4, %o0		! increase src ptr by 4
690	add	%o1, 4, %o1		! increase dst ptr by 4
691	subcc	%o2, 4, %o2		! decrease count by 4
692	bz,pt	%ncc, .bc_smallx
693	stw	%o4, [%o1-4]
694
695.bc_small3x:				! Exactly 1, 2, or 3 bytes remain
696	subcc	%o2, 1, %o2		! reduce count for cc test
697	ldub	[%o0], %o4		! load one byte
698	bz,pt	%ncc, .bc_smallx
699	stb	%o4, [%o1]		! store one byte
700	ldub	[%o0+1], %o4		! load second byte
701	subcc	%o2, 1, %o2
702	bz,pt	%ncc, .bc_smallx
703	stb	%o4, [%o1+1]		! store second byte
704	ldub	[%o0+2], %o4		! load third byte
705	ba	.bc_smallx
706	stb	%o4, [%o1+2]		! store third byte
707
708.bc_smallest:				! 7 or fewer bytes remain
709	tst	%o2
710	bz,pt	%ncc, .bc_smallx
711	cmp	%o2, 4
712	blt,pt	%ncc, .bc_small3x
713	nop
714	ldub	[%o0], %o4		! read byte
715	subcc	%o2, 4, %o2		! reduce count by 4
716	stb	%o4, [%o1]		! write byte
717	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
718	add	%o0, 4, %o0		! advance src by 4
719	stb	%o4, [%o1+1]
720	ldub	[%o0-2], %o4
721	add	%o1, 4, %o1		! advance dst by 4
722	stb	%o4, [%o1-2]
723	ldub	[%o0-1], %o4
724	bnz,pt	%ncc, .bc_small3x
725	stb	%o4, [%o1-1]
726	ba	.bc_smallx
727	nop
728
729/*
730 * Align destination to long word boundary
731 */
732.bc_align:				! byte align test in prior branch delay
733	bnz,pt	%ncc, .bc_al_d1
734.bc_al_d1f:				! dest is now half word aligned
735	andcc	%o1, 2, %o3
736	bnz,pt	%ncc, .bc_al_d2
737.bc_al_d2f:				! dest is now word aligned
738	andcc	%o1, 4, %o3		! is dest longword aligned?
739	bz,pt	%ncc, .bc_al_src
740	nop
741.bc_al_d4:				! dest is word aligned;  src is unknown
742	ldub	[%o0], %o4		! move a word (src align unknown)
743	ldub	[%o0+1], %o3
744	sll	%o4, 24, %o4		! position
745	sll	%o3, 16, %o3		! position
746	or	%o4, %o3, %o3		! merge
747	ldub	[%o0+2], %o4
748	sll	%o4, 8, %o4		! position
749	or	%o4, %o3, %o3		! merge
750	ldub	[%o0+3], %o4
751	or	%o4, %o3, %o4		! merge
752	stw	%o4,[%o1]		! store four bytes
753	add	%o0, 4, %o0		! adjust src by 4
754	add	%o1, 4, %o1		! adjust dest by 4
755	sub	%o2, 4, %o2		! adjust count by 4
756	andcc	%o0, 7, %o3		! check for src long word alignment
757	brz,pt	%o3, .bc_medlong
758.bc_src_dst_unal8:
759	! dst is 8-byte aligned, src is not
760	! Size is less than FP_COPY
761	! Following code is to select for alignment
762	andcc	%o0, 0x3, %o3		! test word alignment
763	bz,pt	%ncc, .bc_medword
764	nop
765	andcc	%o0, 0x1, %o3		! test halfword alignment
766	bnz,pt	%ncc, .bc_med_byte	! go to byte move if not halfword
767	andcc	%o0, 0x2, %o3		! test which byte alignment
768	ba	.bc_medhalf
769	nop
770.bc_al_d1:				! align dest to half word
771	ldub	[%o0], %o4		! move a byte
772	add	%o0, 1, %o0
773	stb	%o4, [%o1]
774	add	%o1, 1, %o1
775	andcc	%o1, 2, %o3
776	bz,pt	%ncc, .bc_al_d2f
777	sub	%o2, 1, %o2
778.bc_al_d2:				! align dest to word
779	ldub	[%o0], %o4		! move a half-word (src align unknown)
780	ldub	[%o0+1], %o3
781	sll	%o4, 8, %o4		! position
782	or	%o4, %o3, %o4		! merge
783	sth	%o4, [%o1]
784	add	%o0, 2, %o0
785	add	%o1, 2, %o1
786	andcc	%o1, 4, %o3		! is dest longword aligned?
787	bz,pt	%ncc, .bc_al_src
788	sub	%o2, 2, %o2
789	ba	.bc_al_d4
790	nop
791/*
792 * Handle all cases where src and dest are aligned on word
793 * boundaries. Use unrolled loops for better performance.
794 * This option wins over standard large data move when
795 * source and destination is in cache for medium
796 * to short data moves.
797 */
798.bc_medword:
799	subcc	%o2, 31, %o2		! adjust length to allow cc test
800	ble,pt	%ncc, .bc_medw31
801	nop
802.bc_medw32:
803	ld	[%o0], %o4		! move a block of 32 bytes
804	stw	%o4, [%o1]
805	ld	[%o0+4], %o4
806	stw	%o4, [%o1+4]
807	ld	[%o0+8], %o4
808	stw	%o4, [%o1+8]
809	ld	[%o0+12], %o4
810	stw	%o4, [%o1+12]
811	ld	[%o0+16], %o4
812	stw	%o4, [%o1+16]
813	ld	[%o0+20], %o4
814	subcc	%o2, 32, %o2		! decrement length count
815	stw	%o4, [%o1+20]
816	ld	[%o0+24], %o4
817	add	%o0, 32, %o0		! increase src ptr by 32
818	stw	%o4, [%o1+24]
819	ld	[%o0-4], %o4
820	add	%o1, 32, %o1		! increase dst ptr by 32
821	bgu,pt	%ncc, .bc_medw32	! repeat if at least 32 bytes left
822	stw	%o4, [%o1-4]
823.bc_medw31:
824	addcc	%o2, 24, %o2		! adjust count to be off by 7
825	ble,pt	%ncc, .bc_medw7		! skip if 7 or fewer bytes left
826	nop				!
827.bc_medw15:
828	ld	[%o0], %o4		! move a block of 8 bytes
829	subcc	%o2, 8, %o2		! decrement length count
830	stw	%o4, [%o1]
831	add	%o0, 8, %o0		! increase src ptr by 8
832	ld	[%o0-4], %o4
833	add	%o1, 8, %o1		! increase dst ptr by 8
834	bgu,pt	%ncc, .bc_medw15
835	stw	%o4, [%o1-4]
836.bc_medw7:
837	addcc	%o2, 7, %o2		! finish adjustment of remaining count
838	bz,pt	%ncc, .bc_smallx	! exit if finished
839	cmp	%o2, 4
840	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
841	nop				!
842	ld	[%o0], %o4		! move 4 bytes
843	add	%o0, 4, %o0		! increase src ptr by 4
844	add	%o1, 4, %o1		! increase dst ptr by 4
845	subcc	%o2, 4, %o2		! decrease count by 4
846	bnz	.bc_small3x
847	stw	%o4, [%o1-4]
848	ba	.bc_smallx
849	nop
850
851.bc_medhalf:
852	subcc	%o2, 31, %o2		! adjust length to allow cc test
853	ble,pt	%ncc, .bc_medh31
854	nop
855.bc_medh32:				! load and store block of 32 bytes
856	subcc	%o2, 32, %o2		! decrement length count
857
858	lduh	[%o0], %o4		! move 32 bytes
859	lduw	[%o0+2], %o3
860	sllx	%o4, 48, %o4
861	sllx	%o3, 16, %o3
862	or	%o4, %o3, %o3
863	lduh	[%o0+6], %o4
864	or	%o4, %o3, %o4
865	stx	%o4, [%o1]
866
867	lduh	[%o0+8], %o4
868	lduw	[%o0+10], %o3
869	sllx	%o4, 48, %o4
870	sllx	%o3, 16, %o3
871	or	%o4, %o3, %o3
872	lduh	[%o0+14], %o4
873	or	%o4, %o3, %o4
874	stx	%o4, [%o1+8]
875
876	lduh	[%o0+16], %o4
877	lduw	[%o0+18], %o3
878	sllx	%o4, 48, %o4
879	sllx	%o3, 16, %o3
880	or	%o4, %o3, %o3
881	lduh	[%o0+22], %o4
882	or	%o4, %o3, %o4
883	stx	%o4, [%o1+16]
884
885	add	%o0, 32, %o0		! increase src ptr by 32
886	add	%o1, 32, %o1		! increase dst ptr by 32
887
888	lduh	[%o0-8], %o4
889	lduw	[%o0-6], %o3
890	sllx	%o4, 48, %o4
891	sllx	%o3, 16, %o3
892	or	%o4, %o3, %o3
893	lduh	[%o0-2], %o4
894	or	%o3, %o4, %o4
895	bgu,pt	%ncc, .bc_medh32	! repeat if at least 32 bytes left
896	stx	%o4, [%o1-8]
897
898.bc_medh31:
899	addcc	%o2, 24, %o2		! adjust count to be off by 7
900	ble,pt	%ncc, .bc_medh7		! skip if 7 or fewer bytes left
901	nop				!
902.bc_medh15:
903	lduh	[%o0], %o4		! move 8 bytes per pass: half, word, half
904	subcc	%o2, 8, %o2		! decrement length count
905	lduw	[%o0+2], %o3
906	sllx	%o4, 48, %o4		! first halfword to bits 63:48
907	sllx	%o3, 16, %o3		! word to bits 47:16
908	or	%o4, %o3, %o3
909	add	%o1, 8, %o1		! increase dst ptr by 8
910	lduh	[%o0+6], %o4		! last halfword stays in bits 15:0
911	add	%o0, 8, %o0		! increase src ptr by 8
912	or	%o4, %o3, %o4
913	bgu,pt	%ncc, .bc_medh15	! tests the subcc result above
914	stx	%o4, [%o1-8]		! store the assembled 8 bytes
915.bc_medh7:
916	addcc	%o2, 7, %o2		! finish adjustment of remaining count
917	bz,pt	%ncc, .bc_smallx	! exit if finished
918	cmp	%o2, 4
919	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
920	nop				!
921	lduh	[%o0], %o4
922	sll	%o4, 16, %o4
923	lduh	[%o0+2], %o3
924	or	%o3, %o4, %o4
925	subcc	%o2, 4, %o2
926	add	%o0, 4, %o0
927	add	%o1, 4, %o1
928	bnz	.bc_small3x
929	stw	%o4, [%o1-4]
930	ba	.bc_smallx
931	nop
932
933	.align 16
934.bc_med_byte:
935	bnz,pt	%ncc, .bc_medbh32a	! go to correct byte move
936	subcc	%o2, 31, %o2		! adjust length to allow cc test
937	ble,pt	%ncc, .bc_medb31
938	nop
939.bc_medb32:				! Alignment 1 or 5
940	subcc	%o2, 32, %o2		! decrement length count
941
942	ldub	[%o0], %o4		! load and store a block of 32 bytes
943	sllx	%o4, 56, %o3
944	lduh	[%o0+1], %o4
945	sllx	%o4, 40, %o4
946	or	%o4, %o3, %o3
947	lduw	[%o0+3], %o4
948	sllx	%o4, 8, %o4
949	or	%o4, %o3, %o3
950	ldub	[%o0+7], %o4
951	or	%o4, %o3, %o4
952	stx	%o4, [%o1]
953
954	ldub	[%o0+8], %o4
955	sllx	%o4, 56, %o3
956	lduh	[%o0+9], %o4
957	sllx	%o4, 40, %o4
958	or	%o4, %o3, %o3
959	lduw	[%o0+11], %o4
960	sllx	%o4, 8, %o4
961	or	%o4, %o3, %o3
962	ldub	[%o0+15], %o4
963	or	%o4, %o3, %o4
964	stx	%o4, [%o1+8]
965
966	ldub	[%o0+16], %o4
967	sllx	%o4, 56, %o3
968	lduh	[%o0+17], %o4
969	sllx	%o4, 40, %o4
970	or	%o4, %o3, %o3
971	lduw	[%o0+19], %o4
972	sllx	%o4, 8, %o4
973	or	%o4, %o3, %o3
974	ldub	[%o0+23], %o4
975	or	%o4, %o3, %o4
976	stx	%o4, [%o1+16]
977
978	add	%o0, 32, %o0		! increase src ptr by 32
979	add	%o1, 32, %o1		! increase dst ptr by 32
980
981	ldub	[%o0-8], %o4
982	sllx	%o4, 56, %o3
983	lduh	[%o0-7], %o4
984	sllx	%o4, 40, %o4
985	or	%o4, %o3, %o3
986	lduw	[%o0-5], %o4
987	sllx	%o4, 8, %o4
988	or	%o4, %o3, %o3
989	ldub	[%o0-1], %o4
990	or	%o4, %o3, %o4
991	bgu,pt	%ncc, .bc_medb32	! repeat if at least 32 bytes left
992	stx	%o4, [%o1-8]
993
994.bc_medb31:				! 31 or fewer bytes remaining
995	addcc	%o2, 24, %o2		! adjust count to be off by 7
996	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
997	nop				!
998.bc_medb15:
999
1000	ldub	[%o0], %o4		! load and store a block of 8 bytes
1001	subcc	%o2, 8, %o2		! decrement length count
1002	sllx	%o4, 56, %o3		! byte to bits 63:56
1003	lduh	[%o0+1], %o4
1004	sllx	%o4, 40, %o4		! halfword to bits 55:40
1005	or	%o4, %o3, %o3
1006	lduw	[%o0+3], %o4
1007	add	%o1, 8, %o1		! increase dst ptr by 8
1008	sllx	%o4, 8, %o4		! word to bits 39:8
1009	or	%o4, %o3, %o3
1010	ldub	[%o0+7], %o4		! last byte stays in bits 7:0
1011	add	%o0, 8, %o0		! increase src ptr by 8
1012	or	%o4, %o3, %o4
1013	bgu,pt	%ncc, .bc_medb15	! tests the subcc result above
1014	stx	%o4, [%o1-8]		! store the assembled 8 bytes
1015.bc_medb7:
1016	addcc	%o2, 7, %o2		! finish adjustment of remaining count
1017	bz,pt	%ncc, .bc_smallx	! exit if finished
1018	cmp	%o2, 4
1019	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
1020	nop				!
1021	ldub	[%o0], %o4		! move 4 bytes
1022	sll	%o4, 24, %o3
1023	lduh	[%o0+1], %o4
1024	sll	%o4, 8, %o4
1025	or	%o4, %o3, %o3
1026	ldub	[%o0+3], %o4
1027	or	%o4, %o3, %o4
1028	subcc	%o2, 4, %o2
1029	add	%o0, 4, %o0
1030	add	%o1, 4, %o1
1031	bnz	.bc_small3x
1032	stw	%o4, [%o1-4]
1033	ba	.bc_smallx
1034	nop
1035
1036	.align 16
1037.bc_medbh32a:				! Alignment 3 or 7
1038	ble,pt	%ncc, .bc_medbh31
1039	nop
1040.bc_medbh32:				! Alignment 3 or 7
1041	subcc	%o2, 32, %o2		! decrement length count
1042
1043	ldub	[%o0], %o4		! load and store a block of 32 bytes
1044	sllx	%o4, 56, %o3
1045	lduw	[%o0+1], %o4
1046	sllx	%o4, 24, %o4
1047	or	%o4, %o3, %o3
1048	lduh	[%o0+5], %o4
1049	sllx	%o4, 8, %o4
1050	or	%o4, %o3, %o3
1051	ldub	[%o0+7], %o4
1052	or	%o4, %o3, %o4
1053	stx	%o4, [%o1]
1054
1055	ldub	[%o0+8], %o4
1056	sllx	%o4, 56, %o3
1057	lduw	[%o0+9], %o4
1058	sllx	%o4, 24, %o4
1059	or	%o4, %o3, %o3
1060	lduh	[%o0+13], %o4
1061	sllx	%o4, 8, %o4
1062	or	%o4, %o3, %o3
1063	ldub	[%o0+15], %o4
1064	or	%o4, %o3, %o4
1065	stx	%o4, [%o1+8]
1066
1067	ldub	[%o0+16], %o4
1068	sllx	%o4, 56, %o3
1069	lduw	[%o0+17], %o4
1070	sllx	%o4, 24, %o4
1071	or	%o4, %o3, %o3
1072	lduh	[%o0+21], %o4
1073	sllx	%o4, 8, %o4
1074	or	%o4, %o3, %o3
1075	ldub	[%o0+23], %o4
1076	or	%o4, %o3, %o4
1077	stx	%o4, [%o1+16]
1078
1079	add	%o0, 32, %o0		! increase src ptr by 32
1080	add	%o1, 32, %o1		! increase dst ptr by 32
1081
1082	ldub	[%o0-8], %o4
1083	sllx	%o4, 56, %o3
1084	lduw	[%o0-7], %o4
1085	sllx	%o4, 24, %o4
1086	or	%o4, %o3, %o3
1087	lduh	[%o0-3], %o4
1088	sllx	%o4, 8, %o4
1089	or	%o4, %o3, %o3
1090	ldub	[%o0-1], %o4
1091	or	%o4, %o3, %o4
1092	bgu,pt	%ncc, .bc_medbh32	! repeat if at least 32 bytes left
1093	stx	%o4, [%o1-8]
1094
1095.bc_medbh31:
1096	addcc	%o2, 24, %o2		! adjust count to be off by 7
1097	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
1098	nop				!
/*
 * Copy 8 bytes per pass.  The source is read as byte / word /
 * halfword / byte at offsets 0, 1, 5 and 7 and merged into a single
 * doubleword in %o4.  The doubleword is written once, by the stx in
 * the branch delay slot; because both pointers have already been
 * advanced by 8 at that point the store uses a -8 offset.  The delay
 * slot is not annulled, so the store also happens on the final pass
 * when the branch falls through to the "ba .bc_medb7".
 */
1099.bc_medbh15:
1100	ldub	[%o0], %o4		! load and store a block of 8 bytes
1101	sllx	%o4, 56, %o3
1102	lduw	[%o0+1], %o4
1103	sllx	%o4, 24, %o4
1104	or	%o4, %o3, %o3
1105	lduh	[%o0+5], %o4
1106	sllx	%o4, 8, %o4
1107	or	%o4, %o3, %o3
1108	ldub	[%o0+7], %o4
1109	or	%o4, %o3, %o4
1111	subcc	%o2, 8, %o2		! decrement length count
1112	add	%o1, 8, %o1		! increase dst ptr by 8
1113	add	%o0, 8, %o0		! increase src ptr by 8
1114	bgu,pt	%ncc, .bc_medbh15
1115	stx	%o4, [%o1-8]
1116	ba	.bc_medb7
1117	nop
1118
1119	SET_SIZE(bcopy)
1120/*
1121 * The _more entry points are not intended to be used directly by
1122 * any caller from outside this file.  They are provided to allow
1123 * profiling and dtrace of the portions of the copy code that use
1124 * the floating point registers.
1125*/
/*
 * Entry to the FP-register copy path.  The save opens a new register
 * window with MINFRAME + HWCOPYFRAMESIZE bytes of stack, so the
 * caller's three arguments are read through %i0, %i1 and %i2 below.
 *
 * %o5 is loaded with the thread's current t_lofault.  When it is
 * zero the handler is left alone and control goes to .do_copy with
 * %o5 == 0.  Otherwise .copyerr is installed as t_lofault and
 * LOFAULT_SET is or-ed into %o5, which the exit code tests before
 * writing the saved value back.
 */
1126	ENTRY(bcopy_more)
1127.bcopy_more:
1128	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1129	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
1130	brz,pt	%o5, .do_copy
1131	nop
1132	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
1133	or	%l7, %lo(.copyerr), %l7
1134	membar	#Sync				! sync error barrier
1135	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
1136	! We've already captured whether t_lofault was zero on entry.
1137	! We need to mark ourselves as being from bcopy since both
1138	! kcopy and bcopy use the same code path. If LOFAULT_SET is
1139	! set and the saved lofault was zero, we won't reset lofault on
1140	! returning.
1141	or	%o5, LOFAULT_SET, %o5
/*
 * A thread whose t_lwp is non-zero branches straight to 1: and leaves
 * t_preempt untouched.  Otherwise the signed byte t_preempt is
 * incremented in place (load, inc, store back).
 */
1142.do_copy:
1143	ldn	[THREAD_REG + T_LWP], %o3
1144	brnz,pt	%o3, 1f
1145	nop
1146/*
1147 * kpreempt_disable();
1148 */
1149	ldsb	[THREAD_REG +T_PREEMPT], %o3
1150	inc	%o3
1151	stb	%o3, [THREAD_REG + T_PREEMPT]
11521:
/*
 * FP state setup followed by the destination alignment chain.
 *
 * %g5 keeps (%fprs & FPRS_FEF) and FPUSED_FLAG is recorded in %o5.
 * If FEF was already set, BST_FP_TOSTACK is invoked before
 * continuing; if it was clear, FEF is turned on instead.  %l5 holds
 * the entry value of %gsr.
 *
 * Delay-slot layout to keep in mind when editing:
 *   - "ba .bc_fp_ready" has the andcc at .bc_fp_unused in its delay
 *     slot, so both paths reach .bc_fp_ready with the condition
 *     codes reflecting (%i1 & 1).
 *   - Each bnz in the chain has the andcc that follows the next
 *     label in its delay slot.  On fall-through that andcc supplies
 *     the condition codes for the next bnz (destination bits 2,
 *     then 4).
 *   - The final brnz tests the register %o3 = (%i0 & 7) rather than
 *     the condition codes; its delay slot is a prefetch.
 */
1153/*
1154 * Following code is for large copies. We know there is at
1155 * least FP_COPY bytes available. FP regs are used, so
1156 *  we save registers and fp regs before starting
1157 */
1158	rd	%fprs, %g5		! check for unused fp
1159	or	%o5,FPUSED_FLAG,%o5
1160	! if fprs.fef == 0, set it.
1161	! Setting it when already set costs more than checking
1162	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
1163	bz,pt	%ncc, .bc_fp_unused
1164	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
1165	BST_FP_TOSTACK(%o3)
1166	ba	.bc_fp_ready
1167.bc_fp_unused:
1168	andcc	%i1, 1, %o3		! is dest byte aligned
1169	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1170.bc_fp_ready:
1171	rd	%gsr, %l5		! save %gsr value
1172	bnz,pt	%ncc, .bc_big_d1
1173.bc_big_d1f:				! dest is now half word aligned
1174	andcc	%i1, 2, %o3
1175	bnz,pt	%ncc, .bc_big_d2
1176.bc_big_d2f:				! dest is now word aligned
1177	andcc	%i1, 4, %o3
1178	bnz,pt	%ncc, .bc_big_d4
1179.bc_big_d4f:				! dest is now long word aligned
1180	andcc	%i0, 7, %o3		! is src long word aligned
1181	brnz,pt	%o3, .bc_big_unal8
1182	prefetch [%i0 + (2 * CACHE_LINE)], #one_read
1183
/*
 * Bring the destination up to a 64-byte boundary using 8-byte moves.
 * %o3 is set to (dst & 0x3f) - 64, i.e. minus the number of bytes
 * still to move, and that (negative) amount is added to the count in
 * %i2 up front.  %o3 is then stepped toward zero: one doubleword is
 * moved if bit 3 is set, after which .bc_al_mv_16 moves 16 bytes per
 * pass until bits 4 and 5 are both clear.
 */
1184	! Src and dst are long word aligned
1185	! align dst to 64 byte boundary
1186	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
1187	brz,pn	%o3, .bc_al_to_64
1188	nop
1189	sub	%o3, 64, %o3		! %o3 has negative bytes to move
1190	add	%i2, %o3, %i2		! adjust remaining count
1191	andcc	%o3, 8, %o4		! odd long words to move?
1192	brz,pt	%o4, .bc_al_to_16
1193	nop
1194	add	%o3, 8, %o3
1195	ldx	[%i0], %o4
1196	add	%i0, 8, %i0		! increment src ptr
1197	add	%i1, 8, %i1		! increment dst ptr
1198	stx	%o4, [%i1-8]
1199! Dest is aligned on 16 bytes, src 8 byte aligned
1200.bc_al_to_16:
1201	andcc	%o3, 0x30, %o4		! pair of long words to move?
1202	brz,pt	%o4, .bc_al_to_64
1203	nop
1204.bc_al_mv_16:
1205	add	%o3, 16, %o3
1206	ldx	[%i0], %o4
1207	stx	%o4, [%i1]
1208	ldx	[%i0+8], %o4
1209	add	%i0, 16, %i0		! increment src ptr
1210	stx	%o4, [%i1+8]
1211	andcc	%o3, 48, %o4
1212	brnz,pt	%o4, .bc_al_mv_16
1213	add	%i1, 16, %i1		! increment dst ptr
/*
 * Eight-way dispatch on source address bits 5, 4 and 3 (values 32,
 * 16 and 8).  The three digits in each .bc_aln_xyz target name are
 * those bits in that order.  Each brnz/brz tests the register %o3
 * written by the preceding andcc; several of those andcc
 * instructions sit in the delay slot of the previous branch, and the
 * remaining delay slots hold prefetches.  The last brz falls through
 * to the code for the 111 case.
 */
1214! Dest is aligned on 64 bytes, src 8 byte aligned
1215.bc_al_to_64:
1216	! Determine source alignment
1217	! to correct 8 byte offset
1218	andcc	%i0, 32, %o3
1219	brnz,pn	%o3, .bc_aln_1
1220	andcc	%i0, 16, %o3
1221	brnz,pn	%o3, .bc_aln_01
1222	andcc	%i0, 8, %o3
1223	brz,pn	%o3, .bc_aln_000
1224	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1225	ba	.bc_aln_001
1226	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1227
1228.bc_aln_01:
1229	brnz,pn	%o3, .bc_aln_011
1230	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1231	ba	.bc_aln_010
1232	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1233.bc_aln_1:
1234	andcc	%i0, 16, %o3
1235	brnz,pn	%o3, .bc_aln_11
1236	andcc	%i0, 8, %o3
1237	brnz,pn	%o3, .bc_aln_101
1238	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1239	ba	.bc_aln_100
1240	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1241.bc_aln_11:
1242	brz,pn	%o3, .bc_aln_110
1243	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
1244
/*
 * Source address bits 5:3 are 111.  One doubleword is preloaded into
 * %d0 and the source pointer advanced by 8.  %o3 is the remaining
 * count rounded down to a multiple of 128 and %i2 keeps the residue.
 * Inside the loop %i1 holds (dst - src), so [%i0+%i1] addresses the
 * destination.  Each pass block-loads into %d16-%d30, copies
 * %d16-%d28 into %d2-%d14, block-stores %d0-%d14 and carries %d30
 * over in %d0.  After the loop the carried doubleword is stored and
 * the destination pointer (restored to an absolute address) is
 * advanced by 8 in the delay slot of the final ba.
 */
1245.bc_aln_111:
1246! Alignment off by 8 bytes
1247	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1248	ldd	[%i0], %d0
1249	add	%i0, 8, %i0
1250	sub	%i2, 8, %i2
1251	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1252	and	%i2, 0x7f, %i2		! residue bytes in %i2
1253	sub	%i1, %i0, %i1
1254.bc_aln_111_loop:
1255	ldda	[%i0]ASI_BLK_P,%d16		! block load
1256	subcc	%o3, 64, %o3
1257	fmovd	%d16, %d2
1258	fmovd	%d18, %d4
1259	fmovd	%d20, %d6
1260	fmovd	%d22, %d8
1261	fmovd	%d24, %d10
1262	fmovd	%d26, %d12
1263	fmovd	%d28, %d14
1264	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1265	stda	%d0,[%i0+%i1]ASI_BLK_P
1266	add	%i0, 64, %i0
1267	fmovd	%d30, %d0
1268	bgt,pt	%ncc, .bc_aln_111_loop
1269	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1270	add	%i1, %i0, %i1
1271
1272	std	%d0, [%i1]
1273	ba	.bc_remain_stuff
1274	add	%i1, 8, %i1
1275	! END OF aln_111
1276
/*
 * Source address bits 5:3 are 110.  Two doublewords are preloaded
 * into %d0 and %d2 and the source pointer advanced by 16.  %o3 is
 * the remaining count rounded down to a multiple of 128 and %i2
 * keeps the residue.  Inside the loop %i1 holds (dst - src), so
 * [%i0+%i1] addresses the destination.  Each pass block-loads into
 * %d16-%d30, copies %d16-%d26 into %d4-%d14, block-stores %d0-%d14
 * and carries %d28 and %d30 over in %d0 and %d2.  After the loop the
 * two carried doublewords are stored and the destination pointer is
 * advanced by 16 in the delay slot of the final ba.
 */
1277.bc_aln_110:
1278! Alignment off by 16 bytes
1279	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1280	ldd	[%i0], %d0
1281	ldd	[%i0+8], %d2
1282	add	%i0, 16, %i0
1283	sub	%i2, 16, %i2
1284	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1285	and	%i2, 0x7f, %i2		! residue bytes in %i2
1286	sub	%i1, %i0, %i1
1287.bc_aln_110_loop:
1288	ldda	[%i0]ASI_BLK_P,%d16		! block load
1289	subcc	%o3, 64, %o3
1290	fmovd	%d16, %d4
1291	fmovd	%d18, %d6
1292	fmovd	%d20, %d8
1293	fmovd	%d22, %d10
1294	fmovd	%d24, %d12
1295	fmovd	%d26, %d14
1296	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1297	stda	%d0,[%i0+%i1]ASI_BLK_P
1298	add	%i0, 64, %i0
1299	fmovd	%d28, %d0
1300	fmovd	%d30, %d2
1301	bgt,pt	%ncc, .bc_aln_110_loop
1302	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1303	add	%i1, %i0, %i1
1304
1305	std	%d0, [%i1]
1306	std	%d2, [%i1+8]
1307	ba	.bc_remain_stuff
1308	add	%i1, 16, %i1
1309	! END OF aln_110
1310
/*
 * Source address bits 5:3 are 101.  Three doublewords are preloaded
 * into %d0-%d4 and the source pointer advanced by 24.  %o3 is the
 * remaining count rounded down to a multiple of 128 and %i2 keeps
 * the residue.  Inside the loop %i1 holds (dst - src), so [%i0+%i1]
 * addresses the destination.  Each pass block-loads into %d16-%d30,
 * copies %d16-%d24 into %d6-%d14, block-stores %d0-%d14 and carries
 * %d26-%d30 over in %d0-%d4.  After the loop the three carried
 * doublewords are stored and the destination pointer is advanced by
 * 24 in the delay slot of the final ba.
 */
1311.bc_aln_101:
1312! Alignment off by 24 bytes
1313	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1314	ldd	[%i0], %d0
1315	ldd	[%i0+8], %d2
1316	ldd	[%i0+16], %d4
1317	add	%i0, 24, %i0
1318	sub	%i2, 24, %i2
1319	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1320	and	%i2, 0x7f, %i2		! residue bytes in %i2
1321	sub	%i1, %i0, %i1
1322.bc_aln_101_loop:
1323	ldda	[%i0]ASI_BLK_P,%d16	! block load
1324	subcc	%o3, 64, %o3
1325	fmovd	%d16, %d6
1326	fmovd	%d18, %d8
1327	fmovd	%d20, %d10
1328	fmovd	%d22, %d12
1329	fmovd	%d24, %d14
1330	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1331	stda	%d0,[%i0+%i1]ASI_BLK_P
1332	add	%i0, 64, %i0
1333	fmovd	%d26, %d0
1334	fmovd	%d28, %d2
1335	fmovd	%d30, %d4
1336	bgt,pt	%ncc, .bc_aln_101_loop
1337	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1338	add	%i1, %i0, %i1
1339
1340	std	%d0, [%i1]
1341	std	%d2, [%i1+8]
1342	std	%d4, [%i1+16]
1343	ba	.bc_remain_stuff
1344	add	%i1, 24, %i1
1345	! END OF aln_101
1346
/*
 * Source address bits 5:3 are 100.  Four doublewords are preloaded
 * into %d0-%d6 and the source pointer advanced by 32.  %o3 is the
 * remaining count rounded down to a multiple of 128 and %i2 keeps
 * the residue.  Inside the loop %i1 holds (dst - src), so [%i0+%i1]
 * addresses the destination.  Each pass block-loads into %d16-%d30,
 * copies %d16-%d22 into %d8-%d14, block-stores %d0-%d14 and carries
 * %d24-%d30 over in %d0-%d6.  After the loop the four carried
 * doublewords are stored and the destination pointer is advanced by
 * 32 in the delay slot of the final ba.
 */
1347.bc_aln_100:
1348! Alignment off by 32 bytes
1349	ldd	[%i0], %d0
1350	ldd	[%i0+8], %d2
1351	ldd	[%i0+16],%d4
1352	ldd	[%i0+24],%d6
1353	add	%i0, 32, %i0
1354	sub	%i2, 32, %i2
1355	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1356	and	%i2, 0x7f, %i2		! residue bytes in %i2
1357	sub	%i1, %i0, %i1
1358.bc_aln_100_loop:
1359	ldda	[%i0]ASI_BLK_P,%d16	! block load
1360	subcc	%o3, 64, %o3
1361	fmovd	%d16, %d8
1362	fmovd	%d18, %d10
1363	fmovd	%d20, %d12
1364	fmovd	%d22, %d14
1365	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1366	stda	%d0,[%i0+%i1]ASI_BLK_P
1367	add	%i0, 64, %i0
1368	fmovd	%d24, %d0
1369	fmovd	%d26, %d2
1370	fmovd	%d28, %d4
1371	fmovd	%d30, %d6
1372	bgt,pt	%ncc, .bc_aln_100_loop
1373	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1374	add	%i1, %i0, %i1
1375
1376	std	%d0, [%i1]
1377	std	%d2, [%i1+8]
1378	std	%d4, [%i1+16]
1379	std	%d6, [%i1+24]
1380	ba	.bc_remain_stuff
1381	add	%i1, 32, %i1
1382	! END OF aln_100
1383
/*
 * Source address bits 5:3 are 011.  Five doublewords are preloaded
 * into %d0-%d8 and the source pointer advanced by 40.  %o3 is the
 * remaining count rounded down to a multiple of 128 and %i2 keeps
 * the residue.  Inside the loop %i1 holds (dst - src), so [%i0+%i1]
 * addresses the destination.  Each pass block-loads into %d16-%d30,
 * copies %d16-%d20 into %d10-%d14, block-stores %d0-%d14 and carries
 * %d22-%d30 over in %d0-%d8.  After the loop the five carried
 * doublewords are stored and the destination pointer is advanced by
 * 40 in the delay slot of the final ba.
 */
1384.bc_aln_011:
1385! Alignment off by 40 bytes
1386	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1387	ldd	[%i0], %d0
1388	ldd	[%i0+8], %d2
1389	ldd	[%i0+16], %d4
1390	ldd	[%i0+24], %d6
1391	ldd	[%i0+32], %d8
1392	add	%i0, 40, %i0
1393	sub	%i2, 40, %i2
1394	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1395	and	%i2, 0x7f, %i2		! residue bytes in %i2
1396	sub	%i1, %i0, %i1
1397.bc_aln_011_loop:
1398	ldda	[%i0]ASI_BLK_P,%d16	! block load
1399	subcc	%o3, 64, %o3
1400	fmovd	%d16, %d10
1401	fmovd	%d18, %d12
1402	fmovd	%d20, %d14
1403	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1404	stda	%d0,[%i0+%i1]ASI_BLK_P
1405	add	%i0, 64, %i0
1406	fmovd	%d22, %d0
1407	fmovd	%d24, %d2
1408	fmovd	%d26, %d4
1409	fmovd	%d28, %d6
1410	fmovd	%d30, %d8
1411	bgt,pt	%ncc, .bc_aln_011_loop
1412	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1413	add	%i1, %i0, %i1
1414
1415	std	%d0, [%i1]
1416	std	%d2, [%i1+8]
1417	std	%d4, [%i1+16]
1418	std	%d6, [%i1+24]
1419	std	%d8, [%i1+32]
1420	ba	.bc_remain_stuff
1421	add	%i1, 40, %i1
1422	! END OF aln_011
1423
/*
 * Source address bits 5:3 are 010.  Six doublewords are preloaded
 * into %d0-%d10 and the source pointer advanced by 48.  %o3 is the
 * remaining count rounded down to a multiple of 128 and %i2 keeps
 * the residue.  Inside the loop %i1 holds (dst - src), so [%i0+%i1]
 * addresses the destination.  Each pass block-loads into %d16-%d30,
 * copies %d16 and %d18 into %d12 and %d14, block-stores %d0-%d14 and
 * carries %d20-%d30 over in %d0-%d10.  After the loop the six
 * carried doublewords are stored and the destination pointer is
 * advanced by 48 in the delay slot of the final ba.
 */
1424.bc_aln_010:
1425! Alignment off by 48 bytes
1426	ldd	[%i0], %d0
1427	ldd	[%i0+8], %d2
1428	ldd	[%i0+16], %d4
1429	ldd	[%i0+24], %d6
1430	ldd	[%i0+32], %d8
1431	ldd	[%i0+40], %d10
1432	add	%i0, 48, %i0
1433	sub	%i2, 48, %i2
1434	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1435	and	%i2, 0x7f, %i2		! residue bytes in %i2
1436	sub	%i1, %i0, %i1
1437.bc_aln_010_loop:
1438	ldda	[%i0]ASI_BLK_P,%d16	! block load
1439	subcc	%o3, 64, %o3
1440	fmovd	%d16, %d12
1441	fmovd	%d18, %d14
1442	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1443	stda	%d0,[%i0+%i1]ASI_BLK_P
1444	add	%i0, 64, %i0
1445	fmovd	%d20, %d0
1446	fmovd	%d22, %d2
1447	fmovd	%d24, %d4
1448	fmovd	%d26, %d6
1449	fmovd	%d28, %d8
1450	fmovd	%d30, %d10
1451	bgt,pt	%ncc, .bc_aln_010_loop
1452	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1453	add	%i1, %i0, %i1
1454
1455	std	%d0, [%i1]
1456	std	%d2, [%i1+8]
1457	std	%d4, [%i1+16]
1458	std	%d6, [%i1+24]
1459	std	%d8, [%i1+32]
1460	std	%d10, [%i1+40]
1461	ba	.bc_remain_stuff
1462	add	%i1, 48, %i1
1463	! END OF aln_010
1464
/*
 * Source address bits 5:3 are 001.  Seven doublewords are preloaded
 * into %d0-%d12 and the source pointer advanced by 56.  %o3 is the
 * remaining count rounded down to a multiple of 128 and %i2 keeps
 * the residue.  Inside the loop %i1 holds (dst - src), so [%i0+%i1]
 * addresses the destination.  Each pass block-loads into %d16-%d30,
 * copies %d16 into %d14, block-stores %d0-%d14 and carries %d18-%d30
 * over in %d0-%d12.  After the loop the seven carried doublewords
 * are stored and the destination pointer is advanced by 56 in the
 * delay slot of the final ba.
 */
1465.bc_aln_001:
1466! Alignment off by 56 bytes
1467	ldd	[%i0], %d0
1468	ldd	[%i0+8], %d2
1469	ldd	[%i0+16], %d4
1470	ldd	[%i0+24], %d6
1471	ldd	[%i0+32], %d8
1472	ldd	[%i0+40], %d10
1473	ldd	[%i0+48], %d12
1474	add	%i0, 56, %i0
1475	sub	%i2, 56, %i2
1476	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1477	and	%i2, 0x7f, %i2		! residue bytes in %i2
1478	sub	%i1, %i0, %i1
1479.bc_aln_001_loop:
1480	ldda	[%i0]ASI_BLK_P,%d16	! block load
1481	subcc	%o3, 64, %o3
1482	fmovd	%d16, %d14
1483	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1484	stda	%d0,[%i0+%i1]ASI_BLK_P
1485	add	%i0, 64, %i0
1486	fmovd	%d18, %d0
1487	fmovd	%d20, %d2
1488	fmovd	%d22, %d4
1489	fmovd	%d24, %d6
1490	fmovd	%d26, %d8
1491	fmovd	%d28, %d10
1492	fmovd	%d30, %d12
1493	bgt,pt	%ncc, .bc_aln_001_loop
1494	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1495	add	%i1, %i0, %i1
1496
1497	std	%d0, [%i1]
1498	std	%d2, [%i1+8]
1499	std	%d4, [%i1+16]
1500	std	%d6, [%i1+24]
1501	std	%d8, [%i1+32]
1502	std	%d10, [%i1+40]
1503	std	%d12, [%i1+48]
1504	ba	.bc_remain_stuff
1505	add	%i1, 56, %i1
1506	! END OF aln_001
1507
/*
 * Source address bits 5:3 are 000, so nothing is preloaded or
 * carried.  %o3 is the count rounded down to a multiple of 128 and
 * %i2 keeps the residue.  Inside the loop %i1 holds (dst - src), so
 * [%i0+%i1] addresses the destination.  Each pass block-loads into
 * %d0 and block-stores straight from %d0.  After the loop %i1 is
 * turned back into an absolute destination address and control
 * falls through to the code that follows.
 */
1508.bc_aln_000:
1509	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1510	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
1511	and	%i2, 0x7f, %i2		! residue bytes in %i2
1512	sub	%i1, %i0, %i1
1513.bc_aln_000_loop:
1514	ldda	[%i0]ASI_BLK_P,%d0
1515	subcc	%o3, 64, %o3
1516	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
1517	stda	%d0,[%i0+%i1]ASI_BLK_P
1518	add	%i0, 64, %i0
1519	bgt,pt	%ncc, .bc_aln_000_loop
1520	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1521	add	%i1, %i0, %i1
1522
1523	! END OF aln_000
1524
/*
 * Copy the residue left in %i2 using integer loads and stores, with
 * both pointers used as 8-byte (then 4-byte) aligned addresses.
 *
 * The count is kept biased so a single subcc both decrements it and
 * sets the loop condition: it is lowered by 31 before the 32-byte
 * loop, raised by 24 (net -7) before the 8-byte loop, and raised by
 * 7 (net 0) at .bc_aln_7.  In every loop the final store sits in the
 * branch delay slot and uses a negative offset because the pointers
 * have already been advanced.
 *
 * At .bc_aln_7 a zero count exits; the cmp in that branch's delay
 * slot feeds the following blt.  A count of 4 or more moves one
 * word, and anything left goes to .bc_unaln3x.
 */
1525.bc_remain_stuff:
1526	subcc	%i2, 31, %i2		! adjust length to allow cc test
1527	ble,pt	%ncc, .bc_aln_31
1528	nop
1529.bc_aln_32:
1530	ldx	[%i0], %o4		! move 32 bytes
1531	subcc	%i2, 32, %i2		! decrement length count by 32
1532	stx	%o4, [%i1]
1533	ldx	[%i0+8], %o4
1534	stx	%o4, [%i1+8]
1535	ldx	[%i0+16], %o4
1536	add	%i0, 32, %i0		! increase src ptr by 32
1537	stx	%o4, [%i1+16]
1538	ldx	[%i0-8], %o4
1539	add	%i1, 32, %i1		! increase dst ptr by 32
1540	bgu,pt	%ncc, .bc_aln_32	! repeat if at least 32 bytes left
1541	stx	%o4, [%i1-8]
1542.bc_aln_31:
1543	addcc	%i2, 24, %i2		! adjust count to be off by 7
1544	ble,pt	%ncc, .bc_aln_7		! skip if 7 or fewer bytes left
1545	nop				!
1546.bc_aln_15:
1547	ldx	[%i0], %o4		! move 8 bytes
1548	add	%i0, 8, %i0		! increase src ptr by 8
1549	subcc	%i2, 8, %i2		! decrease count by 8
1550	add	%i1, 8, %i1		! increase dst ptr by 8
1551	bgu,pt	%ncc, .bc_aln_15
1552	stx	%o4, [%i1-8]		!
1553.bc_aln_7:
1554	addcc	%i2, 7, %i2		! finish adjustment of remaining count
1555	bz,pt	%ncc, .bc_exit		! exit if finished
1556	cmp	%i2, 4
1557	blt,pt	%ncc, .bc_unaln3x	! skip if less than 4 bytes left
1558	nop				!
1559	ld	[%i0], %o4		! move 4 bytes
1560	add	%i0, 4, %i0		! increase src ptr by 4
1561	add	%i1, 4, %i1		! increase dst ptr by 4
1562	subcc	%i2, 4, %i2		! decrease count by 4
1563	bnz	.bc_unaln3x
1564	stw	%o4, [%i1-4]
1565	ba	.bc_exit
1566	nop
1567
/*
 * Out-of-line steps that move 1, 2 and then 4 bytes so that the
 * destination becomes 2-, 4- and finally 8-byte aligned.  The source
 * is read one byte at a time because its alignment is unknown; the
 * bytes are merged most-significant first and written with a single
 * stb, sth or stw.
 *
 * After each step the next destination bit is tested.  If it is
 * clear the code branches back to the matching ...f label in the
 * main path, otherwise it falls through to the next larger step.
 * The count adjustment for each step sits in that branch's delay
 * slot.
 */
1568	! destination alignment code
1569.bc_big_d1:
1570	ldub	[%i0], %o4		! move a byte
1571	add	%i0, 1, %i0
1572	stb	%o4, [%i1]
1573	add	%i1, 1, %i1
1574	andcc	%i1, 2, %o3
1575	bz,pt	%ncc, .bc_big_d2f
1576	sub	%i2, 1, %i2
1577.bc_big_d2:
1578	ldub	[%i0], %o4		! move a half-word (src align unknown)
1579	ldub	[%i0+1], %o3
1580	add	%i0, 2, %i0
1581	sll	%o4, 8, %o4		! position
1582	or	%o4, %o3, %o4		! merge
1583	sth	%o4, [%i1]
1584	add	%i1, 2, %i1
1585	andcc	%i1, 4, %o3
1586	bz,pt	%ncc, .bc_big_d4f
1587	sub	%i2, 2, %i2
1588.bc_big_d4:
1589	ldub	[%i0], %o4		! move a word (src align unknown)
1590	ldub	[%i0+1], %o3
1591	sll	%o4, 24, %o4		! position
1592	sll	%o3, 16, %o3		! position
1593	or	%o4, %o3, %o3		! merge
1594	ldub	[%i0+2], %o4
1595	sll	%o4, 8, %o4		! position
1596	or	%o4, %o3, %o3		! merge
1597	ldub	[%i0+3], %o4
1598	or	%o4, %o3, %o4		! merge
1599	stw	%o4,[%i1]		! store four bytes
1600	add	%i0, 4, %i0		! adjust src by 4
1601	add	%i1, 4, %i1		! adjust dest by 4
1602	ba	.bc_big_d4f
1603	sub	%i2, 4, %i2		! adjust count by 4
1604
1605
/*
 * Destination is 8-byte aligned, source is not.  If the destination
 * is already 64-byte aligned go straight to .bc_unalnsrc.  The sub
 * in that branch's delay slot also runs on the taken path, where its
 * result in %o3 is not needed.  Otherwise %o3 becomes the number of
 * bytes up to the next 64-byte destination boundary, the count in
 * %i2 is reduced by that amount, and one of three 8-bytes-per-pass
 * loops is chosen from the low two source address bits.
 *
 * .bc_unalnword handles a 4-byte aligned source with two word loads
 * and two word stores per pass.
 */
1606	! Dst is on 8 byte boundary; src is not;
1607.bc_big_unal8:
1608	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
1609	bz	%ncc, .bc_unalnsrc
1610	sub	%o3, 64, %o3		! %o3 will be multiple of 8
1611	neg	%o3			! bytes until dest is 64 byte aligned
1612	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
1613	! Move bytes according to source alignment
1614	andcc	%i0, 0x1, %o4
1615	bnz	%ncc, .bc_unalnbyte	! check for byte alignment
1616	nop
1617	andcc	%i0, 2, %o4		! check for half word alignment
1618	bnz	%ncc, .bc_unalnhalf
1619	nop
1620	! Src is word aligned, move bytes until dest 64 byte aligned
1621.bc_unalnword:
1622	ld	[%i0], %o4		! load 4 bytes
1623	stw	%o4, [%i1]		! and store 4 bytes
1624	ld	[%i0+4], %o4		! load 4 bytes
1625	add	%i0, 8, %i0		! increase src ptr by 8
1626	stw	%o4, [%i1+4]		! and store 4 bytes
1627	subcc	%o3, 8, %o3		! decrease count by 8
1628	bnz	%ncc, .bc_unalnword
1629	add	%i1, 8, %i1		! increase dst ptr by 8
1630	ba	.bc_unalnsrc
1631	nop
1632
/*
 * Each pass reads halfword / word / halfword at source offsets 0, 2
 * and 6 and merges them in %i3: the first halfword is shifted up by
 * 32, or-ed with the word, the pair is shifted up by 16 and the last
 * halfword is or-ed in.  The result is written with one stx.  %o3
 * counts down by 8 per pass; the destination pointer is advanced in
 * the branch delay slot.
 */
1633	! Src is half-word aligned, move bytes until dest 64 byte aligned
1634.bc_unalnhalf:
1635	lduh	[%i0], %o4		! load 2 bytes
1636	sllx	%o4, 32, %i3		! shift left
1637	lduw	[%i0+2], %o4
1638	or	%o4, %i3, %i3
1639	sllx	%i3, 16, %i3
1640	lduh	[%i0+6], %o4
1641	or	%o4, %i3, %i3
1642	stx	%i3, [%i1]
1643	add	%i0, 8, %i0
1644	subcc	%o3, 8, %o3
1645	bnz	%ncc, .bc_unalnhalf
1646	add	%i1, 8, %i1
1647	ba	.bc_unalnsrc
1648	nop
1649
/*
 * Each pass reads byte / halfword / halfword / halfword / byte at
 * source offsets 0, 1, 3, 5 and 7, shifts them to bit positions 56,
 * 40, 24, 8 and 0 and or-s them together in %i3, then writes the
 * doubleword with one stx.  While the loop runs %i1 holds
 * (dst - src) so that only %i0 has to be advanced (in the branch
 * delay slot); %i1 is turned back into an absolute address once the
 * loop finishes.  Control then falls through to the code that
 * follows.
 */
1650	! Src is Byte aligned, move bytes until dest 64 byte aligned
1651.bc_unalnbyte:
1652	sub	%i1, %i0, %i1		! share pointer advance
1653.bc_unalnbyte_loop:
1654	ldub	[%i0], %o4
1655	sllx	%o4, 56, %i3
1656	lduh	[%i0+1], %o4
1657	sllx	%o4, 40, %o4
1658	or	%o4, %i3, %i3
1659	lduh	[%i0+3], %o4
1660	sllx	%o4, 24, %o4
1661	or	%o4, %i3, %i3
1662	lduh	[%i0+5], %o4
1663	sllx	%o4, 8, %o4
1664	or	%o4, %i3, %i3
1665	ldub	[%i0+7], %o4
1666	or	%o4, %i3, %i3
1667	stx	%i3, [%i1+%i0]
1668	subcc	%o3, 8, %o3
1669	bnz	%ncc, .bc_unalnbyte_loop
1670	add	%i0, 8, %i0
1671	add	%i1,%i0, %i1		! restore pointer
1672
/*
 * Set-up and dispatch for the block loops that handle a source which
 * is not 8-byte aligned.
 *
 *   %i3 = (count rounded down to a multiple of 64) - 64
 *   %i2 = (count mod 64) + 64
 *   %o4 = source address rounded down to a 64-byte boundary
 *   %i0 = source address advanced by %i3
 *
 * alignaddr is issued on the original source address, before %i0 is
 * advanced, to set up %gsr for the faligndata instructions in the
 * loops.  Since %i3 is a multiple of 64, adding it leaves bits 5:3
 * of %i0 unchanged.  Those three bits select one of eight
 * .bc_unaln_xyz loops, the digits in the label being the bits in
 * that order.
 *
 * The brz,a / brnz,a branches are annulling, so their delay-slot
 * prefetch runs only when the branch is taken.  The final brz falls
 * through to the code for the 111 case.
 */
1673	! Destination is now block (64 byte aligned), src is not 8 byte aligned
1674.bc_unalnsrc:
1675	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
1676	and	%i2, 0x3f, %i2		! residue bytes in %i2
1677	add	%i2, 64, %i2		! Insure we don't load beyond
1678	sub	%i3, 64, %i3		! end of source buffer
1679
1680	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
1681	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1682	alignaddr %i0, %g0, %g0		! generate %gsr
1683	add	%i0, %i3, %i0		! advance %i0 to after blocks
1684	!
1685	! Determine source alignment to correct 8 byte offset
1686	andcc	%i0, 0x20, %o3
1687	brnz,pn	%o3, .bc_unaln_1
1688	andcc	%i0, 0x10, %o3
1689	brnz,pn	%o3, .bc_unaln_01
1690	andcc	%i0, 0x08, %o3
1691	brz,a	%o3, .bc_unaln_000
1692	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1693	ba	.bc_unaln_001
1694	nop
1695.bc_unaln_01:
1696	brnz,a	%o3, .bc_unaln_011
1697	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1698	ba	.bc_unaln_010
1699	nop
1700.bc_unaln_1:
1701	brnz,pn	%o3, .bc_unaln_11
1702	andcc	%i0, 0x08, %o3
1703	brnz,a	%o3, .bc_unaln_101
1704	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1705	ba	.bc_unaln_100
1706	nop
1707.bc_unaln_11:
1708	brz,pn	%o3, .bc_unaln_110
1709	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1710
/*
 * Source bits 5:3 are 111.  The last doubleword of the source block
 * at %o4 is preloaded into %d14.  Each pass advances %o4 by 64,
 * block-loads into %d16-%d30, runs faligndata over the eight
 * adjacent register pairs starting at (%d14, %d16) to build
 * %d48-%d62, carries %d30 over in %d14 and block-stores %d48-%d62
 * to [%i1].  %i3 counts down by 64 per pass.
 */
1711.bc_unaln_111:
1712	ldd	[%o4+56], %d14
1713.bc_unaln_111_loop:
1714	add	%o4, 64, %o4
1715	ldda	[%o4]ASI_BLK_P, %d16
1716	faligndata %d14, %d16, %d48
1717	faligndata %d16, %d18, %d50
1718	faligndata %d18, %d20, %d52
1719	faligndata %d20, %d22, %d54
1720	faligndata %d22, %d24, %d56
1721	faligndata %d24, %d26, %d58
1722	faligndata %d26, %d28, %d60
1723	faligndata %d28, %d30, %d62
1724	fmovd	%d30, %d14
1725	stda	%d48, [%i1]ASI_BLK_P
1726	subcc	%i3, 64, %i3
1727	add	%i1, 64, %i1
1728	bgu,pt	%ncc, .bc_unaln_111_loop
1729	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1730	ba	.bc_unaln_done
1731	nop
1732
/*
 * Source bits 5:3 are 110.  The last two doublewords of the source
 * block at %o4 are preloaded into %d12 and %d14.  Each pass advances
 * %o4 by 64, block-loads into %d16-%d30, runs faligndata over the
 * eight adjacent register pairs starting at (%d12, %d14) to build
 * %d48-%d62, carries %d28 and %d30 over in %d12 and %d14 and
 * block-stores %d48-%d62 to [%i1].  %i3 counts down by 64 per pass.
 */
1733.bc_unaln_110:
1734	ldd	[%o4+48], %d12
1735	ldd	[%o4+56], %d14
1736.bc_unaln_110_loop:
1737	add	%o4, 64, %o4
1738	ldda	[%o4]ASI_BLK_P, %d16
1739	faligndata %d12, %d14, %d48
1740	faligndata %d14, %d16, %d50
1741	faligndata %d16, %d18, %d52
1742	faligndata %d18, %d20, %d54
1743	faligndata %d20, %d22, %d56
1744	faligndata %d22, %d24, %d58
1745	faligndata %d24, %d26, %d60
1746	faligndata %d26, %d28, %d62
1747	fmovd	%d28, %d12
1748	fmovd	%d30, %d14
1749	stda	%d48, [%i1]ASI_BLK_P
1750	subcc	%i3, 64, %i3
1751	add	%i1, 64, %i1
1752	bgu,pt	%ncc, .bc_unaln_110_loop
1753	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1754	ba	.bc_unaln_done
1755	nop
1756
/*
 * Source bits 5:3 are 101.  The last three doublewords of the source
 * block at %o4 are preloaded into %d10-%d14.  Each pass advances %o4
 * by 64, block-loads into %d16-%d30, runs faligndata over the eight
 * adjacent register pairs starting at (%d10, %d12) to build
 * %d48-%d62, carries %d26-%d30 over in %d10-%d14 and block-stores
 * %d48-%d62 to [%i1].  %i3 counts down by 64 per pass.
 */
1757.bc_unaln_101:
1758	ldd	[%o4+40], %d10
1759	ldd	[%o4+48], %d12
1760	ldd	[%o4+56], %d14
1761.bc_unaln_101_loop:
1762	add	%o4, 64, %o4
1763	ldda	[%o4]ASI_BLK_P, %d16
1764	faligndata %d10, %d12, %d48
1765	faligndata %d12, %d14, %d50
1766	faligndata %d14, %d16, %d52
1767	faligndata %d16, %d18, %d54
1768	faligndata %d18, %d20, %d56
1769	faligndata %d20, %d22, %d58
1770	faligndata %d22, %d24, %d60
1771	faligndata %d24, %d26, %d62
1772	fmovd	%d26, %d10
1773	fmovd	%d28, %d12
1774	fmovd	%d30, %d14
1775	stda	%d48, [%i1]ASI_BLK_P
1776	subcc	%i3, 64, %i3
1777	add	%i1, 64, %i1
1778	bgu,pt	%ncc, .bc_unaln_101_loop
1779	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1780	ba	.bc_unaln_done
1781	nop
1782
/*
 * Source bits 5:3 are 100.  The last four doublewords of the source
 * block at %o4 are preloaded into %d8-%d14.  Each pass advances %o4
 * by 64, block-loads into %d16-%d30, runs faligndata over the eight
 * adjacent register pairs starting at (%d8, %d10) to build
 * %d48-%d62, carries %d24-%d30 over in %d8-%d14 and block-stores
 * %d48-%d62 to [%i1].  %i3 counts down by 64 per pass.
 */
1783.bc_unaln_100:
1784	ldd	[%o4+32], %d8
1785	ldd	[%o4+40], %d10
1786	ldd	[%o4+48], %d12
1787	ldd	[%o4+56], %d14
1788.bc_unaln_100_loop:
1789	add	%o4, 64, %o4
1790	ldda	[%o4]ASI_BLK_P, %d16
1791	faligndata %d8, %d10, %d48
1792	faligndata %d10, %d12, %d50
1793	faligndata %d12, %d14, %d52
1794	faligndata %d14, %d16, %d54
1795	faligndata %d16, %d18, %d56
1796	faligndata %d18, %d20, %d58
1797	faligndata %d20, %d22, %d60
1798	faligndata %d22, %d24, %d62
1799	fmovd	%d24, %d8
1800	fmovd	%d26, %d10
1801	fmovd	%d28, %d12
1802	fmovd	%d30, %d14
1803	stda	%d48, [%i1]ASI_BLK_P
1804	subcc	%i3, 64, %i3
1805	add	%i1, 64, %i1
1806	bgu,pt	%ncc, .bc_unaln_100_loop
1807	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1808	ba	.bc_unaln_done
1809	nop
1810
/*
 * Source bits 5:3 are 011.  The last five doublewords of the source
 * block at %o4 are preloaded into %d6-%d14.  Each pass advances %o4
 * by 64, block-loads into %d16-%d30, runs faligndata over the eight
 * adjacent register pairs starting at (%d6, %d8) to build %d48-%d62,
 * carries %d22-%d30 over in %d6-%d14 and block-stores %d48-%d62 to
 * [%i1].  %i3 counts down by 64 per pass.
 */
1811.bc_unaln_011:
1812	ldd	[%o4+24], %d6
1813	ldd	[%o4+32], %d8
1814	ldd	[%o4+40], %d10
1815	ldd	[%o4+48], %d12
1816	ldd	[%o4+56], %d14
1817.bc_unaln_011_loop:
1818	add	%o4, 64, %o4
1819	ldda	[%o4]ASI_BLK_P, %d16
1820	faligndata %d6, %d8, %d48
1821	faligndata %d8, %d10, %d50
1822	faligndata %d10, %d12, %d52
1823	faligndata %d12, %d14, %d54
1824	faligndata %d14, %d16, %d56
1825	faligndata %d16, %d18, %d58
1826	faligndata %d18, %d20, %d60
1827	faligndata %d20, %d22, %d62
1828	fmovd	%d22, %d6
1829	fmovd	%d24, %d8
1830	fmovd	%d26, %d10
1831	fmovd	%d28, %d12
1832	fmovd	%d30, %d14
1833	stda	%d48, [%i1]ASI_BLK_P
1834	subcc	%i3, 64, %i3
1835	add	%i1, 64, %i1
1836	bgu,pt	%ncc, .bc_unaln_011_loop
1837	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1838	ba	.bc_unaln_done
1839	nop
1840
/*
 * Source bits 5:3 are 010.  The last six doublewords of the source
 * block at %o4 are preloaded into %d4-%d14.  Each pass advances %o4
 * by 64, block-loads into %d16-%d30, runs faligndata over the eight
 * adjacent register pairs starting at (%d4, %d6) to build %d48-%d62,
 * carries %d20-%d30 over in %d4-%d14 and block-stores %d48-%d62 to
 * [%i1].  %i3 counts down by 64 per pass.
 */
1841.bc_unaln_010:
1842	ldd	[%o4+16], %d4
1843	ldd	[%o4+24], %d6
1844	ldd	[%o4+32], %d8
1845	ldd	[%o4+40], %d10
1846	ldd	[%o4+48], %d12
1847	ldd	[%o4+56], %d14
1848.bc_unaln_010_loop:
1849	add	%o4, 64, %o4
1850	ldda	[%o4]ASI_BLK_P, %d16
1851	faligndata %d4, %d6, %d48
1852	faligndata %d6, %d8, %d50
1853	faligndata %d8, %d10, %d52
1854	faligndata %d10, %d12, %d54
1855	faligndata %d12, %d14, %d56
1856	faligndata %d14, %d16, %d58
1857	faligndata %d16, %d18, %d60
1858	faligndata %d18, %d20, %d62
1859	fmovd	%d20, %d4
1860	fmovd	%d22, %d6
1861	fmovd	%d24, %d8
1862	fmovd	%d26, %d10
1863	fmovd	%d28, %d12
1864	fmovd	%d30, %d14
1865	stda	%d48, [%i1]ASI_BLK_P
1866	subcc	%i3, 64, %i3
1867	add	%i1, 64, %i1
1868	bgu,pt	%ncc, .bc_unaln_010_loop
1869	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1870	ba	.bc_unaln_done
1871	nop
1872
/*
 * Source bits 5:3 are 001.  The last seven doublewords of the source
 * block at %o4 are preloaded into %d2-%d14.  Each pass advances %o4
 * by 64, block-loads into %d16-%d30, runs faligndata over the eight
 * adjacent register pairs starting at (%d2, %d4) to build %d48-%d62,
 * carries %d18-%d30 over in %d2-%d14 and block-stores %d48-%d62 to
 * [%i1].  %i3 counts down by 64 per pass.
 */
1873.bc_unaln_001:
1874	ldd	[%o4+8], %d2
1875	ldd	[%o4+16], %d4
1876	ldd	[%o4+24], %d6
1877	ldd	[%o4+32], %d8
1878	ldd	[%o4+40], %d10
1879	ldd	[%o4+48], %d12
1880	ldd	[%o4+56], %d14
1881.bc_unaln_001_loop:
1882	add	%o4, 64, %o4
1883	ldda	[%o4]ASI_BLK_P, %d16
1884	faligndata %d2, %d4, %d48
1885	faligndata %d4, %d6, %d50
1886	faligndata %d6, %d8, %d52
1887	faligndata %d8, %d10, %d54
1888	faligndata %d10, %d12, %d56
1889	faligndata %d12, %d14, %d58
1890	faligndata %d14, %d16, %d60
1891	faligndata %d16, %d18, %d62
1892	fmovd	%d18, %d2
1893	fmovd	%d20, %d4
1894	fmovd	%d22, %d6
1895	fmovd	%d24, %d8
1896	fmovd	%d26, %d10
1897	fmovd	%d28, %d12
1898	fmovd	%d30, %d14
1899	stda	%d48, [%i1]ASI_BLK_P
1900	subcc	%i3, 64, %i3
1901	add	%i1, 64, %i1
1902	bgu,pt	%ncc, .bc_unaln_001_loop
1903	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1904	ba	.bc_unaln_done
1905	nop
1906
/*
 * Source bits 5:3 are 000.  The whole source block at %o4 is
 * block-loaded into %d0-%d14.  Each pass advances %o4 by 64,
 * block-loads the next block into %d16-%d30, runs faligndata over
 * the eight adjacent register pairs starting at (%d0, %d2) to build
 * %d48-%d62, moves %d16-%d30 down into %d0-%d14 for the next pass
 * and block-stores %d48-%d62 to [%i1].  %i3 counts down by 64 per
 * pass.  When the loop ends control falls through to the code that
 * follows.
 */
1907.bc_unaln_000:
1908	ldda	[%o4]ASI_BLK_P, %d0
1909.bc_unaln_000_loop:
1910	add	%o4, 64, %o4
1911	ldda	[%o4]ASI_BLK_P, %d16
1912	faligndata %d0, %d2, %d48
1913	faligndata %d2, %d4, %d50
1914	faligndata %d4, %d6, %d52
1915	faligndata %d6, %d8, %d54
1916	faligndata %d8, %d10, %d56
1917	faligndata %d10, %d12, %d58
1918	faligndata %d12, %d14, %d60
1919	faligndata %d14, %d16, %d62
1920	fmovd	%d16, %d0
1921	fmovd	%d18, %d2
1922	fmovd	%d20, %d4
1923	fmovd	%d22, %d6
1924	fmovd	%d24, %d8
1925	fmovd	%d26, %d10
1926	fmovd	%d28, %d12
1927	fmovd	%d30, %d14
1928	stda	%d48, [%i1]ASI_BLK_P
1929	subcc	%i3, 64, %i3
1930	add	%i1, 64, %i1
1931	bgu,pt	%ncc, .bc_unaln_000_loop
1932	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1933
/*
 * Tail of the unaligned-source path, moving 8 bytes per pass through
 * the FP registers.
 *
 * A count of 15 or less goes to .bc_unaln_short.  The andn in that
 * branch's delay slot also runs on the taken path, where its result
 * in %i3 is not used.  Otherwise:
 *   %i3 = (count rounded down to a multiple of 8) - 8
 *   %i2 = (count mod 8) + 8
 *   %o4 = source address rounded down to an 8-byte boundary
 *   %i0 = source address advanced by %i3
 * %d0 is primed from [%o4].  Each pass loads the next doubleword
 * into %d2, combines %d0 and %d2 with faligndata into %d16, stores
 * %d16 to [%i1] and carries %d2 over in %d0.  The destination
 * pointer is advanced in the branch delay slot.
 */
1934.bc_unaln_done:
1935	! Handle trailing bytes, 64 to 127
1936	! Dest long word aligned, Src not long word aligned
1937	cmp	%i2, 15
1938	bleu	%ncc, .bc_unaln_short
1939
1940	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
1941	and	%i2, 0x7, %i2		! residue bytes in %i2
1942	add	%i2, 8, %i2
1943	sub	%i3, 8, %i3		! insure we don't load past end of src
1944	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
1945	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
1946	ldd	[%o4], %d0		! fetch partial word
1947.bc_unaln_by8:
1948	ldd	[%o4+8], %d2
1949	add	%o4, 8, %o4
1950	faligndata %d0, %d2, %d16
1951	subcc	%i3, 8, %i3
1952	std	%d16, [%i1]
1953	fmovd	%d2, %d0
1954	bgu,pt	%ncc, .bc_unaln_by8
1955	add	%i1, 8, %i1
1956
/*
 * If at least 8 bytes remain, copy 8 of them: each half is built
 * from four single-byte loads merged most-significant first and
 * written with a word store.  Both pointers then advance by 8 and
 * the count drops by 8.  Fewer than 8 bytes skips to .bc_unalnfin.
 */
1957.bc_unaln_short:
1958	cmp	%i2, 8
1959	blt,pt	%ncc, .bc_unalnfin
1960	nop
1961	ldub	[%i0], %o4
1962	sll	%o4, 24, %o3
1963	ldub	[%i0+1], %o4
1964	sll	%o4, 16, %o4
1965	or	%o4, %o3, %o3
1966	ldub	[%i0+2], %o4
1967	sll	%o4, 8, %o4
1968	or	%o4, %o3, %o3
1969	ldub	[%i0+3], %o4
1970	or	%o4, %o3, %o3
1971	stw	%o3, [%i1]
1972	ldub	[%i0+4], %o4
1973	sll	%o4, 24, %o3
1974	ldub	[%i0+5], %o4
1975	sll	%o4, 16, %o4
1976	or	%o4, %o3, %o3
1977	ldub	[%i0+6], %o4
1978	sll	%o4, 8, %o4
1979	or	%o4, %o3, %o3
1980	ldub	[%i0+7], %o4
1981	or	%o4, %o3, %o3
1982	stw	%o3, [%i1+4]
1983	add	%i0, 8, %i0
1984	add	%i1, 8, %i1
1985	sub	%i2, 8, %i2
/*
 * Final 0 to 7 bytes.
 *
 * With fewer than 4 bytes left control goes to .bc_unalnz; the tst
 * in that branch's delay slot provides the condition codes for the
 * bz there.  Otherwise 4 bytes are merged from single-byte loads and
 * stored as one word in the delay slot of the bnz, which continues
 * at .bc_unaln3x if the reduced count is non-zero.
 *
 * The bz at .bc_unalnz has the subcc at .bc_unaln3x in its delay
 * slot.  That subcc therefore also runs when the branch to .bc_exit
 * is taken.
 *
 * .bc_unaln3x copies one byte, then a second and a third as long as
 * the decremented count is still non-zero, and falls through to the
 * code that follows.
 */
1986.bc_unalnfin:
1987	cmp	%i2, 4
1988	blt,pt	%ncc, .bc_unalnz
1989	tst	%i2
1990	ldub	[%i0], %o3		! read byte
1991	subcc	%i2, 4, %i2		! reduce count by 4
1992	sll	%o3, 24, %o3		! position
1993	ldub	[%i0+1], %o4
1994	sll	%o4, 16, %o4		! position
1995	or	%o4, %o3, %o3		! merge
1996	ldub	[%i0+2], %o4
1997	sll	%o4, 8, %o4		! position
1998	or	%o4, %o3, %o3		! merge
1999	add	%i1, 4, %i1		! advance dst by 4
2000	ldub	[%i0+3], %o4
2001	add	%i0, 4, %i0		! advance src by 4
2002	or	%o4, %o3, %o4		! merge
2003	bnz,pt	%ncc, .bc_unaln3x
2004	stw	%o4, [%i1-4]
2005	ba	.bc_exit
2006	nop
2007.bc_unalnz:
2008	bz,pt	%ncc, .bc_exit
2009.bc_unaln3x:				! Exactly 1, 2, or 3 bytes remain
2010	subcc	%i2, 1, %i2		! reduce count for cc test
2011	ldub	[%i0], %o4		! load one byte
2012	bz,pt	%ncc, .bc_exit
2013	stb	%o4, [%i1]		! store one byte
2014	ldub	[%i0+1], %o4		! load second byte
2015	subcc	%i2, 1, %i2
2016	bz,pt	%ncc, .bc_exit
2017	stb	%o4, [%i1+1]		! store second byte
2018	ldub	[%i0+2], %o4		! load third byte
2019	stb	%o4, [%i1+2]		! store third byte
/*
 * Common exit from the copy loops.  %gsr is put back from %l5.  %g5
 * still holds the (%fprs & FPRS_FEF) value computed at entry: when
 * it is non-zero BLD_FP_FROMSTACK is invoked, otherwise FZERO is
 * invoked and %g5 (zero on this path) is written to %fprs.  The and
 * in the brnz delay slot runs on both paths and leaves the
 * COPY_FLAGS bits of %o5 in %l1.
 */
2020.bc_exit:
2021	wr	%l5, %g0, %gsr		! restore %gsr
2022	brnz	%g5, .bc_fp_restore
2023	and	%o5, COPY_FLAGS, %l1	! save flags in %l1
2024	FZERO
2025	wr	%g5, %g0, %fprs
2026	ba,pt	%ncc, .bc_ex2
2027	nop
2028.bc_fp_restore:
2029	BLD_FP_FROMSTACK(%o4)
/*
 * Undo the entry bookkeeping and return 0.
 *
 * Threads with a non-zero t_lwp skip to 1:.  Otherwise t_preempt is
 * decremented (the store is in the bnz delay slot, so it happens
 * whether or not the branch is taken).  If the result is zero the
 * current CPU's cpu_kprunrun byte is read, and if that is non-zero
 * KPREEMPT_FLAG is or-ed into %l1; that or sits in the delay slot of
 * an annulling branch and so runs only when the branch is taken.
 *
 * At 1: the andncc in the delay slot strips the COPY_FLAGS bits from
 * %o5 and sets the condition codes on what remains, the saved
 * t_lofault value.  If LOFAULT_SET is clear in %l1 the routine
 * returns through 3:.  Otherwise a non-zero saved value is written
 * back to t_lofault, and at 2: kpreempt is called with %pil as its
 * argument when KPREEMPT_FLAG is set.
 *
 * NOTE(review): the branch to 3: taken when LOFAULT_SET is clear
 * bypasses the KPREEMPT_FLAG test at 2: -- confirm this is intended.
 */
2030.bc_ex2:
2031	ldn	[THREAD_REG + T_LWP], %o2
2032	brnz,pt	%o2, 1f
2033	nop
2034
2035	ldsb	[THREAD_REG + T_PREEMPT], %l0
2036	deccc	%l0
2037	bnz,pn	%ncc, 1f
2038	stb	%l0, [THREAD_REG + T_PREEMPT]
2039
2040	! Check for a kernel preemption request
2041	ldn	[THREAD_REG + T_CPU], %l0
2042	ldub	[%l0 + CPU_KPRUNRUN], %l0
2043	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
2044	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
20451:
2046	btst	LOFAULT_SET, %l1
2047	bz,pn	%icc, 3f
2048	andncc	%o5, COPY_FLAGS, %o5
2049	! Here via bcopy. Check to see if the handler was NULL.
2050	! If so, just return quietly. Otherwise, reset the
2051	! handler and return.
2052	bz,pn %ncc, 2f
2053	nop
2054	membar	#Sync
2055	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
20562:
2057	btst	KPREEMPT_FLAG, %l1
2058	bz,pt	%icc, 3f
2059	nop
2060	call	kpreempt
2061	rdpr	%pil, %o0		! pass %pil
20623:
2063	ret
2064	restore	%g0, 0, %o0
2065
2066	SET_SIZE(bcopy_more)
2067
2068
2069#else	/* NIAGARA_IMPL */
2070	save	%sp, -SA(MINFRAME), %sp
2071	clr	%o5			! flag LOFAULT_SET is not set for bcopy
2072.do_copy:
2073	cmp	%i2, 12			! for small counts
2074	blu	%ncc, .bytecp		! just copy bytes
2075	.empty
2076
2077	cmp	%i2, 128		! for less than 128 bytes
2078	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
2079	nop
2080
2081	set	use_hw_bcopy, %o2
2082	ld	[%o2], %o2
2083	brz,pn	%o2, .bcb_punt
2084	nop
2085
2086	subcc	%i1, %i0, %i3
2087	bneg,a,pn %ncc, 1f
2088	neg	%i3
20891:
2090	/*
2091	 * Compare against 256 since we should be checking block addresses
2092	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
2093	 * src = dest + (64 * 3) + 63.
2094	 */
2095	cmp	%i3, 256
2096	blu,pn	%ncc, .bcb_punt
2097	nop
2098
2099	/*
2100	 * Copies that reach here have at least 2 blocks of data to copy.
2101	 */
2102.do_blockcopy:
2103	! Swap src/dst since the code below is memcpy code
2104	! and memcpy/bcopy have different calling sequences
2105	mov	%i1, %i5
2106	mov	%i0, %i1
2107	mov	%i5, %i0
2108
2109	! Block (64 bytes) align the destination.
2110	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 bytes
2111	bz	%xcc, .chksrc		! dst is already double aligned
2112	sub	%i3, 0x40, %i3
2113	neg	%i3			! bytes till dst 64 bytes aligned
2114	sub	%i2, %i3, %i2		! update i2 with new count
2115
2116	! Based on source and destination alignment do
2117	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2118
2119	! Is dst & src 8B aligned
2120	or	%i0, %i1, %o2
2121	andcc	%o2, 0x7, %g0
2122	bz	%ncc, .alewdcp
2123	nop
2124
2125	! Is dst & src 4B aligned
2126	andcc	%o2, 0x3, %g0
2127	bz	%ncc, .alwdcp
2128	nop
2129
2130	! Is dst & src 2B aligned
2131	andcc	%o2, 0x1, %g0
2132	bz	%ncc, .alhlfwdcp
2133	nop
2134
2135	! 1B aligned
21361:	ldub	[%i1], %o2
2137	stb	%o2, [%i0]
2138	inc	%i1
2139	deccc	%i3
2140	bgu,pt	%ncc, 1b
2141	inc	%i0
2142
2143	ba	.chksrc
2144	nop
2145
2146	! dst & src 4B aligned
2147.alwdcp:
2148	ld	[%i1], %o2
2149	st	%o2, [%i0]
2150	add	%i1, 0x4, %i1
2151	subcc	%i3, 0x4, %i3
2152	bgu,pt	%ncc, .alwdcp
2153	add	%i0, 0x4, %i0
2154
2155	ba	.chksrc
2156	nop
2157
2158	! dst & src 2B aligned
2159.alhlfwdcp:
2160	lduh	[%i1], %o2
2161	stuh	%o2, [%i0]
2162	add	%i1, 0x2, %i1
2163	subcc	%i3, 0x2, %i3
2164	bgu,pt	%ncc, .alhlfwdcp
2165	add	%i0, 0x2, %i0
2166
2167	ba	.chksrc
2168	nop
2169
2170	! dst & src 8B aligned
2171.alewdcp:
2172	ldx	[%i1], %o2
2173	stx	%o2, [%i0]
2174	add	%i1, 0x8, %i1
2175	subcc	%i3, 0x8, %i3
2176	bgu,pt	%ncc, .alewdcp
2177	add	%i0, 0x8, %i0
2178
2179	! Now Destination is block (64 bytes) aligned
2180.chksrc:
2181	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2182	sub	%i2, %i3, %i2		! Residue bytes in %i2
2183
2184	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
2185
2186	andcc	%i1, 0xf, %o2		! is src quadword aligned
2187	bz,pn	%xcc, .blkcpy		! src offset in %o2
2188	nop
2189	cmp	%o2, 0x8
2190	bg	.cpy_upper_double
2191	nop
2192	bl	.cpy_lower_double
2193	nop
2194
2195	! Falls through when source offset is equal to 8 i.e.
2196	! source is double word aligned.
2197	! In this case no shift/merge of data is required
2198	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2199	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2200	prefetch [%l0+0x0], #one_read
2201	ldda	[%i1+0x0]%asi, %l2
2202loop0:
2203	ldda	[%i1+0x10]%asi, %l4
2204	prefetch [%l0+0x40], #one_read
2205
2206	stxa	%l3, [%i0+0x0]%asi
2207	stxa	%l4, [%i0+0x8]%asi
2208
2209	ldda	[%i1+0x20]%asi, %l2
2210	stxa	%l5, [%i0+0x10]%asi
2211	stxa	%l2, [%i0+0x18]%asi
2212
2213	ldda	[%i1+0x30]%asi, %l4
2214	stxa	%l3, [%i0+0x20]%asi
2215	stxa	%l4, [%i0+0x28]%asi
2216
2217	ldda	[%i1+0x40]%asi, %l2
2218	stxa	%l5, [%i0+0x30]%asi
2219	stxa	%l2, [%i0+0x38]%asi
2220
2221	add	%l0, 0x40, %l0
2222	add	%i1, 0x40, %i1
2223	subcc	%i3, 0x40, %i3
2224	bgu,pt	%xcc, loop0
2225	add	%i0, 0x40, %i0
2226	ba	.blkdone
2227	add	%i1, %o2, %i1		! increment the source by src offset
2228					! the src offset was stored in %o2
2229
2230.cpy_lower_double:
2231	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2232	sll	%o2, 3, %o0		! %o0 left shift
2233	mov	0x40, %o1
2234	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2235	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2236	prefetch [%l0+0x0], #one_read
2237	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
2238					! complete data
2239loop1:
2240	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
2241	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2242							! into %l2 and %l3
2243	prefetch [%l0+0x40], #one_read
2244	stxa	%l2, [%i0+0x0]%asi
2245	stxa	%l3, [%i0+0x8]%asi
2246
2247	ldda	[%i1+0x20]%asi, %l2
2248	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2249	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
2250	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
2251
2252	! Repeat the same for next 32 bytes.
2253
2254	ldda	[%i1+0x30]%asi, %l4
2255	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2256	stxa	%l2, [%i0+0x20]%asi
2257	stxa	%l3, [%i0+0x28]%asi
2258
2259	ldda	[%i1+0x40]%asi, %l2
2260	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2261	stxa	%l4, [%i0+0x30]%asi
2262	stxa	%l5, [%i0+0x38]%asi
2263
2264	add	%l0, 0x40, %l0
2265	add	%i1, 0x40, %i1
2266	subcc	%i3, 0x40, %i3
2267	bgu,pt	%xcc, loop1
2268	add	%i0, 0x40, %i0
2269	ba	.blkdone
2270	add	%i1, %o2, %i1		! increment the source by src offset
2271					! the src offset was stored in %o2
2272
2273.cpy_upper_double:
2274	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2275	mov	0x8, %o0
2276	sub	%o2, %o0, %o0
2277	sll	%o0, 3, %o0		! %o0 left shift
2278	mov	0x40, %o1
2279	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2280	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2281	prefetch [%l0+0x0], #one_read
2282	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
2283					! no data in %l2
2284loop2:
2285	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
2286					! partial
2287	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2288							! into %l3 and %l4
2289	prefetch [%l0+0x40], #one_read
2290	stxa	%l3, [%i0+0x0]%asi
2291	stxa	%l4, [%i0+0x8]%asi
2292
2293	ldda	[%i1+0x20]%asi, %l2
2294	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2295	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
2296	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
2297
2298	! Repeat the same for next 32 bytes.
2299
2300	ldda	[%i1+0x30]%asi, %l4
2301	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2302	stxa	%l3, [%i0+0x20]%asi
2303	stxa	%l4, [%i0+0x28]%asi
2304
2305	ldda	[%i1+0x40]%asi, %l2
2306	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2307	stxa	%l5, [%i0+0x30]%asi
2308	stxa	%l2, [%i0+0x38]%asi
2309
2310	add	%l0, 0x40, %l0
2311	add	%i1, 0x40, %i1
2312	subcc	%i3, 0x40, %i3
2313	bgu,pt	%xcc, loop2
2314	add	%i0, 0x40, %i0
2315	ba	.blkdone
2316	add	%i1, %o2, %i1		! increment the source by src offset
2317					! the src offset was stored in %o2
2318
2319
2320	! Both Source and Destination are block aligned.
2321	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2322.blkcpy:
2323	prefetch [%i1+0x0], #one_read
23241:
2325	ldda	[%i1+0x0]%asi, %l0
2326	ldda	[%i1+0x10]%asi, %l2
2327	prefetch [%i1+0x40], #one_read
2328
2329	stxa	%l0, [%i0+0x0]%asi
2330	ldda	[%i1+0x20]%asi, %l4
2331	ldda	[%i1+0x30]%asi, %l6
2332
2333	stxa	%l1, [%i0+0x8]%asi
2334	stxa	%l2, [%i0+0x10]%asi
2335	stxa	%l3, [%i0+0x18]%asi
2336	stxa	%l4, [%i0+0x20]%asi
2337	stxa	%l5, [%i0+0x28]%asi
2338	stxa	%l6, [%i0+0x30]%asi
2339	stxa	%l7, [%i0+0x38]%asi
2340
2341	add	%i1, 0x40, %i1
2342	subcc	%i3, 0x40, %i3
2343	bgu,pt	%xcc, 1b
2344	add	%i0, 0x40, %i0
2345
2346.blkdone:
2347	membar	#Sync
2348
2349	brz,pt	%i2, .blkexit
2350	nop
2351
2352	! Handle trailing bytes
2353	cmp	%i2, 0x8
2354	blu,pt	%ncc, .residue
2355	nop
2356
2357	! Can we do some 8B ops
2358	or	%i1, %i0, %o2
2359	andcc	%o2, 0x7, %g0
2360	bnz	%ncc, .last4
2361	nop
2362
2363	! Do 8byte ops as long as possible
2364.last8:
2365	ldx	[%i1], %o2
2366	stx	%o2, [%i0]
2367	add	%i1, 0x8, %i1
2368	sub	%i2, 0x8, %i2
2369	cmp	%i2, 0x8
2370	bgu,pt	%ncc, .last8
2371	add	%i0, 0x8, %i0
2372
2373	brz,pt	%i2, .blkexit
2374	nop
2375
2376	ba	.residue
2377	nop
2378
2379.last4:
2380	! Can we do 4B ops
2381	andcc	%o2, 0x3, %g0
2382	bnz	%ncc, .last2
2383	nop
23841:
2385	ld	[%i1], %o2
2386	st	%o2, [%i0]
2387	add	%i1, 0x4, %i1
2388	sub	%i2, 0x4, %i2
2389	cmp	%i2, 0x4
2390	bgu,pt	%ncc, 1b
2391	add	%i0, 0x4, %i0
2392
2393	brz,pt	%i2, .blkexit
2394	nop
2395
2396	ba	.residue
2397	nop
2398
2399.last2:
2400	! Can we do 2B ops
2401	andcc	%o2, 0x1, %g0
2402	bnz	%ncc, .residue
2403	nop
2404
24051:
2406	lduh	[%i1], %o2
2407	stuh	%o2, [%i0]
2408	add	%i1, 0x2, %i1
2409	sub	%i2, 0x2, %i2
2410	cmp	%i2, 0x2
2411	bgu,pt	%ncc, 1b
2412	add	%i0, 0x2, %i0
2413
2414	brz,pt	%i2, .blkexit
2415	nop
2416
2417.residue:
2418	ldub	[%i1], %o2
2419	stb	%o2, [%i0]
2420	inc	%i1
2421	deccc	%i2
2422	bgu,pt	%ncc, .residue
2423	inc	%i0
2424
2425.blkexit:
2426
2427	membar	#Sync				! sync error barrier
2428	! Restore t_lofault handler, if came here from kcopy().
2429	tst	%o5
2430	bz	%ncc, 1f
2431	andn	%o5, LOFAULT_SET, %o5
2432	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
24331:
2434	ret
2435	restore	%g0, 0, %o0
2436
2437
2438.bcb_punt:
2439	!
2440	! use aligned transfers where possible
2441	!
2442	xor	%i0, %i1, %o4		! xor from and to address
2443	btst	7, %o4			! if lower three bits zero
2444	bz	.aldoubcp		! can align on double boundary
2445	.empty	! assembler complaints about label
2446
2447	xor	%i0, %i1, %o4		! xor from and to address
2448	btst	3, %o4			! if lower two bits zero
2449	bz	.alwordcp		! can align on word boundary
2450	btst	3, %i0			! delay slot, from address unaligned?
2451	!
2452	! use aligned reads and writes where possible
2453	! this differs from wordcp in that it copes
	! with odd alignment between source and destination
2455	! using word reads and writes with the proper shifts
2456	! in between to align transfers to and from memory
2457	! i0 - src address, i1 - dest address, i2 - count
2458	! i3, i4 - tmps for used generating complete word
2459	! i5 (word to write)
2460	! l0 size in bits of upper part of source word (US)
2461	! l1 size in bits of lower part of source word (LS = 32 - US)
2462	! l2 size in bits of upper part of destination word (UD)
2463	! l3 size in bits of lower part of destination word (LD = 32 - UD)
2464	! l4 number of bytes leftover after aligned transfers complete
2465	! l5 the number 32
2466	!
2467	mov	32, %l5			! load an oft-needed constant
2468	bz	.align_dst_only
2469	btst	3, %i1			! is destnation address aligned?
2470	clr	%i4			! clear registers used in either case
2471	bz	.align_src_only
2472	clr	%l0
2473	!
2474	! both source and destination addresses are unaligned
2475	!
24761:					! align source
2477	ldub	[%i0], %i3		! read a byte from source address
2478	add	%i0, 1, %i0		! increment source address
2479	or	%i4, %i3, %i4		! or in with previous bytes (if any)
2480	btst	3, %i0			! is source aligned?
2481	add	%l0, 8, %l0		! increment size of upper source (US)
2482	bnz,a	1b
2483	sll	%i4, 8, %i4		! make room for next byte
2484
2485	sub	%l5, %l0, %l1		! generate shift left count (LS)
2486	sll	%i4, %l1, %i4		! prepare to get rest
2487	ld	[%i0], %i3		! read a word
2488	add	%i0, 4, %i0		! increment source address
2489	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
2490	or	%i4, %i5, %i5		! merge
2491	mov	24, %l3			! align destination
24921:
2493	srl	%i5, %l3, %i4		! prepare to write a single byte
2494	stb	%i4, [%i1]		! write a byte
2495	add	%i1, 1, %i1		! increment destination address
2496	sub	%i2, 1, %i2		! decrement count
2497	btst	3, %i1			! is destination aligned?
2498	bnz,a	1b
2499	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
2500	sub	%l5, %l3, %l2		! generate shift left count (UD)
2501	sll	%i5, %l2, %i5		! move leftover into upper bytes
2502	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
2503	bgu	%ncc, .more_needed	! need more to fill than we have
2504	nop
2505
2506	sll	%i3, %l1, %i3		! clear upper used byte(s)
2507	srl	%i3, %l1, %i3
2508	! get the odd bytes between alignments
2509	sub	%l0, %l2, %l0		! regenerate shift count
2510	sub	%l5, %l0, %l1		! generate new shift left count (LS)
2511	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
2512	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2513	srl	%i3, %l0, %i4
2514	or	%i5, %i4, %i5
2515	st	%i5, [%i1]		! write a word
2516	subcc	%i2, 4, %i2		! decrement count
2517	bz	%ncc, .unalign_out
2518	add	%i1, 4, %i1		! increment destination address
2519
2520	b	2f
2521	sll	%i3, %l1, %i5		! get leftover into upper bits
2522.more_needed:
2523	sll	%i3, %l0, %i3		! save remaining byte(s)
2524	srl	%i3, %l0, %i3
2525	sub	%l2, %l0, %l1		! regenerate shift count
2526	sub	%l5, %l1, %l0		! generate new shift left count
2527	sll	%i3, %l1, %i4		! move to fill empty space
2528	b	3f
2529	or	%i5, %i4, %i5		! merge to complete word
2530	!
2531	! the source address is aligned and destination is not
2532	!
2533.align_dst_only:
2534	ld	[%i0], %i4		! read a word
2535	add	%i0, 4, %i0		! increment source address
2536	mov	24, %l0			! initial shift alignment count
25371:
2538	srl	%i4, %l0, %i3		! prepare to write a single byte
2539	stb	%i3, [%i1]		! write a byte
2540	add	%i1, 1, %i1		! increment destination address
2541	sub	%i2, 1, %i2		! decrement count
2542	btst	3, %i1			! is destination aligned?
2543	bnz,a	1b
2544	sub	%l0, 8, %l0		! delay slot, decrement shift count
2545.xfer:
2546	sub	%l5, %l0, %l1		! generate shift left count
2547	sll	%i4, %l1, %i5		! get leftover
25483:
2549	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
2550	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
25512:
2552	ld	[%i0], %i3		! read a source word
2553	add	%i0, 4, %i0		! increment source address
2554	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
2555	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
2556	st	%i5, [%i1]		! write a destination word
2557	subcc	%i2, 4, %i2		! decrement count
2558	bz	%ncc, .unalign_out	! check if done
2559	add	%i1, 4, %i1		! increment destination address
2560	b	2b			! loop
2561	sll	%i3, %l1, %i5		! get leftover
2562.unalign_out:
2563	tst	%l4			! any bytes leftover?
2564	bz	%ncc, .cpdone
2565	.empty				! allow next instruction in delay slot
25661:
2567	sub	%l0, 8, %l0		! decrement shift
2568	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
2569	stb	%i4, [%i1]		! write a byte
2570	subcc	%l4, 1, %l4		! decrement count
2571	bz	%ncc, .cpdone		! done?
2572	add	%i1, 1, %i1		! increment destination
2573	tst	%l0			! any more previously read bytes
2574	bnz	%ncc, 1b		! we have leftover bytes
2575	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
2576	b	.dbytecp		! let dbytecp do the rest
2577	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2578	!
2579	! the destination address is aligned and the source is not
2580	!
2581.align_src_only:
2582	ldub	[%i0], %i3		! read a byte from source address
2583	add	%i0, 1, %i0		! increment source address
2584	or	%i4, %i3, %i4		! or in with previous bytes (if any)
2585	btst	3, %i0			! is source aligned?
2586	add	%l0, 8, %l0		! increment shift count (US)
2587	bnz,a	.align_src_only
2588	sll	%i4, 8, %i4		! make room for next byte
2589	b,a	.xfer
2590	!
2591	! if from address unaligned for double-word moves,
2592	! move bytes till it is, if count is < 56 it could take
2593	! longer to align the thing than to do the transfer
2594	! in word size chunks right away
2595	!
2596.aldoubcp:
2597	cmp	%i2, 56			! if count < 56, use wordcp, it takes
2598	blu,a	%ncc, .alwordcp		! longer to align doubles than words
2599	mov	3, %o0			! mask for word alignment
2600	call	.alignit		! copy bytes until aligned
2601	mov	7, %o0			! mask for double alignment
2602	!
2603	! source and destination are now double-word aligned
2604	! i3 has aligned count returned by alignit
2605	!
2606	and	%i2, 7, %i2		! unaligned leftover count
2607	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
26085:
2609	ldx	[%i0+%i1], %o4		! read from address
2610	stx	%o4, [%i1]		! write at destination address
2611	subcc	%i3, 8, %i3		! dec count
2612	bgu	%ncc, 5b
2613	add	%i1, 8, %i1		! delay slot, inc to address
2614	cmp	%i2, 4			! see if we can copy a word
2615	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
2616	.empty
2617	!
2618	! for leftover bytes we fall into wordcp, if needed
2619	!
2620.wordcp:
2621	and	%i2, 3, %i2		! unaligned leftover count
26225:
2623	ld	[%i0+%i1], %o4		! read from address
2624	st	%o4, [%i1]		! write at destination address
2625	subcc	%i3, 4, %i3		! dec count
2626	bgu	%ncc, 5b
2627	add	%i1, 4, %i1		! delay slot, inc to address
2628	b,a	.dbytecp
2629
2630	! we come here to align copies on word boundaries
2631.alwordcp:
2632	call	.alignit		! go word-align it
2633	mov	3, %o0			! bits that must be zero to be aligned
2634	b	.wordcp
2635	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
2636
2637	!
2638	! byte copy, works with any alignment
2639	!
2640.bytecp:
2641	b	.dbytecp
2642	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
2643
2644	!
2645	! differenced byte copy, works with any alignment
2646	! assumes dest in %i1 and (source - dest) in %i0
2647	!
26481:
2649	stb	%o4, [%i1]		! write to address
2650	inc	%i1			! inc to address
2651.dbytecp:
2652	deccc	%i2			! dec count
2653	bgeu,a	%ncc, 1b		! loop till done
2654	ldub	[%i0+%i1], %o4		! read from address
2655.cpdone:
2656
2657	membar	#Sync				! sync error barrier
2658	! Restore t_lofault handler, if came here from kcopy().
2659	tst	%o5
2660	bz	%ncc, 1f
2661	andn	%o5, LOFAULT_SET, %o5
2662	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
26631:
2664	ret
2665	restore %g0, 0, %o0		! return (0)
2666
2667/*
2668 * Common code used to align transfers on word and doubleword
2669 * boundaries.  Aligns source and destination and returns a count
2670 * of aligned bytes to transfer in %i3
2671 */
26721:
2673	inc	%i0			! inc from
2674	stb	%o4, [%i1]		! write a byte
2675	inc	%i1			! inc to
2676	dec	%i2			! dec count
2677.alignit:
2678	btst	%o0, %i0		! %o0 is bit mask to check for alignment
2679	bnz,a	1b
2680	ldub	[%i0], %o4		! read next byte
2681
2682	retl
2683	andn	%i2, %o0, %i3		! return size of aligned bytes
2684
2685	SET_SIZE(bcopy)
2686
2687#endif	/* NIAGARA_IMPL */
2688
2689#endif	/* lint */
2690
/*
 * ovbcopy(from, to, count)
 *
 * Copy count bytes from "from" to "to" when the two ranges may overlap.
 *
 * A zero count returns immediately.  When count is no larger than the
 * distance between the two addresses the ranges cannot overlap, so the
 * request is handed to bcopy.  Otherwise the copy is done one byte at a
 * time: in descending address order when from < to, in ascending order
 * otherwise, so each source byte is read before it can be overwritten.
 *
 * Leaf routine: the arguments stay in %o0 (from), %o1 (to) and
 * %o2 (count); %o3 is the only scratch register.
 */

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! anything to copy?
	bgu,a	%ncc, 8f		! yes: go work out the direction
	subcc	%o0, %o1, %o3		! (only if taken) %o3 = from - to

	retl				! count was zero
	nop
8:
	bneg,a	%ncc, 9f		! form abs(from - to) in %o3
	sub	%g0, %o3, %o3		! (only if taken) negate a negative diff
9:	cmp	%o2, %o3		! count against the distance
	bleu	%ncc, bcopy		! count <= distance: ranges are disjoint,
	.empty				!   let bcopy do the work
	cmp	%o0, %o1		! which buffer is lower?
	blu	%ncc, .ov_bkwd		! from < to: copy from the top down
	nop
	!
	! from >= to: ascending copy.
	!
.ov_fwd:
	ldub	[%o0], %o3		! next source byte
	add	%o0, 1, %o0		! advance from
	stb	%o3, [%o1]		! store it
	subcc	%o2, 1, %o2		! one fewer to go
	bgu	%ncc, .ov_fwd		! repeat until count reaches zero
	add	%o1, 1, %o1		! delay slot, advance to

	retl
	nop
	!
	! from < to: descending copy, %o2 doubles as the byte index.
	!
.ov_bkwd:
	subcc	%o2, 1, %o2		! index of the highest uncopied byte
	ldub	[%o0 + %o2], %o3	! read from[index]
	bgu	%ncc, .ov_bkwd		! repeat until index 0 has been copied
	stb	%o3, [%o1 + %o2]	! delay slot, write to[index]

	retl
	nop
	SET_SIZE(ovbcopy)

#endif	/* lint */
2747
/*
 * hwblkpagecopy(src, dst)
 *
 * Copy exactly one page (PAGESIZE bytes) from src to dst using quad
 * loads and stores through ASI_BLK_INIT_ST_QUAD_LDD_P.
 *
 * The caller (ppcopy) is expected to have disabled kernel preemption
 * and to have checked use_hw_bcopy before calling.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else /* lint */
	ENTRY(hwblkpagecopy)
	save	%sp, -SA(MINFRAME), %sp

	! Register use:
	!	%i0	source address (arg), advanced as we go
	!	%i1	destination address (arg), advanced as we go
	!	%i2	bytes still to copy (not an arg)
	!	%l0-%l7	staging registers for one 64-byte block

	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	set	PAGESIZE, %i2

	/*
	 * Each pass of the loop moves 0x80 bytes; PAGESIZE is a multiple
	 * of 0x80, so the count reaches exactly zero.
	 */
	prefetch [%i0 + 0x0], #one_read
	prefetch [%i0 + 0x40], #one_read
3:
	prefetch [%i0 + 0x80], #one_read
	prefetch [%i0 + 0xc0], #one_read

	! first 64 bytes: four quad loads, then eight 8-byte stores
	ldda	[%i0 + 0x0]%asi, %l0
	ldda	[%i0 + 0x10]%asi, %l2
	ldda	[%i0 + 0x20]%asi, %l4
	ldda	[%i0 + 0x30]%asi, %l6
	stxa	%l0, [%i1 + 0x0]%asi
	stxa	%l1, [%i1 + 0x8]%asi
	stxa	%l2, [%i1 + 0x10]%asi
	stxa	%l3, [%i1 + 0x18]%asi
	stxa	%l4, [%i1 + 0x20]%asi
	stxa	%l5, [%i1 + 0x28]%asi
	stxa	%l6, [%i1 + 0x30]%asi
	stxa	%l7, [%i1 + 0x38]%asi

	! second 64 bytes, same pattern
	ldda	[%i0 + 0x40]%asi, %l0
	ldda	[%i0 + 0x50]%asi, %l2
	ldda	[%i0 + 0x60]%asi, %l4
	ldda	[%i0 + 0x70]%asi, %l6
	stxa	%l0, [%i1 + 0x40]%asi
	stxa	%l1, [%i1 + 0x48]%asi
	stxa	%l2, [%i1 + 0x50]%asi
	stxa	%l3, [%i1 + 0x58]%asi
	stxa	%l4, [%i1 + 0x60]%asi
	stxa	%l5, [%i1 + 0x68]%asi
	stxa	%l6, [%i1 + 0x70]%asi
	stxa	%l7, [%i1 + 0x78]%asi

	subcc	%i2, 0x80, %i2		! 0x80 fewer bytes to go
	add	%i0, 0x80, %i0		! advance source
	bgu,pt	%xcc, 3b		! loop while bytes remain
	add	%i1, 0x80, %i1		! delay slot, advance destination

	membar #Sync
	ret
	restore	%g0, 0, %o0		! pop the window, 0 in caller's %o0
	SET_SIZE(hwblkpagecopy)
#endif	/* lint */
2814
2815
2816/*
2817 * Transfer data to and from user space -
2818 * Note that these routines can cause faults
2819 * It is assumed that the kernel has nothing at
2820 * less than KERNELBASE in the virtual address space.
2821 *
2822 * Note that copyin(9F) and copyout(9F) are part of the
2823 * DDI/DKI which specifies that they return '-1' on "errors."
2824 *
2825 * Sigh.
2826 *
2827 * So there's two extremely similar routines - xcopyin() and xcopyout()
2828 * which return the errno that we've faithfully computed.  This
2829 * allows other callers (e.g. uiomove(9F)) to work correctly.
2830 * Given that these are used pretty heavily, we expand the calling
2831 * sequences inline for all flavours (rather than making wrappers).
2832 *
2833 * There are also stub routines for xcopyout_little and xcopyin_little,
2834 * which currently are intended to handle requests of <= 16 bytes from
2835 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2836 * is left as an exercise...
2837 */
2838
2839/*
2840 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2841 *
2842 * General theory of operation:
2843 *
2844 * None of the copyops routines grab a window until it's decided that
2845 * we need to do a HW block copy operation. This saves a window
2846 * spill/fill when we're called during socket ops. The typical IO
2847 * path won't cause spill/fill traps.
2848 *
2849 * This code uses a set of 4 limits for the maximum size that will
2850 * be copied given a particular input/output address alignment.
2851 * the default limits are:
2852 *
2853 * single byte aligned - 256 (hw_copy_limit_1)
2854 * two byte aligned - 512 (hw_copy_limit_2)
2855 * four byte aligned - 1024 (hw_copy_limit_4)
2856 * eight byte aligned - 1024 (hw_copy_limit_8)
2857 *
2858 * If the value for a particular limit is zero, the copy will be done
2859 * via the copy loops rather than block store/quad load instructions.
2860 *
2861 * Flow:
2862 *
2863 * If count == zero return zero.
2864 *
2865 * Store the previous lo_fault handler into %g6.
2866 * Place our secondary lofault handler into %g5.
2867 * Place the address of our nowindow fault handler into %o3.
2868 * Place the address of the windowed fault handler into %o4.
2869 * --> We'll use this handler if we end up grabbing a window
2870 * --> before we use block initializing store and quad load ASIs
2871 *
2872 * If count is less than or equal to SMALL_LIMIT (7) we
2873 * always do a byte for byte copy.
2874 *
2875 * If count is > SMALL_LIMIT, we check the alignment of the input
2876 * and output pointers. Based on the alignment we check count
2877 * against a limit based on detected alignment.  If we exceed the
2878 * alignment value we copy via block initializing store and quad
2879 * load instructions.
2880 *
2881 * If we don't exceed one of the limits, we store -count in %o3,
2882 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2883 * on in our basic copy loop in %o2. Following this we branch
2884 * to the appropriate copy loop and copy that many chunks.
2885 * Since we've been adding the chunk size to %o3 each time through
2886 * as well as decrementing %o2, we can tell if any data is
 * left to be copied by examining %o3. If that is zero, we're
2888 * done and can go home. If not, we figure out what the largest
2889 * chunk size left to be copied is and branch to that copy loop
2890 * unless there's only one byte left. We load that as we're
2891 * branching to code that stores it just before we return.
2892 *
2893 * Fault handlers are invoked if we reference memory that has no
2894 * current mapping.  All forms share the same copyio_fault handler.
2895 * This routine handles fixing up the stack and general housecleaning.
2896 * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation.  The handlers
 * for copyOP and xcopyOP are found at the end of each individual function.
2899 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
2900 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
2901 */
2902
2903/*
2904 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2905 */
2906
2907#if defined(lint)
2908
2909/*ARGSUSED*/
2910int
2911copyout(const void *kaddr, void *uaddr, size_t count)
2912{ return (0); }
2913
2914#else	/* lint */
2915
/*
 * Global registers that keep the original arguments alive across a
 * fault, so the fault handlers can hand them back in %o0-%o2:
 *
 *	SAVE_SRC   (%g2) - kaddr
 *	SAVE_DST   (%g3) - uaddr
 *	SAVE_COUNT (%g4) - count
 */
#define	SAVE_SRC	%g2
#define	SAVE_DST	%g3
#define	SAVE_COUNT	%g4

/*
 * REAL_LOFAULT  (%g5) - address of the operation-specific fault handler
 *			 that the generic copyio fault code jumps to.
 * SAVED_LOFAULT (%g6) - t_lofault value that was in effect before the
 *			 copy installed its own handler.
 */
#define	REAL_LOFAULT		%g5
#define	SAVED_LOFAULT		%g6
2928
/*
 * copyio_fault
 *
 * Common first-stage fault handler for (x)copyin/(x)copyout, shared by
 * every flavor of the copy operations including the _noerr versions.
 *
 * On entry REAL_LOFAULT must hold the address of the operation-specific
 * handler.  The routine pops the register window with "restore",
 * reloads %o0-%o2 from SAVE_SRC/SAVE_DST/SAVE_COUNT and jumps to
 * REAL_LOFAULT, so that handler sees the original input parameters and
 * can vector to the appropriate member of the t_copyop structure if it
 * needs to.
 *
 * !NIAGARA_IMPL: FPUSED_FLAG is stripped from SAVED_LOFAULT; if it was
 * set, the floating point state is put back first (%gsr from %l5, then
 * depending on FPRS_FEF in %g1 either the FP registers are reloaded
 * from the stack or they are zeroed and %fprs is written from %g1).
 *
 * NIAGARA_IMPL: the previous t_lofault held in SAVED_LOFAULT is stored
 * back into the thread after a membar #Sync.
 */
	ENTRY(copyio_fault)
#if !defined(NIAGARA_IMPL)
	andcc	SAVED_LOFAULT, FPUSED_FLAG, %g0	! was the FP path in use?
	be	1f				! no: nothing to put back
	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT ! always drop the flag

	wr	%l5, 0, %gsr		! restore gsr

	andcc	%g1, FPRS_FEF, %g0	! FPRS_FEF set in the saved fprs?
	be	%icc, 4f		! no: zero the fpregs instead
	nop

	! reload the fpregs from the stack
	BLD_FP_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	nop
4:
	FZERO				! zero all of the fpregs
	wr	%g1, %g0, %fprs		! restore fprs
1:
	restore				! pop the register window
	mov	SAVE_DST, %o1		! hand the original arguments back
	mov	SAVE_SRC, %o0
	jmp	REAL_LOFAULT		! on to the operation's own handler
	mov	SAVE_COUNT, %o2		! delay slot

#else	/* NIAGARA_IMPL */
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	restore				! pop the register window
	mov	SAVE_DST, %o1		! hand the original arguments back
	mov	SAVE_SRC, %o0
	jmp	REAL_LOFAULT		! on to the operation's own handler
	mov	SAVE_COUNT, %o2		! delay slot

#endif	/* NIAGARA_IMPL */

	SET_SIZE(copyio_fault)
2979
2980	ENTRY(copyio_fault_nowindow)
2981	membar	#Sync
2982	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2983
2984	mov	SAVE_SRC, %o0
2985	mov	SAVE_DST, %o1
2986	jmp	REAL_LOFAULT
2987	mov	SAVE_COUNT, %o2
2988	SET_SIZE(copyio_fault_nowindow)
2989
2990	ENTRY(copyout)
2991	sethi	%hi(.copyout_err), REAL_LOFAULT
2992	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2993
2994#if !defined(NIAGARA_IMPL)
2995.do_copyout:
2996	tst	%o2			! check for zero count;  quick exit
2997	bz,pt	%ncc, .co_smallqx
2998	mov	%o0, SAVE_SRC
2999	mov	%o1, SAVE_DST
3000	mov	%o2, SAVE_COUNT
3001	cmp	%o2, FP_COPY		! check for small copy/leaf case
3002	bgt,pt	%ncc, .co_copy_more
3003	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3004/*
3005 * Small copy out code
3006 *
3007 */
3008	sethi	%hi(copyio_fault_nowindow), %o3
3009	or	%o3, %lo(copyio_fault_nowindow), %o3
3010	membar	#Sync
3011	stn	%o3, [THREAD_REG + T_LOFAULT]
3012
3013	mov	ASI_USER, %asi
3014	cmp	%o2, SHORTCOPY		! make sure there is enough to align
3015	ble,pt	%ncc, .co_smallest
3016	andcc	%o1, 0x7, %o3		! is dest long word aligned
3017	bnz,pn	%ncc, .co_align
3018	andcc	%o1, 1, %o3		! is dest byte aligned
3019
3020! Destination is long word aligned
3021! 8 cases for src alignment; load parts, store long words
3022.co_al_src:
3023	andcc	%o0, 7, %o3
3024	brnz,pt	%o3, .co_src_dst_unal8
3025	nop
3026/*
3027 * Special case for handling when src and dest are both long word aligned
3028 * and total data to move is less than FP_COPY bytes
3029 * Also handles finish up for large block moves, so may be less than 32 bytes
3030 */
3031.co_medlong:
3032	subcc	%o2, 31, %o2		! adjust length to allow cc test
3033	ble,pt	%ncc, .co_medl31
3034	nop
3035.co_medl32:
3036	ldx	[%o0], %o4		! move 32 bytes
3037	subcc	%o2, 32, %o2		! decrement length count by 32
3038	stxa	%o4, [%o1]%asi
3039	ldx	[%o0+8], %o4
3040	stxa	%o4, [%o1+8]%asi
3041	ldx	[%o0+16], %o4
3042	add	%o0, 32, %o0		! increase src ptr by 32
3043	stxa	%o4, [%o1+16]%asi
3044	ldx	[%o0-8], %o4
3045	add	%o1, 32, %o1		! increase dst ptr by 32
3046	bgu,pt	%ncc, .co_medl32	! repeat if at least 32 bytes left
3047	stxa	%o4, [%o1-8]%asi
3048.co_medl31:
3049	addcc	%o2, 24, %o2		! adjust count to be off by 7
3050	ble,pt	%ncc, .co_medl7		! skip if 7 or fewer bytes left
3051	nop
3052.co_medl8:
3053	ldx	[%o0], %o4		! move 8 bytes
3054	add	%o0, 8, %o0		! increase src ptr by 8
3055	subcc	%o2, 8, %o2		! decrease count by 8
3056	add	%o1, 8, %o1		! increase dst ptr by 8
3057	bgu,pt	%ncc, .co_medl8
3058	stxa	%o4, [%o1-8]%asi
3059.co_medl7:
3060	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3061	bnz,pt	%ncc, .co_small4	! do final bytes if not finished
3062
3063.co_smallx:				! finish up and exit
3064	membar	#Sync
3065	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3066.co_smallqx:
3067	retl
3068	mov	%g0, %o0
3069
3070.co_small4:
3071	cmp	%o2, 4
3072	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3073	nop				!
3074	ld	[%o0], %o4		! move 4 bytes
3075	add	%o0, 4, %o0		! increase src ptr by 4
3076	add	%o1, 4, %o1		! increase dst ptr by 4
3077	subcc	%o2, 4, %o2		! decrease count by 4
3078	bz,pt	%ncc, .co_smallx
3079	stwa	%o4, [%o1-4]%asi
3080
3081.co_small3x:				! Exactly 1, 2, or 3 bytes remain
3082	subcc	%o2, 1, %o2		! reduce count for cc test
3083	ldub	[%o0], %o4		! load one byte
3084	bz,pt	%ncc, .co_smallx
3085	stba	%o4, [%o1]%asi		! store one byte
3086	ldub	[%o0+1], %o4		! load second byte
3087	subcc	%o2, 1, %o2
3088	bz,pt	%ncc, .co_smallx
3089	stba	%o4, [%o1+1]%asi	! store second byte
3090	ldub	[%o0+2], %o4		! load third byte
3091	ba	.co_smallx
3092	stba	%o4, [%o1+2]%asi	! store third byte
3093
3094.co_smallest:				! 7 or fewer bytes remain
3095	cmp	%o2, 4
3096	blt,pt	%ncc, .co_small3x
3097	nop
3098	ldub	[%o0], %o4		! read byte
3099	subcc	%o2, 4, %o2		! reduce count by 4
3100	stba	%o4, [%o1]%asi		! write byte
3101	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
3102	add	%o0, 4, %o0		! advance src by 4
3103	stba	%o4, [%o1+1]%asi
3104	ldub	[%o0-2], %o4
3105	add	%o1, 4, %o1		! advance dst by 4
3106	stba	%o4, [%o1-2]%asi
3107	ldub	[%o0-1], %o4
3108	bnz,pt	%ncc, .co_small3x
3109	stba	%o4, [%o1-1]%asi
3110	membar	#Sync
3111	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3112	retl
3113	mov	%g0, %o0
3114
3115.co_align:				! byte align test in prior branch delay
3116	bnz,pt	%ncc, .co_al_d1
3117.co_al_d1f:				! dest is now half word aligned
3118	andcc	%o1, 2, %o3
3119	bnz,pt	%ncc, .co_al_d2
3120.co_al_d2f:				! dest is now word aligned
3121	andcc	%o1, 4, %o3		! is dest longword aligned?
3122	bz,pt	%ncc, .co_al_src
3123	nop
3124.co_al_d4:				! dest is word aligned;  src is unknown
3125	ldub	[%o0], %o4		! move a word (src align unknown)
3126	ldub	[%o0+1], %o3
3127	sll	%o4, 24, %o4		! position
3128	sll	%o3, 16, %o3		! position
3129	or	%o4, %o3, %o3		! merge
3130	ldub	[%o0+2], %o4
3131	sll	%o4, 8, %o4		! position
3132	or	%o4, %o3, %o3		! merge
3133	ldub	[%o0+3], %o4
3134	or	%o4, %o3, %o4		! merge
3135	stwa	%o4,[%o1]%asi		! store four bytes
3136	add	%o0, 4, %o0		! adjust src by 4
3137	add	%o1, 4, %o1		! adjust dest by 4
3138	sub	%o2, 4, %o2		! adjust count by 4
3139	andcc	%o0, 7, %o3		! check for src long word alignment
3140	brz,pt	%o3, .co_medlong
3141.co_src_dst_unal8:
3142	! dst is 8-byte aligned, src is not
3143	! Size is less than FP_COPY
3144	! Following code is to select for alignment
3145	andcc	%o0, 0x3, %o3		! test word alignment
3146	bz,pt	%ncc, .co_medword
3147	nop
3148	andcc	%o0, 0x1, %o3		! test halfword alignment
3149	bnz,pt	%ncc, .co_med_byte	! go to byte move if not halfword
3150	andcc	%o0, 0x2, %o3		! test which byte alignment
3151	ba	.co_medhalf
3152	nop
3153.co_al_d1:				! align dest to half word
3154	ldub	[%o0], %o4		! move a byte
3155	add	%o0, 1, %o0
3156	stba	%o4, [%o1]%asi
3157	add	%o1, 1, %o1
3158	andcc	%o1, 2, %o3
3159	bz,pt	%ncc, .co_al_d2f
3160	sub	%o2, 1, %o2
3161.co_al_d2:				! align dest to word
3162	ldub	[%o0], %o4		! move a half-word (src align unknown)
3163	ldub	[%o0+1], %o3
3164	sll	%o4, 8, %o4		! position
3165	or	%o4, %o3, %o4		! merge
3166	stha	%o4, [%o1]%asi
3167	add	%o0, 2, %o0
3168	add	%o1, 2, %o1
3169	andcc	%o1, 4, %o3		! is dest longword aligned?
3170	bz,pt	%ncc, .co_al_src
3171	sub	%o2, 2, %o2
3172	ba	.co_al_d4
3173	nop
3174/*
3175 * Handle all cases where src and dest are aligned on word
3176 * boundaries. Use unrolled loops for better performance.
3177 * This option wins over standard large data move when
3178 * source and destination is in cache for medium
3179 * to short data moves.
3180 */
3181.co_medword:
	! Word-at-a-time copy: plain ld from src (%o0), stwa through %asi
	! to dst (%o1). The count in %o2 is kept biased so that a single
	! subcc/addcc result can be tested directly:
	!   biased by -31 while running the 32-byte loop,
	!   biased by -7 while running the 8-byte loop,
	!   unbiased again at .co_medw7.
3182	subcc	%o2, 31, %o2		! adjust length to allow cc test
3183	ble,pt	%ncc, .co_medw31
3184	nop
3185.co_medw32:
3186	ld	[%o0], %o4		! move a block of 32 bytes
3187	stwa	%o4, [%o1]%asi
3188	ld	[%o0+4], %o4
3189	stwa	%o4, [%o1+4]%asi
3190	ld	[%o0+8], %o4
3191	stwa	%o4, [%o1+8]%asi
3192	ld	[%o0+12], %o4
3193	stwa	%o4, [%o1+12]%asi
3194	ld	[%o0+16], %o4
3195	stwa	%o4, [%o1+16]%asi
3196	ld	[%o0+20], %o4
3197	subcc	%o2, 32, %o2		! decrement length count
3198	stwa	%o4, [%o1+20]%asi
3199	ld	[%o0+24], %o4
3200	add	%o0, 32, %o0		! increase src ptr by 32
3201	stwa	%o4, [%o1+24]%asi
3202	ld	[%o0-4], %o4		! last word of the block (src already advanced)
3203	add	%o1, 32, %o1		! increase dst ptr by 32
3204	bgu,pt	%ncc, .co_medw32	! repeat if at least 32 bytes left
3205	stwa	%o4, [%o1-4]%asi	! (delay slot) store last word of the block
3206.co_medw31:
3207	addcc	%o2, 24, %o2		! adjust count to be off by 7
3208	ble,pt	%ncc, .co_medw7		! skip if 7 or fewer bytes left
3209	nop				!
3210.co_medw15:
3211	ld	[%o0], %o4		! move a block of 8 bytes
3212	subcc	%o2, 8, %o2		! decrement length count
3213	stwa	%o4, [%o1]%asi
3214	add	%o0, 8, %o0		! increase src ptr by 8
3215	ld	[%o0-4], %o4
3216	add	%o1, 8, %o1		! increase dst ptr by 8
3217	bgu,pt	%ncc, .co_medw15
3218	stwa	%o4, [%o1-4]%asi	! (delay slot) store second word
3219.co_medw7:
3220	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3221	bz,pt	%ncc, .co_smallx	! exit if finished
3222	cmp	%o2, 4			! (delay slot) sets cc for the blt below
3223	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3224	nop				!
3225	ld	[%o0], %o4		! move 4 bytes
3226	add	%o0, 4, %o0		! increase src ptr by 4
3227	add	%o1, 4, %o1		! increase dst ptr by 4
3228	subcc	%o2, 4, %o2		! decrease count by 4
3229	bnz	.co_small3x
3230	stwa	%o4, [%o1-4]%asi	! (delay slot) store the word either way
	! Nothing left: put the saved handler back in t_lofault and
	! return 0.
3231	membar	#Sync
3232	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3233	retl
3234	mov	%g0, %o0		! (delay slot) return value 0
3235
3236.co_medhalf:
	! Copy loop used when src (%o0) is halfword aligned but not word
	! aligned (see the dispatch at .co_src_dst_unal8). Each 8-byte
	! output word is assembled from a halfword, a word (at src+2) and
	! a halfword, then stored with a single stxa through %asi.
	! %o2 uses the same -31 / -7 count bias as the other medium loops.
3237	subcc	%o2, 31, %o2		! adjust length to allow cc test
3238	ble,pt	%ncc, .co_medh31
3239	nop
3240.co_medh32:				! load and store block of 32 bytes
3241
3242	lduh	[%o0], %o4		! move 32 bytes
3243	subcc	%o2, 32, %o2		! decrement length count
3244	lduw	[%o0+2], %o3
3245	sllx	%o4, 48, %o4		! halfword -> bits 63:48
3246	sllx	%o3, 16, %o3		! word -> bits 47:16
3247	or	%o4, %o3, %o3
3248	lduh	[%o0+6], %o4		! final halfword -> bits 15:0
3249	or	%o4, %o3, %o4
3250	stxa	%o4, [%o1]%asi
3251
3252	lduh	[%o0+8], %o4
3253	lduw	[%o0+10], %o3
3254	sllx	%o4, 48, %o4
3255	sllx	%o3, 16, %o3
3256	or	%o4, %o3, %o3
3257	lduh	[%o0+14], %o4
3258	or	%o4, %o3, %o4
3259	stxa	%o4, [%o1+8]%asi
3260
3261	lduh	[%o0+16], %o4
3262	lduw	[%o0+18], %o3
3263	sllx	%o4, 48, %o4
3264	sllx	%o3, 16, %o3
3265	or	%o4, %o3, %o3
3266	lduh	[%o0+22], %o4
3267	or	%o4, %o3, %o4
3268	stxa	%o4, [%o1+16]%asi
3269
3270	add	%o0, 32, %o0		! increase src ptr by 32
3271	add	%o1, 32, %o1		! increase dst ptr by 32
3272
	! Fourth doubleword, addressed relative to the advanced pointers.
3273	lduh	[%o0-8], %o4
3274	lduw	[%o0-6], %o3
3275	sllx	%o4, 48, %o4
3276	sllx	%o3, 16, %o3
3277	or	%o4, %o3, %o3
3278	lduh	[%o0-2], %o4
3279	or	%o3, %o4, %o4
3280	bgu,pt	%ncc, .co_medh32	! repeat if at least 32 bytes left
3281	stxa	%o4, [%o1-8]%asi	! (delay slot) store fourth doubleword
3282
3283.co_medh31:
3284	addcc	%o2, 24, %o2		! adjust count to be off by 7
3285	ble,pt	%ncc, .co_medh7		! skip if 7 or fewer bytes left
3286	nop				!
3287.co_medh15:
3288	lduh	[%o0], %o4		! move 8 bytes
3289	subcc	%o2, 8, %o2		! decrement length count
3290	lduw	[%o0+2], %o3
3291	sllx	%o4, 48, %o4
3292	sllx	%o3, 16, %o3
3293	or	%o4, %o3, %o3
3294	add	%o1, 8, %o1		! increase dst ptr by 8
3295	lduh	[%o0+6], %o4
3296	add	%o0, 8, %o0		! increase src ptr by 8
3297	or	%o4, %o3, %o4
3298	bgu,pt	%ncc, .co_medh15
3299	stxa	%o4, [%o1-8]%asi	! (delay slot) store the assembled doubleword
3300.co_medh7:
3301	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3302	bz,pt	%ncc, .co_smallx	! exit if finished
3303	cmp	%o2, 4			! (delay slot) sets cc for the blt below
3304	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3305	nop				!
3306	lduh	[%o0], %o4		! move 4 bytes as two halfwords
3307	sll	%o4, 16, %o4
3308	lduh	[%o0+2], %o3
3309	or	%o3, %o4, %o4
3310	subcc	%o2, 4, %o2
3311	add	%o0, 4, %o0
3312	add	%o1, 4, %o1
3313	bnz	.co_small3x
3314	stwa	%o4, [%o1-4]%asi	! (delay slot) store the word either way
	! Nothing left: put the saved handler back in t_lofault and
	! return 0.
3315	membar	#Sync
3316	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3317	retl
3318	mov	%g0, %o0		! (delay slot) return value 0
3319
3320	.align 16
3321.co_med_byte:
	! Entered with src (%o0) odd and with the condition codes still
	! holding "andcc %o0, 0x2" from the dispatch code. If bit 1 is
	! set the copy continues at .co_medbh32a; otherwise each 8-byte
	! output word is assembled here as byte + halfword (src+1) +
	! word (src+3) + byte (src+7) and stored with one stxa.
	! %o2 uses the same -31 / -7 count bias as the other medium loops.
3322	bnz,pt	%ncc, .co_medbh32a	! go to correct byte move
3323	subcc	%o2, 31, %o2		! adjust length to allow cc test
3324	ble,pt	%ncc, .co_medb31
3325	nop
3326.co_medb32:				! Alignment 1 or 5
3327	subcc	%o2, 32, %o2		! decrement length count
3328
3329	ldub	[%o0], %o4		! load and store a block of 32 bytes
3330	sllx	%o4, 56, %o3		! byte -> bits 63:56
3331	lduh	[%o0+1], %o4
3332	sllx	%o4, 40, %o4		! halfword -> bits 55:40
3333	or	%o4, %o3, %o3
3334	lduw	[%o0+3], %o4
3335	sllx	%o4, 8, %o4		! word -> bits 39:8
3336	or	%o4, %o3, %o3
3337	ldub	[%o0+7], %o4		! last byte -> bits 7:0
3338	or	%o4, %o3, %o4
3339	stxa	%o4, [%o1]%asi
3340
3341	ldub	[%o0+8], %o4
3342	sllx	%o4, 56, %o3
3343	lduh	[%o0+9], %o4
3344	sllx	%o4, 40, %o4
3345	or	%o4, %o3, %o3
3346	lduw	[%o0+11], %o4
3347	sllx	%o4, 8, %o4
3348	or	%o4, %o3, %o3
3349	ldub	[%o0+15], %o4
3350	or	%o4, %o3, %o4
3351	stxa	%o4, [%o1+8]%asi
3352
3353	ldub	[%o0+16], %o4
3354	sllx	%o4, 56, %o3
3355	lduh	[%o0+17], %o4
3356	sllx	%o4, 40, %o4
3357	or	%o4, %o3, %o3
3358	lduw	[%o0+19], %o4
3359	sllx	%o4, 8, %o4
3360	or	%o4, %o3, %o3
3361	ldub	[%o0+23], %o4
3362	or	%o4, %o3, %o4
3363	stxa	%o4, [%o1+16]%asi
3364
3365	add	%o0, 32, %o0		! increase src ptr by 32
3366	add	%o1, 32, %o1		! increase dst ptr by 32
3367
	! Fourth doubleword, addressed relative to the advanced pointers.
3368	ldub	[%o0-8], %o4
3369	sllx	%o4, 56, %o3
3370	lduh	[%o0-7], %o4
3371	sllx	%o4, 40, %o4
3372	or	%o4, %o3, %o3
3373	lduw	[%o0-5], %o4
3374	sllx	%o4, 8, %o4
3375	or	%o4, %o3, %o3
3376	ldub	[%o0-1], %o4
3377	or	%o4, %o3, %o4
3378	bgu,pt	%ncc, .co_medb32	! repeat if at least 32 bytes left
3379	stxa	%o4, [%o1-8]%asi	! (delay slot) store fourth doubleword
3380
3381.co_medb31:				! 31 or fewer bytes remaining
3382	addcc	%o2, 24, %o2		! adjust count to be off by 7
3383	ble,pt	%ncc, .co_medb7		! skip if 7 or fewer bytes left
3384	nop				!
3385.co_medb15:
3386
3387	ldub	[%o0], %o4		! load and store a block of 8 bytes
3388	subcc	%o2, 8, %o2		! decrement length count
3389	sllx	%o4, 56, %o3
3390	lduh	[%o0+1], %o4
3391	sllx	%o4, 40, %o4
3392	or	%o4, %o3, %o3
3393	lduw	[%o0+3], %o4
3394	add	%o1, 8, %o1		! increase dst ptr by 8
3395	sllx	%o4, 8, %o4
3396	or	%o4, %o3, %o3
3397	ldub	[%o0+7], %o4
3398	add	%o0, 8, %o0		! increase src ptr by 8
3399	or	%o4, %o3, %o4
3400	bgu,pt	%ncc, .co_medb15
3401	stxa	%o4, [%o1-8]%asi	! (delay slot) store the assembled doubleword
3402.co_medb7:
	! Also the tail for the .co_medbh31/.co_medbh15 path, which
	! branches here.
3403	addcc	%o2, 7, %o2		! finish adjustment of remaining count
3404	bz,pt	%ncc, .co_smallx	! exit if finished
3405	cmp	%o2, 4			! (delay slot) sets cc for the blt below
3406	blt,pt	%ncc, .co_small3x	! skip if less than 4 bytes left
3407	nop				!
3408	ldub	[%o0], %o4		! move 4 bytes
3409	sll	%o4, 24, %o3
3410	lduh	[%o0+1], %o4
3411	sll	%o4, 8, %o4
3412	or	%o4, %o3, %o3
3413	ldub	[%o0+3], %o4
3414	or	%o4, %o3, %o4
3415	subcc	%o2, 4, %o2
3416	add	%o0, 4, %o0
3417	add	%o1, 4, %o1
3418	bnz	.co_small3x
3419	stwa	%o4, [%o1-4]%asi	! (delay slot) store the word either way
	! Nothing left: put the saved handler back in t_lofault and
	! return 0.
3420	membar	#Sync
3421	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3422	retl
3423	mov	%g0, %o0		! (delay slot) return value 0
3424
3425	.align 16
3426.co_medbh32a:
	! Reached from .co_med_byte when bit 1 of the odd src address is
	! set; the ble below tests the "subcc %o2, 31" executed in that
	! branch's delay slot. Each 8-byte output word is assembled as
	! byte + word (src+1) + halfword (src+5) + byte (src+7).
3427	ble,pt	%ncc, .co_medbh31
3428	nop
3429.co_medbh32:				! Alignment 3 or 7
3430	subcc	%o2, 32, %o2		! decrement length count
3431
3432	ldub	[%o0], %o4		! load and store a block of 32 bytes
3433	sllx	%o4, 56, %o3		! byte -> bits 63:56
3434	lduw	[%o0+1], %o4
3435	sllx	%o4, 24, %o4		! word -> bits 55:24
3436	or	%o4, %o3, %o3
3437	lduh	[%o0+5], %o4
3438	sllx	%o4, 8, %o4		! halfword -> bits 23:8
3439	or	%o4, %o3, %o3
3440	ldub	[%o0+7], %o4		! last byte -> bits 7:0
3441	or	%o4, %o3, %o4
3442	stxa	%o4, [%o1]%asi
3443
3444	ldub	[%o0+8], %o4
3445	sllx	%o4, 56, %o3
3446	lduw	[%o0+9], %o4
3447	sllx	%o4, 24, %o4
3448	or	%o4, %o3, %o3
3449	lduh	[%o0+13], %o4
3450	sllx	%o4, 8, %o4
3451	or	%o4, %o3, %o3
3452	ldub	[%o0+15], %o4
3453	or	%o4, %o3, %o4
3454	stxa	%o4, [%o1+8]%asi
3455
3456	ldub	[%o0+16], %o4
3457	sllx	%o4, 56, %o3
3458	lduw	[%o0+17], %o4
3459	sllx	%o4, 24, %o4
3460	or	%o4, %o3, %o3
3461	lduh	[%o0+21], %o4
3462	sllx	%o4, 8, %o4
3463	or	%o4, %o3, %o3
3464	ldub	[%o0+23], %o4
3465	or	%o4, %o3, %o4
3466	stxa	%o4, [%o1+16]%asi
3467
3468	add	%o0, 32, %o0		! increase src ptr by 32
3469	add	%o1, 32, %o1		! increase dst ptr by 32
3470
	! Fourth doubleword, addressed relative to the advanced pointers.
3471	ldub	[%o0-8], %o4
3472	sllx	%o4, 56, %o3
3473	lduw	[%o0-7], %o4
3474	sllx	%o4, 24, %o4
3475	or	%o4, %o3, %o3
3476	lduh	[%o0-3], %o4
3477	sllx	%o4, 8, %o4
3478	or	%o4, %o3, %o3
3479	ldub	[%o0-1], %o4
3480	or	%o4, %o3, %o4
3481	bgu,pt	%ncc, .co_medbh32	! repeat if at least 32 bytes left
3482	stxa	%o4, [%o1-8]%asi	! (delay slot) store fourth doubleword
3483
3484.co_medbh31:
3485	addcc	%o2, 24, %o2		! adjust count to be off by 7
3486	ble,pt	%ncc, .co_medb7		! skip if 7 or fewer bytes left
3487	nop				!
3488.co_medbh15:
	! 8-byte loop for the "Alignment 3 or 7" source case: assemble one
	! doubleword as byte + word (src+1) + halfword (src+5) + byte
	! (src+7) and store it once, in the branch delay slot, after the
	! pointers have been advanced. This matches the single-store shape
	! of the sibling loops .co_medb15 and .co_medh15; an earlier copy
	! of this loop also stored the same value to the same address
	! before the pointer update, which was redundant.
	! %o2 is biased by -7 here; the remainder is finished at .co_medb7.
3489	ldub	[%o0], %o4		! load and store a block of 8 bytes
3490	sllx	%o4, 56, %o3		! byte -> bits 63:56
3491	lduw	[%o0+1], %o4
3492	sllx	%o4, 24, %o4		! word -> bits 55:24
3493	or	%o4, %o3, %o3
3494	lduh	[%o0+5], %o4
3495	sllx	%o4, 8, %o4		! halfword -> bits 23:8
3496	or	%o4, %o3, %o3
3497	ldub	[%o0+7], %o4		! last byte -> bits 7:0
3498	or	%o4, %o3, %o4
3500	subcc	%o2, 8, %o2		! decrement length count
3501	add	%o1, 8, %o1		! increase dst ptr by 8
3502	add	%o0, 8, %o0		! increase src ptr by 8
3503	bgu,pt	%ncc, .co_medbh15
3504	stxa	%o4, [%o1-8]%asi	! (delay slot) the only store of this doubleword
3505	ba	.co_medb7
3506	nop
3507/*
3508 * End of small copy (no window) code
3509 */
3510
3511/*
3512 * Long copy code
3513 */
3514.co_copy_more:
	! Entry to the large-copy path: install copyio_fault as the
	! thread's lofault handler, get a register window and stack
	! frame, and make the FPU usable.
3515	sethi	%hi(copyio_fault), %o3
3516	or	%o3, %lo(copyio_fault), %o3
3517	membar	#Sync
3518	stn	%o3, [THREAD_REG + T_LOFAULT]
3519
3520/*
3521 * Following code is for large copies. We know there is at
3522 * least FP_COPY bytes available. FP regs are used, so
3523 *  we save registers and fp regs before starting
3524 */
3525	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	! After the save the arguments are addressed as %i0 (src),
	! %i1 (dst) and %i2 (count).
3526	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT	! set FPUSED_FLAG in the saved-handler word
3527	rd	%fprs, %g1		! check for unused fp
3528	! if fprs.fef == 0, set it.
3529	! Setting it when already set costs more than checking
3530	andcc	%g1, FPRS_FEF, %g1	! test FEF, fprs.du = fprs.dl = 0
3531	bz,pt	%ncc, .co_fp_unused
3532	mov	ASI_USER, %asi		! (delay slot) later "%asi" stores use ASI_USER
	! FP already enabled: BST_FP_TOSTACK is a macro defined elsewhere;
	! presumably it saves the live FP registers to the stack frame
	! (confirm against its definition).
3533	BST_FP_TOSTACK(%o3)
3534	ba	.co_fp_ready
	! The prefetch below is the delay slot of the ba above, so it runs
	! on both paths; only the wr is skipped when FP was already on.
3535.co_fp_unused:
3536	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
3537	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3538.co_fp_ready:
	! Save %gsr, then bring dst (%i1) up to 8-byte alignment via the
	! .co_big_d1/.co_big_d2/.co_big_d4 helpers, which return to the
	! matching ..._d?f labels. Each bnz below has the following andcc
	! as its delay slot, so that andcc runs whether or not the branch
	! is taken.
3539	rd	%gsr, %l5		! save %gsr value
3540	andcc	%i1, 1, %o3		! is dest byte aligned
3541	bnz,pt	%ncc, .co_big_d1
3542.co_big_d1f:				! dest is now half word aligned
3543	andcc	%i1, 2, %o3
3544	bnz,pt	%ncc, .co_big_d2
3545.co_big_d2f:				! dest is now word aligned
3546	andcc	%i1, 4, %o3		! is dest longword aligned
3547	bnz,pt	%ncc, .co_big_d4
3548.co_big_d4f:				! dest is now long word aligned
3549	andcc	%i0, 7, %o3		! is src long word aligned
3550	brnz,pt	%o3, .co_big_unal8
3551	prefetch [%i0 + (2 * CACHE_LINE)], #one_read
3552	! Src and dst are long word aligned
3553	! align dst to 64 byte boundary
3554	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
3555	brz,pn	%o3, .co_al_to_64
3556	nop
3557	sub	%o3, 64, %o3		! %o3 has negative bytes to move
3558	add	%i2, %o3, %i2		! adjust remaining count
3559	andcc	%o3, 8, %o4		! odd long words to move?
3560	brz,pt	%o4, .co_al_to_16
3561	nop
	! Move one 8-byte word so the remaining distance is a multiple
	! of 16.
3562	add	%o3, 8, %o3
3563	ldx	[%i0], %o4
3564	add	%i0, 8, %i0		! increment src ptr
3565	stxa	%o4, [%i1]ASI_USER
3566	add	%i1, 8, %i1		! increment dst ptr
3567! Dest is aligned on 16 bytes, src 8 byte aligned
3568.co_al_to_16:
	! %o3 holds the negative number of bytes still to move before dst
	! reaches a 64-byte boundary. While bits 5:4 of %o3 are non-zero,
	! move 16 bytes (two ldx / stxa pairs) and add 16 to %o3.
3569	andcc	%o3, 0x30, %o4		! more to move?
3570	brz,pt	%o4, .co_al_to_64
3571	nop
3572.co_al_mv_16:
3573	add	%o3, 16, %o3
3574	ldx	[%i0], %o4
3575	stxa	%o4, [%i1]ASI_USER
3576	add	%i0, 16, %i0		! increment src ptr
3577	ldx	[%i0-8], %o4		! second doubleword (src already advanced)
3578	add	%i1, 8, %i1		! increment dst ptr
3579	stxa	%o4, [%i1]ASI_USER
3580	andcc	%o3, 0x30, %o4
3581	brnz,pt	%o4, .co_al_mv_16
3582	add	%i1, 8, %i1		! increment dst ptr
3583! Dest is aligned on 64 bytes, src 8 byte aligned
3584.co_al_to_64:
3585	! Determine source alignment
3586	! to correct 8 byte offset
	! Three-level decision tree on bits 5, 4 and 3 of src (%i0); the
	! suffix of the target label .co_aln_XYZ is those three bits.
	! Several tests sit in branch delay slots, so each andcc result
	! is consumed by the brz/brnz at the branch target or fall-through.
	! The delay-slot prefetches run whether or not the branch is taken.
3587	andcc	%i0, 32, %o3
3588	brnz,pn	%o3, .co_aln_1
3589	andcc	%i0, 16, %o3
3590	brnz,pn	%o3, .co_aln_01
3591	andcc	%i0, 8, %o3
3592	brz,pn	%o3, .co_aln_000
3593	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3594	ba	.co_aln_001
3595	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3596.co_aln_01:
	! bit 5 clear, bit 4 set; %o3 holds bit 3
3597	brnz,pn	%o3, .co_aln_011
3598	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3599	ba	.co_aln_010
3600	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3601.co_aln_1:
	! bit 5 set; re-test bit 4 (same test as in the delay slot above)
3602	andcc	%i0, 16, %o3
3603	brnz,pn	%o3, .co_aln_11
3604	andcc	%i0, 8, %o3
3605	brnz,pn	%o3, .co_aln_101
3606	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3607	ba	.co_aln_100
3608	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3609.co_aln_11:
	! bits 5 and 4 set; %o3 holds bit 3. Falls through to .co_aln_111
	! when bit 3 is set.
3610	brz,pn	%o3, .co_aln_110
3611	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
3612
3613.co_aln_111:
3614! Alignment off by 8 bytes
	! Preload one doubleword into %d0 and advance src by 8. Each loop
	! pass block-loads 64 bytes into %d16-%d30, moves the first seven
	! doublewords into %d2-%d14 so that %d0-%d14 form the next 64
	! output bytes, block-stores them to user space, and carries %d30
	! into %d0 for the next pass. The one carried doubleword is stored
	! after the loop.
3615	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3616	ldd	[%i0], %d0
3617	add	%i0, 8, %i0
3618	sub	%i2, 8, %i2
3619	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3620	and	%i2, 0x7f, %i2		! residue bytes in %i2
3621	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3622.co_aln_111_loop:
3623	ldda	[%i0]ASI_BLK_P,%d16		! block load
3624	subcc	%o3, 64, %o3		! cc consumed by the bgt at the loop bottom
3625	fmovd	%d16, %d2
3626	fmovd	%d18, %d4
3627	fmovd	%d20, %d6
3628	fmovd	%d22, %d8
3629	fmovd	%d24, %d10
3630	fmovd	%d26, %d12
3631	fmovd	%d28, %d14
3632	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3633	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3634	add	%i0, 64, %i0
3635	fmovd	%d30, %d0		! carry last doubleword to next pass
3636	bgt,pt	%ncc, .co_aln_111_loop
3637	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3638	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3639
	! Flush the carried doubleword. This store names ASI_USER
	! directly; the sibling paths use %asi, which .co_copy_more set
	! to ASI_USER.
3640	stda	%d0, [%i1]ASI_USER
3641	ba	.co_remain_stuff
3642	add	%i1, 8, %i1		! (delay slot) account for the 8 bytes just stored
3643	! END OF aln_111
3644
3645.co_aln_110:
3646! Alignment off by 16 bytes
	! Preload two doublewords (%d0, %d2) and advance src by 16. Each
	! pass block-loads 64 bytes into %d16-%d30, moves six doublewords
	! into %d4-%d14, block-stores %d0-%d14, and carries %d28/%d30 into
	! %d0/%d2. The two carried doublewords are stored after the loop.
3647	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3648	ldd	[%i0], %d0
3649	ldd	[%i0+8], %d2
3650	add	%i0, 16, %i0
3651	sub	%i2, 16, %i2
3652	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3653	and	%i2, 0x7f, %i2		! residue bytes in %i2
3654	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3655.co_aln_110_loop:
3656	ldda	[%i0]ASI_BLK_P,%d16		! block load
3657	subcc	%o3, 64, %o3		! cc consumed by the bgt at the loop bottom
3658	fmovd	%d16, %d4
3659	fmovd	%d18, %d6
3660	fmovd	%d20, %d8
3661	fmovd	%d22, %d10
3662	fmovd	%d24, %d12
3663	fmovd	%d26, %d14
3664	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3665	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3666	add	%i0, 64, %i0
3667	fmovd	%d28, %d0		! carry the last two doublewords
3668	fmovd	%d30, %d2
3669	bgt,pt	%ncc, .co_aln_110_loop
3670	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3671	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3672
	! Flush the carried doublewords.
3673	stda	%d0, [%i1]%asi
3674	stda	%d2, [%i1+8]%asi
3675	ba	.co_remain_stuff
3676	add	%i1, 16, %i1		! (delay slot) account for the 16 bytes just stored
3677	! END OF aln_110
3678
3679.co_aln_101:
3680! Alignment off by 24 bytes
	! Preload three doublewords (%d0-%d4) and advance src by 24. Each
	! pass block-loads 64 bytes into %d16-%d30, moves five doublewords
	! into %d6-%d14, block-stores %d0-%d14, and carries %d26-%d30 into
	! %d0-%d4. The three carried doublewords are stored after the loop.
3681	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3682	ldd	[%i0], %d0
3683	ldd	[%i0+8], %d2
3684	ldd	[%i0+16], %d4
3685	add	%i0, 24, %i0
3686	sub	%i2, 24, %i2
3687	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3688	and	%i2, 0x7f, %i2		! residue bytes in %i2
3689	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3690.co_aln_101_loop:
3691	ldda	[%i0]ASI_BLK_P,%d16	! block load
3692	subcc	%o3, 64, %o3		! cc consumed by the bgt at the loop bottom
3693	fmovd	%d16, %d6
3694	fmovd	%d18, %d8
3695	fmovd	%d20, %d10
3696	fmovd	%d22, %d12
3697	fmovd	%d24, %d14
3698	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3699	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3700	add	%i0, 64, %i0
3701	fmovd	%d26, %d0		! carry the last three doublewords
3702	fmovd	%d28, %d2
3703	fmovd	%d30, %d4
3704	bgt,pt	%ncc, .co_aln_101_loop
3705	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3706	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3707
	! Flush the carried doublewords.
3708	stda	%d0, [%i1]%asi
3709	stda	%d2, [%i1+8]%asi
3710	stda	%d4, [%i1+16]%asi
3711	ba	.co_remain_stuff
3712	add	%i1, 24, %i1		! (delay slot) account for the 24 bytes just stored
3713	! END OF aln_101
3714
3715.co_aln_100:
3716! Alignment off by 32 bytes
	! Preload four doublewords (%d0-%d6) and advance src by 32. Each
	! pass block-loads 64 bytes into %d16-%d30, moves four doublewords
	! into %d8-%d14, block-stores %d0-%d14, and carries %d24-%d30 into
	! %d0-%d6. The four carried doublewords are stored after the loop.
3717	ldd	[%i0], %d0
3718	ldd	[%i0+8], %d2
3719	ldd	[%i0+16],%d4
3720	ldd	[%i0+24],%d6
3721	add	%i0, 32, %i0
3722	sub	%i2, 32, %i2
3723	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3724	and	%i2, 0x7f, %i2		! residue bytes in %i2
3725	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3726.co_aln_100_loop:
3727	ldda	[%i0]ASI_BLK_P,%d16	! block load
3728	subcc	%o3, 64, %o3		! cc consumed by the bgt at the loop bottom
3729	fmovd	%d16, %d8
3730	fmovd	%d18, %d10
3731	fmovd	%d20, %d12
3732	fmovd	%d22, %d14
3733	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3734	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3735	add	%i0, 64, %i0
3736	fmovd	%d24, %d0		! carry the last four doublewords
3737	fmovd	%d26, %d2
3738	fmovd	%d28, %d4
3739	fmovd	%d30, %d6
3740	bgt,pt	%ncc, .co_aln_100_loop
3741	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3742	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3743
	! Flush the carried doublewords.
3744	stda	%d0, [%i1]%asi
3745	stda	%d2, [%i1+8]%asi
3746	stda	%d4, [%i1+16]%asi
3747	stda	%d6, [%i1+24]%asi
3748	ba	.co_remain_stuff
3749	add	%i1, 32, %i1		! (delay slot) account for the 32 bytes just stored
3750	! END OF aln_100
3751
3752.co_aln_011:
3753! Alignment off by 40 bytes
	! Preload five doublewords (%d0-%d8) and advance src by 40. Each
	! pass block-loads 64 bytes into %d16-%d30, moves three doublewords
	! into %d10-%d14, block-stores %d0-%d14, and carries %d22-%d30 into
	! %d0-%d8. The five carried doublewords are stored after the loop.
3754	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3755	ldd	[%i0], %d0
3756	ldd	[%i0+8], %d2
3757	ldd	[%i0+16], %d4
3758	ldd	[%i0+24], %d6
3759	ldd	[%i0+32], %d8
3760	add	%i0, 40, %i0
3761	sub	%i2, 40, %i2
3762	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3763	and	%i2, 0x7f, %i2		! residue bytes in %i2
3764	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3765.co_aln_011_loop:
3766	ldda	[%i0]ASI_BLK_P,%d16	! block load
3767	subcc	%o3, 64, %o3		! cc consumed by the bgt at the loop bottom
3768	fmovd	%d16, %d10
3769	fmovd	%d18, %d12
3770	fmovd	%d20, %d14
3771	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3772	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3773	add	%i0, 64, %i0
3774	fmovd	%d22, %d0		! carry the last five doublewords
3775	fmovd	%d24, %d2
3776	fmovd	%d26, %d4
3777	fmovd	%d28, %d6
3778	fmovd	%d30, %d8
3779	bgt,pt	%ncc, .co_aln_011_loop
3780	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3781	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3782
	! Flush the carried doublewords.
3783	stda	%d0, [%i1]%asi
3784	stda	%d2, [%i1+8]%asi
3785	stda	%d4, [%i1+16]%asi
3786	stda	%d6, [%i1+24]%asi
3787	stda	%d8, [%i1+32]%asi
3788	ba	.co_remain_stuff
3789	add	%i1, 40, %i1		! (delay slot) account for the 40 bytes just stored
3790	! END OF aln_011
3791
3792.co_aln_010:
3793! Alignment off by 48 bytes
	! Preload six doublewords (%d0-%d10) and advance src by 48. Each
	! pass block-loads 64 bytes into %d16-%d30, moves two doublewords
	! into %d12-%d14, block-stores %d0-%d14, and carries %d20-%d30 into
	! %d0-%d10. The six carried doublewords are stored after the loop.
3794	ldd	[%i0], %d0
3795	ldd	[%i0+8], %d2
3796	ldd	[%i0+16], %d4
3797	ldd	[%i0+24], %d6
3798	ldd	[%i0+32], %d8
3799	ldd	[%i0+40], %d10
3800	add	%i0, 48, %i0
3801	sub	%i2, 48, %i2
3802	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3803	and	%i2, 0x7f, %i2		! residue bytes in %i2
3804	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3805.co_aln_010_loop:
3806	ldda	[%i0]ASI_BLK_P,%d16	! block load
3807	subcc	%o3, 64, %o3		! cc consumed by the bgt at the loop bottom
3808	fmovd	%d16, %d12
3809	fmovd	%d18, %d14
3810	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3811	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3812	add	%i0, 64, %i0
3813	fmovd	%d20, %d0		! carry the last six doublewords
3814	fmovd	%d22, %d2
3815	fmovd	%d24, %d4
3816	fmovd	%d26, %d6
3817	fmovd	%d28, %d8
3818	fmovd	%d30, %d10
3819	bgt,pt	%ncc, .co_aln_010_loop
3820	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3821	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3822
	! Flush the carried doublewords.
3823	stda	%d0, [%i1]%asi
3824	stda	%d2, [%i1+8]%asi
3825	stda	%d4, [%i1+16]%asi
3826	stda	%d6, [%i1+24]%asi
3827	stda	%d8, [%i1+32]%asi
3828	stda	%d10, [%i1+40]%asi
3829	ba	.co_remain_stuff
3830	add	%i1, 48, %i1		! (delay slot) account for the 48 bytes just stored
3831	! END OF aln_010
3832
3833.co_aln_001:
3834! Alignment off by 56 bytes
	! Preload seven doublewords (%d0-%d12) and advance src by 56. Each
	! pass block-loads 64 bytes into %d16-%d30, moves one doubleword
	! into %d14, block-stores %d0-%d14, and carries %d18-%d30 into
	! %d0-%d12. The seven carried doublewords are stored after the loop.
3835	ldd	[%i0], %d0
3836	ldd	[%i0+8], %d2
3837	ldd	[%i0+16], %d4
3838	ldd	[%i0+24], %d6
3839	ldd	[%i0+32], %d8
3840	ldd	[%i0+40], %d10
3841	ldd	[%i0+48], %d12
3842	add	%i0, 56, %i0
3843	sub	%i2, 56, %i2
3844	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3845	and	%i2, 0x7f, %i2		! residue bytes in %i2
3846	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3847.co_aln_001_loop:
3848	ldda	[%i0]ASI_BLK_P,%d16	! block load
3849	subcc	%o3, 64, %o3		! cc consumed by the bgt at the loop bottom
3850	fmovd	%d16, %d14
3851	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3852	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3853	add	%i0, 64, %i0
3854	fmovd	%d18, %d0		! carry the last seven doublewords
3855	fmovd	%d20, %d2
3856	fmovd	%d22, %d4
3857	fmovd	%d24, %d6
3858	fmovd	%d26, %d8
3859	fmovd	%d28, %d10
3860	fmovd	%d30, %d12
3861	bgt,pt	%ncc, .co_aln_001_loop
3862	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3863	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3864
	! Flush the carried doublewords.
3865	stda	%d0, [%i1]%asi
3866	stda	%d2, [%i1+8]%asi
3867	stda	%d4, [%i1+16]%asi
3868	stda	%d6, [%i1+24]%asi
3869	stda	%d8, [%i1+32]%asi
3870	stda	%d10, [%i1+40]%asi
3871	stda	%d12, [%i1+48]%asi
3872	ba	.co_remain_stuff
3873	add	%i1, 56, %i1		! (delay slot) account for the 56 bytes just stored
3874	! END OF aln_001
3875
3876.co_aln_000:
	! Reached when bits 5:3 of src are all zero (see the dispatch at
	! .co_al_to_64): no preload or register shuffling is needed, so
	! each pass block-loads 64 bytes straight into %d0-%d14 and
	! block-stores them. Falls through into .co_remain_stuff.
3877	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3878	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
3879	and	%i2, 0x7f, %i2		! residue bytes in %i2
3880	sub	%i1, %i0, %i1		! %i1 = dst - src, so [%i0+%i1] addresses dst
3881.co_aln_000_loop:
3882	ldda	[%i0]ASI_BLK_P,%d0
3883	subcc	%o3, 64, %o3		! cc consumed by the bgt below
3884	stxa	%g0,[%i0+%i1]ASI_STBI_AIUS	! block initializing store
3885	stda	%d0,[%i0+%i1]ASI_BLK_AIUS
3886	add	%i0, 64, %i0
3887	bgt,pt	%ncc, .co_aln_000_loop
3888	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
3889	add	%i1, %i0, %i1		! turn %i1 back into the dst pointer
3890
3891	! END OF aln_000
3892
3893.co_remain_stuff:
	! Copy the residue left in %i2 after the block loops, using ldx
	! from src (%i0) and stxa through %asi to dst (%i1): a 32-byte
	! loop, then an 8-byte loop, then at most one 4-byte move.
	! %i2 is biased by -31 and then -7 so each subcc/addcc result can
	! be tested directly. Anything under 4 bytes is handed to
	! .co_unaln3x; a zero remainder goes to .co_exit.
3894	subcc	%i2, 31, %i2		! adjust length to allow cc test
3895	ble,pt	%ncc, .co_aln_31
3896	nop
3897.co_aln_32:
3898	ldx	[%i0], %o4		! move 32 bytes
3899	subcc	%i2, 32, %i2		! decrement length count by 32
3900	stxa	%o4, [%i1]%asi
3901	ldx	[%i0+8], %o4
3902	stxa	%o4, [%i1+8]%asi
3903	ldx	[%i0+16], %o4
3904	add	%i0, 32, %i0		! increase src ptr by 32
3905	stxa	%o4, [%i1+16]%asi
3906	ldx	[%i0-8], %o4		! fourth doubleword (src already advanced)
3907	add	%i1, 32, %i1		! increase dst ptr by 32
3908	bgu,pt	%ncc, .co_aln_32	! repeat if at least 32 bytes left
3909	stxa	%o4, [%i1-8]%asi	! (delay slot) store fourth doubleword
3910.co_aln_31:
3911	addcc	%i2, 24, %i2		! adjust count to be off by 7
3912	ble,pt	%ncc, .co_aln_7		! skip if 7 or fewer bytes left
3913	nop				!
3914.co_aln_15:
3915	ldx	[%i0], %o4		! move 8 bytes
3916	add	%i0, 8, %i0		! increase src ptr by 8
3917	subcc	%i2, 8, %i2		! decrease count by 8
3918	add	%i1, 8, %i1		! increase dst ptr by 8
3919	bgu,pt	%ncc, .co_aln_15
3920	stxa	%o4, [%i1-8]%asi	! (delay slot) store the doubleword
3921.co_aln_7:
3922	addcc	%i2, 7, %i2		! finish adjustment of remaining count
3923	bz,pt	%ncc, .co_exit		! exit if finished
3924	cmp	%i2, 4			! (delay slot) sets cc for the blt below
3925	blt,pt	%ncc, .co_unaln3x	! skip if less than 4 bytes left
3926	nop				!
3927	ld	[%i0], %o4		! move 4 bytes
3928	add	%i0, 4, %i0		! increase src ptr by 4
3929	add	%i1, 4, %i1		! increase dst ptr by 4
3930	subcc	%i2, 4, %i2		! decrease count by 4
3931	bnz	.co_unaln3x
3932	stwa	%o4, [%i1-4]%asi	! (delay slot) store the word either way
3933	ba	.co_exit
3934	nop
3935
3936	! destination alignment code
3937.co_big_d1:
	! Destination alignment helpers for the large-copy path. Each one
	! moves 1, 2 or 4 bytes built from single-byte loads (so src
	! alignment does not matter), stores to user dst with ASI_USER,
	! and then either returns to the matching .co_big_d?f label in
	! the alignment checks or falls through to the next helper.
3938	ldub	[%i0], %o4		! move a byte
3939	add	%i0, 1, %i0
3940	stba	%o4, [%i1]ASI_USER
3941	add	%i1, 1, %i1
3942	andcc	%i1, 2, %o3		! does dst still need a halfword move?
3943	bz,pt	%ncc, .co_big_d2f
3944	sub	%i2, 1, %i2		! (delay slot, always executed) count -= 1
3945.co_big_d2:
3946	ldub	[%i0], %o4		! move a half-word (src align unknown)
3947	ldub	[%i0+1], %o3
3948	add	%i0, 2, %i0
3949	sll	%o4, 8, %o4		! position
3950	or	%o4, %o3, %o4		! merge
3951	stha	%o4, [%i1]ASI_USER
3952	add	%i1, 2, %i1
3953	andcc	%i1, 4, %o3		! is dest longword aligned
3954	bz,pt	%ncc, .co_big_d4f
3955	sub	%i2, 2, %i2		! (delay slot, always executed) count -= 2
3956.co_big_d4:				! dest is at least word aligned
	! When entered via the bnz in the alignment checks, that branch's
	! delay slot is the andcc at .co_big_d4f, so execution here starts
	! at the nop.
3957	nop
3958	ldub	[%i0], %o4		! move a word (src align unknown)
3959	ldub	[%i0+1], %o3
3960	sll	%o4, 24, %o4		! position
3961	sll	%o3, 16, %o3		! position
3962	or	%o4, %o3, %o3		! merge
3963	ldub	[%i0+2], %o4
3964	sll	%o4, 8, %o4		! position
3965	or	%o4, %o3, %o3		! merge
3966	ldub	[%i0+3], %o4
3967	or	%o4, %o3, %o4		! merge
3968	stwa	%o4,[%i1]ASI_USER	! store four bytes
3969	add	%i0, 4, %i0		! adjust src by 4
3970	add	%i1, 4, %i1		! adjust dest by 4
3971	ba	.co_big_d4f
3972	sub	%i2, 4, %i2		! adjust count by 4
3973
3974
3975	! Dst is on 8 byte boundary; src is not;
3976.co_big_unal8:
	! If dst (%i1) is already 64-byte aligned go straight to
	! .co_unalnsrc. Otherwise compute in %o3 the number of bytes
	! needed to reach the next 64-byte boundary, subtract it from the
	! count, and move that many bytes with the loop that matches the
	! source alignment (byte, halfword, or word).
3977	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
3978	bz	%ncc, .co_unalnsrc
3979	sub	%o3, 64, %o3		! %o3 will be multiple of 8
3980	neg	%o3			! bytes until dest is 64 byte aligned
3981	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
3982	! Move bytes according to source alignment
3983	andcc	%i0, 0x1, %o4
3984	bnz	%ncc, .co_unalnbyte	! check for byte alignment
3985	nop
3986	andcc	%i0, 2, %o4		! check for half word alignment
3987	bnz	%ncc, .co_unalnhalf
3988	nop
3989	! Src is word aligned, move bytes until dest 64 byte aligned
3990.co_unalnword:
3991	ld	[%i0], %o4		! load 4 bytes
3992	stwa	%o4, [%i1]%asi		! and store 4 bytes
3993	ld	[%i0+4], %o4		! load 4 bytes
3994	add	%i0, 8, %i0		! increase src ptr by 8
3995	stwa	%o4, [%i1+4]%asi	! and store 4 bytes
3996	subcc	%o3, 8, %o3		! decrease count by 8
3997	bnz	%ncc, .co_unalnword
3998	add	%i1, 8, %i1		! increase dst ptr by 8
3999	ba	.co_unalnsrc
4000	nop
4001
4002	! Src is half-word aligned, move bytes until dest 64 byte aligned
4003.co_unalnhalf:
	! Moves %o3 bytes, 8 per pass, building each doubleword in %i3 as
	! halfword + word (src+2) + halfword (src+6) and storing it to
	! user dst with ASI_USER. Continues at .co_unalnsrc when %o3
	! reaches zero.
4004	lduh	[%i0], %o4		! load 2 bytes
4005	sllx	%o4, 32, %i3		! shift left
4006	lduw	[%i0+2], %o4
4007	or	%o4, %i3, %i3		! %i3 = halfword:word (48 bits)
4008	sllx	%i3, 16, %i3		! make room for the final halfword
4009	lduh	[%i0+6], %o4
4010	or	%o4, %i3, %i3
4011	stxa	%i3, [%i1]ASI_USER
4012	add	%i0, 8, %i0
4013	subcc	%o3, 8, %o3
4014	bnz	%ncc, .co_unalnhalf
4015	add	%i1, 8, %i1		! (delay slot, always executed) dst += 8
4016	ba	.co_unalnsrc
4017	nop
4018
4019	! Src is Byte aligned, move bytes until dest 64 byte aligned
4020.co_unalnbyte:
	! Moves %o3 bytes, 8 per pass, building each doubleword in %i3 as
	! byte + three halfwords (src+1, +3, +5) + byte (src+7). %i1 is
	! temporarily turned into (dst - src) so only %i0 has to be
	! advanced inside the loop. Falls through into .co_unalnsrc.
4021	sub	%i1, %i0, %i1		! share pointer advance
4022.co_unalnbyte_loop:
4023	ldub	[%i0], %o4
4024	sllx	%o4, 56, %i3		! byte -> bits 63:56
4025	lduh	[%i0+1], %o4
4026	sllx	%o4, 40, %o4		! halfword -> bits 55:40
4027	or	%o4, %i3, %i3
4028	lduh	[%i0+3], %o4
4029	sllx	%o4, 24, %o4		! halfword -> bits 39:24
4030	or	%o4, %i3, %i3
4031	lduh	[%i0+5], %o4
4032	sllx	%o4, 8, %o4		! halfword -> bits 23:8
4033	or	%o4, %i3, %i3
4034	ldub	[%i0+7], %o4		! last byte -> bits 7:0
4035	or	%o4, %i3, %i3
4036	stxa	%i3, [%i1+%i0]ASI_USER
4037	subcc	%o3, 8, %o3
4038	bnz	%ncc, .co_unalnbyte_loop
4039	add	%i0, 8, %i0		! (delay slot, always executed) src += 8
4040	add	%i1,%i0, %i1		! restore pointer
4041
4042	! Destination is now block (64 byte aligned), src is not 8 byte aligned
4043.co_unalnsrc:
	! Set up the faligndata block loops for a 64-byte aligned dst and
	! a src that is not 8-byte aligned:
	!   %i3 = bytes to move in whole 64-byte blocks (reduced by one
	!         block, which is added to the residue in %i2 instead),
	!   %o4 = src rounded down to a 64-byte boundary (the address the
	!         block loads start from),
	!   %gsr alignment field set from the original src by alignaddr,
	!   %i0 = src advanced past the blocks, for the residue copy.
	! Then dispatch on bits 5:3 of src to .co_unaln_XYZ. Since %i3 is
	! a multiple of 64, those bits are the same before and after the
	! advance of %i0.
	! All prefetches here are based on %o4, where the loads begin;
	! %i0 already points past the block region and is not a useful
	! prefetch base at this point.
4044	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
4045	and	%i2, 0x3f, %i2		! residue bytes in %i2
4046	add	%i2, 64, %i2		! Insure we don't load beyond
4047	sub	%i3, 64, %i3		! end of source buffer
4048
4049	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
4050	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
4051	alignaddr %i0, %g0, %g0		! generate %gsr
4052	add	%i0, %i3, %i0		! advance %i0 to after blocks
4053	!
4054	! Determine source alignment to correct 8 byte offset
4055	andcc	%i0, 0x20, %o3
4056	brnz,pn	%o3, .co_unaln_1
4057	andcc	%i0, 0x10, %o3
4058	brnz,pn	%o3, .co_unaln_01
4059	andcc	%i0, 0x08, %o3
4060	brz,a	%o3, .co_unaln_000
4061	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4062	ba	.co_unaln_001
4063	nop
4064.co_unaln_01:
	! bit 5 clear, bit 4 set; %o3 holds bit 3
4065	brnz,a	%o3, .co_unaln_011
4066	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4067	ba	.co_unaln_010
4068	nop
4069.co_unaln_1:
	! bit 5 set; %o3 holds bit 4 (tested in the delay slot above)
4070	brnz,pn	%o3, .co_unaln_11
4071	andcc	%i0, 0x08, %o3
4072	brnz,a	%o3, .co_unaln_101
4073	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4074	ba	.co_unaln_100
4075	nop
4076.co_unaln_11:
	! bits 5 and 4 set; %o3 holds bit 3. Falls through to
	! .co_unaln_111 when bit 3 is set. The delay-slot prefetch is not
	! annulled, so it runs on both the 110 and 111 paths.
4077	brz,pn	%o3, .co_unaln_110
4078	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4079
4080.co_unaln_111:
	! Source starts in the last doubleword of its 64-byte block:
	! preload that doubleword (%o4+56) into %d14. Each pass block-loads
	! the next 64 bytes into %d16-%d30, uses faligndata (alignment
	! taken from %gsr) on consecutive register pairs starting at
	! %d14 to produce eight output doublewords in %d48-%d62,
	! block-stores them to user dst, and carries %d30 into %d14.
4081	ldd	[%o4+56], %d14
4082.co_unaln_111_loop:
4083	add	%o4, 64, %o4
4084	ldda	[%o4]ASI_BLK_P, %d16
4085	faligndata %d14, %d16, %d48
4086	faligndata %d16, %d18, %d50
4087	faligndata %d18, %d20, %d52
4088	faligndata %d20, %d22, %d54
4089	faligndata %d22, %d24, %d56
4090	faligndata %d24, %d26, %d58
4091	faligndata %d26, %d28, %d60
4092	faligndata %d28, %d30, %d62
4093	fmovd	%d30, %d14		! carry last doubleword to next pass
4094	stda	%d48, [%i1]ASI_BLK_AIUS
4095	subcc	%i3, 64, %i3		! one block done
4096	add	%i1, 64, %i1
4097	bgu,pt	%ncc, .co_unaln_111_loop
4098	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4099	ba	.co_unaln_done
4100	nop
4101
4102.co_unaln_110:
	! Preload the last two doublewords of the current source block
	! (%o4+48, +56) into %d12/%d14. Each pass block-loads the next 64
	! bytes into %d16-%d30, runs faligndata over consecutive pairs
	! starting at %d12 to fill %d48-%d62, block-stores them to user
	! dst, and carries %d28/%d30 into %d12/%d14.
4103	ldd	[%o4+48], %d12
4104	ldd	[%o4+56], %d14
4105.co_unaln_110_loop:
4106	add	%o4, 64, %o4
4107	ldda	[%o4]ASI_BLK_P, %d16
4108	faligndata %d12, %d14, %d48
4109	faligndata %d14, %d16, %d50
4110	faligndata %d16, %d18, %d52
4111	faligndata %d18, %d20, %d54
4112	faligndata %d20, %d22, %d56
4113	faligndata %d22, %d24, %d58
4114	faligndata %d24, %d26, %d60
4115	faligndata %d26, %d28, %d62
4116	fmovd	%d28, %d12		! carry the last two doublewords
4117	fmovd	%d30, %d14
4118	stda	%d48, [%i1]ASI_BLK_AIUS
4119	subcc	%i3, 64, %i3		! one block done
4120	add	%i1, 64, %i1
4121	bgu,pt	%ncc, .co_unaln_110_loop
4122	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4123	ba	.co_unaln_done
4124	nop
4125
4126.co_unaln_101:
	! Preload the last three doublewords of the current source block
	! (%o4+40 .. +56) into %d10-%d14. Each pass block-loads the next
	! 64 bytes into %d16-%d30, runs faligndata over consecutive pairs
	! starting at %d10 to fill %d48-%d62, block-stores them to user
	! dst, and carries %d26-%d30 into %d10-%d14.
4127	ldd	[%o4+40], %d10
4128	ldd	[%o4+48], %d12
4129	ldd	[%o4+56], %d14
4130.co_unaln_101_loop:
4131	add	%o4, 64, %o4
4132	ldda	[%o4]ASI_BLK_P, %d16
4133	faligndata %d10, %d12, %d48
4134	faligndata %d12, %d14, %d50
4135	faligndata %d14, %d16, %d52
4136	faligndata %d16, %d18, %d54
4137	faligndata %d18, %d20, %d56
4138	faligndata %d20, %d22, %d58
4139	faligndata %d22, %d24, %d60
4140	faligndata %d24, %d26, %d62
4141	fmovd	%d26, %d10		! carry the last three doublewords
4142	fmovd	%d28, %d12
4143	fmovd	%d30, %d14
4144	stda	%d48, [%i1]ASI_BLK_AIUS
4145	subcc	%i3, 64, %i3		! one block done
4146	add	%i1, 64, %i1
4147	bgu,pt	%ncc, .co_unaln_101_loop
4148	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4149	ba	.co_unaln_done
4150	nop
4151
4152.co_unaln_100:
	! Preload the last four doublewords of the current source block
	! (%o4+32 .. +56) into %d8-%d14. Each pass block-loads the next
	! 64 bytes into %d16-%d30, runs faligndata over consecutive pairs
	! starting at %d8 to fill %d48-%d62, block-stores them to user
	! dst, and carries %d24-%d30 into %d8-%d14.
4153	ldd	[%o4+32], %d8
4154	ldd	[%o4+40], %d10
4155	ldd	[%o4+48], %d12
4156	ldd	[%o4+56], %d14
4157.co_unaln_100_loop:
4158	add	%o4, 64, %o4
4159	ldda	[%o4]ASI_BLK_P, %d16
4160	faligndata %d8, %d10, %d48
4161	faligndata %d10, %d12, %d50
4162	faligndata %d12, %d14, %d52
4163	faligndata %d14, %d16, %d54
4164	faligndata %d16, %d18, %d56
4165	faligndata %d18, %d20, %d58
4166	faligndata %d20, %d22, %d60
4167	faligndata %d22, %d24, %d62
4168	fmovd	%d24, %d8		! carry the last four doublewords
4169	fmovd	%d26, %d10
4170	fmovd	%d28, %d12
4171	fmovd	%d30, %d14
4172	stda	%d48, [%i1]ASI_BLK_AIUS
4173	subcc	%i3, 64, %i3		! one block done
4174	add	%i1, 64, %i1
4175	bgu,pt	%ncc, .co_unaln_100_loop
4176	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4177	ba	.co_unaln_done
4178	nop
4179
4180.co_unaln_011:
	! Preload the last five doublewords of the current source block
	! (%o4+24 .. +56) into %d6-%d14. Each pass block-loads the next
	! 64 bytes into %d16-%d30, runs faligndata over consecutive pairs
	! starting at %d6 to fill %d48-%d62, block-stores them to user
	! dst, and carries %d22-%d30 into %d6-%d14.
4181	ldd	[%o4+24], %d6
4182	ldd	[%o4+32], %d8
4183	ldd	[%o4+40], %d10
4184	ldd	[%o4+48], %d12
4185	ldd	[%o4+56], %d14
4186.co_unaln_011_loop:
4187	add	%o4, 64, %o4
4188	ldda	[%o4]ASI_BLK_P, %d16
4189	faligndata %d6, %d8, %d48
4190	faligndata %d8, %d10, %d50
4191	faligndata %d10, %d12, %d52
4192	faligndata %d12, %d14, %d54
4193	faligndata %d14, %d16, %d56
4194	faligndata %d16, %d18, %d58
4195	faligndata %d18, %d20, %d60
4196	faligndata %d20, %d22, %d62
4197	fmovd	%d22, %d6		! carry the last five doublewords
4198	fmovd	%d24, %d8
4199	fmovd	%d26, %d10
4200	fmovd	%d28, %d12
4201	fmovd	%d30, %d14
4202	stda	%d48, [%i1]ASI_BLK_AIUS
4203	subcc	%i3, 64, %i3		! one block done
4204	add	%i1, 64, %i1
4205	bgu,pt	%ncc, .co_unaln_011_loop
4206	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4207	ba	.co_unaln_done
4208	nop
4209
4210.co_unaln_010:
	! Preload the last six doublewords of the current source block
	! (%o4+16 .. +56) into %d4-%d14. Each pass block-loads the next
	! 64 bytes into %d16-%d30, runs faligndata over consecutive pairs
	! starting at %d4 to fill %d48-%d62, block-stores them to user
	! dst, and carries %d20-%d30 into %d4-%d14.
4211	ldd	[%o4+16], %d4
4212	ldd	[%o4+24], %d6
4213	ldd	[%o4+32], %d8
4214	ldd	[%o4+40], %d10
4215	ldd	[%o4+48], %d12
4216	ldd	[%o4+56], %d14
4217.co_unaln_010_loop:
4218	add	%o4, 64, %o4
4219	ldda	[%o4]ASI_BLK_P, %d16
4220	faligndata %d4, %d6, %d48
4221	faligndata %d6, %d8, %d50
4222	faligndata %d8, %d10, %d52
4223	faligndata %d10, %d12, %d54
4224	faligndata %d12, %d14, %d56
4225	faligndata %d14, %d16, %d58
4226	faligndata %d16, %d18, %d60
4227	faligndata %d18, %d20, %d62
4228	fmovd	%d20, %d4		! carry the last six doublewords
4229	fmovd	%d22, %d6
4230	fmovd	%d24, %d8
4231	fmovd	%d26, %d10
4232	fmovd	%d28, %d12
4233	fmovd	%d30, %d14
4234	stda	%d48, [%i1]ASI_BLK_AIUS
4235	subcc	%i3, 64, %i3		! one block done
4236	add	%i1, 64, %i1
4237	bgu,pt	%ncc, .co_unaln_010_loop
4238	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
4239	ba	.co_unaln_done
4240	nop
4241
4242.co_unaln_001:
4243	ldd	[%o4+8], %d2
4244	ldd	[%o4+16], %d4
4245	ldd	[%o4+24], %d6
4246	ldd	[%o4+32], %d8
4247	ldd	[%o4+40], %d10
4248	ldd	[%o4+48], %d12
4249	ldd	[%o4+56], %d14
4250.co_unaln_001_loop:
4251	add	%o4, 64, %o4
4252	ldda	[%o4]ASI_BLK_P, %d16
4253	faligndata %d2, %d4, %d48
4254	faligndata %d4, %d6, %d50
4255	faligndata %d6, %d8, %d52
4256	faligndata %d8, %d10, %d54
4257	faligndata %d10, %d12, %d56
4258	faligndata %d12, %d14, %d58
4259	faligndata %d14, %d16, %d60
4260	faligndata %d16, %d18, %d62
4261	fmovd	%d18, %d2
4262	fmovd	%d20, %d4