/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */


#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#include "assym.h"

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 *	! WARNING : <Register usage convention>
 *	! In kcopy(), %o5 holds the previous error handler and the
 *	! LOFAULT_SET flag (low bits). %o5 is null in bcopy().
 *	! %o5 is not available for any other use.
 *
 * On entry:
 *	! Determine whether to use the FP register version or
 *	! the leaf routine version depending on the size of the copy.
 *	! Set up error handling accordingly.
 *	! The transition point depends on FP_COPY.
 *	! For both versions %o5 is reserved.
 *
 * kcopy():
 *	if(length > FP_COPY)
 *		go to regular_kcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_kcopy:
 *	save_registers()
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * bcopy():
 *	if(length > FP_COPY)
 *		go to regular_bcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_bcopy:
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * small_bcopy:
 *	! handle copies smaller than FP_COPY
 *	restore t_lofault handler
 *	exit
 *
 * do_copy:
 *	! handle copies larger than FP_COPY
 *	save fp_regs
 * 	blockcopy;
 *	restore fp_regs
 *	restore t_lofault handler if came from kcopy();
 *
 *
 * In leaf lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	return (errno)
 *
 * In lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	restore fp_regs
 *	return (errno)
 *
 *
 *
 * For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL))
 *
 * N2/RF Flow :
 *
 * if (count < FP_COPY) {  (584 bytes)
 *   set small fault handler (no register window save/restore)
 *   if count < SHORTCOPY  (7 bytes)
 *	copy bytes; go to short_exit
 *   else
 *   determine dst alignment, move minimum bytes/halfwords to
 *   get dst aligned on long word boundary
 *     if( src is on long word boundary ) {
 * medlong:					   src/dst aligned on 8 bytes
 *	 copy with ldx/stx in 4-way unrolled loop;
 *       copy final 0-31 bytes; go to short_exit
 *     } else {					src/dst not aligned on 8 bytes
 *     if src is word aligned, ld/st words in 32-byte chunks
 *     if src is half word aligned, ld half, ld word, ld half; pack
 *		into long word, store long words in 32-byte chunks
 *     if src is byte aligned, ld byte,half,word parts;  pack into long
 *	   word, store long words in 32-byte chunks
 *     move final 0-31 bytes according to src alignment;  go to short_exit
 * short_exit:
 *     restore trap handler if needed, retl
 * else {					   More than FP_COPY bytes
 *     set fault handler
 *     disable kernel preemption
 *     save registers, save FP registers if in use
 *     move bytes to align destination register on long word boundary
 *     if(src is on long word boundary) {	   src/dst aligned on 8 bytes
 *       align dst on 64 byte boundary;  use 8-way test for each of 8 possible
 *       src alignments relative to a 64 byte boundary to select the
 *       16-way unrolled loop (128 bytes) to use for
 *       block load, fmovd, block-init-store, block-store, fmovd operations
 *       then go to remain_stuff.
 * remain_stuff: move remaining bytes. go to long_exit
 *     } else {
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; use 8-way test for each of 8 possible
 *       src alignments to nearest long word relative to 64 byte boundary to
 *       select the 8-way unrolled loop (64 bytes) to use for
 *       block load, falign, fmovd, block-store loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 *       goto unalign_done.
 * unalign_done:
 *       move remaining bytes for unaligned cases. go to long_exit
 * long_exit:
 *       restore %gsr, FP regs (either from stack or set to zero),
 *       restore trap handler, check for kernel preemption request,
 *       handle if needed, ret.
 * }
 *
 * Other platforms include hw_bcopy_limit_[1248] to control the exact
 * point where the FP register code is used. On those platforms, the
 * FP register code did not leave data in the L2 cache, potentially
 * affecting performance more than the gain/loss from the algorithm
 * difference. For N2/RF, block store places data in the L2 cache, so
 * use or non-use of the FP registers has no effect on L2 cache behavior.
 * The cost of testing hw_bcopy_limit_* for the different alignments
 * exceeds 50 cycles in all cases, even when the hw_bcopy_limits were
 * not used. That cost was judged too high relative to the benefits,
 * so the hw_bcopy_limit option is omitted from this code.
 */
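/*
 * The dispatch described above, restated as a brief C-style sketch
 * (illustrative only; leaf_copy() and fp_block_copy() are hypothetical
 * stand-ins for the leaf and FP paths, not functions in this file):
 *
 *	int
 *	kcopy(const void *from, void *to, size_t count)
 *	{
 *		if (count <= FP_COPY)		! leaf: no window, no FP save
 *			return (leaf_copy(from, to, count));
 *		return (fp_block_copy(from, to, count));	! VIS path
 *	}
 */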

/*
 * If the size is less than or equal to this number of bytes,
 * we will always copy byte-for-byte.
 */
#define	SMALL_LIMIT	7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that a t_lofault
 * handler was set
 */
#define	LOFAULT_SET 2

/*
 * This macro aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2;
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1				;\
	sllx	data2, lshift, data2				;\
	srlx	data3, rshift, tmp				;\
	or	data2, tmp, data2
/*
 * This macro aligns the data by merging data1 and data2
 * to form a double word.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1
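/*
 * In C terms the merge performed by the macros above is, for 64-bit
 * operands with lshift + rshift == 64 (a sketch, not code from this
 * file):
 *
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 *
 * i.e. the tail of one doubleword is joined with the head of the next
 * to produce one aligned doubleword.
 */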

#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define	FPUSED_FLAG	1
#define	LOFAULT_SET	2
#define	COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)
#define	KPREEMPT_FLAG	4
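/*
 * Flag packing works because the handler address stored in t_lofault
 * is at least 4-byte aligned (it is an instruction address), leaving
 * the two low bits free. A hedged C sketch of the technique:
 *
 *	t_lofault = (uintptr_t)handler | FPUSED_FLAG | LOFAULT_SET;
 *	...
 *	handler = (void *)(t_lofault & ~COPY_FLAGS);	! strip flags
 *
 * KPREEMPT_FLAG is kept in a scratch register (%l1), not in t_lofault.
 */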

#define	ALIGN_OFF_1_7			\
	faligndata %d0, %d2, %d48	;\
	faligndata %d2, %d4, %d50	;\
	faligndata %d4, %d6, %d52	;\
	faligndata %d6, %d8, %d54	;\
	faligndata %d8, %d10, %d56	;\
	faligndata %d10, %d12, %d58	;\
	faligndata %d12, %d14, %d60	;\
	faligndata %d14, %d16, %d62

#define	ALIGN_OFF_8_15			\
	faligndata %d2, %d4, %d48	;\
	faligndata %d4, %d6, %d50	;\
	faligndata %d6, %d8, %d52	;\
	faligndata %d8, %d10, %d54	;\
	faligndata %d10, %d12, %d56	;\
	faligndata %d12, %d14, %d58	;\
	faligndata %d14, %d16, %d60	;\
	faligndata %d16, %d18, %d62

#define	ALIGN_OFF_16_23			\
	faligndata %d4, %d6, %d48	;\
	faligndata %d6, %d8, %d50	;\
	faligndata %d8, %d10, %d52	;\
	faligndata %d10, %d12, %d54	;\
	faligndata %d12, %d14, %d56	;\
	faligndata %d14, %d16, %d58	;\
	faligndata %d16, %d18, %d60	;\
	faligndata %d18, %d20, %d62

#define	ALIGN_OFF_24_31			\
	faligndata %d6, %d8, %d48	;\
	faligndata %d8, %d10, %d50	;\
	faligndata %d10, %d12, %d52	;\
	faligndata %d12, %d14, %d54	;\
	faligndata %d14, %d16, %d56	;\
	faligndata %d16, %d18, %d58	;\
	faligndata %d18, %d20, %d60	;\
	faligndata %d20, %d22, %d62

#define	ALIGN_OFF_32_39			\
	faligndata %d8, %d10, %d48	;\
	faligndata %d10, %d12, %d50	;\
	faligndata %d12, %d14, %d52	;\
	faligndata %d14, %d16, %d54	;\
	faligndata %d16, %d18, %d56	;\
	faligndata %d18, %d20, %d58	;\
	faligndata %d20, %d22, %d60	;\
	faligndata %d22, %d24, %d62

#define	ALIGN_OFF_40_47			\
	faligndata %d10, %d12, %d48	;\
	faligndata %d12, %d14, %d50	;\
	faligndata %d14, %d16, %d52	;\
	faligndata %d16, %d18, %d54	;\
	faligndata %d18, %d20, %d56	;\
	faligndata %d20, %d22, %d58	;\
	faligndata %d22, %d24, %d60	;\
	faligndata %d24, %d26, %d62

#define	ALIGN_OFF_48_55			\
	faligndata %d12, %d14, %d48	;\
	faligndata %d14, %d16, %d50	;\
	faligndata %d16, %d18, %d52	;\
	faligndata %d18, %d20, %d54	;\
	faligndata %d20, %d22, %d56	;\
	faligndata %d22, %d24, %d58	;\
	faligndata %d24, %d26, %d60	;\
	faligndata %d26, %d28, %d62

#define	ALIGN_OFF_56_63			\
	faligndata %d14, %d16, %d48	;\
	faligndata %d16, %d18, %d50	;\
	faligndata %d18, %d20, %d52	;\
	faligndata %d20, %d22, %d54	;\
	faligndata %d22, %d24, %d56	;\
	faligndata %d24, %d26, %d58	;\
	faligndata %d26, %d28, %d60	;\
	faligndata %d28, %d30, %d62

/*
 * FP_COPY indicates the minimum number of bytes needed
 * to justify using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing FP_COPY below 256.
 */
#define	FP_COPY			584
#define	SHORTCOPY		7
#define	ASI_STBI_P		ASI_BLK_INIT_ST_QUAD_LDD_P
#define	ASI_STBI_AIUS		ASI_BLK_INIT_QUAD_LDD_AIUS
#define	CACHE_LINE		64
#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to ensure a
 * block-aligned three-block buffer in which to save, we must reserve
 * four blocks on the stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs		    | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr		    | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
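/*
 * A worked check of the layout above with VIS_BLOCKSIZE = 64 (these
 * values follow from the definitions; no extra space is implied):
 *
 *	HWCOPYFRAMESIZE     = 64 * 4 + 16 = 272
 *	SAVED_FPREGS_OFFSET = 256	! 4 blocks for a 3-block save area
 *	SAVED_FPRS_OFFSET   = 264	! 8 bytes for %fprs
 *	SAVED_GSR_OFFSET    = 272	! 8 bytes for %gsr
 */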

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define	FZERO				\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f16		;\
	fmuld	%f0, %f2, %f18		;\
	faddd	%f0, %f2, %f20		;\
	fmuld	%f0, %f2, %f22		;\
	faddd	%f0, %f2, %f24		;\
	fmuld	%f0, %f2, %f26		;\
	faddd	%f0, %f2, %f28		;\
	fmuld	%f0, %f2, %f30		;\
	faddd	%f0, %f2, %f48		;\
	fmuld	%f0, %f2, %f50		;\
	faddd	%f0, %f2, %f52		;\
	fmuld	%f0, %f2, %f54		;\
	faddd	%f0, %f2, %f56		;\
	fmuld	%f0, %f2, %f58		;\
	faddd	%f0, %f2, %f60		;\
	fmuld	%f0, %f2, %f62

/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define	BST_FP_TOSTACK(tmp1)					\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FP_FROMSTACK(tmp1)					\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
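/*
 * The pointer math in the two macros above, in C terms (a sketch of
 * how a 64-byte aligned, 3-block save area is located inside the
 * 4-block stack reservation):
 *
 *	tmp1 = (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -VIS_BLOCKSIZE;
 *
 * Rounding down discards at most 63 bytes, so tmp1 always lands within
 * the reserved area with 3 * VIS_BLOCKSIZE usable bytes above it.
 */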

#endif /* !NIAGARA_IMPL */

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */
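/*
 * A caller-side sketch of that contract (hypothetical caller, not
 * code from this file):
 *
 *	if ((err = kcopy(from, to, count)) != 0)
 *		return (err);	! errno from the unresolved pagefault
 */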

	.seg	".text"
	.align	4

	ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .kcopy_more		!
	nop
.kcopy_small:					! setup error handler
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.sm_do_copy			! common code
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault


.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during a small kcopy, or during a
 * small bcopy if a fault handler existed when bcopy was called.
 * No floating point registers are used by the small copies.
 * Small copies are from a leaf routine.
 * Errno value is in %g1.
 */
.sm_copyerr:
	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
	btst	LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	andn	%o5, LOFAULT_SET, %o5		! clear fault flag
	bnz,pn	%ncc, 3f
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	mov	%g0, %o0
/*
 *  end of .sm_copyerr
 */

/*
 * We got here because of a fault during kcopy, or during bcopy if a
 * fault handler existed when bcopy was called.
 * The stack and fp registers need to be restored.
 * Errno value is in %g1.
 */
.copyerr:
	sethi	%hi(.copyerr2), %l1
	or	%l1, %lo(.copyerr2), %l1
	membar	#Sync				! sync error barrier
	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %o5
	bz,pt	%xcc, 1f
	and	%o5, LOFAULT_SET, %l1	! copy flag to %l1

	membar	#Sync				! sync error barrier
	wr	%l5, 0, %gsr
	btst	FPRS_FEF, %g5
	bz,pt	%icc, 4f
	nop
	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)
	ba,pt	%ncc, 2f
	wr	%g5, 0, %fprs		! restore fprs
4:
	FZERO
	wr	%g5, 0, %fprs		! restore fprs
2:
	ldn	[THREAD_REG + T_LWP], %o2
	brnz,pt	%o2, 1f
	nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag

	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
1:
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	nop
	call	kpreempt
	rdpr	%pil, %o0	! pass %pil
2:
	btst	LOFAULT_SET, %l1
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop
/*
 *  end of .copyerr
 */

#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
	set	.copyerr, %l7			! copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	or	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
	! into %o5 to indicate it has set t_lofault handler. Need to clear
	! LOFAULT_SET flag before restoring the error handler.
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g1, 0, %o0
#endif	/* NIAGARA_IMPL */

	SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */

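/*
 * For example (a hedged illustration of the non-overlap rule):
 *
 *	bcopy(&buf[0], &buf[16], 16);	! legal: 0 + 16 <= 16
 *	bcopy(&buf[0], &buf[8], 16);	! illegal: regions overlap
 */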
	ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .bcopy_more		!
	nop
.bcopy_small:					! setup error handler
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	tst	%o5
	bz,pt	%icc, .sm_do_copy
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
	or	%o5, LOFAULT_SET, %o5		! Error should trampoline
.sm_do_copy:
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! make sure there is enough to align
	ble,pt	%ncc, .bc_smallest
	andcc	%o1, 0x7, %o3		! is dest long aligned
	bnz,pn	%ncc, .bc_align
	andcc	%o1, 1, %o3		! is dest byte aligned

! Destination is long word aligned
.bc_al_src:
	andcc	%o0, 7, %o3
	brnz,pt	%o3, .bc_src_dst_unal8
	nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes.
 * Also handles the finish-up for large block moves, so there may be
 * fewer than 32 bytes to copy.
 */
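/*
 * In C terms the unrolled loop below is roughly (a sketch assuming
 * uint64_t *src, *dst, both 8-byte aligned):
 *
 *	while (count >= 32) {
 *		dst[0] = src[0]; dst[1] = src[1];	! 4 independent
 *		dst[2] = src[2]; dst[3] = src[3];	! 8-byte moves
 *		src += 4; dst += 4; count -= 32;
 *	}
 */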
.bc_medlong:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medl31
	nop
.bc_medl32:
	ldx	[%o0], %o4		! move 32 bytes
	subcc	%o2, 32, %o2		! decrement length count by 32
	stx	%o4, [%o1]
	ldx	[%o0+8], %o4
	stx	%o4, [%o1+8]
	ldx	[%o0+16], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stx	%o4, [%o1+16]
	ldx	[%o0-8], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medl32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]
.bc_medl31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medl7		! skip if 7 or fewer bytes left
	nop
.bc_medl8:
	ldx	[%o0], %o4		! move 8 bytes
	add	%o0, 8, %o0		! increase src ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medl8
	stx	%o4, [%o1-8]
.bc_medl7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bnz,pt	%ncc, .bc_small4	! do final bytes if not finished

.bc_smallx:				! finish up and exit
	tst	%o5
	bz,pt	%ncc, .bc_sm_done
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	mov	%g0, %o0

.bc_small4:
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bz,pt	%ncc, .bc_smallx
	stw	%o4, [%o1-4]

.bc_small3x:				! Exactly 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2		! reduce count for cc test
	ldub	[%o0], %o4		! load one byte
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1]		! store one byte
	ldub	[%o0+1], %o4		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1+1]		! store second byte
	ldub	[%o0+2], %o4		! load third byte
	ba	.bc_smallx
	stb	%o4, [%o1+2]		! store third byte

.bc_smallest:				! 7 or fewer bytes remain
	tst	%o2
	bz,pt	%ncc, .bc_smallx
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x
	nop
	ldub	[%o0], %o4		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o4, [%o1]		! write byte
	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
	add	%o0, 4, %o0		! advance src by 4
	stb	%o4, [%o1+1]
	ldub	[%o0-2], %o4
	add	%o1, 4, %o1		! advance dst by 4
	stb	%o4, [%o1-2]
	ldub	[%o0-1], %o4
	bnz,pt	%ncc, .bc_small3x
	stb	%o4, [%o1-1]
	ba	.bc_smallx
	nop

/*
 * Align destination to long word boundary
 */
.bc_align:				! byte align test in prior branch delay
	bnz,pt	%ncc, .bc_al_d1
.bc_al_d1f:				! dest is now half word aligned
	andcc	%o1, 2, %o3
	bnz,pt	%ncc, .bc_al_d2
.bc_al_d2f:				! dest is now word aligned
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	nop
.bc_al_d4:				! dest is word aligned;  src is unknown
	ldub	[%o0], %o4		! move a word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4,[%o1]		! store four bytes
	add	%o0, 4, %o0		! adjust src by 4
	add	%o1, 4, %o1		! adjust dest by 4
	sub	%o2, 4, %o2		! adjust count by 4
	andcc	%o0, 7, %o3		! check for src long word alignment
	brz,pt	%o3, .bc_medlong
.bc_src_dst_unal8:
	! dst is 8-byte aligned, src is not
	! Size is less than FP_COPY
	! Following code is to select for alignment
	andcc	%o0, 0x3, %o3		! test word alignment
	bz,pt	%ncc, .bc_medword
	nop
	andcc	%o0, 0x1, %o3		! test halfword alignment
	bnz,pt	%ncc, .bc_med_byte	! go to byte move if not halfword
	andcc	%o0, 0x2, %o3		! test which byte alignment
	ba	.bc_medhalf
	nop
.bc_al_d1:				! align dest to half word
	ldub	[%o0], %o4		! move a byte
	add	%o0, 1, %o0
	stb	%o4, [%o1]
	add	%o1, 1, %o1
	andcc	%o1, 2, %o3
	bz,pt	%ncc, .bc_al_d2f
	sub	%o2, 1, %o2
.bc_al_d2:				! align dest to word
	ldub	[%o0], %o4		! move a half-word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%o1]
	add	%o0, 2, %o0
	add	%o1, 2, %o1
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	sub	%o2, 2, %o2
	ba	.bc_al_d4
	nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for medium
 * to short data moves.
 */
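/*
 * For instance, the half-word case (.bc_medhalf below) packs 2+4+2
 * bytes into each 8-byte store. One such merge, as a C sketch with
 * hypothetical ldh()/ldw() load helpers (big-endian, as on SPARC):
 *
 *	v = ((uint64_t)ldh(src) << 48) | ((uint64_t)ldw(src + 2) << 16) |
 *	    (uint64_t)ldh(src + 6);
 */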
.bc_medword:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medw31
	nop
.bc_medw32:
	ld	[%o0], %o4		! move a block of 32 bytes
	stw	%o4, [%o1]
	ld	[%o0+4], %o4
	stw	%o4, [%o1+4]
	ld	[%o0+8], %o4
	stw	%o4, [%o1+8]
	ld	[%o0+12], %o4
	stw	%o4, [%o1+12]
	ld	[%o0+16], %o4
	stw	%o4, [%o1+16]
	ld	[%o0+20], %o4
	subcc	%o2, 32, %o2		! decrement length count
	stw	%o4, [%o1+20]
	ld	[%o0+24], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stw	%o4, [%o1+24]
	ld	[%o0-4], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medw32	! repeat if at least 32 bytes left
	stw	%o4, [%o1-4]
.bc_medw31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medw7		! skip if 7 or fewer bytes left
	nop				!
.bc_medw15:
	ld	[%o0], %o4		! move a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	stw	%o4, [%o1]
	add	%o0, 8, %o0		! increase src ptr by 8
	ld	[%o0-4], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medw15
	stw	%o4, [%o1-4]
.bc_medw7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

.bc_medhalf:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medh31
	nop
.bc_medh32:				! load and store block of 32 bytes
	subcc	%o2, 32, %o2		! decrement length count

	lduh	[%o0], %o4		! move 32 bytes
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+6], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduh	[%o0+8], %o4
	lduw	[%o0+10], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+14], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduh	[%o0+16], %o4
	lduw	[%o0+18], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+22], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduh	[%o0-8], %o4
	lduw	[%o0-6], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0-2], %o4
	or	%o3, %o4, %o4
	bgu,pt	%ncc, .bc_medh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medh7		! skip if 7 or fewer bytes left
	nop				!
.bc_medh15:
	lduh	[%o0], %o4		! move 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	add	%o1, 8, %o1		! increase dst ptr by 8
	lduh	[%o0+6], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medh15
	stx	%o4, [%o1-8]
.bc_medh7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	lduh	[%o0], %o4
	sll	%o4, 16, %o4
	lduh	[%o0+2], %o3
	or	%o3, %o4, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

	.align 16
.bc_med_byte:
	bnz,pt	%ncc, .bc_medbh32a	! go to correct byte move
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medb31
	nop
.bc_medb32:				! Alignment 1 or 5
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+9], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+11], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+17], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+19], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0-7], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0-5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medb31:				! 31 or fewer bytes remaining
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medb15:

	ldub	[%o0], %o4		! load and store a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb15
	stx	%o4, [%o1-8]
.bc_medb7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ldub	[%o0], %o4		! move 4 bytes
	sll	%o4, 24, %o3
	lduh	[%o0+1], %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

	.align 16
.bc_medbh32a:				! Alignment 3 or 7
	ble,pt	%ncc, .bc_medbh31
	nop
.bc_medbh32:				! Alignment 3 or 7
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+9], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+13], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+17], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+21], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0-7], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0-3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medbh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medbh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medbh15:
	ldub	[%o0], %o4		! load and store a block of 8 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]
	subcc	%o2, 8, %o2		! decrement length count
	add	%o1, 8, %o1		! increase dst ptr by 8
	add	%o0, 8, %o0		! increase src ptr by 8
	bgu,pt	%ncc, .bc_medbh15
	stx	%o4, [%o1-8]
	ba	.bc_medb7
	nop

	SET_SIZE(bcopy)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 */
	ENTRY(bcopy_more)
.bcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	brz,pt	%o5, .do_copy
	nop
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If LOFAULT_SET is
	! set and the saved lofault was zero, we won't reset lofault on
	! returning.
	or	%o5, LOFAULT_SET, %o5
.do_copy:
	ldn	[THREAD_REG + T_LWP], %o3
	brnz,pt	%o3, 1f
	nop
/*
 * kpreempt_disable();
 */
	ldsb	[THREAD_REG + T_PREEMPT], %o3
	inc	%o3
	stb	%o3, [THREAD_REG + T_PREEMPT]
1:
/*
 * The following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting.
 */
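/*
 * The %fprs handling below, in C-like terms (a sketch; rd_fprs() and
 * wr_fprs() are hypothetical accessors, not kernel functions):
 *
 *	g5 = rd_fprs() & FPRS_FEF;
 *	if (g5)
 *		save_fp_regs();		! BST_FP_TOSTACK: FP was live
 *	else
 *		wr_fprs(FPRS_FEF);	! just enable the FPU
 *	! at exit: reload saved regs, or FZERO, then restore %fprs
 */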
	rd	%fprs, %g5		! check for unused fp
	or	%o5, FPUSED_FLAG, %o5
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,pt	%ncc, .bc_fp_unused
	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
	BST_FP_TOSTACK(%o3)
	ba	.bc_fp_ready
.bc_fp_unused:
	andcc	%i1, 1, %o3		! is dest byte aligned
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
.bc_fp_ready:
	rd	%gsr, %l5		! save %gsr value
	bnz,pt	%ncc, .bc_big_d1
.bc_big_d1f:				! dest is now half word aligned
	andcc	%i1, 2, %o3
	bnz,pt	%ncc, .bc_big_d2
.bc_big_d2f:				! dest is now word aligned
	andcc	%i1, 4, %o3
	bnz,pt	%ncc, .bc_big_d4
.bc_big_d4f:				! dest is now long word aligned
	andcc	%i0, 7, %o3		! is src long word aligned
	brnz,pt	%o3, .bc_big_unal8
	prefetch [%i0 + (2 * CACHE_LINE)], #one_read

	! Src and dst are long word aligned
	! align dst to 64 byte boundary
	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .bc_al_to_64
	nop
	sub	%o3, 64, %o3		! %o3 has negative bytes to move
	add	%i2, %o3, %i2		! adjust remaining count
	andcc	%o3, 8, %o4		! odd long words to move?
	brz,pt	%o4, .bc_al_to_16
	nop
	add	%o3, 8, %o3
	ldx	[%i0], %o4
	add	%i0, 8, %i0		! increment src ptr
	add	%i1, 8, %i1		! increment dst ptr
	stx	%o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
	andcc	%o3, 0x30, %o4		! pair of long words to move?
	brz,pt	%o4, .bc_al_to_64
	nop
.bc_al_mv_16:
	add	%o3, 16, %o3
	ldx	[%i0], %o4
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	add	%i0, 16, %i0		! increment src ptr
	stx	%o4, [%i1+8]
	andcc	%o3, 48, %o4
	brnz,pt	%o4, .bc_al_mv_16
	add	%i1, 16, %i1		! increment dst ptr
! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%i0, 32, %o3
	brnz,pn	%o3, .bc_aln_1
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_01
	andcc	%i0, 8, %o3
	brz,pn	%o3, .bc_aln_000
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_001
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_aln_01:
	brnz,pn	%o3, .bc_aln_011
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_010
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_11
	andcc	%i0, 8, %o3
	brnz,pn	%o3, .bc_aln_101
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_100
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
	brz,pn	%o3, .bc_aln_110
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read

.bc_aln_111:
! Alignment off by 8 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	add	%i0, 8, %i0
	sub	%i2, 8, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_111_loop:
	ldda	[%i0]ASI_BLK_P,%d16		! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d30, %d0
	bgt,pt	%ncc, .bc_aln_111_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	ba	.bc_remain_stuff
	add	%i1, 8, %i1
	! END OF aln_111

.bc_aln_110:
! Alignment off by 16 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	add	%i0, 16, %i0
	sub	%i2, 16, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_110_loop:
	ldda	[%i0]ASI_BLK_P,%d16		! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .bc_aln_110_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	ba	.bc_remain_stuff
	add	%i1, 16, %i1
	! END OF aln_110

.bc_aln_101:
! Alignment off by 24 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	add	%i0, 24, %i0
	sub	%i2, 24, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_101_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .bc_aln_101_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	ba	.bc_remain_stuff
	add	%i1, 24, %i1
	! END OF aln_101

.bc_aln_100:
! Alignment off by 32 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16],%d4
	ldd	[%i0+24],%d6
	add	%i0, 32, %i0
	sub	%i2, 32, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_100_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .bc_aln_100_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	ba	.bc_remain_stuff
	add	%i1, 32, %i1
	! END OF aln_100

.bc_aln_011:
! Alignment off by 40 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	add	%i0, 40, %i0
	sub	%i2, 40, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_011_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .bc_aln_011_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	ba	.bc_remain_stuff
	add	%i1, 40, %i1
	! END OF aln_011

.bc_aln_010:
! Alignment off by 48 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	add	%i0, 48, %i0
	sub	%i2, 48, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_010_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .bc_aln_010_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	ba	.bc_remain_stuff
	add	%i1, 48, %i1
	! END OF aln_010

.bc_aln_001:
! Alignment off by 56 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	ldd	[%i0+48], %d12
	add	%i0, 56, %i0
	sub	%i2, 56, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_001_loop:
	ldda	[%i0]ASI_BLK_P,%d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d14
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .bc_aln_001_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	std	%d12, [%i1+48]
	ba	.bc_remain_stuff
	add	%i1, 56, %i1
	! END OF aln_001

.bc_aln_000:
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_000_loop:
	ldda	[%i0]ASI_BLK_P,%d0
	subcc	%o3, 64, %o3
	stxa	%g0,[%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0,[%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	bgt,pt	%ncc, .bc_aln_000_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	! END OF aln_000

.bc_remain_stuff:
	subcc	%i2, 31, %i2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_aln_31
	nop
.bc_aln_32:
	ldx	[%i0], %o4		! move 32 bytes
	subcc	%i2, 32, %i2		! decrement length count by 32
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	stx	%o4, [%i1+8]
	ldx	[%i0+16], %o4
	add	%i0, 32, %i0		! increase src ptr by 32
	stx	%o4, [%i1+16]
	ldx	[%i0-8], %o4
	add	%i1, 32, %i1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_aln_32	! repeat if at least 32 bytes left
	stx	%o4, [%i1-8]
.bc_aln_31:
	addcc	%i2, 24, %i2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_aln_7		! skip if 7 or fewer bytes left
	nop				!
.bc_aln_15:
	ldx	[%i0], %o4		! move 8 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	subcc	%i2, 8, %i2		! decrease count by 8
	add	%i1, 8, %i1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_aln_15
	stx	%o4, [%i1-8]		!
.bc_aln_7:
	addcc	%i2, 7, %i2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_exit		! exit if finished
	cmp	%i2, 4
	blt,pt	%ncc, .bc_unaln3x	! skip if less than 4 bytes left
	nop				!
	ld	[%i0], %o4		! move 4 bytes
	add	%i0, 4, %i0		! increase src ptr by 4
	add	%i1, 4, %i1		! increase dst ptr by 4
	subcc	%i2, 4, %i2		! decrease count by 4
	bnz	.bc_unaln3x
	stw	%o4, [%i1-4]
	ba	.bc_exit
	nop

	! destination alignment code
.bc_big_d1:
	ldub	[%i0], %o4		! move a byte
	add	%i0, 1, %i0
	stb	%o4, [%i1]
	add	%i1, 1, %i1
	andcc	%i1, 2, %o3
	bz,pt	%ncc, .bc_big_d2f
	sub	%i2, 1, %i2
.bc_big_d2:
	ldub	[%i0], %o4		! move a half-word (src align unknown)
	ldub	[%i0+1], %o3
	add	%i0, 2, %i0
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%i1]
	add	%i1, 2, %i1
	andcc	%i1, 4, %o3
	bz,pt	%ncc, .bc_big_d4f
	sub	%i2, 2, %i2
.bc_big_d4:
	ldub	[%i0], %o4		! move a word (src align unknown)
	ldub	[%i0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4,[%i1]		! store four bytes
	add	%i0, 4, %i0		! adjust src by 4
	add	%i1, 4, %i1		! adjust dest by 4
	ba	.bc_big_d4f
	sub	%i2, 4, %i2		! adjust count by 4

	! Dst is on 8 byte boundary; src is not
.bc_big_unal8:
	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%ncc, .bc_unalnsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%i0, 0x1, %o4
	bnz	%ncc, .bc_unalnbyte	! check for byte alignment
	nop
	andcc	%i0, 2, %o4		! check for half word alignment
	bnz	%ncc, .bc_unalnhalf
	nop
	! Src is word aligned, move bytes until dest 64 byte aligned
.bc_unalnword:
	ld	[%i0], %o4		! load 4 bytes
	stw	%o4, [%i1]		! and store 4 bytes
	ld	[%i0+4], %o4		! load 4 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	stw	%o4, [%i1+4]		! and store 4 bytes
	subcc	%o3, 8, %o3		! decrease count by 8
	bnz	%ncc, .bc_unalnword
	add	%i1, 8, %i1		! increase dst ptr by 8
	ba	.bc_unalnsrc
	nop

	! Src is half-word aligned, move bytes until dest 64 byte aligned
.bc_unalnhalf:
	lduh	[%i0], %o4		! load 2 bytes
	sllx	%o4, 32, %i3		! shift left
	lduw	[%i0+2], %o4
	or	%o4, %i3, %i3
	sllx	%i3, 16, %i3
	lduh	[%i0+6], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1]
	add	%i0, 8, %i0
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnhalf
	add	%i1, 8, %i1
	ba	.bc_unalnsrc
	nop

	! Src is byte aligned, move bytes until dest 64 byte aligned
.bc_unalnbyte:
	sub	%i1, %i0, %i1		! share pointer advance
.bc_unalnbyte_loop:
	ldub	[%i0], %o4
	sllx	%o4, 56, %i3
	lduh	[%i0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+3], %o4
	sllx	%o4, 24, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %i3, %i3
	ldub	[%i0+7], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1+%i0]
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnbyte_loop
	add	%i0, 8, %i0
	add	%i1, %i0, %i1		! restore pointer

	! Destination is now block (64 byte) aligned, src is not 8 byte aligned
.bc_unalnsrc:
	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
	and	%i2, 0x3f, %i2		! residue bytes in %i2
	add	%i2, 64, %i2		! ensure we don't load beyond
	sub	%i3, 64, %i3		! end of source buffer
1657
1658	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
1659	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
1660	alignaddr %i0, %g0, %g0		! generate %gsr
1661	add	%i0, %i3, %i0		! advance %i0 to after blocks
1662	!
1663	! Determine source alignment to correct 8 byte offset
1664	andcc	%i0, 0x20, %o3
1665	brnz,pn	%o3, .bc_unaln_1
1666	andcc	%i0, 0x10, %o3
1667	brnz,pn	%o3, .bc_unaln_01
1668	andcc	%i0, 0x08, %o3
1669	brz,a	%o3, .bc_unaln_000
1670	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1671	ba	.bc_unaln_001
1672	nop
1673.bc_unaln_01:
1674	brnz,a	%o3, .bc_unaln_011
1675	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1676	ba	.bc_unaln_010
1677	nop
1678.bc_unaln_1:
1679	brnz,pn	%o3, .bc_unaln_11
1680	andcc	%i0, 0x08, %o3
1681	brnz,a	%o3, .bc_unaln_101
1682	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1683	ba	.bc_unaln_100
1684	nop
1685.bc_unaln_11:
1686	brz,pn	%o3, .bc_unaln_110
1687	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
1688
1689.bc_unaln_111:
1690	ldd	[%o4+56], %d14
1691.bc_unaln_111_loop:
1692	add	%o4, 64, %o4
1693	ldda	[%o4]ASI_BLK_P, %d16
1694	faligndata %d14, %d16, %d48
1695	faligndata %d16, %d18, %d50
1696	faligndata %d18, %d20, %d52
1697	faligndata %d20, %d22, %d54
1698	faligndata %d22, %d24, %d56
1699	faligndata %d24, %d26, %d58
1700	faligndata %d26, %d28, %d60
1701	faligndata %d28, %d30, %d62
1702	fmovd	%d30, %d14
1703	stda	%d48, [%i1]ASI_BLK_P
1704	subcc	%i3, 64, %i3
1705	add	%i1, 64, %i1
1706	bgu,pt	%ncc, .bc_unaln_111_loop
1707	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1708	ba	.bc_unaln_done
1709	nop
1710
1711.bc_unaln_110:
1712	ldd	[%o4+48], %d12
1713	ldd	[%o4+56], %d14
1714.bc_unaln_110_loop:
1715	add	%o4, 64, %o4
1716	ldda	[%o4]ASI_BLK_P, %d16
1717	faligndata %d12, %d14, %d48
1718	faligndata %d14, %d16, %d50
1719	faligndata %d16, %d18, %d52
1720	faligndata %d18, %d20, %d54
1721	faligndata %d20, %d22, %d56
1722	faligndata %d22, %d24, %d58
1723	faligndata %d24, %d26, %d60
1724	faligndata %d26, %d28, %d62
1725	fmovd	%d28, %d12
1726	fmovd	%d30, %d14
1727	stda	%d48, [%i1]ASI_BLK_P
1728	subcc	%i3, 64, %i3
1729	add	%i1, 64, %i1
1730	bgu,pt	%ncc, .bc_unaln_110_loop
1731	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1732	ba	.bc_unaln_done
1733	nop
1734
1735.bc_unaln_101:
1736	ldd	[%o4+40], %d10
1737	ldd	[%o4+48], %d12
1738	ldd	[%o4+56], %d14
1739.bc_unaln_101_loop:
1740	add	%o4, 64, %o4
1741	ldda	[%o4]ASI_BLK_P, %d16
1742	faligndata %d10, %d12, %d48
1743	faligndata %d12, %d14, %d50
1744	faligndata %d14, %d16, %d52
1745	faligndata %d16, %d18, %d54
1746	faligndata %d18, %d20, %d56
1747	faligndata %d20, %d22, %d58
1748	faligndata %d22, %d24, %d60
1749	faligndata %d24, %d26, %d62
1750	fmovd	%d26, %d10
1751	fmovd	%d28, %d12
1752	fmovd	%d30, %d14
1753	stda	%d48, [%i1]ASI_BLK_P
1754	subcc	%i3, 64, %i3
1755	add	%i1, 64, %i1
1756	bgu,pt	%ncc, .bc_unaln_101_loop
1757	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1758	ba	.bc_unaln_done
1759	nop
1760
1761.bc_unaln_100:
1762	ldd	[%o4+32], %d8
1763	ldd	[%o4+40], %d10
1764	ldd	[%o4+48], %d12
1765	ldd	[%o4+56], %d14
1766.bc_unaln_100_loop:
1767	add	%o4, 64, %o4
1768	ldda	[%o4]ASI_BLK_P, %d16
1769	faligndata %d8, %d10, %d48
1770	faligndata %d10, %d12, %d50
1771	faligndata %d12, %d14, %d52
1772	faligndata %d14, %d16, %d54
1773	faligndata %d16, %d18, %d56
1774	faligndata %d18, %d20, %d58
1775	faligndata %d20, %d22, %d60
1776	faligndata %d22, %d24, %d62
1777	fmovd	%d24, %d8
1778	fmovd	%d26, %d10
1779	fmovd	%d28, %d12
1780	fmovd	%d30, %d14
1781	stda	%d48, [%i1]ASI_BLK_P
1782	subcc	%i3, 64, %i3
1783	add	%i1, 64, %i1
1784	bgu,pt	%ncc, .bc_unaln_100_loop
1785	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1786	ba	.bc_unaln_done
1787	nop
1788
1789.bc_unaln_011:
1790	ldd	[%o4+24], %d6
1791	ldd	[%o4+32], %d8
1792	ldd	[%o4+40], %d10
1793	ldd	[%o4+48], %d12
1794	ldd	[%o4+56], %d14
1795.bc_unaln_011_loop:
1796	add	%o4, 64, %o4
1797	ldda	[%o4]ASI_BLK_P, %d16
1798	faligndata %d6, %d8, %d48
1799	faligndata %d8, %d10, %d50
1800	faligndata %d10, %d12, %d52
1801	faligndata %d12, %d14, %d54
1802	faligndata %d14, %d16, %d56
1803	faligndata %d16, %d18, %d58
1804	faligndata %d18, %d20, %d60
1805	faligndata %d20, %d22, %d62
1806	fmovd	%d22, %d6
1807	fmovd	%d24, %d8
1808	fmovd	%d26, %d10
1809	fmovd	%d28, %d12
1810	fmovd	%d30, %d14
1811	stda	%d48, [%i1]ASI_BLK_P
1812	subcc	%i3, 64, %i3
1813	add	%i1, 64, %i1
1814	bgu,pt	%ncc, .bc_unaln_011_loop
1815	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1816	ba	.bc_unaln_done
1817	nop
1818
1819.bc_unaln_010:
1820	ldd	[%o4+16], %d4
1821	ldd	[%o4+24], %d6
1822	ldd	[%o4+32], %d8
1823	ldd	[%o4+40], %d10
1824	ldd	[%o4+48], %d12
1825	ldd	[%o4+56], %d14
1826.bc_unaln_010_loop:
1827	add	%o4, 64, %o4
1828	ldda	[%o4]ASI_BLK_P, %d16
1829	faligndata %d4, %d6, %d48
1830	faligndata %d6, %d8, %d50
1831	faligndata %d8, %d10, %d52
1832	faligndata %d10, %d12, %d54
1833	faligndata %d12, %d14, %d56
1834	faligndata %d14, %d16, %d58
1835	faligndata %d16, %d18, %d60
1836	faligndata %d18, %d20, %d62
1837	fmovd	%d20, %d4
1838	fmovd	%d22, %d6
1839	fmovd	%d24, %d8
1840	fmovd	%d26, %d10
1841	fmovd	%d28, %d12
1842	fmovd	%d30, %d14
1843	stda	%d48, [%i1]ASI_BLK_P
1844	subcc	%i3, 64, %i3
1845	add	%i1, 64, %i1
1846	bgu,pt	%ncc, .bc_unaln_010_loop
1847	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
1848	ba	.bc_unaln_done
1849	nop
1850
.bc_unaln_001:
	ldd	[%o4+8], %d2
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_001_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_000:
	ldda	[%o4]ASI_BLK_P, %d0
.bc_unaln_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_000_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read

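	/*
	 * The .bc_unaln_* loops above are one software pipeline,
	 * specialized by the source offset within a 64-byte block.
	 * A hedged C sketch of the technique (illustrative only; "off"
	 * stands for the byte offset that alignaddr latched into %gsr
	 * for faligndata):
	 *
	 *	uint64_t w[16], out[8];	// %d0-%d14 carry, %d16-%d30 new
	 *	while (cnt > 64) {
	 *		load_block(src, &w[8]);	// ldda [%o4]ASI_BLK_P
	 *		for (int i = 0; i < 8; i++)	// 8 faligndata ops
	 *			out[i] = (w[i] << 8*off) |
	 *			    (w[i+1] >> (64 - 8*off));
	 *			// (for off == 0 this is just out[i] = w[i])
	 *		store_block(dst, out);	// stda [%i1]ASI_BLK_P
	 *		memmove(&w[0], &w[8], 64);	// the fmovd chain
	 *		src += 64; dst += 64; cnt -= 64;
	 *	}
	 */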
.bc_unaln_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%i2, 15
	bleu	%ncc, .bc_unaln_short

	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
	and	%i2, 0x7, %i2		! residue bytes in %i2
	add	%i2, 8, %i2
	sub	%i3, 8, %i3		! ensure we don't load past end of src
	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
	ldd	[%o4], %d0		! fetch partial word
.bc_unaln_by8:
	ldd	[%o4+8], %d2
	add	%o4, 8, %o4
	faligndata %d0, %d2, %d16
	subcc	%i3, 8, %i3
	std	%d16, [%i1]
	fmovd	%d2, %d0
	bgu,pt	%ncc, .bc_unaln_by8
	add	%i1, 8, %i1

.bc_unaln_short:
	cmp	%i2, 8
	blt,pt	%ncc, .bc_unalnfin
	nop
	ldub	[%i0], %o4
	sll	%o4, 24, %o3
	ldub	[%i0+1], %o4
	sll	%o4, 16, %o4
	or	%o4, %o3, %o3
	ldub	[%i0+2], %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%i0+3], %o4
	or	%o4, %o3, %o3
	stw	%o3, [%i1]
	ldub	[%i0+4], %o4
	sll	%o4, 24, %o3
	ldub	[%i0+5], %o4
	sll	%o4, 16, %o4
	or	%o4, %o3, %o3
	ldub	[%i0+6], %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%i0+7], %o4
	or	%o4, %o3, %o3
	stw	%o3, [%i1+4]
	add	%i0, 8, %i0
	add	%i1, 8, %i1
	sub	%i2, 8, %i2
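	/*
	 * The eight-byte step above, as a hedged C sketch: four source
	 * bytes at a time are merged into one big-endian word and
	 * stored with a word-aligned stw (the destination is word
	 * aligned here; the source need not be):
	 *
	 *	uint32_t w = ((uint32_t)src[0] << 24) |
	 *	    ((uint32_t)src[1] << 16) |
	 *	    ((uint32_t)src[2] << 8) | src[3];
	 *	*(uint32_t *)dst = w;		// stw %o3, [%i1]
	 */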
.bc_unalnfin:
	cmp	%i2, 4
	blt,pt	%ncc, .bc_unalnz
	tst	%i2
	ldub	[%i0], %o3		! read byte
	subcc	%i2, 4, %i2		! reduce count by 4
	sll	%o3, 24, %o3		! position
	ldub	[%i0+1], %o4
	sll	%o4, 16, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	add	%i1, 4, %i1		! advance dst by 4
	ldub	[%i0+3], %o4
	add	%i0, 4, %i0		! advance src by 4
	or	%o4, %o3, %o4		! merge
	bnz,pt	%ncc, .bc_unaln3x
	stw	%o4, [%i1-4]
	ba	.bc_exit
	nop
.bc_unalnz:
	bz,pt	%ncc, .bc_exit
.bc_unaln3x:				! Exactly 1, 2, or 3 bytes remain
	subcc	%i2, 1, %i2		! reduce count for cc test
	ldub	[%i0], %o4		! load one byte
	bz,pt	%ncc, .bc_exit
	stb	%o4, [%i1]		! store one byte
	ldub	[%i0+1], %o4		! load second byte
	subcc	%i2, 1, %i2
	bz,pt	%ncc, .bc_exit
	stb	%o4, [%i1+1]		! store second byte
	ldub	[%i0+2], %o4		! load third byte
	stb	%o4, [%i1+2]		! store third byte
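	/*
	 * Exit path for the FP copy.  A hedged C sketch of the flow
	 * below (kpreempt(), t_lofault, t_preempt and cpu_kprunrun are
	 * real kernel names; the helpers are descriptive stand-ins):
	 *
	 *	restore_gsr();			// wr %l5, %g0, %gsr
	 *	if (fp_was_in_use)		// %g5 != 0
	 *		reload_fp_regs_from_stack();
	 *	else {
	 *		zero_fp_regs();		// FZERO
	 *		restore_fprs();		// wr %g5, %g0, %fprs
	 *	}
	 *	if (curthread->t_lwp == NULL &&
	 *	    --curthread->t_preempt == 0 && CPU->cpu_kprunrun)
	 *		flags |= KPREEMPT_FLAG;
	 *	saved &= ~COPY_FLAGS;		// andncc %o5, COPY_FLAGS
	 *	if (flags & LOFAULT_SET) {	// handler was installed
	 *		if (saved != NULL)
	 *			curthread->t_lofault = saved;
	 *		if (flags & KPREEMPT_FLAG)
	 *			kpreempt(pil);
	 *	}
	 *	return (0);
	 */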
.bc_exit:
	wr	%l5, %g0, %gsr		! restore %gsr
	brnz	%g5, .bc_fp_restore
	and	%o5, COPY_FLAGS, %l1	! save flags in %l1
	FZERO
	wr	%g5, %g0, %fprs
	ba,pt	%ncc, .bc_ex2
	nop
.bc_fp_restore:
	BLD_FP_FROMSTACK(%o4)
.bc_ex2:
	ldn	[THREAD_REG + T_LWP], %o2
	brnz,pt	%o2, 1f			! branch if this thread has an lwp
	nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0			! drop the t_preempt hold
	bnz,pn	%ncc, 1f
	stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
1:
	btst	LOFAULT_SET, %l1
	bz,pn	%icc, 3f
	andncc	%o5, COPY_FLAGS, %o5
	! Here via bcopy. Check to see if the handler was NULL.
	! If so, just return quietly. Otherwise, reset the
	! handler and return.
	bz,pn	%ncc, 2f
	nop
	membar	#Sync
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2:
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 3f
	nop
	call	kpreempt
	rdpr	%pil, %o0		! pass %pil
3:
	ret
	restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)


#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
	clr	%o5			! flag LOFAULT_SET is not set for bcopy
.do_copy:
	cmp	%i2, 12			! for small counts
	blu	%ncc, .bytecp		! just copy bytes
	.empty

	cmp	%i2, 128		! for less than 128 bytes
	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
	nop

	set	use_hw_bcopy, %o2
	ld	[%o2], %o2
	brz,pn	%o2, .bcb_punt
	nop

	subcc	%i1, %i0, %i3
	bneg,a,pn %ncc, 1f
	neg	%i3			! %i3 = |dst - src|
1:
	/*
	 * Compare against 256 since we should be checking block addresses
	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
	 * src = dest + (64 * 3) + 63.
	 */
	cmp	%i3, 256
	blu,pn	%ncc, .bcb_punt
	nop
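	/*
	 * In C, the distance guard above is roughly (a hedged sketch):
	 *
	 *	if (labs((long)dst - (long)src) < 256)
	 *		goto bcb_punt;	! too close for block moves
	 */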

	/*
	 * Copies that reach here have at least 2 blocks of data to move.
	 */
.do_blockcopy:
	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst 64-byte aligned?
	bz	%xcc, .chksrc		! dst is already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst is 64-byte aligned
	sub	%i2, %i3, %i2		! update %i2 with new count

	! Based on source and destination alignment, do either
	! 8-byte, 4-byte, 2-byte or single-byte copies.

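	/*
	 * The dispatch below, as a hedged C sketch: OR-ing the two
	 * addresses exposes their common alignment in the low bits.
	 *
	 *	uintptr_t both = (uintptr_t)dst | (uintptr_t)src;
	 *	if ((both & 7) == 0)		// .alewdcp, 8-byte
	 *		width = 8;
	 *	else if ((both & 3) == 0)	// .alwdcp, 4-byte
	 *		width = 4;
	 *	else if ((both & 1) == 0)	// .alhlfwdcp, 2-byte
	 *		width = 2;
	 *	else				// byte loop
	 *		width = 1;
	 */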
	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	.chksrc
	nop

	! dst & src 4B aligned
.alwdcp:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .alwdcp
	add	%i0, 0x4, %i0

	ba	.chksrc
	nop

	! dst & src 2B aligned
.alhlfwdcp:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .alhlfwdcp
	add	%i0, 0x2, %i0

	ba	.chksrc
	nop

	! dst & src 8B aligned
.alewdcp:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .alewdcp
	add	%i0, 0x8, %i0

	! Now Destination is block (64 bytes) aligned
.chksrc:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! Residue bytes in %i2

	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

	andcc	%i1, 0xf, %o2		! is src quadword aligned
	bz,pn	%xcc, .blkcpy		! src offset in %o2
	nop
	cmp	%o2, 0x8
	bg	.cpy_upper_double
	nop
	bl	.cpy_lower_double
	nop
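	/*
	 * Dispatch summary, as a hedged C sketch ("off" is the source
	 * offset within a 16-byte quadword, left in %o2 above):
	 *
	 *	off = (uintptr_t)src & 0xf;
	 *	if (off == 0)
	 *		goto blkcpy;		// no shift/merge needed
	 *	else if (off > 8)
	 *		goto cpy_upper_double;
	 *	else if (off < 8)
	 *		goto cpy_lower_double;
	 *	// off == 8 falls through: the quad loads return data
	 *	// that is already double-word aligned, so it can be
	 *	// stored without merging.
	 */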

	! Falls through when the source offset is equal to 8, i.e.
	! the source is double-word aligned.
	! In this case no shift/merge of the data is required.
	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %l2
loop0:
	ldda	[%i1+0x10]%asi, %l4
	prefetch [%l0+0x40], #one_read

	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %l2
	stxa	%l5, [%i0+0x10]%asi
	stxa	%l2, [%i0+0x18]%asi

	ldda	[%i1+0x30]%asi, %l4
	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %l2
	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, loop0
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2

.cpy_lower_double:
	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	sll	%o2, 3, %o0		! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
					! complete data
loop1:
	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
							! into %l2 and %l3
	prefetch [%l0+0x40], #one_read
	stxa	%l2, [%i0+0x0]%asi
	stxa	%l3, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %l4
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
	stxa	%l2, [%i0+0x20]%asi
	stxa	%l3, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
	stxa	%l4, [%i0+0x30]%asi
	stxa	%l5, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, loop1
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2
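	/*
	 * ALIGN_DATA above merges three doublewords into two aligned
	 * output doublewords.  A hedged C sketch of one merge step
	 * (lshift/rshift are the bit counts in %o0/%o1 computed from
	 * the source offset, with lshift + rshift == 64):
	 *
	 *	out0 = (a << lshift) | (b >> rshift);
	 *	out1 = (b << lshift) | (c >> rshift);
	 */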

.cpy_upper_double:
	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	mov	0x8, %o0
	sub	%o2, %o0, %o0
	sll	%o0, 3, %o0		! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
					! no data in %l2
loop2:
	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
					! partial
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
							! into %l3 and %l4
	prefetch [%l0+0x40], #one_read
	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %l4
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, loop2
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2


	! Both Source and Destination are block aligned.
	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
	prefetch [%i1+0x0], #one_read
1:
	ldda	[%i1+0x0]%asi, %l0
	ldda	[%i1+0x10]%asi, %l2
	prefetch [%i1+0x40], #one_read

	stxa	%l0, [%i0+0x0]%asi
	ldda	[%i1+0x20]%asi, %l4
	ldda	[%i1+0x30]%asi, %l6

	stxa	%l1, [%i0+0x8]%asi
	stxa	%l2, [%i0+0x10]%asi
	stxa	%l3, [%i0+0x18]%asi
	stxa	%l4, [%i0+0x20]%asi
	stxa	%l5, [%i0+0x28]%asi
	stxa	%l6, [%i0+0x30]%asi
	stxa	%l7, [%i0+0x38]%asi

	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, 1b
	add	%i0, 0x40, %i0

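	/*
	 * A hedged C sketch of the loop above: one 64-byte block per
	 * iteration, moved as four 16-byte quads.
	 *
	 *	while (blk_cnt > 0) {
	 *		prefetch(src + 64);
	 *		for (int i = 0; i < 4; i++)	// ldda/stxa pairs
	 *			quad_store(dst + 16*i, quad_load(src + 16*i));
	 *		src += 64; dst += 64; blk_cnt -= 64;
	 *	}
	 *
	 * With ASI_BLK_INIT_ST_QUAD_LDD_P, each ldda returns 16 bytes
	 * in a register pair, and the stores may initialize the
	 * destination line without first fetching its old contents.
	 */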
.blkdone:
	membar	#Sync

	brz,pt	%i2, .blkexit
	nop

	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .last4
	nop

	! Do 8byte ops as long as possible
.last8:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgu,pt	%ncc, .last8
	add	%i0, 0x8, %i0

	brz,pt	%i2, .blkexit
	nop

	ba	.residue
	nop

2356
2357.last4:
2358	! Can we do 4B ops
2359	andcc	%o2, 0x3, %g0
2360	bnz	%ncc, .last2
2361	nop
23621:
2363	ld	[%i1], %o2
2364	st	%o2, [%i0]
2365	add	%i1, 0x4, %i1
2366	sub	%i2, 0x4, %i2
2367	cmp	%i2, 0x4
2368	bgu,pt	%ncc, 1b
2369	add	%i0, 0x4, %i0
2370
2371	brz,pt	%i2, .blkexit
2372	nop
2373
2374	ba	.residue
2375	nop
2376
2377.last2:
2378	! Can we do 2B ops
2379	andcc	%o2, 0x1, %g0
2380	bnz	%ncc, .residue
2381	nop
2382
23831:
2384	lduh	[%i1], %o2
2385	stuh	%o2, [%i0]
2386	add	%i1, 0x2, %i1
2387	sub	%i2, 0x2, %i2
2388	cmp	%i2, 0x2
2389	bgu,pt	%ncc, 1b
2390	add	%i0, 0x2, %i0
2391
2392	brz,pt	%i2, .blkexit
2393	nop
2394
2395.residue:
2396	ldub	[%i1], %o2
2397	stb	%o2, [%i0]
2398	inc	%i1
2399	deccc	%i2
2400	bgu,pt	%ncc, .residue
2401	inc	%i0
2402
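	/*
	 * Trailing bytes (the 0-63 left after the block loop), as a
	 * hedged C sketch: pick the widest width both pointers share,
	 * copy with it while more than one unit remains, then finish
	 * the last few bytes one at a time.
	 *
	 *	uintptr_t both = (uintptr_t)src | (uintptr_t)dst;
	 *	size_t w = ((both & 7) == 0) ? 8 :
	 *	    ((both & 3) == 0) ? 4 : ((both & 1) == 0) ? 2 : 1;
	 *	for (; cnt > w; cnt -= w, src += w, dst += w)
	 *		copy_unit(dst, src, w);	// ldx/stx, ld/st, lduh/stuh
	 *	while (cnt-- > 0)
	 *		*dst++ = *src++;	// .residue
	 */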
.blkexit:

	membar	#Sync				! sync error barrier
	! Restore the t_lofault handler if we came here from kcopy().
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
	ret
	restore	%g0, 0, %o0


.bcb_punt:
	!
	! use aligned transfers where possible
	!
	xor	%i0, %i1, %o4		! xor from and to address
	btst	7, %o4			! if lower three bits zero
	bz	.aldoubcp		! can align on double boundary
	.empty	! avoids assembler complaints about label in delay slot

	xor	%i0, %i1, %o4		! xor from and to address
	btst	3, %o4			! if lower two bits zero
	bz	.alwordcp		! can align on word boundary
	btst	3, %i0			! delay slot, from address unaligned?
	!
	! use aligned reads and writes where possible
	! this differs from wordcp in that it copes
	! with odd alignment between source and destination
	! using word reads and writes with the proper shifts
	! in between to align transfers to and from memory
	! i0 - src address, i1 - dest address, i2 - count
	! i3, i4 - tmps used for generating a complete word
	! i5 (word to write)
	! l0 size