1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/param.h>
28#include <sys/errno.h>
29#include <sys/asm_linkage.h>
30#include <sys/vtrace.h>
31#include <sys/machthread.h>
32#include <sys/clock.h>
33#include <sys/asi.h>
34#include <sys/fsr.h>
35#include <sys/privregs.h>
36
37#include "assym.h"
38
39
40/*
41 * Pseudo-code to aid in understanding the control flow of the
42 * bcopy routine.
43 *
44 * On entry to bcopy:
45 *
46 *	%l6 = curthread->t_lofault;
47 *	used_block_copy = FALSE;			! %l6 |= 1
48 *	if (%l6 != NULL) {
49 *		curthread->t_lofault = .copyerr;
50 *		caller_error_handler = TRUE		! %l6 |= 2
51 *	}
52 *
53 * 	if (length < VIS_COPY)
54 * 		goto regular_copy;
55 *
56 * 	if (!use_vis)
 * 		goto regular_copy;
58 *
59 * 	if (curthread->t_lwp == NULL) {
60 *		! Kernel threads do not have pcb's in which to store
61 *		! the floating point state, disallow preemption during
62 *		! the copy.
63 * 		kpreempt_disable(curthread);
64 *	}
65 *
66 * 	old_fprs = %fprs;
67 * 	old_gsr = %gsr;
68 * 	if (%fprs.fef) {
69 *              ! If we need to save 4 blocks of fpregs then make sure
70 *		! the length is still appropriate for that extra overhead.
71 * 		if (length < (large_length + (64 * 4))) {
72 * 			if (curthread->t_lwp == NULL)
73 * 				kpreempt_enable(curthread);
74 * 			goto regular_copy;
75 * 		}
76 * 		%fprs.fef = 1;
77 * 		save current fpregs on stack using blockstore
78 * 	} else {
79 * 		%fprs.fef = 1;
80 * 	}
81 *
82 * 	used_block_copy = 1;				! %l6 |= 1
83 * 	do_blockcopy_here;
84 *
85 * In lofault handler:
86 *	curthread->t_lofault = .copyerr2;
87 *	Continue on with the normal exit handler
88 *
89 * On exit:
90 *	call_kpreempt = 0;
91 * 	if (used_block_copy) {				! %l6 & 1
92 * 		%gsr = old_gsr;
93 * 		if (old_fprs & FPRS_FEF)
94 * 			restore fpregs from stack using blockload
95 *		else
96 *			zero fpregs
97 * 		%fprs = old_fprs;
98 * 		if (curthread->t_lwp == NULL) {
99 *			kpreempt_enable(curthread);
100 *			call_kpreempt = 1;
101 *		}
102 * 	}
103 * 	curthread->t_lofault = (%l6 & ~3);
104 *	if (call_kpreempt)
105 *		kpreempt(%pil);
106 * 	return (0)
107 *
108 * In second lofault handler (.copyerr2):
109 *	We've tried to restore fp state from the stack and failed.  To
110 *	prevent from returning with a corrupted fp state, we will panic.
111 */
112
113/*
114 * Notes on preserving existing fp state:
115 *
116 * When a copyOP decides to use fp we may have to preserve existing
117 * floating point state.  It is not the caller's state that we need to
118 * preserve - the rest of the kernel does not use fp and, anyway, fp
119 * registers are volatile across a call.  Some examples:
120 *
121 *	- userland has fp state and is interrupted (device interrupt
122 *	  or trap) and within the interrupt/trap handling we use
123 *	  bcopy()
124 *	- another (higher level) interrupt or trap handler uses bcopy
125 *	  while a bcopy from an earlier interrupt is still active
126 *	- an asynchronous error trap occurs while fp state exists (in
127 *	  userland or in kernel copy) and the tl0 component of the handling
128 *	  uses bcopy
129 *	- a user process with fp state incurs a copy-on-write fault and
130 *	  hwblkpagecopy always uses fp
131 *
132 * We therefore need a per-call place in which to preserve fp state -
133 * using our stack is ideal (and since fp copy cannot be leaf optimized
134 * because of calls it makes, this is no hardship).
135 *
136 * To make sure that floating point state is always saved and restored
137 * correctly, the following "big rules" must be followed when the floating
138 * point registers will be used:
139 *
140 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
141 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
142 *    use.  Bit 2 (BCOPY_FLAG) indicates that the call was to bcopy.
143 *
144 * 2. The FPUSED flag indicates that all FP state has been successfully stored
145 *    on the stack.  It should not be set until this save has been completed.
146 *
147 * 3. The FPUSED flag should not be cleared on exit until all FP state has
148 *    been restored from the stack.  If an error occurs while restoring
149 *    data from the stack, the error handler can check this flag to see if
150 *    a restore is necessary.
151 *
152 * 4. Code run under the new lofault handler must be kept to a minimum.  In
153 *    particular, any calls to kpreempt() should not be made until after the
154 *    lofault handler has been restored.
155 */
156
157/*
158 * This shadows sys/machsystm.h which can't be included due to the lack of
159 * _ASM guards in include files it references. Change it here, change it there.
160 */
161#define VIS_COPY_THRESHOLD 900
162
163/*
 * Less than or equal to this number of bytes we will always copy byte-for-byte
165 */
166#define	SMALL_LIMIT	7
167
168/*
169 * Flags set in the lower bits of the t_lofault address:
170 * FPUSED_FLAG: The FP registers were in use and must be restored
171 * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
172 * COPY_FLAGS: Both of the above
173 *
174 * Other flags:
175 * KPREEMPT_FLAG: kpreempt needs to be called
176 */
177#define	FPUSED_FLAG	1
178#define BCOPY_FLAG	2
179#define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
180#define	KPREEMPT_FLAG	4
181
182/*
 * Size of stack frame in order to accommodate a 64-byte aligned
184 * floating-point register save area and 2 32-bit temp locations.
185 */
186#define	HWCOPYFRAMESIZE	((64 * 5) + (2 * 4))
187
188#define SAVED_FPREGS_OFFSET	(64 * 5)
189#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 4)
190#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 4)
191
192/*
193 * Common macros used by the various versions of the block copy
194 * routines in this file.
195 */
196
/*
 * FZERO: clear all 32 double-precision FP registers (%d0-%d62, i.e.
 * %f0-%f63).  %f0 and %f2 are zeroed directly with fzero; every other
 * even-numbered register is then produced as either the sum or the
 * product of those two zeros (0 + 0 and 0 * 0 are both zero), using
 * alternating faddd/fmuld instructions.
 */
#define	FZERO				\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f16		;\
	fmuld	%f0, %f2, %f18		;\
	faddd	%f0, %f2, %f20		;\
	fmuld	%f0, %f2, %f22		;\
	faddd	%f0, %f2, %f24		;\
	fmuld	%f0, %f2, %f26		;\
	faddd	%f0, %f2, %f28		;\
	fmuld	%f0, %f2, %f30		;\
	faddd	%f0, %f2, %f32		;\
	fmuld	%f0, %f2, %f34		;\
	faddd	%f0, %f2, %f36		;\
	fmuld	%f0, %f2, %f38		;\
	faddd	%f0, %f2, %f40		;\
	fmuld	%f0, %f2, %f42		;\
	faddd	%f0, %f2, %f44		;\
	fmuld	%f0, %f2, %f46		;\
	faddd	%f0, %f2, %f48		;\
	fmuld	%f0, %f2, %f50		;\
	faddd	%f0, %f2, %f52		;\
	fmuld	%f0, %f2, %f54		;\
	faddd	%f0, %f2, %f56		;\
	fmuld	%f0, %f2, %f58		;\
	faddd	%f0, %f2, %f60		;\
	fmuld	%f0, %f2, %f62
230
231
/*
 * FALIGN_Dn: generate one 64-byte output block in %d48-%d62 by applying
 * faligndata across eight consecutive pairs of source doubles starting
 * at %dn.  The source registers %d0-%d46 act as a circular buffer of
 * 24 doubles (three 64-byte blocks), so every variant that reaches
 * %d46 wraps back around to %d0.  The byte shift applied by faligndata
 * comes from the GSR alignment field, which the block-copy code sets
 * earlier with alignaddr (see the "gen %gsr" line in the bcopy body).
 * One variant exists for each of the 24 possible starting doubles; the
 * main copy loops rotate through three of them (n, n+16, n+32 mod 48).
 */
#define	FALIGN_D0			\
	faligndata %d0, %d2, %d48	;\
	faligndata %d2, %d4, %d50	;\
	faligndata %d4, %d6, %d52	;\
	faligndata %d6, %d8, %d54	;\
	faligndata %d8, %d10, %d56	;\
	faligndata %d10, %d12, %d58	;\
	faligndata %d12, %d14, %d60	;\
	faligndata %d14, %d16, %d62

#define	FALIGN_D16			\
	faligndata %d16, %d18, %d48	;\
	faligndata %d18, %d20, %d50	;\
	faligndata %d20, %d22, %d52	;\
	faligndata %d22, %d24, %d54	;\
	faligndata %d24, %d26, %d56	;\
	faligndata %d26, %d28, %d58	;\
	faligndata %d28, %d30, %d60	;\
	faligndata %d30, %d32, %d62

#define	FALIGN_D32			\
	faligndata %d32, %d34, %d48	;\
	faligndata %d34, %d36, %d50	;\
	faligndata %d36, %d38, %d52	;\
	faligndata %d38, %d40, %d54	;\
	faligndata %d40, %d42, %d56	;\
	faligndata %d42, %d44, %d58	;\
	faligndata %d44, %d46, %d60	;\
	faligndata %d46, %d0, %d62

#define	FALIGN_D2			\
	faligndata %d2, %d4, %d48	;\
	faligndata %d4, %d6, %d50	;\
	faligndata %d6, %d8, %d52	;\
	faligndata %d8, %d10, %d54	;\
	faligndata %d10, %d12, %d56	;\
	faligndata %d12, %d14, %d58	;\
	faligndata %d14, %d16, %d60	;\
	faligndata %d16, %d18, %d62

#define	FALIGN_D18			\
	faligndata %d18, %d20, %d48	;\
	faligndata %d20, %d22, %d50	;\
	faligndata %d22, %d24, %d52	;\
	faligndata %d24, %d26, %d54	;\
	faligndata %d26, %d28, %d56	;\
	faligndata %d28, %d30, %d58	;\
	faligndata %d30, %d32, %d60	;\
	faligndata %d32, %d34, %d62

#define	FALIGN_D34			\
	faligndata %d34, %d36, %d48	;\
	faligndata %d36, %d38, %d50	;\
	faligndata %d38, %d40, %d52	;\
	faligndata %d40, %d42, %d54	;\
	faligndata %d42, %d44, %d56	;\
	faligndata %d44, %d46, %d58	;\
	faligndata %d46, %d0, %d60	;\
	faligndata %d0, %d2, %d62

#define	FALIGN_D4			\
	faligndata %d4, %d6, %d48	;\
	faligndata %d6, %d8, %d50	;\
	faligndata %d8, %d10, %d52	;\
	faligndata %d10, %d12, %d54	;\
	faligndata %d12, %d14, %d56	;\
	faligndata %d14, %d16, %d58	;\
	faligndata %d16, %d18, %d60	;\
	faligndata %d18, %d20, %d62

#define	FALIGN_D20			\
	faligndata %d20, %d22, %d48	;\
	faligndata %d22, %d24, %d50	;\
	faligndata %d24, %d26, %d52	;\
	faligndata %d26, %d28, %d54	;\
	faligndata %d28, %d30, %d56	;\
	faligndata %d30, %d32, %d58	;\
	faligndata %d32, %d34, %d60	;\
	faligndata %d34, %d36, %d62

#define	FALIGN_D36			\
	faligndata %d36, %d38, %d48	;\
	faligndata %d38, %d40, %d50	;\
	faligndata %d40, %d42, %d52	;\
	faligndata %d42, %d44, %d54	;\
	faligndata %d44, %d46, %d56	;\
	faligndata %d46, %d0, %d58	;\
	faligndata %d0, %d2, %d60	;\
	faligndata %d2, %d4, %d62

#define	FALIGN_D6			\
	faligndata %d6, %d8, %d48	;\
	faligndata %d8, %d10, %d50	;\
	faligndata %d10, %d12, %d52	;\
	faligndata %d12, %d14, %d54	;\
	faligndata %d14, %d16, %d56	;\
	faligndata %d16, %d18, %d58	;\
	faligndata %d18, %d20, %d60	;\
	faligndata %d20, %d22, %d62

#define	FALIGN_D22			\
	faligndata %d22, %d24, %d48	;\
	faligndata %d24, %d26, %d50	;\
	faligndata %d26, %d28, %d52	;\
	faligndata %d28, %d30, %d54	;\
	faligndata %d30, %d32, %d56	;\
	faligndata %d32, %d34, %d58	;\
	faligndata %d34, %d36, %d60	;\
	faligndata %d36, %d38, %d62

#define	FALIGN_D38			\
	faligndata %d38, %d40, %d48	;\
	faligndata %d40, %d42, %d50	;\
	faligndata %d42, %d44, %d52	;\
	faligndata %d44, %d46, %d54	;\
	faligndata %d46, %d0, %d56	;\
	faligndata %d0, %d2, %d58	;\
	faligndata %d2, %d4, %d60	;\
	faligndata %d4, %d6, %d62

#define	FALIGN_D8			\
	faligndata %d8, %d10, %d48	;\
	faligndata %d10, %d12, %d50	;\
	faligndata %d12, %d14, %d52	;\
	faligndata %d14, %d16, %d54	;\
	faligndata %d16, %d18, %d56	;\
	faligndata %d18, %d20, %d58	;\
	faligndata %d20, %d22, %d60	;\
	faligndata %d22, %d24, %d62

#define	FALIGN_D24			\
	faligndata %d24, %d26, %d48	;\
	faligndata %d26, %d28, %d50	;\
	faligndata %d28, %d30, %d52	;\
	faligndata %d30, %d32, %d54	;\
	faligndata %d32, %d34, %d56	;\
	faligndata %d34, %d36, %d58	;\
	faligndata %d36, %d38, %d60	;\
	faligndata %d38, %d40, %d62

#define	FALIGN_D40			\
	faligndata %d40, %d42, %d48	;\
	faligndata %d42, %d44, %d50	;\
	faligndata %d44, %d46, %d52	;\
	faligndata %d46, %d0, %d54	;\
	faligndata %d0, %d2, %d56	;\
	faligndata %d2, %d4, %d58	;\
	faligndata %d4, %d6, %d60	;\
	faligndata %d6, %d8, %d62

#define	FALIGN_D10			\
	faligndata %d10, %d12, %d48	;\
	faligndata %d12, %d14, %d50	;\
	faligndata %d14, %d16, %d52	;\
	faligndata %d16, %d18, %d54	;\
	faligndata %d18, %d20, %d56	;\
	faligndata %d20, %d22, %d58	;\
	faligndata %d22, %d24, %d60	;\
	faligndata %d24, %d26, %d62

#define	FALIGN_D26			\
	faligndata %d26, %d28, %d48	;\
	faligndata %d28, %d30, %d50	;\
	faligndata %d30, %d32, %d52	;\
	faligndata %d32, %d34, %d54	;\
	faligndata %d34, %d36, %d56	;\
	faligndata %d36, %d38, %d58	;\
	faligndata %d38, %d40, %d60	;\
	faligndata %d40, %d42, %d62

#define	FALIGN_D42			\
	faligndata %d42, %d44, %d48	;\
	faligndata %d44, %d46, %d50	;\
	faligndata %d46, %d0, %d52	;\
	faligndata %d0, %d2, %d54	;\
	faligndata %d2, %d4, %d56	;\
	faligndata %d4, %d6, %d58	;\
	faligndata %d6, %d8, %d60	;\
	faligndata %d8, %d10, %d62

#define	FALIGN_D12			\
	faligndata %d12, %d14, %d48	;\
	faligndata %d14, %d16, %d50	;\
	faligndata %d16, %d18, %d52	;\
	faligndata %d18, %d20, %d54	;\
	faligndata %d20, %d22, %d56	;\
	faligndata %d22, %d24, %d58	;\
	faligndata %d24, %d26, %d60	;\
	faligndata %d26, %d28, %d62

#define	FALIGN_D28			\
	faligndata %d28, %d30, %d48	;\
	faligndata %d30, %d32, %d50	;\
	faligndata %d32, %d34, %d52	;\
	faligndata %d34, %d36, %d54	;\
	faligndata %d36, %d38, %d56	;\
	faligndata %d38, %d40, %d58	;\
	faligndata %d40, %d42, %d60	;\
	faligndata %d42, %d44, %d62

#define	FALIGN_D44			\
	faligndata %d44, %d46, %d48	;\
	faligndata %d46, %d0, %d50	;\
	faligndata %d0, %d2, %d52	;\
	faligndata %d2, %d4, %d54	;\
	faligndata %d4, %d6, %d56	;\
	faligndata %d6, %d8, %d58	;\
	faligndata %d8, %d10, %d60	;\
	faligndata %d10, %d12, %d62

#define	FALIGN_D14			\
	faligndata %d14, %d16, %d48	;\
	faligndata %d16, %d18, %d50	;\
	faligndata %d18, %d20, %d52	;\
	faligndata %d20, %d22, %d54	;\
	faligndata %d22, %d24, %d56	;\
	faligndata %d24, %d26, %d58	;\
	faligndata %d26, %d28, %d60	;\
	faligndata %d28, %d30, %d62

#define	FALIGN_D30			\
	faligndata %d30, %d32, %d48	;\
	faligndata %d32, %d34, %d50	;\
	faligndata %d34, %d36, %d52	;\
	faligndata %d36, %d38, %d54	;\
	faligndata %d38, %d40, %d56	;\
	faligndata %d40, %d42, %d58	;\
	faligndata %d42, %d44, %d60	;\
	faligndata %d44, %d46, %d62

#define	FALIGN_D46			\
	faligndata %d46, %d0, %d48	;\
	faligndata %d0, %d2, %d50	;\
	faligndata %d2, %d4, %d52	;\
	faligndata %d4, %d6, %d54	;\
	faligndata %d6, %d8, %d56	;\
	faligndata %d8, %d10, %d58	;\
	faligndata %d10, %d12, %d60	;\
	faligndata %d12, %d14, %d62
471
472
473/*
474 * Copy a block of storage, returning an error code if `from' or
475 * `to' takes a kernel pagefault which cannot be resolved.
476 * Returns errno value on pagefault error, 0 if all ok
477 */
478
479
480
481	.seg	".text"
482	.align	4
483
	ENTRY(kcopy)

	! int kcopy(const void *from, void *to, size_t count)
	! %i0 = from, %i1 = to, %i2 = count (after the register window save).
	! Returns 0 on success, an errno value (delivered in %g1 by the
	! lofault machinery) if a pagefault cannot be resolved.
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyerr, %l6		! copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %l7	! save existing handler
	membar	#Sync			! sync error barrier (see copy.s)
	stn	%l6, [THREAD_REG + T_LOFAULT]	! set t_lofault
	!
	! Note that we carefully do *not* flag the setting of
	! t_lofault: kcopy leaves BCOPY_FLAG clear in %l6 so that the
	! error path below returns the errno instead of chaining to a
	! previously installed handler.
	!
	ba,pt	%ncc, .do_copy		! common code
	  mov	%l7, %l6		! delay: caller's handler -> %l6

/*
 * We got here because of a fault during kcopy or bcopy if a fault
 * handler existed when bcopy was called.
 * Errno value is in %g1.
 */
.copyerr:
	! Install .copyerr2 first: if restoring FP state below faults,
	! we panic rather than looping or returning with corrupt FP regs.
	set	.copyerr2, %l1
	membar	#Sync			! sync error barrier
	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6	! was FP state saved on our stack?
	bz	%icc, 1f		! no - skip the FP restore entirely
	  and	%l6, BCOPY_FLAG, %l1	! copy flag to %l1 (delay slot)

	membar	#Sync

	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3		! did the interrupted context own FP?
	bz	%icc, 4f		! no - just zero the FP registers
	  nop

	! restore fpregs from stack
	membar	#Sync
	add	%fp, STACK_BIAS - 257, %o2
	and	%o2, -64, %o2		! 64-byte align the save-area pointer
	ldda	[%o2]ASI_BLK_P, %d0
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d16
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d32
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d48
	membar	#Sync

	ba,pt	%ncc, 2f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO				! zero all of the fpregs
	wr	%o3, 0, %fprs		! restore fprs

2:	ldn	[THREAD_REG + T_LWP], %o2
	tst	%o2			! kernel thread (no lwp)?
	bnz,pt	%ncc, 1f		! lwp present - preempt count untouched
	  nop

	! Kernel thread: undo the kpreempt_disable() performed before
	! the block copy started (see pseudo-code at top of file).
	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	  stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	tst	%l0
	bnz,a,pt	%ncc, 1f	! Need to call kpreempt?
	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy. kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler. As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler. In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, COPY_FLAGS, %l6	! remove flags from lofault address
	membar	#Sync			! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary - only now that t_lofault has been
	! restored ("big rule" 4 in the block comment above)
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	  nop
	call	kpreempt
	  rdpr	%pil, %o0	! pass %pil
2:
	btst	BCOPY_FLAG, %l1		! came in via bcopy with a handler?
	bnz,pn	%ncc, 3f
	  nop
	ret				! kcopy path: return errno
	restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop
	SET_SIZE(kcopy)
607
608
609/*
610 * Copy a block of storage - must not overlap (from + len <= to).
611 * Registers: l6 - saved t_lofault
612 *
613 * Copy a page of memory.
614 * Assumes double word alignment and a count >= 256.
615 */
616
617	ENTRY(bcopy)
618
619	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
620	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
621	tst	%l6
622        !
623        ! We've already captured whether t_lofault was zero on entry.
624        ! We need to mark ourselves as being from bcopy since both
625        ! kcopy and bcopy use the same code path. If BCOPY_FLAG is set
626        ! and the saved lofault was zero, we won't reset lofault on
627        ! returning.
628        !
629	or	%l6, BCOPY_FLAG, %l6
630	bz,pt	%ncc, .do_copy
631	sethi	%hi(.copyerr), %o2
632	or	%o2, %lo(.copyerr), %o2
633	membar	#Sync			! sync error barrier
634	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
635
636.do_copy:
637	cmp	%i2, 12			! for small counts
638	blu	%ncc, .bytecp		! just copy bytes
639	  .empty
640
641	cmp	%i2, VIS_COPY_THRESHOLD	! for large counts
642	blu,pt	%ncc, .bcb_punt
643	  .empty
644
645	!
646	! Check to see if VIS acceleration is enabled
647	!
648	sethi	%hi(use_hw_bcopy), %o2
649	ld	[%o2 + %lo(use_hw_bcopy)], %o2
650	tst	%o2
651	bz,pn	%icc, .bcb_punt
652	  nop
653
654	subcc	%i1, %i0, %i3
655	bneg,a,pn %ncc, 1f
656	neg	%i3
6571:
658	/*
659	 * Compare against 256 since we should be checking block addresses
660	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
661	 * src = dest + (64 * 3) + 63.
662	 */
663	cmp	%i3, 256
664	blu,pn	%ncc, .bcb_punt
665	  nop
666
667	ldn	[THREAD_REG + T_LWP], %o3
668	tst	%o3
669	bnz,pt	%ncc, 1f
670	  nop
671
672	! kpreempt_disable();
673	ldsb	[THREAD_REG + T_PREEMPT], %o2
674	inc	%o2
675	stb	%o2, [THREAD_REG + T_PREEMPT]
676
6771:
678	rd	%fprs, %o2		! check for unused fp
679	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
680	btst	FPRS_FEF, %o2
681	bz,a	%icc, .do_blockcopy
682	  wr	%g0, FPRS_FEF, %fprs
683
684.bcb_fpregs_inuse:
685	cmp	%i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
686	bgeu	%ncc, 1f		!  if we have to save the fpregs)
687	  nop
688
689	tst	%o3
690	bnz,pt	%ncc, .bcb_punt
691	  nop
692
693	ldsb	[THREAD_REG + T_PREEMPT], %l0
694	deccc	%l0
695	bnz,pn	%icc, .bcb_punt
696	  stb	%l0, [THREAD_REG + T_PREEMPT]
697
698	! Check for a kernel preemption request
699	ldn	[THREAD_REG + T_CPU], %l0
700	ldub	[%l0 + CPU_KPRUNRUN], %l0
701	tst	%l0
702	bz,pt	%icc, .bcb_punt
703	  nop
704
705	! Attempt to preempt
706	call	kpreempt
707	  rdpr	  %pil, %o0		  ! pass %pil
708
709	ba,pt	%ncc, .bcb_punt
710	  nop
711
7121:
713	wr	%g0, FPRS_FEF, %fprs
714
715	! save in-use fpregs on stack
716	membar	#Sync
717	add	%fp, STACK_BIAS - 257, %o2
718	and	%o2, -64, %o2
719	stda	%d0, [%o2]ASI_BLK_P
720	add	%o2, 64, %o2
721	stda	%d16, [%o2]ASI_BLK_P
722	add	%o2, 64, %o2
723	stda	%d32, [%o2]ASI_BLK_P
724	add	%o2, 64, %o2
725	stda	%d48, [%o2]ASI_BLK_P
726	membar	#Sync
727
728.do_blockcopy:
729	membar	#StoreStore|#StoreLoad|#LoadStore
730
731	rd	%gsr, %o2
732	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
733
734	! Set the lower bit in the saved t_lofault to indicate
735	! that we need to clear the %fprs register on the way
736	! out
737	or	%l6, FPUSED_FLAG, %l6
738
739	! Swap src/dst since the code below is memcpy code
740	! and memcpy/bcopy have different calling sequences
741	mov	%i1, %i5
742	mov	%i0, %i1
743	mov	%i5, %i0
744
745!!! This code is nearly identical to the version in the sun4u
746!!! libc_psr.  Most bugfixes made to that file should be
747!!! merged into this routine.
748
749	andcc	%i0, 7, %o3
750	bz,pt	%ncc, blkcpy
751	sub	%o3, 8, %o3
752	neg	%o3
753	sub	%i2, %o3, %i2
754
755	! Align Destination on double-word boundary
756
7572:	ldub	[%i1], %o4
758	inc	%i1
759	inc	%i0
760	deccc	%o3
761	bgu	%ncc, 2b
762	stb	%o4, [%i0 - 1]
763blkcpy:
764	andcc	%i0, 63, %i3
765	bz,pn	%ncc, blalign		! now block aligned
766	sub	%i3, 64, %i3
767	neg	%i3			! bytes till block aligned
768	sub	%i2, %i3, %i2		! update %i2 with new count
769
770	! Copy %i3 bytes till dst is block (64 byte) aligned. use
771	! double word copies.
772
773	alignaddr %i1, %g0, %g1
774	ldd	[%g1], %d0
775	add	%g1, 8, %g1
7766:
777	ldd	[%g1], %d2
778	add	%g1, 8, %g1
779	subcc	%i3, 8, %i3
780	faligndata %d0, %d2, %d8
781	std	%d8, [%i0]
782	add	%i1, 8, %i1
783	bz,pn	%ncc, blalign
784	add	%i0, 8, %i0
785	ldd	[%g1], %d0
786	add	%g1, 8, %g1
787	subcc	%i3, 8, %i3
788	faligndata %d2, %d0, %d8
789	std	%d8, [%i0]
790	add	%i1, 8, %i1
791	bgu,pn	%ncc, 6b
792	add	%i0, 8, %i0
793
794blalign:
795	membar	#StoreLoad
796	! %i2 = total length
797	! %i3 = blocks	(length - 64) / 64
798	! %i4 = doubles remaining  (length - blocks)
799	sub	%i2, 64, %i3
800	andn	%i3, 63, %i3
801	sub	%i2, %i3, %i4
802	andn	%i4, 7, %i4
803	sub	%i4, 16, %i4
804	sub	%i2, %i4, %i2
805	sub	%i2, %i3, %i2
806
807	andn	%i1, 0x3f, %l7		! blk aligned address
808	alignaddr %i1, %g0, %g0		! gen %gsr
809
810	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
811	andcc	%l5, 7, %i5		! mask everything except bits 1,2 3
812	add	%i1, %i4, %i1
813	add	%i1, %i3, %i1
814
815	ldda	[%l7]ASI_BLK_P, %d0
816	add	%l7, 64, %l7
817	ldda	[%l7]ASI_BLK_P, %d16
818	add	%l7, 64, %l7
819	ldda	[%l7]ASI_BLK_P, %d32
820	add	%l7, 64, %l7
821	sub	%i3, 128, %i3
822
823	! switch statement to get us to the right 8 byte blk within a
824	! 64 byte block
825	cmp	 %i5, 4
826	bgeu,a	 hlf
827	cmp	 %i5, 6
828	cmp	 %i5, 2
829	bgeu,a	 sqtr
830	nop
831	cmp	 %i5, 1
832	be,a	 seg1
833	nop
834	ba,pt	 %ncc, seg0
835	nop
836sqtr:
837	be,a	 seg2
838	nop
839	ba,pt	 %ncc, seg3
840	nop
841
842hlf:
843	bgeu,a	 fqtr
844	nop
845	cmp	 %i5, 5
846	be,a	 seg5
847	nop
848	ba,pt	 %ncc, seg4
849	nop
850fqtr:
851	be,a	 seg6
852	nop
853	ba,pt	 %ncc, seg7
854	nop
855
856
857seg0:
858	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
859	FALIGN_D0
860	ldda	[%l7]ASI_BLK_P, %d0
861	stda	%d48, [%i0]ASI_BLK_P
862	add	%l7, 64, %l7
863	subcc	%i3, 64, %i3
864	bz,pn	%ncc, 0f
865	add	%i0, 64, %i0
866	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
867	FALIGN_D16
868	ldda	[%l7]ASI_BLK_P, %d16
869	stda	%d48, [%i0]ASI_BLK_P
870	add	%l7, 64, %l7
871	subcc	%i3, 64, %i3
872	bz,pn	%ncc, 1f
873	add	%i0, 64, %i0
874	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
875	FALIGN_D32
876	ldda	[%l7]ASI_BLK_P, %d32
877	stda	%d48, [%i0]ASI_BLK_P
878	add	%l7, 64, %l7
879	subcc	%i3, 64, %i3
880	bz,pn	%ncc, 2f
881	add	%i0, 64, %i0
882	ba,a,pt	%ncc, seg0
883
8840:
885	FALIGN_D16
886	stda	%d48, [%i0]ASI_BLK_P
887	add	%i0, 64, %i0
888	membar	#Sync
889	FALIGN_D32
890	stda	%d48, [%i0]ASI_BLK_P
891	ba,pt	%ncc, blkd0
892	add	%i0, 64, %i0
893
8941:
895	FALIGN_D32
896	stda	%d48, [%i0]ASI_BLK_P
897	add	%i0, 64, %i0
898	membar	#Sync
899	FALIGN_D0
900	stda	%d48, [%i0]ASI_BLK_P
901	ba,pt	%ncc, blkd16
902	add	%i0, 64, %i0
903
9042:
905	FALIGN_D0
906	stda	%d48, [%i0]ASI_BLK_P
907	add	%i0, 64, %i0
908	membar	#Sync
909	FALIGN_D16
910	stda	%d48, [%i0]ASI_BLK_P
911	ba,pt	%ncc, blkd32
912	add	%i0, 64, %i0
913
914seg1:
915	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
916	FALIGN_D2
917	ldda	[%l7]ASI_BLK_P, %d0
918	stda	%d48, [%i0]ASI_BLK_P
919	add	%l7, 64, %l7
920	subcc	%i3, 64, %i3
921	bz,pn	%ncc, 0f
922	add	%i0, 64, %i0
923	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
924	FALIGN_D18
925	ldda	[%l7]ASI_BLK_P, %d16
926	stda	%d48, [%i0]ASI_BLK_P
927	add	%l7, 64, %l7
928	subcc	%i3, 64, %i3
929	bz,pn	%ncc, 1f
930	add	%i0, 64, %i0
931	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
932	FALIGN_D34
933	ldda	[%l7]ASI_BLK_P, %d32
934	stda	%d48, [%i0]ASI_BLK_P
935	add	%l7, 64, %l7
936	subcc	%i3, 64, %i3
937	bz,pn	%ncc, 2f
938	add	%i0, 64, %i0
939	ba,a,pt	%ncc, seg1
9400:
941	FALIGN_D18
942	stda	%d48, [%i0]ASI_BLK_P
943	add	%i0, 64, %i0
944	membar	#Sync
945	FALIGN_D34
946	stda	%d48, [%i0]ASI_BLK_P
947	ba,pt	%ncc, blkd2
948	add	%i0, 64, %i0
949
9501:
951	FALIGN_D34
952	stda	%d48, [%i0]ASI_BLK_P
953	add	%i0, 64, %i0
954	membar	#Sync
955	FALIGN_D2
956	stda	%d48, [%i0]ASI_BLK_P
957	ba,pt	%ncc, blkd18
958	add	%i0, 64, %i0
959
9602:
961	FALIGN_D2
962	stda	%d48, [%i0]ASI_BLK_P
963	add	%i0, 64, %i0
964	membar	#Sync
965	FALIGN_D18
966	stda	%d48, [%i0]ASI_BLK_P
967	ba,pt	%ncc, blkd34
968	add	%i0, 64, %i0
969
970seg2:
971	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
972	FALIGN_D4
973	ldda	[%l7]ASI_BLK_P, %d0
974	stda	%d48, [%i0]ASI_BLK_P
975	add	%l7, 64, %l7
976	subcc	%i3, 64, %i3
977	bz,pn	%ncc, 0f
978	add	%i0, 64, %i0
979	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
980	FALIGN_D20
981	ldda	[%l7]ASI_BLK_P, %d16
982	stda	%d48, [%i0]ASI_BLK_P
983	add	%l7, 64, %l7
984	subcc	%i3, 64, %i3
985	bz,pn	%ncc, 1f
986	add	%i0, 64, %i0
987	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
988	FALIGN_D36
989	ldda	[%l7]ASI_BLK_P, %d32
990	stda	%d48, [%i0]ASI_BLK_P
991	add	%l7, 64, %l7
992	subcc	%i3, 64, %i3
993	bz,pn	%ncc, 2f
994	add	%i0, 64, %i0
995	ba,a,pt	%ncc, seg2
996
9970:
998	FALIGN_D20
999	stda	%d48, [%i0]ASI_BLK_P
1000	add	%i0, 64, %i0
1001	membar	#Sync
1002	FALIGN_D36
1003	stda	%d48, [%i0]ASI_BLK_P
1004	ba,pt	%ncc, blkd4
1005	add	%i0, 64, %i0
1006
10071:
1008	FALIGN_D36
1009	stda	%d48, [%i0]ASI_BLK_P
1010	add	%i0, 64, %i0
1011	membar	#Sync
1012	FALIGN_D4
1013	stda	%d48, [%i0]ASI_BLK_P
1014	ba,pt	%ncc, blkd20
1015	add	%i0, 64, %i0
1016
10172:
1018	FALIGN_D4
1019	stda	%d48, [%i0]ASI_BLK_P
1020	add	%i0, 64, %i0
1021	membar	#Sync
1022	FALIGN_D20
1023	stda	%d48, [%i0]ASI_BLK_P
1024	ba,pt	%ncc, blkd36
1025	add	%i0, 64, %i0
1026
1027seg3:
1028	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1029	FALIGN_D6
1030	ldda	[%l7]ASI_BLK_P, %d0
1031	stda	%d48, [%i0]ASI_BLK_P
1032	add	%l7, 64, %l7
1033	subcc	%i3, 64, %i3
1034	bz,pn	%ncc, 0f
1035	add	%i0, 64, %i0
1036	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1037	FALIGN_D22
1038	ldda	[%l7]ASI_BLK_P, %d16
1039	stda	%d48, [%i0]ASI_BLK_P
1040	add	%l7, 64, %l7
1041	subcc	%i3, 64, %i3
1042	bz,pn	%ncc, 1f
1043	add	%i0, 64, %i0
1044	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1045	FALIGN_D38
1046	ldda	[%l7]ASI_BLK_P, %d32
1047	stda	%d48, [%i0]ASI_BLK_P
1048	add	%l7, 64, %l7
1049	subcc	%i3, 64, %i3
1050	bz,pn	%ncc, 2f
1051	add	%i0, 64, %i0
1052	ba,a,pt	%ncc, seg3
1053
10540:
1055	FALIGN_D22
1056	stda	%d48, [%i0]ASI_BLK_P
1057	add	%i0, 64, %i0
1058	membar	#Sync
1059	FALIGN_D38
1060	stda	%d48, [%i0]ASI_BLK_P
1061	ba,pt	%ncc, blkd6
1062	add	%i0, 64, %i0
1063
10641:
1065	FALIGN_D38
1066	stda	%d48, [%i0]ASI_BLK_P
1067	add	%i0, 64, %i0
1068	membar	#Sync
1069	FALIGN_D6
1070	stda	%d48, [%i0]ASI_BLK_P
1071	ba,pt	%ncc, blkd22
1072	add	%i0, 64, %i0
1073
10742:
1075	FALIGN_D6
1076	stda	%d48, [%i0]ASI_BLK_P
1077	add	%i0, 64, %i0
1078	membar	#Sync
1079	FALIGN_D22
1080	stda	%d48, [%i0]ASI_BLK_P
1081	ba,pt	%ncc, blkd38
1082	add	%i0, 64, %i0
1083
1084seg4:
1085	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1086	FALIGN_D8
1087	ldda	[%l7]ASI_BLK_P, %d0
1088	stda	%d48, [%i0]ASI_BLK_P
1089	add	%l7, 64, %l7
1090	subcc	%i3, 64, %i3
1091	bz,pn	%ncc, 0f
1092	add	%i0, 64, %i0
1093	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1094	FALIGN_D24
1095	ldda	[%l7]ASI_BLK_P, %d16
1096	stda	%d48, [%i0]ASI_BLK_P
1097	add	%l7, 64, %l7
1098	subcc	%i3, 64, %i3
1099	bz,pn	%ncc, 1f
1100	add	%i0, 64, %i0
1101	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1102	FALIGN_D40
1103	ldda	[%l7]ASI_BLK_P, %d32
1104	stda	%d48, [%i0]ASI_BLK_P
1105	add	%l7, 64, %l7
1106	subcc	%i3, 64, %i3
1107	bz,pn	%ncc, 2f
1108	add	%i0, 64, %i0
1109	ba,a,pt	%ncc, seg4
1110
	!
	! Segment-loop tail cases (continuation of the block-copy segment
	! loop that begins above this excerpt).  Each numeric label below
	! is reached when %i3 (bytes remaining in full 64-byte blocks)
	! hits zero after the 1st, 2nd or 3rd chunk of its segment.  No
	! more loads are needed, so flush the one or two aligned blocks
	! still held in the fp registers and branch to the matching blkdN
	! dribble entry for any leftover bytes.
	!
0:
	FALIGN_D24
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D40
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd8
	add	%i0, 64, %i0

1:
	FALIGN_D40
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D8
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd24
	add	%i0, 64, %i0

2:
	FALIGN_D8
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D24
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd40
	add	%i0, 64, %i0

	!
	! seg5: main copy loop for this particular source alignment.
	! Rotates three 64-byte load buffers (%d0, %d16, %d32) while the
	! FALIGN_D10/D26/D42 macros align previously-loaded data into
	! %d48 for the block store.
	! %l7 = block-aligned src, %i0 = dst, %i3 = full-block byte count.
	!
seg5:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D10
	ldda	[%l7]ASI_BLK_P, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D26
	ldda	[%l7]ASI_BLK_P, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D42
	ldda	[%l7]ASI_BLK_P, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, seg5

	! seg5 tails: flush the remaining aligned blocks, then dribble.
0:
	FALIGN_D26
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D42
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd10
	add	%i0, 64, %i0

1:
	FALIGN_D42
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D10
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd26
	add	%i0, 64, %i0

2:
	FALIGN_D10
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D26
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd42
	add	%i0, 64, %i0

	!
	! seg6: same triple-buffered loop, FALIGN_D12/D28/D44 rotation.
	!
seg6:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D12
	ldda	[%l7]ASI_BLK_P, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D28
	ldda	[%l7]ASI_BLK_P, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D44
	ldda	[%l7]ASI_BLK_P, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, seg6

	! seg6 tails: flush the remaining aligned blocks, then dribble.
0:
	FALIGN_D28
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D44
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd12
	add	%i0, 64, %i0

1:
	FALIGN_D44
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D12
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd28
	add	%i0, 64, %i0

2:
	FALIGN_D12
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D28
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd44
	add	%i0, 64, %i0

	!
	! seg7: same triple-buffered loop, FALIGN_D14/D30/D46 rotation.
	!
seg7:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D14
	ldda	[%l7]ASI_BLK_P, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D30
	ldda	[%l7]ASI_BLK_P, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D46
	ldda	[%l7]ASI_BLK_P, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, seg7

	! seg7 tails: flush the remaining aligned blocks, then dribble.
0:
	FALIGN_D30
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D46
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd14
	add	%i0, 64, %i0

1:
	FALIGN_D46
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D14
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd30
	add	%i0, 64, %i0

2:
	FALIGN_D14
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D30
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, blkd46
	add	%i0, 64, %i0
1311
1312
	!
	! dribble out the last partial block
	!
	! %i4 holds the count of remaining bytes that are already present
	! in the fp registers.  Each blkdN entry emits one aligned 8-byte
	! doubleword, produced by faligndata from the (dN, dN+2) pair,
	! while at least 8 bytes remain.  The last entry of each register
	! group (blkd14 / blkd30 / blkd46) instead saves its register into
	! %d0 and falls into blkleft, which continues with fresh ldd loads.
	!
blkd0:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d0, %d2, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd2:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d2, %d4, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd4:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d4, %d6, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd6:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d6, %d8, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd8:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d8, %d10, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd10:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d10, %d12, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd12:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d12, %d14, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd14:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	fsrc1	%d14, %d0		! last reg of group -> %d0 for blkleft
	ba,a,pt	%ncc, blkleft

blkd16:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d16, %d18, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd18:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d18, %d20, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd20:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d20, %d22, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd22:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d22, %d24, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd24:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d24, %d26, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd26:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d26, %d28, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd28:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d28, %d30, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd30:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	fsrc1	%d30, %d0		! last reg of group -> %d0 for blkleft
	ba,a,pt	%ncc, blkleft
blkd32:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d32, %d34, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd34:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d34, %d36, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd36:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d36, %d38, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd38:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d38, %d40, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd40:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d40, %d42, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd42:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d42, %d44, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd44:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	faligndata %d44, %d46, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
blkd46:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, blkdone
	fsrc1	%d46, %d0		! last reg of group -> %d0 for blkleft
1457
	!
	! blkleft: keep emitting aligned 8-byte doublewords with fresh ldd
	! loads from %l7.  Unrolled x2, alternating %d0 and %d2 as the
	! "previous" register for faligndata.  %i4 = bytes remaining.
	!
blkleft:
1:
	ldd	[%l7], %d2
	add	%l7, 8, %l7
	subcc	%i4, 8, %i4
	faligndata %d0, %d2, %d8
	std	%d8, [%i0]
	blu,pn	%ncc, blkdone
	add	%i0, 8, %i0
	ldd	[%l7], %d0
	add	%l7, 8, %l7
	subcc	%i4, 8, %i4
	faligndata %d2, %d0, %d8
	std	%d8, [%i0]
	bgeu,pt	%ncc, 1b
	add	%i0, 8, %i0

	!
	! blkdone: copy the final %i2 (sub-doubleword) bytes one at a
	! time; here %i1 is the source pointer and %i0 the destination.
	!
blkdone:
	tst	%i2
	bz,pt	%ncc, .bcb_exit
	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0

7:	ldub	[%i1], %i4
	inc	%i1
	inc	%i0
	deccc	%i2
	bgu,pt	%ncc, 7b
	  stb	  %i4, [%i0 - 1]
1486
	!
	! Common exit for the VIS block-copy path.  If the fp registers
	! were used (FPUSED_FLAG in %l6): restore %gsr, restore or zero
	! the fpregs, restore %fprs, and re-enable kernel preemption for
	! lwp-less (kernel) threads, calling kpreempt if a preemption was
	! requested while it was disabled.  Finally restore the caller's
	! t_lofault if one was set, and return 0.
	!
.bcb_exit:
	membar	#StoreLoad|#StoreStore
	btst	FPUSED_FLAG, %l6
	bz	%icc, 1f
	  and	%l6, COPY_FLAGS, %l1	! Store flags in %l1
					! We can't clear the flags from %l6 yet.
					! If there's an error, .copyerr will
					! need them

	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz	%icc, 4f
	  nop

	! restore fpregs from stack
	membar	#Sync
	add	%fp, STACK_BIAS - 257, %o2	! 4 blocks + alignment slop
	and	%o2, -64, %o2			! round down to 64-byte block
	ldda	[%o2]ASI_BLK_P, %d0
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d16
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d32
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d48
	membar	#Sync

	ba,pt	%ncc, 2f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO				! zero all of the fpregs
	wr	%o3, 0, %fprs		! restore fprs

	! Re-enable preemption if this is a kernel thread (no lwp).
2:	ldn	[THREAD_REG + T_LWP], %o2
	tst	%o2
	bnz,pt	%ncc, 1f
	  nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	  stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	tst	%l0
	bnz,a,pt	%ncc, 1f	! Need to call kpreempt?
	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag

1:
	btst	BCOPY_FLAG, %l1
	bz,pn	%icc, 3f
	  andncc	%l6, COPY_FLAGS, %l6

	!
	! Here via bcopy. Check to see if the handler was NULL.
	! If so, just return quietly. Otherwise, reset the
	! handler and go home.
	!
	bnz,pn	%ncc, 3f
	  nop

	!
	! Null handler.  Check for kpreempt flag, call if necessary,
	! then return.
	!
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	  nop
	call	kpreempt
	  rdpr	%pil, %o0	! pass %pil
2:
	ret
	  restore	%g0, 0, %o0

	!
	! Here via kcopy or bcopy with a handler.  Reset the
	! fault handler.
	!
3:
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 4f
	  nop
	call	kpreempt
	  rdpr	%pil, %o0
4:
	ret
	  restore	%g0, 0, %o0
1584
.bcb_punt:
	!
	! use aligned transfers where possible
	!
	xor	%i0, %i1, %o4		! xor from and to address
	btst	7, %o4			! if lower three bits zero
	bz	%icc, .aldoubcp		! can align on double boundary
	.empty	! assembler complaints about label

	xor	%i0, %i1, %o4		! xor from and to address
	btst	3, %o4			! if lower two bits zero
	bz	%icc, .alwordcp		! can align on word boundary
	btst	3, %i0			! delay slot, from address unaligned?
	!
	! use aligned reads and writes where possible
	! this differs from wordcp in that it copes
	! with odd alignment between source and destination
	! using word reads and writes with the proper shifts
	! in between to align transfers to and from memory
	! i0 - src address, i1 - dest address, i2 - count
	! i3, i4 - tmps for used generating complete word
	! i5 (word to write)
	! l0 size in bits of upper part of source word (US)
	! l1 size in bits of lower part of source word (LS = 32 - US)
	! l2 size in bits of upper part of destination word (UD)
	! l3 size in bits of lower part of destination word (LD = 32 - UD)
	! l4 number of bytes leftover after aligned transfers complete
	! l5 the number 32
	!
	mov	32, %l5			! load an oft-needed constant
	bz	.align_dst_only
	btst	3, %i1			! is destination address aligned?
	clr	%i4			! clear registers used in either case
	bz	%icc, .align_src_only
	clr	%l0
	!
	! both source and destination addresses are unaligned
	!
1:					! align source
	ldub	[%i0], %i3		! read a byte from source address
	add	%i0, 1, %i0		! increment source address
	or	%i4, %i3, %i4		! or in with previous bytes (if any)
	btst	3, %i0			! is source aligned?
	add	%l0, 8, %l0		! increment size of upper source (US)
	bnz,a	1b
	sll	%i4, 8, %i4		! make room for next byte

	sub	%l5, %l0, %l1		! generate shift left count (LS)
	sll	%i4, %l1, %i4		! prepare to get rest
	ld	[%i0], %i3		! read a word
	add	%i0, 4, %i0		! increment source address
	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
	or	%i4, %i5, %i5		! merge
	mov	24, %l3			! align destination
1:
	srl	%i5, %l3, %i4		! prepare to write a single byte
	stb	%i4, [%i1]		! write a byte
	add	%i1, 1, %i1		! increment destination address
	sub	%i2, 1, %i2		! decrement count
	btst	3, %i1			! is destination aligned?
	bnz,a	1b
	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
	sub	%l5, %l3, %l2		! generate shift left count (UD)
	sll	%i5, %l2, %i5		! move leftover into upper bytes
	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
	bgu	%ncc, .more_needed	! need more to fill than we have
	nop

	sll	%i3, %l1, %i3		! clear upper used byte(s)
	srl	%i3, %l1, %i3
	! get the odd bytes between alignments
	sub	%l0, %l2, %l0		! regenerate shift count
	sub	%l5, %l0, %l1		! generate new shift left count (LS)
	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
	srl	%i3, %l0, %i4
	or	%i5, %i4, %i5
	st	%i5, [%i1]		! write a word
	subcc	%i2, 4, %i2		! decrement count
	bz	%ncc, .unalign_out
	add	%i1, 4, %i1		! increment destination address

	b	2f
	sll	%i3, %l1, %i5		! get leftover into upper bits
.more_needed:
	sll	%i3, %l0, %i3		! save remaining byte(s)
	srl	%i3, %l0, %i3
	sub	%l2, %l0, %l1		! regenerate shift count
	sub	%l5, %l1, %l0		! generate new shift left count
	sll	%i3, %l1, %i4		! move to fill empty space
	b	3f
	or	%i5, %i4, %i5		! merge to complete word
	!
	! the source address is aligned and destination is not
	!
.align_dst_only:
	ld	[%i0], %i4		! read a word
	add	%i0, 4, %i0		! increment source address
	mov	24, %l0			! initial shift alignment count
1:
	srl	%i4, %l0, %i3		! prepare to write a single byte
	stb	%i3, [%i1]		! write a byte
	add	%i1, 1, %i1		! increment destination address
	sub	%i2, 1, %i2		! decrement count
	btst	3, %i1			! is destination aligned?
	bnz,a	1b
	sub	%l0, 8, %l0		! delay slot, decrement shift count
	!
	! common word-copy entry: %l0 holds the source shift, %i5 the
	! carried-over upper destination bits.
	!
.xfer:
	sub	%l5, %l0, %l1		! generate shift left count
	sll	%i4, %l1, %i5		! get leftover
3:
	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2:
	ld	[%i0], %i3		! read a source word
	add	%i0, 4, %i0		! increment source address
	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
	st	%i5, [%i1]		! write a destination word
	subcc	%i2, 4, %i2		! decrement count
	bz	%ncc, .unalign_out	! check if done
	add	%i1, 4, %i1		! increment destination address
	b	2b			! loop
	sll	%i3, %l1, %i5		! get leftover
	!
	! aligned words are done; %l4 bytes remain.  Drain any bytes
	! still held in %i3, then let dbytecp finish the rest.
	!
.unalign_out:
	tst	%l4			! any bytes leftover?
	bz	%ncc, .cpdone
	.empty				! allow next instruction in delay slot
1:
	sub	%l0, 8, %l0		! decrement shift
	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
	stb	%i4, [%i1]		! write a byte
	subcc	%l4, 1, %l4		! decrement count
	bz	%ncc, .cpdone		! done?
	add	%i1, 1, %i1		! increment destination
	tst	%l0			! any more previously read bytes
	bnz	%ncc, 1b		! we have leftover bytes
	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
	b	.dbytecp		! let dbytecp do the rest
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
	!
	! the destination address is aligned and the source is not
	!
.align_src_only:
	ldub	[%i0], %i3		! read a byte from source address
	add	%i0, 1, %i0		! increment source address
	or	%i4, %i3, %i4		! or in with previous bytes (if any)
	btst	3, %i0			! is source aligned?
	add	%l0, 8, %l0		! increment shift count (US)
	bnz,a	.align_src_only
	sll	%i4, 8, %i4		! make room for next byte
	b,a	.xfer
	!
	! if from address unaligned for double-word moves,
	! move bytes till it is, if count is < 56 it could take
	! longer to align the thing than to do the transfer
	! in word size chunks right away
	!
.aldoubcp:
	cmp	%i2, 56			! if count < 56, use wordcp, it takes
	blu,a	%ncc, .alwordcp		! longer to align doubles than words
	mov	3, %o0			! mask for word alignment
	call	.alignit		! copy bytes until aligned
	mov	7, %o0			! mask for double alignment
	!
	! source and destination are now double-word aligned
	! i3 has aligned count returned by alignit
	!
	and	%i2, 7, %i2		! unaligned leftover count
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
5:
	ldx	[%i0+%i1], %o4		! read from address
	stx	%o4, [%i1]		! write at destination address
	subcc	%i3, 8, %i3		! dec count
	bgu	%ncc, 5b
	add	%i1, 8, %i1		! delay slot, inc to address
	cmp	%i2, 4			! see if we can copy a word
	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
	.empty
	!
	! for leftover bytes we fall into wordcp, if needed
	!
.wordcp:
	and	%i2, 3, %i2		! unaligned leftover count
5:
	ld	[%i0+%i1], %o4		! read from address
	st	%o4, [%i1]		! write at destination address
	subcc	%i3, 4, %i3		! dec count
	bgu	%ncc, 5b
	add	%i1, 4, %i1		! delay slot, inc to address
	b,a	.dbytecp

	! we come here to align copies on word boundaries
.alwordcp:
	call	.alignit		! go word-align it
	mov	3, %o0			! bits that must be zero to be aligned
	b	.wordcp
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1783
	!
	! byte copy, works with any alignment
	!
.bytecp:
	b	.dbytecp
	sub	%i0, %i1, %i0		! i0 gets difference of src and dst

	!
	! differenced byte copy, works with any alignment
	! assumes dest in %i1 and (source - dest) in %i0
	!
1:
	stb	%o4, [%i1]		! write to address
	inc	%i1			! inc to address
.dbytecp:
	deccc	%i2			! dec count
	bgeu,a	%ncc, 1b		! loop till done
	ldub	[%i0+%i1], %o4		! read from address
	!
	! FPUSED_FLAG will not have been set in any path leading to
	! this point. No need to deal with it.
	!
.cpdone:
	btst	BCOPY_FLAG, %l6
	bz,pn	%icc, 2f
	andncc	%l6, BCOPY_FLAG, %l6
	!
	! Here via bcopy. Check to see if the handler was NULL.
	! If so, just return quietly. Otherwise, reset the
	! handler and go home.
	!
	bnz,pn	%ncc, 2f
	nop
	!
	! Null handler.
	!
	ret
	restore %g0, 0, %o0
	!
	! Here via kcopy or bcopy with a handler.  Reset the
	! fault handler.
	!
2:
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g0, 0, %o0		! return (0)
1831
/*
 * Common code used to align transfers on word and doubleword
 * boundaries.  Aligns source and destination and returns a count
 * of aligned bytes to transfer in %i3
 *
 * In:  %o0 = alignment mask (3 or 7), %i0 = src, %i1 = dst, %i2 = count
 * Out: %i0/%i1 advanced past the alignment bytes, %i2 reduced,
 *      %i3 = count of bytes that can be moved in aligned chunks.
 * Called via "call" from within bcopy; returns with retl (%o7).
 */
1:
	inc	%i0			! inc from
	stb	%o4, [%i1]		! write a byte
	inc	%i1			! inc to
	dec	%i2			! dec count
.alignit:
	btst	%o0, %i0		! %o0 is bit mask to check for alignment
	bnz,a	1b
	ldub	[%i0], %o4		! read next byte

	retl
	andn	%i2, %o0, %i3		! return size of aligned bytes
	SET_SIZE(bcopy)
1850
/*
 * Block copy with possibly overlapped operands.
 *
 * void ovbcopy(const void *from, void *to, size_t count)
 *	%o0 = from, %o1 = to, %o2 = count
 *
 * If the regions do not overlap (count <= |from - to|) this tail-calls
 * bcopy; otherwise it does a byte-at-a-time copy in whichever
 * direction is safe (backwards when from < to).
 */
	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop
	SET_SIZE(ovbcopy)
1896
/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.
 *
 *	%i0 = source page address, %i1 = destination page address.
 * Saves any live fpregs (%d0-%d46) on the stack, then runs a
 * double-buffered loop: load 64 bytes into %d0 or %d16 alternately,
 * stage through %d32-%d46, and block-store to the destination.
 */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	save	%sp, -SA(MINFRAME + 4*64), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz	1f
	membar	#Sync

	! save in-use fpregs on stack
	add	%fp, STACK_BIAS - 193, %l1	! 3 blocks + alignment slop
	and	%l1, -64, %l1			! round down to 64-byte block
	stda	%d0, [%l1]ASI_BLK_P
	add	%l1, 64, %l3
	stda	%d16, [%l3]ASI_BLK_P
	add	%l3, 64, %l3
	stda	%d32, [%l3]ASI_BLK_P
	membar	#Sync

1:	wr	%g0, FPRS_FEF, %fprs
	ldda	[%i0]ASI_BLK_P, %d0	! prime the pipeline with block 0
	add	%i0, 64, %i0
	set	PAGESIZE - 64, %i2

	! main loop: copy two 64-byte blocks per iteration, alternating
	! %d0 and %d16 as the load target while the other is staged
	! through %d32-%d46 and stored.
2:	ldda	[%i0]ASI_BLK_P, %d16
	fsrc1	%d0, %d32
	fsrc1	%d2, %d34
	fsrc1	%d4, %d36
	fsrc1	%d6, %d38
	fsrc1	%d8, %d40
	fsrc1	%d10, %d42
	fsrc1	%d12, %d44
	fsrc1	%d14, %d46
	stda	%d32, [%i1]ASI_BLK_P
	add	%i0, 64, %i0
	subcc	%i2, 64, %i2
	bz,pn	%ncc, 3f
	add	%i1, 64, %i1
	ldda	[%i0]ASI_BLK_P, %d0
	fsrc1	%d16, %d32
	fsrc1	%d18, %d34
	fsrc1	%d20, %d36
	fsrc1	%d22, %d38
	fsrc1	%d24, %d40
	fsrc1	%d26, %d42
	fsrc1	%d28, %d44
	fsrc1	%d30, %d46
	stda	%d32, [%i1]ASI_BLK_P
	add	%i0, 64, %i0
	sub	%i2, 64, %i2
	ba,pt	%ncc, 2b
	add	%i1, 64, %i1

	! done: store the final block, then restore or discard fp state
3:	membar	#Sync
	btst	FPRS_FEF, %l0
	bz	4f
	stda	%d16, [%i1]ASI_BLK_P

	! restore fpregs from stack
	membar	#Sync
	ldda	[%l1]ASI_BLK_P, %d0
	add	%l1, 64, %l3
	ldda	[%l3]ASI_BLK_P, %d16
	add	%l3, 64, %l3
	ldda	[%l3]ASI_BLK_P, %d32

4:	wr	%l0, 0, %fprs		! restore fprs
	membar #Sync
	ret
	restore	%g0, 0, %o0
	SET_SIZE(hwblkpagecopy)
1981
1982
1983/*
1984 * Transfer data to and from user space -
1985 * Note that these routines can cause faults
1986 * It is assumed that the kernel has nothing at
1987 * less than KERNELBASE in the virtual address space.
1988 *
1989 * Note that copyin(9F) and copyout(9F) are part of the
1990 * DDI/DKI which specifies that they return '-1' on "errors."
1991 *
1992 * Sigh.
1993 *
1994 * So there's two extremely similar routines - xcopyin() and xcopyout()
1995 * which return the errno that we've faithfully computed.  This
1996 * allows other callers (e.g. uiomove(9F)) to work correctly.
1997 * Given that these are used pretty heavily, we expand the calling
1998 * sequences inline for all flavours (rather than making wrappers).
1999 *
2000 * There are also stub routines for xcopyout_little and xcopyin_little,
2001 * which currently are intended to handle requests of <= 16 bytes from
2002 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2003 * is left as an exercise...
2004 */
2005
2006/*
2007 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2008 *
2009 * General theory of operation:
2010 *
2011 * The only difference between default_copy{in,out} and
2012 * default_xcopy{in,out} is in the error handling routine they invoke
2013 * when a memory access error is seen. default_xcopyOP returns the errno
2014 * while default_copyOP returns -1 (see above). copy{in,out}_noerr set
2015 * a special flag (by oring the value 2 into the fault handler address)
2016 * if they are called with a fault handler already in place. That flag
2017 * causes the default handlers to trampoline to the previous handler
2018 * upon an error.
2019 *
2020 * None of the copyops routines grab a window until it's decided that
2021 * we need to do a HW block copy operation. This saves a window
2022 * spill/fill when we're called during socket ops. The typical IO
2023 * path won't cause spill/fill traps.
2024 *
2025 * This code uses a set of 4 limits for the maximum size that will
2026 * be copied given a particular input/output address alignment.
2027 * the default limits are:
2028 *
2029 * single byte aligned - 900 (hw_copy_limit_1)
2030 * two byte aligned - 1800 (hw_copy_limit_2)
2031 * four byte aligned - 3600 (hw_copy_limit_4)
2032 * eight byte aligned - 7200 (hw_copy_limit_8)
2033 *
2034 * If the value for a particular limit is zero, the copy will be done
2035 * via the copy loops rather than VIS.
2036 *
2037 * Flow:
2038 *
2039 * If count == zero return zero.
2040 *
2041 * Store the previous lo_fault handler into %g6.
2042 * Place our secondary lofault handler into %g5.
2043 * Place the address of our nowindow fault handler into %o3.
2044 * Place the address of the windowed fault handler into %o4.
2045 * --> We'll use this handler if we end up grabbing a window
2046 * --> before we use VIS instructions.
2047 *
2048 * If count is less than or equal to SMALL_LIMIT (7) we
2049 * always do a byte for byte copy.
2050 *
2051 * If count is > SMALL_LIMIT, we check the alignment of the input
2052 * and output pointers. Based on the alignment we check count
2053 * against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If
2054 * we're larger than VIS_COPY_THRESHOLD, we check against a limit based
2055 * on detected alignment. If we exceed the alignment value we copy
2056 * via VIS instructions.
2057 *
2058 * If we don't exceed one of the limits, we store -count in %o3,
2059 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2060 * on in our basic copy loop in %o2. Following this we branch
2061 * to the appropriate copy loop and copy that many chunks.
2062 * Since we've been adding the chunk size to %o3 each time through
2063 * as well as decrementing %o2, we can tell if any data is
2064 * is left to be copied by examining %o3. If that is zero, we're
2065 * done and can go home. If not, we figure out what the largest
2066 * chunk size left to be copied is and branch to that copy loop
2067 * unless there's only one byte left. We load that as we're
2068 * branching to code that stores it just before we return.
2069 *
2070 * There is one potential situation in which we start to do a VIS
2071 * copy but decide to punt and return to the copy loops. There is
2072 * (in the default configuration) a window of 256 bytes between
2073 * the single byte aligned copy limit and what VIS treats as its
2074 * minimum if floating point is in use in the calling app. We need
2075 * to be prepared to handle this. See the .small_copyOP label for
2076 * details.
2077 *
2078 * Fault handlers are invoked if we reference memory that has no
2079 * current mapping.  All forms share the same copyio_fault handler.
2080 * This routine handles fixing up the stack and general housecleaning.
2081 * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation.  The handlers
2083 * for default_copyOP and copyOP_noerr are found at the end of
2084 * default_copyout. The handlers for default_xcopyOP are found at the
2085 * end of xdefault_copyin.
2086 */
2087
2088/*
2089 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2090 */
2091
2092/*
2093 * We save the arguments in the following registers in case of a fault:
2094 * 	kaddr - %g2
2095 * 	uaddr - %g3
2096 * 	count - %g4
2097 */
2098#define	SAVE_SRC	%g2
2099#define	SAVE_DST	%g3
2100#define	SAVE_COUNT	%g4
2101
2102#define	REAL_LOFAULT		%g5
2103#define	SAVED_LOFAULT		%g6
2104
/*
 * Generic copyio fault handler.  This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
 * This allows us to share common code for all the flavors of the copy
 * operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 */
	ENTRY(copyio_fault)
	btst	FPUSED_FLAG, SAVED_LOFAULT
	bz	1f
	  andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT

	! fp was in use: restore %gsr and either reload the saved fpregs
	! or zero them, then restore %fprs.
	membar	#Sync

	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz	4f
	  nop

	! restore fpregs from stack
	membar	#Sync
	add	%fp, STACK_BIAS - 257, %o2	! 4 blocks + alignment slop
	and	%o2, -64, %o2			! round down to 64-byte block
	ldda	[%o2]ASI_BLK_P, %d0
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d16
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d32
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d48
	membar	#Sync

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO				! zero all of the fpregs
	wr	%o3, 0, %fprs		! restore fprs

1:

	restore				! back to the caller's window

	! hand the original (kaddr, uaddr, count) args to the real handler
	mov	SAVE_SRC, %o0
	mov	SAVE_DST, %o1
	jmp	REAL_LOFAULT
	  mov	SAVE_COUNT, %o2
	SET_SIZE(copyio_fault)
2160
	! Leaf-mode variant of copyio_fault: no register window was taken
	! and the fp unit was never touched, so just restore the caller's
	! t_lofault, reload the saved args, and jump to the real handler.
	ENTRY(copyio_fault_nowindow)
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	mov	SAVE_SRC, %o0
	mov	SAVE_DST, %o1
	jmp	REAL_LOFAULT
	  mov	SAVE_COUNT, %o2
	SET_SIZE(copyio_fault_nowindow)
2170
2171	ENTRY(copyout)
2172	sethi	%hi(.copyout_err), REAL_LOFAULT
2173	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2174
2175.do_copyout:
2176	!
2177	! Check the length and bail if zero.
2178	!
2179	tst	%o2
2180	bnz,pt	%ncc, 1f
2181	  nop
2182	retl
2183	  clr	%o0
21841:
2185	sethi	%hi(copyio_fault), %o4
2186	or	%o4, %lo(copyio_fault), %o4
2187	sethi	%hi(copyio_fault_nowindow), %o3
2188	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2189	or	%o3, %lo(copyio_fault_nowindow), %o3
2190	membar	#Sync
2191	stn	%o3, [THREAD_REG + T_LOFAULT]
2192
2193	mov	%o0, SAVE_SRC
2194	mov	%o1, SAVE_DST
2195	mov	%o2, SAVE_COUNT
2196
2197	!
2198	! Check to see if we're more than SMALL_LIMIT (7 bytes).
2199	! Run in leaf mode, using the %o regs as our input regs.
2200	!
2201	subcc	%o2, SMALL_LIMIT, %o3
2202	bgu,a,pt %ncc, .dco_ns
2203	or	%o0, %o1, %o3
2204	!
2205	! What was previously ".small_copyout"
2206	! Do full differenced copy.
2207	!
2208.dcobcp:
2209	sub	%g0, %o2, %o3		! negate count
2210	add	%o0, %o2, %o0		! make %o0 point at the end
2211	add	%o1, %o2, %o1		! make %o1 point at the end
2212	ba,pt	%ncc, .dcocl
2213	ldub	[%o0 + %o3], %o4	! load first byte
2214	!
2215	! %o0 and %o2 point at the end and remain pointing at the end
2216	! of their buffers. We pull things out by adding %o3 (which is
2217	! the negation of the length) to the buffer end which gives us
2218	! the curent location in the buffers. By incrementing %o3 we walk
2219	! through both buffers without having to bump each buffer's
2220	! pointer. A very fast 4 instruction loop.
2221	!
2222	.align 16
2223.dcocl:
2224	stba	%o4, [%o1 + %o3]ASI_USER
2225	inccc	%o3
2226	bl,a,pt	%ncc, .dcocl
2227	ldub	[%o0 + %o3], %o4
2228	!
2229	! We're done. Go home.
2230	!
2231	membar	#Sync
2232	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2233	retl
2234	clr	%o0
2235	!
2236	! Try aligned copies from here.
2237	!
2238.dco_ns:
2239	! %o0 = kernel addr (to be copied from)
2240	! %o1 = user addr (to be copied to)
2241	! %o2 = length
2242	! %o3 = %o1 | %o2 (used for alignment checking)
2243	! %o4 is alternate lo_fault
2244	! %o5 is original lo_fault
2245	!
2246	! See if we're single byte aligned. If we are, check the
2247	! limit for single byte copies. If we're smaller or equal,
2248	! bounce to the byte for byte copy loop. Otherwise do it in
2249	! HW (if enabled).
2250	!
2251	btst	1, %o3
2252	bz,pt	%icc, .dcoh8
2253	btst	7, %o3
2254	!
2255	! Single byte aligned. Do we do it via HW or via
2256	! byte for byte? Do a quick no memory reference
2257	! check to pick up small copies.
2258	!
2259	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2260	bleu,pt	%ncc, .dcobcp
2261	sethi	%hi(hw_copy_limit_1), %o3
2262	!
2263	! Big enough that we need to check the HW limit for
2264	! this size copy.
2265	!
2266	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2267	!
2268	! Is HW copy on? If not, do everything byte for byte.
2269	!
2270	tst	%o3
2271	bz,pn	%icc, .dcobcp
2272	subcc	%o3, %o2, %o3
2273	!
2274	! If we're less than or equal to the single byte copy limit,
2275	! bop to the copy loop.
2276	!
2277	bge,pt	%ncc, .dcobcp
2278	nop
2279	!
2280	! We're big enough and copy is on. Do it with HW.
2281	!
2282	ba,pt	%ncc, .big_copyout
2283	nop
.dcoh8:
	!
	! 8 byte aligned? (cond codes set by btst 7, %o3 in the delay
	! slot of the branch that got us here)
	!
	bnz,a	%ncc, .dcoh4			! not 8-byte aligned
	btst	3, %o3				! delay (annulled if aligned):
						! cond codes for .dcoh4
	!
	! See if we're in the "small range".
	! If so, go off and do the copy.
	! If not, load the hard limit. %o3 is
	! available for reuse.
	!
	subcc	%o2, VIS_COPY_THRESHOLD, %o3
	bleu,pt	%ncc, .dcos8			! small: aligned 8-byte loop
	sethi	%hi(hw_copy_limit_8), %o3	! delay: start limit load
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	!
	! If it's zero, there's no HW bcopy.
	! Bop off to the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcos8
	subcc	%o3, %o2, %o3			! delay: limit - length
	!
	! We're negative if our size is larger than hw_copy_limit_8.
	!
	bge,pt	%ncc, .dcos8			! length <= limit: SW loop
	nop
	!
	! HW assist is on and we're large enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above: point both pointers past the end and run
	! a negative offset up toward zero.
	!
	add	%o0, %o2, %o0			! src end
	add	%o1, %o2, %o1			! dst end
	sub	%g0, %o2, %o3			! %o3 = -length
	ba,pt	%ncc, .dodebc
	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
2327	!
2328	! 4 byte aligned?
2329	!
2330.dcoh4:
2331	bnz,pn	%ncc, .dcoh2
2332	!
2333	! See if we're in the "small range".
2334	! If so, go off an do the copy.
2335	! If not, load the hard limit. %o3 is
2336	! available for reuse.
2337	!
2338	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2339	bleu,pt	%ncc, .dcos4
2340	sethi	%hi(hw_copy_limit_4), %o3
2341	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2342	!
2343	! If it's zero, there's no HW bcopy.
2344	! Bop off to the aligned copy.
2345	!
2346	tst	%o3
2347	bz,pn	%icc, .dcos4
2348	subcc	%o3, %o2, %o3
2349	!
2350	! We're negative if our size is larger than hw_copy_limit_4.
2351	!
2352	bge,pt	%ncc, .dcos4
2353	nop
2354	!
2355	! HW assist is on and we're large enough. Do it.
2356	!
2357	ba,pt	%ncc, .big_copyout
2358	nop
2359.dcos4:
2360	add	%o0, %o2, %o0
2361	add	%o1, %o2, %o1
2362	sub	%g0, %o2, %o3
2363	ba,pt	%ncc, .dodfbc
2364	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
2365	!
2366	! We must be 2 byte aligned. Off we go.
2367	! The check for small copies was done in the
2368	! delay at .dcoh4
2369	!
2370.dcoh2:
2371	ble	%ncc, .dcos2
2372	sethi	%hi(hw_copy_limit_2), %o3
2373	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2374	tst	%o3
2375	bz,pn	%icc, .dcos2
2376	subcc	%o3, %o2, %o3
2377	bge,pt	%ncc, .dcos2
2378	nop
2379	!
2380	! HW is on and we're big enough. Do it.
2381	!
2382	ba,pt	%ncc, .big_copyout
2383	nop
2384.dcos2:
2385	add	%o0, %o2, %o0
2386	add	%o1, %o2, %o1
2387	sub	%g0, %o2, %o3
2388	ba,pt	%ncc, .dodtbc
2389	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
.small_copyout:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big_copyout that will cause us to forego the HW assisted copies
	! and bounce back to a non-HW assisted copy. This dispatches those
	! copies. Note that we branch around this in the main line code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3		! combined alignment of src and dst
	btst	1, %o3
	bnz	%icc, .dcobcp		! Most likely: byte aligned
	btst	7, %o3			! delay: cond codes for 8-byte test
	bz	%icc, .dcos8		! 8-byte aligned
	btst	3, %o3			! delay: cond codes for 4-byte test
	bz	%icc, .dcos4		! 4-byte aligned
	nop
	ba,pt	%ncc, .dcos2		! must be 2-byte aligned
	nop
	!
	! Eight byte copy loop. %o2 is the number of 8 byte chunks,
	! %o3 the (negative) offset from the end of both buffers.
	!
	.align 32
.dodebc:
	ldx	[%o0 + %o3], %o4		! load 8 bytes from kernel
	deccc	%o2				! one fewer chunk
	stxa	%o4, [%o1 + %o3]ASI_USER	! store 8 bytes to user
	bg,pt	%ncc, .dodebc			! chunks remain?
	addcc	%o3, 8, %o3			! delay: bump offset, set Z
	!
	! End of copy loop. Check to see if we're done. Most
	! eight byte aligned copies end here: offset reached zero
	! (Z set by the addcc above) means no trailing bytes.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left - do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
2429	!
2430	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2431	!
2432	.align 32
2433.dodfbc:
2434	lduw	[%o0 + %o3], %o4
2435	deccc	%o2
2436	sta	%o4, [%o1 + %o3]ASI_USER
2437	bg,pt	%ncc, .dodfbc
2438	addcc	%o3, 4, %o3
2439	!
2440	! End of copy loop. Check to see if we're done. Most
2441	! four byte aligned copies end here.
2442	!
2443	bz,pt	%ncc, .dcofh
2444	nop
2445	!
2446	! Something is left. Do it byte for byte.
2447	!
2448	ba,pt	%ncc, .dcocl
2449	ldub	[%o0 + %o3], %o4	! load next byte
2450	!
2451	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2452	! copy.
2453	!
2454	.align 32
2455.dodtbc:
2456	lduh	[%o0 + %o3], %o4
2457	deccc	%o2
2458	stha	%o4, [%o1 + %o3]ASI_USER
2459	bg,pt	%ncc, .dodtbc
2460	addcc	%o3, 2, %o3
2461	!
2462	! End of copy loop. Anything left?
2463	!
2464	bz,pt	%ncc, .dcofh
2465	nop
2466	!
2467	! Deal with the last byte
2468	!
2469	ldub	[%o0 + %o3], %o4
2470	stba	%o4, [%o1 + %o3]ASI_USER
2471.dcofh:
2472	membar	#Sync
2473	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2474	retl
2475	clr	%o0
2476
.big_copyout:
	!
	! HW (VIS block copy) path. Are we using the FP registers?
	!
	rd	%fprs, %o3			! check for unused fp
	btst	FPRS_FEF, %o3			! FEF set => fpregs live
	bnz	%icc, .copyout_fpregs_inuse
	nop
	!
	! We're going to go off and do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	!
	stn	%o4, [THREAD_REG + T_LOFAULT]	! install alternate lofault
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	!
	! %o3 is now %i3. Save original %fprs.
	!
	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
	ba,pt	%ncc, .do_block_copyout		! Not in use. Go off and do it.
	wr	%g0, FPRS_FEF, %fprs		! delay: enable fp (set FEF)
	!
.copyout_fpregs_inuse:
	!
	! We're here if the FP regs are in use. Need to see if the request
	! exceeds our suddenly larger minimum (raised by 64*4 = 256 bytes,
	! presumably to cover the cost of saving/restoring the four blocks
	! of fpregs below - TODO confirm).
	!
	cmp	%i2, VIS_COPY_THRESHOLD+(64*4)	! still worth the fp save?
	bl	%ncc, .small_copyout		! no - non-HW dispatch
	  nop
	!
	! We're going to go off and do a block copy.
	! Change to the heavy duty fault handler and grab a window first.
	!
	stn	%o4, [THREAD_REG + T_LOFAULT]
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! original %fprs
	!
	! save in-use fpregs on stack: carve a 64-byte-aligned area in
	! the frame and dump all four 16-double blocks with block stores.
	!
	wr	%g0, FPRS_FEF, %fprs		! enable fp
	membar	#Sync
	add	%fp, STACK_BIAS - 257, %o2	! down past 256-byte save area
	and	%o2, -64, %o2			! round down to 64-byte block
	stda	%d0, [%o2]ASI_BLK_P
	add	%o2, 64, %o2
	stda	%d16, [%o2]ASI_BLK_P
	add	%o2, 64, %o2
	stda	%d32, [%o2]ASI_BLK_P
	add	%o2, 64, %o2
	stda	%d48, [%o2]ASI_BLK_P
	membar	#Sync				! block stores complete
2530
.do_block_copyout:
	membar	#StoreStore|#StoreLoad|#LoadStore

	rd	%gsr, %o2
	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr

	! Set the lower bit in the saved t_lofault to indicate
	! that we need to clear the %fprs register on the way
	! out
	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences.
	! After the swap: %i0 = user dst, %i1 = kernel src.
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

!!! This code is nearly identical to the version in the sun4u
!!! libc_psr.  Most bugfixes made to that file should be
!!! merged into this routine.

	! Align the destination on a double-word (8 byte) boundary:
	! %o3 = 8 - (dst & 7) bytes to copy byte-for-byte first.
	andcc	%i0, 7, %o3
	bz	%ncc, copyout_blkcpy		! already 8-byte aligned
	sub	%o3, 8, %o3			! delay: (dst & 7) - 8
	neg	%o3				! %o3 = bytes to alignment
	sub	%i2, %o3, %i2			! take them out of the count

	! Align Destination on double-word boundary

2:	ldub	[%i1], %o4
	inc	%i1
	stba	%o4, [%i0]ASI_USER		! byte store to user space
	deccc	%o3
	bgu	%ncc, 2b
	  inc	%i0				! delay: bump dst
copyout_blkcpy:
	! Destination is now 8-byte aligned. Get it 64-byte (block)
	! aligned: %i3 = 64 - (dst & 63) bytes to copy as doubles.
	andcc	%i0, 63, %i3
	bz,pn	%ncc, copyout_blalign	! now block aligned
	sub	%i3, 64, %i3		! delay: (dst & 63) - 64
	neg	%i3			! bytes till block aligned
	sub	%i2, %i3, %i2		! update %i2 with new count

	! Copy %i3 bytes till dst is block (64 byte) aligned. use
	! double word copies, merging arbitrarily-aligned source
	! doubles with faligndata (alignaddr primes %gsr and gives
	! us the aligned source address in %g1).

	alignaddr %i1, %g0, %g1
	ldd	[%g1], %d0		! prime the pipeline
	add	%g1, 8, %g1
6:
	! Loop unrolled x2 so %d0/%d2 alternate as old/new source.
	ldd	[%g1], %d2
	add	%g1, 8, %g1
	subcc	%i3, 8, %i3
	faligndata %d0, %d2, %d8	! merge across the alignment
	stda	 %d8, [%i0]ASI_USER
	add	%i1, 8, %i1
	bz,pn	%ncc, copyout_blalign
	add	%i0, 8, %i0		! delay: bump dst
	ldd	[%g1], %d0
	add	%g1, 8, %g1
	subcc	%i3, 8, %i3
	faligndata %d2, %d0, %d8
	stda	 %d8, [%i0]ASI_USER
	add	%i1, 8, %i1
	bgu,pn	%ncc, 6b
	add	%i0, 8, %i0		! delay: bump dst
2596
copyout_blalign:
	membar	#StoreLoad
	! Destination is now 64-byte aligned. Split the remaining
	! length into three pieces:
	! %i2 = total length
	! %i3 = blocks	(length - 64) / 64, in bytes
	! %i4 = doubles remaining  (length - blocks), in bytes,
	!       rounded to doubles and less 16 (two doubles held back,
	!       handled with the tail)
	! %i2 = leftover tail bytes after blocks and doubles
	sub	%i2, 64, %i3
	andn	%i3, 63, %i3		! whole 64-byte blocks (bytes)
	sub	%i2, %i3, %i4
	andn	%i4, 7, %i4		! whole doubles (bytes)
	sub	%i4, 16, %i4		! hold back two doubles
	sub	%i2, %i4, %i2
	sub	%i2, %i3, %i2		! %i2 = tail byte count

	andn	%i1, 0x3f, %l7		! blk aligned source address
	alignaddr %i1, %g0, %g0		! gen %gsr for faligndata

	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
	andcc	%l5, 7, %i5		! %i5 = which 8-byte slot (0-7) of
					! a 64-byte block the source starts in
	add	%i1, %i4, %i1		! advance src past the doubles...
	add	%i1, %i3, %i1		! ...and the blocks

	! Prime the 3-deep software pipeline: preload three source
	! blocks into %d0, %d16 and %d32.
	ldda	[%l7]ASI_BLK_P, %d0
	add	%l7, 64, %l7
	ldda	[%l7]ASI_BLK_P, %d16
	add	%l7, 64, %l7
	ldda	[%l7]ASI_BLK_P, %d32
	add	%l7, 64, %l7
	sub	%i3, 128, %i3		! two blocks are now in flight
2625
2626	! switch statement to get us to the right 8 byte blk within a
2627	! 64 byte block
2628
2629	cmp	 %i5, 4
2630	bgeu,a	 copyout_hlf
2631	cmp	 %i5, 6
2632	cmp	 %i5, 2
2633	bgeu,a	 copyout_sqtr
2634	nop
2635	cmp	 %i5, 1
2636	be,a	 copyout_seg1
2637	nop
2638	ba,pt	 %ncc, copyout_seg0
2639	nop
2640copyout_sqtr:
2641	be,a	 copyout_seg2
2642	nop
2643	ba,pt	 %ncc, copyout_seg3
2644	nop
2645
2646copyout_hlf:
2647	bgeu,a	 copyout_fqtr
2648	nop
2649	cmp	 %i5, 5
2650	be,a	 copyout_seg5
2651	nop
2652	ba,pt	 %ncc, copyout_seg4
2653	nop
2654copyout_fqtr:
2655	be,a	 copyout_seg6
2656	nop
2657	ba,pt	 %ncc, copyout_seg7
2658	nop
2659
copyout_seg0:
	! Block-copy pipeline for a source starting in slot 0.  Three
	! source blocks circulate through %d0/%d16/%d32 while FALIGN_D*
	! (macros - presumably faligndata chains assembling the aligned
	! output into %d48..%d62; confirm against their definition
	! earlier in this file) produce each 64-byte output block.
	!
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D0
	ldda	[%l7]ASI_BLK_P, %d0		! refill the oldest block
	stda	%d48, [%i0]ASI_BLK_AIUS		! block store to user
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f			! blocks exhausted: drain
	add	%i0, 64, %i0			! delay: bump dst
	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D16
	ldda	[%l7]ASI_BLK_P, %d16
	stda	%d48, [%i0]ASI_BLK_AIUS
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f			! blocks exhausted: drain
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D32
	ldda	[%l7]ASI_BLK_P, %d32
	stda	%d48, [%i0]ASI_BLK_AIUS
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f			! blocks exhausted: drain
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyout_seg0		! keep the pipeline rolling

	! Drain tails: two preloaded blocks remain in flight; flush
	! them, then fall into the doubles/tail code (copyout_blkd*,
	! defined later) for the remainder.
0:
	FALIGN_D16
	stda	%d48, [%i0]ASI_BLK_AIUS
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D32
	stda	%d48, [%i0]ASI_BLK_AIUS
	ba,pt	%ncc, copyout_blkd0
	add	%i0, 64, %i0

1:
	FALIGN_D32
	stda	%d48, [%i0]ASI_BLK_AIUS
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D0
	stda	%d48, [%i0]ASI_BLK_AIUS
	ba,pt	%ncc, copyout_blkd16
	add	%i0, 64, %i0

2:
	FALIGN_D0
	stda	%d48, [%i0]ASI_BLK_AIUS
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D16
	stda	%d48, [%i0]ASI_BLK_AIUS
	ba,pt	%ncc, copyout_blkd32
	add	%i0, 64, %i0
2716
2717copyout_seg1:
2718	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2719	FALIGN_D2
2720	ldda	[%l7]ASI_BLK_P, %d0
2721	stda	%d48, [%i0]ASI_BLK_AIUS
2722	add	%l7, 64, %l7
2723	subcc	%i3, 64, %i3
2724	bz,pn	%ncc, 0f
2725	add	%i0, 64, %i0
2726	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2727	FALIGN_D18
2728	ldda	[%l7]ASI_BLK_P, %d16