1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/param.h>
27#include <sys/errno.h>
28#include <sys/asm_linkage.h>
29#include <sys/vtrace.h>
30#include <sys/machthread.h>
31#include <sys/clock.h>
32#include <sys/asi.h>
33#include <sys/fsr.h>
34#include <sys/privregs.h>
35
36#include "assym.h"
37
38/*
39 * Pseudo-code to aid in understanding the control flow of the
40 * bcopy/copyin/copyout routines.
41 *
42 * On entry:
43 *
44 * 	! Determine whether to use the FP register version
45 * 	! or the leaf routine version depending on size
46 * 	! of copy and flags.  Set up error handling accordingly.
47 *	! The transition point depends on whether the src and
48 * 	! dst addresses can be aligned to long word, word,
49 * 	! half word, or byte boundaries.
50 *	!
51 *	! WARNING: <Register usage convention>
52 *	! For FP version, %l6 holds previous error handling and
53 *	! a flag: TRAMP_FLAG (low bits)
54 *	! for leaf routine version, %o4 holds those values.
55 *	! So either %l6 or %o4 is reserved and not available for
56 *	! any other use.
57 *
58 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
59 * 		go to small_copy;		! to speed short copies
60 *
61 * 	! src, dst long word alignable
62 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
63 * 			go to small_copy;
64 *		if (length <= hw_copy_limit_8)
65 * 			go to small_copy;
66 * 		go to FPBLK_copy;
67 * 	}
68 * 	if (src,dst not alignable) {
69 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
70 * 			go to small_copy;
71 *		if (length <= hw_copy_limit_1)
72 * 			go to small_copy;
73 * 		go to FPBLK_copy;
74 * 	}
75 * 	if (src,dst halfword alignable) {
76 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
77 * 			go to small_copy;
78 *		if (length <= hw_copy_limit_2)
79 * 			go to small_copy;
80 * 		go to FPBLK_copy;
81 * 	}
82 * 	if (src,dst word alignable) {
83 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
84 * 			go to small_copy;
85 *		if (length <= hw_copy_limit_4)
86 * 			go to small_copy;
87 * 		go to FPBLK_copy;
88 * 	}
89 *
90 * small_copy:
91 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
92 *
93 *	if (count <= 3)				! fast path for tiny copies
94 *		go to sm_left;			! special finish up code
95 *	else
96 *		if (count > CHKSIZE)		! medium sized copies
97 *			go to sm_med		! tuned by alignment
98 *		if(src&dst not both word aligned) {
99 *	sm_movebytes:
100 *			move byte by byte in 4-way unrolled loop
101 *			fall into sm_left;
102 *	sm_left:
103 *			move 0-3 bytes byte at a time as needed.
104 *			restore error handler and exit.
105 *
106 * 		} else {	! src&dst are word aligned
107 *			check for at least 8 bytes left,
108 *			move word at a time, unrolled by 2
109 *			when fewer than 8 bytes left,
110 *	sm_half:	move half word at a time while 2 or more bytes left
111 *	sm_byte:	move final byte if necessary
112 *	sm_exit:
113 *			restore error handler and exit.
114 *		}
115 *
116 * ! Medium length cases with at least CHKSIZE bytes available
117 * ! method: line up src and dst as best possible, then
118 * ! move data in 4-way unrolled loops.
119 *
120 * sm_med:
121 *	if(src&dst unalignable)
122 * 		go to sm_movebytes
123 *	if(src&dst halfword alignable)
124 *		go to sm_movehalf
125 *	if(src&dst word alignable)
126 *		go to sm_moveword
127 * ! fall into long word movement
128 *	move bytes until src is word aligned
129 *	if not long word aligned, move a word
130 *	move long words in 4-way unrolled loop until < 32 bytes left
131 *      move long words in 1-way unrolled loop until < 8 bytes left
132 *	if zero bytes left, goto sm_exit
133 *	if one byte left, go to sm_byte
134 *	else go to sm_half
135 *
136 * sm_moveword:
137 *	move bytes until src is word aligned
138 *	move words in 4-way unrolled loop until < 16 bytes left
139 *      move words in 1-way unrolled loop until < 4 bytes left
140 *	if zero bytes left, goto sm_exit
141 *	if one byte left, go to sm_byte
142 *	else go to sm_half
143 *
144 * sm_movehalf:
145 *	move a byte if needed to align src on halfword
146 *	move halfwords in 4-way unrolled loop until < 8 bytes left
147 *	if zero bytes left, goto sm_exit
148 *	if one byte left, go to sm_byte
149 *	else go to sm_half
150 *
151 *
152 * FPBLK_copy:
153 * 	%l6 = curthread->t_lofault;
154 * 	if (%l6 != NULL) {
155 * 		membar #Sync
156 * 		curthread->t_lofault = .copyerr;
157 * 		caller_error_handler = TRUE             ! %l6 |= 2
158 * 	}
159 *
160 *	! for FPU testing we must not migrate cpus
161 * 	if (curthread->t_lwp == NULL) {
162 *		! Kernel threads do not have pcb's in which to store
163 *		! the floating point state, so disallow preemption during
164 *		! the copy.  This also prevents cpu migration.
165 * 		kpreempt_disable(curthread);
166 *	} else {
167 *		thread_nomigrate();
168 *	}
169 *
170 * 	old_fprs = %fprs;
171 * 	old_gsr = %gsr;
172 * 	if (%fprs.fef) {
173 * 		%fprs.fef = 1;
174 * 		save current fpregs on stack using blockstore
175 * 	} else {
176 * 		%fprs.fef = 1;
177 * 	}
178 *
179 *
180 * 	do_blockcopy_here;
181 *
182 * In lofault handler:
183 *	curthread->t_lofault = .copyerr2;
184 *	Continue on with the normal exit handler
185 *
186 * On normal exit:
187 * 	%gsr = old_gsr;
188 * 	if (old_fprs & FPRS_FEF)
189 * 		restore fpregs from stack using blockload
190 *	else
191 *		zero fpregs
192 * 	%fprs = old_fprs;
193 * 	membar #Sync
194 * 	curthread->t_lofault = (%l6 & ~3);
195 *	! following test omitted from copyin/copyout as they
196 *	! will always have a current thread
197 * 	if (curthread->t_lwp == NULL)
198 *		kpreempt_enable(curthread);
199 *	else
200 *		thread_allowmigrate();
201 * 	return (0)
202 *
203 * In second lofault handler (.copyerr2):
204 *	We've tried to restore fp state from the stack and failed.  To
205 *	prevent from returning with a corrupted fp state, we will panic.
206 */
207
208/*
209 * Comments about optimization choices
210 *
211 * The initial optimization decision in this code is to determine
212 * whether to use the FP registers for a copy or not.  If we don't
213 * use the FP registers, we can execute the copy as a leaf routine,
214 * saving a register save and restore.  Also, less elaborate setup
215 * is required, allowing short copies to be completed more quickly.
216 * For longer copies, especially unaligned ones (where the src and
217 * dst do not align to allow simple ldx,stx operation), the FP
218 * registers allow much faster copy operations.
219 *
220 * The estimated extra cost of the FP path will vary depending on
221 * src/dst alignment, dst offset from the next 64 byte FPblock store
222 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
224 * minor issues.  The average additional overhead is estimated to be
225 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
227 * longer copies and only benefit a small portion of medium sized
228 * copies.  Rather than incur such cost, we chose fixed transition
229 * points for each of the alignment choices.
230 *
231 * For the inner loop, here is a comparison of the per cache line
232 * costs for each alignment when src&dst are in cache:
233 *
234 * byte aligned:  108 clocks slower for non-FPBLK
235 * half aligned:   44 clocks slower for non-FPBLK
236 * word aligned:   12 clocks slower for non-FPBLK
237 * long aligned:    4 clocks >>faster<< for non-FPBLK
238 *
239 * The long aligned loop runs faster because it does no prefetching.
240 * That wins if the data is not in cache or there is too little
241 * data to gain much benefit from prefetching.  But when there
242 * is more data and that data is not in cache, failing to prefetch
243 * can run much slower.  In addition, there is a 2 Kbyte store queue
244 * which will cause the non-FPBLK inner loop to slow for larger copies.
245 * The exact tradeoff is strongly load and application dependent, with
246 * increasing risk of a customer visible performance regression if the
247 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
248 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
249 * upper limit for the non-FPBLK code.  To minimize performance regression
250 * risk while still gaining the primary benefits of the improvements to
251 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
252 * hw_copy_limit_*.  Later experimental studies using different values
253 * of hw_copy_limit_* can be used to make further adjustments if
254 * appropriate.
255 *
256 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
257 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
258 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
259 * hw_copy_limit_8 = src and dst are longword aligned
260 *
261 * To say that src and dst are word aligned means that after
262 * some initial alignment activity of moving 0 to 3 bytes,
263 * both the src and dst will be on word boundaries so that
264 * word loads and stores may be used.
265 *
266 * Default values at May,2005 are:
267 * hw_copy_limit_1 =  256
268 * hw_copy_limit_2 =  512
269 * hw_copy_limit_4 = 1024
270 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
271 *
272 *
273 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
274 * disabled for that alignment choice.
275 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
276 * the value of VIS_COPY_THRESHOLD is used.
277 * It is not envisioned that hw_copy_limit_? will be changed in the field
278 * It is provided to allow for disabling FPBLK copies and to allow
279 * easy testing of alternate values on future HW implementations
280 * that might have different cache sizes, clock rates or instruction
281 * timing rules.
282 *
283 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
284 * threshold to speedup all shorter copies (less than 256).  That
285 * saves an alignment test, memory reference, and enabling test
286 * for all short copies, or an estimated 24 clocks.
287 *
288 * The order in which these limits are checked does matter since each
289 * non-predicted tst and branch costs around 10 clocks.
290 * If src and dst are randomly selected addresses,
291 * 4 of 8 will not be alignable.
292 * 2 of 8 will be half word alignable.
293 * 1 of 8 will be word alignable.
294 * 1 of 8 will be long word alignable.
295 * But, tests on running kernels show that src and dst to copy code
296 * are typically not on random alignments.  Structure copies and
297 * copies of larger data sizes are often on long word boundaries.
298 * So we test the long word alignment case first, then
299 * the byte alignment, then halfword, then word alignment.
300 *
301 * Several times, tests for length are made to split the code
302 * into subcases.  These tests often allow later tests to be
303 * avoided.  For example, within the non-FPBLK copy, we first
304 * check for tiny copies of 3 bytes or less.  That allows us
305 * to use a 4-way unrolled loop for the general byte copy case
306 * without a test on loop entry.
307 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
308 * vs longer cases.  For the really short case, we don't attempt
309 * align src and dst.  We try to minimize special case tests in
310 * the shortest loops as each test adds a significant percentage
311 * to the total time.
312 *
313 * For the medium sized cases, we allow ourselves to adjust the
314 * src and dst alignment and provide special cases for each of
315 * the four adjusted alignment cases. The CHKSIZE that was used
316 * to decide between short and medium size was chosen to be 39
317 * as that allows for the worst case of 7 bytes of alignment
318 * shift and 4 times 8 bytes for the first long word unrolling.
319 * That knowledge saves an initial test for length on entry into
320 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
322 *
323 * For all cases in the non-FPBLK code where it is known that at
324 * least 4 chunks of data are available for movement, the
325 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
326 * or 2 clocks per data element.
327 *
328 * Instruction alignment is forced by used of .align 16 directives
329 * and nops which are not executed in the code.  This
330 * combination of operations shifts the alignment of following
331 * loops to insure that loops are aligned so that their instructions
332 * fall within the minimum number of 4 instruction fetch groups.
333 * If instructions are inserted or removed between the .align
334 * instruction and the unrolled loops, then the alignment needs
335 * to be readjusted.  Misaligned loops can add a clock per loop
336 * iteration to the loop timing.
337 *
338 * In a few cases, code is duplicated to avoid a branch.  Since
339 * a non-predicted tst and branch takes 10 clocks, this savings
340 * is judged an appropriate time-space tradeoff.
341 *
342 * Within the FPBLK-code, the prefetch method in the inner
343 * loop needs to be explained as it is not standard.  Two
344 * prefetches are issued for each cache line instead of one.
345 * The primary one is at the maximum reach of 8 cache lines.
346 * Most of the time, that maximum prefetch reach gives the
347 * cache line more time to reach the processor for systems with
348 * higher processor clocks.  But, sometimes memory interference
349 * can cause that prefetch to be dropped.  Putting a second
350 * prefetch at a reach of 5 cache lines catches the drops
351 * three iterations later and shows a measured improvement
352 * in performance over any similar loop with a single prefetch.
353 * The prefetches are placed in the loop so they overlap with
354 * non-memory instructions, so that there is no extra cost
355 * when the data is already in-cache.
356 *
357 */
358
359/*
360 * Notes on preserving existing fp state and on membars.
361 *
362 * When a copyOP decides to use fp we may have to preserve existing
363 * floating point state.  It is not the caller's state that we need to
364 * preserve - the rest of the kernel does not use fp and, anyway, fp
365 * registers are volatile across a call.  Some examples:
366 *
367 *	- userland has fp state and is interrupted (device interrupt
368 *	  or trap) and within the interrupt/trap handling we use
369 *	  bcopy()
370 *	- another (higher level) interrupt or trap handler uses bcopy
371 *	  while a bcopy from an earlier interrupt is still active
372 *	- an asynchronous error trap occurs while fp state exists (in
373 *	  userland or in kernel copy) and the tl0 component of the handling
374 *	  uses bcopy
375 *	- a user process with fp state incurs a copy-on-write fault and
376 *	  hwblkpagecopy always uses fp
377 *
378 * We therefore need a per-call place in which to preserve fp state -
379 * using our stack is ideal (and since fp copy cannot be leaf optimized
380 * because of calls it makes, this is no hardship).
381 *
 * When we have finished fp copy (with its repeated block stores)
383 * we must membar #Sync so that our block stores may complete before
384 * we either restore the original fp state into the fp registers or
385 * return to a caller which may initiate other fp operations that could
386 * modify the fp regs we used before the block stores complete.
387 *
388 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
389 * t_lofault is not NULL will not panic but will instead trampoline
390 * to the registered lofault handler.  There is no need for any
391 * membars for these - eg, our store to t_lofault will always be visible to
392 * ourselves and it is our cpu which will take any trap.
393 *
394 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
395 * while t_lofault is not NULL will also not panic.  Since we're copying
396 * to or from userland the extent of the damage is known - the destination
397 * buffer is incomplete.  So trap handlers will trampoline to the lofault
398 * handler in this case which should take some form of error action to
399 * avoid using the incomplete buffer.  The trap handler also flags the
400 * fault so that later return-from-trap handling (for the trap that brought
401 * this thread into the kernel in the first place) can notify the process
402 * and reboot the system (or restart the service with Greenline/Contracts).
403 *
404 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
405 * result in deferred error traps - the trap is taken sometime after
406 * the event and the trap PC may not be the PC of the faulting access.
407 * Delivery of such pending traps can be forced by a membar #Sync, acting
408 * as an "error barrier" in this role.  To accurately apply the user/kernel
409 * separation described in the preceding paragraph we must force delivery
410 * of deferred traps affecting kernel state before we install a lofault
411 * handler (if we interpose a new lofault handler on an existing one there
412 * is no need to repeat this), and we must force delivery of deferred
413 * errors affecting the lofault-protected region before we clear t_lofault.
414 * Failure to do so results in lost kernel state being interpreted as
415 * affecting a copyin/copyout only, or of an error that really only
416 * affects copy data being interpreted as losing kernel state.
417 *
418 * Since the copy operations may preserve and later restore floating
419 * point state that does not belong to the caller (see examples above),
420 * we must be careful in how we do this in order to prevent corruption
421 * of another program.
422 *
423 * To make sure that floating point state is always saved and restored
424 * correctly, the following "big rules" must be followed when the floating
425 * point registers will be used:
426 *
427 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
428 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
429 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
430 *    lofault handler was set coming in.
431 *
432 * 2. The FPUSED flag indicates that all FP state has been successfully stored
433 *    on the stack.  It should not be set until this save has been completed.
434 *
435 * 3. The FPUSED flag should not be cleared on exit until all FP state has
436 *    been restored from the stack.  If an error occurs while restoring
437 *    data from the stack, the error handler can check this flag to see if
438 *    a restore is necessary.
439 *
440 * 4. Code run under the new lofault handler must be kept to a minimum.  In
441 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
442 *    to kpreempt(), should not be made until after the lofault handler has
443 *    been restored.
444 */
445
446/*
447 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
448 * to "break even" using FP/VIS-accelerated memory operations.
449 * The FPBLK code assumes a minimum number of bytes are available
450 * to be moved on entry.  Check that code carefully before
451 * reducing VIS_COPY_THRESHOLD below 256.
452 */
453/*
454 * This shadows sys/machsystm.h which can't be included due to the lack of
455 * _ASM guards in include files it references. Change it here, change it there.
456 */
457#define VIS_COPY_THRESHOLD 256
458
459/*
460 * TEST for very short copies
461 * Be aware that the maximum unroll for the short unaligned case
462 * is SHORTCOPY+1
463 */
464#define SHORTCOPY 3
465#define CHKSIZE  39
466
467/*
468 * Indicates that we're to trampoline to the error handler.
469 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
470 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
471 */
472#define	FPUSED_FLAG	1
473#define	TRAMP_FLAG	2
474#define	MASK_FLAGS	3
475
476/*
477 * Number of outstanding prefetches.
478 * first prefetch moves data from L2 to L1 (n_reads)
479 * second prefetch moves data from memory to L2 (one_read)
480 */
481#define	OLYMPUS_C_PREFETCH	24
482#define	OLYMPUS_C_2ND_PREFETCH	12
483
484#define	VIS_BLOCKSIZE		64
485
486/*
 * Size of stack frame in order to accommodate a 64-byte aligned
488 * floating-point register save area and 2 64-bit temp locations.
489 * All copy functions use two quadrants of fp registers; to assure a
490 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
492 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
493 *
494 *    _______________________________________ <-- %fp + STACK_BIAS
495 *    | We may need to preserve 2 quadrants |
496 *    | of fp regs, but since we do so with |
497 *    | BST/BLD we need room in which to    |
498 *    | align to VIS_BLOCKSIZE bytes.  So   |
499 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
500 *    |-------------------------------------|
501 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
502 *    |-------------------------------------|
503 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
504 *    ---------------------------------------
505 */
506#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
507#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
508#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
509#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
510#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
511
512/*
513 * Common macros used by the various versions of the block copy
514 * routines in this file.
515 */
516
517/*
518 * In FP copies if we do not have preserved data to restore over
519 * the fp regs we used then we must zero those regs to avoid
520 * exposing portions of the data to later threads (data security).
521 *
522 * Copy functions use either quadrants 1 and 3 or 2 and 4.
523 *
524 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
525 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
526 *
527 * The instructions below are quicker than repeated fzero instructions
528 * since they can dispatch down two fp pipelines.
529 */
/*
 * Zero quadrants 1 and 3 (%f0-%f15 and %f32-%f47): zero %f0 once with
 * fzero, then fan the zeroed register out with fmovd so the moves can
 * dispatch down both fp pipelines (see comment above).
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46
547
/*
 * Zero quadrants 2 and 4 (%f16-%f31 and %f48-%f63): zero %f16 once with
 * fzero, then fan the zeroed register out with fmovd (dual fp pipelines,
 * see comment above).
 *
 * The fmovd source must be %f16 — the register just zeroed.  Functions
 * using quadrants 2 and 4 leave quadrant 1 (%f0-%f15) untouched, so %f0
 * is NOT zero here; copying from %f0 would spread stale register
 * contents instead of zeroes, defeating the data-security purpose of
 * this macro.
 */
#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62
565
566/*
567 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
568 * Used to save and restore in-use fp registers when we want to use FP
569 * and find fp already in use and copy size still large enough to justify
570 * the additional overhead of this save and restore.
571 *
572 * A membar #Sync is needed before save to sync fp ops initiated before
573 * the call to the copy function (by whoever has fp in use); for example
574 * an earlier block load to the quadrant we are about to save may still be
575 * "in flight".  A membar #Sync is required at the end of the save to
576 * sync our block store (the copy code is about to begin ldd's to the
577 * first quadrant).
578 *
579 * Similarly: a membar #Sync before restore allows the block stores of
580 * the copy operation to complete before we fill the quadrants with their
581 * original data, and a membar #Sync after restore lets the block loads
582 * of the restore complete before we return to whoever has the fp regs
583 * in use.  To avoid repeated membar #Sync we make it the responsibility
584 * of the copy code to membar #Sync immediately after copy is complete
585 * and before using the BLD_*_FROMSTACK macro.
586 */
/*
 * Block-store quadrants 1 and 3 (%f0-%f15, %f32-%f47) to the
 * VIS_BLOCKSIZE-aligned save area in the stack frame.  The trailing
 * membar #Sync lets the block stores complete before the copy code
 * begins loading into these registers.  Clobbers tmp1.
 */
#define BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync
595
/*
 * Block-load quadrants 1 and 3 (%f0-%f15, %f32-%f47) back from the
 * stack save area.  The copy code is responsible for the membar #Sync
 * after copy completion (see comment above).  Clobbers tmp1.
 */
#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync
604
/*
 * Block-store quadrants 2 and 4 (%f16-%f31, %f48-%f63) to the
 * VIS_BLOCKSIZE-aligned save area in the stack frame.  The trailing
 * membar #Sync lets the block stores complete before the copy code
 * begins loading into these registers.  Clobbers tmp1.
 */
#define BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync
613
/*
 * Block-load quadrants 2 and 4 (%f16-%f31, %f48-%f63) back from the
 * stack save area.  The copy code is responsible for the membar #Sync
 * after copy completion (see comment above).  Clobbers tmp1.
 */
#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
622
623/*
624 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
625 * prevent preemption if there is no t_lwp to save FP state to on context
626 * switch) before commencing a FP copy, and reallow it on completion or
627 * in error trampoline paths when we were using FP copy.
628 *
629 * Both macros may call other functions, so be aware that all outputs are
630 * forfeit after using these macros.  For this reason we do not pass registers
631 * to use - we just use any outputs we want.
632 *
633 * Pseudo code:
634 *
635 * FP_NOMIGRATE:
636 *
637 * if (curthread->t_lwp) {
638 *	thread_nomigrate();
639 * } else {
640 *	kpreempt_disable();
641 * }
642 *
643 * FP_ALLOWMIGRATE:
644 *
645 * if (curthread->t_lwp) {
646 *	thread_allowmigrate();
647 * } else {
648 *	kpreempt_enable();
649 * }
650 */
651
/*
 * If the thread has an lwp, call thread_nomigrate(); otherwise disable
 * preemption inline by incrementing t_preempt (kpreempt_disable
 * equivalent).  See pseudo code above.  Clobbers %o0 and %o1 (and, via
 * the call, all outputs — see warning above).
 */
#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:
664
/*
 * Inverse of FP_NOMIGRATE.  If the thread has an lwp, call
 * thread_allowmigrate(); otherwise decrement t_preempt and, if that
 * re-enables preemption and a kernel preemption is pending
 * (cpu_kprunrun set), call kpreempt() at the current %pil.
 * Clobbers %o0 and %o1 (and, via the calls, all outputs).
 */
#define	FP_ALLOWMIGRATE(label1, label2)			\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call thread_allowmigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	  nop							;\
	call	kpreempt					;\
	  rdpr	%pil, %o0					;\
label2:
684
685/*
686 * Copy a block of storage, returning an error code if `from' or
687 * `to' takes a kernel pagefault which cannot be resolved.
688 * Returns errno value on pagefault error, 0 if all ok
689 */
690
691	.seg	".text"
692	.align	4
693
	ENTRY(kcopy)
	/*
	 * kcopy(from, to, count): %o0 = source, %o1 = destination,
	 * %o2 = byte count.  Returns 0 on success, or the errno value
	 * (delivered in %g1 by the trap code) on unresolvable pagefault.
	 * Dispatch: small copies take the leaf-routine path (%o4 holds
	 * the saved lofault); large copies take the FP block path
	 * (%l6 holds the saved lofault plus flags).
	 */

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! <= threshold: take small copy path
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				! low 3 bits of xor clear?
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				! lsb clear => halfword alignable
	bz,pt	%ncc, .kcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_2:
	btst	3, %o3				! low 2 bits clear => word alignable
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! protect fp restore (rule 4 above)
	btst	FPUSED_FLAG, %l6		! was fp state saved on stack?
	bz	%ncc, 1f			! no: skip the fp restore
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3			! was fp in use on entry?
	bz,pt	%icc, 4f			! no: just zero the regs we used
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy. kcopy will *always* set a t_lofault handler
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler. As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler. In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f			! trampoline to caller's handler
	  nop
	ret
	  restore	%g1, 0, %o0		! return errno to kcopy's caller

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4			! bcopy-style trampoline needed?
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0			! return errno
3:
	jmp	%o4				! goto real handler
	  mov	%g0, %o0			! %o0 = 0 for the handler

	SET_SIZE(kcopy)
858
859
860/*
861 * Copy a block of storage - must not overlap (from + len <= to).
862 * Registers: l6 - saved t_lofault
863 * (for short copies, o4 - saved t_lofault)
864 *
865 * Copy a page of memory.
866 * Assumes double word alignment and a count >= 256.
867 */
868
	ENTRY(bcopy)

/*
 * bcopy: %o0 = from, %o1 = to, %o2 = count.  Returns 0 in %o0.
 *
 * Strategy selection: counts <= VIS_COPY_THRESHOLD always use the
 * leaf-routine (no register window, no FP) code at .bcopy_small.
 * Larger counts are compared against the hw_copy_limit_N tunable
 * matching the best mutual alignment of from/to (N = 1, 2, 4 or 8,
 * derived from (from ^ to)): a zero limit disables the FP block-copy
 * path, count <= limit still takes the small path, and anything
 * larger falls into .bcopy_more (FP/VIS block copy).
 */
	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! <= threshold: use small copy
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop

	.align	16
/*
 * Leaf-routine copy.  If a lofault handler is already installed,
 * replace it with .sm_copyerr and tag the saved value with TRAMP_FLAG
 * in %o4 so the error path knows to trampoline to the original
 * handler; if t_lofault was zero, leave it untouched.
 */
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	  nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	  cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	  stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	  deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	ba,pt   %ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 2]
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	  stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	  sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  nop
.bc_sm_byte:
	ldub	[%o0], %o3
	ba,pt   %ncc, .bc_sm_exit
	  stb	%o3, [%o1]

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	ldn     [THREAD_REG + T_LOFAULT], %o3
	brz,pt  %o3, .bc_sm_done	! lofault was 0: nothing to restore
	  nop
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	  mov	%g0, %o0		! return 0

	.align 16
/*
 * Medium-length copies (SHORTCOPY < count <= CHKSIZE).  Dispatch on
 * the mutual alignment of src/dst and move data by long words, words
 * or half words -- 32, 16 or 8 bytes per unrolled loop iteration.
 */
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	  nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	  nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	  dec	%o2
.bc_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	  nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	  stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	  nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	  add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	.align 16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	  nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	  dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	  stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	  nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	.align 16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	  nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	  sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	SET_SIZE(bcopy)
1190
1191/*
1192 * The _more entry points are not intended to be used directly by
1193 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
1196 * This entry is particularly important as DTRACE (at least as of
1197 * 4/2004) does not support leaf functions.
1198 */
1199
	ENTRY(bcopy_more)
/*
 * FP (VIS block) copy path for bcopy.  Grabs a register window and,
 * if a lofault handler was present on entry, installs .copyerr and
 * tags the saved handler with TRAMP_FLAG in %l6, then falls into the
 * .do_copy code shared with the kcopy path.
 */
.bcopy_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	  nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
 * Also, use of FP registers has been tested to be enabled
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	! Save the caller's FP state: %fprs always, and FP quadrants 1/3
	! to the stack only if the FPU was actually in use (FPRS_FEF set).
	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	  wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6	! error path must restore FP state

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	  neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	  sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	  stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	  nop
1:	ldub	[REALSRC], SRC		! finish remaining 1-3 align bytes
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	  stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	!
	! Prime the software pipeline: load the first block and merge
	! adjacent doublewords with faligndata, using the GSR alignment
	! set up by alignaddr from the low bits of REALSRC.
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f10, %f12, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	  prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
	! Main pipelined loop: each iteration block-stores one
	! VIS_BLOCKSIZE chunk (%f32-%f46) while loading/realigning the
	! next one into %f0-%f14.
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	  andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	  nop
3:
	! Drain the pipeline: store the final realigned block.
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	  nop
2:
	! REALSRC was 8-byte aligned and exactly one block remains:
	! no realignment needed, so copy the last two blocks with fsrc1.
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	  nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	  nop

5:	ldub	[REALSRC], TMP		! byte copy the sub-block tail
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	  stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	! Restore the caller's FP context saved in .do_copy: %gsr, then
	! either reload quadrants 1/3 from the stack (FPU was live) or
	! zero them (FPU was clean) before restoring %fprs.
	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	  wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync				! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	  restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)
1424
1425/*
1426 * Block copy with possibly overlapped operands.
1427 */
1428
	ENTRY(ovbcopy)
/*
 * ovbcopy: %o0 = from, %o1 = to, %o2 = count.
 * Byte copy tolerating overlapping operands.  If the regions cannot
 * overlap (count <= |from - to|) it tail-branches to bcopy; otherwise
 * it copies one byte at a time, forwards or backwards depending on
 * the relative order of from and to.
 */
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	  subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	  nop
1:
	bneg,a	%ncc, 2f
	  neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	  .empty				!   no overlap
	  cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	  nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	  inc	%o1			! inc to address

	retl				! return
	  nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	  nop

	SET_SIZE(ovbcopy)
1471
1472
1473/*
1474 * hwblkpagecopy()
1475 *
1476 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1477 * has already disabled kernel preemption and has checked
1478 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1479 */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs
	!
	! REALSRC/DST/CNT/SRC below are the %i0/%i1/%i2/%i3 aliases
	! #defined earlier in this file.

	! Save FP quadrants 1/3 only if the FPU was in use (FPRS_FEF).
	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	  wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT		! copy exactly one page
	mov	REALSRC, SRC

	! Prime the pipeline: load the first block into %f0-%f14 and
	! stage it into %f32-%f46.  No faligndata here -- src and dst
	! alignment is assumed (see the block comment above this entry).
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f0, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f2, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f4, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f6, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f8, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f10, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	ba,pt	%ncc, 2f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
	! Main loop: block-store %f32-%f46 while loading the next
	! block into %f0-%f14.
2:
	ldd	[SRC + 0x08], %f2
	fmovd	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fmovd	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fmovd	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fmovd	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fmovd	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fmovd	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fmovd	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fmovd	%f10, %f42
	sub	CNT, VIS_BLOCKSIZE, CNT
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	SRC, VIS_BLOCKSIZE, SRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 2b
	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	! Restore the FP state saved above: reload quadrants 1/3 if the
	! FPU was live, otherwise zero them, then restore %fprs.
	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	  nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	  restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)
1597
1598
1599/*
1600 * Transfer data to and from user space -
1601 * Note that these routines can cause faults
1602 * It is assumed that the kernel has nothing at
1603 * less than KERNELBASE in the virtual address space.
1604 *
1605 * Note that copyin(9F) and copyout(9F) are part of the
1606 * DDI/DKI which specifies that they return '-1' on "errors."
1607 *
1608 * Sigh.
1609 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
1611 * which return the errno that we've faithfully computed.  This
1612 * allows other callers (e.g. uiomove(9F)) to work correctly.
1613 * Given that these are used pretty heavily, we expand the calling
1614 * sequences inline for all flavours (rather than making wrappers).
1615 *
1616 * There are also stub routines for xcopyout_little and xcopyin_little,
1617 * which currently are intended to handle requests of <= 16 bytes from
1618 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1619 * is left as an exercise...
1620 */
1621
1622/*
1623 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1624 *
1625 * General theory of operation:
1626 *
1627 * The only difference between copy{in,out} and
1628 * xcopy{in,out} is in the error handling routine they invoke
1629 * when a memory access error occurs. xcopyOP returns the errno
1630 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1631 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1632 * if they are called with a fault handler already in place. That flag
1633 * causes the default handlers to trampoline to the previous handler
1634 * upon an error.
1635 *
1636 * None of the copyops routines grab a window until it's decided that
1637 * we need to do a HW block copy operation. This saves a window
1638 * spill/fill when we're called during socket ops. The typical IO
1639 * path won't cause spill/fill traps.
1640 *
1641 * This code uses a set of 4 limits for the maximum size that will
1642 * be copied given a particular input/output address alignment.
1643 * If the value for a particular limit is zero, the copy will be performed
1644 * by the plain copy loops rather than FPBLK.
1645 *
1646 * See the description of bcopy above for more details of the
1647 * data copying algorithm and the default limits.
1648 *
1649 */
1650
1651/*
1652 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1653 */
1654
1655/*
1656 * We save the arguments in the following registers in case of a fault:
1657 *	kaddr - %l1
1658 *	uaddr - %l2
1659 *	count - %l3
1660 */
1661#define SAVE_SRC	%l1
1662#define SAVE_DST	%l2
1663#define SAVE_COUNT	%l3
1664
1665#define SM_SAVE_SRC		%g4
1666#define SM_SAVE_DST		%g5
1667#define SM_SAVE_COUNT		%o5
1668#define ERRNO		%l5
1669
1670
1671#define REAL_LOFAULT	%l4
1672/*
1673 * Generic copyio fault handler.  This is the first line of defense when a
1674 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1675 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1676 * This allows us to share common code for all the flavors of the copy
1677 * operations, including the _noerr versions.
1678 *
1679 * Note that this function will restore the original input parameters before
1680 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1681 * member of the t_copyop structure, if needed.
1682 */
	ENTRY(copyio_fault)
	! Entered as the t_lofault vector; the trap code supplies the
	! errno in %g1.  If FPUSED_FLAG is set in %l6, the FP context
	! saved by the copy path (gsr, fprs, FP quadrants 2/4) is
	! restored before handing off to REAL_LOFAULT with the original
	! (SAVE_SRC/SAVE_DST/SAVE_COUNT) arguments reloaded.
	membar	#Sync
	mov	%g1,ERRNO			! save errno in ERRNO
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	  nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr    	! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs   	! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs   	! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	! Reconstruct the original arguments so REAL_LOFAULT can vector
	! to the appropriate t_copyop member if needed.
	mov	SAVE_SRC, %i0
	mov	SAVE_DST, %i1
	jmp	REAL_LOFAULT
	  mov	SAVE_COUNT, %i2

	SET_SIZE(copyio_fault)
1719
1720
1721	ENTRY(copyout)
1722
1723	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
1724	bleu,pt	%ncc, .copyout_small		! go to larger cases
1725	  xor	%o0, %o1, %o3			! are src, dst alignable?
1726	btst	7, %o3				!
1727	bz,pt	%ncc, .copyout_8		! check for longword alignment
1728	  nop
1729	btst	1, %o3				!
1730	bz,pt	%ncc, .copyout_2		! check for half-word
1731	  nop
1732	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
1733	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1734	tst	%o3
1735	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1736	  cmp	%o2, %o3			! if length <= limit
1737	bleu,pt	%ncc, .copyout_small		! go to small copy
1738	  nop
1739	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1740	  nop
1741.copyout_2:
1742	btst	3, %o3				!
1743	bz,pt	%ncc, .copyout_4		! check for word alignment
1744	  nop
1745	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
1746	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1747	tst	%o3
1748	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1749	  cmp	%o2, %o3			! if length <= limit
1750	bleu,pt	%ncc, .copyout_small		! go to small copy
1751	  nop
1752	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1753	  nop
1754.copyout_4:
1755	! already checked longword, must be word aligned
1756	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
1757	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1758	tst	%o3
1759	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1760	  cmp	%o2, %o3			! if length <= limit
1761	bleu,pt	%ncc, .copyout_small		! go to small copy
1762	  nop
1763	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1764	  nop
1765.copyout_8:
1766	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
1767	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1768	tst	%o3
1769	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1770	  cmp	%o2, %o3			! if length <= limit
1771	bleu,pt	%ncc, .copyout_small		! go to small copy
1772	  nop
1773	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1774	  nop
1775
1776	.align	16
1777	nop				! instruction alignment
1778					! see discussion at start of file
1779.copyout_small:
1780	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
1781	or	%o5, %lo(.sm_copyout_err), %o5
1782	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
1783	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
/*
 * Small (leaf-routine) copyout.  On entry:
 *	%o0 = kernel source, %o1 = user destination, %o2 = byte count
 *	%o4 = saved t_lofault, %o5 = small-copy fault handler, already
 *	      installed (.sm_copyout_err, or .sm_xcopyout_err when entered
 *	      from .xcopyout_small)
 * Kernel reads use normal loads; user writes go through ASI_USER stores.
 * The original arguments are kept in SM_SAVE_* so the fault handler can
 * pass them unchanged to a T_COPYOPS vector.
 */
.sm_do_copyout:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .co_sm_left	!
	  mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .co_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
!
! src/dst not mutually word alignable: move 4 bytes per iteration
!
.co_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.co_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stba	%o3, [%o1]ASI_USER	! write byte
	inc	%o1			! advance DST by 1
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 2], %o3
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 1], %o3
	stba	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
	  inc	%o1			! advance DST by 1
	add	%o2, 3, %o2		! restore count
!
! at most 3 bytes remain here; store them then exit
!
.co_sm_left:
	tst	%o2
	bz,pt	%ncc, .co_sm_exit	! check for zero length
	  nop
	ldub	[%o0], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3,[%o1]ASI_USER	! store one byte
	ldub	[%o0 + 1], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3,[%o1]ASI_USER	! store second byte
	ldub	[%o0 + 2], %o3		! load third byte
	inc	%o1
	stba	%o3,[%o1]ASI_USER	! store third byte
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0
	.align	16
!
! word-aligned mover: two words (8 bytes) per iteration
!
.co_sm_words:
	lduw	[%o0], %o3		! read word
.co_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 4, %o1		! update DST
	stwa	%o3, [%o1]ASI_USER	! write word
	bgt,pt	%ncc, .co_sm_words	! loop til done
	  add	%o1, 4, %o1		! update DST
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  nop
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
!
! finish tail a halfword (and possibly a final byte) at a time
!
.co_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	lduh	[%o0], %o3		! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	stha	%o3, [%o1]ASI_USER	! write half word
	bgt,pt	%ncc, .co_sm_half	! loop til done
	  add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  nop
.co_sm_byte:
	ldub	[%o0], %o3
	stba	%o3, [%o1]ASI_USER
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0
	.align 16
!
! mutually word-aligned short copy: enter the .co_sm_words loop if more
! than one word remains, else store the word and mop up 0-3 bytes
!
.co_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .co_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  stwa	%o3, [%o1]ASI_USER	! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	add	%o1, 4, %o1
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3, [%o1]ASI_USER	! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	  stba	%o3, [%o1]ASI_USER	! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	inc	%o1
	stba	%o3, [%o1]ASI_USER	! store third byte
!
! common exit: drop the fault handler and return success
!
.co_sm_exit:
	  membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0
1895
	.align 16
/*
 * Medium-length copyout (count > CHKSIZE).  Dispatch on the mutual
 * alignability of src and dst (xor of the addresses): long-word, word,
 * half-word, or fall back to the 4-bytes-at-a-time byte mover.
 * Register usage is the same as .sm_do_copyout above (%o4 = saved
 * t_lofault, user stores via ASI_USER).
 */
.co_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .co_sm_movebytes	! not even halfword alignable
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .co_med_half	! halfword alignable only
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .co_med_word	! word alignable only
	  nop
.co_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_long1	! word alignment
	  nop
.co_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_long0
	  dec	%o2
.co_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .co_med_long2
	  nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stwa	%o3, [%o1]ASI_USER	! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
	sub	%o1, 8, %o1		! adjust pointer to allow store in
					! branch delay slot instead of add
.co_med_lmove:
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 16], %o3
	add	%o1, 8, %o1		! advance DST by 8
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 8], %o3
	add	%o1, 8, %o1		! advance DST by 8
	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
	  stxa	%o3, [%o1]ASI_USER
	add	%o1, 8, %o1		! advance DST by 8
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
	  nop
.co_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
	  add	%o1, 8, %o1		! advance DST by 8
.co_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	  nop
	ba,pt	%ncc, .co_sm_half
	  nop

	.align 16
	nop				! instruction alignment
					! see discussion at start of file
.co_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_word1	! word alignment
	  nop
.co_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_word0
	  dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.co_med_wmove:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 8], %o3
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 4], %o3
	stwa	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .co_med_wextra	! check for more words to move
	  nop
.co_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
.co_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	  nop
	ba,pt	%ncc, .co_sm_half
	  nop

	.align 16
	nop				! instruction alignment
	nop				! see discussion at start of file
	nop
.co_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .co_med_half1	! half word alignment
	  nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.co_med_hmove:
	lduh	[%o0], %o3		! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	stha	%o3, [%o1]ASI_USER	! write half word
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 4], %o3
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 2], %o3
	stha	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
	  add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	  nop
	ba,pt	%ncc, .co_sm_half
	  nop
2068
2069/*
2070 * We got here because of a fault during short copyout.
2071 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2072 */
2073.sm_copyout_err:
2074	membar	#Sync
2075	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2076	mov	SM_SAVE_SRC, %o0
2077	mov	SM_SAVE_DST, %o1
2078	mov	SM_SAVE_COUNT, %o2
2079	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2080	tst	%o3
2081	bz,pt	%ncc, 3f			! if not, return error
2082	  nop
2083	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2084	jmp	%o5				! original arguments
2085	  nop
20863:
2087	retl
2088	  or	%g0, -1, %o0		! return error value
2089
2090	SET_SIZE(copyout)
2091
2092/*
2093 * The _more entry points are not intended to be used directly by
2094 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
2096 * the floating point registers.
2097 * This entry is particularly important as DTRACE (at least as of
2098 * 4/2004) does not support leaf functions.
2099 */
2100
	ENTRY(copyout_more)
/*
 * FP-register (VIS block-store) version of copyout, for copies larger
 * than VIS_COPY_THRESHOLD / the hw_copy_limit_N cutoffs.  Runs in its
 * own register window; per the convention described at the top of the
 * file, %l6 carries the saved t_lofault plus the FPUSED_FLAG bit.
 */
.copyout_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyout_err, REAL_LOFAULT

/*
 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyout:
        set     copyio_fault, %l7		! .copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)		! pin thread; we're using the FPU

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyout	! FP unused: just enable it
	  wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)		! FP live: save user's %f16-%f62

.do_blockcopyout:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6	! note FP state must be restored on fault

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi		! %asi used for all user-side stores
	bz,pt	%ncc, 2f
	  neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	  sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.co_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stba	SRC, [DST]%asi
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stba	SRC, [DST + 1]%asi
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stba	SRC, [DST - 2]%asi
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .co_blkalign
	  stba	SRC, [DST - 1]%asi

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	  nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	  stba	SRC, [DST - 1]%asi

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	!
	! Prime the software pipeline: load the first block, set up
	! alignaddr for faligndata, and issue the streaming prefetches.
	ldd	[SRC], %f16
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f18
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x10], %f20
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x18], %f22
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x20], %f24
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f22, %f24, %f54
	ldd	[SRC + 0x28], %f26
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f24, %f26, %f56
	ldd	[SRC + 0x30], %f28
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f26, %f28, %f58
	ldd	[SRC + 0x38], %f30
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
/*
 * Main loop: one 64-byte VIS block per iteration.  Loads run one block
 * ahead of the faligndata/stda stream; prefetches run OLYMPUS_C_PREFETCH
 * blocks ahead of the loads.
 */
1:
	ldd	[SRC + 0x08], %f18
	faligndata %f28, %f30, %f60
	ldd	[SRC + 0x10], %f20
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x20], %f24
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x28], %f26
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x30], %f28
	faligndata %f22, %f24, %f54
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f30
	faligndata %f24, %f26, %f56
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	faligndata %f26, %f28, %f58
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	  andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	  nop
3:
	! misaligned source: drain the pipeline with one last faligndata block
	faligndata %f28, %f30, %f60
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	  nop
2:
	! source was 8-byte aligned: copy the final two blocks with fsrc1
	ldd	[SRC + 0x08], %f18
	fsrc1	%f28, %f60
	ldd	[SRC + 0x10], %f20
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	fsrc1	%f16, %f48
	ldd	[SRC + 0x20], %f24
	fsrc1	%f18, %f50
	ldd	[SRC + 0x28], %f26
	fsrc1	%f20, %f52
	ldd	[SRC + 0x30], %f28
	fsrc1	%f22, %f54
	ldd	[SRC + 0x38], %f30
	fsrc1	%f24, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f26, %f58
	fsrc1	%f28, %f60
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, 4f
	  nop

3:	tst	CNT
	bz,a	%ncc, 4f
	  nop

	! copy any remaining (< VIS_BLOCKSIZE) bytes one at a time
5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	  stba	TMP, [DST - 1]%asi
4:

.copyout_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ2Q4_FROMSTACK(%o2)	! FP was live: reload user's regs

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4			! FP was unused: scrub the regs we
					! dirtied (see macro definition)
	wr	%o3, 0, %fprs		! restore fprs

1:
	membar	#Sync
	andn	%l6, FPUSED_FLAG, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	  restore	%g0, 0, %o0

/*
 * We got here because of a fault during copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	  nop
	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	  restore %g0, 0, %g0			! dispose of copy window
2:
        ret
	  restore %g0, -1, %o0			! return error value


	SET_SIZE(copyout_more)
2335
2336
	ENTRY(xcopyout)
/*
 * xcopyout: like copyout, but on fault returns the errno value (see
 * .xcopyout_err / .sm_xcopyout_err below) rather than -1.
 * Dispatch mirrors copyout: pick the small (leaf) version or the FP
 * (copyout_more) version based on length, the mutual src/dst
 * alignability, and the hw_copy_limit_N tunables; a limit of zero
 * disables the FP path for that alignment class.
 */
	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .xcopyout_small		! go to larger cases
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .xcopyout_8		!
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .xcopyout_2		! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
	  nop
.xcopyout_2:
	btst	3, %o3				!
	bz,pt	%ncc, .xcopyout_4		! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
	  nop
.xcopyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
	  nop
.xcopyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyout_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
	  nop

.xcopyout_small:
	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
	or	%o5, %lo(.sm_xcopyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyout		! common code
	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.xcopyout_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.xcopyout_err), REAL_LOFAULT
	ba,pt	%ncc, .do_copyout		! common code
	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT

/*
 * We got here because of fault during xcopyout
 * Errno value is in ERRNO
 */
.xcopyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	  nop
	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
	jmp	%g2				! original arguments
	  restore %g0, 0, %g0			! dispose of copy window
2:
        ret
	  restore ERRNO, 0, %o0			! return errno value

.sm_xcopyout_err:

	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
	mov	SM_SAVE_SRC, %o0		! restore original arguments
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	  nop
	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
	jmp	%o5				! original arguments
	  nop
3:
	retl
	  or	%g1, 0, %o0		! return errno value
					! NOTE(review): assumes ERRNO is %g1
					! here -- confirm against trap code

	SET_SIZE(xcopyout)
2440
	ENTRY(xcopyout_little)
/*
 * Copy %o2 bytes from kernel address %o0 to user address %o1 in
 * reversed byte order: the address arithmetic below reads the source
 * from its last byte backward while the destination index (%o3, held
 * as -count and incremented toward zero) walks forward, i.e.
 * dst[i] = src[count - 1 - i].  User stores go through ASI_AIUSL
 * (user secondary, little-endian ASI).  Returns 0; on fault,
 * .xcopyio_err gets control with the old t_lofault kept in %o5.
 */
	sethi	%hi(.xcopyio_err), %o5
	or	%o5, %lo(.xcopyio_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
	mov	%o4, %o5			! keep old handler for exit path

	subcc	%g0, %o2, %o3		! %o3 = -count, also tests count == 0
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	  sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	ldub	[%o0 + %o3], %o4

1:	stba	%o4, [%o1 + %o3]ASI_AIUSL
	inccc	%o3			! carry set when %o3 wraps to zero
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	  ldub	[%o0 + %o3], %o4

2:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return (0)

	SET_SIZE(xcopyout_little)
2470
2471/*
2472 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2473 */
2474
2475	ENTRY(copyin)
2476	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2477	bleu,pt	%ncc, .copyin_small		! go to larger cases
2478	  xor	%o0, %o1, %o3			! are src, dst alignable?
2479	btst	7, %o3				!
2480	bz,pt	%ncc, .copyin_8			! check for longword alignment
2481	  nop
2482	btst	1, %o3				!
2483	bz,pt	%ncc, .copyin_2			! check for half-word
2484	  nop
2485	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2486	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2487	tst	%o3
2488	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2489	  cmp	%o2, %o3			! if length <= limit
2490	bleu,pt	%ncc, .copyin_small		! go to small copy
2491	  nop
2492	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2493	  nop
2494.copyin_2:
2495	btst	3, %o3				!
2496	bz,pt	%ncc, .copyin_4			! check for word alignment
2497	  nop
2498	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2499	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2500	tst	%o3
2501	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2502	  cmp	%o2, %o3			! if length <= limit
2503	bleu,pt	%ncc, .copyin_small		! go to small copy
2504	  nop
2505	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2506	  nop
2507.copyin_4:
2508	! already checked longword, must be word aligned
2509	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2510	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2511	tst	%o3
2512	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2513	  cmp	%o2, %o3			! if length <= limit
2514	bleu,pt	%ncc, .copyin_small		! go to small copy
2515	  nop
2516	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2517	  nop
2518.copyin_8:
2519	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2520	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2521	tst	%o3
2522	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2523	  cmp	%o2, %o3			! if length <= limit
2524	bleu,pt	%ncc, .copyin_small		! go to small copy
2525	  nop
2526	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2527	  nop
2528
2529	.align	16
2530	nop				! instruction alignment
2531					! see discussion at start of file
2532.copyin_small:
2533	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
2534	or	%o5, %lo(.sm_copyin_err), %o5
2535	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
2536	membar	#Sync				! sync error barrier
2537	stn	%o5, [THREAD_REG + T_LOFAULT]
2538.sm_do_copyin:
2539	mov	%o0, SM_SAVE_SRC
2540	mov	%o1, SM_SAVE_DST
2541	cmp	%o2, SHORTCOPY		! check for really short case
2542	bleu,pt	%ncc, .ci_sm_left	!
2543	  mov	%o2, SM_SAVE_COUNT
2544	cmp	%o2, CHKSIZE		! check for medium length cases
2545	bgu,pn	%ncc, .ci_med		!
2546	  or	%o0, %o1, %o3		! prepare alignment check
2547	andcc	%o3, 0x3, %g0		! test for alignment
2548	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
2549.ci_sm_movebytes:
2550	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
2551.ci_sm_notalign4:
2552	lduba	[%o0]ASI_USER, %o3	! read byte
2553	subcc	%o2, 4, %o2		! reduce count by 4
2554	stb	%o3, [%o1]		! write byte
2555	add	%o0, 1, %o0		! advance SRC by 1
2556	lduba	[%o0]ASI_USER, %