/*
 * CDDL HEADER START
325cf1a30Sjl *
425cf1a30Sjl * The contents of this file are subject to the terms of the
525cf1a30Sjl * Common Development and Distribution License (the "License").
625cf1a30Sjl * You may not use this file except in compliance with the License.
725cf1a30Sjl *
825cf1a30Sjl * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
925cf1a30Sjl * or http://www.opensolaris.org/os/licensing.
1025cf1a30Sjl * See the License for the specific language governing permissions
1125cf1a30Sjl * and limitations under the License.
1225cf1a30Sjl *
1325cf1a30Sjl * When distributing Covered Code, include this CDDL HEADER in each
1425cf1a30Sjl * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1525cf1a30Sjl * If applicable, add the following below this CDDL HEADER, with the
1625cf1a30Sjl * fields enclosed by brackets "[]" replaced with your own identifying
1725cf1a30Sjl * information: Portions Copyright [yyyy] [name of copyright owner]
1825cf1a30Sjl *
1925cf1a30Sjl * CDDL HEADER END
2025cf1a30Sjl */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
2325cf1a30Sjl * Use is subject to license terms.
2425cf1a30Sjl */
2625cf1a30Sjl#include <sys/param.h>
2725cf1a30Sjl#include <sys/errno.h>
2825cf1a30Sjl#include <sys/asm_linkage.h>
2925cf1a30Sjl#include <sys/vtrace.h>
3025cf1a30Sjl#include <sys/machthread.h>
3125cf1a30Sjl#include <sys/clock.h>
3225cf1a30Sjl#include <sys/asi.h>
3325cf1a30Sjl#include <sys/fsr.h>
3425cf1a30Sjl#include <sys/privregs.h>
3625cf1a30Sjl#include "assym.h"
/*
 * Pseudo-code to aid in understanding the control flow of the
4025cf1a30Sjl * bcopy/copyin/copyout routines.
4125cf1a30Sjl *
4225cf1a30Sjl * On entry:
4325cf1a30Sjl *
4425cf1a30Sjl * 	! Determine whether to use the FP register version
4525cf1a30Sjl * 	! or the leaf routine version depending on size
4625cf1a30Sjl * 	! of copy and flags.  Set up error handling accordingly.
4725cf1a30Sjl *	! The transition point depends on whether the src and
4825cf1a30Sjl * 	! dst addresses can be aligned to long word, word,
4925cf1a30Sjl * 	! half word, or byte boundaries.
5025cf1a30Sjl *	!
5125cf1a30Sjl *	! WARNING: <Register usage convention>
5225cf1a30Sjl *	! For FP version, %l6 holds previous error handling and
5325cf1a30Sjl *	! a flag: TRAMP_FLAG (low bits)
5425cf1a30Sjl *	! for leaf routine version, %o4 holds those values.
5525cf1a30Sjl *	! So either %l6 or %o4 is reserved and not available for
5625cf1a30Sjl *	! any other use.
5725cf1a30Sjl *
5825cf1a30Sjl * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
5925cf1a30Sjl * 		go to small_copy;		! to speed short copies
6025cf1a30Sjl *
6125cf1a30Sjl * 	! src, dst long word alignable
6225cf1a30Sjl * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
6325cf1a30Sjl * 			go to small_copy;
6425cf1a30Sjl *		if (length <= hw_copy_limit_8)
6525cf1a30Sjl * 			go to small_copy;
6625cf1a30Sjl * 		go to FPBLK_copy;
6725cf1a30Sjl * 	}
6825cf1a30Sjl * 	if (src,dst not alignable) {
6925cf1a30Sjl * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
7025cf1a30Sjl * 			go to small_copy;
7125cf1a30Sjl *		if (length <= hw_copy_limit_1)
7225cf1a30Sjl * 			go to small_copy;
7325cf1a30Sjl * 		go to FPBLK_copy;
7425cf1a30Sjl * 	}
7525cf1a30Sjl * 	if (src,dst halfword alignable) {
7625cf1a30Sjl * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
7725cf1a30Sjl * 			go to small_copy;
7825cf1a30Sjl *		if (length <= hw_copy_limit_2)
7925cf1a30Sjl * 			go to small_copy;
8025cf1a30Sjl * 		go to FPBLK_copy;
8125cf1a30Sjl * 	}
8225cf1a30Sjl * 	if (src,dst word alignable) {
8325cf1a30Sjl * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
8425cf1a30Sjl * 			go to small_copy;
8525cf1a30Sjl *		if (length <= hw_copy_limit_4)
8625cf1a30Sjl * 			go to small_copy;
8725cf1a30Sjl * 		go to FPBLK_copy;
8825cf1a30Sjl * 	}
8925cf1a30Sjl *
9025cf1a30Sjl * small_copy:
9125cf1a30Sjl *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
9225cf1a30Sjl *
9325cf1a30Sjl *	if (count <= 3)				! fast path for tiny copies
9425cf1a30Sjl *		go to sm_left;			! special finish up code
9525cf1a30Sjl *	else
9625cf1a30Sjl *		if (count > CHKSIZE)		! medium sized copies
9725cf1a30Sjl *			go to sm_med		! tuned by alignment
9825cf1a30Sjl *		if(src&dst not both word aligned) {
9925cf1a30Sjl *	sm_movebytes:
10025cf1a30Sjl *			move byte by byte in 4-way unrolled loop
10125cf1a30Sjl *			fall into sm_left;
10225cf1a30Sjl *	sm_left:
10325cf1a30Sjl *			move 0-3 bytes byte at a time as needed.
10425cf1a30Sjl *			restore error handler and exit.
10525cf1a30Sjl *
10625cf1a30Sjl * 		} else {	! src&dst are word aligned
10725cf1a30Sjl *			check for at least 8 bytes left,
10825cf1a30Sjl *			move word at a time, unrolled by 2
10925cf1a30Sjl *			when fewer than 8 bytes left,
11025cf1a30Sjl *	sm_half:	move half word at a time while 2 or more bytes left
11125cf1a30Sjl *	sm_byte:	move final byte if necessary
11225cf1a30Sjl *	sm_exit:
11325cf1a30Sjl *			restore error handler and exit.
11425cf1a30Sjl *		}
11525cf1a30Sjl *
11625cf1a30Sjl * ! Medium length cases with at least CHKSIZE bytes available
11725cf1a30Sjl * ! method: line up src and dst as best possible, then
11825cf1a30Sjl * ! move data in 4-way unrolled loops.
11925cf1a30Sjl *
12025cf1a30Sjl * sm_med:
12125cf1a30Sjl *	if(src&dst unalignable)
12225cf1a30Sjl * 		go to sm_movebytes
12325cf1a30Sjl *	if(src&dst halfword alignable)
12425cf1a30Sjl *		go to sm_movehalf
12525cf1a30Sjl *	if(src&dst word alignable)
12625cf1a30Sjl *		go to sm_moveword
12725cf1a30Sjl * ! fall into long word movement
12825cf1a30Sjl *	move bytes until src is word aligned
12925cf1a30Sjl *	if not long word aligned, move a word
13025cf1a30Sjl *	move long words in 4-way unrolled loop until < 32 bytes left
13125cf1a30Sjl *      move long words in 1-way unrolled loop until < 8 bytes left
13225cf1a30Sjl *	if zero bytes left, goto sm_exit
13325cf1a30Sjl *	if one byte left, go to sm_byte
13425cf1a30Sjl *	else go to sm_half
13525cf1a30Sjl *
13625cf1a30Sjl * sm_moveword:
13725cf1a30Sjl *	move bytes until src is word aligned
13825cf1a30Sjl *	move words in 4-way unrolled loop until < 16 bytes left
13925cf1a30Sjl *      move words in 1-way unrolled loop until < 4 bytes left
14025cf1a30Sjl *	if zero bytes left, goto sm_exit
14125cf1a30Sjl *	if one byte left, go to sm_byte
14225cf1a30Sjl *	else go to sm_half
14325cf1a30Sjl *
14425cf1a30Sjl * sm_movehalf:
14525cf1a30Sjl *	move a byte if needed to align src on halfword
14625cf1a30Sjl *	move halfwords in 4-way unrolled loop until < 8 bytes left
14725cf1a30Sjl *	if zero bytes left, goto sm_exit
14825cf1a30Sjl *	if one byte left, go to sm_byte
14925cf1a30Sjl *	else go to sm_half
15025cf1a30Sjl *
15125cf1a30Sjl *
15225cf1a30Sjl * FPBLK_copy:
15325cf1a30Sjl * 	%l6 = curthread->t_lofault;
15425cf1a30Sjl * 	if (%l6 != NULL) {
15525cf1a30Sjl * 		membar #Sync
15625cf1a30Sjl * 		curthread->t_lofault = .copyerr;
15725cf1a30Sjl * 		caller_error_handler = TRUE             ! %l6 |= 2
15825cf1a30Sjl * 	}
15925cf1a30Sjl *
16025cf1a30Sjl *	! for FPU testing we must not migrate cpus
16125cf1a30Sjl * 	if (curthread->t_lwp == NULL) {
16225cf1a30Sjl *		! Kernel threads do not have pcb's in which to store
16325cf1a30Sjl *		! the floating point state, so disallow preemption during
16425cf1a30Sjl *		! the copy.  This also prevents cpu migration.
16525cf1a30Sjl * 		kpreempt_disable(curthread);
16625cf1a30Sjl *	} else {
16725cf1a30Sjl *		thread_nomigrate();
16825cf1a30Sjl *	}
16925cf1a30Sjl *
17025cf1a30Sjl * 	old_fprs = %fprs;
17125cf1a30Sjl * 	old_gsr = %gsr;
17225cf1a30Sjl * 	if (%fprs.fef) {
17325cf1a30Sjl * 		%fprs.fef = 1;
17425cf1a30Sjl * 		save current fpregs on stack using blockstore
17525cf1a30Sjl * 	} else {
17625cf1a30Sjl * 		%fprs.fef = 1;
17725cf1a30Sjl * 	}
17825cf1a30Sjl *
17925cf1a30Sjl *
18025cf1a30Sjl * 	do_blockcopy_here;
18125cf1a30Sjl *
18225cf1a30Sjl * In lofault handler:
18325cf1a30Sjl *	curthread->t_lofault = .copyerr2;
18425cf1a30Sjl *	Continue on with the normal exit handler
18525cf1a30Sjl *
18625cf1a30Sjl * On normal exit:
18725cf1a30Sjl * 	%gsr = old_gsr;
18825cf1a30Sjl * 	if (old_fprs & FPRS_FEF)
18925cf1a30Sjl * 		restore fpregs from stack using blockload
19025cf1a30Sjl *	else
19125cf1a30Sjl *		zero fpregs
19225cf1a30Sjl * 	%fprs = old_fprs;
19325cf1a30Sjl * 	membar #Sync
19425cf1a30Sjl * 	curthread->t_lofault = (%l6 & ~3);
19525cf1a30Sjl *	! following test omitted from copyin/copyout as they
19625cf1a30Sjl *	! will always have a current thread
19725cf1a30Sjl * 	if (curthread->t_lwp == NULL)
19825cf1a30Sjl *		kpreempt_enable(curthread);
19925cf1a30Sjl *	else
20025cf1a30Sjl *		thread_allowmigrate();
20125cf1a30Sjl * 	return (0)
20225cf1a30Sjl *
20325cf1a30Sjl * In second lofault handler (.copyerr2):
20425cf1a30Sjl *	We've tried to restore fp state from the stack and failed.  To
20525cf1a30Sjl *	prevent from returning with a corrupted fp state, we will panic.
20625cf1a30Sjl */
/*
 * Comments about optimization choices
21025cf1a30Sjl *
21125cf1a30Sjl * The initial optimization decision in this code is to determine
21225cf1a30Sjl * whether to use the FP registers for a copy or not.  If we don't
21325cf1a30Sjl * use the FP registers, we can execute the copy as a leaf routine,
21425cf1a30Sjl * saving a register save and restore.  Also, less elaborate setup
21525cf1a30Sjl * is required, allowing short copies to be completed more quickly.
21625cf1a30Sjl * For longer copies, especially unaligned ones (where the src and
21725cf1a30Sjl * dst do not align to allow simple ldx,stx operation), the FP
21825cf1a30Sjl * registers allow much faster copy operations.
21925cf1a30Sjl *
22025cf1a30Sjl * The estimated extra cost of the FP path will vary depending on
22125cf1a30Sjl * src/dst alignment, dst offset from the next 64 byte FPblock store
22225cf1a30Sjl * boundary, remaining src data after the last full dst cache line is
22325cf1a30Sjl * moved whether the FP registers need to be saved, and some other
22425cf1a30Sjl * minor issues.  The average additional overhead is estimated to be
22525cf1a30Sjl * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
22725cf1a30Sjl * longer copies and only benefit a small portion of medium sized
22825cf1a30Sjl * copies.  Rather than incur such cost, we chose fixed transition
22925cf1a30Sjl * points for each of the alignment choices.
23025cf1a30Sjl *
23125cf1a30Sjl * For the inner loop, here is a comparison of the per cache line
23225cf1a30Sjl * costs for each alignment when src&dst are in cache:
23325cf1a30Sjl *
23425cf1a30Sjl * byte aligned:  108 clocks slower for non-FPBLK
23525cf1a30Sjl * half aligned:   44 clocks slower for non-FPBLK
23625cf1a30Sjl * word aligned:   12 clocks slower for non-FPBLK
23725cf1a30Sjl * long aligned:    4 clocks >>faster<< for non-FPBLK
23825cf1a30Sjl *
23925cf1a30Sjl * The long aligned loop runs faster because it does no prefetching.
24025cf1a30Sjl * That wins if the data is not in cache or there is too little
24125cf1a30Sjl * data to gain much benefit from prefetching.  But when there
24225cf1a30Sjl * is more data and that data is not in cache, failing to prefetch
24325cf1a30Sjl * can run much slower.  In addition, there is a 2 Kbyte store queue
24425cf1a30Sjl * which will cause the non-FPBLK inner loop to slow for larger copies.
24525cf1a30Sjl * The exact tradeoff is strongly load and application dependent, with
24625cf1a30Sjl * increasing risk of a customer visible performance regression if the
24725cf1a30Sjl * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
24825cf1a30Sjl * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
24925cf1a30Sjl * upper limit for the non-FPBLK code.  To minimize performance regression
25025cf1a30Sjl * risk while still gaining the primary benefits of the improvements to
25125cf1a30Sjl * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
25225cf1a30Sjl * hw_copy_limit_*.  Later experimental studies using different values
25325cf1a30Sjl * of hw_copy_limit_* can be used to make further adjustments if
25425cf1a30Sjl * appropriate.
25525cf1a30Sjl *
25625cf1a30Sjl * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
25725cf1a30Sjl * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
25825cf1a30Sjl * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
25925cf1a30Sjl * hw_copy_limit_8 = src and dst are longword aligned
26025cf1a30Sjl *
26125cf1a30Sjl * To say that src and dst are word aligned means that after
26225cf1a30Sjl * some initial alignment activity of moving 0 to 3 bytes,
26325cf1a30Sjl * both the src and dst will be on word boundaries so that
26425cf1a30Sjl * word loads and stores may be used.
26525cf1a30Sjl *
26625cf1a30Sjl * Default values at May,2005 are:
26725cf1a30Sjl * hw_copy_limit_1 =  256
26825cf1a30Sjl * hw_copy_limit_2 =  512
26925cf1a30Sjl * hw_copy_limit_4 = 1024
27025cf1a30Sjl * hw_copy_limit_8 = 1024 (or 1536 on some systems)
27125cf1a30Sjl *
27225cf1a30Sjl *
27325cf1a30Sjl * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
27425cf1a30Sjl * disabled for that alignment choice.
27525cf1a30Sjl * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
27625cf1a30Sjl * the value of VIS_COPY_THRESHOLD is used.
27725cf1a30Sjl * It is not envisioned that hw_copy_limit_? will be changed in the field
27825cf1a30Sjl * It is provided to allow for disabling FPBLK copies and to allow
27925cf1a30Sjl * easy testing of alternate values on future HW implementations
28025cf1a30Sjl * that might have different cache sizes, clock rates or instruction
28125cf1a30Sjl * timing rules.
28225cf1a30Sjl *
28325cf1a30Sjl * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
28425cf1a30Sjl * threshold to speedup all shorter copies (less than 256).  That
28525cf1a30Sjl * saves an alignment test, memory reference, and enabling test
28625cf1a30Sjl * for all short copies, or an estimated 24 clocks.
28725cf1a30Sjl *
28825cf1a30Sjl * The order in which these limits are checked does matter since each
28925cf1a30Sjl * non-predicted tst and branch costs around 10 clocks.
29025cf1a30Sjl * If src and dst are randomly selected addresses,
29125cf1a30Sjl * 4 of 8 will not be alignable.
29225cf1a30Sjl * 2 of 8 will be half word alignable.
29325cf1a30Sjl * 1 of 8 will be word alignable.
29425cf1a30Sjl * 1 of 8 will be long word alignable.
29525cf1a30Sjl * But, tests on running kernels show that src and dst to copy code
29625cf1a30Sjl * are typically not on random alignments.  Structure copies and
29725cf1a30Sjl * copies of larger data sizes are often on long word boundaries.
29825cf1a30Sjl * So we test the long word alignment case first, then
29925cf1a30Sjl * the byte alignment, then halfword, then word alignment.
30025cf1a30Sjl *
30125cf1a30Sjl * Several times, tests for length are made to split the code
30225cf1a30Sjl * into subcases.  These tests often allow later tests to be
30325cf1a30Sjl * avoided.  For example, within the non-FPBLK copy, we first
30425cf1a30Sjl * check for tiny copies of 3 bytes or less.  That allows us
30525cf1a30Sjl * to use a 4-way unrolled loop for the general byte copy case
30625cf1a30Sjl * without a test on loop entry.
30725cf1a30Sjl * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
30825cf1a30Sjl * vs longer cases.  For the really short case, we don't attempt
30925cf1a30Sjl * align src and dst.  We try to minimize special case tests in
31025cf1a30Sjl * the shortest loops as each test adds a significant percentage
31125cf1a30Sjl * to the total time.
31225cf1a30Sjl *
31325cf1a30Sjl * For the medium sized cases, we allow ourselves to adjust the
31425cf1a30Sjl * src and dst alignment and provide special cases for each of
31525cf1a30Sjl * the four adjusted alignment cases. The CHKSIZE that was used
31625cf1a30Sjl * to decide between short and medium size was chosen to be 39
31725cf1a30Sjl * as that allows for the worst case of 7 bytes of alignment
31825cf1a30Sjl * shift and 4 times 8 bytes for the first long word unrolling.
31925cf1a30Sjl * That knowledge saves an initial test for length on entry into
32025cf1a30Sjl * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
32225cf1a30Sjl *
32325cf1a30Sjl * For all cases in the non-FPBLK code where it is known that at
32425cf1a30Sjl * least 4 chunks of data are available for movement, the
32525cf1a30Sjl * loop is unrolled by four.  This 4-way loop runs in 8 clocks
32625cf1a30Sjl * or 2 clocks per data element.
32725cf1a30Sjl *
32825cf1a30Sjl * Instruction alignment is forced by used of .align 16 directives
32925cf1a30Sjl * and nops which are not executed in the code.  This
33025cf1a30Sjl * combination of operations shifts the alignment of following
33125cf1a30Sjl * loops to insure that loops are aligned so that their instructions
33225cf1a30Sjl * fall within the minimum number of 4 instruction fetch groups.
33325cf1a30Sjl * If instructions are inserted or removed between the .align
33425cf1a30Sjl * instruction and the unrolled loops, then the alignment needs
33525cf1a30Sjl * to be readjusted.  Misaligned loops can add a clock per loop
33625cf1a30Sjl * iteration to the loop timing.
33725cf1a30Sjl *
33825cf1a30Sjl * In a few cases, code is duplicated to avoid a branch.  Since
33925cf1a30Sjl * a non-predicted tst and branch takes 10 clocks, this savings
34025cf1a30Sjl * is judged an appropriate time-space tradeoff.
34125cf1a30Sjl *
34225cf1a30Sjl * Within the FPBLK-code, the prefetch method in the inner
34325cf1a30Sjl * loop needs to be explained as it is not standard.  Two
34425cf1a30Sjl * prefetches are issued for each cache line instead of one.
34525cf1a30Sjl * The primary one is at the maximum reach of 8 cache lines.
34625cf1a30Sjl * Most of the time, that maximum prefetch reach gives the
34725cf1a30Sjl * cache line more time to reach the processor for systems with
34825cf1a30Sjl * higher processor clocks.  But, sometimes memory interference
34925cf1a30Sjl * can cause that prefetch to be dropped.  Putting a second
35025cf1a30Sjl * prefetch at a reach of 5 cache lines catches the drops
35125cf1a30Sjl * three iterations later and shows a measured improvement
35225cf1a30Sjl * in performance over any similar loop with a single prefetch.
35325cf1a30Sjl * The prefetches are placed in the loop so they overlap with
35425cf1a30Sjl * non-memory instructions, so that there is no extra cost
35525cf1a30Sjl * when the data is already in-cache.
35625cf1a30Sjl *
35725cf1a30Sjl */
/*
 * Notes on preserving existing fp state and on membars.
36125cf1a30Sjl *
36225cf1a30Sjl * When a copyOP decides to use fp we may have to preserve existing
36325cf1a30Sjl * floating point state.  It is not the caller's state that we need to
36425cf1a30Sjl * preserve - the rest of the kernel does not use fp and, anyway, fp
36525cf1a30Sjl * registers are volatile across a call.  Some examples:
36625cf1a30Sjl *
36725cf1a30Sjl *	- userland has fp state and is interrupted (device interrupt
36825cf1a30Sjl *	  or trap) and within the interrupt/trap handling we use
36925cf1a30Sjl *	  bcopy()
37025cf1a30Sjl *	- another (higher level) interrupt or trap handler uses bcopy
37125cf1a30Sjl *	  while a bcopy from an earlier interrupt is still active
37225cf1a30Sjl *	- an asynchronous error trap occurs while fp state exists (in
37325cf1a30Sjl *	  userland or in kernel copy) and the tl0 component of the handling
37425cf1a30Sjl *	  uses bcopy
37525cf1a30Sjl *	- a user process with fp state incurs a copy-on-write fault and
37625cf1a30Sjl *	  hwblkpagecopy always uses fp
37725cf1a30Sjl *
37825cf1a30Sjl * We therefore need a per-call place in which to preserve fp state -
37925cf1a30Sjl * using our stack is ideal (and since fp copy cannot be leaf optimized
38025cf1a30Sjl * because of calls it makes, this is no hardship).
38125cf1a30Sjl *
 * When we have finished fp copy (with its repeated block stores)
38325cf1a30Sjl * we must membar #Sync so that our block stores may complete before
38425cf1a30Sjl * we either restore the original fp state into the fp registers or
38525cf1a30Sjl * return to a caller which may initiate other fp operations that could
38625cf1a30Sjl * modify the fp regs we used before the block stores complete.
38725cf1a30Sjl *
38825cf1a30Sjl * Synchronous faults (eg, unresolvable DMMU miss) that occur while
38925cf1a30Sjl * t_lofault is not NULL will not panic but will instead trampoline
39025cf1a30Sjl * to the registered lofault handler.  There is no need for any
39125cf1a30Sjl * membars for these - eg, our store to t_lofault will always be visible to
39225cf1a30Sjl * ourselves and it is our cpu which will take any trap.
39325cf1a30Sjl *
39425cf1a30Sjl * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
39525cf1a30Sjl * while t_lofault is not NULL will also not panic.  Since we're copying
39625cf1a30Sjl * to or from userland the extent of the damage is known - the destination
39725cf1a30Sjl * buffer is incomplete.  So trap handlers will trampoline to the lofault
39825cf1a30Sjl * handler in this case which should take some form of error action to
39925cf1a30Sjl * avoid using the incomplete buffer.  The trap handler also flags the
40025cf1a30Sjl * fault so that later return-from-trap handling (for the trap that brought
40125cf1a30Sjl * this thread into the kernel in the first place) can notify the process
40225cf1a30Sjl * and reboot the system (or restart the service with Greenline/Contracts).
40325cf1a30Sjl *
40425cf1a30Sjl * Asynchronous faults (eg, uncorrectable ECC error from memory) can
40525cf1a30Sjl * result in deferred error traps - the trap is taken sometime after
40625cf1a30Sjl * the event and the trap PC may not be the PC of the faulting access.
40725cf1a30Sjl * Delivery of such pending traps can be forced by a membar #Sync, acting
40825cf1a30Sjl * as an "error barrier" in this role.  To accurately apply the user/kernel
40925cf1a30Sjl * separation described in the preceding paragraph we must force delivery
41025cf1a30Sjl * of deferred traps affecting kernel state before we install a lofault
41125cf1a30Sjl * handler (if we interpose a new lofault handler on an existing one there
41225cf1a30Sjl * is no need to repeat this), and we must force delivery of deferred
41325cf1a30Sjl * errors affecting the lofault-protected region before we clear t_lofault.
41425cf1a30Sjl * Failure to do so results in lost kernel state being interpreted as
41525cf1a30Sjl * affecting a copyin/copyout only, or of an error that really only
41625cf1a30Sjl * affects copy data being interpreted as losing kernel state.
41725cf1a30Sjl *
41825cf1a30Sjl * Since the copy operations may preserve and later restore floating
41925cf1a30Sjl * point state that does not belong to the caller (see examples above),
42025cf1a30Sjl * we must be careful in how we do this in order to prevent corruption
42125cf1a30Sjl * of another program.
42225cf1a30Sjl *
42325cf1a30Sjl * To make sure that floating point state is always saved and restored
42425cf1a30Sjl * correctly, the following "big rules" must be followed when the floating
42525cf1a30Sjl * point registers will be used:
42625cf1a30Sjl *
42725cf1a30Sjl * 1. %l6 always holds the caller's lofault handler.  Also in this register,
42825cf1a30Sjl *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
42925cf1a30Sjl *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
43025cf1a30Sjl *    lofault handler was set coming in.
43125cf1a30Sjl *
43225cf1a30Sjl * 2. The FPUSED flag indicates that all FP state has been successfully stored
43325cf1a30Sjl *    on the stack.  It should not be set until this save has been completed.
43425cf1a30Sjl *
43525cf1a30Sjl * 3. The FPUSED flag should not be cleared on exit until all FP state has
43625cf1a30Sjl *    been restored from the stack.  If an error occurs while restoring
43725cf1a30Sjl *    data from the stack, the error handler can check this flag to see if
43825cf1a30Sjl *    a restore is necessary.
43925cf1a30Sjl *
44025cf1a30Sjl * 4. Code run under the new lofault handler must be kept to a minimum.  In
44125cf1a30Sjl *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
44225cf1a30Sjl *    to kpreempt(), should not be made until after the lofault handler has
44325cf1a30Sjl *    been restored.
44425cf1a30Sjl */
/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
44825cf1a30Sjl * to "break even" using FP/VIS-accelerated memory operations.
44925cf1a30Sjl * The FPBLK code assumes a minimum number of bytes are available
45025cf1a30Sjl * to be moved on entry.  Check that code carefully before
45125cf1a30Sjl * reducing VIS_COPY_THRESHOLD below 256.
45225cf1a30Sjl */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
45525cf1a30Sjl * _ASM guards in include files it references. Change it here, change it there.
45625cf1a30Sjl */
/* Minimum copy length (bytes) for the FP/VIS path to break even. */
#define VIS_COPY_THRESHOLD 256
/*
 * TEST for very short copies
46125cf1a30Sjl * Be aware that the maximum unroll for the short unaligned case
46225cf1a30Sjl * is SHORTCOPY+1
46325cf1a30Sjl */
#define SHORTCOPY 3	/* <= SHORTCOPY bytes: finish-up code; unroll max is SHORTCOPY+1 */
#define CHKSIZE  39	/* > CHKSIZE bytes: use the aligned medium-copy loops */
/*
 * Indicates that we're to trampoline to the error handler.
46925cf1a30Sjl * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
47025cf1a30Sjl * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
47125cf1a30Sjl */
#define	FPUSED_FLAG	1	/* FP regs in use; FP state saved on stack */
#define	TRAMP_FLAG	2	/* trampoline to caller's lofault handler on error */
#define	MASK_FLAGS	3	/* both flag bits; cleared to recover handler address */
/*
 * Number of outstanding prefetches.
478c8a722abSpm * first prefetch moves data from L2 to L1 (n_reads)
479c8a722abSpm * second prefetch moves data from memory to L2 (one_read)
48025cf1a30Sjl */
#define	OLYMPUS_C_PREFETCH	24	/* primary reach: memory -> L2/L1 pipeline depth */
#define	OLYMPUS_C_2ND_PREFETCH	12	/* backup reach: catches dropped primary prefetches */
#define	VIS_BLOCKSIZE		64	/* bytes per VIS block load/store (one cache line) */
/*
 * Size of stack frame in order to accommodate a 64-byte aligned
48825cf1a30Sjl * floating-point register save area and 2 64-bit temp locations.
48925cf1a30Sjl * All copy functions use two quadrants of fp registers; to assure a
49025cf1a30Sjl * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
49225cf1a30Sjl * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
49325cf1a30Sjl *
49425cf1a30Sjl *    _______________________________________ <-- %fp + STACK_BIAS
49525cf1a30Sjl *    | We may need to preserve 2 quadrants |
49625cf1a30Sjl *    | of fp regs, but since we do so with |
49725cf1a30Sjl *    | BST/BLD we need room in which to    |
49825cf1a30Sjl *    | align to VIS_BLOCKSIZE bytes.  So   |
49925cf1a30Sjl *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
50025cf1a30Sjl *    |-------------------------------------|
50125cf1a30Sjl *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
50225cf1a30Sjl *    |-------------------------------------|
50325cf1a30Sjl *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
50425cf1a30Sjl *    ---------------------------------------
50525cf1a30Sjl */
/*
 * Frame layout per the diagram above: 3 blocks (192 bytes) for the
 * block-aligned fp register save area, plus 8 bytes each for %fprs
 * and %gsr.  All offsets are subtracted from %fp + STACK_BIAS.
 *
 * NOTE(review): SAVED_FPRS_OFFSET was missing although SAVED_GSR_OFFSET
 * is defined in terms of it; restored here as the fp-save area size
 * plus one 8-byte temp slot.
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
/*
 * Common macros used by the various versions of the block copy
51425cf1a30Sjl * routines in this file.
51525cf1a30Sjl */
/*
 * In FP copies if we do not have preserved data to restore over
51925cf1a30Sjl * the fp regs we used then we must zero those regs to avoid
52025cf1a30Sjl * exposing portions of the data to later threads (data security).
52125cf1a30Sjl *
52225cf1a30Sjl * Copy functions use either quadrants 1 and 3 or 2 and 4.
52325cf1a30Sjl *
52425cf1a30Sjl * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
52525cf1a30Sjl * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
52625cf1a30Sjl *
52725cf1a30Sjl * The instructions below are quicker than repeated fzero instructions
52825cf1a30Sjl * since they can dispatch down two fp pipelines.
52925cf1a30Sjl */
/* Zero quadrants 1 and 3: clear %f0, then propagate it with fmovd. */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46
/*
 * Zero quadrants 2 and 4: clear %f16, then propagate it with fmovd.
 * BUGFIX(review): the fmovd source register was %f0, which this macro
 * does not zero (%f0 is in quadrant 1, untouched here), so the target
 * registers would have received whatever stale value %f0 held instead
 * of zero -- defeating the data-security zeroing described above.
 * Propagate from %f16, which is cleared first, mirroring FZEROQ1Q3.
 */
#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62
/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
56825cf1a30Sjl * Used to save and restore in-use fp registers when we want to use FP
56925cf1a30Sjl * and find fp already in use and copy size still large enough to justify
57025cf1a30Sjl * the additional overhead of this save and restore.
57125cf1a30Sjl *
57225cf1a30Sjl * A membar #Sync is needed before save to sync fp ops initiated before
57325cf1a30Sjl * the call to the copy function (by whoever has fp in use); for example
57425cf1a30Sjl * an earlier block load to the quadrant we are about to save may still be
57525cf1a30Sjl * "in flight".  A membar #Sync is required at the end of the save to
57625cf1a30Sjl * sync our block store (the copy code is about to begin ldd's to the
57725cf1a30Sjl * first quadrant).
57825cf1a30Sjl *
57925cf1a30Sjl * Similarly: a membar #Sync before restore allows the block stores of
58025cf1a30Sjl * the copy operation to complete before we fill the quadrants with their
58125cf1a30Sjl * original data, and a membar #Sync after restore lets the block loads
58225cf1a30Sjl * of the restore complete before we return to whoever has the fp regs
58325cf1a30Sjl * in use.  To avoid repeated membar #Sync we make it the responsibility
58425cf1a30Sjl * of the copy code to membar #Sync immediately after copy is complete
58525cf1a30Sjl * and before using the BLD_*_FROMSTACK macro.
58625cf1a30Sjl */
/*
 * Block-store quadrants 1 and 3 (%f0/%f32) to the block-aligned save
 * area on our stack.  Caller provides the leading membar #Sync; the
 * trailing one lets the copy code begin using the quadrant at once.
 */
#define BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync
/*
 * Block-load quadrants 1 and 3 (%f0/%f32) back from the stack save
 * area.  The copy code is responsible for the membar #Sync at copy
 * completion; the trailing one syncs the loads before returning to
 * whoever owns the fp regs.
 */
#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync
/*
 * Block-store quadrants 2 and 4 (%f16/%f48) to the block-aligned save
 * area on our stack.  Same membar protocol as BST_FPQ1Q3_TOSTACK.
 */
#define BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync
61425cf1a30Sjl#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
61525cf1a30Sjl	/* membar #Sync - provided at copy completion */	;\
61625cf1a30Sjl	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
61725cf1a30Sjl	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
61825cf1a30Sjl	ldda	[tmp1]ASI_BLK_P, %f16				;\
61925cf1a30Sjl	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
62025cf1a30Sjl	ldda	[tmp1]ASI_BLK_P, %f48				;\
62125cf1a30Sjl	membar	#Sync
62425cf1a30Sjl * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
62525cf1a30Sjl * prevent preemption if there is no t_lwp to save FP state to on context
62625cf1a30Sjl * switch) before commencing a FP copy, and reallow it on completion or
62725cf1a30Sjl * in error trampoline paths when we were using FP copy.
62825cf1a30Sjl *
62925cf1a30Sjl * Both macros may call other functions, so be aware that all outputs are
63025cf1a30Sjl * forfeit after using these macros.  For this reason we do not pass registers
63125cf1a30Sjl * to use - we just use any outputs we want.
63225cf1a30Sjl *
63325cf1a30Sjl * Pseudo code:
63425cf1a30Sjl *
63525cf1a30Sjl * FP_NOMIGRATE:
63625cf1a30Sjl *
63725cf1a30Sjl * if (curthread->t_lwp) {
63825cf1a30Sjl *	thread_nomigrate();
63925cf1a30Sjl * } else {
64025cf1a30Sjl *	kpreempt_disable();
64125cf1a30Sjl * }
64225cf1a30Sjl *
64325cf1a30Sjl * FP_ALLOWMIGRATE:
64425cf1a30Sjl *
64525cf1a30Sjl * if (curthread->t_lwp) {
64625cf1a30Sjl *	thread_allowmigrate();
64725cf1a30Sjl * } else {
64825cf1a30Sjl *	kpreempt_enable();
64925cf1a30Sjl * }
65025cf1a30Sjl */
65225cf1a30Sjl#define	FP_NOMIGRATE(label1, label2)				\
65325cf1a30Sjl	ldn	[THREAD_REG + T_LWP], %o0			;\
65425cf1a30Sjl	brz,a,pn %o0, label1/**/f				;\
65525cf1a30Sjl	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
65625cf1a30Sjl	call	thread_nomigrate				;\
65725cf1a30Sjl	  nop							;\
65825cf1a30Sjl	ba	label2/**/f					;\
65925cf1a30Sjl	  nop							;\
66025cf1a30Sjllabel1:								;\
66125cf1a30Sjl	inc	%o1						;\
66225cf1a30Sjl	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
66525cf1a30Sjl#define	FP_ALLOWMIGRATE(label1, label2)			\
66625cf1a30Sjl	ldn	[THREAD_REG + T_LWP], %o0			;\
66725cf1a30Sjl	brz,a,pn %o0, label1/**/f				;\
66825cf1a30Sjl	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
66925cf1a30Sjl	call thread_allowmigrate				;\
67025cf1a30Sjl	  nop							;\
67125cf1a30Sjl	ba	label2/**/f					;\
67225cf1a30Sjl	  nop							;\
67325cf1a30Sjllabel1:								;\
67425cf1a30Sjl	dec	%o1						;\
67525cf1a30Sjl	brnz,pn	%o1, label2/**/f				;\
67625cf1a30Sjl	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
67725cf1a30Sjl	ldn	[THREAD_REG + T_CPU], %o0			;\
67825cf1a30Sjl	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
67925cf1a30Sjl	brz,pt	%o0, label2/**/f				;\
68025cf1a30Sjl	  nop							;\
68125cf1a30Sjl	call	kpreempt					;\
68225cf1a30Sjl	  rdpr	%pil, %o0					;\
68625cf1a30Sjl * Copy a block of storage, returning an error code if `from' or
68725cf1a30Sjl * `to' takes a kernel pagefault which cannot be resolved.
68825cf1a30Sjl * Returns errno value on pagefault error, 0 if all ok
68925cf1a30Sjl */
69125cf1a30Sjl	.seg	".text"
69225cf1a30Sjl	.align	4
69425cf1a30Sjl	ENTRY(kcopy)
69625cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
69725cf1a30Sjl	bleu,pt	%ncc, .kcopy_small		! go to larger cases
69825cf1a30Sjl	  xor	%o0, %o1, %o3			! are src, dst alignable?
69925cf1a30Sjl	btst	7, %o3				!
70025cf1a30Sjl	bz,pt	%ncc, .kcopy_8			! check for longword alignment
70125cf1a30Sjl	  nop
70225cf1a30Sjl	btst	1, %o3				!
70325cf1a30Sjl	bz,pt	%ncc, .kcopy_2			! check for half-word
70425cf1a30Sjl	  nop
70525cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
70625cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
70725cf1a30Sjl	tst	%o3
70825cf1a30Sjl	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
70925cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
71025cf1a30Sjl	bleu,pt	%ncc, .kcopy_small		! go to small copy
71125cf1a30Sjl	  nop
71225cf1a30Sjl	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
71325cf1a30Sjl	  nop
71525cf1a30Sjl	btst	3, %o3				!
71625cf1a30Sjl	bz,pt	%ncc, .kcopy_4			! check for word alignment
71725cf1a30Sjl	  nop
71825cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
71925cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
72025cf1a30Sjl	tst	%o3
72125cf1a30Sjl	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
72225cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
72325cf1a30Sjl	bleu,pt	%ncc, .kcopy_small		! go to small copy
72425cf1a30Sjl	  nop
72525cf1a30Sjl	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
72625cf1a30Sjl	  nop
72825cf1a30Sjl	! already checked longword, must be word aligned
72925cf1a30Sjl	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
73025cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
73125cf1a30Sjl	tst	%o3
73225cf1a30Sjl	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
73325cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
73425cf1a30Sjl	bleu,pt	%ncc, .kcopy_small		! go to small copy
73525cf1a30Sjl	  nop
73625cf1a30Sjl	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
73725cf1a30Sjl	  nop
73925cf1a30Sjl	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
74025cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
74125cf1a30Sjl	tst	%o3
74225cf1a30Sjl	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
74325cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
74425cf1a30Sjl	bleu,pt	%ncc, .kcopy_small		! go to small copy
74525cf1a30Sjl	  nop
74625cf1a30Sjl	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
74725cf1a30Sjl	  nop
75025cf1a30Sjl	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
75125cf1a30Sjl	or	%o5, %lo(.sm_copyerr), %o5
75225cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
75325cf1a30Sjl	membar	#Sync				! sync error barrier
75425cf1a30Sjl	ba,pt	%ncc, .sm_do_copy		! common code
75525cf1a30Sjl	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
75825cf1a30Sjl	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
75925cf1a30Sjl	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
76025cf1a30Sjl	or	%l7, %lo(.copyerr), %l7
76125cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
76225cf1a30Sjl	membar	#Sync				! sync error barrier
76325cf1a30Sjl	ba,pt	%ncc, .do_copy			! common code
76425cf1a30Sjl	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
76825cf1a30Sjl * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
76925cf1a30Sjl * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
77025cf1a30Sjl */
77225cf1a30Sjl	set	.copyerr2, %l0
77325cf1a30Sjl	membar	#Sync				! sync error barrier
77425cf1a30Sjl	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
77525cf1a30Sjl	btst	FPUSED_FLAG, %l6
77625cf1a30Sjl	bz	%ncc, 1f
77725cf1a30Sjl	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
77925cf1a30Sjl	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
78025cf1a30Sjl	wr	%o2, 0, %gsr
78225cf1a30Sjl	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
78325cf1a30Sjl	btst	FPRS_FEF, %o3
78425cf1a30Sjl	bz,pt	%icc, 4f
78525cf1a30Sjl	  nop
78725cf1a30Sjl	BLD_FPQ1Q3_FROMSTACK(%o2)
78925cf1a30Sjl	ba,pt	%ncc, 1f
79025cf1a30Sjl	  wr	%o3, 0, %fprs		! restore fprs
79325cf1a30Sjl	FZEROQ1Q3
79425cf1a30Sjl	wr	%o3, 0, %fprs		! restore fprs
79625cf1a30Sjl	!
79725cf1a30Sjl	! Need to cater for the different expectations of kcopy
79825cf1a30Sjl	! and bcopy. kcopy will *always* set a t_lofault handler
79925cf1a30Sjl	! If it fires, we're expected to just return the error code
80025cf1a30Sjl	! and *not* to invoke any existing error handler. As far as
80125cf1a30Sjl	! bcopy is concerned, we only set t_lofault if there was an
80225cf1a30Sjl	! existing lofault handler. In that case we're expected to
80325cf1a30Sjl	! invoke the previously existing handler after resetting the
80425cf1a30Sjl	! t_lofault value.
80525cf1a30Sjl	!
80725cf1a30Sjl	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
80825cf1a30Sjl	membar	#Sync				! sync error barrier
80925cf1a30Sjl	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
81025cf1a30Sjl	FP_ALLOWMIGRATE(5, 6)
81225cf1a30Sjl	btst	TRAMP_FLAG, %l0
81325cf1a30Sjl	bnz,pn	%ncc, 3f
81425cf1a30Sjl	  nop
81525cf1a30Sjl	ret
81625cf1a30Sjl	  restore	%g1, 0, %o0
81925cf1a30Sjl	!
82025cf1a30Sjl	! We're here via bcopy. There *must* have been an error handler
82125cf1a30Sjl	! in place otherwise we would have died a nasty death already.
82225cf1a30Sjl	!
82325cf1a30Sjl	jmp	%l6				! goto real handler
82425cf1a30Sjl	  restore	%g0, 0, %o0		! dispose of copy window
82725cf1a30Sjl * We got here because of a fault in .copyerr.  We can't safely restore fp
82825cf1a30Sjl * state, so we panic.
82925cf1a30Sjl */
83125cf1a30Sjl	.asciz	"Unable to restore fp state after copy operation"
83325cf1a30Sjl	.align	4
83525cf1a30Sjl	set	fp_panic_msg, %o0
83625cf1a30Sjl	call	panic
83725cf1a30Sjl	  nop
84025cf1a30Sjl * We got here because of a fault during a small kcopy or bcopy.
84125cf1a30Sjl * No floating point registers are used by the small copies.
84225cf1a30Sjl * Errno value is in %g1.
84325cf1a30Sjl */
84625cf1a30Sjl	btst	TRAMP_FLAG, %o4
84725cf1a30Sjl	membar	#Sync
84825cf1a30Sjl	andn	%o4, TRAMP_FLAG, %o4
84925cf1a30Sjl	bnz,pn	%ncc, 3f
85025cf1a30Sjl	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
85125cf1a30Sjl	retl
85225cf1a30Sjl	  mov	%g1, %o0
85425cf1a30Sjl	jmp	%o4				! goto real handler
85525cf1a30Sjl	  mov	%g0, %o0			!
85725cf1a30Sjl	SET_SIZE(kcopy)
86125cf1a30Sjl * Copy a block of storage - must not overlap (from + len <= to).
86225cf1a30Sjl * Registers: l6 - saved t_lofault
86325cf1a30Sjl * (for short copies, o4 - saved t_lofault)
86425cf1a30Sjl *
86525cf1a30Sjl * Copy a page of memory.
86625cf1a30Sjl * Assumes double word alignment and a count >= 256.
86725cf1a30Sjl */
86925cf1a30Sjl	ENTRY(bcopy)
87125cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
87225cf1a30Sjl	bleu,pt	%ncc, .bcopy_small		! go to larger cases
87325cf1a30Sjl	  xor	%o0, %o1, %o3			! are src, dst alignable?
87425cf1a30Sjl	btst	7, %o3				!
87525cf1a30Sjl	bz,pt	%ncc, .bcopy_8			! check for longword alignment
87625cf1a30Sjl	  nop
87725cf1a30Sjl	btst	1, %o3				!
87825cf1a30Sjl	bz,pt	%ncc, .bcopy_2			! check for half-word
87925cf1a30Sjl	  nop
88025cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
88125cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
88225cf1a30Sjl	tst	%o3
88325cf1a30Sjl	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
88425cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
88525cf1a30Sjl	bleu,pt	%ncc, .bcopy_small		! go to small copy
88625cf1a30Sjl	  nop
88725cf1a30Sjl	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
88825cf1a30Sjl	  nop
89025cf1a30Sjl	btst	3, %o3				!
89125cf1a30Sjl	bz,pt	%ncc, .bcopy_4			! check for word alignment
89225cf1a30Sjl	  nop
89325cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
89425cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
89525cf1a30Sjl	tst	%o3
89625cf1a30Sjl	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
89725cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
89825cf1a30Sjl	bleu,pt	%ncc, .bcopy_small		! go to small copy
89925cf1a30Sjl	  nop
90025cf1a30Sjl	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
90125cf1a30Sjl	  nop
90325cf1a30Sjl	! already checked longword, must be word aligned
90425cf1a30Sjl	sethi	%hi(