/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>

#if !defined(lint)
#include "assym.h"
#endif	/* lint */

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 * 	! Determine whether to use the FP register version
 * 	! or the leaf routine version depending on size
 * 	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 * 	! dst addresses can be aligned to long word, word,
 * 	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
 * 		go to small_copy;		! to speed short copies
 *
 * 	if (src,dst long word alignable) {
 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst not alignable) {
 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst halfword alignable) {
 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst word alignable) {
 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
 *
 *	if (count <= 3)				! fast path for tiny copies
 *		go to sm_left;			! special finish up code
 *	else
 *		if (count > CHKSIZE)		! medium sized copies
 *			go to sm_med		! tuned by alignment
 *		if(src&dst not both word aligned) {
 *	sm_movebytes:
 *			move byte by byte in 4-way unrolled loop
 *			fall into sm_left;
 *	sm_left:
 *			move 0-3 bytes byte at a time as needed.
 *			restore error handler and exit.
 *
 * 		} else {	! src&dst are word aligned
 *			check for at least 8 bytes left,
 *			move word at a time, unrolled by 2
 *			when fewer than 8 bytes left,
 *	sm_half:	move half word at a time while 2 or more bytes left
 *	sm_byte:	move final byte if necessary
 *	sm_exit:
 *			restore error handler and exit.
 *		}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if(src&dst unalignable)
 * 		go to sm_movebytes
 *	if(src&dst halfword alignable)
 *		go to sm_movehalf
 *	if(src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 * 	%l6 = curthread->t_lofault;
 * 	if (%l6 != NULL) {
 * 		membar #Sync
 * 		curthread->t_lofault = .copyerr;
 * 		caller_error_handler = TRUE             ! %l6 |= 2
 * 	}
 *
 *	! for FPU testing we must not migrate cpus
 * 	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 * 		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 * 	old_fprs = %fprs;
 * 	old_gsr = %gsr;
 * 	if (%fprs.fef) {
 * 		%fprs.fef = 1;
 * 		save current fpregs on stack using blockstore
 * 	} else {
 * 		%fprs.fef = 1;
 * 	}
 *
 *
 * 	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 * 	%gsr = old_gsr;
 * 	if (old_fprs & FPRS_FEF)
 * 		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 * 	%fprs = old_fprs;
 * 	membar #Sync
 * 	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 * 	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 * 	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	avoid returning with corrupted fp state, we panic.
 */

/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 * byte aligned:  108 clocks slower for non-FPBLK
 * half aligned:   44 clocks slower for non-FPBLK
 * word aligned:   12 clocks slower for non-FPBLK
 * long aligned:    4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies.  Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 * hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Default values as of May 2005 are:
 * hw_copy_limit_1 =  256
 * hw_copy_limit_2 =  512
 * hw_copy_limit_4 = 1024
 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256),
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
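 *
 * As one hedged illustration (using the standard Solaris /etc/system
 * tunable mechanism, not anything specific to this file), FPBLK copies
 * for the longword aligned case could be disabled on a test system by
 * adding
 *
 *	set hw_copy_limit_8 = 0
 *
 * to /etc/system and rebooting, or an alternate transition point such
 * as 1536 bytes could be tried with
 *
 *	set hw_copy_limit_8 = 0x600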
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 * 4 of 8 will not be alignable.
 * 2 of 8 will be half word alignable.
 * 1 of 8 will be word alignable.
 * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases.  The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39,
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
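 *
 * Spelled out, the worst case arithmetic behind CHKSIZE is:
 *	7 bytes (maximum alignment shift)
 *	+ 4 * 8 bytes (one pass of the 4-way unrolled long word loop)
 *	= 39 bytes = CHKSIZE
 * so any copy routed to the medium path can complete its alignment
 * moves and at least one full unrolled pass with no extra length test.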
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */
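
/*
 * A minimal sketch of the dual-prefetch pattern described above.  The
 * offsets are illustrative only, using the 8 and 5 cache line reaches
 * from the discussion; the actual loops in this file use the
 * OLYMPUS_C_* prefetch constants and interleave the prefetches among
 * non-memory instructions:
 *
 *	prefetch [%o0 + (8 * VIS_BLOCKSIZE)], #n_reads	! primary prefetch
 *	prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read	! backup prefetch
 */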

/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the handling
 *	  uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or in an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */

/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references. Change it here, change it there.
 */
#define VIS_COPY_THRESHOLD 256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define SHORTCOPY 3
#define CHKSIZE  39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3
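
/*
 * A brief illustration (mirroring what the leaf entry and exit code
 * below actually does) of how the saved t_lofault and these flags
 * share one register:
 *
 *	or	%o4, TRAMP_FLAG, %o4		! entry: errors should
 *						! trampoline
 *	...
 *	andn	%o4, TRAMP_FLAG, %o4		! exit: recover the saved
 *	stn	%o4, [THREAD_REG + T_LOFAULT]	! handler and restore it
 */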

/*
 * Number of outstanding prefetches.
 * first prefetch moves data from L2 to L1 (n_reads)
 * second prefetch moves data from memory to L2 (one_read)
 */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
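
/*
 * Worked numbers for the defines above (VIS_BLOCKSIZE == 64):
 *	HWCOPYFRAMESIZE     = 64 * 3 + 2 * 8 = 208
 *	SAVED_FPREGS_OFFSET = 192	SAVED_FPREGS_ADJUST = 127
 *	SAVED_FPRS_OFFSET   = 200	SAVED_GSR_OFFSET    = 208
 * Rounding (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) down to a multiple
 * of VIS_BLOCKSIZE, as the BST/BLD macros below do, always yields a
 * block aligned address with 128 bytes of room inside the 192 byte
 * save area, since %fp + STACK_BIAS is itself at least 8 byte aligned.
 */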

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
#if !defined(lint)
#define BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
#endif

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass registers
 * to use - we just use any outputs we want.
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_nomigrate();
 * } else {
 *	kpreempt_disable();
 * }
 *
 * FP_ALLOWMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_allowmigrate();
 * } else {
 *	kpreempt_enable();
 * }
 */

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	  nop							;\
	call	kpreempt					;\
	  rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok.
 */
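
/*
 * A hedged caller-side sketch (C): kcopy is the error-returning
 * variant of bcopy, so callers are expected to check the result
 * rather than assume success:
 *
 *	int err;
 *
 *	if ((err = kcopy(from, to, len)) != 0)
 *		return (err);		! errno value from the fault
 */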

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy code
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	  nop
	ret
	  restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	  mov	%g0, %o0			!

	SET_SIZE(kcopy)
#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 *
 * Copy a page of memory.
 * Assumes double word alignment and a count >= 256.
 */
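
/*
 * A hedged illustration of the non-overlap rule stated above
 * (from + len <= to), with buf and len hypothetical:
 *
 *	bcopy(buf, buf + len, len);	! ok: regions are disjoint
 *	bcopy(buf + 4, buf, len);	! overlapping: not supported here;
 *					! the kernel's ovbcopy() handles
 *					! overlapped moves
 */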
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy code
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	  nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	  cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	  stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	  deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	ba,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 2]
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	  stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	  sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  nop
.bc_sm_byte:
	ldub	[%o0], %o3
	ba,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	ldn	[THREAD_REG + T_LOFAULT], %o3
	brz,pt	%o3, .bc_sm_done
	  nop
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	  mov	%g0, %o0		! return 0

	.align 16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	  nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	  nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte