/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>

#include "assym.h"

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 * 	! Determine whether to use the FP register version
 * 	! or the leaf routine version depending on size
 * 	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 * 	! dst addresses can be aligned to long word, word,
 * 	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
 * 		go to small_copy;		! to speed short copies
 *
 * 	if (src,dst long word alignable) {
 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst not alignable) {
 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst halfword alignable) {
 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst word alignable) {
 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler; 		! differs for each entry point
 *
 *	if (count <= 3)				! fast path for tiny copies
 *		go to sm_left;			! special finish up code
 *	else
 *		if (count > CHKSIZE)		! medium sized copies
 *			go to sm_med;		! tuned by alignment
 *		if (src&dst not both word aligned) {
 *	sm_movebytes:
 *			move byte by byte in 4-way unrolled loop
 *			fall into sm_left;
 *	sm_left:
 *			move 0-3 bytes byte at a time as needed.
 *			restore error handler and exit.
 *
 * 		} else {	! src&dst are word aligned
 *			check for at least 8 bytes left,
 *			move word at a time, unrolled by 2
 *			when fewer than 8 bytes left,
 *	sm_half:	move half word at a time while 2 or more bytes left
 *	sm_byte:	move final byte if necessary
 *	sm_exit:
 *			restore error handler and exit.
 *		}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if (src&dst unalignable)
 * 		go to sm_movebytes
 *	if (src&dst halfword alignable)
 *		go to sm_movehalf
 *	if (src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 * 	%l6 = curthread->t_lofault;
 * 	if (%l6 != NULL) {
 * 		membar #Sync
 * 		curthread->t_lofault = .copyerr;
 * 		caller_error_handler = TRUE             ! %l6 |= 2
 * 	}
 *
 *	! for FPU testing we must not migrate cpus
 * 	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 * 		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 * 	old_fprs = %fprs;
 * 	old_gsr = %gsr;
 * 	if (%fprs.fef) {
 * 		%fprs.fef = 1;
 * 		save current fpregs on stack using blockstore
 * 	} else {
 * 		%fprs.fef = 1;
 * 	}
 *
 *
 * 	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 * 	%gsr = old_gsr;
 * 	if (old_fprs & FPRS_FEF)
 * 		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 * 	%fprs = old_fprs;
 * 	membar #Sync
 * 	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 * 	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 * 	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent returning with corrupted fp state, we will panic.
 */
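
/*
 * The dispatch above can be summarized by the following C sketch.
 * This is illustrative only: use_fpblk is a hypothetical name, while
 * hw_copy_limit_* and VIS_COPY_THRESHOLD are the real tunables and
 * macro used by the assembly below.
 *
 *	#include <sys/types.h>
 *
 *	extern uint_t hw_copy_limit_1, hw_copy_limit_2;
 *	extern uint_t hw_copy_limit_4, hw_copy_limit_8;
 *
 *	static int
 *	use_fpblk(uintptr_t src, uintptr_t dst, size_t len)
 *	{
 *		uintptr_t rel = src ^ dst;	// relative alignment
 *		uint_t limit;
 *
 *		if (len <= VIS_COPY_THRESHOLD)	// quick test to speed
 *			return (0);		// up short copies
 *		if ((rel & 7) == 0)		// long word alignable
 *			limit = hw_copy_limit_8;
 *		else if (rel & 1)		// not alignable at all
 *			limit = hw_copy_limit_1;
 *		else if (rel & 2)		// halfword alignable only
 *			limit = hw_copy_limit_2;
 *		else				// word alignable only
 *			limit = hw_copy_limit_4;
 *		if (limit == 0)			// hw_copy disabled
 *			return (0);
 *		return (len > limit);		// nonzero: FPBLK_copy
 *	}
 */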

/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 * byte aligned:  108 clocks slower for non-FPBLK
 * half aligned:   44 clocks slower for non-FPBLK
 * word aligned:   12 clocks slower for non-FPBLK
 * long aligned:    4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 * hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Default values as of May 2005 are:
 * hw_copy_limit_1 =  256
 * hw_copy_limit_2 =  512
 * hw_copy_limit_4 = 1024
 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 * 4 of 8 will not be alignable.
 * 2 of 8 will be half word alignable.
 * 1 of 8 will be word alignable.
 * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases. The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */
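
/*
 * The double-prefetch pattern can be rendered in C roughly as below.
 * This is a sketch only (hypothetical helper, GCC __builtin_prefetch
 * intrinsic); the real inner loop issues SPARC prefetch instructions
 * with reaches given by the OLYMPUS_C_PREFETCH defines further down.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *
 *	void
 *	block_copy(const char *src, char *dst, size_t nlines)
 *	{
 *		for (size_t i = 0; i < nlines; i++) {
 *			// primary prefetch at the maximum reach
 *			__builtin_prefetch(src + (i + 8) * 64);
 *			// backup prefetch closer in; catches drops
 *			__builtin_prefetch(src + (i + 5) * 64);
 *			// move one 64-byte cache line
 *			memcpy(dst + i * 64, src + i * 64, 64);
 *		}
 *	}
 */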

/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the handling
 *	  uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or an error that really only
 * affects copy data being interpreted as lost kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */
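
/*
 * Rule 1 packs flags into the low bits of the saved handler address,
 * which works because handler addresses are at least 4-byte aligned.
 * A C sketch of that packing (hypothetical helper names; FPUSED_FLAG,
 * TRAMP_FLAG and MASK_FLAGS are the real defines below):
 *
 *	#include <stdint.h>
 *
 *	static uintptr_t
 *	pack_lofault(uintptr_t handler, int fpused, int tramp)
 *	{
 *		return (handler | (fpused ? FPUSED_FLAG : 0) |
 *		    (tramp ? TRAMP_FLAG : 0));
 *	}
 *
 *	static uintptr_t
 *	unpack_lofault(uintptr_t l6)
 *	{
 *		return (l6 & ~(uintptr_t)MASK_FLAGS);	// "%l6 & ~3"
 *	}
 */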

/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references. Change it here, change it there.
 */
#define VIS_COPY_THRESHOLD 256
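
/*
 * A C sketch of the effective hw_copy_limit_? semantics described in
 * the optimization comments above (hypothetical helper; the clamping
 * to VIS_COPY_THRESHOLD happens implicitly in the assembly because
 * copies of VIS_COPY_THRESHOLD bytes or less branch to the leaf code
 * before any limit is consulted):
 *
 *	#include <stddef.h>
 *
 *	static size_t
 *	effective_limit(size_t hw_copy_limit)
 *	{
 *		if (hw_copy_limit == 0)		// FPBLK disabled:
 *			return ((size_t)-1);	// leaf code for all sizes
 *		if (hw_copy_limit < VIS_COPY_THRESHOLD)
 *			return (VIS_COPY_THRESHOLD);
 *		return (hw_copy_limit);		// leaf code up to here
 *	}
 */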

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define SHORTCOPY 3
#define CHKSIZE  39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3

/*
 * Number of outstanding prefetches.
 * first prefetch moves data from L2 to L1 (n_reads)
 * second prefetch moves data from memory to L2 (one_read)
 */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
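
/*
 * A C sketch of the block-alignment arithmetic used by the
 * BST_*_TOSTACK/BLD_*_FROMSTACK macros below (illustrative only;
 * fp_biased stands for %fp + STACK_BIAS):
 *
 *	#include <stdint.h>
 *
 *	// Find the 64-byte aligned start of the two-block save area.
 *	// Backing off SAVED_FPREGS_ADJUST (two blocks minus one byte)
 *	// and rounding down always leaves two full aligned blocks
 *	// inside the three blocks reserved in the frame.
 *	static uintptr_t
 *	fpregs_save_area(uintptr_t fp_biased)
 *	{
 *		uintptr_t p = fp_biased - SAVED_FPREGS_ADJUST;
 *		return (p & ~(uintptr_t)(VIS_BLOCKSIZE - 1));
 *	}
 */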

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
#define BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass registers
 * to use - we just use any outputs we want.
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_nomigrate();
 * } else {
 *	kpreempt_disable();
 * }
 *
 * FP_ALLOWMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_allowmigrate();
 * } else {
 *	kpreempt_enable();
 * }
 */
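
/*
 * Note that on the no-t_lwp path the macros below open-code
 * kpreempt_disable()/kpreempt_enable().  FP_ALLOWMIGRATE's version,
 * as a C sketch (t_preempt, cpu_kprunrun and kpreempt() are the real
 * names it references; the macro passes the current %pil to kpreempt):
 *
 *	if (curthread->t_lwp != NULL) {
 *		thread_allowmigrate();
 *	} else {
 *		curthread->t_preempt--;
 *		if (curthread->t_preempt == 0 && CPU->cpu_kprunrun)
 *			kpreempt(pil);	// deferred preemption point
 *	}
 */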

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	  nop							;\
	call	kpreempt					;\
	  rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */
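
/*
 * A minimal (hypothetical) C caller, relying on the contract stated
 * above; kcopy is declared in sys/systm.h:
 *
 *	#include <sys/systm.h>
 *
 *	int
 *	fetch_record(const void *src, void *dst, size_t len)
 *	{
 *		int err = kcopy(src, dst, len);	// 0 or errno (e.g. EFAULT)
 *
 *		return (err);
 *	}
 */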

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy cases
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	  nop
	ret
	  restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy.  There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	  mov	%g0, %o0			!

	SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 *
 * Copy a page of memory.
 * Assumes double word alignment and a count >= 256.
 */
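
/*
 * Unlike kcopy, bcopy returns nothing on a fault unless a lofault
 * handler is already installed - callers must guarantee both ranges
 * are valid kernel memory.  A minimal (hypothetical) C usage sketch;
 * bcopy is declared in sys/systm.h:
 *
 *	#include <sys/systm.h>
 *
 *	void
 *	clone_buf(const char *src, char *dst, size_t len)
 *	{
 *		bcopy(src, dst, len);	// ranges must not overlap
 *	}
 */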

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy cases
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	  nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	  cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	  stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	  deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	ba,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 2]
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	  stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	  sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  nop
.bc_sm_byte:
	ldub	[%o0], %o3
	ba,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	ldn	[THREAD_REG + T_LOFAULT], %o3
	brz,pt	%o3, .bc_sm_done
	  nop
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	  mov	%g0, %o0		! return 0

	.align 16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	  nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	  nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	  dec	%o2
.bc_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	  nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test