125cf1a30Sjl/*
225cf1a30Sjl * CDDL HEADER START
325cf1a30Sjl *
425cf1a30Sjl * The contents of this file are subject to the terms of the
525cf1a30Sjl * Common Development and Distribution License (the "License").
625cf1a30Sjl * You may not use this file except in compliance with the License.
725cf1a30Sjl *
825cf1a30Sjl * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
925cf1a30Sjl * or http://www.opensolaris.org/os/licensing.
1025cf1a30Sjl * See the License for the specific language governing permissions
1125cf1a30Sjl * and limitations under the License.
1225cf1a30Sjl *
1325cf1a30Sjl * When distributing Covered Code, include this CDDL HEADER in each
1425cf1a30Sjl * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1525cf1a30Sjl * If applicable, add the following below this CDDL HEADER, with the
1625cf1a30Sjl * fields enclosed by brackets "[]" replaced with your own identifying
1725cf1a30Sjl * information: Portions Copyright [yyyy] [name of copyright owner]
1825cf1a30Sjl *
1925cf1a30Sjl * CDDL HEADER END
2025cf1a30Sjl */
2125cf1a30Sjl/*
22e64c6c3fSMichael Bergknoff * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
2325cf1a30Sjl * Use is subject to license terms.
2425cf1a30Sjl */
2525cf1a30Sjl
2625cf1a30Sjl#include <sys/param.h>
2725cf1a30Sjl#include <sys/errno.h>
2825cf1a30Sjl#include <sys/asm_linkage.h>
2925cf1a30Sjl#include <sys/vtrace.h>
3025cf1a30Sjl#include <sys/machthread.h>
3125cf1a30Sjl#include <sys/clock.h>
3225cf1a30Sjl#include <sys/asi.h>
3325cf1a30Sjl#include <sys/fsr.h>
3425cf1a30Sjl#include <sys/privregs.h>
3525cf1a30Sjl
3625cf1a30Sjl#include "assym.h"
3725cf1a30Sjl
3825cf1a30Sjl/*
3925cf1a30Sjl * Pseudo-code to aid in understanding the control flow of the
4025cf1a30Sjl * bcopy/copyin/copyout routines.
4125cf1a30Sjl *
4225cf1a30Sjl * On entry:
4325cf1a30Sjl *
4425cf1a30Sjl * 	! Determine whether to use the FP register version
4525cf1a30Sjl * 	! or the leaf routine version depending on size
4625cf1a30Sjl * 	! of copy and flags.  Set up error handling accordingly.
4725cf1a30Sjl *	! The transition point depends on whether the src and
4825cf1a30Sjl * 	! dst addresses can be aligned to long word, word,
4925cf1a30Sjl * 	! half word, or byte boundaries.
5025cf1a30Sjl *	!
5125cf1a30Sjl *	! WARNING: <Register usage convention>
5225cf1a30Sjl *	! For FP version, %l6 holds previous error handling and
5325cf1a30Sjl *	! a flag: TRAMP_FLAG (low bits)
5425cf1a30Sjl *	! for leaf routine version, %o4 holds those values.
5525cf1a30Sjl *	! So either %l6 or %o4 is reserved and not available for
5625cf1a30Sjl *	! any other use.
5725cf1a30Sjl *
5825cf1a30Sjl * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
5925cf1a30Sjl * 		go to small_copy;		! to speed short copies
6025cf1a30Sjl *
 * 	if (src, dst long word alignable) {
6225cf1a30Sjl * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
6325cf1a30Sjl * 			go to small_copy;
6425cf1a30Sjl *		if (length <= hw_copy_limit_8)
6525cf1a30Sjl * 			go to small_copy;
6625cf1a30Sjl * 		go to FPBLK_copy;
6725cf1a30Sjl * 	}
6825cf1a30Sjl * 	if (src,dst not alignable) {
6925cf1a30Sjl * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
7025cf1a30Sjl * 			go to small_copy;
7125cf1a30Sjl *		if (length <= hw_copy_limit_1)
7225cf1a30Sjl * 			go to small_copy;
7325cf1a30Sjl * 		go to FPBLK_copy;
7425cf1a30Sjl * 	}
7525cf1a30Sjl * 	if (src,dst halfword alignable) {
7625cf1a30Sjl * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
7725cf1a30Sjl * 			go to small_copy;
7825cf1a30Sjl *		if (length <= hw_copy_limit_2)
7925cf1a30Sjl * 			go to small_copy;
8025cf1a30Sjl * 		go to FPBLK_copy;
8125cf1a30Sjl * 	}
8225cf1a30Sjl * 	if (src,dst word alignable) {
8325cf1a30Sjl * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
8425cf1a30Sjl * 			go to small_copy;
8525cf1a30Sjl *		if (length <= hw_copy_limit_4)
8625cf1a30Sjl * 			go to small_copy;
8725cf1a30Sjl * 		go to FPBLK_copy;
8825cf1a30Sjl * 	}
8925cf1a30Sjl *
9025cf1a30Sjl * small_copy:
9125cf1a30Sjl *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
9225cf1a30Sjl *
9325cf1a30Sjl *	if (count <= 3)				! fast path for tiny copies
9425cf1a30Sjl *		go to sm_left;			! special finish up code
9525cf1a30Sjl *	else
9625cf1a30Sjl *		if (count > CHKSIZE)		! medium sized copies
9725cf1a30Sjl *			go to sm_med		! tuned by alignment
9825cf1a30Sjl *		if(src&dst not both word aligned) {
9925cf1a30Sjl *	sm_movebytes:
10025cf1a30Sjl *			move byte by byte in 4-way unrolled loop
10125cf1a30Sjl *			fall into sm_left;
10225cf1a30Sjl *	sm_left:
10325cf1a30Sjl *			move 0-3 bytes byte at a time as needed.
10425cf1a30Sjl *			restore error handler and exit.
10525cf1a30Sjl *
10625cf1a30Sjl * 		} else {	! src&dst are word aligned
10725cf1a30Sjl *			check for at least 8 bytes left,
10825cf1a30Sjl *			move word at a time, unrolled by 2
10925cf1a30Sjl *			when fewer than 8 bytes left,
11025cf1a30Sjl *	sm_half:	move half word at a time while 2 or more bytes left
11125cf1a30Sjl *	sm_byte:	move final byte if necessary
11225cf1a30Sjl *	sm_exit:
11325cf1a30Sjl *			restore error handler and exit.
11425cf1a30Sjl *		}
11525cf1a30Sjl *
11625cf1a30Sjl * ! Medium length cases with at least CHKSIZE bytes available
11725cf1a30Sjl * ! method: line up src and dst as best possible, then
11825cf1a30Sjl * ! move data in 4-way unrolled loops.
11925cf1a30Sjl *
12025cf1a30Sjl * sm_med:
12125cf1a30Sjl *	if(src&dst unalignable)
12225cf1a30Sjl * 		go to sm_movebytes
12325cf1a30Sjl *	if(src&dst halfword alignable)
12425cf1a30Sjl *		go to sm_movehalf
12525cf1a30Sjl *	if(src&dst word alignable)
12625cf1a30Sjl *		go to sm_moveword
12725cf1a30Sjl * ! fall into long word movement
12825cf1a30Sjl *	move bytes until src is word aligned
12925cf1a30Sjl *	if not long word aligned, move a word
13025cf1a30Sjl *	move long words in 4-way unrolled loop until < 32 bytes left
13125cf1a30Sjl *      move long words in 1-way unrolled loop until < 8 bytes left
13225cf1a30Sjl *	if zero bytes left, goto sm_exit
13325cf1a30Sjl *	if one byte left, go to sm_byte
13425cf1a30Sjl *	else go to sm_half
13525cf1a30Sjl *
13625cf1a30Sjl * sm_moveword:
13725cf1a30Sjl *	move bytes until src is word aligned
13825cf1a30Sjl *	move words in 4-way unrolled loop until < 16 bytes left
13925cf1a30Sjl *      move words in 1-way unrolled loop until < 4 bytes left
14025cf1a30Sjl *	if zero bytes left, goto sm_exit
14125cf1a30Sjl *	if one byte left, go to sm_byte
14225cf1a30Sjl *	else go to sm_half
14325cf1a30Sjl *
14425cf1a30Sjl * sm_movehalf:
14525cf1a30Sjl *	move a byte if needed to align src on halfword
14625cf1a30Sjl *	move halfwords in 4-way unrolled loop until < 8 bytes left
14725cf1a30Sjl *	if zero bytes left, goto sm_exit
14825cf1a30Sjl *	if one byte left, go to sm_byte
14925cf1a30Sjl *	else go to sm_half
15025cf1a30Sjl *
15125cf1a30Sjl *
15225cf1a30Sjl * FPBLK_copy:
15325cf1a30Sjl * 	%l6 = curthread->t_lofault;
15425cf1a30Sjl * 	if (%l6 != NULL) {
15525cf1a30Sjl * 		membar #Sync
15625cf1a30Sjl * 		curthread->t_lofault = .copyerr;
15725cf1a30Sjl * 		caller_error_handler = TRUE             ! %l6 |= 2
15825cf1a30Sjl * 	}
15925cf1a30Sjl *
16025cf1a30Sjl *	! for FPU testing we must not migrate cpus
16125cf1a30Sjl * 	if (curthread->t_lwp == NULL) {
16225cf1a30Sjl *		! Kernel threads do not have pcb's in which to store
16325cf1a30Sjl *		! the floating point state, so disallow preemption during
16425cf1a30Sjl *		! the copy.  This also prevents cpu migration.
16525cf1a30Sjl * 		kpreempt_disable(curthread);
16625cf1a30Sjl *	} else {
16725cf1a30Sjl *		thread_nomigrate();
16825cf1a30Sjl *	}
16925cf1a30Sjl *
17025cf1a30Sjl * 	old_fprs = %fprs;
17125cf1a30Sjl * 	old_gsr = %gsr;
17225cf1a30Sjl * 	if (%fprs.fef) {
17325cf1a30Sjl * 		%fprs.fef = 1;
17425cf1a30Sjl * 		save current fpregs on stack using blockstore
17525cf1a30Sjl * 	} else {
17625cf1a30Sjl * 		%fprs.fef = 1;
17725cf1a30Sjl * 	}
17825cf1a30Sjl *
17925cf1a30Sjl *
18025cf1a30Sjl * 	do_blockcopy_here;
18125cf1a30Sjl *
18225cf1a30Sjl * In lofault handler:
18325cf1a30Sjl *	curthread->t_lofault = .copyerr2;
18425cf1a30Sjl *	Continue on with the normal exit handler
18525cf1a30Sjl *
18625cf1a30Sjl * On normal exit:
18725cf1a30Sjl * 	%gsr = old_gsr;
18825cf1a30Sjl * 	if (old_fprs & FPRS_FEF)
18925cf1a30Sjl * 		restore fpregs from stack using blockload
19025cf1a30Sjl *	else
19125cf1a30Sjl *		zero fpregs
19225cf1a30Sjl * 	%fprs = old_fprs;
19325cf1a30Sjl * 	membar #Sync
19425cf1a30Sjl * 	curthread->t_lofault = (%l6 & ~3);
19525cf1a30Sjl *	! following test omitted from copyin/copyout as they
19625cf1a30Sjl *	! will always have a current thread
19725cf1a30Sjl * 	if (curthread->t_lwp == NULL)
19825cf1a30Sjl *		kpreempt_enable(curthread);
19925cf1a30Sjl *	else
20025cf1a30Sjl *		thread_allowmigrate();
20125cf1a30Sjl * 	return (0)
20225cf1a30Sjl *
20325cf1a30Sjl * In second lofault handler (.copyerr2):
20425cf1a30Sjl *	We've tried to restore fp state from the stack and failed.  To
20525cf1a30Sjl *	prevent from returning with a corrupted fp state, we will panic.
20625cf1a30Sjl */
20725cf1a30Sjl
20825cf1a30Sjl/*
20925cf1a30Sjl * Comments about optimization choices
21025cf1a30Sjl *
21125cf1a30Sjl * The initial optimization decision in this code is to determine
21225cf1a30Sjl * whether to use the FP registers for a copy or not.  If we don't
21325cf1a30Sjl * use the FP registers, we can execute the copy as a leaf routine,
21425cf1a30Sjl * saving a register save and restore.  Also, less elaborate setup
21525cf1a30Sjl * is required, allowing short copies to be completed more quickly.
21625cf1a30Sjl * For longer copies, especially unaligned ones (where the src and
21725cf1a30Sjl * dst do not align to allow simple ldx,stx operation), the FP
21825cf1a30Sjl * registers allow much faster copy operations.
21925cf1a30Sjl *
22025cf1a30Sjl * The estimated extra cost of the FP path will vary depending on
22125cf1a30Sjl * src/dst alignment, dst offset from the next 64 byte FPblock store
22225cf1a30Sjl * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
22425cf1a30Sjl * minor issues.  The average additional overhead is estimated to be
22525cf1a30Sjl * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
22725cf1a30Sjl * longer copies and only benefit a small portion of medium sized
22825cf1a30Sjl * copies.  Rather than incur such cost, we chose fixed transition
22925cf1a30Sjl * points for each of the alignment choices.
23025cf1a30Sjl *
23125cf1a30Sjl * For the inner loop, here is a comparison of the per cache line
23225cf1a30Sjl * costs for each alignment when src&dst are in cache:
23325cf1a30Sjl *
23425cf1a30Sjl * byte aligned:  108 clocks slower for non-FPBLK
23525cf1a30Sjl * half aligned:   44 clocks slower for non-FPBLK
23625cf1a30Sjl * word aligned:   12 clocks slower for non-FPBLK
23725cf1a30Sjl * long aligned:    4 clocks >>faster<< for non-FPBLK
23825cf1a30Sjl *
23925cf1a30Sjl * The long aligned loop runs faster because it does no prefetching.
24025cf1a30Sjl * That wins if the data is not in cache or there is too little
24125cf1a30Sjl * data to gain much benefit from prefetching.  But when there
24225cf1a30Sjl * is more data and that data is not in cache, failing to prefetch
24325cf1a30Sjl * can run much slower.  In addition, there is a 2 Kbyte store queue
24425cf1a30Sjl * which will cause the non-FPBLK inner loop to slow for larger copies.
24525cf1a30Sjl * The exact tradeoff is strongly load and application dependent, with
24625cf1a30Sjl * increasing risk of a customer visible performance regression if the
24725cf1a30Sjl * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
24825cf1a30Sjl * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
24925cf1a30Sjl * upper limit for the non-FPBLK code.  To minimize performance regression
25025cf1a30Sjl * risk while still gaining the primary benefits of the improvements to
25125cf1a30Sjl * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
25225cf1a30Sjl * hw_copy_limit_*.  Later experimental studies using different values
25325cf1a30Sjl * of hw_copy_limit_* can be used to make further adjustments if
25425cf1a30Sjl * appropriate.
25525cf1a30Sjl *
25625cf1a30Sjl * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
25725cf1a30Sjl * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
25825cf1a30Sjl * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
25925cf1a30Sjl * hw_copy_limit_8 = src and dst are longword aligned
26025cf1a30Sjl *
26125cf1a30Sjl * To say that src and dst are word aligned means that after
26225cf1a30Sjl * some initial alignment activity of moving 0 to 3 bytes,
26325cf1a30Sjl * both the src and dst will be on word boundaries so that
26425cf1a30Sjl * word loads and stores may be used.
26525cf1a30Sjl *
 * Default values as of May, 2005 are:
26725cf1a30Sjl * hw_copy_limit_1 =  256
26825cf1a30Sjl * hw_copy_limit_2 =  512
26925cf1a30Sjl * hw_copy_limit_4 = 1024
27025cf1a30Sjl * hw_copy_limit_8 = 1024 (or 1536 on some systems)
27125cf1a30Sjl *
27225cf1a30Sjl *
27325cf1a30Sjl * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
27425cf1a30Sjl * disabled for that alignment choice.
27525cf1a30Sjl * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
27625cf1a30Sjl * the value of VIS_COPY_THRESHOLD is used.
27725cf1a30Sjl * It is not envisioned that hw_copy_limit_? will be changed in the field
27825cf1a30Sjl * It is provided to allow for disabling FPBLK copies and to allow
27925cf1a30Sjl * easy testing of alternate values on future HW implementations
28025cf1a30Sjl * that might have different cache sizes, clock rates or instruction
28125cf1a30Sjl * timing rules.
28225cf1a30Sjl *
28325cf1a30Sjl * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
28425cf1a30Sjl * threshold to speedup all shorter copies (less than 256).  That
28525cf1a30Sjl * saves an alignment test, memory reference, and enabling test
28625cf1a30Sjl * for all short copies, or an estimated 24 clocks.
28725cf1a30Sjl *
28825cf1a30Sjl * The order in which these limits are checked does matter since each
28925cf1a30Sjl * non-predicted tst and branch costs around 10 clocks.
29025cf1a30Sjl * If src and dst are randomly selected addresses,
29125cf1a30Sjl * 4 of 8 will not be alignable.
29225cf1a30Sjl * 2 of 8 will be half word alignable.
29325cf1a30Sjl * 1 of 8 will be word alignable.
29425cf1a30Sjl * 1 of 8 will be long word alignable.
29525cf1a30Sjl * But, tests on running kernels show that src and dst to copy code
29625cf1a30Sjl * are typically not on random alignments.  Structure copies and
29725cf1a30Sjl * copies of larger data sizes are often on long word boundaries.
29825cf1a30Sjl * So we test the long word alignment case first, then
29925cf1a30Sjl * the byte alignment, then halfword, then word alignment.
30025cf1a30Sjl *
30125cf1a30Sjl * Several times, tests for length are made to split the code
30225cf1a30Sjl * into subcases.  These tests often allow later tests to be
30325cf1a30Sjl * avoided.  For example, within the non-FPBLK copy, we first
30425cf1a30Sjl * check for tiny copies of 3 bytes or less.  That allows us
30525cf1a30Sjl * to use a 4-way unrolled loop for the general byte copy case
30625cf1a30Sjl * without a test on loop entry.
30725cf1a30Sjl * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt to
30925cf1a30Sjl * align src and dst.  We try to minimize special case tests in
31025cf1a30Sjl * the shortest loops as each test adds a significant percentage
31125cf1a30Sjl * to the total time.
31225cf1a30Sjl *
31325cf1a30Sjl * For the medium sized cases, we allow ourselves to adjust the
31425cf1a30Sjl * src and dst alignment and provide special cases for each of
31525cf1a30Sjl * the four adjusted alignment cases. The CHKSIZE that was used
31625cf1a30Sjl * to decide between short and medium size was chosen to be 39
31725cf1a30Sjl * as that allows for the worst case of 7 bytes of alignment
31825cf1a30Sjl * shift and 4 times 8 bytes for the first long word unrolling.
31925cf1a30Sjl * That knowledge saves an initial test for length on entry into
32025cf1a30Sjl * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
32225cf1a30Sjl *
32325cf1a30Sjl * For all cases in the non-FPBLK code where it is known that at
32425cf1a30Sjl * least 4 chunks of data are available for movement, the
32525cf1a30Sjl * loop is unrolled by four.  This 4-way loop runs in 8 clocks
32625cf1a30Sjl * or 2 clocks per data element.
32725cf1a30Sjl *
 * Instruction alignment is forced by use of .align 16 directives
32925cf1a30Sjl * and nops which are not executed in the code.  This
33025cf1a30Sjl * combination of operations shifts the alignment of following
33125cf1a30Sjl * loops to insure that loops are aligned so that their instructions
33225cf1a30Sjl * fall within the minimum number of 4 instruction fetch groups.
33325cf1a30Sjl * If instructions are inserted or removed between the .align
33425cf1a30Sjl * instruction and the unrolled loops, then the alignment needs
33525cf1a30Sjl * to be readjusted.  Misaligned loops can add a clock per loop
33625cf1a30Sjl * iteration to the loop timing.
33725cf1a30Sjl *
33825cf1a30Sjl * In a few cases, code is duplicated to avoid a branch.  Since
33925cf1a30Sjl * a non-predicted tst and branch takes 10 clocks, this savings
34025cf1a30Sjl * is judged an appropriate time-space tradeoff.
34125cf1a30Sjl *
34225cf1a30Sjl * Within the FPBLK-code, the prefetch method in the inner
34325cf1a30Sjl * loop needs to be explained as it is not standard.  Two
34425cf1a30Sjl * prefetches are issued for each cache line instead of one.
34525cf1a30Sjl * The primary one is at the maximum reach of 8 cache lines.
34625cf1a30Sjl * Most of the time, that maximum prefetch reach gives the
34725cf1a30Sjl * cache line more time to reach the processor for systems with
34825cf1a30Sjl * higher processor clocks.  But, sometimes memory interference
34925cf1a30Sjl * can cause that prefetch to be dropped.  Putting a second
35025cf1a30Sjl * prefetch at a reach of 5 cache lines catches the drops
35125cf1a30Sjl * three iterations later and shows a measured improvement
35225cf1a30Sjl * in performance over any similar loop with a single prefetch.
35325cf1a30Sjl * The prefetches are placed in the loop so they overlap with
35425cf1a30Sjl * non-memory instructions, so that there is no extra cost
35525cf1a30Sjl * when the data is already in-cache.
35625cf1a30Sjl *
35725cf1a30Sjl */
35825cf1a30Sjl
35925cf1a30Sjl/*
36025cf1a30Sjl * Notes on preserving existing fp state and on membars.
36125cf1a30Sjl *
36225cf1a30Sjl * When a copyOP decides to use fp we may have to preserve existing
36325cf1a30Sjl * floating point state.  It is not the caller's state that we need to
36425cf1a30Sjl * preserve - the rest of the kernel does not use fp and, anyway, fp
36525cf1a30Sjl * registers are volatile across a call.  Some examples:
36625cf1a30Sjl *
36725cf1a30Sjl *	- userland has fp state and is interrupted (device interrupt
36825cf1a30Sjl *	  or trap) and within the interrupt/trap handling we use
36925cf1a30Sjl *	  bcopy()
37025cf1a30Sjl *	- another (higher level) interrupt or trap handler uses bcopy
37125cf1a30Sjl *	  while a bcopy from an earlier interrupt is still active
37225cf1a30Sjl *	- an asynchronous error trap occurs while fp state exists (in
37325cf1a30Sjl *	  userland or in kernel copy) and the tl0 component of the handling
37425cf1a30Sjl *	  uses bcopy
37525cf1a30Sjl *	- a user process with fp state incurs a copy-on-write fault and
37625cf1a30Sjl *	  hwblkpagecopy always uses fp
37725cf1a30Sjl *
37825cf1a30Sjl * We therefore need a per-call place in which to preserve fp state -
37925cf1a30Sjl * using our stack is ideal (and since fp copy cannot be leaf optimized
38025cf1a30Sjl * because of calls it makes, this is no hardship).
38125cf1a30Sjl *
 * When we have finished fp copy (with its repeated block stores)
38325cf1a30Sjl * we must membar #Sync so that our block stores may complete before
38425cf1a30Sjl * we either restore the original fp state into the fp registers or
38525cf1a30Sjl * return to a caller which may initiate other fp operations that could
38625cf1a30Sjl * modify the fp regs we used before the block stores complete.
38725cf1a30Sjl *
38825cf1a30Sjl * Synchronous faults (eg, unresolvable DMMU miss) that occur while
38925cf1a30Sjl * t_lofault is not NULL will not panic but will instead trampoline
39025cf1a30Sjl * to the registered lofault handler.  There is no need for any
39125cf1a30Sjl * membars for these - eg, our store to t_lofault will always be visible to
39225cf1a30Sjl * ourselves and it is our cpu which will take any trap.
39325cf1a30Sjl *
39425cf1a30Sjl * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
39525cf1a30Sjl * while t_lofault is not NULL will also not panic.  Since we're copying
39625cf1a30Sjl * to or from userland the extent of the damage is known - the destination
39725cf1a30Sjl * buffer is incomplete.  So trap handlers will trampoline to the lofault
39825cf1a30Sjl * handler in this case which should take some form of error action to
39925cf1a30Sjl * avoid using the incomplete buffer.  The trap handler also flags the
40025cf1a30Sjl * fault so that later return-from-trap handling (for the trap that brought
40125cf1a30Sjl * this thread into the kernel in the first place) can notify the process
40225cf1a30Sjl * and reboot the system (or restart the service with Greenline/Contracts).
40325cf1a30Sjl *
40425cf1a30Sjl * Asynchronous faults (eg, uncorrectable ECC error from memory) can
40525cf1a30Sjl * result in deferred error traps - the trap is taken sometime after
40625cf1a30Sjl * the event and the trap PC may not be the PC of the faulting access.
40725cf1a30Sjl * Delivery of such pending traps can be forced by a membar #Sync, acting
40825cf1a30Sjl * as an "error barrier" in this role.  To accurately apply the user/kernel
40925cf1a30Sjl * separation described in the preceding paragraph we must force delivery
41025cf1a30Sjl * of deferred traps affecting kernel state before we install a lofault
41125cf1a30Sjl * handler (if we interpose a new lofault handler on an existing one there
41225cf1a30Sjl * is no need to repeat this), and we must force delivery of deferred
41325cf1a30Sjl * errors affecting the lofault-protected region before we clear t_lofault.
41425cf1a30Sjl * Failure to do so results in lost kernel state being interpreted as
41525cf1a30Sjl * affecting a copyin/copyout only, or of an error that really only
41625cf1a30Sjl * affects copy data being interpreted as losing kernel state.
41725cf1a30Sjl *
41825cf1a30Sjl * Since the copy operations may preserve and later restore floating
41925cf1a30Sjl * point state that does not belong to the caller (see examples above),
42025cf1a30Sjl * we must be careful in how we do this in order to prevent corruption
42125cf1a30Sjl * of another program.
42225cf1a30Sjl *
42325cf1a30Sjl * To make sure that floating point state is always saved and restored
42425cf1a30Sjl * correctly, the following "big rules" must be followed when the floating
42525cf1a30Sjl * point registers will be used:
42625cf1a30Sjl *
42725cf1a30Sjl * 1. %l6 always holds the caller's lofault handler.  Also in this register,
42825cf1a30Sjl *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
42925cf1a30Sjl *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
43025cf1a30Sjl *    lofault handler was set coming in.
43125cf1a30Sjl *
43225cf1a30Sjl * 2. The FPUSED flag indicates that all FP state has been successfully stored
43325cf1a30Sjl *    on the stack.  It should not be set until this save has been completed.
43425cf1a30Sjl *
43525cf1a30Sjl * 3. The FPUSED flag should not be cleared on exit until all FP state has
43625cf1a30Sjl *    been restored from the stack.  If an error occurs while restoring
43725cf1a30Sjl *    data from the stack, the error handler can check this flag to see if
43825cf1a30Sjl *    a restore is necessary.
43925cf1a30Sjl *
44025cf1a30Sjl * 4. Code run under the new lofault handler must be kept to a minimum.  In
44125cf1a30Sjl *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
44225cf1a30Sjl *    to kpreempt(), should not be made until after the lofault handler has
44325cf1a30Sjl *    been restored.
44425cf1a30Sjl */
44525cf1a30Sjl
44625cf1a30Sjl/*
44725cf1a30Sjl * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
44825cf1a30Sjl * to "break even" using FP/VIS-accelerated memory operations.
44925cf1a30Sjl * The FPBLK code assumes a minimum number of bytes are available
45025cf1a30Sjl * to be moved on entry.  Check that code carefully before
45125cf1a30Sjl * reducing VIS_COPY_THRESHOLD below 256.
45225cf1a30Sjl */
45325cf1a30Sjl/*
45425cf1a30Sjl * This shadows sys/machsystm.h which can't be included due to the lack of
45525cf1a30Sjl * _ASM guards in include files it references. Change it here, change it there.
45625cf1a30Sjl */
#define VIS_COPY_THRESHOLD 256	/* minimum bytes before the FP/VIS path pays off */
45825cf1a30Sjl
45925cf1a30Sjl/*
46025cf1a30Sjl * TEST for very short copies
46125cf1a30Sjl * Be aware that the maximum unroll for the short unaligned case
46225cf1a30Sjl * is SHORTCOPY+1
46325cf1a30Sjl */
#define SHORTCOPY 3	/* copies of <= SHORTCOPY bytes take the tiny (sm_left) path */
#define CHKSIZE  39	/* short/medium cutoff: 7 bytes align shift + 4 * 8 unroll */
46625cf1a30Sjl
46725cf1a30Sjl/*
46825cf1a30Sjl * Indicates that we're to trampoline to the error handler.
46925cf1a30Sjl * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
47025cf1a30Sjl * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
47125cf1a30Sjl */
#define	FPUSED_FLAG	1	/* FP state has been saved on stack; restore on exit */
#define	TRAMP_FLAG	2	/* trampoline to the caller's lofault handler on error */
#define	MASK_FLAGS	3	/* strip both flags to recover the saved t_lofault */
47525cf1a30Sjl
47625cf1a30Sjl/*
47725cf1a30Sjl * Number of outstanding prefetches.
478c8a722abSpm * first prefetch moves data from L2 to L1 (n_reads)
479c8a722abSpm * second prefetch moves data from memory to L2 (one_read)
48025cf1a30Sjl */
#define	OLYMPUS_C_PREFETCH	24	/* reach of the first prefetch (see above) */
#define	OLYMPUS_C_2ND_PREFETCH	12	/* reach of the second prefetch */
48325cf1a30Sjl
#define	VIS_BLOCKSIZE		64	/* bytes moved by one block load/store */
48525cf1a30Sjl
48625cf1a30Sjl/*
 * Size of stack frame in order to accommodate a 64-byte aligned
48825cf1a30Sjl * floating-point register save area and 2 64-bit temp locations.
48925cf1a30Sjl * All copy functions use two quadrants of fp registers; to assure a
49025cf1a30Sjl * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
49225cf1a30Sjl * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
49325cf1a30Sjl *
49425cf1a30Sjl *    _______________________________________ <-- %fp + STACK_BIAS
49525cf1a30Sjl *    | We may need to preserve 2 quadrants |
49625cf1a30Sjl *    | of fp regs, but since we do so with |
49725cf1a30Sjl *    | BST/BLD we need room in which to    |
49825cf1a30Sjl *    | align to VIS_BLOCKSIZE bytes.  So   |
49925cf1a30Sjl *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
50025cf1a30Sjl *    |-------------------------------------|
50125cf1a30Sjl *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
50225cf1a30Sjl *    |-------------------------------------|
50325cf1a30Sjl *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
50425cf1a30Sjl *    ---------------------------------------
50525cf1a30Sjl */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))	/* 3 blocks + %fprs + %gsr */
#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)	/* below %fp + STACK_BIAS */
#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)	/* subtract, then align down, to land in the 3-block area */
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
51125cf1a30Sjl
51225cf1a30Sjl/*
51325cf1a30Sjl * Common macros used by the various versions of the block copy
51425cf1a30Sjl * routines in this file.
51525cf1a30Sjl */
51625cf1a30Sjl
51725cf1a30Sjl/*
51825cf1a30Sjl * In FP copies if we do not have preserved data to restore over
51925cf1a30Sjl * the fp regs we used then we must zero those regs to avoid
52025cf1a30Sjl * exposing portions of the data to later threads (data security).
52125cf1a30Sjl *
52225cf1a30Sjl * Copy functions use either quadrants 1 and 3 or 2 and 4.
52325cf1a30Sjl *
52425cf1a30Sjl * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
52525cf1a30Sjl * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
52625cf1a30Sjl *
52725cf1a30Sjl * The instructions below are quicker than repeated fzero instructions
52825cf1a30Sjl * since they can dispatch down two fp pipelines.
52925cf1a30Sjl */
/* Zero Q1 (%f0-%f14) and Q3 (%f32-%f46) by fanning out the zeroed %f0. */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46
54725cf1a30Sjl
/*
 * Zero Q2 (%f16-%f30) and Q4 (%f48-%f62) by fanning out the zeroed %f16.
 * %f16 is the only register this macro itself zeroes, so it must also be
 * the source of every fmovd: %f0 belongs to the other quadrant pair and
 * is not guaranteed to be zero on this path, and copying it would leave
 * stale data in Q2/Q4, defeating the data-security purpose above.
 */
#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62
56525cf1a30Sjl
56625cf1a30Sjl/*
56725cf1a30Sjl * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
56825cf1a30Sjl * Used to save and restore in-use fp registers when we want to use FP
56925cf1a30Sjl * and find fp already in use and copy size still large enough to justify
57025cf1a30Sjl * the additional overhead of this save and restore.
57125cf1a30Sjl *
57225cf1a30Sjl * A membar #Sync is needed before save to sync fp ops initiated before
57325cf1a30Sjl * the call to the copy function (by whoever has fp in use); for example
57425cf1a30Sjl * an earlier block load to the quadrant we are about to save may still be
57525cf1a30Sjl * "in flight".  A membar #Sync is required at the end of the save to
57625cf1a30Sjl * sync our block store (the copy code is about to begin ldd's to the
57725cf1a30Sjl * first quadrant).
57825cf1a30Sjl *
57925cf1a30Sjl * Similarly: a membar #Sync before restore allows the block stores of
58025cf1a30Sjl * the copy operation to complete before we fill the quadrants with their
58125cf1a30Sjl * original data, and a membar #Sync after restore lets the block loads
58225cf1a30Sjl * of the restore complete before we return to whoever has the fp regs
58325cf1a30Sjl * in use.  To avoid repeated membar #Sync we make it the responsibility
58425cf1a30Sjl * of the copy code to membar #Sync immediately after copy is complete
58525cf1a30Sjl * and before using the BLD_*_FROMSTACK macro.
58625cf1a30Sjl */
/* Block-store Q1 (%f0-%f15) and Q3 (%f32-%f47) to the aligned stack save area. */
#define BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync
59525cf1a30Sjl
/*
 * Block-load fp quadrants 1 (%f0-%f14) and 3 (%f32-%f46) back from the
 * stack save area.  The copy code must have issued a membar #Sync after
 * its last block store before invoking this (see block comment above).
 */
#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0 /* restore quadrant 1 */	;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32 /* restore quadrant 3 */	;\
	membar	#Sync	/* loads complete before fp owner resumes */
60425cf1a30Sjl
/*
 * Block-store fp quadrants 2 (%f16-%f30) and 4 (%f48-%f62) to the
 * VIS_BLOCKSIZE-aligned save area in our stack frame.  The caller is
 * responsible for the leading membar #Sync (see block comment above).
 */
#define BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P /* save quadrant 2 */	;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P /* save quadrant 4 */	;\
	membar	#Sync	/* stores complete before copy's ldd's begin */
61325cf1a30Sjl
/*
 * Block-load fp quadrants 2 (%f16-%f30) and 4 (%f48-%f62) back from the
 * stack save area.  The copy code must have issued a membar #Sync after
 * its last block store before invoking this (see block comment above).
 */
#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16 /* restore quadrant 2 */	;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48 /* restore quadrant 4 */	;\
	membar	#Sync	/* loads complete before fp owner resumes */
62225cf1a30Sjl
62325cf1a30Sjl/*
62425cf1a30Sjl * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
62525cf1a30Sjl * prevent preemption if there is no t_lwp to save FP state to on context
62625cf1a30Sjl * switch) before commencing a FP copy, and reallow it on completion or
62725cf1a30Sjl * in error trampoline paths when we were using FP copy.
62825cf1a30Sjl *
62925cf1a30Sjl * Both macros may call other functions, so be aware that all outputs are
63025cf1a30Sjl * forfeit after using these macros.  For this reason we do not pass registers
63125cf1a30Sjl * to use - we just use any outputs we want.
63225cf1a30Sjl *
63325cf1a30Sjl * Pseudo code:
63425cf1a30Sjl *
63525cf1a30Sjl * FP_NOMIGRATE:
63625cf1a30Sjl *
63725cf1a30Sjl * if (curthread->t_lwp) {
63825cf1a30Sjl *	thread_nomigrate();
63925cf1a30Sjl * } else {
64025cf1a30Sjl *	kpreempt_disable();
64125cf1a30Sjl * }
64225cf1a30Sjl *
64325cf1a30Sjl * FP_ALLOWMIGRATE:
64425cf1a30Sjl *
64525cf1a30Sjl * if (curthread->t_lwp) {
64625cf1a30Sjl *	thread_allowmigrate();
64725cf1a30Sjl * } else {
64825cf1a30Sjl *	kpreempt_enable();
64925cf1a30Sjl * }
65025cf1a30Sjl */
65125cf1a30Sjl
/*
 * See pseudo-code above: threads with an lwp call thread_nomigrate();
 * lwp-less threads inline kpreempt_disable() by bumping t_preempt.
 * Clobbers %o0/%o1 directly and, via the call, potentially all outputs.
 * Note the annulled branch: the ldsb in the delay slot executes only
 * when the branch is taken (the no-lwp case).
 */
#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f /* no lwp? */			;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate /* lwp thread */		;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	inc	%o1 /* kpreempt_disable(): t_preempt++ */	;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:
66425cf1a30Sjl
/*
 * See pseudo-code above: threads with an lwp call thread_allowmigrate();
 * lwp-less threads inline kpreempt_enable(): decrement t_preempt and,
 * if it reaches zero while a kernel preemption is pending
 * (cpu_kprunrun set), call kpreempt().  Clobbers %o0/%o1 directly and,
 * via the calls, potentially all outputs.  The stb in the brnz delay
 * slot is not annulled, so the decremented count is stored back on
 * both paths.
 */
#define	FP_ALLOWMIGRATE(label1, label2)			\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f /* no lwp? */			;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call thread_allowmigrate /* lwp thread */		;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	dec	%o1 /* kpreempt_enable(): t_preempt-- */	;\
	brnz,pn	%o1, label2/**/f				;\
	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0 /* preemption wanted? */ ;\
	brz,pt	%o0, label2/**/f				;\
	  nop							;\
	call	kpreempt					;\
	  rdpr	%pil, %o0 /* kpreempt(curpil) */		;\
label2:
68425cf1a30Sjl
68525cf1a30Sjl/*
68625cf1a30Sjl * Copy a block of storage, returning an error code if `from' or
68725cf1a30Sjl * `to' takes a kernel pagefault which cannot be resolved.
68825cf1a30Sjl * Returns errno value on pagefault error, 0 if all ok
68925cf1a30Sjl */
69025cf1a30Sjl
69125cf1a30Sjl	.seg	".text"
69225cf1a30Sjl	.align	4
69325cf1a30Sjl
	ENTRY(kcopy)

	! kcopy(from %o0, to %o1, count %o2): copy like bcopy, but return
	! an errno (left in %g1 by the trap code) on an unresolvable
	! kernel pagefault, 0 on success.
	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy (leaf) case
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop

.kcopy_small:
	! Leaf-routine (non-FP) copy.  %o4 keeps the previous t_lofault;
	! TRAMP_FLAG is NOT set, so on fault .sm_copyerr returns errno
	! directly rather than trampolining to the old handler.
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	! FP/VIS copy: take a register window.  %l6 keeps the previous
	! t_lofault; TRAMP_FLAG is NOT set, so on fault .copyerr returns
	! errno directly rather than trampolining to the old handler.
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
76525cf1a30Sjl
76625cf1a30Sjl
/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	! Install a second-level handler first: if we fault again while
	! restoring fp state below, .copyerr2 panics rather than looping.
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f			! fp never touched, skip restore
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f			! fp was free on entry
	  nop

	! fp was live on entry: reload quadrants 1 and 3 from the stack
	! save area (membar #Sync precedes us via the error barrier above).
	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	! fp was not in use on entry, so nothing was saved; zero the
	! quadrants we dirtied so copy data doesn't leak, then restore fprs.
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy. kcopy will *always* set a t_lofault handler
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler. As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler. In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f			! bcopy path: trampoline
	  nop
	ret
	  restore	%g1, 0, %o0		! kcopy path: return errno

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window
82525cf1a30Sjl
/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4			! bcopy (trampoline) or kcopy?
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0			! kcopy path: return errno
3:
	jmp	%o4				! goto real handler
	  mov	%g0, %o0			! bcopy path: %o0 = 0

	SET_SIZE(kcopy)
85825cf1a30Sjl
85925cf1a30Sjl
86025cf1a30Sjl/*
86125cf1a30Sjl * Copy a block of storage - must not overlap (from + len <= to).
86225cf1a30Sjl * Registers: l6 - saved t_lofault
86325cf1a30Sjl * (for short copies, o4 - saved t_lofault)
 *
 * NOTE(review): the two lines below look like a stale paste from a
 * page-copy routine; bcopy itself handles any alignment and any count.
 * Copy a page of memory.
 * Assumes double word alignment and a count >= 256.
86725cf1a30Sjl */
86825cf1a30Sjl
	ENTRY(bcopy)

	! bcopy(from %o0, to %o1, count %o2): dispatch on size and on the
	! mutual alignment of src/dst (xor exposes differing low bits) to
	! the leaf small-copy code or the FP/VIS large-copy code.
	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy (leaf) case
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
92325cf1a30Sjl
	.align	16
.bcopy_small:
	! Unlike kcopy, bcopy only installs a lofault handler if the
	! caller already had one; TRAMP_FLAG marks that an error must
	! trampoline to that prior handler.
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy		! no handler, don't install one
	  nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	  cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	  stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	! move the final 0-3 remaining bytes, one at a time
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	  deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	ba,pt   %ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 2]
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	! word-at-a-time unrolled x2: both src and dst are word aligned
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	  stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	  sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  nop
.bc_sm_byte:
	ldub	[%o0], %o3		! last odd byte
	ba,pt   %ncc, .bc_sm_exit
	  stb	%o3, [%o1]

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	! restore t_lofault if a handler was installed, then return 0
	ldn     [THREAD_REG + T_LOFAULT], %o3
	brz,pt  %o3, .bc_sm_done
	  nop
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	  mov	%g0, %o0		! return 0
103225cf1a30Sjl
	.align 16
.bc_med:
	! Medium-length copies: count > CHKSIZE but <= VIS_COPY_THRESHOLD
	! (or HW copy disabled).  Choose the widest move that src/dst
	! mutual alignment permits.
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	  nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	  nop
.bc_med_long0:
	! copy bytes until SRC is word aligned
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	  dec	%o2
.bc_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	  nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	  stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	  nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	  add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	! 0-7 bytes remain; finish in the small-copy tail code
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop
110225cf1a30Sjl
	.align 16
.bc_med_word:
	! src/dst mutually word (but not longword) alignable
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	  nop
.bc_med_word0:
	! copy bytes until SRC is word aligned
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	  dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	  stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	  nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	! 0-3 bytes remain; finish in the small-copy tail code
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop
115225cf1a30Sjl
	.align 16
.bc_med_half:
	! src/dst mutually halfword (but not word) alignable
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	  nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	  sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	SET_SIZE(bcopy)
119025cf1a30Sjl
119125cf1a30Sjl/*
119225cf1a30Sjl * The _more entry points are not intended to be used directly by
119325cf1a30Sjl * any caller from outside this file.  They are provided to allow
119425cf1a30Sjl * profiling and dtrace of the portions of the copy code that uses
119525cf1a30Sjl * the floating point registers.
119625cf1a30Sjl * This entry is particularly important as DTRACE (at least as of
119725cf1a30Sjl * 4/2004) does not support leaf functions.
119825cf1a30Sjl */
119925cf1a30Sjl
120025cf1a30Sjl	ENTRY(bcopy_more)
120125cf1a30Sjl.bcopy_more:
120225cf1a30Sjl	prefetch [%o0], #n_reads
120325cf1a30Sjl	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
120425cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
120525cf1a30Sjl	tst	%l6
120625cf1a30Sjl	bz,pt	%ncc, .do_copy
120725cf1a30Sjl	  nop
120825cf1a30Sjl	sethi	%hi(.copyerr), %o2
120925cf1a30Sjl	or	%o2, %lo(.copyerr), %o2
121025cf1a30Sjl	membar	#Sync				! sync error barrier
121125cf1a30Sjl	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
121225cf1a30Sjl	!
121325cf1a30Sjl	! We've already captured whether t_lofault was zero on entry.
121425cf1a30Sjl	! We need to mark ourselves as being from bcopy since both
121525cf1a30Sjl	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
121625cf1a30Sjl	! and the saved lofault was zero, we won't reset lofault on
121725cf1a30Sjl	! returning.
121825cf1a30Sjl	!
121925cf1a30Sjl	or	%l6, TRAMP_FLAG, %l6
122025cf1a30Sjl
122125cf1a30Sjl/*
122225cf1a30Sjl * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
122325cf1a30Sjl * Also, use of FP registers has been tested to be enabled
122425cf1a30Sjl */
122525cf1a30Sjl.do_copy:
122625cf1a30Sjl	FP_NOMIGRATE(6, 7)
122725cf1a30Sjl
122825cf1a30Sjl	rd	%fprs, %o2		! check for unused fp
122925cf1a30Sjl	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
123025cf1a30Sjl	btst	FPRS_FEF, %o2
123125cf1a30Sjl	bz,a,pt	%icc, .do_blockcopy
123225cf1a30Sjl	  wr	%g0, FPRS_FEF, %fprs
123325cf1a30Sjl
123425cf1a30Sjl	BST_FPQ1Q3_TOSTACK(%o2)
123525cf1a30Sjl
123625cf1a30Sjl.do_blockcopy:
123725cf1a30Sjl	rd	%gsr, %o2
123825cf1a30Sjl	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
123925cf1a30Sjl	or	%l6, FPUSED_FLAG, %l6
124025cf1a30Sjl
124125cf1a30Sjl#define	REALSRC	%i0
124225cf1a30Sjl#define	DST	%i1
124325cf1a30Sjl#define	CNT	%i2
124425cf1a30Sjl#define	SRC	%i3
124525cf1a30Sjl#define	TMP	%i5
124625cf1a30Sjl
124725cf1a30Sjl	andcc	DST, VIS_BLOCKSIZE - 1, TMP
124825cf1a30Sjl	bz,pt	%ncc, 2f
124925cf1a30Sjl	  neg	TMP
125025cf1a30Sjl	add	TMP, VIS_BLOCKSIZE, TMP
125125cf1a30Sjl
125225cf1a30Sjl	! TMP = bytes required to align DST on FP_BLOCK boundary
125325cf1a30Sjl	! Using SRC as a tmp here
125425cf1a30Sjl	cmp	TMP, 3
125525cf1a30Sjl	bleu,pt	%ncc, 1f
125625cf1a30Sjl	  sub	CNT,TMP,CNT		! adjust main count
125725cf1a30Sjl	sub	TMP, 3, TMP		! adjust for end of loop test
125825cf1a30Sjl.bc_blkalign:
125925cf1a30Sjl	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
126025cf1a30Sjl	stb	SRC, [DST]
126125cf1a30Sjl	subcc	TMP, 4, TMP
126225cf1a30Sjl	ldub	[REALSRC + 1], SRC
126325cf1a30Sjl	add	REALSRC, 4, REALSRC
126425cf1a30Sjl	stb	SRC, [DST + 1]
126525cf1a30Sjl	ldub	[REALSRC - 2], SRC
126625cf1a30Sjl	add	DST, 4, DST
126725cf1a30Sjl	stb	SRC, [DST - 2]
126825cf1a30Sjl	ldub	[REALSRC - 1], SRC
126925cf1a30Sjl	bgu,pt	%ncc, .bc_blkalign
127025cf1a30Sjl	  stb	SRC, [DST - 1]
127125cf1a30Sjl
127225cf1a30Sjl	addcc	TMP, 3, TMP		! restore count adjustment
127325cf1a30Sjl	bz,pt	%ncc, 2f		! no bytes left?
127425cf1a30Sjl	  nop
127525cf1a30Sjl1:	ldub	[REALSRC], SRC
127625cf1a30Sjl	inc	REALSRC
127725cf1a30Sjl	inc	DST
127825cf1a30Sjl	deccc	TMP
127925cf1a30Sjl	bgu	%ncc, 1b
128025cf1a30Sjl	  stb	SRC, [DST - 1]
128125cf1a30Sjl
128225cf1a30Sjl2:
128325cf1a30Sjl	membar	#StoreLoad
128425cf1a30Sjl	andn	REALSRC, 0x7, SRC
128525cf1a30Sjl
128625cf1a30Sjl	! SRC - 8-byte aligned
128725cf1a30Sjl	! DST - 64-byte aligned
128825cf1a30Sjl	ldd	[SRC], %f0
128925cf1a30Sjl	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
129025cf1a30Sjl	alignaddr REALSRC, %g0, %g0
129125cf1a30Sjl	ldd	[SRC + 0x08], %f2
129225cf1a30Sjl	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
129325cf1a30Sjl	faligndata %f0, %f2, %f32
129425cf1a30Sjl	ldd	[SRC + 0x10], %f4
1295c8a722abSpm	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
129625cf1a30Sjl	faligndata %f2, %f4, %f34
129725cf1a30Sjl	ldd	[SRC + 0x18], %f6
129825cf1a30Sjl	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
129925cf1a30Sjl	faligndata %f4, %f6, %f36
130025cf1a30Sjl	ldd	[SRC + 0x20], %f8
1301c8a722abSpm	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
130225cf1a30Sjl	faligndata %f6, %f8, %f38
130325cf1a30Sjl	ldd	[SRC + 0x28], %f10
1304c8a722abSpm	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
130525cf1a30Sjl	faligndata %f8, %f10, %f40
130625cf1a30Sjl	ldd	[SRC + 0x30], %f12
1307c8a722abSpm	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
130825cf1a30Sjl	faligndata %f10, %f12, %f42
130925cf1a30Sjl	ldd	[SRC + 0x38], %f14
131025cf1a30Sjl	ldd	[SRC + VIS_BLOCKSIZE], %f0
131125cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
131225cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
1313c8a722abSpm	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
131425cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
131525cf1a30Sjl	ba,pt	%ncc, 1f
1316c8a722abSpm	  prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
131725cf1a30Sjl	.align	32
131825cf1a30Sjl1:
131925cf1a30Sjl	ldd	[SRC + 0x08], %f2
132025cf1a30Sjl	faligndata %f12, %f14, %f44
132125cf1a30Sjl	ldd	[SRC + 0x10], %f4
132225cf1a30Sjl	faligndata %f14, %f0, %f46
132325cf1a30Sjl	stda	%f32, [DST]ASI_BLK_P
132425cf1a30Sjl	ldd	[SRC + 0x18], %f6
132525cf1a30Sjl	faligndata %f0, %f2, %f32
132625cf1a30Sjl	ldd	[SRC + 0x20], %f8
132725cf1a30Sjl	faligndata %f2, %f4, %f34
132825cf1a30Sjl	ldd	[SRC + 0x28], %f10
132925cf1a30Sjl	faligndata %f4, %f6, %f36
133025cf1a30Sjl	ldd	[SRC + 0x30], %f12
133125cf1a30Sjl	faligndata %f6, %f8, %f38
1332c8a722abSpm	sub	CNT, VIS_BLOCKSIZE, CNT
133325cf1a30Sjl	ldd	[SRC + 0x38], %f14
133425cf1a30Sjl	faligndata %f8, %f10, %f40
1335c8a722abSpm	add	DST, VIS_BLOCKSIZE, DST
133625cf1a30Sjl	ldd	[SRC + VIS_BLOCKSIZE], %f0
133725cf1a30Sjl	faligndata %f10, %f12, %f42
133825cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1339c8a722abSpm	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1340c8a722abSpm	add	SRC, VIS_BLOCKSIZE, SRC
1341c8a722abSpm	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
134225cf1a30Sjl	cmp	CNT, VIS_BLOCKSIZE + 8
134325cf1a30Sjl	bgu,pt	%ncc, 1b
1344c8a722abSpm	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
134525cf1a30Sjl
134625cf1a30Sjl	! only if REALSRC & 0x7 is 0
134725cf1a30Sjl	cmp	CNT, VIS_BLOCKSIZE
134825cf1a30Sjl	bne	%ncc, 3f
134925cf1a30Sjl	  andcc	REALSRC, 0x7, %g0
135025cf1a30Sjl	bz,pt	%ncc, 2f
135125cf1a30Sjl	  nop
135225cf1a30Sjl3:
135325cf1a30Sjl	faligndata %f12, %f14, %f44
135425cf1a30Sjl	faligndata %f14, %f0, %f46
135525cf1a30Sjl	stda	%f32, [DST]ASI_BLK_P
135625cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
135725cf1a30Sjl	ba,pt	%ncc, 3f
135825cf1a30Sjl	  nop
135925cf1a30Sjl2:
136025cf1a30Sjl	ldd	[SRC + 0x08], %f2
136125cf1a30Sjl	fsrc1	%f12, %f44
136225cf1a30Sjl	ldd	[SRC + 0x10], %f4
136325cf1a30Sjl	fsrc1	%f14, %f46
136425cf1a30Sjl	stda	%f32, [DST]ASI_BLK_P
136525cf1a30Sjl	ldd	[SRC + 0x18], %f6
136625cf1a30Sjl	fsrc1	%f0, %f32
136725cf1a30Sjl	ldd	[SRC + 0x20], %f8
136825cf1a30Sjl	fsrc1	%f2, %f34
136925cf1a30Sjl	ldd	[SRC + 0x28], %f10
137025cf1a30Sjl	fsrc1	%f4, %f36
137125cf1a30Sjl	ldd	[SRC + 0x30], %f12
137225cf1a30Sjl	fsrc1	%f6, %f38
137325cf1a30Sjl	ldd	[SRC + 0x38], %f14
137425cf1a30Sjl	fsrc1	%f8, %f40
137525cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
137625cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
137725cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
137825cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
137925cf1a30Sjl	fsrc1	%f10, %f42
138025cf1a30Sjl	fsrc1	%f12, %f44
138125cf1a30Sjl	fsrc1	%f14, %f46
138225cf1a30Sjl	stda	%f32, [DST]ASI_BLK_P
138325cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
138425cf1a30Sjl	ba,a,pt	%ncc, .bcb_exit
138525cf1a30Sjl	  nop
138625cf1a30Sjl
138725cf1a30Sjl3:	tst	CNT
138825cf1a30Sjl	bz,a,pt	%ncc, .bcb_exit
138925cf1a30Sjl	  nop
139025cf1a30Sjl
139125cf1a30Sjl5:	ldub	[REALSRC], TMP
139225cf1a30Sjl	inc	REALSRC
139325cf1a30Sjl	inc	DST
139425cf1a30Sjl	deccc	CNT
139525cf1a30Sjl	bgu	%ncc, 5b
139625cf1a30Sjl	  stb	TMP, [DST - 1]
139725cf1a30Sjl.bcb_exit:
139825cf1a30Sjl	membar	#Sync
139925cf1a30Sjl
140025cf1a30Sjl	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
140125cf1a30Sjl	wr	%o2, 0, %gsr
140225cf1a30Sjl
140325cf1a30Sjl	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
140425cf1a30Sjl	btst	FPRS_FEF, %o3
140525cf1a30Sjl	bz,pt	%icc, 4f
140625cf1a30Sjl	  nop
140725cf1a30Sjl
140825cf1a30Sjl	BLD_FPQ1Q3_FROMSTACK(%o2)
140925cf1a30Sjl
141025cf1a30Sjl	ba,pt	%ncc, 2f
141125cf1a30Sjl	  wr	%o3, 0, %fprs		! restore fprs
141225cf1a30Sjl4:
141325cf1a30Sjl	FZEROQ1Q3
141425cf1a30Sjl	wr	%o3, 0, %fprs		! restore fprs
141525cf1a30Sjl2:
141625cf1a30Sjl	membar	#Sync				! sync error barrier
141725cf1a30Sjl	andn	%l6, MASK_FLAGS, %l6
141825cf1a30Sjl	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
141925cf1a30Sjl	FP_ALLOWMIGRATE(5, 6)
142025cf1a30Sjl	ret
142125cf1a30Sjl	  restore	%g0, 0, %o0
142225cf1a30Sjl
142325cf1a30Sjl	SET_SIZE(bcopy_more)
142425cf1a30Sjl
142525cf1a30Sjl/*
142625cf1a30Sjl * Block copy with possibly overlapped operands.
142725cf1a30Sjl */
142825cf1a30Sjl
142925cf1a30Sjl	ENTRY(ovbcopy)
	! ovbcopy(from [%o0], to [%o1], count [%o2])
	! Byte-by-byte copy that tolerates overlapping operands.  If the
	! regions cannot overlap (count <= |from - to|) the work is handed
	! off to the optimized bcopy; otherwise the copy direction is
	! chosen so that no source byte is overwritten before it is read.
143025cf1a30Sjl	tst	%o2			! check count
143125cf1a30Sjl	bgu,a	%ncc, 1f		! nothing to do or bad arguments
143225cf1a30Sjl	  subcc	%o0, %o1, %o3		! difference of from and to address
143325cf1a30Sjl
143425cf1a30Sjl	retl				! return
143525cf1a30Sjl	  nop
143625cf1a30Sjl1:
143725cf1a30Sjl	bneg,a	%ncc, 2f
143825cf1a30Sjl	  neg	%o3			! if < 0, make it positive
143925cf1a30Sjl2:	cmp	%o2, %o3		! cmp size and abs(from - to)
144025cf1a30Sjl	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
144125cf1a30Sjl	  .empty				!   no overlap
144225cf1a30Sjl	  cmp	%o0, %o1		! compare from and to addresses
144325cf1a30Sjl	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
144425cf1a30Sjl	  nop
144525cf1a30Sjl	!
144625cf1a30Sjl	! Copy forwards.
144725cf1a30Sjl	!
144825cf1a30Sjl.ov_fwd:
144925cf1a30Sjl	ldub	[%o0], %o3		! read from address
145025cf1a30Sjl	inc	%o0			! inc from address
145125cf1a30Sjl	stb	%o3, [%o1]		! write to address
145225cf1a30Sjl	deccc	%o2			! dec count
145325cf1a30Sjl	bgu	%ncc, .ov_fwd		! loop till done
145425cf1a30Sjl	  inc	%o1			! inc to address
145525cf1a30Sjl
145625cf1a30Sjl	retl				! return
145725cf1a30Sjl	  nop
145825cf1a30Sjl	!
145925cf1a30Sjl	! Copy backwards.
146025cf1a30Sjl	!
146125cf1a30Sjl.ov_bkwd:
146225cf1a30Sjl	deccc	%o2			! dec count
146325cf1a30Sjl	ldub	[%o0 + %o2], %o3	! get byte at end of src
146425cf1a30Sjl	bgu	%ncc, .ov_bkwd		! loop till done
146525cf1a30Sjl	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
146625cf1a30Sjl
146725cf1a30Sjl	retl				! return
146825cf1a30Sjl	  nop
146925cf1a30Sjl
147025cf1a30Sjl	SET_SIZE(ovbcopy)
147125cf1a30Sjl
147225cf1a30Sjl
147325cf1a30Sjl/*
147425cf1a30Sjl * hwblkpagecopy()
147525cf1a30Sjl *
147625cf1a30Sjl * Copies exactly one page.  This routine assumes the caller (ppcopy)
147725cf1a30Sjl * has already disabled kernel preemption and has checked
147825cf1a30Sjl * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
147925cf1a30Sjl */
148025cf1a30Sjl	ENTRY(hwblkpagecopy)
148125cf1a30Sjl	! get another window w/space for three aligned blocks of saved fpregs
	! hwblkpagecopy(src [%i0], dst [%i1])
	! Copy exactly PAGESIZE bytes with VIS 64-byte block stores.  The
	! caller (ppcopy) has already disabled preemption, so no lofault
	! setup or FP_NOMIGRATE protection is needed here.  SRC/DST/CNT/
	! REALSRC are register aliases -- presumably defined earlier in
	! this file (not visible in this chunk); verify against the header.
148225cf1a30Sjl	prefetch [%o0], #n_reads
148325cf1a30Sjl	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
148425cf1a30Sjl
148525cf1a30Sjl	! %i0 - source address (arg)
148625cf1a30Sjl	! %i1 - destination address (arg)
148725cf1a30Sjl	! %i2 - length of region (not arg)
148825cf1a30Sjl	! %l0 - saved fprs
148925cf1a30Sjl	! %l1 - pointer to saved fpregs
149025cf1a30Sjl
149125cf1a30Sjl	rd	%fprs, %l0		! check for unused fp
149225cf1a30Sjl	btst	FPRS_FEF, %l0
149325cf1a30Sjl	bz,a,pt	%icc, 1f
149425cf1a30Sjl	  wr	%g0, FPRS_FEF, %fprs
149525cf1a30Sjl
	! FP was live on entry: save the caller's quads before we use them.
149625cf1a30Sjl	BST_FPQ1Q3_TOSTACK(%l1)
149725cf1a30Sjl
149825cf1a30Sjl1:	set	PAGESIZE, CNT
149925cf1a30Sjl	mov	REALSRC, SRC
150025cf1a30Sjl
	! Prime the pipeline: load the first 64-byte block into %f0-%f14,
	! stage it into %f32-%f46, and issue prefetches well ahead of use.
150125cf1a30Sjl	ldd	[SRC], %f0
150225cf1a30Sjl	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
150325cf1a30Sjl	ldd	[SRC + 0x08], %f2
150425cf1a30Sjl	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
150525cf1a30Sjl	fmovd	%f0, %f32
150625cf1a30Sjl	ldd	[SRC + 0x10], %f4
1507c8a722abSpm	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
150825cf1a30Sjl	fmovd	%f2, %f34
150925cf1a30Sjl	ldd	[SRC + 0x18], %f6
151025cf1a30Sjl	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
151125cf1a30Sjl	fmovd	%f4, %f36
151225cf1a30Sjl	ldd	[SRC + 0x20], %f8
1513c8a722abSpm	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
151425cf1a30Sjl	fmovd	%f6, %f38
151525cf1a30Sjl	ldd	[SRC + 0x28], %f10
1516c8a722abSpm	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
151725cf1a30Sjl	fmovd	%f8, %f40
151825cf1a30Sjl	ldd	[SRC + 0x30], %f12
1519c8a722abSpm	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
152025cf1a30Sjl	fmovd	%f10, %f42
152125cf1a30Sjl	ldd	[SRC + 0x38], %f14
152225cf1a30Sjl	ldd	[SRC + VIS_BLOCKSIZE], %f0
152325cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
152425cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
1525c8a722abSpm	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
152625cf1a30Sjl	ba,pt	%ncc, 2f
1527c8a722abSpm	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
152825cf1a30Sjl	.align	32
	! Main loop: store the staged block with a 64-byte block store
	! while loading and staging the next block in parallel.
152925cf1a30Sjl2:
153025cf1a30Sjl	ldd	[SRC + 0x08], %f2
153125cf1a30Sjl	fmovd	%f12, %f44
153225cf1a30Sjl	ldd	[SRC + 0x10], %f4
153325cf1a30Sjl	fmovd	%f14, %f46
153425cf1a30Sjl	stda	%f32, [DST]ASI_BLK_P
153525cf1a30Sjl	ldd	[SRC + 0x18], %f6
153625cf1a30Sjl	fmovd	%f0, %f32
153725cf1a30Sjl	ldd	[SRC + 0x20], %f8
153825cf1a30Sjl	fmovd	%f2, %f34
153925cf1a30Sjl	ldd	[SRC + 0x28], %f10
154025cf1a30Sjl	fmovd	%f4, %f36
154125cf1a30Sjl	ldd	[SRC + 0x30], %f12
154225cf1a30Sjl	fmovd	%f6, %f38
154325cf1a30Sjl	ldd	[SRC + 0x38], %f14
154425cf1a30Sjl	fmovd	%f8, %f40
154525cf1a30Sjl	ldd	[SRC + VIS_BLOCKSIZE], %f0
154625cf1a30Sjl	fmovd	%f10, %f42
154725cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
1548c8a722abSpm	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
154925cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
1550c8a722abSpm	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1551c8a722abSpm	add	SRC, VIS_BLOCKSIZE, SRC
155225cf1a30Sjl	cmp	CNT, VIS_BLOCKSIZE + 8
155325cf1a30Sjl	bgu,pt	%ncc, 2b
1554c8a722abSpm	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
155525cf1a30Sjl
155625cf1a30Sjl	! trailing block
155725cf1a30Sjl	ldd	[SRC + 0x08], %f2
155825cf1a30Sjl	fsrc1	%f12, %f44
155925cf1a30Sjl	ldd	[SRC + 0x10], %f4
156025cf1a30Sjl	fsrc1	%f14, %f46
156125cf1a30Sjl	stda	%f32, [DST]ASI_BLK_P
156225cf1a30Sjl	ldd	[SRC + 0x18], %f6
156325cf1a30Sjl	fsrc1	%f0, %f32
156425cf1a30Sjl	ldd	[SRC + 0x20], %f8
156525cf1a30Sjl	fsrc1	%f2, %f34
156625cf1a30Sjl	ldd	[SRC + 0x28], %f10
156725cf1a30Sjl	fsrc1	%f4, %f36
156825cf1a30Sjl	ldd	[SRC + 0x30], %f12
156925cf1a30Sjl	fsrc1	%f6, %f38
157025cf1a30Sjl	ldd	[SRC + 0x38], %f14
157125cf1a30Sjl	fsrc1	%f8, %f40
157225cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
157325cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
157425cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
157525cf1a30Sjl	fsrc1	%f10, %f42
157625cf1a30Sjl	fsrc1	%f12, %f44
157725cf1a30Sjl	fsrc1	%f14, %f46
157825cf1a30Sjl	stda	%f32, [DST]ASI_BLK_P
157925cf1a30Sjl
158025cf1a30Sjl	membar	#Sync
158125cf1a30Sjl
	! Restore the caller's FP state if it was live on entry; otherwise
	! clear the quads we used before returning.
158225cf1a30Sjl	btst	FPRS_FEF, %l0
158325cf1a30Sjl	bz,pt	%icc, 2f
158425cf1a30Sjl	  nop
158525cf1a30Sjl
158625cf1a30Sjl	BLD_FPQ1Q3_FROMSTACK(%l3)
158725cf1a30Sjl	ba	3f
158825cf1a30Sjl	  nop
158925cf1a30Sjl
159025cf1a30Sjl2:	FZEROQ1Q3
159125cf1a30Sjl
159225cf1a30Sjl3:	wr	%l0, 0, %fprs		! restore fprs
159325cf1a30Sjl	ret
159425cf1a30Sjl	  restore	%g0, 0, %o0
159525cf1a30Sjl
159625cf1a30Sjl	SET_SIZE(hwblkpagecopy)
159725cf1a30Sjl
159825cf1a30Sjl
159925cf1a30Sjl/*
160025cf1a30Sjl * Transfer data to and from user space -
160125cf1a30Sjl * Note that these routines can cause faults
160225cf1a30Sjl * It is assumed that the kernel has nothing at
160325cf1a30Sjl * less than KERNELBASE in the virtual address space.
160425cf1a30Sjl *
160525cf1a30Sjl * Note that copyin(9F) and copyout(9F) are part of the
160625cf1a30Sjl * DDI/DKI which specifies that they return '-1' on "errors."
160725cf1a30Sjl *
160825cf1a30Sjl * Sigh.
160925cf1a30Sjl *
161025cf1a30Sjl * So there are two extremely similar routines - xcopyin() and xcopyout()
161125cf1a30Sjl * which return the errno that we've faithfully computed.  This
161225cf1a30Sjl * allows other callers (e.g. uiomove(9F)) to work correctly.
161325cf1a30Sjl * Given that these are used pretty heavily, we expand the calling
161425cf1a30Sjl * sequences inline for all flavours (rather than making wrappers).
161525cf1a30Sjl *
161625cf1a30Sjl * There are also stub routines for xcopyout_little and xcopyin_little,
161725cf1a30Sjl * which currently are intended to handle requests of <= 16 bytes from
161825cf1a30Sjl * do_unaligned. Future enhancement to make them handle 8k pages efficiently
161925cf1a30Sjl * is left as an exercise...
162025cf1a30Sjl */
162125cf1a30Sjl
162225cf1a30Sjl/*
162325cf1a30Sjl * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
162425cf1a30Sjl *
162525cf1a30Sjl * General theory of operation:
162625cf1a30Sjl *
162725cf1a30Sjl * The only difference between copy{in,out} and
162825cf1a30Sjl * xcopy{in,out} is in the error handling routine they invoke
162925cf1a30Sjl * when a memory access error occurs. xcopyOP returns the errno
163025cf1a30Sjl * while copyOP returns -1 (see above). copy{in,out}_noerr set
163125cf1a30Sjl * a special flag (by oring the TRAMP_FLAG into the fault handler address)
163225cf1a30Sjl * if they are called with a fault handler already in place. That flag
163325cf1a30Sjl * causes the default handlers to trampoline to the previous handler
163425cf1a30Sjl * upon an error.
163525cf1a30Sjl *
163625cf1a30Sjl * None of the copyops routines grab a window until it's decided that
163725cf1a30Sjl * we need to do a HW block copy operation. This saves a window
163825cf1a30Sjl * spill/fill when we're called during socket ops. The typical IO
163925cf1a30Sjl * path won't cause spill/fill traps.
164025cf1a30Sjl *
164125cf1a30Sjl * This code uses a set of 4 limits for the maximum size that will
164225cf1a30Sjl * be copied given a particular input/output address alignment.
164325cf1a30Sjl * If the value for a particular limit is zero, the copy will be performed
164425cf1a30Sjl * by the plain copy loops rather than FPBLK.
164525cf1a30Sjl *
164625cf1a30Sjl * See the description of bcopy above for more details of the
164725cf1a30Sjl * data copying algorithm and the default limits.
164825cf1a30Sjl *
164925cf1a30Sjl */
165025cf1a30Sjl
165125cf1a30Sjl/*
165225cf1a30Sjl * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
165325cf1a30Sjl */
165425cf1a30Sjl
165525cf1a30Sjl/*
165625cf1a30Sjl * We save the arguments in the following registers in case of a fault:
165725cf1a30Sjl *	kaddr - %l1
165825cf1a30Sjl *	uaddr - %l2
165925cf1a30Sjl *	count - %l3
166025cf1a30Sjl */
166125cf1a30Sjl#define SAVE_SRC	%l1
166225cf1a30Sjl#define SAVE_DST	%l2
166325cf1a30Sjl#define SAVE_COUNT	%l3
166425cf1a30Sjl
166525cf1a30Sjl#define SM_SAVE_SRC		%g4
166625cf1a30Sjl#define SM_SAVE_DST		%g5
166725cf1a30Sjl#define SM_SAVE_COUNT		%o5
166825cf1a30Sjl#define ERRNO		%l5
166925cf1a30Sjl
167025cf1a30Sjl
167125cf1a30Sjl#define REAL_LOFAULT	%l4
167225cf1a30Sjl/*
167325cf1a30Sjl * Generic copyio fault handler.  This is the first line of defense when a
167425cf1a30Sjl * fault occurs in (x)copyin/(x)copyout.  In order for this to function
167525cf1a30Sjl * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
167625cf1a30Sjl * This allows us to share common code for all the flavors of the copy
167725cf1a30Sjl * operations, including the _noerr versions.
167825cf1a30Sjl *
167925cf1a30Sjl * Note that this function will restore the original input parameters before
168025cf1a30Sjl * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
168125cf1a30Sjl * member of the t_copyop structure, if needed.
168225cf1a30Sjl */
168325cf1a30Sjl	ENTRY(copyio_fault)
	! Entered via t_lofault on a fault inside (x)copyin/(x)copyout.
	! %g1 carries the errno; FPUSED_FLAG in %l6 tells us whether FP
	! state must be unwound before vectoring to the routine-specific
	! handler held in REAL_LOFAULT.
168425cf1a30Sjl	membar	#Sync
168525cf1a30Sjl	mov	%g1,ERRNO			! save errno in ERRNO
168625cf1a30Sjl	btst	FPUSED_FLAG, %l6
168725cf1a30Sjl	bz	%ncc, 1f
168825cf1a30Sjl	  nop
168925cf1a30Sjl
169025cf1a30Sjl	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
169125cf1a30Sjl	wr	%o2, 0, %gsr    	! restore gsr
169225cf1a30Sjl
169325cf1a30Sjl	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
169425cf1a30Sjl	btst	FPRS_FEF, %o3
169525cf1a30Sjl	bz,pt	%icc, 4f
169625cf1a30Sjl	  nop
169725cf1a30Sjl
	! FP was live before the copy started: reload the saved quads.
169825cf1a30Sjl	BLD_FPQ2Q4_FROMSTACK(%o2)
169925cf1a30Sjl
170025cf1a30Sjl	ba,pt	%ncc, 1f
170125cf1a30Sjl	  wr	%o3, 0, %fprs   	! restore fprs
170225cf1a30Sjl
170325cf1a30Sjl4:
	! FP was clean before the copy: zero the quads the copy used.
170425cf1a30Sjl	FZEROQ2Q4
170525cf1a30Sjl	wr	%o3, 0, %fprs   	! restore fprs
170625cf1a30Sjl
170725cf1a30Sjl1:
170825cf1a30Sjl	andn	%l6, FPUSED_FLAG, %l6
170925cf1a30Sjl	membar	#Sync
171025cf1a30Sjl	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
171125cf1a30Sjl	FP_ALLOWMIGRATE(5, 6)
171225cf1a30Sjl
	! Hand the original (src, dst, count) back to the real handler so
	! it can vector through t_copyop if needed.
171325cf1a30Sjl	mov	SAVE_SRC, %i0
171425cf1a30Sjl	mov	SAVE_DST, %i1
171525cf1a30Sjl	jmp	REAL_LOFAULT
171625cf1a30Sjl	  mov	SAVE_COUNT, %i2
171725cf1a30Sjl
171825cf1a30Sjl	SET_SIZE(copyio_fault)
171925cf1a30Sjl
172025cf1a30Sjl
172125cf1a30Sjl	ENTRY(copyout)
	! copyout(kaddr [%o0], uaddr [%o1], count [%o2])
	! Copy kernel data out to user space.  Returns 0 on success and -1
	! on fault (per DDI/DKI).  Requests that are small, or whose
	! src/dst relative alignment makes HW copy unprofitable, take the
	! leaf path (.copyout_small); larger aligned requests go to the
	! FP/VIS path (.copyout_more), gated per-alignment by the
	! hw_copy_limit_[1248] tunables (0 disables the HW path).
172225cf1a30Sjl
172325cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
172425cf1a30Sjl	bleu,pt	%ncc, .copyout_small		! go to small copy cases
172525cf1a30Sjl	  xor	%o0, %o1, %o3			! are src, dst alignable?
172625cf1a30Sjl	btst	7, %o3				!
172725cf1a30Sjl	bz,pt	%ncc, .copyout_8		! check for longword alignment
172825cf1a30Sjl	  nop
172925cf1a30Sjl	btst	1, %o3				!
173025cf1a30Sjl	bz,pt	%ncc, .copyout_2		! check for half-word
173125cf1a30Sjl	  nop
173225cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
173325cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
173425cf1a30Sjl	tst	%o3
173525cf1a30Sjl	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
173625cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
173725cf1a30Sjl	bleu,pt	%ncc, .copyout_small		! go to small copy
173825cf1a30Sjl	  nop
173925cf1a30Sjl	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
174025cf1a30Sjl	  nop
174125cf1a30Sjl.copyout_2:
174225cf1a30Sjl	btst	3, %o3				!
174325cf1a30Sjl	bz,pt	%ncc, .copyout_4		! check for word alignment
174425cf1a30Sjl	  nop
174525cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
174625cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
174725cf1a30Sjl	tst	%o3
174825cf1a30Sjl	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
174925cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
175025cf1a30Sjl	bleu,pt	%ncc, .copyout_small		! go to small copy
175125cf1a30Sjl	  nop
175225cf1a30Sjl	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
175325cf1a30Sjl	  nop
175425cf1a30Sjl.copyout_4:
175525cf1a30Sjl	! already checked longword, must be word aligned
175625cf1a30Sjl	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
175725cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
175825cf1a30Sjl	tst	%o3
175925cf1a30Sjl	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
176025cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
176125cf1a30Sjl	bleu,pt	%ncc, .copyout_small		! go to small copy
176225cf1a30Sjl	  nop
176325cf1a30Sjl	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
176425cf1a30Sjl	  nop
176525cf1a30Sjl.copyout_8:
176625cf1a30Sjl	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
176725cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
176825cf1a30Sjl	tst	%o3
176925cf1a30Sjl	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
177025cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
177125cf1a30Sjl	bleu,pt	%ncc, .copyout_small		! go to small copy
177225cf1a30Sjl	  nop
177325cf1a30Sjl	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
177425cf1a30Sjl	  nop
177525cf1a30Sjl
177625cf1a30Sjl	.align	16
177725cf1a30Sjl	nop				! instruction alignment
177825cf1a30Sjl					! see discussion at start of file
	! Leaf-routine path: no register window, no FP.  Install the small
	! fault handler and copy with plain loads plus ASI_USER stores.
177925cf1a30Sjl.copyout_small:
178025cf1a30Sjl	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
178125cf1a30Sjl	or	%o5, %lo(.sm_copyout_err), %o5
178225cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
178325cf1a30Sjl	membar	#Sync				! sync error barrier
178425cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
178525cf1a30Sjl.sm_do_copyout:
178625cf1a30Sjl	mov	%o0, SM_SAVE_SRC
178725cf1a30Sjl	mov	%o1, SM_SAVE_DST
178825cf1a30Sjl	cmp	%o2, SHORTCOPY		! check for really short case
178925cf1a30Sjl	bleu,pt	%ncc, .co_sm_left	!
179025cf1a30Sjl	  mov	%o2, SM_SAVE_COUNT
179125cf1a30Sjl	cmp	%o2, CHKSIZE		! check for medium length cases
179225cf1a30Sjl	bgu,pn	%ncc, .co_med		!
179325cf1a30Sjl	  or	%o0, %o1, %o3		! prepare alignment check
179425cf1a30Sjl	andcc	%o3, 0x3, %g0		! test for alignment
179525cf1a30Sjl	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
179625cf1a30Sjl.co_sm_movebytes:
179725cf1a30Sjl	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
179825cf1a30Sjl.co_sm_notalign4:
179925cf1a30Sjl	ldub	[%o0], %o3		! read byte
180025cf1a30Sjl	subcc	%o2, 4, %o2		! reduce count by 4
180125cf1a30Sjl	stba	%o3, [%o1]ASI_USER	! write byte
180225cf1a30Sjl	inc	%o1			! advance DST by 1
180325cf1a30Sjl	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
180425cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
180525cf1a30Sjl	stba	%o3, [%o1]ASI_USER
180625cf1a30Sjl	inc	%o1			! advance DST by 1
180725cf1a30Sjl	ldub	[%o0 - 2], %o3
180825cf1a30Sjl	stba	%o3, [%o1]ASI_USER
180925cf1a30Sjl	inc	%o1			! advance DST by 1
181025cf1a30Sjl	ldub	[%o0 - 1], %o3
181125cf1a30Sjl	stba	%o3, [%o1]ASI_USER
181225cf1a30Sjl	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
181325cf1a30Sjl	  inc	%o1			! advance DST by 1
181425cf1a30Sjl	add	%o2, 3, %o2		! restore count
	! 0-3 residual bytes
181525cf1a30Sjl.co_sm_left:
181625cf1a30Sjl	tst	%o2
181725cf1a30Sjl	bz,pt	%ncc, .co_sm_exit	! check for zero length
181825cf1a30Sjl	  nop
181925cf1a30Sjl	ldub	[%o0], %o3		! load one byte
182025cf1a30Sjl	deccc	%o2			! reduce count for cc test
182125cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
182225cf1a30Sjl	  stba	%o3,[%o1]ASI_USER	! store one byte
182325cf1a30Sjl	ldub	[%o0 + 1], %o3		! load second byte
182425cf1a30Sjl	deccc	%o2
182525cf1a30Sjl	inc	%o1
182625cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
182725cf1a30Sjl	  stba	%o3,[%o1]ASI_USER	! store second byte
182825cf1a30Sjl	ldub	[%o0 + 2], %o3		! load third byte
182925cf1a30Sjl	inc	%o1
183025cf1a30Sjl	stba	%o3,[%o1]ASI_USER	! store third byte
183125cf1a30Sjl	membar	#Sync				! sync error barrier
183225cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
183325cf1a30Sjl	retl
183425cf1a30Sjl	  mov	%g0, %o0		! return 0
183525cf1a30Sjl	.align	16
183625cf1a30Sjl.co_sm_words:
183725cf1a30Sjl	lduw	[%o0], %o3		! read word
183825cf1a30Sjl.co_sm_wordx:
183925cf1a30Sjl	subcc	%o2, 8, %o2		! update count
184025cf1a30Sjl	stwa	%o3, [%o1]ASI_USER	! write word
184125cf1a30Sjl	add	%o0, 8, %o0		! update SRC
184225cf1a30Sjl	lduw	[%o0 - 4], %o3		! read word
184325cf1a30Sjl	add	%o1, 4, %o1		! update DST
184425cf1a30Sjl	stwa	%o3, [%o1]ASI_USER	! write word
184525cf1a30Sjl	bgt,pt	%ncc, .co_sm_words	! loop til done
184625cf1a30Sjl	  add	%o1, 4, %o1		! update DST
184725cf1a30Sjl	addcc	%o2, 7, %o2		! restore count
184825cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
184925cf1a30Sjl	  nop
185025cf1a30Sjl	deccc	%o2
185125cf1a30Sjl	bz,pt	%ncc, .co_sm_byte
185225cf1a30Sjl.co_sm_half:
185325cf1a30Sjl	  subcc	%o2, 2, %o2		! reduce count by 2
185425cf1a30Sjl	lduh	[%o0], %o3		! read half word
185525cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
185625cf1a30Sjl	stha	%o3, [%o1]ASI_USER	! write half word
185725cf1a30Sjl	bgt,pt	%ncc, .co_sm_half	! loop til done
185825cf1a30Sjl	  add	%o1, 2, %o1		! advance DST by 2
185925cf1a30Sjl	addcc	%o2, 1, %o2		! restore count
186025cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
186125cf1a30Sjl	  nop
186225cf1a30Sjl.co_sm_byte:
186325cf1a30Sjl	ldub	[%o0], %o3
186425cf1a30Sjl	stba	%o3, [%o1]ASI_USER
186525cf1a30Sjl	membar	#Sync				! sync error barrier
186625cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
186725cf1a30Sjl	retl
186825cf1a30Sjl	  mov	%g0, %o0		! return 0
186925cf1a30Sjl	.align 16
187025cf1a30Sjl.co_sm_word:
187125cf1a30Sjl	subcc	%o2, 4, %o2		! update count
187225cf1a30Sjl	bgt,pt	%ncc, .co_sm_wordx
187325cf1a30Sjl	  lduw	[%o0], %o3		! read word
187425cf1a30Sjl	addcc	%o2, 3, %o2		! restore count
187525cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
187625cf1a30Sjl	  stwa	%o3, [%o1]ASI_USER	! write word
187725cf1a30Sjl	deccc	%o2			! reduce count for cc test
187825cf1a30Sjl	ldub	[%o0 + 4], %o3		! load one byte
187925cf1a30Sjl	add	%o1, 4, %o1
188025cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
188125cf1a30Sjl	  stba	%o3, [%o1]ASI_USER	! store one byte
188225cf1a30Sjl	ldub	[%o0 + 5], %o3		! load second byte
188325cf1a30Sjl	deccc	%o2
188425cf1a30Sjl	inc	%o1
188525cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
188625cf1a30Sjl	  stba	%o3, [%o1]ASI_USER	! store second byte
188725cf1a30Sjl	ldub	[%o0 + 6], %o3		! load third byte
188825cf1a30Sjl	inc	%o1
188925cf1a30Sjl	stba	%o3, [%o1]ASI_USER	! store third byte
189025cf1a30Sjl.co_sm_exit:
189125cf1a30Sjl	  membar	#Sync				! sync error barrier
189225cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
189325cf1a30Sjl	retl
189425cf1a30Sjl	  mov	%g0, %o0		! return 0
189525cf1a30Sjl
189625cf1a30Sjl	.align 16
	! Medium-size path (> CHKSIZE bytes): byte-align the source, then
	! move in the widest unit the relative alignment allows.
189725cf1a30Sjl.co_med:
189825cf1a30Sjl	xor	%o0, %o1, %o3		! setup alignment check
189925cf1a30Sjl	btst	1, %o3
190025cf1a30Sjl	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
190125cf1a30Sjl	  nop
190225cf1a30Sjl	btst	3, %o3
190325cf1a30Sjl	bnz,pt	%ncc, .co_med_half	! halfword aligned
190425cf1a30Sjl	  nop
190525cf1a30Sjl	btst	7, %o3
190625cf1a30Sjl	bnz,pt	%ncc, .co_med_word	! word aligned
190725cf1a30Sjl	  nop
190825cf1a30Sjl.co_med_long:
190925cf1a30Sjl	btst	3, %o0			! check for
191025cf1a30Sjl	bz,pt	%ncc, .co_med_long1	! word alignment
191125cf1a30Sjl	  nop
191225cf1a30Sjl.co_med_long0:
191325cf1a30Sjl	ldub	[%o0], %o3		! load one byte
191425cf1a30Sjl	inc	%o0
191525cf1a30Sjl	stba	%o3,[%o1]ASI_USER	! store byte
191625cf1a30Sjl	inc	%o1
191725cf1a30Sjl	btst	3, %o0
191825cf1a30Sjl	bnz,pt	%ncc, .co_med_long0
191925cf1a30Sjl	  dec	%o2
192025cf1a30Sjl.co_med_long1:			! word aligned
192125cf1a30Sjl	btst	7, %o0			! check for long word
192225cf1a30Sjl	bz,pt	%ncc, .co_med_long2
192325cf1a30Sjl	  nop
192425cf1a30Sjl	lduw	[%o0], %o3		! load word
192525cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
192625cf1a30Sjl	stwa	%o3, [%o1]ASI_USER	! store word
192725cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
192825cf1a30Sjl	sub	%o2, 4, %o2		! reduce count by 4
192925cf1a30Sjl!
193025cf1a30Sjl!  Now long word aligned and have at least 32 bytes to move
193125cf1a30Sjl!
193225cf1a30Sjl.co_med_long2:
193325cf1a30Sjl	sub	%o2, 31, %o2		! adjust count to allow cc zero test
193425cf1a30Sjl	sub	%o1, 8, %o1		! adjust pointer to allow store in
193525cf1a30Sjl					! branch delay slot instead of add
193625cf1a30Sjl.co_med_lmove:
193725cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
193825cf1a30Sjl	ldx	[%o0], %o3		! read long word
193925cf1a30Sjl	subcc	%o2, 32, %o2		! reduce count by 32
194025cf1a30Sjl	stxa	%o3, [%o1]ASI_USER	! write long word
194125cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
194225cf1a30Sjl	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
194325cf1a30Sjl	add	%o0, 32, %o0		! advance SRC by 32
194425cf1a30Sjl	stxa	%o3, [%o1]ASI_USER
194525cf1a30Sjl	ldx	[%o0 - 16], %o3
194625cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
194725cf1a30Sjl	stxa	%o3, [%o1]ASI_USER
194825cf1a30Sjl	ldx	[%o0 - 8], %o3
194925cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
195025cf1a30Sjl	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
195125cf1a30Sjl	  stxa	%o3, [%o1]ASI_USER
195225cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
195325cf1a30Sjl	addcc	%o2, 24, %o2		! restore count to long word offset
195425cf1a30Sjl	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
195525cf1a30Sjl	  nop
195625cf1a30Sjl.co_med_lword:
195725cf1a30Sjl	ldx	[%o0], %o3		! read long word
195825cf1a30Sjl	subcc	%o2, 8, %o2		! reduce count by 8
195925cf1a30Sjl	stxa	%o3, [%o1]ASI_USER	! write long word
196025cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
196125cf1a30Sjl	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
196225cf1a30Sjl	  add	%o1, 8, %o1		! advance DST by 8
196325cf1a30Sjl.co_med_lextra:
196425cf1a30Sjl	addcc	%o2, 7, %o2		! restore rest of count
196525cf1a30Sjl	bz,pt	%ncc, .co_sm_exit	! if zero, then done
196625cf1a30Sjl	  deccc	%o2
196725cf1a30Sjl	bz,pt	%ncc, .co_sm_byte
196825cf1a30Sjl	  nop
196925cf1a30Sjl	ba,pt	%ncc, .co_sm_half
197025cf1a30Sjl	  nop
197125cf1a30Sjl
197225cf1a30Sjl	.align 16
197325cf1a30Sjl	nop				! instruction alignment
197425cf1a30Sjl					! see discussion at start of file
197525cf1a30Sjl.co_med_word:
197625cf1a30Sjl	btst	3, %o0			! check for
197725cf1a30Sjl	bz,pt	%ncc, .co_med_word1	! word alignment
197825cf1a30Sjl	  nop
197925cf1a30Sjl.co_med_word0:
198025cf1a30Sjl	ldub	[%o0], %o3		! load one byte
198125cf1a30Sjl	inc	%o0
198225cf1a30Sjl	stba	%o3,[%o1]ASI_USER	! store byte
198325cf1a30Sjl	inc	%o1
198425cf1a30Sjl	btst	3, %o0
198525cf1a30Sjl	bnz,pt	%ncc, .co_med_word0
198625cf1a30Sjl	  dec	%o2
198725cf1a30Sjl!
198825cf1a30Sjl!  Now word aligned and have at least 36 bytes to move
198925cf1a30Sjl!
199025cf1a30Sjl.co_med_word1:
199125cf1a30Sjl	sub	%o2, 15, %o2		! adjust count to allow cc zero test
199225cf1a30Sjl.co_med_wmove:
199325cf1a30Sjl	lduw	[%o0], %o3		! read word
199425cf1a30Sjl	subcc	%o2, 16, %o2		! reduce count by 16
199525cf1a30Sjl	stwa	%o3, [%o1]ASI_USER	! write word
199625cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
199725cf1a30Sjl	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
199825cf1a30Sjl	add	%o0, 16, %o0		! advance SRC by 16
199925cf1a30Sjl	stwa	%o3, [%o1]ASI_USER
200025cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
200125cf1a30Sjl	lduw	[%o0 - 8], %o3
200225cf1a30Sjl	stwa	%o3, [%o1]ASI_USER
200325cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
200425cf1a30Sjl	lduw	[%o0 - 4], %o3
200525cf1a30Sjl	stwa	%o3, [%o1]ASI_USER
200625cf1a30Sjl	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
200725cf1a30Sjl	  add	%o1, 4, %o1		! advance DST by 4
200825cf1a30Sjl	addcc	%o2, 12, %o2		! restore count to word offset
200925cf1a30Sjl	ble,pt	%ncc, .co_med_wextra	! check for more words to move
201025cf1a30Sjl	  nop
201125cf1a30Sjl.co_med_word2:
201225cf1a30Sjl	lduw	[%o0], %o3		! read word
201325cf1a30Sjl	subcc	%o2, 4, %o2		! reduce count by 4
201425cf1a30Sjl	stwa	%o3, [%o1]ASI_USER	! write word
201525cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
201625cf1a30Sjl	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
201725cf1a30Sjl	  add	%o1, 4, %o1		! advance DST by 4
201825cf1a30Sjl.co_med_wextra:
201925cf1a30Sjl	addcc	%o2, 3, %o2		! restore rest of count
202025cf1a30Sjl	bz,pt	%ncc, .co_sm_exit	! if zero, then done
202125cf1a30Sjl	  deccc	%o2
202225cf1a30Sjl	bz,pt	%ncc, .co_sm_byte
202325cf1a30Sjl	  nop
202425cf1a30Sjl	ba,pt	%ncc, .co_sm_half
202525cf1a30Sjl	  nop
202625cf1a30Sjl
202725cf1a30Sjl	.align 16
202825cf1a30Sjl	nop				! instruction alignment
202925cf1a30Sjl	nop				! see discussion at start of file
203025cf1a30Sjl	nop
203125cf1a30Sjl.co_med_half:
203225cf1a30Sjl	btst	1, %o0			! check for
203325cf1a30Sjl	bz,pt	%ncc, .co_med_half1	! half word alignment
203425cf1a30Sjl	  nop
203525cf1a30Sjl	ldub	[%o0], %o3		! load one byte
203625cf1a30Sjl	inc	%o0
203725cf1a30Sjl	stba	%o3,[%o1]ASI_USER	! store byte
203825cf1a30Sjl	inc	%o1
203925cf1a30Sjl	dec	%o2
204025cf1a30Sjl!
204125cf1a30Sjl!  Now half word aligned and have at least 38 bytes to move
204225cf1a30Sjl!
204325cf1a30Sjl.co_med_half1:
204425cf1a30Sjl	sub	%o2, 7, %o2		! adjust count to allow cc zero test
204525cf1a30Sjl.co_med_hmove:
204625cf1a30Sjl	lduh	[%o0], %o3		! read half word
204725cf1a30Sjl	subcc	%o2, 8, %o2		! reduce count by 8
204825cf1a30Sjl	stha	%o3, [%o1]ASI_USER	! write half word
204925cf1a30Sjl	add	%o1, 2, %o1		! advance DST by 2
205025cf1a30Sjl	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
205125cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
205225cf1a30Sjl	stha	%o3, [%o1]ASI_USER
205325cf1a30Sjl	add	%o1, 2, %o1		! advance DST by 2
205425cf1a30Sjl	lduh	[%o0 - 4], %o3
205525cf1a30Sjl	stha	%o3, [%o1]ASI_USER
205625cf1a30Sjl	add	%o1, 2, %o1		! advance DST by 2
205725cf1a30Sjl	lduh	[%o0 - 2], %o3
205825cf1a30Sjl	stha	%o3, [%o1]ASI_USER
205925cf1a30Sjl	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
206025cf1a30Sjl	  add	%o1, 2, %o1		! advance DST by 2
206125cf1a30Sjl	addcc	%o2, 7, %o2		! restore count
206225cf1a30Sjl	bz,pt	%ncc, .co_sm_exit
206325cf1a30Sjl	  deccc	%o2
206425cf1a30Sjl	bz,pt	%ncc, .co_sm_byte
206525cf1a30Sjl	  nop
206625cf1a30Sjl	ba,pt	%ncc, .co_sm_half
206725cf1a30Sjl	  nop
206825cf1a30Sjl
206925cf1a30Sjl/*
207025cf1a30Sjl * We got here because of a fault during short copyout.
207125cf1a30Sjl * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
207225cf1a30Sjl */
207325cf1a30Sjl.sm_copyout_err:
207425cf1a30Sjl	membar	#Sync
207525cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
207625cf1a30Sjl	mov	SM_SAVE_SRC, %o0
207725cf1a30Sjl	mov	SM_SAVE_DST, %o1
207825cf1a30Sjl	mov	SM_SAVE_COUNT, %o2
207925cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
208025cf1a30Sjl	tst	%o3
208125cf1a30Sjl	bz,pt	%ncc, 3f			! if not, return error
208225cf1a30Sjl	  nop
208325cf1a30Sjl	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
208425cf1a30Sjl	jmp	%o5				! original arguments
208525cf1a30Sjl	  nop
208625cf1a30Sjl3:
208725cf1a30Sjl	retl
208825cf1a30Sjl	  or	%g0, -1, %o0		! return error value
208925cf1a30Sjl
209025cf1a30Sjl	SET_SIZE(copyout)
209125cf1a30Sjl
209225cf1a30Sjl/*
209325cf1a30Sjl * The _more entry points are not intended to be used directly by
209425cf1a30Sjl * any caller from outside this file.  They are provided to allow
209525cf1a30Sjl * profiling and dtrace of the portions of the copy code that uses
209625cf1a30Sjl * the floating point registers.
209725cf1a30Sjl * This entry is particularly important as DTRACE (at least as of
209825cf1a30Sjl * 4/2004) does not support leaf functions.
209925cf1a30Sjl */
210025cf1a30Sjl
210125cf1a30Sjl	ENTRY(copyout_more)
210225cf1a30Sjl.copyout_more:
	!
	! Non-leaf entry for copyouts larger than VIS_COPY_THRESHOLD.
	! Acquires a register window, installs .copyout_err as the final
	! fault handler, then falls into the shared FP/VIS block-copy
	! engine at .do_copyout (also entered from .xcopyout_more).
	!
210325cf1a30Sjl	prefetch [%o0], #n_reads
210425cf1a30Sjl	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
210525cf1a30Sjl	set	.copyout_err, REAL_LOFAULT	! final error handler
210625cf1a30Sjl
210725cf1a30Sjl/*
210825cf1a30Sjl * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
210925cf1a30Sjl */
211025cf1a30Sjl.do_copyout:
211125cf1a30Sjl        set     copyio_fault, %l7		! copyio_fault is lofault val
211225cf1a30Sjl
211325cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
211425cf1a30Sjl	membar	#Sync				! sync error barrier
211525cf1a30Sjl	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
211625cf1a30Sjl
	! Preserve the original arguments for a possible copyops retry
	! out of .copyout_err.
211725cf1a30Sjl	mov	%i0, SAVE_SRC
211825cf1a30Sjl	mov	%i1, SAVE_DST
211925cf1a30Sjl	mov	%i2, SAVE_COUNT
212025cf1a30Sjl
	! Keep the thread on this CPU while the FP registers are live
	! (paired with FP_ALLOWMIGRATE in .copyout_exit).
212125cf1a30Sjl	FP_NOMIGRATE(6, 7)
212225cf1a30Sjl
212325cf1a30Sjl	rd	%fprs, %o2		! check for unused fp
212425cf1a30Sjl	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
212525cf1a30Sjl	btst	FPRS_FEF, %o2
212625cf1a30Sjl	bz,a,pt	%icc, .do_blockcopyout
212725cf1a30Sjl	  wr	%g0, FPRS_FEF, %fprs
212825cf1a30Sjl
	! FP was in use: save the FP registers we clobber (%f16-%f62)
	! on the stack; .copyout_exit restores them.
212925cf1a30Sjl	BST_FPQ2Q4_TOSTACK(%o2)
213025cf1a30Sjl
213125cf1a30Sjl.do_blockcopyout:
213225cf1a30Sjl	rd	%gsr, %o2
213325cf1a30Sjl	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
213425cf1a30Sjl	or	%l6, FPUSED_FLAG, %l6	! remember FP regs are in use
213525cf1a30Sjl
	! Byte-copy until DST is aligned to VIS_BLOCKSIZE.
213625cf1a30Sjl	andcc	DST, VIS_BLOCKSIZE - 1, TMP
213725cf1a30Sjl	mov	ASI_USER, %asi		! user-space stores go via %asi
213825cf1a30Sjl	bz,pt	%ncc, 2f
213925cf1a30Sjl	  neg	TMP
214025cf1a30Sjl	add	TMP, VIS_BLOCKSIZE, TMP
214125cf1a30Sjl
214225cf1a30Sjl	! TMP = bytes required to align DST on FP_BLOCK boundary
214325cf1a30Sjl	! Using SRC as a tmp here
214425cf1a30Sjl	cmp	TMP, 3
214525cf1a30Sjl	bleu,pt	%ncc, 1f
214625cf1a30Sjl	  sub	CNT,TMP,CNT		! adjust main count
214725cf1a30Sjl	sub	TMP, 3, TMP		! adjust for end of loop test
214825cf1a30Sjl.co_blkalign:
214925cf1a30Sjl	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
215025cf1a30Sjl	stba	SRC, [DST]%asi
215125cf1a30Sjl	subcc	TMP, 4, TMP
215225cf1a30Sjl	ldub	[REALSRC + 1], SRC
215325cf1a30Sjl	add	REALSRC, 4, REALSRC
215425cf1a30Sjl	stba	SRC, [DST + 1]%asi
215525cf1a30Sjl	ldub	[REALSRC - 2], SRC
215625cf1a30Sjl	add	DST, 4, DST
215725cf1a30Sjl	stba	SRC, [DST - 2]%asi
215825cf1a30Sjl	ldub	[REALSRC - 1], SRC
215925cf1a30Sjl	bgu,pt	%ncc, .co_blkalign
216025cf1a30Sjl	  stba	SRC, [DST - 1]%asi
216125cf1a30Sjl
216225cf1a30Sjl	addcc	TMP, 3, TMP		! restore count adjustment
216325cf1a30Sjl	bz,pt	%ncc, 2f		! no bytes left?
216425cf1a30Sjl	  nop
216525cf1a30Sjl1:	ldub	[REALSRC], SRC		! move last 1-3 alignment bytes
216625cf1a30Sjl	inc	REALSRC
216725cf1a30Sjl	inc	DST
216825cf1a30Sjl	deccc	TMP
216925cf1a30Sjl	bgu	%ncc, 1b
217025cf1a30Sjl	  stba	SRC, [DST - 1]%asi
217125cf1a30Sjl
217225cf1a30Sjl2:
	! Prime the faligndata pipeline: load the first 64-byte block
	! through the 8-byte-aligned stream (SRC) and precompute
	! %f48-%f58 so the main loop can overlap loads, aligns and
	! block stores.
217325cf1a30Sjl	membar	#StoreLoad
217425cf1a30Sjl	andn	REALSRC, 0x7, SRC
217525cf1a30Sjl
217625cf1a30Sjl	! SRC - 8-byte aligned
217725cf1a30Sjl	! DST - 64-byte aligned
217825cf1a30Sjl	ldd	[SRC], %f16
217925cf1a30Sjl	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
218025cf1a30Sjl	alignaddr REALSRC, %g0, %g0	! set %gsr align from REALSRC
218125cf1a30Sjl	ldd	[SRC + 0x08], %f18
218225cf1a30Sjl	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
218325cf1a30Sjl	faligndata %f16, %f18, %f48
218425cf1a30Sjl	ldd	[SRC + 0x10], %f20
2185c8a722abSpm	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
218625cf1a30Sjl	faligndata %f18, %f20, %f50
218725cf1a30Sjl	ldd	[SRC + 0x18], %f22
218825cf1a30Sjl	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
218925cf1a30Sjl	faligndata %f20, %f22, %f52
219025cf1a30Sjl	ldd	[SRC + 0x20], %f24
2191c8a722abSpm	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
219225cf1a30Sjl	faligndata %f22, %f24, %f54
219325cf1a30Sjl	ldd	[SRC + 0x28], %f26
2194c8a722abSpm	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
219525cf1a30Sjl	faligndata %f24, %f26, %f56
219625cf1a30Sjl	ldd	[SRC + 0x30], %f28
2197c8a722abSpm	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
219825cf1a30Sjl	faligndata %f26, %f28, %f58
219925cf1a30Sjl	ldd	[SRC + 0x38], %f30
220025cf1a30Sjl	ldd	[SRC + VIS_BLOCKSIZE], %f16
220125cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
220225cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
2203c8a722abSpm	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
220425cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
220525cf1a30Sjl	ba,pt	%ncc, 1f
2206c8a722abSpm	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
220725cf1a30Sjl	.align	32
220825cf1a30Sjl1:
	! Main loop: one 64-byte block per iteration.  The aligned
	! stream runs 8 bytes ahead so %f16 always holds the next
	! doubleword; the aligned output block %f48-%f62 is stored to
	! user space with a block-store ASI.
220925cf1a30Sjl	ldd	[SRC + 0x08], %f18
221025cf1a30Sjl	faligndata %f28, %f30, %f60
221125cf1a30Sjl	ldd	[SRC + 0x10], %f20
221225cf1a30Sjl	faligndata %f30, %f16, %f62
221325cf1a30Sjl	stda	%f48, [DST]ASI_BLK_AIUS
221425cf1a30Sjl	ldd	[SRC + 0x18], %f22
221525cf1a30Sjl	faligndata %f16, %f18, %f48
221625cf1a30Sjl	ldd	[SRC + 0x20], %f24
221725cf1a30Sjl	faligndata %f18, %f20, %f50
221825cf1a30Sjl	ldd	[SRC + 0x28], %f26
221925cf1a30Sjl	faligndata %f20, %f22, %f52
222025cf1a30Sjl	ldd	[SRC + 0x30], %f28
222125cf1a30Sjl	faligndata %f22, %f24, %f54
2222c8a722abSpm	sub	CNT, VIS_BLOCKSIZE, CNT
222325cf1a30Sjl	ldd	[SRC + 0x38], %f30
222425cf1a30Sjl	faligndata %f24, %f26, %f56
2225c8a722abSpm	add	DST, VIS_BLOCKSIZE, DST
222625cf1a30Sjl	ldd	[SRC + VIS_BLOCKSIZE], %f16
222725cf1a30Sjl	faligndata %f26, %f28, %f58
222825cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2229c8a722abSpm	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2230c8a722abSpm	add	SRC, VIS_BLOCKSIZE, SRC
2231c8a722abSpm	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
223225cf1a30Sjl	cmp	CNT, VIS_BLOCKSIZE + 8
223325cf1a30Sjl	bgu,pt	%ncc, 1b
2234c8a722abSpm	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
223525cf1a30Sjl
223625cf1a30Sjl	! only if REALSRC & 0x7 is 0
223725cf1a30Sjl	cmp	CNT, VIS_BLOCKSIZE
223825cf1a30Sjl	bne	%ncc, 3f
223925cf1a30Sjl	  andcc	REALSRC, 0x7, %g0
224025cf1a30Sjl	bz,pt	%ncc, 2f
224125cf1a30Sjl	  nop
224225cf1a30Sjl3:
	! Unaligned source (or CNT != VIS_BLOCKSIZE): finish the
	! pipelined block already in %f48-%f62 and store it.
224325cf1a30Sjl	faligndata %f28, %f30, %f60
224425cf1a30Sjl	faligndata %f30, %f16, %f62
224525cf1a30Sjl	stda	%f48, [DST]ASI_BLK_AIUS
224625cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
224725cf1a30Sjl	ba,pt	%ncc, 3f
224825cf1a30Sjl	  nop
224925cf1a30Sjl2:
	! Source is 8-byte aligned and exactly one block remains:
	! drain the pipeline and copy the final block with fsrc1
	! moves (no faligndata needed), storing two blocks in total.
225025cf1a30Sjl	ldd	[SRC + 0x08], %f18
225125cf1a30Sjl	fsrc1	%f28, %f60
225225cf1a30Sjl	ldd	[SRC + 0x10], %f20
225325cf1a30Sjl	fsrc1	%f30, %f62
225425cf1a30Sjl	stda	%f48, [DST]ASI_BLK_AIUS
225525cf1a30Sjl	ldd	[SRC + 0x18], %f22
225625cf1a30Sjl	fsrc1	%f16, %f48
225725cf1a30Sjl	ldd	[SRC + 0x20], %f24
225825cf1a30Sjl	fsrc1	%f18, %f50
225925cf1a30Sjl	ldd	[SRC + 0x28], %f26
226025cf1a30Sjl	fsrc1	%f20, %f52
226125cf1a30Sjl	ldd	[SRC + 0x30], %f28
226225cf1a30Sjl	fsrc1	%f22, %f54
226325cf1a30Sjl	ldd	[SRC + 0x38], %f30
226425cf1a30Sjl	fsrc1	%f24, %f56
226525cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
226625cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
226725cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
226825cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
226925cf1a30Sjl	fsrc1	%f26, %f58
227025cf1a30Sjl	fsrc1	%f28, %f60
227125cf1a30Sjl	fsrc1	%f30, %f62
227225cf1a30Sjl	stda	%f48, [DST]ASI_BLK_AIUS
227325cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
227425cf1a30Sjl	ba,a,pt	%ncc, 4f
227525cf1a30Sjl	  nop
227625cf1a30Sjl
227725cf1a30Sjl3:	tst	CNT			! any sub-block residue left?
227825cf1a30Sjl	bz,a	%ncc, 4f
227925cf1a30Sjl	  nop
228025cf1a30Sjl
228125cf1a30Sjl5:	ldub	[REALSRC], TMP		! copy remaining bytes one at a time
228225cf1a30Sjl	inc	REALSRC
228325cf1a30Sjl	inc	DST
228425cf1a30Sjl	deccc	CNT
228525cf1a30Sjl	bgu	%ncc, 5b
228625cf1a30Sjl	  stba	TMP, [DST - 1]%asi
228725cf1a30Sjl4:
228825cf1a30Sjl
228925cf1a30Sjl.copyout_exit:
229025cf1a30Sjl	membar	#Sync
229125cf1a30Sjl
	! Restore FP state: %gsr always; then either reload the
	! caller's FP registers from the stack (FP was in use on
	! entry) or zero the quadrants we dirtied.
229225cf1a30Sjl	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
229325cf1a30Sjl	wr	%o2, 0, %gsr		! restore gsr
229425cf1a30Sjl
229525cf1a30Sjl	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
229625cf1a30Sjl	btst	FPRS_FEF, %o3
229725cf1a30Sjl	bz,pt	%icc, 4f
229825cf1a30Sjl	  nop
229925cf1a30Sjl
230025cf1a30Sjl	BLD_FPQ2Q4_FROMSTACK(%o2)	! reload caller's FP regs
230125cf1a30Sjl
230225cf1a30Sjl	ba,pt	%ncc, 1f
230325cf1a30Sjl	  wr	%o3, 0, %fprs		! restore fprs
230425cf1a30Sjl
230525cf1a30Sjl4:
230625cf1a30Sjl	FZEROQ2Q4			! FP was unused: scrub our regs
230725cf1a30Sjl	wr	%o3, 0, %fprs		! restore fprs
230825cf1a30Sjl
230925cf1a30Sjl1:
231025cf1a30Sjl	membar	#Sync
231125cf1a30Sjl	andn	%l6, FPUSED_FLAG, %l6
231225cf1a30Sjl	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
231325cf1a30Sjl	FP_ALLOWMIGRATE(5, 6)
231425cf1a30Sjl	ret
231525cf1a30Sjl	  restore	%g0, 0, %o0	! return 0 (success)
231625cf1a30Sjl
231725cf1a30Sjl/*
231825cf1a30Sjl * We got here because of a fault during copyout.
231925cf1a30Sjl * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
232025cf1a30Sjl */
232125cf1a30Sjl.copyout_err:
232225cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
232325cf1a30Sjl	tst	%o4
232425cf1a30Sjl	bz,pt	%ncc, 2f			! if not, return error
232525cf1a30Sjl	  nop
232625cf1a30Sjl	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
232725cf1a30Sjl	jmp	%g2				! original arguments
232825cf1a30Sjl	  restore %g0, 0, %g0			! dispose of copy window
232925cf1a30Sjl2:
233025cf1a30Sjl        ret
233125cf1a30Sjl	  restore %g0, -1, %o0			! return error value
233225cf1a30Sjl
233325cf1a30Sjl
233425cf1a30Sjl	SET_SIZE(copyout_more)
233525cf1a30Sjl
233625cf1a30Sjl
233725cf1a30Sjl	ENTRY(xcopyout)
	!
	! xcopyout differs from copyout only in its error return: on a
	! fault it returns the errno value (see .xcopyout_err and
	! .sm_xcopyout_err below) rather than -1.  The dispatch is the
	! same: short copies use the leaf small-copy path, longer ones
	! the FP/VIS path, gated by the per-alignment hw_copy_limit_N
	! tunables (0 disables the HW path entirely).
	!
233825cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
233925cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small		! go to small copy case
234025cf1a30Sjl	  xor	%o0, %o1, %o3			! are src, dst alignable?
234125cf1a30Sjl	btst	7, %o3				!
234225cf1a30Sjl	bz,pt	%ncc, .xcopyout_8		! check for longword alignment
234325cf1a30Sjl	  nop
234425cf1a30Sjl	btst	1, %o3				!
234525cf1a30Sjl	bz,pt	%ncc, .xcopyout_2		! check for half-word
234625cf1a30Sjl	  nop
234725cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
234825cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
234925cf1a30Sjl	tst	%o3
235025cf1a30Sjl	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
235125cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
235225cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small		! go to small copy
235325cf1a30Sjl	  nop
235425cf1a30Sjl	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
235525cf1a30Sjl	  nop
235625cf1a30Sjl.xcopyout_2:
235725cf1a30Sjl	btst	3, %o3				!
235825cf1a30Sjl	bz,pt	%ncc, .xcopyout_4		! check for word alignment
235925cf1a30Sjl	  nop
236025cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
236125cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
236225cf1a30Sjl	tst	%o3
236325cf1a30Sjl	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
236425cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
236525cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small		! go to small copy
236625cf1a30Sjl	  nop
236725cf1a30Sjl	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
236825cf1a30Sjl	  nop
236925cf1a30Sjl.xcopyout_4:
237025cf1a30Sjl	! already checked longword, must be word aligned
237125cf1a30Sjl	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
237225cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
237325cf1a30Sjl	tst	%o3
237425cf1a30Sjl	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
237525cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
237625cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small		! go to small copy
237725cf1a30Sjl	  nop
237825cf1a30Sjl	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
237925cf1a30Sjl	  nop
238025cf1a30Sjl.xcopyout_8:
238125cf1a30Sjl	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
238225cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
238325cf1a30Sjl	tst	%o3
238425cf1a30Sjl	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
238525cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
238625cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small		! go to small copy
238725cf1a30Sjl	  nop
238825cf1a30Sjl	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
238925cf1a30Sjl	  nop
239025cf1a30Sjl
239125cf1a30Sjl.xcopyout_small:
	! Leaf path: install .sm_xcopyout_err as lofault, then share
	! the copyout small-copy body.
239225cf1a30Sjl	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
239325cf1a30Sjl	or	%o5, %lo(.sm_xcopyout_err), %o5
239425cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
239525cf1a30Sjl	membar	#Sync				! sync error barrier
239625cf1a30Sjl	ba,pt	%ncc, .sm_do_copyout		! common code
239725cf1a30Sjl	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
239825cf1a30Sjl
239925cf1a30Sjl.xcopyout_more:
	! FP path: like .copyout_more but with .xcopyout_err as the
	! final fault handler (errno return instead of -1).
240025cf1a30Sjl	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
240125cf1a30Sjl	sethi	%hi(.xcopyout_err), REAL_LOFAULT
240225cf1a30Sjl	ba,pt	%ncc, .do_copyout		! common code
240325cf1a30Sjl	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
240425cf1a30Sjl
240525cf1a30Sjl/*
240625cf1a30Sjl * We got here because of a fault during xcopyout.
240725cf1a30Sjl * Errno value is in ERRNO
240825cf1a30Sjl */
240925cf1a30Sjl.xcopyout_err:
241025cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
241125cf1a30Sjl	tst	%o4
241225cf1a30Sjl	bz,pt	%ncc, 2f			! if not, return error
241325cf1a30Sjl	  nop
241425cf1a30Sjl	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
241525cf1a30Sjl	jmp	%g2				! original arguments
241625cf1a30Sjl	  restore %g0, 0, %g0			! dispose of copy window
241725cf1a30Sjl2:
241825cf1a30Sjl        ret
241925cf1a30Sjl	  restore ERRNO, 0, %o0			! return errno value
242025cf1a30Sjl
242125cf1a30Sjl.sm_xcopyout_err:
	! Fault during short xcopyout: restore t_lofault, reload the
	! saved arguments, then retry via t_copyops or return errno.
242225cf1a30Sjl
242325cf1a30Sjl	membar	#Sync
242425cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
242525cf1a30Sjl	mov	SM_SAVE_SRC, %o0
242625cf1a30Sjl	mov	SM_SAVE_DST, %o1
242725cf1a30Sjl	mov	SM_SAVE_COUNT, %o2
242825cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
242925cf1a30Sjl	tst	%o3
243025cf1a30Sjl	bz,pt	%ncc, 3f			! if not, return error
243125cf1a30Sjl	  nop
243225cf1a30Sjl	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
243325cf1a30Sjl	jmp	%o5				! original arguments
243425cf1a30Sjl	  nop
243525cf1a30Sjl3:
243625cf1a30Sjl	retl
243725cf1a30Sjl	  or	%g1, 0, %o0		! return errno value (in %g1)
243825cf1a30Sjl
243925cf1a30Sjl	SET_SIZE(xcopyout)
244025cf1a30Sjl
244125cf1a30Sjl	ENTRY(xcopyout_little)
	!
	! Byte-at-a-time copy to user space through the little-endian
	! user ASI (ASI_AIUSL).  The source is read from its last byte
	! down to its first while the destination is written from its
	! first byte up, so the buffer's byte order is reversed
	! relative to a plain copy.  Returns 0 on success; a fault
	! transfers to .xcopyio_err via t_lofault.
	!
244225cf1a30Sjl	sethi	%hi(.xcopyio_err), %o5
244325cf1a30Sjl	or	%o5, %lo(.xcopyio_err), %o5
244425cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4
244525cf1a30Sjl	membar	#Sync				! sync error barrier
244625cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]
244725cf1a30Sjl	mov	%o4, %o5	! keep old t_lofault for the exit path
244825cf1a30Sjl
244925cf1a30Sjl	subcc	%g0, %o2, %o3	! %o3 = -count, used as the loop index
245025cf1a30Sjl	add	%o0, %o2, %o0
245125cf1a30Sjl	bz,pn	%ncc, 2f		! check for zero bytes
245225cf1a30Sjl	  sub	%o2, 1, %o4
245325cf1a30Sjl	add	%o0, %o4, %o0		! start w/last byte
245425cf1a30Sjl	add	%o1, %o2, %o1
245525cf1a30Sjl	ldub	[%o0 + %o3], %o4
245625cf1a30Sjl
	! Loop effect: dst[k] = src[count - 1 - k]; exits when the
	! index increment carries (i.e. %o3 reaches zero).
245725cf1a30Sjl1:	stba	%o4, [%o1 + %o3]ASI_AIUSL
245825cf1a30Sjl	inccc	%o3
245925cf1a30Sjl	sub	%o0, 2, %o0		! get next byte
246025cf1a30Sjl	bcc,a,pt %ncc, 1b
246125cf1a30Sjl	  ldub	[%o0 + %o3], %o4
246225cf1a30Sjl
246325cf1a30Sjl2:
246425cf1a30Sjl	membar	#Sync				! sync error barrier
246525cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
246625cf1a30Sjl	retl
246725cf1a30Sjl	  mov	%g0, %o0		! return (0)
246825cf1a30Sjl
246925cf1a30Sjl	SET_SIZE(xcopyout_little)
247025cf1a30Sjl
247125cf1a30Sjl/*
247225cf1a30Sjl * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
247325cf1a30Sjl */
247425cf1a30Sjl
247525cf1a30Sjl	ENTRY(copyin)
	!
	! Copy user data (%o0) to kernel space (%o1), %o2 bytes.
	! Dispatch mirrors copyout: copies of at most
	! VIS_COPY_THRESHOLD bytes use the leaf small-copy code below;
	! larger copies use the FP/VIS path (copyin_more) unless the
	! per-alignment tunable hw_copy_limit_N is zero (HW copy
	! disabled) or the length is within that limit, in which case
	! the small path is used as well.  Returns 0 on success; on a
	! fault, .sm_copyin_err either retries through an installed
	! t_copyops handler or returns -1 per DDI/DKI.
	!
247625cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
247725cf1a30Sjl	bleu,pt	%ncc, .copyin_small		! go to small copy case
247825cf1a30Sjl	  xor	%o0, %o1, %o3			! are src, dst alignable?
247925cf1a30Sjl	btst	7, %o3				!
248025cf1a30Sjl	bz,pt	%ncc, .copyin_8			! check for longword alignment
248125cf1a30Sjl	  nop
248225cf1a30Sjl	btst	1, %o3				!
248325cf1a30Sjl	bz,pt	%ncc, .copyin_2			! check for half-word
248425cf1a30Sjl	  nop
248525cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
248625cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
248725cf1a30Sjl	tst	%o3
248825cf1a30Sjl	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
248925cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
249025cf1a30Sjl	bleu,pt	%ncc, .copyin_small		! go to small copy
249125cf1a30Sjl	  nop
249225cf1a30Sjl	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
249325cf1a30Sjl	  nop
249425cf1a30Sjl.copyin_2:
249525cf1a30Sjl	btst	3, %o3				!
249625cf1a30Sjl	bz,pt	%ncc, .copyin_4			! check for word alignment
249725cf1a30Sjl	  nop
249825cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
249925cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
250025cf1a30Sjl	tst	%o3
250125cf1a30Sjl	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
250225cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
250325cf1a30Sjl	bleu,pt	%ncc, .copyin_small		! go to small copy
250425cf1a30Sjl	  nop
250525cf1a30Sjl	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
250625cf1a30Sjl	  nop
250725cf1a30Sjl.copyin_4:
250825cf1a30Sjl	! already checked longword, must be word aligned
250925cf1a30Sjl	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
251025cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
251125cf1a30Sjl	tst	%o3
251225cf1a30Sjl	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
251325cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
251425cf1a30Sjl	bleu,pt	%ncc, .copyin_small		! go to small copy
251525cf1a30Sjl	  nop
251625cf1a30Sjl	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
251725cf1a30Sjl	  nop
251825cf1a30Sjl.copyin_8:
251925cf1a30Sjl	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
252025cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
252125cf1a30Sjl	tst	%o3
252225cf1a30Sjl	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
252325cf1a30Sjl	  cmp	%o2, %o3			! if length <= limit
252425cf1a30Sjl	bleu,pt	%ncc, .copyin_small		! go to small copy
252525cf1a30Sjl	  nop
252625cf1a30Sjl	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
252725cf1a30Sjl	  nop
252825cf1a30Sjl
252925cf1a30Sjl	.align	16
253025cf1a30Sjl	nop				! instruction alignment
253125cf1a30Sjl					! see discussion at start of file
253225cf1a30Sjl.copyin_small:
253325cf1a30Sjl	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
253425cf1a30Sjl	or	%o5, %lo(.sm_copyin_err), %o5
253525cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
253625cf1a30Sjl	membar	#Sync				! sync error barrier
253725cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]
	! Small-copy body (t_lofault already set on entry): save the
	! arguments for a possible copyops retry, then size-dispatch.
253825cf1a30Sjl.sm_do_copyin:
253925cf1a30Sjl	mov	%o0, SM_SAVE_SRC
254025cf1a30Sjl	mov	%o1, SM_SAVE_DST
254125cf1a30Sjl	cmp	%o2, SHORTCOPY		! check for really short case
254225cf1a30Sjl	bleu,pt	%ncc, .ci_sm_left	!
254325cf1a30Sjl	  mov	%o2, SM_SAVE_COUNT
254425cf1a30Sjl	cmp	%o2, CHKSIZE		! check for medium length cases
254525cf1a30Sjl	bgu,pn	%ncc, .ci_med		!
254625cf1a30Sjl	  or	%o0, %o1, %o3		! prepare alignment check
254725cf1a30Sjl	andcc	%o3, 0x3, %g0		! test for alignment
254825cf1a30Sjl	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
254925cf1a30Sjl.ci_sm_movebytes:
255025cf1a30Sjl	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
255125cf1a30Sjl.ci_sm_notalign4:
255225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! read byte
255325cf1a30Sjl	subcc	%o2, 4, %o2		! reduce count by 4
255425cf1a30Sjl	stb	%o3, [%o1]		! write byte
255525cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
255625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
255725cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
255825cf1a30Sjl	stb	%o3, [%o1 + 1]
255925cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
256025cf1a30Sjl	lduba	[%o0]ASI_USER, %o3
256125cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
256225cf1a30Sjl	stb	%o3, [%o1 - 2]
256325cf1a30Sjl	lduba	[%o0]ASI_USER, %o3
256425cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
256525cf1a30Sjl	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
256625cf1a30Sjl	  stb	%o3, [%o1 - 1]
256725cf1a30Sjl	add	%o2, 3, %o2		! restore count
256825cf1a30Sjl.ci_sm_left:
256925cf1a30Sjl	tst	%o2
257025cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
257125cf1a30Sjl	  nop
257225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3		! load one byte
257325cf1a30Sjl	deccc	%o2			! reduce count for cc test
257425cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
257525cf1a30Sjl	  stb	%o3,[%o1]		! store one byte
257625cf1a30Sjl	inc	%o0
257725cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load second byte
257825cf1a30Sjl	deccc	%o2
257925cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
258025cf1a30Sjl	  stb	%o3,[%o1 + 1]		! store second byte
258125cf1a30Sjl	inc	%o0
258225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load third byte
258325cf1a30Sjl	stb	%o3,[%o1 + 2]		! store third byte
258425cf1a30Sjl	membar	#Sync				! sync error barrier
258525cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
258625cf1a30Sjl	retl
258725cf1a30Sjl	  mov	%g0, %o0		! return 0
258825cf1a30Sjl	.align	16
258925cf1a30Sjl.ci_sm_words:
259025cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3		! read word
259125cf1a30Sjl.ci_sm_wordx:
259225cf1a30Sjl	subcc	%o2, 8, %o2		! update count
259325cf1a30Sjl	stw	%o3, [%o1]		! write word
259425cf1a30Sjl	add	%o0, 4, %o0		! update SRC
259525cf1a30Sjl	add	%o1, 8, %o1		! update DST
259625cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
259725cf1a30Sjl	add	%o0, 4, %o0		! update SRC
259825cf1a30Sjl	bgt,pt	%ncc, .ci_sm_words	! loop til done
259925cf1a30Sjl	  stw	%o3, [%o1 - 4]		! write word
260025cf1a30Sjl	addcc	%o2, 7, %o2		! restore count
260125cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
260225cf1a30Sjl	  nop
260325cf1a30Sjl	deccc	%o2
260425cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
260525cf1a30Sjl.ci_sm_half:
260625cf1a30Sjl	  subcc	%o2, 2, %o2		! reduce count by 2
260725cf1a30Sjl	lduha	[%o0]ASI_USER, %o3	! read half word
260825cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
260925cf1a30Sjl	add	%o1, 2, %o1		! advance DST by 2
261025cf1a30Sjl	bgt,pt	%ncc, .ci_sm_half	! loop til done
261125cf1a30Sjl	  sth	%o3, [%o1 - 2]		! write half word
261225cf1a30Sjl	addcc	%o2, 1, %o2		! restore count
261325cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
261425cf1a30Sjl	  nop
261525cf1a30Sjl.ci_sm_byte:
261625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! final odd byte
261725cf1a30Sjl	stb	%o3, [%o1]
261825cf1a30Sjl	membar	#Sync				! sync error barrier
261925cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
262025cf1a30Sjl	retl
262125cf1a30Sjl	  mov	%g0, %o0		! return 0
262225cf1a30Sjl	.align	16
262325cf1a30Sjl.ci_sm_word:
262425cf1a30Sjl	subcc	%o2, 4, %o2		! update count
262525cf1a30Sjl	bgt,pt	%ncc, .ci_sm_wordx
262625cf1a30Sjl	  lduwa	[%o0]ASI_USER, %o3		! read word
262725cf1a30Sjl	addcc	%o2, 3, %o2		! restore count
262825cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
262925cf1a30Sjl	  stw	%o3, [%o1]		! write word
263025cf1a30Sjl	deccc	%o2			! reduce count for cc test
263125cf1a30Sjl	add	%o0, 4, %o0
263225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
263325cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
263425cf1a30Sjl	  stb	%o3, [%o1 + 4]		! store one byte
263525cf1a30Sjl	inc	%o0
263625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load second byte
263725cf1a30Sjl	deccc	%o2
263825cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
263925cf1a30Sjl	  stb	%o3, [%o1 + 5]		! store second byte
264025cf1a30Sjl	inc	%o0
264125cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load third byte
264225cf1a30Sjl	stb	%o3, [%o1 + 6]		! store third byte
264325cf1a30Sjl.ci_sm_exit:
264425cf1a30Sjl	membar	#Sync				! sync error barrier
264525cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
264625cf1a30Sjl	retl
264725cf1a30Sjl	  mov	%g0, %o0		! return 0
264825cf1a30Sjl
264925cf1a30Sjl	.align 16
	! Medium-length copy (len > CHKSIZE): pick the widest transfer
	! (byte/half/word/longword) that the combined src|dst
	! alignment permits.
265025cf1a30Sjl.ci_med:
265125cf1a30Sjl	xor	%o0, %o1, %o3		! setup alignment check
265225cf1a30Sjl	btst	1, %o3
265325cf1a30Sjl	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
265425cf1a30Sjl	  nop
265525cf1a30Sjl	btst	3, %o3
265625cf1a30Sjl	bnz,pt	%ncc, .ci_med_half	! halfword aligned
265725cf1a30Sjl	  nop
265825cf1a30Sjl	btst	7, %o3
265925cf1a30Sjl	bnz,pt	%ncc, .ci_med_word	! word aligned
266025cf1a30Sjl	  nop
266125cf1a30Sjl.ci_med_long:
266225cf1a30Sjl	btst	3, %o0			! check for
266325cf1a30Sjl	bz,pt	%ncc, .ci_med_long1	! word alignment
266425cf1a30Sjl	  nop
266525cf1a30Sjl.ci_med_long0:
266625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3		! load one byte
266725cf1a30Sjl	inc	%o0
266825cf1a30Sjl	stb	%o3,[%o1]		! store byte
266925cf1a30Sjl	inc	%o1
267025cf1a30Sjl	btst	3, %o0
267125cf1a30Sjl	bnz,pt	%ncc, .ci_med_long0
267225cf1a30Sjl	  dec	%o2
267325cf1a30Sjl.ci_med_long1:			! word aligned
267425cf1a30Sjl	btst	7, %o0			! check for long word
267525cf1a30Sjl	bz,pt	%ncc, .ci_med_long2
267625cf1a30Sjl	  nop
267725cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! load word
267825cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
267925cf1a30Sjl	stw	%o3, [%o1]		! store word
268025cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
268125cf1a30Sjl	sub	%o2, 4, %o2		! reduce count by 4
268225cf1a30Sjl!
268325cf1a30Sjl!  Now long word aligned and have at least 32 bytes to move
268425cf1a30Sjl!
268525cf1a30Sjl.ci_med_long2:
268625cf1a30Sjl	sub	%o2, 31, %o2		! adjust count to allow cc zero test
268725cf1a30Sjl.ci_med_lmove:
268825cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3	! read long word
268925cf1a30Sjl	subcc	%o2, 32, %o2		! reduce count by 32
269025cf1a30Sjl	stx	%o3, [%o1]		! write long word
269125cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
269225cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
269325cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
269425cf1a30Sjl	stx	%o3, [%o1 + 8]
269525cf1a30Sjl	add	%o1, 32, %o1		! advance DST by 32
269625cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3
269725cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
269825cf1a30Sjl	stx	%o3, [%o1 - 16]
269925cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3
270025cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
270125cf1a30Sjl	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
270225cf1a30Sjl	  stx	%o3, [%o1 - 8]
270325cf1a30Sjl	addcc	%o2, 24, %o2		! restore count to long word offset
270425cf1a30Sjl	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
270525cf1a30Sjl	  nop
270625cf1a30Sjl.ci_med_lword:
270725cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3	! read long word
270825cf1a30Sjl	subcc	%o2, 8, %o2		! reduce count by 8
270925cf1a30Sjl	stx	%o3, [%o1]		! write long word
271025cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
271125cf1a30Sjl	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
271225cf1a30Sjl	  add	%o1, 8, %o1		! advance DST by 8
271325cf1a30Sjl.ci_med_lextra:
271425cf1a30Sjl	addcc	%o2, 7, %o2		! restore rest of count
271525cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
271625cf1a30Sjl	  deccc	%o2
271725cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
271825cf1a30Sjl	  nop
271925cf1a30Sjl	ba,pt	%ncc, .ci_sm_half
272025cf1a30Sjl	  nop
272125cf1a30Sjl
272225cf1a30Sjl	.align 16
272325cf1a30Sjl	nop				! instruction alignment
272425cf1a30Sjl					! see discussion at start of file
272525cf1a30Sjl.ci_med_word:
272625cf1a30Sjl	btst	3, %o0			! check for
272725cf1a30Sjl	bz,pt	%ncc, .ci_med_word1	! word alignment
272825cf1a30Sjl	  nop
272925cf1a30Sjl.ci_med_word0:
273025cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
273125cf1a30Sjl	inc	%o0
273225cf1a30Sjl	stb	%o3,[%o1]		! store byte
273325cf1a30Sjl	inc	%o1
273425cf1a30Sjl	btst	3, %o0
273525cf1a30Sjl	bnz,pt	%ncc, .ci_med_word0
273625cf1a30Sjl	  dec	%o2
273725cf1a30Sjl!
273825cf1a30Sjl!  Now word aligned and have at least 36 bytes to move
273925cf1a30Sjl!
274025cf1a30Sjl.ci_med_word1:
274125cf1a30Sjl	sub	%o2, 15, %o2		! adjust count to allow cc zero test
274225cf1a30Sjl.ci_med_wmove:
274325cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
274425cf1a30Sjl	subcc	%o2, 16, %o2		! reduce count by 16
274525cf1a30Sjl	stw	%o3, [%o1]		! write word
274625cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
274725cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
274825cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
274925cf1a30Sjl	stw	%o3, [%o1 + 4]
275025cf1a30Sjl	add	%o1, 16, %o1		! advance DST by 16
275125cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3
275225cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
275325cf1a30Sjl	stw	%o3, [%o1 - 8]
275425cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3
275525cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
275625cf1a30Sjl	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
275725cf1a30Sjl	  stw	%o3, [%o1 - 4]
275825cf1a30Sjl	addcc	%o2, 12, %o2		! restore count to word offset
275925cf1a30Sjl	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
276025cf1a30Sjl	  nop
276125cf1a30Sjl.ci_med_word2:
276225cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
276325cf1a30Sjl	subcc	%o2, 4, %o2		! reduce count by 4
276425cf1a30Sjl	stw	%o3, [%o1]		! write word
276525cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
276625cf1a30Sjl	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
276725cf1a30Sjl	  add	%o1, 4, %o1		! advance DST by 4
276825cf1a30Sjl.ci_med_wextra:
276925cf1a30Sjl	addcc	%o2, 3, %o2		! restore rest of count
277025cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
277125cf1a30Sjl	  deccc	%o2
277225cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
277325cf1a30Sjl	  nop
277425cf1a30Sjl	ba,pt	%ncc, .ci_sm_half
277525cf1a30Sjl	  nop
277625cf1a30Sjl
277725cf1a30Sjl	.align 16
277825cf1a30Sjl	nop				! instruction alignment
277925cf1a30Sjl					! see discussion at start of file
278025cf1a30Sjl.ci_med_half:
278125cf1a30Sjl	btst	1, %o0			! check for
278225cf1a30Sjl	bz,pt	%ncc, .ci_med_half1	! half word alignment
278325cf1a30Sjl	  nop
278425cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
278525cf1a30Sjl	inc	%o0
278625cf1a30Sjl	stb	%o3,[%o1]		! store byte
278725cf1a30Sjl	inc	%o1
278825cf1a30Sjl	dec	%o2
278925cf1a30Sjl!
279025cf1a30Sjl!  Now half word aligned and have at least 38 bytes to move
279125cf1a30Sjl!
279225cf1a30Sjl.ci_med_half1:
279325cf1a30Sjl	sub	%o2, 7, %o2		! adjust count to allow cc zero test
279425cf1a30Sjl.ci_med_hmove:
279525cf1a30Sjl	lduha	[%o0]ASI_USER, %o3	! read half word
279625cf1a30Sjl	subcc	%o2, 8, %o2		! reduce count by 8
279725cf1a30Sjl	sth	%o3, [%o1]		! write half word
279825cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
279925cf1a30Sjl	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
280025cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
280125cf1a30Sjl	sth	%o3, [%o1 + 2]
280225cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
280325cf1a30Sjl	lduha	[%o0]ASI_USER, %o3
280425cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
280525cf1a30Sjl	sth	%o3, [%o1 - 4]
280625cf1a30Sjl	lduha	[%o0]ASI_USER, %o3
280725cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
280825cf1a30Sjl	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
280925cf1a30Sjl	  sth	%o3, [%o1 - 2]
281025cf1a30Sjl	addcc	%o2, 7, %o2		! restore count
281125cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
281225cf1a30Sjl	  deccc	%o2
281325cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
281425cf1a30Sjl	  nop
281525cf1a30Sjl	ba,pt	%ncc, .ci_sm_half
281625cf1a30Sjl	  nop
281725cf1a30Sjl
	! Fault during short copyin: restore t_lofault, reload the
	! saved arguments, then retry via t_copyops or return -1.
281825cf1a30Sjl.sm_copyin_err:
281925cf1a30Sjl	membar	#Sync
282025cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
282125cf1a30Sjl	mov	SM_SAVE_SRC, %o0
282225cf1a30Sjl	mov	SM_SAVE_DST, %o1
282325cf1a30Sjl	mov	SM_SAVE_COUNT, %o2
282425cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
282525cf1a30Sjl	tst	%o3
282625cf1a30Sjl	bz,pt	%ncc, 3f			! if not, return error
282725cf1a30Sjl	  nop
282825cf1a30Sjl	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
282925cf1a30Sjl	jmp	%o5				! original arguments
283025cf1a30Sjl	  nop
283125cf1a30Sjl3:
283225cf1a30Sjl	retl
283325cf1a30Sjl	  or	%g0, -1, %o0		! return error value (-1)
283425cf1a30Sjl
283525cf1a30Sjl	SET_SIZE(copyin)
283625cf1a30Sjl
283725cf1a30Sjl
283825cf1a30Sjl/*
283925cf1a30Sjl * The _more entry points are not intended to be used directly by
284025cf1a30Sjl * any caller from outside this file.  They are provided to allow
284125cf1a30Sjl * profiling and dtrace of the portions of the copy code that use
284225cf1a30Sjl * the floating point registers.
284325cf1a30Sjl * This entry is particularly important as DTRACE (at least as of
284425cf1a30Sjl * 4/2004) does not support leaf functions.
284525cf1a30Sjl */
284625cf1a30Sjl
	ENTRY(copyin_more)
.copyin_more:
	prefetch [%o0], #n_reads	! warm the cache for the source
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyin_err, REAL_LOFAULT	! error exit for this entry

/*
 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyin:
	set	copyio_fault, %l7		! .copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC			! preserve original arguments
	mov	%i1, SAVE_DST			! so the fault path can pass
	mov	%i2, SAVE_COUNT			! them to a copyops handler

	FP_NOMIGRATE(6, 7)	! no thread migration while fp regs are live

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyin	! fp unused: just enable it
	  wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)		! fp in use: save regs to stack first

.do_blockcopyin:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6	! note fp state needs restoring

	andcc	DST, VIS_BLOCKSIZE - 1, TMP	! is DST block aligned?
	mov	ASI_USER, %asi		! source loads come from user space
	bz,pt	%ncc, 2f
	  neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	  sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.ci_blkalign:
	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	lduba	[REALSRC + 1]%asi, SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	lduba	[REALSRC - 2]%asi, SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	lduba	[REALSRC - 1]%asi, SRC
	bgu,pt	%ncc, .ci_blkalign
	  stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	  nop
1:	lduba	[REALSRC]%asi, SRC	! copy remaining 1-3 alignment bytes
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	  stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC	! SRC = REALSRC rounded down to 8 bytes

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	! Prime the pipeline: load the first block, start prefetching,
	! and use faligndata to merge adjacent 8-byte words so that a
	! misaligned REALSRC still produces aligned output in %f48-%f62.
	ldda	[SRC]%asi, %f16
	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
	alignaddr REALSRC, %g0, %g0
	ldda	[SRC + 0x08]%asi, %f18
	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
	faligndata %f16, %f18, %f48
	ldda	[SRC + 0x10]%asi, %f20
	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
	faligndata %f18, %f20, %f50
	ldda	[SRC + 0x18]%asi, %f22
	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f20, %f22, %f52
	ldda	[SRC + 0x20]%asi, %f24
	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f22, %f24, %f54
	ldda	[SRC + 0x28]%asi, %f26
	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f24, %f26, %f56
	ldda	[SRC + 0x30]%asi, %f28
	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f26, %f28, %f58
	ldda	[SRC + 0x38]%asi, %f30
	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
	! Main loop: one VIS_BLOCKSIZE block per iteration - load the
	! next block, align it into %f48-%f62, block-store the previous.
	.align	32
1:
	ldda	[SRC + 0x08]%asi, %f18
	faligndata %f28, %f30, %f60
	ldda	[SRC + 0x10]%asi, %f20
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_P
	ldda	[SRC + 0x18]%asi, %f22
	faligndata %f16, %f18, %f48
	ldda	[SRC + 0x20]%asi, %f24
	faligndata %f18, %f20, %f50
	ldda	[SRC + 0x28]%asi, %f26
	faligndata %f20, %f22, %f52
	ldda	[SRC + 0x30]%asi, %f28
	faligndata %f22, %f24, %f54
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldda	[SRC + 0x38]%asi, %f30
	faligndata %f24, %f26, %f56
	add	DST, VIS_BLOCKSIZE, DST
	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
	faligndata %f26, %f28, %f58
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	  prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	  andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	  nop
3:
	! Misaligned (or short) tail: flush the block in flight, then
	! finish any remaining bytes one at a time below.
	faligndata %f28, %f30, %f60
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	  nop
2:
	! Source now 8-byte aligned and exactly one block remains:
	! copy the final two blocks with straight fsrc1 moves.
	ldda	[SRC + 0x08]%asi, %f18
	fsrc1	%f28, %f60
	ldda	[SRC + 0x10]%asi, %f20
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_P
	ldda	[SRC + 0x18]%asi, %f22
	fsrc1	%f16, %f48
	ldda	[SRC + 0x20]%asi, %f24
	fsrc1	%f18, %f50
	ldda	[SRC + 0x28]%asi, %f26
	fsrc1	%f20, %f52
	ldda	[SRC + 0x30]%asi, %f28
	fsrc1	%f22, %f54
	ldda	[SRC + 0x38]%asi, %f30
	fsrc1	%f24, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f26, %f58
	fsrc1	%f28, %f60
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, 4f
	  nop

3:	tst	CNT			! any trailing bytes left?
	bz,a	%ncc, 4f
	  nop

5:	lduba	[REALSRC]ASI_USER, TMP	! byte-copy the remainder
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	  stb	TMP, [DST - 1]
4:

.copyin_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3		! were fp regs in use on entry?
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ2Q4_FROMSTACK(%o2)	! yes: reload saved regs from stack

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4			! no: just clear the regs we used
	wr	%o3, 0, %fprs		! restore fprs

1:
	membar	#Sync				! sync error barrier
	andn	%l6, FPUSED_FLAG, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	  restore	%g0, 0, %o0	! return (0) - success
/*
 * We got here because of a fault during copyin
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	restore %g0, 0, %g0			! dispose of copy window
2:
	ret
	restore %g0, -1, %o0			! return error value


	SET_SIZE(copyin_more)
308025cf1a30Sjl
/*
 * xcopyin - copy in from user space, like copyin but the fault paths
 * hand back the errno value (see .xcopyin_err / .sm_xcopyin_err below)
 * rather than -1.  Dispatch on size and src/dst mutual alignment:
 * small copies and copies under the per-alignment hw_copy_limit_N
 * tunable use the leaf routine; larger ones use the fp block copy.
 */
	ENTRY(xcopyin)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .xcopyin_small		! go to larger cases
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .xcopyin_2		! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyin_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
	  nop
.xcopyin_2:
	btst	3, %o3				!
	bz,pt	%ncc, .xcopyin_4		! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyin_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
	  nop
.xcopyin_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyin_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
	  nop
.xcopyin_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .xcopyin_small		! go to small copy
	  nop
	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
	  nop

.xcopyin_small:
	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
	or	%o5, %lo(.sm_xcopyin_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyin		! common code
	  stn	%o5, [THREAD_REG + T_LOFAULT]

.xcopyin_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
	ba,pt	%ncc, .do_copyin
	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT

/*
 * We got here because of fault during xcopyin
 * Errno value is in ERRNO
 */
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	  nop
	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	  restore %g0, 0, %g0			! dispose of copy window
2:
        ret
	  restore ERRNO, 0, %o0			! return errno value

.sm_xcopyin_err:
	! Fault in the leaf (small-copy) path; errno is in %g1.
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	  nop
	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	  nop
3:
	retl
	  or	%g1, 0, %o0		! return errno value

	SET_SIZE(xcopyin)
318525cf1a30Sjl
/*
 * xcopyin_little - byte-at-a-time copy from user space (%o0) to kernel
 * space (%o1) of %o2 bytes, loading through ASI_AIUSL (as-if-user,
 * little-endian ASI).  The copy proceeds from the LAST source byte
 * toward the first (see the address setup below).  Returns 0 on
 * success; on fault the lofault handler returns the errno from %g1.
 * No copyops fallback is attempted on this path.
 */
	ENTRY(xcopyin_little)
	sethi	%hi(.xcopyio_err), %o5
	or	%o5, %lo(.xcopyio_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install our handler
	mov	%o4, %o5			! keep old handler in %o5

	subcc	%g0, %o2, %o3		! %o3 = -count (loop index -> 0)
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	  sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0 + %o3]ASI_AIUSL, %o4

1:	stb	%o4, [%o1 + %o3]
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	  lduba	[%o0 + %o3]ASI_AIUSL, %o4

2:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return (0)

.xcopyio_err:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0		! return errno from %g1

	SET_SIZE(xcopyin_little)
322125cf1a30Sjl
322225cf1a30Sjl
322325cf1a30Sjl/*
322425cf1a30Sjl * Copy a block of storage - must not overlap (from + len <= to).
322525cf1a30Sjl * No fault handler installed (to be called under on_fault())
322625cf1a30Sjl */
	ENTRY(copyin_noerr)

	! Same size/alignment dispatch as copyin/xcopyin, but on a fault
	! control transfers to the caller's on_fault() handler (saved
	! t_lofault) instead of returning an error - see .copyio_noerr
	! and .sm_copyio_noerr below.
	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .copyin_ne_small		! go to larger cases
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .copyin_ne_2		! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	  nop
.copyin_ne_2:
	btst	3, %o3				!
	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	  nop
.copyin_ne_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	  nop
.copyin_ne_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	  nop

.copyin_ne_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4
	tst	%o4
	bz,pn	%ncc, .sm_do_copyin	! no lofault set: don't install one
	  nop
	sethi	%hi(.sm_copyio_noerr), %o5
	or	%o5, %lo(.sm_copyio_noerr), %o5
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyin
	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault

.copyin_noerr_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	ba,pt	%ncc, .do_copyin
	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

.copyio_noerr:
	! Fault in the fp path: transfer to the saved lofault handler
	! (stashed in %l6 by .do_copyin/.do_copyout), dropping our window.
	jmp	%l6
	  restore %g0,0,%g0

.sm_copyio_noerr:
	! Fault in the leaf path: restore and jump to the saved handler.
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
	jmp	%o4
	  nop

	SET_SIZE(copyin_noerr)
331025cf1a30Sjl
331125cf1a30Sjl/*
331225cf1a30Sjl * Copy a block of storage - must not overlap (from + len <= to).
331325cf1a30Sjl * No fault handler installed (to be called under on_fault())
331425cf1a30Sjl */
331525cf1a30Sjl
	ENTRY(copyout_noerr)

	! Mirror image of copyin_noerr: same dispatch, but the large path
	! goes to .do_copyout and the small path to .sm_do_copyout.  On a
	! fault, control transfers to the caller's on_fault() handler via
	! .copyio_noerr / .sm_copyio_noerr (shared with copyin_noerr).
	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .copyout_ne_small		! go to larger cases
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .copyout_ne_2		! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	  nop
.copyout_ne_2:
	btst	3, %o3				!
	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	  nop
.copyout_ne_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	  nop
.copyout_ne_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
	  nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	  nop

.copyout_ne_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4
	tst	%o4
	bz,pn	%ncc, .sm_do_copyout	! no lofault set: don't install one
	  nop
	sethi	%hi(.sm_copyio_noerr), %o5
	or	%o5, %lo(.sm_copyio_noerr), %o5
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyout
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault

.copyout_noerr_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	ba,pt	%ncc, .do_copyout
	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

	SET_SIZE(copyout_noerr)
338925cf1a30Sjl
339025cf1a30Sjl
339125cf1a30Sjl/*
339225cf1a30Sjl * hwblkclr - clears block-aligned, block-multiple-sized regions that are
339325cf1a30Sjl * longer than 256 bytes in length using spitfire's block stores.  If
339425cf1a30Sjl * the criteria for using this routine are not met then it calls bzero
339525cf1a30Sjl * and returns 1.  Otherwise 0 is returned indicating success.
339625cf1a30Sjl * Caller is responsible for ensuring use_hw_bzero is true and that
339725cf1a30Sjl * kpreempt_disable() has been called.
339825cf1a30Sjl */
339925cf1a30Sjl	! %i0 - start address
340025cf1a30Sjl	! %i1 - length of region (multiple of 64)
340125cf1a30Sjl	! %l0 - saved fprs
340225cf1a30Sjl	! %l1 - pointer to saved %d0 block
340325cf1a30Sjl	! %l2 - saved curthread->t_lwp
340425cf1a30Sjl
	ENTRY(hwblkclr)
	! get another window w/space for one aligned block of saved fpregs
	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp

	! Must be block-aligned
	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
	bnz,pn	%ncc, 1f
	  nop

	! ... and must be 256 bytes or more
	cmp	%i1, 256
	blu,pn	%ncc, 1f
	  nop

	! ... and length must be a multiple of VIS_BLOCKSIZE
	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
	bz,pn	%ncc, 2f
	  nop

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	  restore	%g0, 1, %o0 ! return (1) - did not use block operations

2:	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,pt	%icc, 1f
	  nop

	! save in-use fpregs on stack
	membar	#Sync
	add	%fp, STACK_BIAS - 65, %l1	! carve out a save area ...
	and	%l1, -VIS_BLOCKSIZE, %l1	! ... aligned to VIS_BLOCKSIZE
	stda	%d0, [%l1]ASI_BLK_P

1:	membar	#StoreStore|#StoreLoad|#LoadStore
	wr	%g0, FPRS_FEF, %fprs
	wr	%g0, ASI_BLK_P, %asi

	! Clear block
	fzero	%d0
	fzero	%d2
	fzero	%d4
	fzero	%d6
	fzero	%d8
	fzero	%d10
	fzero	%d12
	fzero	%d14

	mov	256, %i3		! main loop clears 256 bytes per pass
	ba,pt	%ncc, .pz_doblock
	  nop

.pz_blkstart:
      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
	stda	%d0, [%i0 + 128]%asi
	stda	%d0, [%i0 + 64]%asi
	stda	%d0, [%i0]%asi
.pz_zinst:
	add	%i0, %i3, %i0
	sub	%i1, %i3, %i1
.pz_doblock:
	cmp	%i1, 256
	bgeu,a	%ncc, .pz_blkstart
	  stda	%d0, [%i0 + 192]%asi

	cmp	%i1, 64
	blu	%ncc, .pz_finish

	  andn	%i1, (64-1), %i3
	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
	! Computed jump back into the stda sequence above: each 4-byte
	! stda clears 64 bytes, so backing up %i3/16 bytes of code from
	! .pz_zinst executes exactly %i3/64 block stores.
	set	.pz_zinst, %i4
	sub	%i4, %i2, %i4
	jmp	%i4
	  nop

.pz_finish:
	membar	#Sync
	btst	FPRS_FEF, %l0		! was fp in use on entry?
	bz,a	.pz_finished
	  wr	%l0, 0, %fprs		! restore fprs

	! restore fpregs from stack
	ldda	[%l1]ASI_BLK_P, %d0
	membar	#Sync
	wr	%l0, 0, %fprs		! restore fprs

.pz_finished:
	ret
	  restore	%g0, 0, %o0		! return (bzero or not)

	SET_SIZE(hwblkclr)
34999b0bb795SJohn Levon
350025cf1a30Sjl	/*
350125cf1a30Sjl	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
350225cf1a30Sjl	 * using physical addresses.
350325cf1a30Sjl	 */
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1		! save pstate and ...
	andn	%g1, PSTATE_IE, %g2	! ... disable interrupts for the
	wrpr	%g0, %g2, %pstate	! duration of the physical copy

	rdpr	%pstate, %g0	! NOTE(review): read-back presumably
				! serializes the %pstate write - confirm

	! load all 32 source bytes (4 x 8) through ASI_MEM (physical)
	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	membar	#Sync

	! then store them to the physical destination
	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	retl
	  wrpr	  %g0, %g1, %pstate	! restore pstate (re-enable ints)

	SET_SIZE(hw_pa_bcopy32)
353125cf1a30Sjl
	/*
	 * Runtime tunables.  use_hw_bcopy/use_hw_bzero gate the hardware
	 * block copy/clear paths (hwblkclr requires use_hw_bzero to be
	 * true).  hw_copy_limit_N is the size threshold, for copies with
	 * N-byte mutual alignment, above which the fp block-copy path is
	 * taken; a value of zero disables the HW path for that alignment
	 * (see the "if zero, disable HW copy" checks above).
	 */
	DGDEF(use_hw_bcopy)
	.word	1		! hw bcopy enabled
	DGDEF(use_hw_bzero)
	.word	1		! hw bzero (hwblkclr) enabled
	DGDEF(hw_copy_limit_1)
	.word	0		! byte-aligned copies: HW path disabled
	DGDEF(hw_copy_limit_2)
	.word	0		! halfword-aligned copies: HW path disabled
	DGDEF(hw_copy_limit_4)
	.word	0		! word-aligned copies: HW path disabled
	DGDEF(hw_copy_limit_8)
	.word	0		! longword-aligned copies: HW path disabled

	.align	64
	.section ".text"
3547