/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/fpras_impl.h>

#include "assym.h"

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 *	! Determine whether to use the FP register version
 *	! or the leaf routine version depending on size
 *	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 *	! dst addresses can be aligned to long word, word,
 *	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 *	if (length <= VIS_COPY_THRESHOLD)	! start with a quick test
 *		go to small_copy;		! to speed short copies
 *
 *	! src, dst long word alignable
 *		if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst not alignable) {
 *		if (hw_copy_limit_1 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst halfword alignable) {
 *		if (hw_copy_limit_2 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst word alignable) {
 *		if (hw_copy_limit_4 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler;		! diffs for each entry point
 *
 *	if (count <= 3)				! fast path for tiny copies
 *		go to sm_left;			! special finish up code
 *	else
 *		if (count > CHKSIZE)		! medium sized copies
 *			go to sm_med		! tuned by alignment
 *		if(src&dst not both word aligned) {
 *	sm_movebytes:
 *			move byte by byte in 4-way unrolled loop
 *			fall into sm_left;
 *	sm_left:
 *			move 0-3 bytes byte at a time as needed.
 *			restore error handler and exit.
 *
 *		} else {	! src&dst are word aligned
 *			check for at least 8 bytes left,
 *			move word at a time, unrolled by 2
 *			when fewer than 8 bytes left,
 *	sm_half:	move half word at a time while 2 or more bytes left
 *	sm_byte:	move final byte if necessary
 *	sm_exit:
 *			restore error handler and exit.
 *		}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if(src&dst unalignable)
 *		go to sm_movebytes
 *	if(src&dst halfword alignable)
 *		go to sm_movehalf
 *	if(src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 *	%l6 = curthread->t_lofault;
 *	if (%l6 != NULL) {
 *		membar #Sync
 *		curthread->t_lofault = .copyerr;
 *		caller_error_handler = TRUE		! %l6 |= 2
 *	}
 *
 *	! for FPU testing we must not migrate cpus
 *	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 *		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 *	old_fprs = %fprs;
 *	old_gsr = %gsr;
 *	if (%fprs.fef) {
 *		%fprs.fef = 1;
 *		save current fpregs on stack using blockstore
 *	} else {
 *		%fprs.fef = 1;
 *	}
 *
 *
 *	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 *	%gsr = old_gsr;
 *	if (old_fprs & FPRS_FEF)
 *		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 *	%fprs = old_fprs;
 *	membar #Sync
 *	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 *	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 *	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent from returning with a corrupted fp state, we will panic.
 */

/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 * byte aligned:  108 clocks slower for non-FPBLK
 * half aligned:   44 clocks slower for non-FPBLK
 * word aligned:   12 clocks slower for non-FPBLK
 * long aligned:    4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 * hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Recommended initial values as of Mar 2004, includes testing
 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar(1050MHz):
 * hw_copy_limit_1 =  256
 * hw_copy_limit_2 =  512
 * hw_copy_limit_4 = 1024
 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speedup all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 * 4 of 8 will not be alignable.
 * 2 of 8 will be half word alignable.
 * 1 of 8 will be word alignable.
 * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases. The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.  Due to limitations of the
 * branch instruction on Cheetah, Jaguar, and Panther, the
 * minimum time for a small, tight loop is 3 clocks.  So
 * the 4-way loop runs 50% faster than the fastest non-unrolled
 * loop.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to insure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */

/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the handling
 *	  uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * The following membar BLD/BST discussion is Cheetah pipeline specific.
 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
 * nops (those semantics always apply) and #StoreLoad is implemented
 * as a membar #Sync.
 *
 * It is possible that the owner of the fp state has a block load or
 * block store still "in flight" at the time we come to preserve that
 * state.  Block loads are blocking in Cheetah pipelines so we do not
 * need to sync with them.  In preserving fp regs we will use block stores
 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
 * after storing state (so that our subsequent use of those registers
 * does not modify them before the block stores complete);  this membar
 * also serves to sync with block stores the owner of the fp state has
 * initiated.
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or of an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */
4697c478bdstevel@tonic-gate * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
4707c478bdstevel@tonic-gate * to "break even" using FP/VIS-accelerated memory operations.
4717c478bdstevel@tonic-gate * The FPBLK code assumes a minimum number of bytes are available
4727c478bdstevel@tonic-gate * to be moved on entry.  Check that code carefully before
4737c478bdstevel@tonic-gate * reducing VIS_COPY_THRESHOLD below 256.
4747c478bdstevel@tonic-gate */
4767c478bdstevel@tonic-gate * This shadows sys/machsystm.h which can't be included due to the lack of
4777c478bdstevel@tonic-gate * _ASM guards in include files it references. Change it here, change it there.
4787c478bdstevel@tonic-gate */
4797c478bdstevel@tonic-gate#define VIS_COPY_THRESHOLD 256
4827c478bdstevel@tonic-gate * TEST for very short copies
4837c478bdstevel@tonic-gate * Be aware that the maximum unroll for the short unaligned case
4847c478bdstevel@tonic-gate * is SHORTCOPY+1
4857c478bdstevel@tonic-gate */
4867c478bdstevel@tonic-gate#define SHORTCOPY 3
4877c478bdstevel@tonic-gate#define CHKSIZE  39
4907c478bdstevel@tonic-gate * Indicates that we're to trampoline to the error handler.
4917c478bdstevel@tonic-gate * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
4927c478bdstevel@tonic-gate * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
4937c478bdstevel@tonic-gate */
4947c478bdstevel@tonic-gate#define	FPUSED_FLAG	1
4957c478bdstevel@tonic-gate#define	TRAMP_FLAG	2
4967c478bdstevel@tonic-gate#define	MASK_FLAGS	3
4997c478bdstevel@tonic-gate * Number of outstanding prefetches.
5007c478bdstevel@tonic-gate * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
5017c478bdstevel@tonic-gate * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
5027c478bdstevel@tonic-gate * reach of 5*BLOCK_SIZE.  The double prefetch gives an typical improvement
5037c478bdstevel@tonic-gate * of 5% for large copies as compared to a single prefetch.  The reason
5047c478bdstevel@tonic-gate * for the improvement is that with Cheetah and Jaguar, some prefetches
5057c478bdstevel@tonic-gate * are dropped due to the prefetch queue being full.  The second prefetch
5067c478bdstevel@tonic-gate * reduces the number of cache lines that are dropped.
5077c478bdstevel@tonic-gate * Do not remove the double prefetch or change either CHEETAH_PREFETCH
5087c478bdstevel@tonic-gate * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
5097c478bdstevel@tonic-gate * there is no loss of performance.
5107c478bdstevel@tonic-gate */
5117c478bdstevel@tonic-gate#define	CHEETAH_PREFETCH	8
5127c478bdstevel@tonic-gate#define	CHEETAH_2ND_PREFETCH	5
5147c478bdstevel@tonic-gate#define	VIS_BLOCKSIZE		64
5177c478bdstevel@tonic-gate * Size of stack frame in order to accomodate a 64-byte aligned
5187c478bdstevel@tonic-gate * floating-point register save area and 2 64-bit temp locations.
5197c478bdstevel@tonic-gate * All copy functions use two quadrants of fp registers; to assure a
5207c478bdstevel@tonic-gate * block-aligned two block buffer in which to save we must reserve
5217c478bdstevel@tonic-gate * three blocks on stack.  Not all functions preserve %pfrs on stack
5227c478bdstevel@tonic-gate * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
5237c478bdstevel@tonic-gate *
5247c478bdstevel@tonic-gate *    _______________________________________ <-- %fp + STACK_BIAS
5257c478bdstevel@tonic-gate *    | We may need to preserve 2 quadrants |
5267c478bdstevel@tonic-gate *    | of fp regs, but since we do so with |
5277c478bdstevel@tonic-gate *    | BST/BLD we need room in which to    |
5287c478bdstevel@tonic-gate *    | align to VIS_BLOCKSIZE bytes.  So   |
5297c478bdstevel@tonic-gate *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
5307c478bdstevel@tonic-gate *    |-------------------------------------|
5317c478bdstevel@tonic-gate *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
5327c478bdstevel@tonic-gate *    |-------------------------------------|
5337c478bdstevel@tonic-gate *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
5347c478bdstevel@tonic-gate *    ---------------------------------------
5357c478bdstevel@tonic-gate */
5367c478bdstevel@tonic-gate#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
5377c478bdstevel@tonic-gate#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
5387c478bdstevel@tonic-gate#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
5397c478bdstevel@tonic-gate#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
5407c478bdstevel@tonic-gate#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
5437c478bdstevel@tonic-gate * Common macros used by the various versions of the block copy
5447c478bdstevel@tonic-gate * routines in this file.
5457c478bdstevel@tonic-gate */
5487c478bdstevel@tonic-gate * In FP copies if we do not have preserved data to restore over
5497c478bdstevel@tonic-gate * the fp regs we used then we must zero those regs to avoid
5507c478bdstevel@tonic-gate * exposing portions of the data to later threads (data security).
5517c478bdstevel@tonic-gate *
5527c478bdstevel@tonic-gate * Copy functions use either quadrants 1 and 3 or 2 and 4.
5537c478bdstevel@tonic-gate *
5547c478bdstevel@tonic-gate * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
5557c478bdstevel@tonic-gate * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
5567c478bdstevel@tonic-gate *
5577c478bdstevel@tonic-gate * The instructions below are quicker than repeated fzero instructions
5587c478bdstevel@tonic-gate * since they can dispatch down two fp pipelines.
5597c478bdstevel@tonic-gate */
5607c478bdstevel@tonic-gate#define	FZEROQ1Q3			\
5617c478bdstevel@tonic-gate	fzero	%f0			;\
5627c478bdstevel@tonic-gate	fzero	%f2			;\
5637c478bdstevel@tonic-gate	faddd	%f0, %f2, %f4		;\
5647c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f6		;\
5657c478bdstevel@tonic-gate	faddd	%f0, %f2, %f8		;\
5667c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f10		;\
5677c478bdstevel@tonic-gate	faddd	%f0, %f2, %f12		;\
5687c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f14		;\
5697c478bdstevel@tonic-gate	faddd	%f0, %f2, %f32		;\
5707c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f34		;\
5717c478bdstevel@tonic-gate	faddd	%f0, %f2, %f36		;\
5727c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f38		;\
5737c478bdstevel@tonic-gate	faddd	%f0, %f2, %f40		;\
5747c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f42		;\
5757c478bdstevel@tonic-gate	faddd	%f0, %f2, %f44		;\
5767c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f46
5787c478bdstevel@tonic-gate#define	FZEROQ2Q4			\
5797c478bdstevel@tonic-gate	fzero	%f16			;\
5807c478bdstevel@tonic-gate	fzero	%f18			;\
5817c478bdstevel@tonic-gate	faddd	%f16, %f18, %f20	;\
5827c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f22	;\
5837c478bdstevel@tonic-gate	faddd	%f16, %f18, %f24	;\
5847c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f26	;\
5857c478bdstevel@tonic-gate	faddd	%f16, %f18, %f28	;\
5867c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f30	;\
5877c478bdstevel@tonic-gate	faddd	%f16, %f18, %f48	;\
5887c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f50	;\
5897c478bdstevel@tonic-gate	faddd	%f16, %f18, %f52	;\
5907c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f54	;\
5917c478bdstevel@tonic-gate	faddd	%f16, %f18, %f56	;\
5927c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f58	;\
5937c478bdstevel@tonic-gate	faddd	%f16, %f18, %f60	;\
5947c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f62
5977c478bdstevel@tonic-gate * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
5987c478bdstevel@tonic-gate * Used to save and restore in-use fp registers when we want to use FP
5997c478bdstevel@tonic-gate * and find fp already in use and copy size still large enough to justify
6007c478bdstevel@tonic-gate * the additional overhead of this save and restore.
6017c478bdstevel@tonic-gate *
6027c478bdstevel@tonic-gate * A membar #Sync is needed before save to sync fp ops initiated before
6037c478bdstevel@tonic-gate * the call to the copy function (by whoever has fp in use); for example
6047c478bdstevel@tonic-gate * an earlier block load to the quadrant we are about to save may still be
6057c478bdstevel@tonic-gate * "in flight".  A membar #Sync is required at the end of the save to
6067c478bdstevel@tonic-gate * sync our block store (the copy code is about to begin ldd's to the
6077c478bdstevel@tonic-gate * first quadrant).  Note, however, that since Cheetah pipeline block load
6087c478bdstevel@tonic-gate * is blocking we can omit the initial membar before saving fp state (they're
6097c478bdstevel@tonic-gate * commented below in case of future porting to a chip that does not block
6107c478bdstevel@tonic-gate * on block load).
6117c478bdstevel@tonic-gate *
6127c478bdstevel@tonic-gate * Similarly: a membar #Sync before restore allows the block stores of
6137c478bdstevel@tonic-gate * the copy operation to complete before we fill the quadrants with their
6147c478bdstevel@tonic-gate * original data, and a membar #Sync after restore lets the block loads
6157c478bdstevel@tonic-gate * of the restore complete before we return to whoever has the fp regs
6167c478bdstevel@tonic-gate * in use.  To avoid repeated membar #Sync we make it the responsibility
6177c478bdstevel@tonic-gate * of the copy code to membar #Sync immediately after copy is complete
6187c478bdstevel@tonic-gate * and before using the BLD_*_FROMSTACK macro.
6197c478bdstevel@tonic-gate */
6207c478bdstevel@tonic-gate#define BST_FPQ1Q3_TOSTACK(tmp1)				\
6217c478bdstevel@tonic-gate	/* membar #Sync	*/					;\
6227c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6237c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6247c478bdstevel@tonic-gate	stda	%f0, [tmp1]ASI_BLK_P				;\
6257c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6267c478bdstevel@tonic-gate	stda	%f32, [tmp1]ASI_BLK_P				;\
6277c478bdstevel@tonic-gate	membar	#Sync
6297c478bdstevel@tonic-gate#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
6307c478bdstevel@tonic-gate	/* membar #Sync - provided at copy completion */	;\
6317c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6327c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6337c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f0				;\
6347c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6357c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f32				;\
6367c478bdstevel@tonic-gate	membar	#Sync
6387c478bdstevel@tonic-gate#define BST_FPQ2Q4_TOSTACK(tmp1)				\
6397c478bdstevel@tonic-gate	/* membar #Sync */					;\
6407c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6417c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6427c478bdstevel@tonic-gate	stda	%f16, [tmp1]ASI_BLK_P				;\
6437c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6447c478bdstevel@tonic-gate	stda	%f48, [tmp1]ASI_BLK_P				;\
6457c478bdstevel@tonic-gate	membar	#Sync
6477c478bdstevel@tonic-gate#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
6487c478bdstevel@tonic-gate	/* membar #Sync - provided at copy completion */	;\
6497c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6507c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6517c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f16				;\
6527c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6537c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f48				;\
6547c478bdstevel@tonic-gate	membar	#Sync
6577c478bdstevel@tonic-gate * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
6587c478bdstevel@tonic-gate * prevent preemption if there is no t_lwp to save FP state to on context
6597c478bdstevel@tonic-gate * switch) before commencing a FP copy, and reallow it on completion or
6607c478bdstevel@tonic-gate * in error trampoline paths when we were using FP copy.
6617c478bdstevel@tonic-gate *
6627c478bdstevel@tonic-gate * Both macros may call other functions, so be aware that all outputs are
6637c478bdstevel@tonic-gate * forfeit after using these macros.  For this reason we do not pass registers
6647c478bdstevel@tonic-gate * to use - we just use any outputs we want.
6657c478bdstevel@tonic-gate *
6667c478bdstevel@tonic-gate * For fpRAS we need to perform the fpRAS mechanism test on the same
6677c478bdstevel@tonic-gate * CPU as we use for the copy operation, both so that we validate the
6687c478bdstevel@tonic-gate * CPU we perform the copy on and so that we know which CPU failed
6697c478bdstevel@tonic-gate * if a failure is detected.  Hence we need to be bound to "our" CPU.
6707c478bdstevel@tonic-gate * This could be achieved through disabling preemption (and we have do it that
6717c478bdstevel@tonic-gate * way for threads with no t_lwp) but for larger copies this may hold
6727c478bdstevel@tonic-gate * higher priority threads off of cpu for too long (eg, realtime).  So we
6737c478bdstevel@tonic-gate * make use of the lightweight t_nomigrate mechanism where we can (ie, when
6747c478bdstevel@tonic-gate * we have a t_lwp).
6757c478bdstevel@tonic-gate *
6767c478bdstevel@tonic-gate * Pseudo code:
6777c478bdstevel@tonic-gate *
6787c478bdstevel@tonic-gate * FP_NOMIGRATE:
6797c478bdstevel@tonic-gate *
6807c478bdstevel@tonic-gate * if (curthread->t_lwp) {
6817c478bdstevel@tonic-gate *	thread_nomigrate();
6827c478bdstevel@tonic-gate * } else {
6837c478bdstevel@tonic-gate *	kpreempt_disable();
6847c478bdstevel@tonic-gate * }
6857c478bdstevel@tonic-gate *
6867c478bdstevel@tonic-gate * FP_ALLOWMIGRATE:
6877c478bdstevel@tonic-gate *
6887c478bdstevel@tonic-gate * if (curthread->t_lwp) {
6897c478bdstevel@tonic-gate *	thread_allowmigrate();
6907c478bdstevel@tonic-gate * } else {
6917c478bdstevel@tonic-gate *	kpreempt_enable();
6927c478bdstevel@tonic-gate * }
6937c478bdstevel@tonic-gate */
6957c478bdstevel@tonic-gate#define	FP_NOMIGRATE(label1, label2)				\
6967c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LWP], %o0			;\
6977c478bdstevel@tonic-gate	brz,a,pn %o0, label1/**/f				;\
6987c478bdstevel@tonic-gate	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
6997c478bdstevel@tonic-gate	call	thread_nomigrate				;\
7007c478bdstevel@tonic-gate	  nop							;\
7017c478bdstevel@tonic-gate	ba	label2/**/f					;\
7027c478bdstevel@tonic-gate	  nop							;\
7037c478bdstevel@tonic-gatelabel1:								;\
7047c478bdstevel@tonic-gate	inc	%o1						;\
7057c478bdstevel@tonic-gate	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
7087c478bdstevel@tonic-gate#define	FP_ALLOWMIGRATE(label1, label2)			\
7097c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LWP], %o0			;\
7107c478bdstevel@tonic-gate	brz,a,pn %o0, label1/**/f				;\
7117c478bdstevel@tonic-gate	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
7127c478bdstevel@tonic-gate	call thread_allowmigrate				;\
7137c478bdstevel@tonic-gate	  nop							;\
7147c478bdstevel@tonic-gate	ba	label2/**/f					;\
7157c478bdstevel@tonic-gate	  nop							;\
7167c478bdstevel@tonic-gatelabel1:								;\
7177c478bdstevel@tonic-gate	dec	%o1						;\
7187c478bdstevel@tonic-gate	brnz,pn	%o1, label2/**/f				;\
7197c478bdstevel@tonic-gate	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
7207c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_CPU], %o0			;\
7217c478bdstevel@tonic-gate	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
7227c478bdstevel@tonic-gate	brz,pt	%o0, label2/**/f				;\
7237c478bdstevel@tonic-gate	  nop							;\
7247c478bdstevel@tonic-gate	call	kpreempt					;\
7257c478bdstevel@tonic-gate	  rdpr	%pil, %o0					;\
7297c478bdstevel@tonic-gate * Copy a block of storage, returning an error code if `from' or
7307c478bdstevel@tonic-gate * `to' takes a kernel pagefault which cannot be resolved.
7317c478bdstevel@tonic-gate * Returns errno value on pagefault error, 0 if all ok
7327c478bdstevel@tonic-gate */
7347c478bdstevel@tonic-gate	.seg	".text"
7357c478bdstevel@tonic-gate	.align	4
7377c478bdstevel@tonic-gate	ENTRY(kcopy)
7397c478bdstevel@tonic-gate	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
7407c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to larger cases
7417c478bdstevel@tonic-gate	  xor	%o0, %o1, %o3			! are src, dst alignable?
7427c478bdstevel@tonic-gate	btst	7, %o3				!
7437c478bdstevel@tonic-gate	bz,pt	%ncc, .kcopy_8			! check for longword alignment
7447c478bdstevel@tonic-gate	  nop
7457c478bdstevel@tonic-gate	btst	1, %o3				!
7467c478bdstevel@tonic-gate	bz,pt	%ncc, .kcopy_2			! check for half-word
7477c478bdstevel@tonic-gate	  nop
7487c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
7497c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
7507c478bdstevel@tonic-gate	tst	%o3
7517c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
7527c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
7537c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
7547c478bdstevel@tonic-gate	  nop
7557c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
7567c478bdstevel@tonic-gate	  nop
7587c478bdstevel@tonic-gate	btst	3, %o3				!
7597c478bdstevel@tonic-gate	bz,pt	%ncc, .kcopy_4			! check for word alignment
7607c478bdstevel@tonic-gate	  nop
7617c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
7627c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
7637c478bdstevel@tonic-gate	tst	%o3
7647c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
7657c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
7667c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
7677c478bdstevel@tonic-gate	  nop
7687c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
7697c478bdstevel@tonic-gate	  nop
7717c478bdstevel@tonic-gate	! already checked longword, must be word aligned
7727c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
7737c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
7747c478bdstevel@tonic-gate	tst	%o3
7757c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
7767c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
7777c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
7787c478bdstevel@tonic-gate	  nop
7797c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
7807c478bdstevel@tonic-gate	  nop
7827c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
7837c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
7847c478bdstevel@tonic-gate	tst	%o3
7857c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
7867c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
7877c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
7887c478bdstevel@tonic-gate	  nop
7897c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
7907c478bdstevel@tonic-gate	  nop
7937c478bdstevel@tonic-gate	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
7947c478bdstevel@tonic-gate	or	%o5, %lo(.sm_copyerr), %o5
7957c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
7967c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
7977c478bdstevel@tonic-gate	ba,pt	%ncc, .sm_do_copy		! common code
7987c478bdstevel@tonic-gate	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
8017c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
8027c478bdstevel@tonic-gate	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
8037c478bdstevel@tonic-gate	or	%l7, %lo(.copyerr), %l7
8047c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
8057c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
8067c478bdstevel@tonic-gate	ba,pt	%ncc, .do_copy			! common code
8077c478bdstevel@tonic-gate	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
8117c478bdstevel@tonic-gate * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
8127c478bdstevel@tonic-gate * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
8137c478bdstevel@tonic-gate */
8157c478bdstevel@tonic-gate	set	.copyerr2, %l0
8167c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
8177c478bdstevel@tonic-gate	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
8187c478bdstevel@tonic-gate	btst	FPUSED_FLAG, %l6
8197c478bdstevel@tonic-gate	bz	%ncc, 1f
8207c478bdstevel@tonic-gate	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
8227c478bdstevel@tonic-gate	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
8237c478bdstevel@tonic-gate	wr	%o2, 0, %gsr
8257c478bdstevel@tonic-gate	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
8267c478bdstevel@tonic-gate	btst	FPRS_FEF, %o3
8277c478bdstevel@tonic-gate	bz,pt	%icc, 4f
8287c478bdstevel@tonic-gate	  nop
8307c478bdstevel@tonic-gate	BLD_FPQ1Q3_FROMSTACK(%o2)
8327c478bdstevel@tonic-gate	ba,pt	%ncc, 1f
8337c478bdstevel@tonic-gate	  wr	%o3, 0, %fprs		! restore fprs
8367c478bdstevel@tonic-gate	FZEROQ1Q3
8377c478bdstevel@tonic-gate	wr	%o3, 0, %fprs		! restore fprs
8397c478bdstevel@tonic-gate	!
8407c478bdstevel@tonic-gate	! Need to cater for the different expectations of kcopy
8417c478bdstevel@tonic-gate	! and bcopy. kcopy will *always* set a t_lofault handler
8427c478bdstevel@tonic-gate	! If it fires, we're expected to just return the error code
8437c478bdstevel@tonic-gate	! and *not* to invoke any existing error handler. As far as
8447c478bdstevel@tonic-gate	! bcopy is concerned, we only set t_lofault if there was an
8457c478bdstevel@tonic-gate	! existing lofault handler. In that case we're expected to
8467c478bdstevel@tonic-gate	! invoke the previously existing handler after resetting the
8477c478bdstevel@tonic-gate	! t_lofault value.
8487c478bdstevel@tonic-gate	!
8507c478bdstevel@tonic-gate	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
8517c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
8527c478bdstevel@tonic-gate	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
8537c478bdstevel@tonic-gate	FP_ALLOWMIGRATE(5, 6)
8557c478bdstevel@tonic-gate	btst	TRAMP_FLAG, %l0
8567c478bdstevel@tonic-gate	bnz,pn	%ncc, 3f
8577c478bdstevel@tonic-gate	  nop
8587c478bdstevel@tonic-gate	ret
8597c478bdstevel@tonic-gate	  restore	%g1, 0, %o0
8627c478bdstevel@tonic-gate	!
8637c478bdstevel@tonic-gate	! We're here via bcopy. There *must* have been an error handler
8647c478bdstevel@tonic-gate	! in place otherwise we would have died a nasty death already.
8657c478bdstevel@tonic-gate	!
8667c478bdstevel@tonic-gate	jmp	%l6				! goto real handler
8677c478bdstevel@tonic-gate	  restore	%g0, 0, %o0		! dispose of copy window
8707c478bdstevel@tonic-gate * We got here because of a fault in .copyerr.  We can't safely restore fp
8717c478bdstevel@tonic-gate * state, so we panic.
8727c478bdstevel@tonic-gate */
8747c478bdstevel@tonic-gate	.asciz	"Unable to restore fp state after copy operation"
8767c478bdstevel@tonic-gate	.align	4
8787c478bdstevel@tonic-gate	set	fp_panic_msg, %o0
8797c478bdstevel@tonic-gate	call	panic
8807c478bdstevel@tonic-gate	  nop
8837c478bdstevel@tonic-gate * We got here because of a fault during a small kcopy or bcopy.
8847c478bdstevel@tonic-gate * No floating point registers are used by the small copies.
8857c478bdstevel@tonic-gate * Errno value is in %g1.
8867c478bdstevel@tonic-gate */
8897c478bdstevel@tonic-gate	btst	TRAMP_FLAG, %o4
8907c478bdstevel@tonic-gate	membar	#Sync
8917c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
8927c478bdstevel@tonic-gate	bnz,pn	%ncc, 3f
8937c478bdstevel@tonic-gate	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
8947c478bdstevel@tonic-gate	retl
8957c478bdstevel@tonic-gate	  mov	%g1, %o0
8977c478bdstevel@tonic-gate	jmp	%o4				! goto real handler
8987c478bdstevel@tonic-gate	  mov	%g0, %o0			!
9007c478bdstevel@tonic-gate	SET_SIZE(kcopy)
9047c478bdstevel@tonic-gate * Copy a block of storage - must not overlap (from + len <= to).
9057c478bdstevel@tonic-gate * Registers: l6 - saved t_lofault
9067c478bdstevel@tonic-gate * (for short copies, o4 - saved t_lofault)
9077c478bdstevel@tonic-gate *
9087c478bdstevel@tonic-gate * Copy a page of memory.
9097c478bdstevel@tonic-gate * Assumes double word alignment and a count >= 256.
9107c478bdstevel@tonic-gate */
9127c478bdstevel@tonic-gate	ENTRY(bcopy)
9147c478bdstevel@tonic-gate	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
9157c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to larger cases
9167c478bdstevel@tonic-gate	  xor	%o0, %o1, %o3			! are src, dst alignable?
9177c478bdstevel@tonic-gate	btst	7, %o3				!
9187c478bdstevel@tonic-gate	bz,pt	%ncc, .bcopy_8			! check for longword alignment
9197c478bdstevel@tonic-gate	  nop
9207c478bdstevel@tonic-gate	btst	1, %o3				!
9217c478bdstevel@tonic-gate	bz,pt	%ncc, .bcopy_2			! check for half-word
9227c478bdstevel@tonic-gate	  nop
9237c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
9247c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
9257c478bdstevel@tonic-gate	tst	%o3
9267c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9277c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9287c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9297c478bdstevel@tonic-gate	  nop
9307c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9317c478bdstevel@tonic-gate	  nop
9337c478bdstevel@tonic-gate	btst	3, %o3				!
9347c478bdstevel@tonic-gate	bz,pt	%ncc, .bcopy_4			! check for word alignment
9357c478bdstevel@tonic-gate	  nop
9367c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
9377c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
9387c478bdstevel@tonic-gate	tst	%o3
9397c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9407c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9417c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9427c478bdstevel@tonic-gate	  nop
9437c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9447c478bdstevel@tonic-gate	  nop
9467c478bdstevel@tonic-gate	! already checked longword, must be word aligned
9477c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
9487c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
9497c478bdstevel@tonic-gate	tst	%o3
9507c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9517c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9527c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9537c478bdstevel@tonic-gate	  nop
9547c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9557c478bdstevel@tonic-gate	  nop
9577c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
9587c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
9597c478bdstevel@tonic-gate	tst	%o3
9607c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9617c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9627c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9637c478bdstevel@tonic-gate	  nop
9647c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9657c478bdstevel@tonic-gate	  nop
9677c478bdstevel@tonic-gate	.align	16
9697c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
9707c478bdstevel@tonic-gate	tst	%o4
9717c478bdstevel@tonic-gate	bz,pt	%icc, .sm_do_copy
9727c478bdstevel@tonic-gate	  nop
9737c478bdstevel@tonic-gate	sethi	%hi(.sm_copyerr), %o5
9747c478bdstevel@tonic-gate	or	%o5, %lo(.sm_copyerr), %o5
9757c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
9767c478bdstevel@tonic-gate	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
9777c478bdstevel@tonic-gate	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
9797c478bdstevel@tonic-gate	cmp	%o2, SHORTCOPY		! check for really short case
9807c478bdstevel@tonic-gate	bleu,pt	%ncc, .bc_sm_left	!
9817c478bdstevel@tonic-gate	  cmp	%o2, CHKSIZE		! check for medium length cases
9827c478bdstevel@tonic-gate	bgu,pn	%ncc, .bc_med		!
9837c478bdstevel@tonic-gate	  or	%o0, %o1, %o3		! prepare alignment check
9847c478bdstevel@tonic-gate	andcc	%o3, 0x3, %g0		! test for alignment
9857c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
9877c478bdstevel@tonic-gate	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
9897c478bdstevel@tonic-gate	ldub	[%o0], %o3		! read byte
9907c478bdstevel@tonic-gate	stb	%o3, [%o1]		! write byte
9917c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
9927c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
9937c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
9947c478bdstevel@tonic-gate	stb	%o3, [%o1 + 1]
9957c478bdstevel@tonic-gate	ldub	[%o0 - 2], %o3
9967c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
9977c478bdstevel@tonic-gate	stb	%o3, [%o1 - 2]
9987c478bdstevel@tonic-gate	ldub	[%o0 - 1], %o3
9997c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
10007c478bdstevel@tonic-gate	  stb	%o3, [%o1 - 1]
10017c478bdstevel@tonic-gate	add	%o2, 3, %o2		! restore count
10037c478bdstevel@tonic-gate	tst	%o2
10047c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit	! check for zero length
10057c478bdstevel@tonic-gate	  deccc	%o2			! reduce count for cc test
10067c478bdstevel@tonic-gate	ldub	[%o0], %o3		! move one byte
10077c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10087c478bdstevel@tonic-gate	  stb	%o3, [%o1]
10097c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! move another byte
10107c478bdstevel@tonic-gate	deccc	%o2			! check for more
10117c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10127c478bdstevel@tonic-gate	  stb	%o3, [%o1 + 1]
10137c478bdstevel@tonic-gate	ldub	[%o0 + 2], %o3		! move final byte
10147c478bdstevel@tonic-gate	stb	%o3, [%o1 + 2]
10157c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
10167c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
10177c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
10187c478bdstevel@tonic-gate	retl
10197c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
10207c478bdstevel@tonic-gate	.align	16
10217c478bdstevel@tonic-gate	nop				! instruction alignment
10227c478bdstevel@tonic-gate					! see discussion at start of file
10247c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
10267c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! update count
10277c478bdstevel@tonic-gate	stw	%o3, [%o1]		! write word
10287c478bdstevel@tonic-gate	add	%o0, 8, %o0		! update SRC
10297c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3		! read word
10307c478bdstevel@tonic-gate	add	%o1, 8, %o1		! update DST
10317c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_words	! loop til done
10327c478bdstevel@tonic-gate	  stw	%o3, [%o1 - 4]		! write word
10337c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
10347c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10357c478bdstevel@tonic-gate	  deccc	%o2
10367c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
10387c478bdstevel@tonic-gate	  subcc	%o2, 2, %o2		! reduce count by 2
10397c478bdstevel@tonic-gate	add	%o0, 2, %o0		! advance SRC by 2
10407c478bdstevel@tonic-gate	lduh	[%o0 - 2], %o3		! read half word
10417c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
10427c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_half	! loop til done
10437c478bdstevel@tonic-gate	  sth	%o3, [%o1 - 2]		! write half word
10447c478bdstevel@tonic-gate	addcc	%o2, 1, %o2		! restore count
10457c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10467c478bdstevel@tonic-gate	  nop
10487c478bdstevel@tonic-gate	ldub	[%o0], %o3
10497c478bdstevel@tonic-gate	stb	%o3, [%o1]
10507c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
10517c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
10527c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
10537c478bdstevel@tonic-gate	retl
10547c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
10577c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! update count
10587c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_wordx
10597c478bdstevel@tonic-gate	  lduw	[%o0], %o3		! read word
10607c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore count
10617c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10627c478bdstevel@tonic-gate	  stw	%o3, [%o1]		! write word
10637c478bdstevel@tonic-gate	deccc	%o2			! reduce count for cc test
10647c478bdstevel@tonic-gate	ldub	[%o0 + 4], %o3		! load one byte
10657c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10667c478bdstevel@tonic-gate	  stb	%o3, [%o1 + 4]		! store one byte
10677c478bdstevel@tonic-gate	ldub	[%o0 + 5], %o3		! load second byte
10687c478bdstevel@tonic-gate	deccc	%o2
10697c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10707c478bdstevel@tonic-gate	  stb	%o3, [%o1 + 5]		! store second byte
10717c478bdstevel@tonic-gate	ldub	[%o0 + 6], %o3		! load third byte
10727c478bdstevel@tonic-gate	stb	%o3, [%o1 + 6]		! store third byte
10747c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
10757c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
10767c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
10777c478bdstevel@tonic-gate	retl
10787c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
10807c478bdstevel@tonic-gate	.align 16
10827c478bdstevel@tonic-gate	xor	%o0, %o1, %o3		! setup alignment check
10837c478bdstevel@tonic-gate	btst	1, %o3
10847c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
10857c478bdstevel@tonic-gate	  nop
10867c478bdstevel@tonic-gate	btst	3, %o3
10877c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_half	! halfword aligned
10887c478bdstevel@tonic-gate	  nop
10897c478bdstevel@tonic-gate	btst	7, %o3
10907c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_word	! word aligned
10917c478bdstevel@tonic-gate	  nop
10937c478bdstevel@tonic-gate	btst	3, %o0			! check for
10947c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_long1	! word alignment
10957c478bdstevel@tonic-gate	  nop
10977c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
10987c478bdstevel@tonic-gate	inc	%o0
10997c478bdstevel@tonic-gate	stb	%o3,[%o1]		! store byte
11007c478bdstevel@tonic-gate	inc	%o1
11017c478bdstevel@tonic-gate	btst	3, %o0
11027c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_long0
11037c478bdstevel@tonic-gate	  dec	%o2
11047c478bdstevel@tonic-gate.bc_med_long1:			! word aligned
11057c478bdstevel@tonic-gate	btst	7, %o0			! check for long word
11067c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_long2
11077c478bdstevel@tonic-gate	  nop
11087c478bdstevel@tonic-gate	lduw	[%o0], %o3		! load word
11097c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
11107c478bdstevel@tonic-gate	stw	%o3, [%o1]		! store word
11117c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
11127c478bdstevel@tonic-gate	sub	%o2, 4, %o2		! reduce count by 4
11147c478bdstevel@tonic-gate!  Now long word aligned and have at least 32 bytes to move
11177c478bdstevel@tonic-gate	sub	%o2, 31, %o2		! adjust count to allow cc zero test
11197c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
11207c478bdstevel@tonic-gate	stx	%o3, [%o1]		! write long word
11217c478bdstevel@tonic-gate	subcc	%o2, 32, %o2		! reduce count by 32
11227c478bdstevel@tonic-gate	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
11237c478bdstevel@tonic-gate	add	%o0, 32, %o0		! advance SRC by 32
11247c478bdstevel@tonic-gate	stx	%o3, [%o1 + 8]
11257c478bdstevel@tonic-gate	ldx	[%o0 - 16], %o3
11267c478bdstevel@tonic-gate	add	%o1, 32, %o1		! advance DST by 32
11277c478bdstevel@tonic-gate	stx	%o3, [%o1 - 16]
11287c478bdstevel@tonic-gate	ldx	[%o0 - 8], %o3
11297c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
11307c478bdstevel@tonic-gate	  stx	%o3, [%o1 - 8]
11317c478bdstevel@tonic-gate	addcc	%o2, 24, %o2		! restore count to long word offset
11327c478bdstevel@tonic-gate	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
11337c478bdstevel@tonic-gate	  nop
11357c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
11367c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
11377c478bdstevel@tonic-gate	stx	%o3, [%o1]		! write long word
11387c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
11397c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
11407c478bdstevel@tonic-gate	  add	%o1, 8, %o1		! advance DST by 8
11427c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore rest of count
11437c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
11447c478bdstevel@tonic-gate	  deccc	%o2
11457c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
11467c478bdstevel@tonic-gate	  nop
11477c478bdstevel@tonic-gate	ba,pt	%ncc, .bc_sm_half
11487c478bdstevel@tonic-gate	  nop
11507c478bdstevel@tonic-gate	.align 16
11527c478bdstevel@tonic-gate	btst	3, %o0			! check for
11537c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_word1	! word alignment
11547c478bdstevel@tonic-gate	  nop
11567c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
11577c478bdstevel@tonic-gate	inc	%o0
11587c478bdstevel@tonic-gate	stb	%o3,[%o1]		! store byte
11597c478bdstevel@tonic-gate	inc	%o1
11607c478bdstevel@tonic-gate	btst	3, %o0
11617c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_word0
11627c478bdstevel@tonic-gate	  dec	%o2
11647c478bdstevel@tonic-gate!  Now word aligned and have at least 36 bytes to move
11677c478bdstevel@tonic-gate	sub	%o2, 15, %o2		! adjust count to allow cc zero test
11697c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
11707c478bdstevel@tonic-gate	stw	%o3, [%o1]		! write word
11717c478bdstevel@tonic-gate	subcc	%o2, 16, %o2		! reduce count by 16
11727c478bdstevel@tonic-gate	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
11737c478bdstevel@tonic-gate	add	%o0, 16, %o0		! advance SRC by 16
11747c478bdstevel@tonic-gate	stw	%o3, [%o1 + 4]
11757c478bdstevel@tonic-gate	lduw	[%o0 - 8], %o3
11767c478bdstevel@tonic-gate	add	%o1, 16, %o1		! advance DST by 16
11777c478bdstevel@tonic-gate	stw	%o3, [%o1 - 8]
11787c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3
11797c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
11807c478bdstevel@tonic-gate	  stw	%o3, [%o1 - 4]
11817c478bdstevel@tonic-gate	addcc	%o2, 12, %o2		! restore count to word offset
11827c478bdstevel@tonic-gate	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
11837c478bdstevel@tonic-gate	  nop
11857c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
11867c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
11877c478bdstevel@tonic-gate	stw	%o3, [%o1]		! write word
11887c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
11897c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
11907c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! advance DST by 4
11927c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore rest of count
11937c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
11947c478bdstevel@tonic-gate	  deccc	%o2
11957c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
11967c478bdstevel@tonic-gate	  nop
11977c478bdstevel@tonic-gate	ba,pt	%ncc, .bc_sm_half
11987c478bdstevel@tonic-gate	  nop
12007c478bdstevel@tonic-gate	.align 16
12027c478bdstevel@tonic-gate	btst	1, %o0			! check for
12037c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_half1	! half word alignment
12047c478bdstevel@tonic-gate	  nop
12057c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
12067c478bdstevel@tonic-gate	inc	%o0
12077c478bdstevel@tonic-gate	stb	%o3,[%o1]		! store byte
12087c478bdstevel@tonic-gate	inc	%o1
12097c478bdstevel@tonic-gate	dec	%o2
12117c478bdstevel@tonic-gate!  Now half word aligned and have at least 38 bytes to move
12147c478bdstevel@tonic-gate	sub	%o2, 7, %o2		! adjust count to allow cc zero test
12167c478bdstevel@tonic-gate	lduh	[%o0], %o3		! read half word
12177c478bdstevel@tonic-gate	sth	%o3, [%o1]		! write half word
12187c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
12197c478bdstevel@tonic-gate	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
12207c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
12217c478bdstevel@tonic-gate	sth	%o3, [%o1 + 2]
12227c478bdstevel@tonic-gate	lduh	[%o0 - 4], %o3
12237c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
12247c478bdstevel@tonic-gate	sth	%o3, [%o1 - 4]
12257c478bdstevel@tonic-gate	lduh	[%o0 - 2], %o3
12267c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
12277c478bdstevel@tonic-gate	  sth	%o3, [%o1 - 2]
12287c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
12297c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
12307c478bdstevel@tonic-gate	  deccc	%o2
12317c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
12327c478bdstevel@tonic-gate	  nop
12337c478bdstevel@tonic-gate	ba,pt	%ncc, .bc_sm_half
12347c478bdstevel@tonic-gate	  nop
12367c478bdstevel@tonic-gate	SET_SIZE(bcopy)
12397c478bdstevel@tonic-gate * The _more entry points are not intended to be used directly by
12407c478bdstevel@tonic-gate * any caller from outside this file.  They are provided to allow
12417c478bdstevel@tonic-gate * profiling and dtrace of the portions of the copy code that uses
12427c478bdstevel@tonic-gate * the floating point registers.
12437c478bdstevel@tonic-gate * This entry is particularly important as DTRACE (at least as of
12447c478bdstevel@tonic-gate * 4/2004) does not support leaf functions.
12457c478bdstevel@tonic-gate */
12477c478bdstevel@tonic-gate	ENTRY(bcopy_more)
	!
	! Windowed continuation of bcopy for copies large enough to use
	! the FP/VIS block-copy path.  %i0 = src, %i1 = dst, %i2 = count.
	! Saves the caller's t_lofault in %l6, installs .copyerr as the
	! fault handler, and tags %l6 with TRAMP_FLAG (and, once FP regs
	! are in use, FPUSED_FLAG) so the error path knows what to undo.
	!
	! NOTE(review): this copy of the file appears to have lost some
	! source lines -- branch targets such as .do_copy, .do_blockcopy,
	! .bc_blkalign and .bcb_exit are referenced but their label lines
	! are not visible, and the #if CHEETAH_PREFETCH blocks below show
	! no matching #endif lines.  Confirm against the original file
	! before assembling.
	!
12497c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
12507c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
12517c478bdstevel@tonic-gate	tst	%l6
12527c478bdstevel@tonic-gate	bz,pt	%ncc, .do_copy
12537c478bdstevel@tonic-gate	  nop
12547c478bdstevel@tonic-gate	sethi	%hi(.copyerr), %o2
12557c478bdstevel@tonic-gate	or	%o2, %lo(.copyerr), %o2
12567c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
12577c478bdstevel@tonic-gate	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
12587c478bdstevel@tonic-gate	!
12597c478bdstevel@tonic-gate	! We've already captured whether t_lofault was zero on entry.
12607c478bdstevel@tonic-gate	! We need to mark ourselves as being from bcopy since both
12617c478bdstevel@tonic-gate	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
12627c478bdstevel@tonic-gate	! and the saved lofault was zero, we won't reset lofault on
12637c478bdstevel@tonic-gate	! returning.
12647c478bdstevel@tonic-gate	!
12657c478bdstevel@tonic-gate	or	%l6, TRAMP_FLAG, %l6
12687c478bdstevel@tonic-gate * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
12697c478bdstevel@tonic-gate * Also, use of FP registers has been tested to be enabled
12707c478bdstevel@tonic-gate */
12727c478bdstevel@tonic-gate	FP_NOMIGRATE(6, 7)
	! Thread must not migrate to another CPU while FP regs are live.
12747c478bdstevel@tonic-gate	rd	%fprs, %o2		! check for unused fp
12757c478bdstevel@tonic-gate	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
12767c478bdstevel@tonic-gate	btst	FPRS_FEF, %o2
12777c478bdstevel@tonic-gate	bz,a,pt	%icc, .do_blockcopy
12787c478bdstevel@tonic-gate	  wr	%g0, FPRS_FEF, %fprs
	! FP was already enabled: preserve the FP registers this routine
	! clobbers (quadrants 1 and 3, per the macro name) on our frame.
12807c478bdstevel@tonic-gate	BST_FPQ1Q3_TOSTACK(%o2)
12837c478bdstevel@tonic-gate	rd	%gsr, %o2
12847c478bdstevel@tonic-gate	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
12857c478bdstevel@tonic-gate	or	%l6, FPUSED_FLAG, %l6
	! Register aliases for the block-copy loop below.
12877c478bdstevel@tonic-gate#define	REALSRC	%i0
12887c478bdstevel@tonic-gate#define	DST	%i1
12897c478bdstevel@tonic-gate#define	CNT	%i2
12907c478bdstevel@tonic-gate#define	SRC	%i3
12917c478bdstevel@tonic-gate#define	TMP	%i5
	! Step 1: byte-copy the leading bytes needed to bring DST up to a
	! VIS_BLOCKSIZE boundary (skip entirely if already aligned).
12937c478bdstevel@tonic-gate	andcc	DST, VIS_BLOCKSIZE - 1, TMP
12947c478bdstevel@tonic-gate	bz,pt	%ncc, 2f
12957c478bdstevel@tonic-gate	  neg	TMP
12967c478bdstevel@tonic-gate	add	TMP, VIS_BLOCKSIZE, TMP
12987c478bdstevel@tonic-gate	! TMP = bytes required to align DST on FP_BLOCK boundary
12997c478bdstevel@tonic-gate	! Using SRC as a tmp here
13007c478bdstevel@tonic-gate	cmp	TMP, 3
13017c478bdstevel@tonic-gate	bleu,pt	%ncc, 1f
13027c478bdstevel@tonic-gate	  sub	CNT,TMP,CNT		! adjust main count
13037c478bdstevel@tonic-gate	sub	TMP, 3, TMP		! adjust for end of loop test
13057c478bdstevel@tonic-gate	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
13067c478bdstevel@tonic-gate	stb	SRC, [DST]
13077c478bdstevel@tonic-gate	subcc	TMP, 4, TMP
13087c478bdstevel@tonic-gate	ldub	[REALSRC + 1], SRC
13097c478bdstevel@tonic-gate	add	REALSRC, 4, REALSRC
13107c478bdstevel@tonic-gate	stb	SRC, [DST + 1]
13117c478bdstevel@tonic-gate	ldub	[REALSRC - 2], SRC
13127c478bdstevel@tonic-gate	add	DST, 4, DST
13137c478bdstevel@tonic-gate	stb	SRC, [DST - 2]
13147c478bdstevel@tonic-gate	ldub	[REALSRC - 1], SRC
13157c478bdstevel@tonic-gate	bgu,pt	%ncc, .bc_blkalign
13167c478bdstevel@tonic-gate	  stb	SRC, [DST - 1]
13187c478bdstevel@tonic-gate	addcc	TMP, 3, TMP		! restore count adjustment
13197c478bdstevel@tonic-gate	bz,pt	%ncc, 2f		! no bytes left?
13207c478bdstevel@tonic-gate	  nop
13217c478bdstevel@tonic-gate1:	ldub	[REALSRC], SRC
13227c478bdstevel@tonic-gate	inc	REALSRC
13237c478bdstevel@tonic-gate	inc	DST
13247c478bdstevel@tonic-gate	deccc	TMP
13257c478bdstevel@tonic-gate	bgu	%ncc, 1b
13267c478bdstevel@tonic-gate	  stb	SRC, [DST - 1]
	! Step 2: DST is now block aligned.  Derive the 8-byte-aligned
	! source pointer and program %gsr for faligndata via alignaddr.
13297c478bdstevel@tonic-gate	andn	REALSRC, 0x7, SRC
13307c478bdstevel@tonic-gate	alignaddr REALSRC, %g0, %g0
13327c478bdstevel@tonic-gate	! SRC - 8-byte aligned
13337c478bdstevel@tonic-gate	! DST - 64-byte aligned
	! Step 3: prime the prefetch queue and preload/align the first
	! 64-byte block into %f32-%f46 before entering the main loop.
13347c478bdstevel@tonic-gate	prefetch [SRC], #one_read
13357c478bdstevel@tonic-gate	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
13367c478bdstevel@tonic-gate	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
13377c478bdstevel@tonic-gate	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
13387c478bdstevel@tonic-gate	ldd	[SRC], %f0
13397c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 4
13407c478bdstevel@tonic-gate	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
13427c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
13437c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 5
13447c478bdstevel@tonic-gate	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
13467c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
13477c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 6
13487c478bdstevel@tonic-gate	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
13507c478bdstevel@tonic-gate	faligndata %f0, %f2, %f32
13517c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
13527c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 7
13537c478bdstevel@tonic-gate	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
13557c478bdstevel@tonic-gate	faligndata %f2, %f4, %f34
13567c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
13577c478bdstevel@tonic-gate	faligndata %f4, %f6, %f36
13587c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
13597c478bdstevel@tonic-gate	faligndata %f6, %f8, %f38
13607c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
13617c478bdstevel@tonic-gate	faligndata %f8, %f10, %f40
13627c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
13637c478bdstevel@tonic-gate	faligndata %f10, %f12, %f42
13647c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
13657c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
13667c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
13677c478bdstevel@tonic-gate	add	REALSRC, VIS_BLOCKSIZE, REALSRC
13687c478bdstevel@tonic-gate	ba,a,pt	%ncc, 1f
13697c478bdstevel@tonic-gate	  nop
13707c478bdstevel@tonic-gate	.align	16
	! Main loop: each iteration block-stores the 64 bytes aligned on
	! the previous pass (%f32-%f46) while loading and aligning the
	! next 64 bytes.  NOTE(review): the loop label ("1:") expected at
	! the top of this loop is not visible in this copy of the file.
13727c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
13737c478bdstevel@tonic-gate	faligndata %f12, %f14, %f44
13747c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
13757c478bdstevel@tonic-gate	faligndata %f14, %f0, %f46
13767c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
13777c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
13787c478bdstevel@tonic-gate	faligndata %f0, %f2, %f32
13797c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
13807c478bdstevel@tonic-gate	faligndata %f2, %f4, %f34
13817c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
13827c478bdstevel@tonic-gate	faligndata %f4, %f6, %f36
13837c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
13847c478bdstevel@tonic-gate	faligndata %f6, %f8, %f38
13857c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
13867c478bdstevel@tonic-gate	faligndata %f8, %f10, %f40
13877c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
13887c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
13897c478bdstevel@tonic-gate	faligndata %f10, %f12, %f42
13907c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
13917c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
13927c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
13937c478bdstevel@tonic-gate	add	REALSRC, VIS_BLOCKSIZE, REALSRC
13947c478bdstevel@tonic-gate	cmp	CNT, VIS_BLOCKSIZE + 8
13957c478bdstevel@tonic-gate	bgu,pt	%ncc, 1b
13967c478bdstevel@tonic-gate	  add	SRC, VIS_BLOCKSIZE, SRC
	! Loop exit: decide how to finish.  If exactly one block remains
	! and REALSRC turned out to be 8-byte aligned, take the fsrc1
	! (no-realign) path at 2:; otherwise flush the pending block and
	! byte-copy the remainder at 3:/5:.
13987c478bdstevel@tonic-gate	! only if REALSRC & 0x7 is 0
13997c478bdstevel@tonic-gate	cmp	CNT, VIS_BLOCKSIZE
14007c478bdstevel@tonic-gate	bne	%ncc, 3f
14017c478bdstevel@tonic-gate	  andcc	REALSRC, 0x7, %g0
14027c478bdstevel@tonic-gate	bz,pt	%ncc, 2f
14037c478bdstevel@tonic-gate	  nop
	! Flush the last aligned block already staged in %f32-%f46.
14057c478bdstevel@tonic-gate	faligndata %f12, %f14, %f44
14067c478bdstevel@tonic-gate	faligndata %f14, %f0, %f46
14077c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
14087c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
14097c478bdstevel@tonic-gate	ba,pt	%ncc, 3f
14107c478bdstevel@tonic-gate	  nop
	! 8-byte-aligned source: copy the final two blocks with fsrc1
	! (straight moves, no realignment needed).  NOTE(review): the
	! "2:" label line is not visible in this copy of the file.
14127c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
14137c478bdstevel@tonic-gate	fsrc1	%f12, %f44
14147c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
14157c478bdstevel@tonic-gate	fsrc1	%f14, %f46
14167c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
14177c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
14187c478bdstevel@tonic-gate	fsrc1	%f0, %f32
14197c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
14207c478bdstevel@tonic-gate	fsrc1	%f2, %f34
14217c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
14227c478bdstevel@tonic-gate	fsrc1	%f4, %f36
14237c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
14247c478bdstevel@tonic-gate	fsrc1	%f6, %f38
14257c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
14267c478bdstevel@tonic-gate	fsrc1	%f8, %f40
14277c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
14287c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
14297c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
14307c478bdstevel@tonic-gate	add	REALSRC, VIS_BLOCKSIZE, REALSRC
14317c478bdstevel@tonic-gate	fsrc1	%f10, %f42
14327c478bdstevel@tonic-gate	fsrc1	%f12, %f44
14337c478bdstevel@tonic-gate	fsrc1	%f14, %f46
14347c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
14357c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
14367c478bdstevel@tonic-gate	ba,a,pt	%ncc, .bcb_exit
14377c478bdstevel@tonic-gate	  nop
	! Byte-copy any residual tail (< one block).
14397c478bdstevel@tonic-gate3:	tst	CNT
14407c478bdstevel@tonic-gate	bz,a,pt	%ncc, .bcb_exit
14417c478bdstevel@tonic-gate	  nop
14437c478bdstevel@tonic-gate5:	ldub	[REALSRC], TMP
14447c478bdstevel@tonic-gate	inc	REALSRC
14457c478bdstevel@tonic-gate	inc	DST
14467c478bdstevel@tonic-gate	deccc	CNT
14477c478bdstevel@tonic-gate	bgu	%ncc, 5b
14487c478bdstevel@tonic-gate	  stb	TMP, [DST - 1]
	! Exit path (.bcb_exit): fp-ras integrity check of the FP copy,
	! then restore %gsr, %fprs and the saved FP registers.
14507c478bdstevel@tonic-gate	membar	#Sync
14527c478bdstevel@tonic-gate	FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
14537c478bdstevel@tonic-gate	FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
14547c478bdstevel@tonic-gate	FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)	! outputs lost
14567c478bdstevel@tonic-gate	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
14577c478bdstevel@tonic-gate	wr	%o2, 0, %gsr
14597c478bdstevel@tonic-gate	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
14607c478bdstevel@tonic-gate	btst	FPRS_FEF, %o3
14617c478bdstevel@tonic-gate	bz,pt	%icc, 4f
14627c478bdstevel@tonic-gate	  nop
	! FP was live on entry: reload the caller's FP registers.
14647c478bdstevel@tonic-gate	BLD_FPQ1Q3_FROMSTACK(%o2)
14667c478bdstevel@tonic-gate	ba,pt	%ncc, 2f
14677c478bdstevel@tonic-gate	  wr	%o3, 0, %fprs		! restore fprs
	! FP was clean on entry: zero the quadrants we dirtied instead.
	! NOTE(review): the "4:" label line is not visible in this copy.
14697c478bdstevel@tonic-gate	FZEROQ1Q3
14707c478bdstevel@tonic-gate	wr	%o3, 0, %fprs		! restore fprs
	! Common return: restore t_lofault (minus our flags), re-enable
	! migration, return 0.  NOTE(review): "2:" label line not visible.
14727c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
14737c478bdstevel@tonic-gate	andn	%l6, MASK_FLAGS, %l6
14747c478bdstevel@tonic-gate	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
14757c478bdstevel@tonic-gate	FP_ALLOWMIGRATE(5, 6)
14767c478bdstevel@tonic-gate	ret
14777c478bdstevel@tonic-gate	  restore	%g0, 0, %o0
14797c478bdstevel@tonic-gate	SET_SIZE(bcopy_more)
14827c478bdstevel@tonic-gate * Block copy with possibly overlapped operands.
14837c478bdstevel@tonic-gate */
14857c478bdstevel@tonic-gate	ENTRY(ovbcopy)
	!
	! ovbcopy(from %o0, to %o1, count %o2)
	! Byte copy that tolerates overlapping operands: if the regions
	! cannot overlap (count <= |from - to|) it tail-calls bcopy;
	! otherwise it copies one byte at a time, forwards when from > to
	! and backwards (from the end) when from < to.
	!
	! NOTE(review): the "1:", ".ov_fwd:" and ".ov_bkwd:" label lines
	! referenced below are not visible in this copy of the file
	! (annotation extraction appears to have dropped them) -- confirm
	! against the original source.
	!
14867c478bdstevel@tonic-gate	tst	%o2			! check count
14877c478bdstevel@tonic-gate	bgu,a	%ncc, 1f		! nothing to do or bad arguments
14887c478bdstevel@tonic-gate	  subcc	%o0, %o1, %o3		! difference of from and to address
	! count == 0 (or treated as unsigned <= 0): nothing to copy.
14907c478bdstevel@tonic-gate	retl				! return
14917c478bdstevel@tonic-gate	  nop
14937c478bdstevel@tonic-gate	bneg,a	%ncc, 2f
14947c478bdstevel@tonic-gate	  neg	%o3			! if < 0, make it positive
14957c478bdstevel@tonic-gate2:	cmp	%o2, %o3		! cmp size and abs(from - to)
14967c478bdstevel@tonic-gate	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
14977c478bdstevel@tonic-gate	  .empty				!   no overlap
14987c478bdstevel@tonic-gate	  cmp	%o0, %o1		! compare from and to addresses
14997c478bdstevel@tonic-gate	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
15007c478bdstevel@tonic-gate	  nop
15017c478bdstevel@tonic-gate	!
15027c478bdstevel@tonic-gate	! Copy forwards.
15037c478bdstevel@tonic-gate	!
15057c478bdstevel@tonic-gate	ldub	[%o0], %o3		! read from address
15067c478bdstevel@tonic-gate	inc	%o0			! inc from address
15077c478bdstevel@tonic-gate	stb	%o3, [%o1]		! write to address
15087c478bdstevel@tonic-gate	deccc	%o2			! dec count
15097c478bdstevel@tonic-gate	bgu	%ncc, .ov_fwd		! loop till done
15107c478bdstevel@tonic-gate	  inc	%o1			! inc to address
15127c478bdstevel@tonic-gate	retl				! return
15137c478bdstevel@tonic-gate	  nop
15147c478bdstevel@tonic-gate	!
15157c478bdstevel@tonic-gate	! Copy backwards.
15167c478bdstevel@tonic-gate	!
15187c478bdstevel@tonic-gate	deccc	%o2			! dec count
15197c478bdstevel@tonic-gate	ldub	[%o0 + %o2], %o3	! get byte at end of src
15207c478bdstevel@tonic-gate	bgu	%ncc, .ov_bkwd		! loop till done
15217c478bdstevel@tonic-gate	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
15237c478bdstevel@tonic-gate	retl				! return
15247c478bdstevel@tonic-gate	  nop
15267c478bdstevel@tonic-gate	SET_SIZE(ovbcopy)
15307c478bdstevel@tonic-gate * hwblkpagecopy()
15317c478bdstevel@tonic-gate *
15327c478bdstevel@tonic-gate * Copies exactly one page.  This routine assumes the caller (ppcopy)
15337c478bdstevel@tonic-gate * has already disabled kernel preemption and has checked
15347c478bdstevel@tonic-gate * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
15357c478bdstevel@tonic-gate */
15367c478bdstevel@tonic-gate	ENTRY(hwblkpagecopy)
	!
	! Copies exactly PAGESIZE bytes from %i0 to %i1 using VIS block
	! stores (64-byte ASI_BLK_P), returning 0.  Caller (ppcopy) has
	! already disabled preemption, so no FP_NOMIGRATE is needed here.
	! Both addresses are assumed block aligned -- the code does no
	! alignment fixup (only fsrc1 moves, no faligndata).
	!
	! NOTE(review): this copy of the file appears to have lost some
	! lines -- the #if CHEETAH_PREFETCH blocks show no matching
	! #endif lines, and the main loop's "2:" label line is not
	! visible.  Confirm against the original before assembling.
	!
15377c478bdstevel@tonic-gate	! get another window w/space for three aligned blocks of saved fpregs
15387c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
15407c478bdstevel@tonic-gate	! %i0 - source address (arg)
15417c478bdstevel@tonic-gate	! %i1 - destination address (arg)
15427c478bdstevel@tonic-gate	! %i2 - length of region (not arg)
15437c478bdstevel@tonic-gate	! %l0 - saved fprs
15447c478bdstevel@tonic-gate	! %l1 - pointer to saved fpregs
15467c478bdstevel@tonic-gate	rd	%fprs, %l0		! check for unused fp
15477c478bdstevel@tonic-gate	btst	FPRS_FEF, %l0
15487c478bdstevel@tonic-gate	bz,a,pt	%icc, 1f
15497c478bdstevel@tonic-gate	  wr	%g0, FPRS_FEF, %fprs
	! FP was live: save the FP registers we are about to clobber.
15517c478bdstevel@tonic-gate	BST_FPQ1Q3_TOSTACK(%l1)
15537c478bdstevel@tonic-gate1:	set	PAGESIZE, CNT
15547c478bdstevel@tonic-gate	mov	REALSRC, SRC
	! Prime the prefetch queue and preload the first block into
	! %f32-%f46 before entering the main loop.
15567c478bdstevel@tonic-gate	prefetch [SRC], #one_read
15577c478bdstevel@tonic-gate	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
15587c478bdstevel@tonic-gate	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
15597c478bdstevel@tonic-gate	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
15607c478bdstevel@tonic-gate	ldd	[SRC], %f0
15617c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 4
15627c478bdstevel@tonic-gate	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
15647c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
15657c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 5
15667c478bdstevel@tonic-gate	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
15687c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
15697c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 6
15707c478bdstevel@tonic-gate	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
15727c478bdstevel@tonic-gate	fsrc1	%f0, %f32
15737c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
15747c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 7
15757c478bdstevel@tonic-gate	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
15777c478bdstevel@tonic-gate	fsrc1	%f2, %f34
15787c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
15797c478bdstevel@tonic-gate	fsrc1	%f4, %f36
15807c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
15817c478bdstevel@tonic-gate	fsrc1	%f6, %f38
15827c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
15837c478bdstevel@tonic-gate	fsrc1	%f8, %f40
15847c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
15857c478bdstevel@tonic-gate	fsrc1	%f10, %f42
15867c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
15877c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
15887c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
15897c478bdstevel@tonic-gate	ba,a,pt	%ncc, 2f
15907c478bdstevel@tonic-gate	  nop
15917c478bdstevel@tonic-gate	.align	16
	! Main loop: store the previously staged 64-byte block while
	! loading the next one.  NOTE(review): the "2:" loop label
	! expected here is not visible in this copy of the file.
15937c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
15947c478bdstevel@tonic-gate	fsrc1	%f12, %f44
15957c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
15967c478bdstevel@tonic-gate	fsrc1	%f14, %f46
15977c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
15987c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
15997c478bdstevel@tonic-gate	fsrc1	%f0, %f32
16007c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
16017c478bdstevel@tonic-gate	fsrc1	%f2, %f34
16027c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
16037c478bdstevel@tonic-gate	fsrc1	%f4, %f36
16047c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
16057c478bdstevel@tonic-gate	fsrc1	%f6, %f38
16067c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
16077c478bdstevel@tonic-gate	fsrc1	%f8, %f40
16087c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
16097c478bdstevel@tonic-gate	fsrc1	%f10, %f42
16107c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
16117c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
16127c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
16137c478bdstevel@tonic-gate	cmp	CNT, VIS_BLOCKSIZE + 8
16147c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
16157c478bdstevel@tonic-gate	bgu,pt	%ncc, 2b
16167c478bdstevel@tonic-gate	  add	SRC, VIS_BLOCKSIZE, SRC
16187c478bdstevel@tonic-gate	! trailing block
16197c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
16207c478bdstevel@tonic-gate	fsrc1	%f12, %f44
16217c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
16227c478bdstevel@tonic-gate	fsrc1	%f14, %f46
16237c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
16247c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
16257c478bdstevel@tonic-gate	fsrc1	%f0, %f32
16267c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
16277c478bdstevel@tonic-gate	fsrc1	%f2, %f34
16287c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
16297c478bdstevel@tonic-gate	fsrc1	%f4, %f36
16307c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
16317c478bdstevel@tonic-gate	fsrc1	%f6, %f38
16327c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
16337c478bdstevel@tonic-gate	fsrc1	%f8, %f40
16347c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
16357c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
16367c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
16377c478bdstevel@tonic-gate	fsrc1	%f10, %f42
16387c478bdstevel@tonic-gate	fsrc1	%f12, %f44
16397c478bdstevel@tonic-gate	fsrc1	%f14, %f46
16407c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
	! fp-ras integrity check of the FP copy, then restore FP state.
16427c478bdstevel@tonic-gate	membar	#Sync
16447c478bdstevel@tonic-gate	FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
16457c478bdstevel@tonic-gate	FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
16467c478bdstevel@tonic-gate	FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)	! lose outputs
16487c478bdstevel@tonic-gate	btst	FPRS_FEF, %l0
16497c478bdstevel@tonic-gate	bz,pt	%icc, 2f
16507c478bdstevel@tonic-gate	  nop
	! FP was live on entry: reload the caller's FP registers.
16527c478bdstevel@tonic-gate	BLD_FPQ1Q3_FROMSTACK(%l3)
16537c478bdstevel@tonic-gate	ba	3f
16547c478bdstevel@tonic-gate	  nop
	! FP was clean on entry: zero the quadrants we dirtied instead.
16567c478bdstevel@tonic-gate2:	FZEROQ1Q3
16587c478bdstevel@tonic-gate3:	wr	%l0, 0, %fprs		! restore fprs
16597c478bdstevel@tonic-gate	ret
16607c478bdstevel@tonic-gate	  restore	%g0, 0, %o0
16627c478bdstevel@tonic-gate	SET_SIZE(hwblkpagecopy)
16667c478bdstevel@tonic-gate * Transfer data to and from user space -
16677c478bdstevel@tonic-gate * Note that these routines can cause faults
16687c478bdstevel@tonic-gate * It is assumed that the kernel has nothing at
16697c478bdstevel@tonic-gate * less than KERNELBASE in the virtual address space.
16707c478bdstevel@tonic-gate *
16717c478bdstevel@tonic-gate * Note that copyin(9F) and copyout(9F) are part of the
16727c478bdstevel@tonic-gate * DDI/DKI which specifies that they return '-1' on "errors."
16737c478bdstevel@tonic-gate *
16747c478bdstevel@tonic-gate * Sigh.
16757c478bdstevel@tonic-gate *
16767c478bdstevel@tonic-gate * So there's two extremely similar routines - xcopyin() and xcopyout()
16777c478bdstevel@tonic-gate * which return the errno that we've faithfully computed.  This
16787c478bdstevel@tonic-gate * allows other callers (e.g. uiomove(9F)) to work correctly.
16797c478bdstevel@tonic-gate * Given that these are used pretty heavily, we expand the calling
16807c478bdstevel@tonic-gate * sequences inline for all flavours (rather than making wrappers).
16817c478bdstevel@tonic-gate *
16827c478bdstevel@tonic-gate * There are also stub routines for xcopyout_little and xcopyin_little,
16837c478bdstevel@tonic-gate * which currently are intended to handle requests of <= 16 bytes from
16847c478bdstevel@tonic-gate * do_unaligned. Future enhancement to make them handle 8k pages efficiently
16857c478bdstevel@tonic-gate * is left as an exercise...
16867c478bdstevel@tonic-gate */
16897c478bdstevel@tonic-gate * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
16907c478bdstevel@tonic-gate *
16917c478bdstevel@tonic-gate * General theory of operation:
16927c478bdstevel@tonic-gate *
16937c478bdstevel@tonic-gate * The only difference between copy{in,out} and
16947c478bdstevel@tonic-gate * xcopy{in,out} is in the error handling routine they invoke
16957c478bdstevel@tonic-gate * when a memory access error occurs. xcopyOP returns the errno
16967c478bdstevel@tonic-gate * while copyOP returns -1 (see above). copy{in,out}_noerr set
16977c478bdstevel@tonic-gate * a special flag (by oring the TRAMP_FLAG into the fault handler address)
16987c478bdstevel@tonic-gate * if they are called with a fault handler already in place. That flag
16997c478bdstevel@tonic-gate * causes the default handlers to trampoline to the previous handler
17007c478bdstevel@tonic-gate * upon an error.
17017c478bdstevel@tonic-gate *
17027c478bdstevel@tonic-gate * None of the copyops routines grab a window until it's decided that
17037c478bdstevel@tonic-gate * we need to do a HW block copy operation. This saves a window
17047c478bdstevel@tonic-gate * spill/fill when we're called during socket ops. The typical IO
17057c478bdstevel@tonic-gate * path won't cause spill/fill traps.
17067c478bdstevel@tonic-gate *
17077c478bdstevel@tonic-gate * This code uses a set of 4 limits for the maximum size that will
17087c478bdstevel@tonic-gate * be copied given a particular input/output address alignment.
17097c478bdstevel@tonic-gate * If the value for a particular limit is zero, the copy will be performed
17107c478bdstevel@tonic-gate * by the plain copy loops rather than FPBLK.
17117c478bdstevel@tonic-gate *
17127c478bdstevel@tonic-gate * See the description of bcopy above for more details of the
17137c478bdstevel@tonic-gate * data copying algorithm and the default limits.
17147c478bdstevel@tonic-gate *
17157c478bdstevel@tonic-gate */
/*
17187c478bdstevel@tonic-gate * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
17197c478bdstevel@tonic-gate */
/*
17227c478bdstevel@tonic-gate * We save the arguments in the following registers in case of a fault:
17237c478bdstevel@tonic-gate *	kaddr - %l1
17247c478bdstevel@tonic-gate *	uaddr - %l2
17257c478bdstevel@tonic-gate *	count - %l3
17267c478bdstevel@tonic-gate */
17277c478bdstevel@tonic-gate#define SAVE_SRC	%l1
17287c478bdstevel@tonic-gate#define SAVE_DST	%l2
17297c478bdstevel@tonic-gate#define SAVE_COUNT	%l3
17317c478bdstevel@tonic-gate#define SM_SAVE_SRC		%g4
17327c478bdstevel@tonic-gate#define SM_SAVE_DST		%g5
17337c478bdstevel@tonic-gate#define SM_SAVE_COUNT		%o5
17347c478bdstevel@tonic-gate#define ERRNO		%l5
17377c478bdstevel@tonic-gate#define REAL_LOFAULT	%l4
/*
17397c478bdstevel@tonic-gate * Generic copyio fault handler.  This is the first line of defense when a
17407c478bdstevel@tonic-gate * fault occurs in (x)copyin/(x)copyout.  In order for this to function
17417c478bdstevel@tonic-gate * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
17427c478bdstevel@tonic-gate * This allows us to share common code for all the flavors of the copy
17437c478bdstevel@tonic-gate * operations, including the _noerr versions.
17447c478bdstevel@tonic-gate *
17457c478bdstevel@tonic-gate * Note that this function will restore the original input parameters before
17467c478bdstevel@tonic-gate * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
17477c478bdstevel@tonic-gate * member of the t_copyop structure, if needed.
17487c478bdstevel@tonic-gate */
/*
 * copyio_fault: common t_lofault handler for the "large" (windowed)
 * (x)copyin/(x)copyout paths.  Entered from trap code with the errno
 * value in %g1.  The copy routine has already done a save, so:
 *	%l6 = previous t_lofault value, possibly OR'ed with FPUSED_FLAG
 *	%l1/%l2/%l3 (SAVE_SRC/SAVE_DST/SAVE_COUNT) = original arguments
 *	%l4 (REAL_LOFAULT) = the flavor-specific error routine
 * If the FP registers were in use, restores %gsr/%fprs and either the
 * saved FP quads or zeroed quads, then restores the old t_lofault,
 * re-establishes the original (src, dst, count) in %i0-%i2 and jumps
 * to REAL_LOFAULT so it can vector through t_copyops if needed.
 *
 * NOTE(review): the numeric local label lines (4:, 1:) targeted by the
 * branches below are missing from this annotated listing — presumably
 * lost in extraction; their inferred positions are marked with "!"
 * comments.  Confirm against the complete source file.
 */
17497c478bdstevel@tonic-gate	ENTRY(copyio_fault)
17507c478bdstevel@tonic-gate	membar	#Sync
17517c478bdstevel@tonic-gate	mov	%g1,ERRNO			! save errno in ERRNO
17527c478bdstevel@tonic-gate	btst	FPUSED_FLAG, %l6
17537c478bdstevel@tonic-gate	bz	%ncc, 1f			! FP not used: skip FP restore
17547c478bdstevel@tonic-gate	  nop
17567c478bdstevel@tonic-gate	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
17577c478bdstevel@tonic-gate	wr	%o2, 0, %gsr    	! restore gsr
17597c478bdstevel@tonic-gate	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
17607c478bdstevel@tonic-gate	btst	FPRS_FEF, %o3
17617c478bdstevel@tonic-gate	bz,pt	%icc, 4f		! FP was clean on entry: just zero it
17627c478bdstevel@tonic-gate	  nop
17647c478bdstevel@tonic-gate	BLD_FPQ2Q4_FROMSTACK(%o2)
17667c478bdstevel@tonic-gate	ba,pt	%ncc, 1f
17677c478bdstevel@tonic-gate	  wr	%o3, 0, %fprs   	! restore fprs
	! 4: (label line missing from this listing — confirm in full source)
17707c478bdstevel@tonic-gate	FZEROQ2Q4
17717c478bdstevel@tonic-gate	wr	%o3, 0, %fprs   	! restore fprs
	! 1: (label line missing from this listing — confirm in full source)
17747c478bdstevel@tonic-gate	andn	%l6, FPUSED_FLAG, %l6
17757c478bdstevel@tonic-gate	membar	#Sync
17767c478bdstevel@tonic-gate	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
17777c478bdstevel@tonic-gate	FP_ALLOWMIGRATE(5, 6)
17797c478bdstevel@tonic-gate	mov	SAVE_SRC, %i0
17807c478bdstevel@tonic-gate	mov	SAVE_DST, %i1
17817c478bdstevel@tonic-gate	jmp	REAL_LOFAULT
17827c478bdstevel@tonic-gate	  mov	SAVE_COUNT, %i2
17847c478bdstevel@tonic-gate	SET_SIZE(copyio_fault)
/*
 * copyout(const void *kaddr, void *uaddr, size_t count)
 *	%o0 = kernel source, %o1 = user destination, %o2 = byte count.
 *
 * Copy kernel data to user space.  Returns 0 on success; on a fault
 * the small-copy error path restores the original arguments and
 * vectors through t_copyops->cp_copyout if installed, else returns -1
 * (per DDI/DKI).  Copies at or below VIS_COPY_THRESHOLD, or below the
 * tunable hw_copy_limit_{1,2,4,8} for the detected src/dst mutual
 * alignment (a limit of 0 disables the HW path), are done here as a
 * leaf routine with ASI_USER alternate-space stores; larger copies
 * branch to copyout_more for the FP/VIS block-copy path.
 *
 * NOTE(review): many local label lines (.copyout_{2,4,8},
 * .copyout_small, .co_sm_*, .co_med_*, .sm_copyout_err, 3:) are
 * missing from this annotated listing — presumably dropped during
 * extraction.  Their inferred positions are marked below with
 * "! label:" comments; confirm against the complete source file.
 */
18877c478bdstevel@tonic-gate	ENTRY(copyout)
18897c478bdstevel@tonic-gate	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
18907c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small-copy cases
18917c478bdstevel@tonic-gate	  xor	%o0, %o1, %o3			! are src, dst alignable?
18927c478bdstevel@tonic-gate	btst	7, %o3				! mutually long-alignable?
18937c478bdstevel@tonic-gate	bz,pt	%ncc, .copyout_8		! check for longword alignment
18947c478bdstevel@tonic-gate	  nop
18957c478bdstevel@tonic-gate	btst	1, %o3				! only byte-alignable?
18967c478bdstevel@tonic-gate	bz,pt	%ncc, .copyout_2		! check for half-word
18977c478bdstevel@tonic-gate	  nop
18987c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
18997c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
18007c478bdstevel@tonic-gate	tst	%o3
18017c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18027c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18037c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18047c478bdstevel@tonic-gate	  nop
18057c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18067c478bdstevel@tonic-gate	  nop
	! .copyout_2: (label line missing from this listing)
18087c478bdstevel@tonic-gate	btst	3, %o3				! mutually word-alignable?
18097c478bdstevel@tonic-gate	bz,pt	%ncc, .copyout_4		! check for word alignment
18107c478bdstevel@tonic-gate	  nop
18117c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
18127c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
18137c478bdstevel@tonic-gate	tst	%o3
18147c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18157c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18167c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18177c478bdstevel@tonic-gate	  nop
18187c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18197c478bdstevel@tonic-gate	  nop
	! .copyout_4: (label line missing from this listing)
18217c478bdstevel@tonic-gate	! already checked longword, must be word aligned
18227c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
18237c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
18247c478bdstevel@tonic-gate	tst	%o3
18257c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18267c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18277c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18287c478bdstevel@tonic-gate	  nop
18297c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18307c478bdstevel@tonic-gate	  nop
	! .copyout_8: (label line missing from this listing)
18327c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
18337c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
18347c478bdstevel@tonic-gate	tst	%o3
18357c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18367c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18377c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18387c478bdstevel@tonic-gate	  nop
18397c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18407c478bdstevel@tonic-gate	  nop
18427c478bdstevel@tonic-gate	.align	16
18437c478bdstevel@tonic-gate	nop				! instruction alignment
18447c478bdstevel@tonic-gate					! see discussion at start of file
	! .copyout_small: (label line missing from this listing)
	! Leaf-routine small copy: install .sm_copyout_err as lofault,
	! stash args in globals (SM_SAVE_*), then dispatch by size/alignment.
18467c478bdstevel@tonic-gate	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
18477c478bdstevel@tonic-gate	or	%o5, %lo(.sm_copyout_err), %o5
18487c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
18497c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
18507c478bdstevel@tonic-gate	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
18527c478bdstevel@tonic-gate	mov	%o0, SM_SAVE_SRC
18537c478bdstevel@tonic-gate	mov	%o1, SM_SAVE_DST
18547c478bdstevel@tonic-gate	cmp	%o2, SHORTCOPY		! check for really short case
18557c478bdstevel@tonic-gate	bleu,pt	%ncc, .co_sm_left	!
18567c478bdstevel@tonic-gate	  mov	%o2, SM_SAVE_COUNT
18577c478bdstevel@tonic-gate	cmp	%o2, CHKSIZE		! check for medium length cases
18587c478bdstevel@tonic-gate	bgu,pn	%ncc, .co_med		!
18597c478bdstevel@tonic-gate	  or	%o0, %o1, %o3		! prepare alignment check
18607c478bdstevel@tonic-gate	andcc	%o3, 0x3, %g0		! test for alignment
18617c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
18637c478bdstevel@tonic-gate	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
	! .co_sm_notalign4: (label line missing from this listing)
18657c478bdstevel@tonic-gate	ldub	[%o0], %o3		! read byte
18667c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
18677c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER	! write byte
18687c478bdstevel@tonic-gate	inc	%o1			! advance DST by 1
18697c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
18707c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
18717c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
18727c478bdstevel@tonic-gate	inc	%o1			! advance DST by 1
18737c478bdstevel@tonic-gate	ldub	[%o0 - 2], %o3
18747c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
18757c478bdstevel@tonic-gate	inc	%o1			! advance DST by 1
18767c478bdstevel@tonic-gate	ldub	[%o0 - 1], %o3
18777c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
18787c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
18797c478bdstevel@tonic-gate	  inc	%o1			! advance DST by 1
18807c478bdstevel@tonic-gate	add	%o2, 3, %o2		! restore count
	! .co_sm_left: (label line missing from this listing) — 0-3 bytes left
18827c478bdstevel@tonic-gate	tst	%o2
18837c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit	! check for zero length
18847c478bdstevel@tonic-gate	  nop
18857c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
18867c478bdstevel@tonic-gate	deccc	%o2			! reduce count for cc test
18877c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
18887c478bdstevel@tonic-gate	  stba	%o3,[%o1]ASI_USER	! store one byte
18897c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! load second byte
18907c478bdstevel@tonic-gate	deccc	%o2
18917c478bdstevel@tonic-gate	inc	%o1
18927c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
18937c478bdstevel@tonic-gate	  stba	%o3,[%o1]ASI_USER	! store second byte
18947c478bdstevel@tonic-gate	ldub	[%o0 + 2], %o3		! load third byte
18957c478bdstevel@tonic-gate	inc	%o1
18967c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store third byte
18977c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
18987c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
18997c478bdstevel@tonic-gate	retl
19007c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
19017c478bdstevel@tonic-gate	.align	16
	! .co_sm_words: (label line missing from this listing)
19037c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
	! .co_sm_wordx: (label line missing from this listing)
19057c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! update count
19067c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
19077c478bdstevel@tonic-gate	add	%o0, 8, %o0		! update SRC
19087c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3		! read word
19097c478bdstevel@tonic-gate	add	%o1, 4, %o1		! update DST
19107c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
19117c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_words	! loop til done
19127c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! update DST
19137c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
19147c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19157c478bdstevel@tonic-gate	  nop
19167c478bdstevel@tonic-gate	deccc	%o2
19177c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
	! .co_sm_half: (label line missing from this listing)
19197c478bdstevel@tonic-gate	  subcc	%o2, 2, %o2		! reduce count by 2
19207c478bdstevel@tonic-gate	lduh	[%o0], %o3		! read half word
19217c478bdstevel@tonic-gate	add	%o0, 2, %o0		! advance SRC by 2
19227c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER	! write half word
19237c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_half	! loop til done
19247c478bdstevel@tonic-gate	  add	%o1, 2, %o1		! advance DST by 2
19257c478bdstevel@tonic-gate	addcc	%o2, 1, %o2		! restore count
19267c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19277c478bdstevel@tonic-gate	  nop
	! .co_sm_byte: (label line missing from this listing)
19297c478bdstevel@tonic-gate	ldub	[%o0], %o3
19307c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
19317c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
19327c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
19337c478bdstevel@tonic-gate	retl
19347c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
19357c478bdstevel@tonic-gate	.align 16
	! .co_sm_word: (label line missing from this listing)
19377c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! update count
19387c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_wordx
19397c478bdstevel@tonic-gate	  lduw	[%o0], %o3		! read word
19407c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore count
19417c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19427c478bdstevel@tonic-gate	  stwa	%o3, [%o1]ASI_USER	! write word
19437c478bdstevel@tonic-gate	deccc	%o2			! reduce count for cc test
19447c478bdstevel@tonic-gate	ldub	[%o0 + 4], %o3		! load one byte
19457c478bdstevel@tonic-gate	add	%o1, 4, %o1
19467c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19477c478bdstevel@tonic-gate	  stba	%o3, [%o1]ASI_USER	! store one byte
19487c478bdstevel@tonic-gate	ldub	[%o0 + 5], %o3		! load second byte
19497c478bdstevel@tonic-gate	deccc	%o2
19507c478bdstevel@tonic-gate	inc	%o1
19517c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19527c478bdstevel@tonic-gate	  stba	%o3, [%o1]ASI_USER	! store second byte
19537c478bdstevel@tonic-gate	ldub	[%o0 + 6], %o3		! load third byte
19547c478bdstevel@tonic-gate	inc	%o1
19557c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER	! store third byte
	! .co_sm_exit: (label line missing from this listing)
19577c478bdstevel@tonic-gate	  membar	#Sync				! sync error barrier
19587c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
19597c478bdstevel@tonic-gate	retl
19607c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
19627c478bdstevel@tonic-gate	.align 16
	! .co_med: (label line missing from this listing) — medium-size copy
19647c478bdstevel@tonic-gate	xor	%o0, %o1, %o3		! setup alignment check
19657c478bdstevel@tonic-gate	btst	1, %o3
19667c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
19677c478bdstevel@tonic-gate	  nop
19687c478bdstevel@tonic-gate	btst	3, %o3
19697c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_half	! halfword aligned
19707c478bdstevel@tonic-gate	  nop
19717c478bdstevel@tonic-gate	btst	7, %o3
19727c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_word	! word aligned
19737c478bdstevel@tonic-gate	  nop
	! .co_med_long: (label line missing from this listing)
19757c478bdstevel@tonic-gate	btst	3, %o0			! check for
19767c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_long1	! word alignment
19777c478bdstevel@tonic-gate	  nop
	! .co_med_long0: (label line missing from this listing)
19797c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
19807c478bdstevel@tonic-gate	inc	%o0
19817c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store byte
19827c478bdstevel@tonic-gate	inc	%o1
19837c478bdstevel@tonic-gate	btst	3, %o0
19847c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_long0
19857c478bdstevel@tonic-gate	  dec	%o2
19867c478bdstevel@tonic-gate.co_med_long1:			! word aligned
19877c478bdstevel@tonic-gate	btst	7, %o0			! check for long word
19887c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_long2
19897c478bdstevel@tonic-gate	  nop
19907c478bdstevel@tonic-gate	lduw	[%o0], %o3		! load word
19917c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
19927c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! store word
19937c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
19947c478bdstevel@tonic-gate	sub	%o2, 4, %o2		! reduce count by 4
	! .co_med_long2: (label line missing from this listing)
19967c478bdstevel@tonic-gate!  Now long word aligned and have at least 32 bytes to move
19997c478bdstevel@tonic-gate	sub	%o2, 31, %o2		! adjust count to allow cc zero test
20007c478bdstevel@tonic-gate	sub	%o1, 8, %o1		! adjust pointer to allow store in
20017c478bdstevel@tonic-gate					! branch delay slot instead of add
	! .co_med_lmove: (label line missing from this listing)
20037c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20047c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
20057c478bdstevel@tonic-gate	subcc	%o2, 32, %o2		! reduce count by 32
20067c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER	! write long word
20077c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20087c478bdstevel@tonic-gate	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
20097c478bdstevel@tonic-gate	add	%o0, 32, %o0		! advance SRC by 32
20107c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER
20117c478bdstevel@tonic-gate	ldx	[%o0 - 16], %o3
20127c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20137c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER
20147c478bdstevel@tonic-gate	ldx	[%o0 - 8], %o3
20157c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20167c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
20177c478bdstevel@tonic-gate	  stxa	%o3, [%o1]ASI_USER
20187c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20197c478bdstevel@tonic-gate	addcc	%o2, 24, %o2		! restore count to long word offset
20207c478bdstevel@tonic-gate	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
20217c478bdstevel@tonic-gate	  nop
	! .co_med_lword: (label line missing from this listing)
20237c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
20247c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
20257c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER	! write long word
20267c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
20277c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
20287c478bdstevel@tonic-gate	  add	%o1, 8, %o1		! advance DST by 8
	! .co_med_lextra: (label line missing from this listing)
20307c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore rest of count
20317c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit	! if zero, then done
20327c478bdstevel@tonic-gate	  deccc	%o2
20337c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
20347c478bdstevel@tonic-gate	  nop
20357c478bdstevel@tonic-gate	ba,pt	%ncc, .co_sm_half
20367c478bdstevel@tonic-gate	  nop
20387c478bdstevel@tonic-gate	.align 16
20397c478bdstevel@tonic-gate	nop				! instruction alignment
20407c478bdstevel@tonic-gate					! see discussion at start of file
	! .co_med_word: (label line missing from this listing)
20427c478bdstevel@tonic-gate	btst	3, %o0			! check for
20437c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_word1	! word alignment
20447c478bdstevel@tonic-gate	  nop
	! .co_med_word0: (label line missing from this listing)
20467c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
20477c478bdstevel@tonic-gate	inc	%o0
20487c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store byte
20497c478bdstevel@tonic-gate	inc	%o1
20507c478bdstevel@tonic-gate	btst	3, %o0
20517c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_word0
20527c478bdstevel@tonic-gate	  dec	%o2
	! .co_med_word1: (label line missing from this listing)
20547c478bdstevel@tonic-gate!  Now word aligned and have at least 36 bytes to move
20577c478bdstevel@tonic-gate	sub	%o2, 15, %o2		! adjust count to allow cc zero test
	! .co_med_wmove: (label line missing from this listing)
20597c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
20607c478bdstevel@tonic-gate	subcc	%o2, 16, %o2		! reduce count by 16
20617c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
20627c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
20637c478bdstevel@tonic-gate	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
20647c478bdstevel@tonic-gate	add	%o0, 16, %o0		! advance SRC by 16
20657c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER
20667c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
20677c478bdstevel@tonic-gate	lduw	[%o0 - 8], %o3
20687c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER
20697c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
20707c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3
20717c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER
20727c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
20737c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! advance DST by 4
20747c478bdstevel@tonic-gate	addcc	%o2, 12, %o2		! restore count to word offset
20757c478bdstevel@tonic-gate	ble,pt	%ncc, .co_med_wextra	! check for more words to move
20767c478bdstevel@tonic-gate	  nop
	! .co_med_word2: (label line missing from this listing)
20787c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
20797c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
20807c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
20817c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
20827c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
20837c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! advance DST by 4
	! .co_med_wextra: (label line missing from this listing)
20857c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore rest of count
20867c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit	! if zero, then done
20877c478bdstevel@tonic-gate	  deccc	%o2
20887c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
20897c478bdstevel@tonic-gate	  nop
20907c478bdstevel@tonic-gate	ba,pt	%ncc, .co_sm_half
20917c478bdstevel@tonic-gate	  nop
20937c478bdstevel@tonic-gate	.align 16
20947c478bdstevel@tonic-gate	nop				! instruction alignment
20957c478bdstevel@tonic-gate	nop				! see discussion at start of file
20967c478bdstevel@tonic-gate	nop
	! .co_med_half: (label line missing from this listing)
20987c478bdstevel@tonic-gate	btst	1, %o0			! check for
20997c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_half1	! half word alignment
21007c478bdstevel@tonic-gate	  nop
21017c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
21027c478bdstevel@tonic-gate	inc	%o0
21037c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store byte
21047c478bdstevel@tonic-gate	inc	%o1
21057c478bdstevel@tonic-gate	dec	%o2
	! .co_med_half1: (label line missing from this listing)
21077c478bdstevel@tonic-gate!  Now half word aligned and have at least 38 bytes to move
21107c478bdstevel@tonic-gate	sub	%o2, 7, %o2		! adjust count to allow cc zero test
	! .co_med_hmove: (label line missing from this listing)
21127c478bdstevel@tonic-gate	lduh	[%o0], %o3		! read half word
21137c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
21147c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER	! write half word
21157c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
21167c478bdstevel@tonic-gate	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
21177c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
21187c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER
21197c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
21207c478bdstevel@tonic-gate	lduh	[%o0 - 4], %o3
21217c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER
21227c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
21237c478bdstevel@tonic-gate	lduh	[%o0 - 2], %o3
21247c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER
21257c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
21267c478bdstevel@tonic-gate	  add	%o1, 2, %o1		! advance DST by 2
21277c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
21287c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
21297c478bdstevel@tonic-gate	  deccc	%o2
21307c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
21317c478bdstevel@tonic-gate	  nop
21327c478bdstevel@tonic-gate	ba,pt	%ncc, .co_sm_half
21337c478bdstevel@tonic-gate	  nop
/*
21367c478bdstevel@tonic-gate * We got here because of a fault during short copyout.
21377c478bdstevel@tonic-gate * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
21387c478bdstevel@tonic-gate */
	! .sm_copyout_err: (label line missing from this listing)
21407c478bdstevel@tonic-gate	membar	#Sync
21417c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
21427c478bdstevel@tonic-gate	mov	SM_SAVE_SRC, %o0
21437c478bdstevel@tonic-gate	mov	SM_SAVE_DST, %o1
21447c478bdstevel@tonic-gate	mov	SM_SAVE_COUNT, %o2
21457c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
21467c478bdstevel@tonic-gate	tst	%o3
21477c478bdstevel@tonic-gate	bz,pt	%ncc, 3f			! if not, return error
21487c478bdstevel@tonic-gate	  nop
21497c478bdstevel@tonic-gate	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
21507c478bdstevel@tonic-gate	jmp	%o5				! original arguments
21517c478bdstevel@tonic-gate	  nop
	! 3: (label line missing from this listing)
21537c478bdstevel@tonic-gate	retl
21547c478bdstevel@tonic-gate	  or	%g0, -1, %o0		! return error value
21567c478bdstevel@tonic-gate	SET_SIZE(copyout)
/*
21597c478bdstevel@tonic-gate * The _more entry points are not intended to be used directly by
21607c478bdstevel@tonic-gate * any caller from outside this file.  They are provided to allow
21617c478bdstevel@tonic-gate * profiling and dtrace of the portions of the copy code that uses
21627c478bdstevel@tonic-gate * the floating point registers.
21637c478bdstevel@tonic-gate * This entry is particularly important as DTRACE (at least as of
21647c478bdstevel@tonic-gate * 4/2004) does not support leaf functions.
21657c478bdstevel@tonic-gate */
21677c478bdstevel@tonic-gate	ENTRY(copyout_more)
21697c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
21707c478bdstevel@tonic-gate	set	.copyout_err, REAL_LOFAULT
/*
21737c478bdstevel@tonic-gate * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
21747c478bdstevel@tonic-gate */
21767c478bdstevel@tonic-gate        set     copyio_fault, %l7		! .copyio_fault is lofault val
21787c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
21797c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
21807c478bdstevel@tonic-gate	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
21827c478bdstevel@tonic-gate	mov	%i0, SAVE_SRC
21837c478bdstevel@tonic-gate	mov	%i1, SAVE_DST
21847c478bdstevel@tonic-gate	mov	%i2, SAVE_COUNT
21867c478bdstevel@tonic-gate	FP_NOMIGRATE(6, 7)
21887c478bdstevel@tonic-gate	rd	%fprs, %o2		! check for unused fp
21897c478bdstevel@tonic-gate	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
21907c478bdstevel@tonic-gate	btst	FPRS_FEF, %o2
21917c478bdstevel@tonic-gate	bz,a,pt	%icc, .do_blockcopyout
21927c478bdstevel@tonic-gate	  wr	%g0, FPRS_FEF, %fprs
21947c478bdstevel@tonic-gate	BST_FPQ2Q4_TOSTACK(%o2)
21977c478bdstevel@tonic-gate	rd	%gsr, %o2
21987c478bdstevel@tonic-gate	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
21997c478bdstevel@tonic-gate	or	%l6, FPUSED_FLAG, %l6
22017c478bdstevel@tonic-gate	andcc	DST, VIS_BLOCKSIZE - 1, TMP
22027c478bdstevel@tonic-gate	mov	ASI_USER, %asi
22037c478bdstevel@tonic-gate	bz,pt	%ncc, 2f
22047c478bdstevel@tonic-gate	  neg	TMP
22057c478bdstevel@tonic-gate	add	TMP, VIS_BLOCKSIZE, TMP
22077c478bdstevel@tonic-gate	! TMP = bytes required to align DST on FP_BLOCK boundary
22087c478bdstevel@tonic-gate	! Using SRC as a tmp here
22097c478bdstevel@tonic-gate	cmp	TMP, 3
22107c478bdstevel@tonic-gate	bleu,pt	%ncc, 1f
22117c478bdstevel@tonic-gate	  sub	CNT,TMP,CNT		! adjust main count
22127c478bdstevel@tonic-gate	sub	TMP, 3, TMP		! adjust for end of loop test
22147c478bdstevel@tonic-gate	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
22157c478bdstevel@tonic-gate	stba	SRC, [DST]%asi
22167c478bdstevel@tonic-gate	subcc	TMP, 4, TMP
22177c478bdstevel@tonic-gate	ldub	[REALSRC + 1], SRC
22187c478bdstevel@tonic-gate	add	REALSRC, 4, REALSRC
22197c478bdstevel@tonic-gate	stba	SRC, [DST + 1]%asi
22207c478bdstevel@tonic-gate	ldub	[REALSRC - 2], SRC
22217c478bdstevel@tonic-gate	add	DST, 4, DST
22227c478bdstevel@tonic-gate	stba	SRC, [DST - 2]%asi
22237c478bdstevel@tonic-gate	ldub	[REALSRC - 1], SRC
22247c478bdstevel@tonic-gate	bgu,pt	%ncc, .co_blkalign
22257c478bdstevel@tonic-gate	  stba	SRC, [DST - 1]%asi
22277c478bdstevel@tonic-gate	addcc	TMP, 3, TMP		! restore count adjustment
22287c478bdstevel@tonic-gate	bz,pt	%ncc, 2f		! no bytes left?
22297c478bdstevel@tonic-gate	  nop
22307c478bdstevel@tonic-gate1:	ldub	[REALSRC], SRC
22317c478bdstevel@tonic-gate	inc	REALSRC
22327c478bdstevel@tonic-gate	inc	DST
22337c478bdstevel@tonic-gate	deccc	TMP
22347c478bdstevel@tonic-gate	bgu	%ncc, 1b
22357c478bdstevel@tonic-gate	  stba	SRC, [DST - 1]%asi
22387c478bdstevel@tonic-gate	andn	REALSRC, 0x7, SRC
22397c478bdstevel@tonic-gate	alignaddr REALSRC, %g0, %g0
22417c478bdstevel@tonic-gate	! SRC - 8-byte aligned
22427c478bdstevel@tonic-gate	! DST - 64-byte aligned
22437c478bdstevel@tonic-gate	prefetch [SRC], #one_read
22447c478bdstevel@tonic-gate	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
22457c478bdstevel@tonic-gate	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
22467c478bdstevel@tonic-gate	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
22477c478bdstevel@tonic-gate	ldd	[SRC], %f16
22487c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 4
22497c478bdstevel@tonic-gate	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
22517c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f18
22527c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 5
22537c478bdstevel@tonic-gate	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
22557c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f20
22567c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 6
22577c478bdstevel@tonic-gate	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
22597c478bdstevel@tonic-gate	faligndata %f16, %f18, %f48
22607c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f22
22617c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 7
22627c478bdstevel@tonic-gate	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
22647c478bdstevel@tonic-gate	faligndata %f18, %f20, %f50
22657c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f24
22667c478bdstevel@tonic-gate	faligndata %f20, %f22, %f52
22677c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f26
22687c478bdstevel@tonic-gate	faligndata %f22, %f24, %f54
22697c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f28
22707c478bdstevel@tonic-gate	faligndata %f24, %f26, %f56
22717c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f30
22727c478bdstevel@tonic-gate	faligndata %f26, %f28, %f58
22737c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f16
22747c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
22757c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
22767c478bdstevel@tonic-gate	add	REALSRC, VIS_BLOCKSIZE, REALSRC
22777c478bdstevel@tonic-gate	ba,a,pt	%ncc, 1f
22787c478bdstevel@tonic-gate	  nop
22797c478bdstevel@tonic-gate	.align	16
22817c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f18
22827c478bdstevel@tonic-gate	faligndata %f28, %f30, %f60
22837c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f20
22847c478bdstevel@tonic-gate	faligndata %f30, %f16, %f62
22857c478bdstevel@tonic-gate	stda	%f48, [DST]ASI_BLK_AIUS
22867c478bdstevel@tonic-gate	ldd