27c478bdstevel@tonic-gate * CDDL HEADER START
37c478bdstevel@tonic-gate *
47c478bdstevel@tonic-gate * The contents of this file are subject to the terms of the
57c478bdstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only
67c478bdstevel@tonic-gate * (the "License").  You may not use this file except in compliance
77c478bdstevel@tonic-gate * with the License.
87c478bdstevel@tonic-gate *
97c478bdstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
107c478bdstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
117c478bdstevel@tonic-gate * See the License for the specific language governing permissions
127c478bdstevel@tonic-gate * and limitations under the License.
137c478bdstevel@tonic-gate *
147c478bdstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
157c478bdstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
167c478bdstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
177c478bdstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
187c478bdstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
197c478bdstevel@tonic-gate *
207c478bdstevel@tonic-gate * CDDL HEADER END
217c478bdstevel@tonic-gate */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
277c478bdstevel@tonic-gate#pragma	ident	"%Z%%M%	%I%	%E% SMI"
297c478bdstevel@tonic-gate#include <sys/param.h>
307c478bdstevel@tonic-gate#include <sys/errno.h>
317c478bdstevel@tonic-gate#include <sys/asm_linkage.h>
327c478bdstevel@tonic-gate#include <sys/vtrace.h>
337c478bdstevel@tonic-gate#include <sys/machthread.h>
347c478bdstevel@tonic-gate#include <sys/clock.h>
357c478bdstevel@tonic-gate#include <sys/asi.h>
367c478bdstevel@tonic-gate#include <sys/fsr.h>
377c478bdstevel@tonic-gate#include <sys/privregs.h>
387c478bdstevel@tonic-gate#include <sys/fpras_impl.h>
407c478bdstevel@tonic-gate#if !defined(lint)
417c478bdstevel@tonic-gate#include "assym.h"
427c478bdstevel@tonic-gate#endif	/* lint */
/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
477c478bdstevel@tonic-gate *
487c478bdstevel@tonic-gate * On entry:
497c478bdstevel@tonic-gate *
507c478bdstevel@tonic-gate * 	! Determine whether to use the FP register version
517c478bdstevel@tonic-gate * 	! or the leaf routine version depending on size
527c478bdstevel@tonic-gate * 	! of copy and flags.  Set up error handling accordingly.
537c478bdstevel@tonic-gate *	! The transition point depends on whether the src and
547c478bdstevel@tonic-gate * 	! dst addresses can be aligned to long word, word,
557c478bdstevel@tonic-gate * 	! half word, or byte boundaries.
567c478bdstevel@tonic-gate *	!
577c478bdstevel@tonic-gate *	! WARNING: <Register usage convention>
587c478bdstevel@tonic-gate *	! For FP version, %l6 holds previous error handling and
597c478bdstevel@tonic-gate *	! a flag: TRAMP_FLAG (low bits)
607c478bdstevel@tonic-gate *	! for leaf routine version, %o4 holds those values.
617c478bdstevel@tonic-gate *	! So either %l6 or %o4 is reserved and not available for
627c478bdstevel@tonic-gate *	! any other use.
637c478bdstevel@tonic-gate *
647c478bdstevel@tonic-gate * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
657c478bdstevel@tonic-gate * 		go to small_copy;		! to speed short copies
667c478bdstevel@tonic-gate *
677c478bdstevel@tonic-gate * 	! src, dst long word alignable
687c478bdstevel@tonic-gate * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
697c478bdstevel@tonic-gate * 			go to small_copy;
707c478bdstevel@tonic-gate *		if (length <= hw_copy_limit_8)
717c478bdstevel@tonic-gate * 			go to small_copy;
727c478bdstevel@tonic-gate * 		go to FPBLK_copy;
737c478bdstevel@tonic-gate * 	}
747c478bdstevel@tonic-gate * 	if (src,dst not alignable) {
757c478bdstevel@tonic-gate * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
767c478bdstevel@tonic-gate * 			go to small_copy;
777c478bdstevel@tonic-gate *		if (length <= hw_copy_limit_1)
787c478bdstevel@tonic-gate * 			go to small_copy;
797c478bdstevel@tonic-gate * 		go to FPBLK_copy;
807c478bdstevel@tonic-gate * 	}
817c478bdstevel@tonic-gate * 	if (src,dst halfword alignable) {
827c478bdstevel@tonic-gate * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
837c478bdstevel@tonic-gate * 			go to small_copy;
847c478bdstevel@tonic-gate *		if (length <= hw_copy_limit_2)
857c478bdstevel@tonic-gate * 			go to small_copy;
867c478bdstevel@tonic-gate * 		go to FPBLK_copy;
877c478bdstevel@tonic-gate * 	}
887c478bdstevel@tonic-gate * 	if (src,dst word alignable) {
897c478bdstevel@tonic-gate * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
907c478bdstevel@tonic-gate * 			go to small_copy;
917c478bdstevel@tonic-gate *		if (length <= hw_copy_limit_4)
927c478bdstevel@tonic-gate * 			go to small_copy;
937c478bdstevel@tonic-gate * 		go to FPBLK_copy;
947c478bdstevel@tonic-gate * 	}
957c478bdstevel@tonic-gate *
967c478bdstevel@tonic-gate * small_copy:
977c478bdstevel@tonic-gate *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
987c478bdstevel@tonic-gate *
997c478bdstevel@tonic-gate *	if (count <= 3)				! fast path for tiny copies
1007c478bdstevel@tonic-gate *		go to sm_left;			! special finish up code
1017c478bdstevel@tonic-gate *	else
1027c478bdstevel@tonic-gate *		if (count > CHKSIZE)		! medium sized copies
1037c478bdstevel@tonic-gate *			go to sm_med		! tuned by alignment
1047c478bdstevel@tonic-gate *		if(src&dst not both word aligned) {
1057c478bdstevel@tonic-gate *	sm_movebytes:
1067c478bdstevel@tonic-gate *			move byte by byte in 4-way unrolled loop
1077c478bdstevel@tonic-gate *			fall into sm_left;
1087c478bdstevel@tonic-gate *	sm_left:
1097c478bdstevel@tonic-gate *			move 0-3 bytes byte at a time as needed.
1107c478bdstevel@tonic-gate *			restore error handler and exit.
1117c478bdstevel@tonic-gate *
1127c478bdstevel@tonic-gate * 		} else {	! src&dst are word aligned
1137c478bdstevel@tonic-gate *			check for at least 8 bytes left,
1147c478bdstevel@tonic-gate *			move word at a time, unrolled by 2
1157c478bdstevel@tonic-gate *			when fewer than 8 bytes left,
1167c478bdstevel@tonic-gate *	sm_half:	move half word at a time while 2 or more bytes left
1177c478bdstevel@tonic-gate *	sm_byte:	move final byte if necessary
1187c478bdstevel@tonic-gate *	sm_exit:
1197c478bdstevel@tonic-gate *			restore error handler and exit.
1207c478bdstevel@tonic-gate *		}
1217c478bdstevel@tonic-gate *
1227c478bdstevel@tonic-gate * ! Medium length cases with at least CHKSIZE bytes available
1237c478bdstevel@tonic-gate * ! method: line up src and dst as best possible, then
1247c478bdstevel@tonic-gate * ! move data in 4-way unrolled loops.
1257c478bdstevel@tonic-gate *
1267c478bdstevel@tonic-gate * sm_med:
1277c478bdstevel@tonic-gate *	if(src&dst unalignable)
1287c478bdstevel@tonic-gate * 		go to sm_movebytes
1297c478bdstevel@tonic-gate *	if(src&dst halfword alignable)
1307c478bdstevel@tonic-gate *		go to sm_movehalf
1317c478bdstevel@tonic-gate *	if(src&dst word alignable)
1327c478bdstevel@tonic-gate *		go to sm_moveword
1337c478bdstevel@tonic-gate * ! fall into long word movement
1347c478bdstevel@tonic-gate *	move bytes until src is word aligned
1357c478bdstevel@tonic-gate *	if not long word aligned, move a word
1367c478bdstevel@tonic-gate *	move long words in 4-way unrolled loop until < 32 bytes left
1377c478bdstevel@tonic-gate *      move long words in 1-way unrolled loop until < 8 bytes left
1387c478bdstevel@tonic-gate *	if zero bytes left, goto sm_exit
1397c478bdstevel@tonic-gate *	if one byte left, go to sm_byte
1407c478bdstevel@tonic-gate *	else go to sm_half
1417c478bdstevel@tonic-gate *
1427c478bdstevel@tonic-gate * sm_moveword:
1437c478bdstevel@tonic-gate *	move bytes until src is word aligned
1447c478bdstevel@tonic-gate *	move words in 4-way unrolled loop until < 16 bytes left
1457c478bdstevel@tonic-gate *      move words in 1-way unrolled loop until < 4 bytes left
1467c478bdstevel@tonic-gate *	if zero bytes left, goto sm_exit
1477c478bdstevel@tonic-gate *	if one byte left, go to sm_byte
1487c478bdstevel@tonic-gate *	else go to sm_half
1497c478bdstevel@tonic-gate *
1507c478bdstevel@tonic-gate * sm_movehalf:
1517c478bdstevel@tonic-gate *	move a byte if needed to align src on halfword
1527c478bdstevel@tonic-gate *	move halfwords in 4-way unrolled loop until < 8 bytes left
1537c478bdstevel@tonic-gate *	if zero bytes left, goto sm_exit
1547c478bdstevel@tonic-gate *	if one byte left, go to sm_byte
1557c478bdstevel@tonic-gate *	else go to sm_half
1567c478bdstevel@tonic-gate *
1577c478bdstevel@tonic-gate *
1587c478bdstevel@tonic-gate * FPBLK_copy:
1597c478bdstevel@tonic-gate * 	%l6 = curthread->t_lofault;
1607c478bdstevel@tonic-gate * 	if (%l6 != NULL) {
1617c478bdstevel@tonic-gate * 		membar #Sync
1627c478bdstevel@tonic-gate * 		curthread->t_lofault = .copyerr;
1637c478bdstevel@tonic-gate * 		caller_error_handler = TRUE             ! %l6 |= 2
1647c478bdstevel@tonic-gate * 	}
1657c478bdstevel@tonic-gate *
1667c478bdstevel@tonic-gate *	! for FPU testing we must not migrate cpus
1677c478bdstevel@tonic-gate * 	if (curthread->t_lwp == NULL) {
1687c478bdstevel@tonic-gate *		! Kernel threads do not have pcb's in which to store
1697c478bdstevel@tonic-gate *		! the floating point state, so disallow preemption during
1707c478bdstevel@tonic-gate *		! the copy.  This also prevents cpu migration.
1717c478bdstevel@tonic-gate * 		kpreempt_disable(curthread);
1727c478bdstevel@tonic-gate *	} else {
1737c478bdstevel@tonic-gate *		thread_nomigrate();
1747c478bdstevel@tonic-gate *	}
1757c478bdstevel@tonic-gate *
1767c478bdstevel@tonic-gate * 	old_fprs = %fprs;
1777c478bdstevel@tonic-gate * 	old_gsr = %gsr;
1787c478bdstevel@tonic-gate * 	if (%fprs.fef) {
1797c478bdstevel@tonic-gate * 		%fprs.fef = 1;
1807c478bdstevel@tonic-gate * 		save current fpregs on stack using blockstore
1817c478bdstevel@tonic-gate * 	} else {
1827c478bdstevel@tonic-gate * 		%fprs.fef = 1;
1837c478bdstevel@tonic-gate * 	}
1847c478bdstevel@tonic-gate *
1857c478bdstevel@tonic-gate *
1867c478bdstevel@tonic-gate * 	do_blockcopy_here;
1877c478bdstevel@tonic-gate *
1887c478bdstevel@tonic-gate * In lofault handler:
1897c478bdstevel@tonic-gate *	curthread->t_lofault = .copyerr2;
1907c478bdstevel@tonic-gate *	Continue on with the normal exit handler
1917c478bdstevel@tonic-gate *
1927c478bdstevel@tonic-gate * On normal exit:
1937c478bdstevel@tonic-gate * 	%gsr = old_gsr;
1947c478bdstevel@tonic-gate * 	if (old_fprs & FPRS_FEF)
1957c478bdstevel@tonic-gate * 		restore fpregs from stack using blockload
1967c478bdstevel@tonic-gate *	else
1977c478bdstevel@tonic-gate *		zero fpregs
1987c478bdstevel@tonic-gate * 	%fprs = old_fprs;
1997c478bdstevel@tonic-gate * 	membar #Sync
2007c478bdstevel@tonic-gate * 	curthread->t_lofault = (%l6 & ~3);
2017c478bdstevel@tonic-gate *	! following test omitted from copyin/copyout as they
2027c478bdstevel@tonic-gate *	! will always have a current thread
2037c478bdstevel@tonic-gate * 	if (curthread->t_lwp == NULL)
2047c478bdstevel@tonic-gate *		kpreempt_enable(curthread);
2057c478bdstevel@tonic-gate *	else
2067c478bdstevel@tonic-gate *		thread_allowmigrate();
2077c478bdstevel@tonic-gate * 	return (0)
2087c478bdstevel@tonic-gate *
2097c478bdstevel@tonic-gate * In second lofault handler (.copyerr2):
2107c478bdstevel@tonic-gate *	We've tried to restore fp state from the stack and failed.  To
2117c478bdstevel@tonic-gate *	prevent from returning with a corrupted fp state, we will panic.
2127c478bdstevel@tonic-gate */
/*
 * Comments about optimization choices
 *
2177c478bdstevel@tonic-gate * The initial optimization decision in this code is to determine
2187c478bdstevel@tonic-gate * whether to use the FP registers for a copy or not.  If we don't
2197c478bdstevel@tonic-gate * use the FP registers, we can execute the copy as a leaf routine,
2207c478bdstevel@tonic-gate * saving a register save and restore.  Also, less elaborate setup
2217c478bdstevel@tonic-gate * is required, allowing short copies to be completed more quickly.
2227c478bdstevel@tonic-gate * For longer copies, especially unaligned ones (where the src and
2237c478bdstevel@tonic-gate * dst do not align to allow simple ldx,stx operation), the FP
2247c478bdstevel@tonic-gate * registers allow much faster copy operations.
2257c478bdstevel@tonic-gate *
2267c478bdstevel@tonic-gate * The estimated extra cost of the FP path will vary depending on
2277c478bdstevel@tonic-gate * src/dst alignment, dst offset from the next 64 byte FPblock store
2287c478bdstevel@tonic-gate * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
2307c478bdstevel@tonic-gate * minor issues.  The average additional overhead is estimated to be
2317c478bdstevel@tonic-gate * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
2337c478bdstevel@tonic-gate * longer copies and only benefit a small portion of medium sized
2347c478bdstevel@tonic-gate * copies.  Rather than incur such cost, we chose fixed transition
2357c478bdstevel@tonic-gate * points for each of the alignment choices.
2367c478bdstevel@tonic-gate *
2377c478bdstevel@tonic-gate * For the inner loop, here is a comparison of the per cache line
2387c478bdstevel@tonic-gate * costs for each alignment when src&dst are in cache:
2397c478bdstevel@tonic-gate *
2407c478bdstevel@tonic-gate * byte aligned:  108 clocks slower for non-FPBLK
2417c478bdstevel@tonic-gate * half aligned:   44 clocks slower for non-FPBLK
2427c478bdstevel@tonic-gate * word aligned:   12 clocks slower for non-FPBLK
2437c478bdstevel@tonic-gate * long aligned:    4 clocks >>faster<< for non-FPBLK
2447c478bdstevel@tonic-gate *
2457c478bdstevel@tonic-gate * The long aligned loop runs faster because it does no prefetching.
2467c478bdstevel@tonic-gate * That wins if the data is not in cache or there is too little
2477c478bdstevel@tonic-gate * data to gain much benefit from prefetching.  But when there
2487c478bdstevel@tonic-gate * is more data and that data is not in cache, failing to prefetch
2497c478bdstevel@tonic-gate * can run much slower.  In addition, there is a 2 Kbyte store queue
2507c478bdstevel@tonic-gate * which will cause the non-FPBLK inner loop to slow for larger copies.
2517c478bdstevel@tonic-gate * The exact tradeoff is strongly load and application dependent, with
2527c478bdstevel@tonic-gate * increasing risk of a customer visible performance regression if the
2537c478bdstevel@tonic-gate * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
2547c478bdstevel@tonic-gate * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
2557c478bdstevel@tonic-gate * upper limit for the non-FPBLK code.  To minimize performance regression
2567c478bdstevel@tonic-gate * risk while still gaining the primary benefits of the improvements to
2577c478bdstevel@tonic-gate * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
2587c478bdstevel@tonic-gate * hw_copy_limit_*.  Later experimental studies using different values
2597c478bdstevel@tonic-gate * of hw_copy_limit_* can be used to make further adjustments if
2607c478bdstevel@tonic-gate * appropriate.
2617c478bdstevel@tonic-gate *
2627c478bdstevel@tonic-gate * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
2637c478bdstevel@tonic-gate * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
2647c478bdstevel@tonic-gate * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
2657c478bdstevel@tonic-gate * hw_copy_limit_8 = src and dst are longword aligned
2667c478bdstevel@tonic-gate *
2677c478bdstevel@tonic-gate * To say that src and dst are word aligned means that after
2687c478bdstevel@tonic-gate * some initial alignment activity of moving 0 to 3 bytes,
2697c478bdstevel@tonic-gate * both the src and dst will be on word boundaries so that
2707c478bdstevel@tonic-gate * word loads and stores may be used.
2717c478bdstevel@tonic-gate *
2727c478bdstevel@tonic-gate * Recommended initial values as of Mar 2004, includes testing
2737c478bdstevel@tonic-gate * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar(1050MHz):
2747c478bdstevel@tonic-gate * hw_copy_limit_1 =  256
2757c478bdstevel@tonic-gate * hw_copy_limit_2 =  512
2767c478bdstevel@tonic-gate * hw_copy_limit_4 = 1024
2777c478bdstevel@tonic-gate * hw_copy_limit_8 = 1024 (or 1536 on some systems)
2787c478bdstevel@tonic-gate *
2797c478bdstevel@tonic-gate *
2807c478bdstevel@tonic-gate * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
2817c478bdstevel@tonic-gate * disabled for that alignment choice.
2827c478bdstevel@tonic-gate * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
2837c478bdstevel@tonic-gate * the value of VIS_COPY_THRESHOLD is used.
2847c478bdstevel@tonic-gate * It is not envisioned that hw_copy_limit_? will be changed in the field
2857c478bdstevel@tonic-gate * It is provided to allow for disabling FPBLK copies and to allow
2867c478bdstevel@tonic-gate * easy testing of alternate values on future HW implementations
2877c478bdstevel@tonic-gate * that might have different cache sizes, clock rates or instruction
2887c478bdstevel@tonic-gate * timing rules.
2897c478bdstevel@tonic-gate *
2907c478bdstevel@tonic-gate * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
2927c478bdstevel@tonic-gate * saves an alignment test, memory reference, and enabling test
2937c478bdstevel@tonic-gate * for all short copies, or an estimated 24 clocks.
2947c478bdstevel@tonic-gate *
2957c478bdstevel@tonic-gate * The order in which these limits are checked does matter since each
2967c478bdstevel@tonic-gate * non-predicted tst and branch costs around 10 clocks.
2977c478bdstevel@tonic-gate * If src and dst are randomly selected addresses,
2987c478bdstevel@tonic-gate * 4 of 8 will not be alignable.
2997c478bdstevel@tonic-gate * 2 of 8 will be half word alignable.
3007c478bdstevel@tonic-gate * 1 of 8 will be word alignable.
3017c478bdstevel@tonic-gate * 1 of 8 will be long word alignable.
3027c478bdstevel@tonic-gate * But, tests on running kernels show that src and dst to copy code
3037c478bdstevel@tonic-gate * are typically not on random alignments.  Structure copies and
3047c478bdstevel@tonic-gate * copies of larger data sizes are often on long word boundaries.
3057c478bdstevel@tonic-gate * So we test the long word alignment case first, then
3067c478bdstevel@tonic-gate * the byte alignment, then halfword, then word alignment.
3077c478bdstevel@tonic-gate *
3087c478bdstevel@tonic-gate * Several times, tests for length are made to split the code
3097c478bdstevel@tonic-gate * into subcases.  These tests often allow later tests to be
3107c478bdstevel@tonic-gate * avoided.  For example, within the non-FPBLK copy, we first
3117c478bdstevel@tonic-gate * check for tiny copies of 3 bytes or less.  That allows us
3127c478bdstevel@tonic-gate * to use a 4-way unrolled loop for the general byte copy case
3137c478bdstevel@tonic-gate * without a test on loop entry.
3147c478bdstevel@tonic-gate * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
3157c478bdstevel@tonic-gate * vs longer cases.  For the really short case, we don't attempt
3167c478bdstevel@tonic-gate * align src and dst.  We try to minimize special case tests in
3177c478bdstevel@tonic-gate * the shortest loops as each test adds a significant percentage
3187c478bdstevel@tonic-gate * to the total time.
3197c478bdstevel@tonic-gate *
3207c478bdstevel@tonic-gate * For the medium sized cases, we allow ourselves to adjust the
3217c478bdstevel@tonic-gate * src and dst alignment and provide special cases for each of
3227c478bdstevel@tonic-gate * the four adjusted alignment cases. The CHKSIZE that was used
3237c478bdstevel@tonic-gate * to decide between short and medium size was chosen to be 39
3247c478bdstevel@tonic-gate * as that allows for the worst case of 7 bytes of alignment
3257c478bdstevel@tonic-gate * shift and 4 times 8 bytes for the first long word unrolling.
3267c478bdstevel@tonic-gate * That knowledge saves an initial test for length on entry into
3277c478bdstevel@tonic-gate * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
3297c478bdstevel@tonic-gate *
3307c478bdstevel@tonic-gate * For all cases in the non-FPBLK code where it is known that at
3317c478bdstevel@tonic-gate * least 4 chunks of data are available for movement, the
3327c478bdstevel@tonic-gate * loop is unrolled by four.  This 4-way loop runs in 8 clocks
3337c478bdstevel@tonic-gate * or 2 clocks per data element.  Due to limitations of the
3347c478bdstevel@tonic-gate * branch instruction on Cheetah, Jaguar, and Panther, the
3357c478bdstevel@tonic-gate * minimum time for a small, tight loop is 3 clocks.  So
3367c478bdstevel@tonic-gate * the 4-way loop runs 50% faster than the fastest non-unrolled
3377c478bdstevel@tonic-gate * loop.
3387c478bdstevel@tonic-gate *
 * Instruction alignment is forced by use of .align 16 directives
3407c478bdstevel@tonic-gate * and nops which are not executed in the code.  This
3417c478bdstevel@tonic-gate * combination of operations shifts the alignment of following
3427c478bdstevel@tonic-gate * loops to insure that loops are aligned so that their instructions
3437c478bdstevel@tonic-gate * fall within the minimum number of 4 instruction fetch groups.
3447c478bdstevel@tonic-gate * If instructions are inserted or removed between the .align
3457c478bdstevel@tonic-gate * instruction and the unrolled loops, then the alignment needs
3467c478bdstevel@tonic-gate * to be readjusted.  Misaligned loops can add a clock per loop
3477c478bdstevel@tonic-gate * iteration to the loop timing.
3487c478bdstevel@tonic-gate *
3497c478bdstevel@tonic-gate * In a few cases, code is duplicated to avoid a branch.  Since
3507c478bdstevel@tonic-gate * a non-predicted tst and branch takes 10 clocks, this savings
3517c478bdstevel@tonic-gate * is judged an appropriate time-space tradeoff.
3527c478bdstevel@tonic-gate *
3537c478bdstevel@tonic-gate * Within the FPBLK-code, the prefetch method in the inner
3547c478bdstevel@tonic-gate * loop needs to be explained as it is not standard.  Two
3557c478bdstevel@tonic-gate * prefetches are issued for each cache line instead of one.
3567c478bdstevel@tonic-gate * The primary one is at the maximum reach of 8 cache lines.
3577c478bdstevel@tonic-gate * Most of the time, that maximum prefetch reach gives the
3587c478bdstevel@tonic-gate * cache line more time to reach the processor for systems with
3597c478bdstevel@tonic-gate * higher processor clocks.  But, sometimes memory interference
3607c478bdstevel@tonic-gate * can cause that prefetch to be dropped.  Putting a second
3617c478bdstevel@tonic-gate * prefetch at a reach of 5 cache lines catches the drops
3627c478bdstevel@tonic-gate * three iterations later and shows a measured improvement
3637c478bdstevel@tonic-gate * in performance over any similar loop with a single prefetch.
3647c478bdstevel@tonic-gate * The prefetches are placed in the loop so they overlap with
3657c478bdstevel@tonic-gate * non-memory instructions, so that there is no extra cost
3667c478bdstevel@tonic-gate * when the data is already in-cache.
3677c478bdstevel@tonic-gate *
3687c478bdstevel@tonic-gate */
/*
 * Notes on preserving existing fp state and on membars.
 *
3737c478bdstevel@tonic-gate * When a copyOP decides to use fp we may have to preserve existing
3747c478bdstevel@tonic-gate * floating point state.  It is not the caller's state that we need to
3757c478bdstevel@tonic-gate * preserve - the rest of the kernel does not use fp and, anyway, fp
3767c478bdstevel@tonic-gate * registers are volatile across a call.  Some examples:
3777c478bdstevel@tonic-gate *
3787c478bdstevel@tonic-gate *	- userland has fp state and is interrupted (device interrupt
3797c478bdstevel@tonic-gate *	  or trap) and within the interrupt/trap handling we use
3807c478bdstevel@tonic-gate *	  bcopy()
3817c478bdstevel@tonic-gate *	- another (higher level) interrupt or trap handler uses bcopy
3827c478bdstevel@tonic-gate *	  while a bcopy from an earlier interrupt is still active
3837c478bdstevel@tonic-gate *	- an asynchronous error trap occurs while fp state exists (in
3847c478bdstevel@tonic-gate *	  userland or in kernel copy) and the tl0 component of the handling
3857c478bdstevel@tonic-gate *	  uses bcopy
3867c478bdstevel@tonic-gate *	- a user process with fp state incurs a copy-on-write fault and
3877c478bdstevel@tonic-gate *	  hwblkpagecopy always uses fp
3887c478bdstevel@tonic-gate *
3897c478bdstevel@tonic-gate * We therefore need a per-call place in which to preserve fp state -
3907c478bdstevel@tonic-gate * using our stack is ideal (and since fp copy cannot be leaf optimized
3917c478bdstevel@tonic-gate * because of calls it makes, this is no hardship).
3927c478bdstevel@tonic-gate *
3937c478bdstevel@tonic-gate * The following membar BLD/BST discussion is Cheetah pipeline specific.
3947c478bdstevel@tonic-gate * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
3957c478bdstevel@tonic-gate * nops (those semantics always apply) and #StoreLoad is implemented
3967c478bdstevel@tonic-gate * as a membar #Sync.
3977c478bdstevel@tonic-gate *
3987c478bdstevel@tonic-gate * It is possible that the owner of the fp state has a block load or
3997c478bdstevel@tonic-gate * block store still "in flight" at the time we come to preserve that
4007c478bdstevel@tonic-gate * state.  Block loads are blocking in Cheetah pipelines so we do not
4017c478bdstevel@tonic-gate * need to sync with them.  In preserving fp regs we will use block stores
4027c478bdstevel@tonic-gate * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
4037c478bdstevel@tonic-gate * after storing state (so that our subsequent use of those registers
4047c478bdstevel@tonic-gate * does not modify them before the block stores complete);  this membar
4057c478bdstevel@tonic-gate * also serves to sync with block stores the owner of the fp state has
4067c478bdstevel@tonic-gate * initiated.
4077c478bdstevel@tonic-gate *
 * When we have finished fp copy (with its repeated block stores)
4097c478bdstevel@tonic-gate * we must membar #Sync so that our block stores may complete before
4107c478bdstevel@tonic-gate * we either restore the original fp state into the fp registers or
4117c478bdstevel@tonic-gate * return to a caller which may initiate other fp operations that could
4127c478bdstevel@tonic-gate * modify the fp regs we used before the block stores complete.
4137c478bdstevel@tonic-gate *
4147c478bdstevel@tonic-gate * Synchronous faults (eg, unresolvable DMMU miss) that occur while
4157c478bdstevel@tonic-gate * t_lofault is not NULL will not panic but will instead trampoline
4167c478bdstevel@tonic-gate * to the registered lofault handler.  There is no need for any
4177c478bdstevel@tonic-gate * membars for these - eg, our store to t_lofault will always be visible to
4187c478bdstevel@tonic-gate * ourselves and it is our cpu which will take any trap.
4197c478bdstevel@tonic-gate *
4207c478bdstevel@tonic-gate * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
4217c478bdstevel@tonic-gate * while t_lofault is not NULL will also not panic.  Since we're copying
4227c478bdstevel@tonic-gate * to or from userland the extent of the damage is known - the destination
4237c478bdstevel@tonic-gate * buffer is incomplete.  So trap handlers will trampoline to the lofault
4247c478bdstevel@tonic-gate * handler in this case which should take some form of error action to
4257c478bdstevel@tonic-gate * avoid using the incomplete buffer.  The trap handler also flags the
4267c478bdstevel@tonic-gate * fault so that later return-from-trap handling (for the trap that brought
4277c478bdstevel@tonic-gate * this thread into the kernel in the first place) can notify the process
4287c478bdstevel@tonic-gate * and reboot the system (or restart the service with Greenline/Contracts).
4297c478bdstevel@tonic-gate *
4307c478bdstevel@tonic-gate * Asynchronous faults (eg, uncorrectable ECC error from memory) can
4317c478bdstevel@tonic-gate * result in deferred error traps - the trap is taken sometime after
4327c478bdstevel@tonic-gate * the event and the trap PC may not be the PC of the faulting access.
4337c478bdstevel@tonic-gate * Delivery of such pending traps can be forced by a membar #Sync, acting
4347c478bdstevel@tonic-gate * as an "error barrier" in this role.  To accurately apply the user/kernel
4357c478bdstevel@tonic-gate * separation described in the preceding paragraph we must force delivery
4367c478bdstevel@tonic-gate * of deferred traps affecting kernel state before we install a lofault
4377c478bdstevel@tonic-gate * handler (if we interpose a new lofault handler on an existing one there
4387c478bdstevel@tonic-gate * is no need to repeat this), and we must force delivery of deferred
4397c478bdstevel@tonic-gate * errors affecting the lofault-protected region before we clear t_lofault.
4407c478bdstevel@tonic-gate * Failure to do so results in lost kernel state being interpreted as
4417c478bdstevel@tonic-gate * affecting a copyin/copyout only, or of an error that really only
4427c478bdstevel@tonic-gate * affects copy data being interpreted as losing kernel state.
4437c478bdstevel@tonic-gate *
4447c478bdstevel@tonic-gate * Since the copy operations may preserve and later restore floating
4457c478bdstevel@tonic-gate * point state that does not belong to the caller (see examples above),
4467c478bdstevel@tonic-gate * we must be careful in how we do this in order to prevent corruption
4477c478bdstevel@tonic-gate * of another program.
4487c478bdstevel@tonic-gate *
4497c478bdstevel@tonic-gate * To make sure that floating point state is always saved and restored
4507c478bdstevel@tonic-gate * correctly, the following "big rules" must be followed when the floating
4517c478bdstevel@tonic-gate * point registers will be used:
4527c478bdstevel@tonic-gate *
4537c478bdstevel@tonic-gate * 1. %l6 always holds the caller's lofault handler.  Also in this register,
4547c478bdstevel@tonic-gate *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
4557c478bdstevel@tonic-gate *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
4567c478bdstevel@tonic-gate *    lofault handler was set coming in.
4577c478bdstevel@tonic-gate *
4587c478bdstevel@tonic-gate * 2. The FPUSED flag indicates that all FP state has been successfully stored
4597c478bdstevel@tonic-gate *    on the stack.  It should not be set until this save has been completed.
4607c478bdstevel@tonic-gate *
4617c478bdstevel@tonic-gate * 3. The FPUSED flag should not be cleared on exit until all FP state has
4627c478bdstevel@tonic-gate *    been restored from the stack.  If an error occurs while restoring
4637c478bdstevel@tonic-gate *    data from the stack, the error handler can check this flag to see if
4647c478bdstevel@tonic-gate *    a restore is necessary.
4657c478bdstevel@tonic-gate *
4667c478bdstevel@tonic-gate * 4. Code run under the new lofault handler must be kept to a minimum.  In
4677c478bdstevel@tonic-gate *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
4687c478bdstevel@tonic-gate *    to kpreempt(), should not be made until after the lofault handler has
4697c478bdstevel@tonic-gate *    been restored.
4707c478bdstevel@tonic-gate */
4737c478bdstevel@tonic-gate * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
4747c478bdstevel@tonic-gate * to "break even" using FP/VIS-accelerated memory operations.
4757c478bdstevel@tonic-gate * The FPBLK code assumes a minimum number of bytes are available
4767c478bdstevel@tonic-gate * to be moved on entry.  Check that code carefully before
4777c478bdstevel@tonic-gate * reducing VIS_COPY_THRESHOLD below 256.
4787c478bdstevel@tonic-gate */
4807c478bdstevel@tonic-gate * This shadows sys/machsystm.h which can't be included due to the lack of
4817c478bdstevel@tonic-gate * _ASM guards in include files it references. Change it here, change it there.
4827c478bdstevel@tonic-gate */
4837c478bdstevel@tonic-gate#define VIS_COPY_THRESHOLD 256
4867c478bdstevel@tonic-gate * TEST for very short copies
4877c478bdstevel@tonic-gate * Be aware that the maximum unroll for the short unaligned case
4887c478bdstevel@tonic-gate * is SHORTCOPY+1
4897c478bdstevel@tonic-gate */
4907c478bdstevel@tonic-gate#define SHORTCOPY 3
4917c478bdstevel@tonic-gate#define CHKSIZE  39
4947c478bdstevel@tonic-gate * Indicates that we're to trampoline to the error handler.
4957c478bdstevel@tonic-gate * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
4967c478bdstevel@tonic-gate * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
4977c478bdstevel@tonic-gate */
4987c478bdstevel@tonic-gate#define	FPUSED_FLAG	1
4997c478bdstevel@tonic-gate#define	TRAMP_FLAG	2
5007c478bdstevel@tonic-gate#define	MASK_FLAGS	3
5037c478bdstevel@tonic-gate * Number of outstanding prefetches.
5047c478bdstevel@tonic-gate * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
5057c478bdstevel@tonic-gate * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
5067c478bdstevel@tonic-gate * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
5077c478bdstevel@tonic-gate * of 5% for large copies as compared to a single prefetch.  The reason
5087c478bdstevel@tonic-gate * for the improvement is that with Cheetah and Jaguar, some prefetches
5097c478bdstevel@tonic-gate * are dropped due to the prefetch queue being full.  The second prefetch
5107c478bdstevel@tonic-gate * reduces the number of cache lines that are dropped.
5117c478bdstevel@tonic-gate * Do not remove the double prefetch or change either CHEETAH_PREFETCH
5127c478bdstevel@tonic-gate * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
5137c478bdstevel@tonic-gate * there is no loss of performance.
5147c478bdstevel@tonic-gate */
5157c478bdstevel@tonic-gate#define	CHEETAH_PREFETCH	8
5167c478bdstevel@tonic-gate#define	CHEETAH_2ND_PREFETCH	5
5187c478bdstevel@tonic-gate#define	VIS_BLOCKSIZE		64
5217c478bdstevel@tonic-gate * Size of stack frame in order to accommodate a 64-byte aligned
5227c478bdstevel@tonic-gate * floating-point register save area and 2 64-bit temp locations.
5237c478bdstevel@tonic-gate * All copy functions use two quadrants of fp registers; to assure a
5247c478bdstevel@tonic-gate * block-aligned two block buffer in which to save we must reserve
5257c478bdstevel@tonic-gate * three blocks on stack.  Not all functions preserve %fprs on stack
5267c478bdstevel@tonic-gate * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
5277c478bdstevel@tonic-gate *
5287c478bdstevel@tonic-gate *    _______________________________________ <-- %fp + STACK_BIAS
5297c478bdstevel@tonic-gate *    | We may need to preserve 2 quadrants |
5307c478bdstevel@tonic-gate *    | of fp regs, but since we do so with |
5317c478bdstevel@tonic-gate *    | BST/BLD we need room in which to    |
5327c478bdstevel@tonic-gate *    | align to VIS_BLOCKSIZE bytes.  So   |
5337c478bdstevel@tonic-gate *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
5347c478bdstevel@tonic-gate *    |-------------------------------------|
5357c478bdstevel@tonic-gate *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
5367c478bdstevel@tonic-gate *    |-------------------------------------|
5377c478bdstevel@tonic-gate *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
5387c478bdstevel@tonic-gate *    ---------------------------------------
5397c478bdstevel@tonic-gate */
5407c478bdstevel@tonic-gate#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
5417c478bdstevel@tonic-gate#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
5427c478bdstevel@tonic-gate#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
5437c478bdstevel@tonic-gate#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
5447c478bdstevel@tonic-gate#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
5477c478bdstevel@tonic-gate * Common macros used by the various versions of the block copy
5487c478bdstevel@tonic-gate * routines in this file.
5497c478bdstevel@tonic-gate */
5527c478bdstevel@tonic-gate * In FP copies if we do not have preserved data to restore over
5537c478bdstevel@tonic-gate * the fp regs we used then we must zero those regs to avoid
5547c478bdstevel@tonic-gate * exposing portions of the data to later threads (data security).
5557c478bdstevel@tonic-gate *
5567c478bdstevel@tonic-gate * Copy functions use either quadrants 1 and 3 or 2 and 4.
5577c478bdstevel@tonic-gate *
5587c478bdstevel@tonic-gate * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
5597c478bdstevel@tonic-gate * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
5607c478bdstevel@tonic-gate *
5617c478bdstevel@tonic-gate * The instructions below are quicker than repeated fzero instructions
5627c478bdstevel@tonic-gate * since they can dispatch down two fp pipelines.
5637c478bdstevel@tonic-gate */
5647c478bdstevel@tonic-gate#define	FZEROQ1Q3			\
5657c478bdstevel@tonic-gate	fzero	%f0			;\
5667c478bdstevel@tonic-gate	fzero	%f2			;\
5677c478bdstevel@tonic-gate	faddd	%f0, %f2, %f4		;\
5687c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f6		;\
5697c478bdstevel@tonic-gate	faddd	%f0, %f2, %f8		;\
5707c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f10		;\
5717c478bdstevel@tonic-gate	faddd	%f0, %f2, %f12		;\
5727c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f14		;\
5737c478bdstevel@tonic-gate	faddd	%f0, %f2, %f32		;\
5747c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f34		;\
5757c478bdstevel@tonic-gate	faddd	%f0, %f2, %f36		;\
5767c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f38		;\
5777c478bdstevel@tonic-gate	faddd	%f0, %f2, %f40		;\
5787c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f42		;\
5797c478bdstevel@tonic-gate	faddd	%f0, %f2, %f44		;\
5807c478bdstevel@tonic-gate	fmuld	%f0, %f2, %f46
5827c478bdstevel@tonic-gate#define	FZEROQ2Q4			\
5837c478bdstevel@tonic-gate	fzero	%f16			;\
5847c478bdstevel@tonic-gate	fzero	%f18			;\
5857c478bdstevel@tonic-gate	faddd	%f16, %f18, %f20	;\
5867c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f22	;\
5877c478bdstevel@tonic-gate	faddd	%f16, %f18, %f24	;\
5887c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f26	;\
5897c478bdstevel@tonic-gate	faddd	%f16, %f18, %f28	;\
5907c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f30	;\
5917c478bdstevel@tonic-gate	faddd	%f16, %f18, %f48	;\
5927c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f50	;\
5937c478bdstevel@tonic-gate	faddd	%f16, %f18, %f52	;\
5947c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f54	;\
5957c478bdstevel@tonic-gate	faddd	%f16, %f18, %f56	;\
5967c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f58	;\
5977c478bdstevel@tonic-gate	faddd	%f16, %f18, %f60	;\
5987c478bdstevel@tonic-gate	fmuld	%f16, %f18, %f62
6017c478bdstevel@tonic-gate * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
6027c478bdstevel@tonic-gate * Used to save and restore in-use fp registers when we want to use FP
6037c478bdstevel@tonic-gate * and find fp already in use and copy size still large enough to justify
6047c478bdstevel@tonic-gate * the additional overhead of this save and restore.
6057c478bdstevel@tonic-gate *
6067c478bdstevel@tonic-gate * A membar #Sync is needed before save to sync fp ops initiated before
6077c478bdstevel@tonic-gate * the call to the copy function (by whoever has fp in use); for example
6087c478bdstevel@tonic-gate * an earlier block load to the quadrant we are about to save may still be
6097c478bdstevel@tonic-gate * "in flight".  A membar #Sync is required at the end of the save to
6107c478bdstevel@tonic-gate * sync our block store (the copy code is about to begin ldd's to the
6117c478bdstevel@tonic-gate * first quadrant).  Note, however, that since Cheetah pipeline block load
6127c478bdstevel@tonic-gate * is blocking we can omit the initial membar before saving fp state (they're
6137c478bdstevel@tonic-gate * commented below in case of future porting to a chip that does not block
6147c478bdstevel@tonic-gate * on block load).
6157c478bdstevel@tonic-gate *
6167c478bdstevel@tonic-gate * Similarly: a membar #Sync before restore allows the block stores of
6177c478bdstevel@tonic-gate * the copy operation to complete before we fill the quadrants with their
6187c478bdstevel@tonic-gate * original data, and a membar #Sync after restore lets the block loads
6197c478bdstevel@tonic-gate * of the restore complete before we return to whoever has the fp regs
6207c478bdstevel@tonic-gate * in use.  To avoid repeated membar #Sync we make it the responsibility
6217c478bdstevel@tonic-gate * of the copy code to membar #Sync immediately after copy is complete
6227c478bdstevel@tonic-gate * and before using the BLD_*_FROMSTACK macro.
6237c478bdstevel@tonic-gate */
6247c478bdstevel@tonic-gate#if !defined(lint)
6257c478bdstevel@tonic-gate#define BST_FPQ1Q3_TOSTACK(tmp1)				\
6267c478bdstevel@tonic-gate	/* membar #Sync	*/					;\
6277c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6287c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6297c478bdstevel@tonic-gate	stda	%f0, [tmp1]ASI_BLK_P				;\
6307c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6317c478bdstevel@tonic-gate	stda	%f32, [tmp1]ASI_BLK_P				;\
6327c478bdstevel@tonic-gate	membar	#Sync
6347c478bdstevel@tonic-gate#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
6357c478bdstevel@tonic-gate	/* membar #Sync - provided at copy completion */	;\
6367c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6377c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6387c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f0				;\
6397c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6407c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f32				;\
6417c478bdstevel@tonic-gate	membar	#Sync
6437c478bdstevel@tonic-gate#define BST_FPQ2Q4_TOSTACK(tmp1)				\
6447c478bdstevel@tonic-gate	/* membar #Sync */					;\
6457c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6467c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6477c478bdstevel@tonic-gate	stda	%f16, [tmp1]ASI_BLK_P				;\
6487c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6497c478bdstevel@tonic-gate	stda	%f48, [tmp1]ASI_BLK_P				;\
6507c478bdstevel@tonic-gate	membar	#Sync
6527c478bdstevel@tonic-gate#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
6537c478bdstevel@tonic-gate	/* membar #Sync - provided at copy completion */	;\
6547c478bdstevel@tonic-gate	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
6557c478bdstevel@tonic-gate	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
6567c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f16				;\
6577c478bdstevel@tonic-gate	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
6587c478bdstevel@tonic-gate	ldda	[tmp1]ASI_BLK_P, %f48				;\
6597c478bdstevel@tonic-gate	membar	#Sync
6637c478bdstevel@tonic-gate * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
6647c478bdstevel@tonic-gate * prevent preemption if there is no t_lwp to save FP state to on context
6657c478bdstevel@tonic-gate * switch) before commencing a FP copy, and reallow it on completion or
6667c478bdstevel@tonic-gate * in error trampoline paths when we were using FP copy.
6677c478bdstevel@tonic-gate *
6687c478bdstevel@tonic-gate * Both macros may call other functions, so be aware that all outputs are
6697c478bdstevel@tonic-gate * forfeit after using these macros.  For this reason we do not pass registers
6707c478bdstevel@tonic-gate * to use - we just use any outputs we want.
6717c478bdstevel@tonic-gate *
6727c478bdstevel@tonic-gate * For fpRAS we need to perform the fpRAS mechanism test on the same
6737c478bdstevel@tonic-gate * CPU as we use for the copy operation, both so that we validate the
6747c478bdstevel@tonic-gate * CPU we perform the copy on and so that we know which CPU failed
6757c478bdstevel@tonic-gate * if a failure is detected.  Hence we need to be bound to "our" CPU.
6767c478bdstevel@tonic-gate * This could be achieved through disabling preemption (and we have to do it that
6777c478bdstevel@tonic-gate * way for threads with no t_lwp) but for larger copies this may hold
6787c478bdstevel@tonic-gate * higher priority threads off of cpu for too long (eg, realtime).  So we
6797c478bdstevel@tonic-gate * make use of the lightweight t_nomigrate mechanism where we can (ie, when
6807c478bdstevel@tonic-gate * we have a t_lwp).
6817c478bdstevel@tonic-gate *
6827c478bdstevel@tonic-gate * Pseudo code:
6837c478bdstevel@tonic-gate *
6847c478bdstevel@tonic-gate * FP_NOMIGRATE:
6857c478bdstevel@tonic-gate *
6867c478bdstevel@tonic-gate * if (curthread->t_lwp) {
6877c478bdstevel@tonic-gate *	thread_nomigrate();
6887c478bdstevel@tonic-gate * } else {
6897c478bdstevel@tonic-gate *	kpreempt_disable();
6907c478bdstevel@tonic-gate * }
6917c478bdstevel@tonic-gate *
6927c478bdstevel@tonic-gate * FP_ALLOWMIGRATE:
6937c478bdstevel@tonic-gate *
6947c478bdstevel@tonic-gate * if (curthread->t_lwp) {
6957c478bdstevel@tonic-gate *	thread_allowmigrate();
6967c478bdstevel@tonic-gate * } else {
6977c478bdstevel@tonic-gate *	kpreempt_enable();
6987c478bdstevel@tonic-gate * }
6997c478bdstevel@tonic-gate */
7017c478bdstevel@tonic-gate#define	FP_NOMIGRATE(label1, label2)				\
7027c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LWP], %o0			;\
7037c478bdstevel@tonic-gate	brz,a,pn %o0, label1/**/f				;\
7047c478bdstevel@tonic-gate	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
7057c478bdstevel@tonic-gate	call	thread_nomigrate				;\
7067c478bdstevel@tonic-gate	  nop							;\
7077c478bdstevel@tonic-gate	ba	label2/**/f					;\
7087c478bdstevel@tonic-gate	  nop							;\
7097c478bdstevel@tonic-gatelabel1:								;\
7107c478bdstevel@tonic-gate	inc	%o1						;\
7117c478bdstevel@tonic-gate	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
7147c478bdstevel@tonic-gate#define	FP_ALLOWMIGRATE(label1, label2)			\
7157c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LWP], %o0			;\
7167c478bdstevel@tonic-gate	brz,a,pn %o0, label1/**/f				;\
7177c478bdstevel@tonic-gate	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
7187c478bdstevel@tonic-gate	call thread_allowmigrate				;\
7197c478bdstevel@tonic-gate	  nop							;\
7207c478bdstevel@tonic-gate	ba	label2/**/f					;\
7217c478bdstevel@tonic-gate	  nop							;\
7227c478bdstevel@tonic-gatelabel1:								;\
7237c478bdstevel@tonic-gate	dec	%o1						;\
7247c478bdstevel@tonic-gate	brnz,pn	%o1, label2/**/f				;\
7257c478bdstevel@tonic-gate	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
7267c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_CPU], %o0			;\
7277c478bdstevel@tonic-gate	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
7287c478bdstevel@tonic-gate	brz,pt	%o0, label2/**/f				;\
7297c478bdstevel@tonic-gate	  nop							;\
7307c478bdstevel@tonic-gate	call	kpreempt					;\
7317c478bdstevel@tonic-gate	  rdpr	%pil, %o0					;\
7357c478bdstevel@tonic-gate * Copy a block of storage, returning an error code if `from' or
7367c478bdstevel@tonic-gate * `to' takes a kernel pagefault which cannot be resolved.
7377c478bdstevel@tonic-gate * Returns errno value on pagefault error, 0 if all ok
7387c478bdstevel@tonic-gate */
7407c478bdstevel@tonic-gate#if defined(lint)
7427c478bdstevel@tonic-gate/* ARGSUSED */
7447c478bdstevel@tonic-gatekcopy(const void *from, void *to, size_t count)
7457c478bdstevel@tonic-gate{ return(0); }
7477c478bdstevel@tonic-gate#else	/* lint */
7497c478bdstevel@tonic-gate	.seg	".text"
7507c478bdstevel@tonic-gate	.align	4
7527c478bdstevel@tonic-gate	ENTRY(kcopy)
7547c478bdstevel@tonic-gate	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
7557c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to larger cases
7567c478bdstevel@tonic-gate	  xor	%o0, %o1, %o3			! are src, dst alignable?
7577c478bdstevel@tonic-gate	btst	7, %o3				!
7587c478bdstevel@tonic-gate	bz,pt	%ncc, .kcopy_8			! check for longword alignment
7597c478bdstevel@tonic-gate	  nop
7607c478bdstevel@tonic-gate	btst	1, %o3				!
7617c478bdstevel@tonic-gate	bz,pt	%ncc, .kcopy_2			! check for half-word
7627c478bdstevel@tonic-gate	  nop
7637c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
7647c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
7657c478bdstevel@tonic-gate	tst	%o3
7667c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
7677c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
7687c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
7697c478bdstevel@tonic-gate	  nop
7707c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
7717c478bdstevel@tonic-gate	  nop
7737c478bdstevel@tonic-gate	btst	3, %o3				!
7747c478bdstevel@tonic-gate	bz,pt	%ncc, .kcopy_4			! check for word alignment
7757c478bdstevel@tonic-gate	  nop
7767c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
7777c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
7787c478bdstevel@tonic-gate	tst	%o3
7797c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
7807c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
7817c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
7827c478bdstevel@tonic-gate	  nop
7837c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
7847c478bdstevel@tonic-gate	  nop
7867c478bdstevel@tonic-gate	! already checked longword, must be word aligned
7877c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
7887c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
7897c478bdstevel@tonic-gate	tst	%o3
7907c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
7917c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
7927c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
7937c478bdstevel@tonic-gate	  nop
7947c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
7957c478bdstevel@tonic-gate	  nop
7977c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
7987c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
7997c478bdstevel@tonic-gate	tst	%o3
8007c478bdstevel@tonic-gate	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
8017c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
8027c478bdstevel@tonic-gate	bleu,pt	%ncc, .kcopy_small		! go to small copy
8037c478bdstevel@tonic-gate	  nop
8047c478bdstevel@tonic-gate	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
8057c478bdstevel@tonic-gate	  nop
8087c478bdstevel@tonic-gate	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
8097c478bdstevel@tonic-gate	or	%o5, %lo(.sm_copyerr), %o5
8107c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
8117c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
8127c478bdstevel@tonic-gate	ba,pt	%ncc, .sm_do_copy		! common code
8137c478bdstevel@tonic-gate	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
8167c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
8177c478bdstevel@tonic-gate	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
8187c478bdstevel@tonic-gate	or	%l7, %lo(.copyerr), %l7
8197c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
8207c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
8217c478bdstevel@tonic-gate	ba,pt	%ncc, .do_copy			! common code
8227c478bdstevel@tonic-gate	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
8267c478bdstevel@tonic-gate * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
8277c478bdstevel@tonic-gate * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
8287c478bdstevel@tonic-gate */
8307c478bdstevel@tonic-gate	set	.copyerr2, %l0
8317c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
8327c478bdstevel@tonic-gate	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
8337c478bdstevel@tonic-gate	btst	FPUSED_FLAG, %l6
8347c478bdstevel@tonic-gate	bz	%ncc, 1f
8357c478bdstevel@tonic-gate	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
8377c478bdstevel@tonic-gate	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
8387c478bdstevel@tonic-gate	wr	%o2, 0, %gsr
8407c478bdstevel@tonic-gate	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
8417c478bdstevel@tonic-gate	btst	FPRS_FEF, %o3
8427c478bdstevel@tonic-gate	bz,pt	%icc, 4f
8437c478bdstevel@tonic-gate	  nop
8457c478bdstevel@tonic-gate	BLD_FPQ1Q3_FROMSTACK(%o2)
8477c478bdstevel@tonic-gate	ba,pt	%ncc, 1f
8487c478bdstevel@tonic-gate	  wr	%o3, 0, %fprs		! restore fprs
8517c478bdstevel@tonic-gate	FZEROQ1Q3
8527c478bdstevel@tonic-gate	wr	%o3, 0, %fprs		! restore fprs
8547c478bdstevel@tonic-gate	!
8557c478bdstevel@tonic-gate	! Need to cater for the different expectations of kcopy
8567c478bdstevel@tonic-gate	! and bcopy. kcopy will *always* set a t_lofault handler
8577c478bdstevel@tonic-gate	! If it fires, we're expected to just return the error code
8587c478bdstevel@tonic-gate	! and *not* to invoke any existing error handler. As far as
8597c478bdstevel@tonic-gate	! bcopy is concerned, we only set t_lofault if there was an
8607c478bdstevel@tonic-gate	! existing lofault handler. In that case we're expected to
8617c478bdstevel@tonic-gate	! invoke the previously existing handler after resetting the
8627c478bdstevel@tonic-gate	! t_lofault value.
8637c478bdstevel@tonic-gate	!
8657c478bdstevel@tonic-gate	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
8667c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
8677c478bdstevel@tonic-gate	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
8687c478bdstevel@tonic-gate	FP_ALLOWMIGRATE(5, 6)
8707c478bdstevel@tonic-gate	btst	TRAMP_FLAG, %l0
8717c478bdstevel@tonic-gate	bnz,pn	%ncc, 3f
8727c478bdstevel@tonic-gate	  nop
8737c478bdstevel@tonic-gate	ret
8747c478bdstevel@tonic-gate	  restore	%g1, 0, %o0
8777c478bdstevel@tonic-gate	!
8787c478bdstevel@tonic-gate	! We're here via bcopy. There *must* have been an error handler
8797c478bdstevel@tonic-gate	! in place otherwise we would have died a nasty death already.
8807c478bdstevel@tonic-gate	!
8817c478bdstevel@tonic-gate	jmp	%l6				! goto real handler
8827c478bdstevel@tonic-gate	  restore	%g0, 0, %o0		! dispose of copy window
8857c478bdstevel@tonic-gate * We got here because of a fault in .copyerr.  We can't safely restore fp
8867c478bdstevel@tonic-gate * state, so we panic.
8877c478bdstevel@tonic-gate */
8897c478bdstevel@tonic-gate	.asciz	"Unable to restore fp state after copy operation"
8917c478bdstevel@tonic-gate	.align	4
8937c478bdstevel@tonic-gate	set	fp_panic_msg, %o0
8947c478bdstevel@tonic-gate	call	panic
8957c478bdstevel@tonic-gate	  nop
8987c478bdstevel@tonic-gate * We got here because of a fault during a small kcopy or bcopy.
8997c478bdstevel@tonic-gate * No floating point registers are used by the small copies.
9007c478bdstevel@tonic-gate * Errno value is in %g1.
9017c478bdstevel@tonic-gate */
9047c478bdstevel@tonic-gate	btst	TRAMP_FLAG, %o4
9057c478bdstevel@tonic-gate	membar	#Sync
9067c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
9077c478bdstevel@tonic-gate	bnz,pn	%ncc, 3f
9087c478bdstevel@tonic-gate	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
9097c478bdstevel@tonic-gate	retl
9107c478bdstevel@tonic-gate	  mov	%g1, %o0
9127c478bdstevel@tonic-gate	jmp	%o4				! goto real handler
9137c478bdstevel@tonic-gate	  mov	%g0, %o0			!
9157c478bdstevel@tonic-gate	SET_SIZE(kcopy)
9167c478bdstevel@tonic-gate#endif	/* lint */
9207c478bdstevel@tonic-gate * Copy a block of storage - must not overlap (from + len <= to).
9217c478bdstevel@tonic-gate * Registers: l6 - saved t_lofault
9227c478bdstevel@tonic-gate * (for short copies, o4 - saved t_lofault)
9237c478bdstevel@tonic-gate *
9247c478bdstevel@tonic-gate * Copy a page of memory.
9257c478bdstevel@tonic-gate * Assumes double word alignment and a count >= 256.
9267c478bdstevel@tonic-gate */
9277c478bdstevel@tonic-gate#if defined(lint)
9297c478bdstevel@tonic-gate/* ARGSUSED */
9317c478bdstevel@tonic-gatebcopy(const void *from, void *to, size_t count)
9347c478bdstevel@tonic-gate#else	/* lint */
9367c478bdstevel@tonic-gate	ENTRY(bcopy)
9387c478bdstevel@tonic-gate	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
9397c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to larger cases
9407c478bdstevel@tonic-gate	  xor	%o0, %o1, %o3			! are src, dst alignable?
9417c478bdstevel@tonic-gate	btst	7, %o3				!
9427c478bdstevel@tonic-gate	bz,pt	%ncc, .bcopy_8			! check for longword alignment
9437c478bdstevel@tonic-gate	  nop
9447c478bdstevel@tonic-gate	btst	1, %o3				!
9457c478bdstevel@tonic-gate	bz,pt	%ncc, .bcopy_2			! check for half-word
9467c478bdstevel@tonic-gate	  nop
9477c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
9487c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
9497c478bdstevel@tonic-gate	tst	%o3
9507c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9517c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9527c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9537c478bdstevel@tonic-gate	  nop
9547c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9557c478bdstevel@tonic-gate	  nop
9577c478bdstevel@tonic-gate	btst	3, %o3				!
9587c478bdstevel@tonic-gate	bz,pt	%ncc, .bcopy_4			! check for word alignment
9597c478bdstevel@tonic-gate	  nop
9607c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
9617c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
9627c478bdstevel@tonic-gate	tst	%o3
9637c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9647c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9657c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9667c478bdstevel@tonic-gate	  nop
9677c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9687c478bdstevel@tonic-gate	  nop
9707c478bdstevel@tonic-gate	! already checked longword, must be word aligned
9717c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
9727c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
9737c478bdstevel@tonic-gate	tst	%o3
9747c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9757c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9767c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9777c478bdstevel@tonic-gate	  nop
9787c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9797c478bdstevel@tonic-gate	  nop
9817c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
9827c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
9837c478bdstevel@tonic-gate	tst	%o3
9847c478bdstevel@tonic-gate	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
9857c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
9867c478bdstevel@tonic-gate	bleu,pt	%ncc, .bcopy_small		! go to small copy
9877c478bdstevel@tonic-gate	  nop
9887c478bdstevel@tonic-gate	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
9897c478bdstevel@tonic-gate	  nop
9917c478bdstevel@tonic-gate	.align	16
9937c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
9947c478bdstevel@tonic-gate	tst	%o4
9957c478bdstevel@tonic-gate	bz,pt	%icc, .sm_do_copy
9967c478bdstevel@tonic-gate	  nop
9977c478bdstevel@tonic-gate	sethi	%hi(.sm_copyerr), %o5
9987c478bdstevel@tonic-gate	or	%o5, %lo(.sm_copyerr), %o5
9997c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
10007c478bdstevel@tonic-gate	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
10017c478bdstevel@tonic-gate	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
10037c478bdstevel@tonic-gate	cmp	%o2, SHORTCOPY		! check for really short case
10047c478bdstevel@tonic-gate	bleu,pt	%ncc, .bc_sm_left	!
10057c478bdstevel@tonic-gate	  cmp	%o2, CHKSIZE		! check for medium length cases
10067c478bdstevel@tonic-gate	bgu,pn	%ncc, .bc_med		!
10077c478bdstevel@tonic-gate	  or	%o0, %o1, %o3		! prepare alignment check
10087c478bdstevel@tonic-gate	andcc	%o3, 0x3, %g0		! test for alignment
10097c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
10117c478bdstevel@tonic-gate	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
10137c478bdstevel@tonic-gate	ldub	[%o0], %o3		! read byte
10147c478bdstevel@tonic-gate	stb	%o3, [%o1]		! write byte
10157c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
10167c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
10177c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
10187c478bdstevel@tonic-gate	stb	%o3, [%o1 + 1]
10197c478bdstevel@tonic-gate	ldub	[%o0 - 2], %o3
10207c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
10217c478bdstevel@tonic-gate	stb	%o3, [%o1 - 2]
10227c478bdstevel@tonic-gate	ldub	[%o0 - 1], %o3
10237c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
10247c478bdstevel@tonic-gate	  stb	%o3, [%o1 - 1]
10257c478bdstevel@tonic-gate	add	%o2, 3, %o2		! restore count
10277c478bdstevel@tonic-gate	tst	%o2
10287c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit	! check for zero length
10297c478bdstevel@tonic-gate	  deccc	%o2			! reduce count for cc test
10307c478bdstevel@tonic-gate	ldub	[%o0], %o3		! move one byte
10317c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10327c478bdstevel@tonic-gate	  stb	%o3, [%o1]
10337c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! move another byte
10347c478bdstevel@tonic-gate	deccc	%o2			! check for more
10357c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10367c478bdstevel@tonic-gate	  stb	%o3, [%o1 + 1]
10377c478bdstevel@tonic-gate	ldub	[%o0 + 2], %o3		! move final byte
10387c478bdstevel@tonic-gate	stb	%o3, [%o1 + 2]
10397c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
10407c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
10417c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
10427c478bdstevel@tonic-gate	retl
10437c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
10447c478bdstevel@tonic-gate	.align	16
10457c478bdstevel@tonic-gate	nop				! instruction alignment
10467c478bdstevel@tonic-gate					! see discussion at start of file
10487c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
10507c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! update count
10517c478bdstevel@tonic-gate	stw	%o3, [%o1]		! write word
10527c478bdstevel@tonic-gate	add	%o0, 8, %o0		! update SRC
10537c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3		! read word
10547c478bdstevel@tonic-gate	add	%o1, 8, %o1		! update DST
10557c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_words	! loop til done
10567c478bdstevel@tonic-gate	  stw	%o3, [%o1 - 4]		! write word
10577c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
10587c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10597c478bdstevel@tonic-gate	  deccc	%o2
10607c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
10627c478bdstevel@tonic-gate	  subcc	%o2, 2, %o2		! reduce count by 2
10637c478bdstevel@tonic-gate	add	%o0, 2, %o0		! advance SRC by 2
10647c478bdstevel@tonic-gate	lduh	[%o0 - 2], %o3		! read half word
10657c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
10667c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_half	! loop til done
10677c478bdstevel@tonic-gate	  sth	%o3, [%o1 - 2]		! write half word
10687c478bdstevel@tonic-gate	addcc	%o2, 1, %o2		! restore count
10697c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10707c478bdstevel@tonic-gate	  nop
10727c478bdstevel@tonic-gate	ldub	[%o0], %o3
10737c478bdstevel@tonic-gate	stb	%o3, [%o1]
10747c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
10757c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
10767c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
10777c478bdstevel@tonic-gate	retl
10787c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
10817c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! update count
10827c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_sm_wordx
10837c478bdstevel@tonic-gate	  lduw	[%o0], %o3		! read word
10847c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore count
10857c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10867c478bdstevel@tonic-gate	  stw	%o3, [%o1]		! write word
10877c478bdstevel@tonic-gate	deccc	%o2			! reduce count for cc test
10887c478bdstevel@tonic-gate	ldub	[%o0 + 4], %o3		! load one byte
10897c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10907c478bdstevel@tonic-gate	  stb	%o3, [%o1 + 4]		! store one byte
10917c478bdstevel@tonic-gate	ldub	[%o0 + 5], %o3		! load second byte
10927c478bdstevel@tonic-gate	deccc	%o2
10937c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
10947c478bdstevel@tonic-gate	  stb	%o3, [%o1 + 5]		! store second byte
10957c478bdstevel@tonic-gate	ldub	[%o0 + 6], %o3		! load third byte
10967c478bdstevel@tonic-gate	stb	%o3, [%o1 + 6]		! store third byte
10987c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
10997c478bdstevel@tonic-gate	andn	%o4, TRAMP_FLAG, %o4
11007c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
11017c478bdstevel@tonic-gate	retl
11027c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
11047c478bdstevel@tonic-gate	.align 16
11067c478bdstevel@tonic-gate	xor	%o0, %o1, %o3		! setup alignment check
11077c478bdstevel@tonic-gate	btst	1, %o3
11087c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
11097c478bdstevel@tonic-gate	  nop
11107c478bdstevel@tonic-gate	btst	3, %o3
11117c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_half	! halfword aligned
11127c478bdstevel@tonic-gate	  nop
11137c478bdstevel@tonic-gate	btst	7, %o3
11147c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_word	! word aligned
11157c478bdstevel@tonic-gate	  nop
11177c478bdstevel@tonic-gate	btst	3, %o0			! check for
11187c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_long1	! word alignment
11197c478bdstevel@tonic-gate	  nop
11217c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
11227c478bdstevel@tonic-gate	inc	%o0
11237c478bdstevel@tonic-gate	stb	%o3,[%o1]		! store byte
11247c478bdstevel@tonic-gate	inc	%o1
11257c478bdstevel@tonic-gate	btst	3, %o0
11267c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_long0
11277c478bdstevel@tonic-gate	  dec	%o2
11287c478bdstevel@tonic-gate.bc_med_long1:			! word aligned
11297c478bdstevel@tonic-gate	btst	7, %o0			! check for long word
11307c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_long2
11317c478bdstevel@tonic-gate	  nop
11327c478bdstevel@tonic-gate	lduw	[%o0], %o3		! load word
11337c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
11347c478bdstevel@tonic-gate	stw	%o3, [%o1]		! store word
11357c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
11367c478bdstevel@tonic-gate	sub	%o2, 4, %o2		! reduce count by 4
11387c478bdstevel@tonic-gate!  Now long word aligned and have at least 32 bytes to move
11417c478bdstevel@tonic-gate	sub	%o2, 31, %o2		! adjust count to allow cc zero test
11437c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
11447c478bdstevel@tonic-gate	stx	%o3, [%o1]		! write long word
11457c478bdstevel@tonic-gate	subcc	%o2, 32, %o2		! reduce count by 32
11467c478bdstevel@tonic-gate	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
11477c478bdstevel@tonic-gate	add	%o0, 32, %o0		! advance SRC by 32
11487c478bdstevel@tonic-gate	stx	%o3, [%o1 + 8]
11497c478bdstevel@tonic-gate	ldx	[%o0 - 16], %o3
11507c478bdstevel@tonic-gate	add	%o1, 32, %o1		! advance DST by 32
11517c478bdstevel@tonic-gate	stx	%o3, [%o1 - 16]
11527c478bdstevel@tonic-gate	ldx	[%o0 - 8], %o3
11537c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
11547c478bdstevel@tonic-gate	  stx	%o3, [%o1 - 8]
11557c478bdstevel@tonic-gate	addcc	%o2, 24, %o2		! restore count to long word offset
11567c478bdstevel@tonic-gate	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
11577c478bdstevel@tonic-gate	  nop
11597c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
11607c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
11617c478bdstevel@tonic-gate	stx	%o3, [%o1]		! write long word
11627c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
11637c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
11647c478bdstevel@tonic-gate	  add	%o1, 8, %o1		! advance DST by 8
11667c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore rest of count
11677c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
11687c478bdstevel@tonic-gate	  deccc	%o2
11697c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
11707c478bdstevel@tonic-gate	  nop
11717c478bdstevel@tonic-gate	ba,pt	%ncc, .bc_sm_half
11727c478bdstevel@tonic-gate	  nop
11747c478bdstevel@tonic-gate	.align 16
11767c478bdstevel@tonic-gate	btst	3, %o0			! check for
11777c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_word1	! word alignment
11787c478bdstevel@tonic-gate	  nop
11807c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
11817c478bdstevel@tonic-gate	inc	%o0
11827c478bdstevel@tonic-gate	stb	%o3,[%o1]		! store byte
11837c478bdstevel@tonic-gate	inc	%o1
11847c478bdstevel@tonic-gate	btst	3, %o0
11857c478bdstevel@tonic-gate	bnz,pt	%ncc, .bc_med_word0
11867c478bdstevel@tonic-gate	  dec	%o2
11887c478bdstevel@tonic-gate!  Now word aligned and have at least 36 bytes to move
11917c478bdstevel@tonic-gate	sub	%o2, 15, %o2		! adjust count to allow cc zero test
11937c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
11947c478bdstevel@tonic-gate	stw	%o3, [%o1]		! write word
11957c478bdstevel@tonic-gate	subcc	%o2, 16, %o2		! reduce count by 16
11967c478bdstevel@tonic-gate	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
11977c478bdstevel@tonic-gate	add	%o0, 16, %o0		! advance SRC by 16
11987c478bdstevel@tonic-gate	stw	%o3, [%o1 + 4]
11997c478bdstevel@tonic-gate	lduw	[%o0 - 8], %o3
12007c478bdstevel@tonic-gate	add	%o1, 16, %o1		! advance DST by 16
12017c478bdstevel@tonic-gate	stw	%o3, [%o1 - 8]
12027c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3
12037c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
12047c478bdstevel@tonic-gate	  stw	%o3, [%o1 - 4]
12057c478bdstevel@tonic-gate	addcc	%o2, 12, %o2		! restore count to word offset
12067c478bdstevel@tonic-gate	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
12077c478bdstevel@tonic-gate	  nop
12097c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
12107c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
12117c478bdstevel@tonic-gate	stw	%o3, [%o1]		! write word
12127c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
12137c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
12147c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! advance DST by 4
12167c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore rest of count
12177c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
12187c478bdstevel@tonic-gate	  deccc	%o2
12197c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
12207c478bdstevel@tonic-gate	  nop
12217c478bdstevel@tonic-gate	ba,pt	%ncc, .bc_sm_half
12227c478bdstevel@tonic-gate	  nop
12247c478bdstevel@tonic-gate	.align 16
12267c478bdstevel@tonic-gate	btst	1, %o0			! check for
12277c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_med_half1	! half word alignment
12287c478bdstevel@tonic-gate	  nop
12297c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
12307c478bdstevel@tonic-gate	inc	%o0
12317c478bdstevel@tonic-gate	stb	%o3,[%o1]		! store byte
12327c478bdstevel@tonic-gate	inc	%o1
12337c478bdstevel@tonic-gate	dec	%o2
12357c478bdstevel@tonic-gate!  Now half word aligned and have at least 38 bytes to move
12387c478bdstevel@tonic-gate	sub	%o2, 7, %o2		! adjust count to allow cc zero test
12407c478bdstevel@tonic-gate	lduh	[%o0], %o3		! read half word
12417c478bdstevel@tonic-gate	sth	%o3, [%o1]		! write half word
12427c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
12437c478bdstevel@tonic-gate	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
12447c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
12457c478bdstevel@tonic-gate	sth	%o3, [%o1 + 2]
12467c478bdstevel@tonic-gate	lduh	[%o0 - 4], %o3
12477c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
12487c478bdstevel@tonic-gate	sth	%o3, [%o1 - 4]
12497c478bdstevel@tonic-gate	lduh	[%o0 - 2], %o3
12507c478bdstevel@tonic-gate	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
12517c478bdstevel@tonic-gate	  sth	%o3, [%o1 - 2]
12527c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
12537c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_exit
12547c478bdstevel@tonic-gate	  deccc	%o2
12557c478bdstevel@tonic-gate	bz,pt	%ncc, .bc_sm_byte
12567c478bdstevel@tonic-gate	  nop
12577c478bdstevel@tonic-gate	ba,pt	%ncc, .bc_sm_half
12587c478bdstevel@tonic-gate	  nop
12607c478bdstevel@tonic-gate	SET_SIZE(bcopy)
/*
12637c478bdstevel@tonic-gate * The _more entry points are not intended to be used directly by
12647c478bdstevel@tonic-gate * any caller from outside this file.  They are provided to allow
12657c478bdstevel@tonic-gate * profiling and dtrace of the portions of the copy code that uses
12667c478bdstevel@tonic-gate * the floating point registers.
12677c478bdstevel@tonic-gate * This entry is particularly important as DTRACE (at least as of
12687c478bdstevel@tonic-gate * 4/2004) does not support leaf functions.
12697c478bdstevel@tonic-gate */
/*
 * bcopy_more: non-leaf continuation of bcopy()/kcopy() for copies
 * large enough to use the FP/VIS block-copy path.  Allocates a
 * register window, installs .copyerr as the t_lofault handler
 * (tagging the saved handler in %l6 with TRAMP_FLAG so the error
 * path knows the call came through bcopy), saves %fprs/%gsr and the
 * FP registers (quadrants 1 and 3, via BST_FPQ1Q3_TOSTACK) on the
 * stack, then moves the data in VIS_BLOCKSIZE chunks with
 * faligndata/stda.  On success restores FP state and t_lofault and
 * returns 0 in %o0.
 */
12717c478bdstevel@tonic-gate	ENTRY(bcopy_more)
12737c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
12747c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
12757c478bdstevel@tonic-gate	tst	%l6
12767c478bdstevel@tonic-gate	bz,pt	%ncc, .do_copy
12777c478bdstevel@tonic-gate	  nop
12787c478bdstevel@tonic-gate	sethi	%hi(.copyerr), %o2
12797c478bdstevel@tonic-gate	or	%o2, %lo(.copyerr), %o2
12807c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
12817c478bdstevel@tonic-gate	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
12827c478bdstevel@tonic-gate	!
12837c478bdstevel@tonic-gate	! We've already captured whether t_lofault was zero on entry.
12847c478bdstevel@tonic-gate	! We need to mark ourselves as being from bcopy since both
12857c478bdstevel@tonic-gate	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
12867c478bdstevel@tonic-gate	! and the saved lofault was zero, we won't reset lofault on
12877c478bdstevel@tonic-gate	! returning.
12887c478bdstevel@tonic-gate	!
12897c478bdstevel@tonic-gate	or	%l6, TRAMP_FLAG, %l6
/*
12927c478bdstevel@tonic-gate * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
12937c478bdstevel@tonic-gate * Also, use of FP registers has been tested to be enabled
12947c478bdstevel@tonic-gate */
12967c478bdstevel@tonic-gate	FP_NOMIGRATE(6, 7)
12987c478bdstevel@tonic-gate	rd	%fprs, %o2		! check for unused fp
12997c478bdstevel@tonic-gate	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
13007c478bdstevel@tonic-gate	btst	FPRS_FEF, %o2
13017c478bdstevel@tonic-gate	bz,a,pt	%icc, .do_blockcopy
13027c478bdstevel@tonic-gate	  wr	%g0, FPRS_FEF, %fprs
13047c478bdstevel@tonic-gate	BST_FPQ1Q3_TOSTACK(%o2)
13077c478bdstevel@tonic-gate	rd	%gsr, %o2
13087c478bdstevel@tonic-gate	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
13097c478bdstevel@tonic-gate	or	%l6, FPUSED_FLAG, %l6	! remember FP state was saved
13117c478bdstevel@tonic-gate#define	REALSRC	%i0
13127c478bdstevel@tonic-gate#define	DST	%i1
13137c478bdstevel@tonic-gate#define	CNT	%i2
13147c478bdstevel@tonic-gate#define	SRC	%i3
13157c478bdstevel@tonic-gate#define	TMP	%i5
	! Bring DST up to a VIS_BLOCKSIZE boundary: compute the byte
	! shortfall in TMP, copy leading bytes four at a time in the
	! .bc_blkalign loop, then singly in the 1: loop below.
13177c478bdstevel@tonic-gate	andcc	DST, VIS_BLOCKSIZE - 1, TMP
13187c478bdstevel@tonic-gate	bz,pt	%ncc, 2f
13197c478bdstevel@tonic-gate	  neg	TMP
13207c478bdstevel@tonic-gate	add	TMP, VIS_BLOCKSIZE, TMP
13227c478bdstevel@tonic-gate	! TMP = bytes required to align DST on FP_BLOCK boundary
13237c478bdstevel@tonic-gate	! Using SRC as a tmp here
13247c478bdstevel@tonic-gate	cmp	TMP, 3
13257c478bdstevel@tonic-gate	bleu,pt	%ncc, 1f
13267c478bdstevel@tonic-gate	  sub	CNT,TMP,CNT		! adjust main count
13277c478bdstevel@tonic-gate	sub	TMP, 3, TMP		! adjust for end of loop test
13297c478bdstevel@tonic-gate	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
13307c478bdstevel@tonic-gate	stb	SRC, [DST]
13317c478bdstevel@tonic-gate	subcc	TMP, 4, TMP
13327c478bdstevel@tonic-gate	ldub	[REALSRC + 1], SRC
13337c478bdstevel@tonic-gate	add	REALSRC, 4, REALSRC
13347c478bdstevel@tonic-gate	stb	SRC, [DST + 1]
13357c478bdstevel@tonic-gate	ldub	[REALSRC - 2], SRC
13367c478bdstevel@tonic-gate	add	DST, 4, DST
13377c478bdstevel@tonic-gate	stb	SRC, [DST - 2]
13387c478bdstevel@tonic-gate	ldub	[REALSRC - 1], SRC
13397c478bdstevel@tonic-gate	bgu,pt	%ncc, .bc_blkalign
13407c478bdstevel@tonic-gate	  stb	SRC, [DST - 1]
13427c478bdstevel@tonic-gate	addcc	TMP, 3, TMP		! restore count adjustment
13437c478bdstevel@tonic-gate	bz,pt	%ncc, 2f		! no bytes left?
13447c478bdstevel@tonic-gate	  nop
13457c478bdstevel@tonic-gate1:	ldub	[REALSRC], SRC
13467c478bdstevel@tonic-gate	inc	REALSRC
13477c478bdstevel@tonic-gate	inc	DST
13487c478bdstevel@tonic-gate	deccc	TMP
13497c478bdstevel@tonic-gate	bgu	%ncc, 1b
13507c478bdstevel@tonic-gate	  stb	SRC, [DST - 1]
	! DST now block aligned.  SRC = REALSRC rounded down to an
	! 8-byte boundary; alignaddr establishes the alignment that the
	! faligndata sequence below uses to extract the unaligned stream.
13537c478bdstevel@tonic-gate	andn	REALSRC, 0x7, SRC
13547c478bdstevel@tonic-gate	alignaddr REALSRC, %g0, %g0
13567c478bdstevel@tonic-gate	! SRC - 8-byte aligned
13577c478bdstevel@tonic-gate	! DST - 64-byte aligned
13587c478bdstevel@tonic-gate	prefetch [SRC], #one_read
13597c478bdstevel@tonic-gate	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
13607c478bdstevel@tonic-gate	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
13617c478bdstevel@tonic-gate	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
13627c478bdstevel@tonic-gate	ldd	[SRC], %f0
13637c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 4
13647c478bdstevel@tonic-gate	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
13667c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
13677c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 5
13687c478bdstevel@tonic-gate	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
13707c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
13717c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 6
13727c478bdstevel@tonic-gate	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
13747c478bdstevel@tonic-gate	faligndata %f0, %f2, %f32
13757c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
13767c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 7
13777c478bdstevel@tonic-gate	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
13797c478bdstevel@tonic-gate	faligndata %f2, %f4, %f34
13807c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
13817c478bdstevel@tonic-gate	faligndata %f4, %f6, %f36
13827c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
13837c478bdstevel@tonic-gate	faligndata %f6, %f8, %f38
13847c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
13857c478bdstevel@tonic-gate	faligndata %f8, %f10, %f40
13867c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
13877c478bdstevel@tonic-gate	faligndata %f10, %f12, %f42
13887c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
13897c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
13907c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
13917c478bdstevel@tonic-gate	add	REALSRC, VIS_BLOCKSIZE, REALSRC
13927c478bdstevel@tonic-gate	ba,a,pt	%ncc, 1f
13937c478bdstevel@tonic-gate	  nop
	! Steady-state loop: load the next 64 source bytes, realign them
	! into %f32-%f46 with faligndata while the previously assembled
	! block is stored with a single stda to [DST].
13947c478bdstevel@tonic-gate	.align	16
13967c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
13977c478bdstevel@tonic-gate	faligndata %f12, %f14, %f44
13987c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
13997c478bdstevel@tonic-gate	faligndata %f14, %f0, %f46
14007c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
14017c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
14027c478bdstevel@tonic-gate	faligndata %f0, %f2, %f32
14037c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
14047c478bdstevel@tonic-gate	faligndata %f2, %f4, %f34
14057c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
14067c478bdstevel@tonic-gate	faligndata %f4, %f6, %f36
14077c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
14087c478bdstevel@tonic-gate	faligndata %f6, %f8, %f38
14097c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
14107c478bdstevel@tonic-gate	faligndata %f8, %f10, %f40
14117c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
14127c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
14137c478bdstevel@tonic-gate	faligndata %f10, %f12, %f42
14147c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
14157c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
14167c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
14177c478bdstevel@tonic-gate	add	REALSRC, VIS_BLOCKSIZE, REALSRC
14187c478bdstevel@tonic-gate	cmp	CNT, VIS_BLOCKSIZE + 8
14197c478bdstevel@tonic-gate	bgu,pt	%ncc, 1b
14207c478bdstevel@tonic-gate	  add	SRC, VIS_BLOCKSIZE, SRC
	! main loop done; finish the final block(s) and byte tail
14227c478bdstevel@tonic-gate	! only if REALSRC & 0x7 is 0
14237c478bdstevel@tonic-gate	cmp	CNT, VIS_BLOCKSIZE
14247c478bdstevel@tonic-gate	bne	%ncc, 3f
14257c478bdstevel@tonic-gate	  andcc	REALSRC, 0x7, %g0
14267c478bdstevel@tonic-gate	bz,pt	%ncc, 2f
14277c478bdstevel@tonic-gate	  nop
	! store the last block assembled by the loop above
14297c478bdstevel@tonic-gate	faligndata %f12, %f14, %f44
14307c478bdstevel@tonic-gate	faligndata %f14, %f0, %f46
14317c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
14327c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
14337c478bdstevel@tonic-gate	ba,pt	%ncc, 3f
14347c478bdstevel@tonic-gate	  nop
	! source is 8-byte aligned: move the final blocks with fsrc1
	! instead of faligndata (no realignment needed)
14367c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
14377c478bdstevel@tonic-gate	fsrc1	%f12, %f44
14387c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
14397c478bdstevel@tonic-gate	fsrc1	%f14, %f46
14407c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
14417c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
14427c478bdstevel@tonic-gate	fsrc1	%f0, %f32
14437c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
14447c478bdstevel@tonic-gate	fsrc1	%f2, %f34
14457c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
14467c478bdstevel@tonic-gate	fsrc1	%f4, %f36
14477c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
14487c478bdstevel@tonic-gate	fsrc1	%f6, %f38
14497c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
14507c478bdstevel@tonic-gate	fsrc1	%f8, %f40
14517c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
14527c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
14537c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
14547c478bdstevel@tonic-gate	add	REALSRC, VIS_BLOCKSIZE, REALSRC
14557c478bdstevel@tonic-gate	fsrc1	%f10, %f42
14567c478bdstevel@tonic-gate	fsrc1	%f12, %f44
14577c478bdstevel@tonic-gate	fsrc1	%f14, %f46
14587c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
14597c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
14607c478bdstevel@tonic-gate	ba,a,pt	%ncc, .bcb_exit
14617c478bdstevel@tonic-gate	  nop
	! copy any remaining tail bytes one at a time (5: loop)
14637c478bdstevel@tonic-gate3:	tst	CNT
14647c478bdstevel@tonic-gate	bz,a,pt	%ncc, .bcb_exit
14657c478bdstevel@tonic-gate	  nop
14677c478bdstevel@tonic-gate5:	ldub	[REALSRC], TMP
14687c478bdstevel@tonic-gate	inc	REALSRC
14697c478bdstevel@tonic-gate	inc	DST
14707c478bdstevel@tonic-gate	deccc	CNT
14717c478bdstevel@tonic-gate	bgu	%ncc, 5b
14727c478bdstevel@tonic-gate	  stb	TMP, [DST - 1]
	! copy complete: error barrier, FP-RAS integrity check (FPRAS_*
	! macros), then restore %gsr and %fprs saved at entry
14747c478bdstevel@tonic-gate	membar	#Sync
14767c478bdstevel@tonic-gate	FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
14777c478bdstevel@tonic-gate	FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
14787c478bdstevel@tonic-gate	FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)	! outputs lost
14807c478bdstevel@tonic-gate	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
14817c478bdstevel@tonic-gate	wr	%o2, 0, %gsr
14837c478bdstevel@tonic-gate	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
14847c478bdstevel@tonic-gate	btst	FPRS_FEF, %o3
14857c478bdstevel@tonic-gate	bz,pt	%icc, 4f
14867c478bdstevel@tonic-gate	  nop
14887c478bdstevel@tonic-gate	BLD_FPQ1Q3_FROMSTACK(%o2)
14907c478bdstevel@tonic-gate	ba,pt	%ncc, 2f
14917c478bdstevel@tonic-gate	  wr	%o3, 0, %fprs		! restore fprs
	! FP was idle on entry: zero the quadrants we used instead of
	! restoring saved contents
14937c478bdstevel@tonic-gate	FZEROQ1Q3
14947c478bdstevel@tonic-gate	wr	%o3, 0, %fprs		! restore fprs
14967c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
14977c478bdstevel@tonic-gate	andn	%l6, MASK_FLAGS, %l6
14987c478bdstevel@tonic-gate	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
14997c478bdstevel@tonic-gate	FP_ALLOWMIGRATE(5, 6)
15007c478bdstevel@tonic-gate	ret
15017c478bdstevel@tonic-gate	  restore	%g0, 0, %o0	! return 0
15037c478bdstevel@tonic-gate	SET_SIZE(bcopy_more)
15057c478bdstevel@tonic-gate#endif	/* lint */
/*
15087c478bdstevel@tonic-gate * Block copy with possibly overlapped operands.
15097c478bdstevel@tonic-gate */
15117c478bdstevel@tonic-gate#if defined(lint)
15157c478bdstevel@tonic-gateovbcopy(const void *from, void *to, size_t count)
15187c478bdstevel@tonic-gate#else	/* lint */
/*
 * ovbcopy(from, to, count): bcopy() variant that tolerates
 * overlapping source and destination regions.  A zero count returns
 * immediately.  If the regions cannot overlap (count <= |from - to|)
 * the work is handed off to plain bcopy().  Otherwise bytes are
 * moved one at a time: forward when the destination lies below the
 * source, backward when it lies above, so no source byte is
 * overwritten before it has been read.
 */
15207c478bdstevel@tonic-gate	ENTRY(ovbcopy)
15217c478bdstevel@tonic-gate	tst	%o2			! check count
15227c478bdstevel@tonic-gate	bgu,a	%ncc, 1f		! branch if count > 0
15237c478bdstevel@tonic-gate	  subcc	%o0, %o1, %o3		! difference of from and to address
15257c478bdstevel@tonic-gate	retl				! return (count was zero)
15267c478bdstevel@tonic-gate	  nop
15287c478bdstevel@tonic-gate	bneg,a	%ncc, 2f
15297c478bdstevel@tonic-gate	  neg	%o3			! if < 0, make it positive
15307c478bdstevel@tonic-gate2:	cmp	%o2, %o3		! cmp size and abs(from - to)
15317c478bdstevel@tonic-gate	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
15327c478bdstevel@tonic-gate	  .empty				!   no overlap
15337c478bdstevel@tonic-gate	  cmp	%o0, %o1		! compare from and to addresses
15347c478bdstevel@tonic-gate	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
15357c478bdstevel@tonic-gate	  nop
15367c478bdstevel@tonic-gate	!
15377c478bdstevel@tonic-gate	! Copy forwards.
15387c478bdstevel@tonic-gate	!
15407c478bdstevel@tonic-gate	ldub	[%o0], %o3		! read from address
15417c478bdstevel@tonic-gate	inc	%o0			! inc from address
15427c478bdstevel@tonic-gate	stb	%o3, [%o1]		! write to address
15437c478bdstevel@tonic-gate	deccc	%o2			! dec count
15447c478bdstevel@tonic-gate	bgu	%ncc, .ov_fwd		! loop till done
15457c478bdstevel@tonic-gate	  inc	%o1			! inc to address
15477c478bdstevel@tonic-gate	retl				! return
15487c478bdstevel@tonic-gate	  nop
15497c478bdstevel@tonic-gate	!
15507c478bdstevel@tonic-gate	! Copy backwards.
15517c478bdstevel@tonic-gate	!
15537c478bdstevel@tonic-gate	deccc	%o2			! dec count
15547c478bdstevel@tonic-gate	ldub	[%o0 + %o2], %o3	! get byte at end of src
15557c478bdstevel@tonic-gate	bgu	%ncc, .ov_bkwd		! loop till done
15567c478bdstevel@tonic-gate	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
15587c478bdstevel@tonic-gate	retl				! return
15597c478bdstevel@tonic-gate	  nop
15617c478bdstevel@tonic-gate	SET_SIZE(ovbcopy)
15637c478bdstevel@tonic-gate#endif	/* lint */
/*
15677c478bdstevel@tonic-gate * hwblkpagecopy()
15687c478bdstevel@tonic-gate *
15697c478bdstevel@tonic-gate * Copies exactly one page.  This routine assumes the caller (ppcopy)
15707c478bdstevel@tonic-gate * has already disabled kernel preemption and has checked
15717c478bdstevel@tonic-gate * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
15727c478bdstevel@tonic-gate */
15737c478bdstevel@tonic-gate#ifdef lint
15767c478bdstevel@tonic-gatehwblkpagecopy(const void *src, void *dst)
15777c478bdstevel@tonic-gate{ }
15787c478bdstevel@tonic-gate#else /* lint */
/*
 * hwblkpagecopy(src, dst): copy exactly PAGESIZE bytes from src to
 * dst using 64-byte (VIS_BLOCKSIZE) block loads/stores.  The caller
 * (ppcopy) has already disabled preemption and checked use_hw_bcopy,
 * so no lofault handler is installed here.  %fprs and the FP
 * registers (quadrants 1/3) are saved around the copy if FP was in
 * use on entry; returns 0.
 */
15797c478bdstevel@tonic-gate	ENTRY(hwblkpagecopy)
15807c478bdstevel@tonic-gate	! get another window w/space for three aligned blocks of saved fpregs
15817c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
15837c478bdstevel@tonic-gate	! %i0 - source address (arg)
15847c478bdstevel@tonic-gate	! %i1 - destination address (arg)
15857c478bdstevel@tonic-gate	! %i2 - length of region (not arg)
15867c478bdstevel@tonic-gate	! %l0 - saved fprs
15877c478bdstevel@tonic-gate	! %l1 - pointer to saved fpregs
15897c478bdstevel@tonic-gate	rd	%fprs, %l0		! check for unused fp
15907c478bdstevel@tonic-gate	btst	FPRS_FEF, %l0
15917c478bdstevel@tonic-gate	bz,a,pt	%icc, 1f
15927c478bdstevel@tonic-gate	  wr	%g0, FPRS_FEF, %fprs
15947c478bdstevel@tonic-gate	BST_FPQ1Q3_TOSTACK(%l1)
15967c478bdstevel@tonic-gate1:	set	PAGESIZE, CNT		! copy exactly one page
15977c478bdstevel@tonic-gate	mov	REALSRC, SRC
15997c478bdstevel@tonic-gate	prefetch [SRC], #one_read
16007c478bdstevel@tonic-gate	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
16017c478bdstevel@tonic-gate	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
16027c478bdstevel@tonic-gate	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
16037c478bdstevel@tonic-gate	ldd	[SRC], %f0
16047c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 4
16057c478bdstevel@tonic-gate	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
16077c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
16087c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 5
16097c478bdstevel@tonic-gate	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
16117c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
16127c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 6
16137c478bdstevel@tonic-gate	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
16157c478bdstevel@tonic-gate	fsrc1	%f0, %f32
16167c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
16177c478bdstevel@tonic-gate#if CHEETAH_PREFETCH > 7
16187c478bdstevel@tonic-gate	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
16207c478bdstevel@tonic-gate	fsrc1	%f2, %f34
16217c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
16227c478bdstevel@tonic-gate	fsrc1	%f4, %f36
16237c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
16247c478bdstevel@tonic-gate	fsrc1	%f6, %f38
16257c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
16267c478bdstevel@tonic-gate	fsrc1	%f8, %f40
16277c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
16287c478bdstevel@tonic-gate	fsrc1	%f10, %f42
16297c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
16307c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
16317c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
16327c478bdstevel@tonic-gate	ba,a,pt	%ncc, 2f
16337c478bdstevel@tonic-gate	  nop
	! steady-state loop: stage the next 64 source bytes into
	! %f32-%f46 with fsrc1 (both buffers are 8-byte aligned, no
	! realignment needed) while storing the previous block with stda
16347c478bdstevel@tonic-gate	.align	16
16367c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
16377c478bdstevel@tonic-gate	fsrc1	%f12, %f44
16387c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
16397c478bdstevel@tonic-gate	fsrc1	%f14, %f46
16407c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
16417c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
16427c478bdstevel@tonic-gate	fsrc1	%f0, %f32
16437c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
16447c478bdstevel@tonic-gate	fsrc1	%f2, %f34
16457c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
16467c478bdstevel@tonic-gate	fsrc1	%f4, %f36
16477c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
16487c478bdstevel@tonic-gate	fsrc1	%f6, %f38
16497c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
16507c478bdstevel@tonic-gate	fsrc1	%f8, %f40
16517c478bdstevel@tonic-gate	ldd	[SRC + VIS_BLOCKSIZE], %f0
16527c478bdstevel@tonic-gate	fsrc1	%f10, %f42
16537c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
16547c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
16557c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
16567c478bdstevel@tonic-gate	cmp	CNT, VIS_BLOCKSIZE + 8
16577c478bdstevel@tonic-gate	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
16587c478bdstevel@tonic-gate	bgu,pt	%ncc, 2b
16597c478bdstevel@tonic-gate	  add	SRC, VIS_BLOCKSIZE, SRC
16617c478bdstevel@tonic-gate	! trailing block
16627c478bdstevel@tonic-gate	ldd	[SRC + 0x08], %f2
16637c478bdstevel@tonic-gate	fsrc1	%f12, %f44
16647c478bdstevel@tonic-gate	ldd	[SRC + 0x10], %f4
16657c478bdstevel@tonic-gate	fsrc1	%f14, %f46
16667c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
16677c478bdstevel@tonic-gate	ldd	[SRC + 0x18], %f6
16687c478bdstevel@tonic-gate	fsrc1	%f0, %f32
16697c478bdstevel@tonic-gate	ldd	[SRC + 0x20], %f8
16707c478bdstevel@tonic-gate	fsrc1	%f2, %f34
16717c478bdstevel@tonic-gate	ldd	[SRC + 0x28], %f10
16727c478bdstevel@tonic-gate	fsrc1	%f4, %f36
16737c478bdstevel@tonic-gate	ldd	[SRC + 0x30], %f12
16747c478bdstevel@tonic-gate	fsrc1	%f6, %f38
16757c478bdstevel@tonic-gate	ldd	[SRC + 0x38], %f14
16767c478bdstevel@tonic-gate	fsrc1	%f8, %f40
16777c478bdstevel@tonic-gate	sub	CNT, VIS_BLOCKSIZE, CNT
16787c478bdstevel@tonic-gate	add	DST, VIS_BLOCKSIZE, DST
16797c478bdstevel@tonic-gate	add	SRC, VIS_BLOCKSIZE, SRC
16807c478bdstevel@tonic-gate	fsrc1	%f10, %f42
16817c478bdstevel@tonic-gate	fsrc1	%f12, %f44
16827c478bdstevel@tonic-gate	fsrc1	%f14, %f46
16837c478bdstevel@tonic-gate	stda	%f32, [DST]ASI_BLK_P
	! page complete: error barrier, then FP-RAS integrity check
	! (FPRAS_* macros)
16857c478bdstevel@tonic-gate	membar	#Sync
16877c478bdstevel@tonic-gate	FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
16887c478bdstevel@tonic-gate	FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
16897c478bdstevel@tonic-gate	FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)	! lose outputs
	! restore FP regs saved at entry if FP was live, else zero the
	! quadrants we dirtied; then restore %fprs and return 0
16917c478bdstevel@tonic-gate	btst	FPRS_FEF, %l0
16927c478bdstevel@tonic-gate	bz,pt	%icc, 2f
16937c478bdstevel@tonic-gate	  nop
16957c478bdstevel@tonic-gate	BLD_FPQ1Q3_FROMSTACK(%l3)
16967c478bdstevel@tonic-gate	ba	3f
16977c478bdstevel@tonic-gate	  nop
16997c478bdstevel@tonic-gate2:	FZEROQ1Q3
17017c478bdstevel@tonic-gate3:	wr	%l0, 0, %fprs		! restore fprs
17027c478bdstevel@tonic-gate	ret
17037c478bdstevel@tonic-gate	  restore	%g0, 0, %o0	! return 0
17057c478bdstevel@tonic-gate	SET_SIZE(hwblkpagecopy)
17067c478bdstevel@tonic-gate#endif	/* lint */
/*
17107c478bdstevel@tonic-gate * Transfer data to and from user space -
17117c478bdstevel@tonic-gate * Note that these routines can cause faults
17127c478bdstevel@tonic-gate * It is assumed that the kernel has nothing at
17137c478bdstevel@tonic-gate * less than KERNELBASE in the virtual address space.
17147c478bdstevel@tonic-gate *
17157c478bdstevel@tonic-gate * Note that copyin(9F) and copyout(9F) are part of the
17167c478bdstevel@tonic-gate * DDI/DKI which specifies that they return '-1' on "errors."
17177c478bdstevel@tonic-gate *
17187c478bdstevel@tonic-gate * Sigh.
17197c478bdstevel@tonic-gate *
17207c478bdstevel@tonic-gate * So there's two extremely similar routines - xcopyin() and xcopyout()
17217c478bdstevel@tonic-gate * which return the errno that we've faithfully computed.  This
17227c478bdstevel@tonic-gate * allows other callers (e.g. uiomove(9F)) to work correctly.
17237c478bdstevel@tonic-gate * Given that these are used pretty heavily, we expand the calling
17247c478bdstevel@tonic-gate * sequences inline for all flavours (rather than making wrappers).
17257c478bdstevel@tonic-gate *
17267c478bdstevel@tonic-gate * There are also stub routines for xcopyout_little and xcopyin_little,
17277c478bdstevel@tonic-gate * which currently are intended to handle requests of <= 16 bytes from
17287c478bdstevel@tonic-gate * do_unaligned. Future enhancement to make them handle 8k pages efficiently
17297c478bdstevel@tonic-gate * is left as an exercise...
17307c478bdstevel@tonic-gate */
/*
17337c478bdstevel@tonic-gate * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
17347c478bdstevel@tonic-gate *
17357c478bdstevel@tonic-gate * General theory of operation:
17367c478bdstevel@tonic-gate *
17377c478bdstevel@tonic-gate * The only difference between copy{in,out} and
17387c478bdstevel@tonic-gate * xcopy{in,out} is in the error handling routine they invoke
17397c478bdstevel@tonic-gate * when a memory access error occurs. xcopyOP returns the errno
17407c478bdstevel@tonic-gate * while copyOP returns -1 (see above). copy{in,out}_noerr set
17417c478bdstevel@tonic-gate * a special flag (by oring the TRAMP_FLAG into the fault handler address)
17427c478bdstevel@tonic-gate * if they are called with a fault handler already in place. That flag
17437c478bdstevel@tonic-gate * causes the default handlers to trampoline to the previous handler
17447c478bdstevel@tonic-gate * upon an error.
17457c478bdstevel@tonic-gate *
17467c478bdstevel@tonic-gate * None of the copyops routines grab a window until it's decided that
17477c478bdstevel@tonic-gate * we need to do a HW block copy operation. This saves a window
17487c478bdstevel@tonic-gate * spill/fill when we're called during socket ops. The typical IO
17497c478bdstevel@tonic-gate * path won't cause spill/fill traps.
17507c478bdstevel@tonic-gate *
17517c478bdstevel@tonic-gate * This code uses a set of 4 limits for the maximum size that will
17527c478bdstevel@tonic-gate * be copied given a particular input/output address alignment.
17537c478bdstevel@tonic-gate * If the value for a particular limit is zero, the copy will be performed
17547c478bdstevel@tonic-gate * by the plain copy loops rather than FPBLK.
17557c478bdstevel@tonic-gate *
17567c478bdstevel@tonic-gate * See the description of bcopy above for more details of the
17577c478bdstevel@tonic-gate * data copying algorithm and the default limits.
17587c478bdstevel@tonic-gate *
17597c478bdstevel@tonic-gate */
17627c478bdstevel@tonic-gate * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
17637c478bdstevel@tonic-gate */
17657c478bdstevel@tonic-gate#if defined(lint)
17687c478bdstevel@tonic-gate#else	/* lint */
17707c478bdstevel@tonic-gate * We save the arguments in the following registers in case of a fault:
17717c478bdstevel@tonic-gate *	kaddr - %l1
17727c478bdstevel@tonic-gate *	uaddr - %l2
17737c478bdstevel@tonic-gate *	count - %l3
17747c478bdstevel@tonic-gate */
17757c478bdstevel@tonic-gate#define SAVE_SRC	%l1
17767c478bdstevel@tonic-gate#define SAVE_DST	%l2
17777c478bdstevel@tonic-gate#define SAVE_COUNT	%l3
17797c478bdstevel@tonic-gate#define SM_SAVE_SRC		%g4
17807c478bdstevel@tonic-gate#define SM_SAVE_DST		%g5
17817c478bdstevel@tonic-gate#define SM_SAVE_COUNT		%o5
17827c478bdstevel@tonic-gate#define ERRNO		%l5
17857c478bdstevel@tonic-gate#define REAL_LOFAULT	%l4
17877c478bdstevel@tonic-gate * Generic copyio fault handler.  This is the first line of defense when a
17887c478bdstevel@tonic-gate * fault occurs in (x)copyin/(x)copyout.  In order for this to function
17897c478bdstevel@tonic-gate * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
17907c478bdstevel@tonic-gate * This allows us to share common code for all the flavors of the copy
17917c478bdstevel@tonic-gate * operations, including the _noerr versions.
17927c478bdstevel@tonic-gate *
17937c478bdstevel@tonic-gate * Note that this function will restore the original input parameters before
17947c478bdstevel@tonic-gate * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
17957c478bdstevel@tonic-gate * member of the t_copyop structure, if needed.
17967c478bdstevel@tonic-gate */
17977c478bdstevel@tonic-gate	ENTRY(copyio_fault)
	! Common lofault handler for (x)copyin/(x)copyout (see block comment
	! above).  On entry %g1 holds the errno computed by the trap code;
	! %l6 holds the caller's saved t_lofault, possibly with FPUSED_FLAG
	! or'ed in; REAL_LOFAULT (%l4) holds the flavor-specific handler.
	! Restores FP state (if the FP block-copy path was active), restores
	! t_lofault, restores the original arguments, then tail-jumps to
	! REAL_LOFAULT.
	! NOTE(review): the local labels 1: / 2: / 4: referenced below are
	! not visible in this excerpt -- presumably dropped by the
	! annotate/extraction tool; they exist in the full source.
17987c478bdstevel@tonic-gate	membar	#Sync
17997c478bdstevel@tonic-gate	mov	%g1,ERRNO			! save errno in ERRNO
18007c478bdstevel@tonic-gate	btst	FPUSED_FLAG, %l6
18017c478bdstevel@tonic-gate	bz	%ncc, 1f			! skip FP restore if FP unused
18027c478bdstevel@tonic-gate	  nop
18047c478bdstevel@tonic-gate	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
18057c478bdstevel@tonic-gate	wr	%o2, 0, %gsr    	! restore gsr
18077c478bdstevel@tonic-gate	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
18087c478bdstevel@tonic-gate	btst	FPRS_FEF, %o3
18097c478bdstevel@tonic-gate	bz,pt	%icc, 4f			! FP was disabled on entry: just zero Q2-Q4
18107c478bdstevel@tonic-gate	  nop
	! FP was live on entry: reload the caller's %f32-%f62 from the stack.
18127c478bdstevel@tonic-gate	BLD_FPQ2Q4_FROMSTACK(%o2)
18147c478bdstevel@tonic-gate	ba,pt	%ncc, 1f
18157c478bdstevel@tonic-gate	  wr	%o3, 0, %fprs   	! restore fprs
	! (label 4: expected here in the full source)
18187c478bdstevel@tonic-gate	FZEROQ2Q4
18197c478bdstevel@tonic-gate	wr	%o3, 0, %fprs   	! restore fprs
	! (label 1: expected here in the full source)
18227c478bdstevel@tonic-gate	andn	%l6, FPUSED_FLAG, %l6	! strip flag before restoring t_lofault
18237c478bdstevel@tonic-gate	membar	#Sync
18247c478bdstevel@tonic-gate	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
18257c478bdstevel@tonic-gate	FP_ALLOWMIGRATE(5, 6)
	! Restore the original arguments so REAL_LOFAULT (or a t_copyop
	! handler it vectors to) sees the call as originally made.
18277c478bdstevel@tonic-gate	mov	SAVE_SRC, %i0
18287c478bdstevel@tonic-gate	mov	SAVE_DST, %i1
18297c478bdstevel@tonic-gate	jmp	REAL_LOFAULT
18307c478bdstevel@tonic-gate	  mov	SAVE_COUNT, %i2
18327c478bdstevel@tonic-gate	SET_SIZE(copyio_fault)
18377c478bdstevel@tonic-gate#if defined(lint)
18417c478bdstevel@tonic-gatecopyout(const void *kaddr, void *uaddr, size_t count)
18427c478bdstevel@tonic-gate{ return (0); }
18447c478bdstevel@tonic-gate#else	/* lint */
18467c478bdstevel@tonic-gate	ENTRY(copyout)
	! copyout(kaddr %o0, uaddr %o1, count %o2): copy kernel data to user
	! space.  Returns 0 on success, -1 on fault (DDI/DKI convention);
	! user-side stores go through ASI_USER.
	! Dispatch: counts <= VIS_COPY_THRESHOLD -- or <= the alignment-
	! specific hw_copy_limit_N, or any case where that limit is 0 --
	! take the leaf-routine byte/halfword/word/longword loops below;
	! everything larger branches to copyout_more (FP block copy).
	! Convention: instructions indented by two extra spaces sit in
	! branch delay slots.
	! NOTE(review): this excerpt comes from an annotate/webrev view;
	! most local label definition lines (.copyout_small, .copyout_2/_4/_8,
	! .sm_copyout, .co_sm_*, .co_med_*, .sm_copyout_err, 3:) are not
	! visible here -- presumably dropped by the extraction.  Branches
	! below assume they exist in the full source.
18487c478bdstevel@tonic-gate	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
18497c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy if <= threshold
18507c478bdstevel@tonic-gate	  xor	%o0, %o1, %o3			! are src, dst alignable?
18517c478bdstevel@tonic-gate	btst	7, %o3				!
18527c478bdstevel@tonic-gate	bz,pt	%ncc, .copyout_8		! check for longword alignment
18537c478bdstevel@tonic-gate	  nop
18547c478bdstevel@tonic-gate	btst	1, %o3				!
18557c478bdstevel@tonic-gate	bz,pt	%ncc, .copyout_2		! check for half-word
18567c478bdstevel@tonic-gate	  nop
	! byte-alignable only: limit is hw_copy_limit_1
18577c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
18587c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
18597c478bdstevel@tonic-gate	tst	%o3
18607c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18617c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18627c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18637c478bdstevel@tonic-gate	  nop
18647c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18657c478bdstevel@tonic-gate	  nop
	! halfword-alignable case (.copyout_2 label not visible in this view)
18677c478bdstevel@tonic-gate	btst	3, %o3				!
18687c478bdstevel@tonic-gate	bz,pt	%ncc, .copyout_4		! check for word alignment
18697c478bdstevel@tonic-gate	  nop
18707c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
18717c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
18727c478bdstevel@tonic-gate	tst	%o3
18737c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18747c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18757c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18767c478bdstevel@tonic-gate	  nop
18777c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18787c478bdstevel@tonic-gate	  nop
	! word-alignable case (.copyout_4 label not visible in this view)
18807c478bdstevel@tonic-gate	! already checked longword, must be word aligned
18817c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
18827c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
18837c478bdstevel@tonic-gate	tst	%o3
18847c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18857c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18867c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18877c478bdstevel@tonic-gate	  nop
18887c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18897c478bdstevel@tonic-gate	  nop
	! longword-alignable case (.copyout_8 label not visible in this view)
18917c478bdstevel@tonic-gate	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
18927c478bdstevel@tonic-gate	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
18937c478bdstevel@tonic-gate	tst	%o3
18947c478bdstevel@tonic-gate	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
18957c478bdstevel@tonic-gate	  cmp	%o2, %o3			! if length <= limit
18967c478bdstevel@tonic-gate	bleu,pt	%ncc, .copyout_small		! go to small copy
18977c478bdstevel@tonic-gate	  nop
18987c478bdstevel@tonic-gate	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
18997c478bdstevel@tonic-gate	  nop
	! small-copy leaf region (.copyout_small label not visible in this view)
19017c478bdstevel@tonic-gate	.align	16
19027c478bdstevel@tonic-gate	nop				! instruction alignment
19037c478bdstevel@tonic-gate					! see discussion at start of file
	! install the short-copy fault handler and save args for it
19057c478bdstevel@tonic-gate	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
19067c478bdstevel@tonic-gate	or	%o5, %lo(.sm_copyout_err), %o5
19077c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
19087c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
19097c478bdstevel@tonic-gate	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
19117c478bdstevel@tonic-gate	mov	%o0, SM_SAVE_SRC
19127c478bdstevel@tonic-gate	mov	%o1, SM_SAVE_DST
19137c478bdstevel@tonic-gate	cmp	%o2, SHORTCOPY		! check for really short case
19147c478bdstevel@tonic-gate	bleu,pt	%ncc, .co_sm_left	!
19157c478bdstevel@tonic-gate	  mov	%o2, SM_SAVE_COUNT
19167c478bdstevel@tonic-gate	cmp	%o2, CHKSIZE		! check for medium length cases
19177c478bdstevel@tonic-gate	bgu,pn	%ncc, .co_med		!
19187c478bdstevel@tonic-gate	  or	%o0, %o1, %o3		! prepare alignment check
19197c478bdstevel@tonic-gate	andcc	%o3, 0x3, %g0		! test for alignment
19207c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
19227c478bdstevel@tonic-gate	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
	! unaligned 4-bytes-per-iteration loop
	! (.co_sm_notalign4 label not visible in this view)
19247c478bdstevel@tonic-gate	ldub	[%o0], %o3		! read byte
19257c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
19267c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER	! write byte
19277c478bdstevel@tonic-gate	inc	%o1			! advance DST by 1
19287c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
19297c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
19307c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
19317c478bdstevel@tonic-gate	inc	%o1			! advance DST by 1
19327c478bdstevel@tonic-gate	ldub	[%o0 - 2], %o3
19337c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
19347c478bdstevel@tonic-gate	inc	%o1			! advance DST by 1
19357c478bdstevel@tonic-gate	ldub	[%o0 - 1], %o3
19367c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
19377c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
19387c478bdstevel@tonic-gate	  inc	%o1			! advance DST by 1
19397c478bdstevel@tonic-gate	add	%o2, 3, %o2		! restore count
	! 0-3 byte tail (.co_sm_left label not visible in this view)
19417c478bdstevel@tonic-gate	tst	%o2
19427c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit	! check for zero length
19437c478bdstevel@tonic-gate	  nop
19447c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
19457c478bdstevel@tonic-gate	deccc	%o2			! reduce count for cc test
19467c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19477c478bdstevel@tonic-gate	  stba	%o3,[%o1]ASI_USER	! store one byte
19487c478bdstevel@tonic-gate	ldub	[%o0 + 1], %o3		! load second byte
19497c478bdstevel@tonic-gate	deccc	%o2
19507c478bdstevel@tonic-gate	inc	%o1
19517c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19527c478bdstevel@tonic-gate	  stba	%o3,[%o1]ASI_USER	! store second byte
19537c478bdstevel@tonic-gate	ldub	[%o0 + 2], %o3		! load third byte
19547c478bdstevel@tonic-gate	inc	%o1
19557c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store third byte
	! success exit (.co_sm_exit label not visible in this view)
19567c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
19577c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
19587c478bdstevel@tonic-gate	retl
19597c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
19607c478bdstevel@tonic-gate	.align	16
	! word-aligned 8-bytes-per-iteration loop
	! (.co_sm_words label not visible in this view)
19627c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
19647c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! update count
19657c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
19667c478bdstevel@tonic-gate	add	%o0, 8, %o0		! update SRC
19677c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3		! read word
19687c478bdstevel@tonic-gate	add	%o1, 4, %o1		! update DST
19697c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
19707c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_words	! loop til done
19717c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! update DST
19727c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
19737c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19747c478bdstevel@tonic-gate	  nop
19757c478bdstevel@tonic-gate	deccc	%o2
19767c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
	! halfword tail loop (.co_sm_half label not visible in this view)
19787c478bdstevel@tonic-gate	  subcc	%o2, 2, %o2		! reduce count by 2
19797c478bdstevel@tonic-gate	lduh	[%o0], %o3		! read half word
19807c478bdstevel@tonic-gate	add	%o0, 2, %o0		! advance SRC by 2
19817c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER	! write half word
19827c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_half	! loop til done
19837c478bdstevel@tonic-gate	  add	%o1, 2, %o1		! advance DST by 2
19847c478bdstevel@tonic-gate	addcc	%o2, 1, %o2		! restore count
19857c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
19867c478bdstevel@tonic-gate	  nop
	! final single byte (.co_sm_byte label not visible in this view)
19887c478bdstevel@tonic-gate	ldub	[%o0], %o3
19897c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER
19907c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
19917c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
19927c478bdstevel@tonic-gate	retl
19937c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
19947c478bdstevel@tonic-gate	.align 16
	! short word-aligned case (.co_sm_word / .co_sm_wordx labels not
	! visible in this view)
19967c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! update count
19977c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_sm_wordx
19987c478bdstevel@tonic-gate	  lduw	[%o0], %o3		! read word
19997c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore count
20007c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
20017c478bdstevel@tonic-gate	  stwa	%o3, [%o1]ASI_USER	! write word
20027c478bdstevel@tonic-gate	deccc	%o2			! reduce count for cc test
20037c478bdstevel@tonic-gate	ldub	[%o0 + 4], %o3		! load one byte
20047c478bdstevel@tonic-gate	add	%o1, 4, %o1
20057c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
20067c478bdstevel@tonic-gate	  stba	%o3, [%o1]ASI_USER	! store one byte
20077c478bdstevel@tonic-gate	ldub	[%o0 + 5], %o3		! load second byte
20087c478bdstevel@tonic-gate	deccc	%o2
20097c478bdstevel@tonic-gate	inc	%o1
20107c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
20117c478bdstevel@tonic-gate	  stba	%o3, [%o1]ASI_USER	! store second byte
20127c478bdstevel@tonic-gate	ldub	[%o0 + 6], %o3		! load third byte
20137c478bdstevel@tonic-gate	inc	%o1
20147c478bdstevel@tonic-gate	stba	%o3, [%o1]ASI_USER	! store third byte
20167c478bdstevel@tonic-gate	  membar	#Sync				! sync error barrier
20177c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
20187c478bdstevel@tonic-gate	retl
20197c478bdstevel@tonic-gate	  mov	%g0, %o0		! return 0
20217c478bdstevel@tonic-gate	.align 16
	! medium-length dispatch (.co_med label not visible in this view):
	! pick the widest access both pointers can be brought to
20237c478bdstevel@tonic-gate	xor	%o0, %o1, %o3		! setup alignment check
20247c478bdstevel@tonic-gate	btst	1, %o3
20257c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
20267c478bdstevel@tonic-gate	  nop
20277c478bdstevel@tonic-gate	btst	3, %o3
20287c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_half	! halfword aligned
20297c478bdstevel@tonic-gate	  nop
20307c478bdstevel@tonic-gate	btst	7, %o3
20317c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_word	! word aligned
20327c478bdstevel@tonic-gate	  nop
	! longword-alignable path: first align SRC/DST to a word boundary
20347c478bdstevel@tonic-gate	btst	3, %o0			! check for
20357c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_long1	! word alignment
20367c478bdstevel@tonic-gate	  nop
	! byte-at-a-time alignment loop (.co_med_long0 label not visible here)
20387c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
20397c478bdstevel@tonic-gate	inc	%o0
20407c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store byte
20417c478bdstevel@tonic-gate	inc	%o1
20427c478bdstevel@tonic-gate	btst	3, %o0
20437c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_long0
20447c478bdstevel@tonic-gate	  dec	%o2
20457c478bdstevel@tonic-gate.co_med_long1:			! word aligned
20467c478bdstevel@tonic-gate	btst	7, %o0			! check for long word
20477c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_long2
20487c478bdstevel@tonic-gate	  nop
20497c478bdstevel@tonic-gate	lduw	[%o0], %o3		! load word
20507c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
20517c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! store word
20527c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
20537c478bdstevel@tonic-gate	sub	%o2, 4, %o2		! reduce count by 4
20557c478bdstevel@tonic-gate!  Now long word aligned and have at least 32 bytes to move
	! (.co_med_long2 label not visible in this view)
20587c478bdstevel@tonic-gate	sub	%o2, 31, %o2		! adjust count to allow cc zero test
20597c478bdstevel@tonic-gate	sub	%o1, 8, %o1		! adjust pointer to allow store in
20607c478bdstevel@tonic-gate					! branch delay slot instead of add
	! 32-bytes-per-iteration loop (.co_med_lmove label not visible here)
20627c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20637c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
20647c478bdstevel@tonic-gate	subcc	%o2, 32, %o2		! reduce count by 32
20657c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER	! write long word
20667c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20677c478bdstevel@tonic-gate	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
20687c478bdstevel@tonic-gate	add	%o0, 32, %o0		! advance SRC by 32
20697c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER
20707c478bdstevel@tonic-gate	ldx	[%o0 - 16], %o3
20717c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20727c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER
20737c478bdstevel@tonic-gate	ldx	[%o0 - 8], %o3
20747c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20757c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
20767c478bdstevel@tonic-gate	  stxa	%o3, [%o1]ASI_USER
20777c478bdstevel@tonic-gate	add	%o1, 8, %o1		! advance DST by 8
20787c478bdstevel@tonic-gate	addcc	%o2, 24, %o2		! restore count to long word offset
20797c478bdstevel@tonic-gate	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
20807c478bdstevel@tonic-gate	  nop
	! 8-bytes-per-iteration tail (.co_med_lword label not visible here)
20827c478bdstevel@tonic-gate	ldx	[%o0], %o3		! read long word
20837c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
20847c478bdstevel@tonic-gate	stxa	%o3, [%o1]ASI_USER	! write long word
20857c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
20867c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
20877c478bdstevel@tonic-gate	  add	%o1, 8, %o1		! advance DST by 8
	! 0-7 byte tail (.co_med_lextra label not visible in this view)
20897c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore rest of count
20907c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit	! if zero, then done
20917c478bdstevel@tonic-gate	  deccc	%o2
20927c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
20937c478bdstevel@tonic-gate	  nop
20947c478bdstevel@tonic-gate	ba,pt	%ncc, .co_sm_half
20957c478bdstevel@tonic-gate	  nop
20977c478bdstevel@tonic-gate	.align 16
20987c478bdstevel@tonic-gate	nop				! instruction alignment
20997c478bdstevel@tonic-gate					! see discussion at start of file
	! medium word-aligned path (.co_med_word label not visible here)
21017c478bdstevel@tonic-gate	btst	3, %o0			! check for
21027c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_word1	! word alignment
21037c478bdstevel@tonic-gate	  nop
	! byte-at-a-time alignment loop (.co_med_word0 label not visible here)
21057c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
21067c478bdstevel@tonic-gate	inc	%o0
21077c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store byte
21087c478bdstevel@tonic-gate	inc	%o1
21097c478bdstevel@tonic-gate	btst	3, %o0
21107c478bdstevel@tonic-gate	bnz,pt	%ncc, .co_med_word0
21117c478bdstevel@tonic-gate	  dec	%o2
21137c478bdstevel@tonic-gate!  Now word aligned and have at least 36 bytes to move
	! (.co_med_word1 label not visible in this view)
21167c478bdstevel@tonic-gate	sub	%o2, 15, %o2		! adjust count to allow cc zero test
	! 16-bytes-per-iteration loop (.co_med_wmove label not visible here)
21187c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
21197c478bdstevel@tonic-gate	subcc	%o2, 16, %o2		! reduce count by 16
21207c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
21217c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
21227c478bdstevel@tonic-gate	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
21237c478bdstevel@tonic-gate	add	%o0, 16, %o0		! advance SRC by 16
21247c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER
21257c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
21267c478bdstevel@tonic-gate	lduw	[%o0 - 8], %o3
21277c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER
21287c478bdstevel@tonic-gate	add	%o1, 4, %o1		! advance DST by 4
21297c478bdstevel@tonic-gate	lduw	[%o0 - 4], %o3
21307c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER
21317c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
21327c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! advance DST by 4
21337c478bdstevel@tonic-gate	addcc	%o2, 12, %o2		! restore count to word offset
21347c478bdstevel@tonic-gate	ble,pt	%ncc, .co_med_wextra	! check for more words to move
21357c478bdstevel@tonic-gate	  nop
	! 4-bytes-per-iteration tail (.co_med_word2 label not visible here)
21377c478bdstevel@tonic-gate	lduw	[%o0], %o3		! read word
21387c478bdstevel@tonic-gate	subcc	%o2, 4, %o2		! reduce count by 4
21397c478bdstevel@tonic-gate	stwa	%o3, [%o1]ASI_USER	! write word
21407c478bdstevel@tonic-gate	add	%o0, 4, %o0		! advance SRC by 4
21417c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
21427c478bdstevel@tonic-gate	  add	%o1, 4, %o1		! advance DST by 4
	! 0-3 byte tail (.co_med_wextra label not visible in this view)
21447c478bdstevel@tonic-gate	addcc	%o2, 3, %o2		! restore rest of count
21457c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit	! if zero, then done
21467c478bdstevel@tonic-gate	  deccc	%o2
21477c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
21487c478bdstevel@tonic-gate	  nop
21497c478bdstevel@tonic-gate	ba,pt	%ncc, .co_sm_half
21507c478bdstevel@tonic-gate	  nop
21527c478bdstevel@tonic-gate	.align 16
21537c478bdstevel@tonic-gate	nop				! instruction alignment
21547c478bdstevel@tonic-gate	nop				! see discussion at start of file
21557c478bdstevel@tonic-gate	nop
	! medium halfword-aligned path (.co_med_half label not visible here)
21577c478bdstevel@tonic-gate	btst	1, %o0			! check for
21587c478bdstevel@tonic-gate	bz,pt	%ncc, .co_med_half1	! half word alignment
21597c478bdstevel@tonic-gate	  nop
21607c478bdstevel@tonic-gate	ldub	[%o0], %o3		! load one byte
21617c478bdstevel@tonic-gate	inc	%o0
21627c478bdstevel@tonic-gate	stba	%o3,[%o1]ASI_USER	! store byte
21637c478bdstevel@tonic-gate	inc	%o1
21647c478bdstevel@tonic-gate	dec	%o2
21667c478bdstevel@tonic-gate!  Now half word aligned and have at least 38 bytes to move
	! (.co_med_half1 label not visible in this view)
21697c478bdstevel@tonic-gate	sub	%o2, 7, %o2		! adjust count to allow cc zero test
	! 8-bytes-per-iteration loop (.co_med_hmove label not visible here)
21717c478bdstevel@tonic-gate	lduh	[%o0], %o3		! read half word
21727c478bdstevel@tonic-gate	subcc	%o2, 8, %o2		! reduce count by 8
21737c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER	! write half word
21747c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
21757c478bdstevel@tonic-gate	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
21767c478bdstevel@tonic-gate	add	%o0, 8, %o0		! advance SRC by 8
21777c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER
21787c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
21797c478bdstevel@tonic-gate	lduh	[%o0 - 4], %o3
21807c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER
21817c478bdstevel@tonic-gate	add	%o1, 2, %o1		! advance DST by 2
21827c478bdstevel@tonic-gate	lduh	[%o0 - 2], %o3
21837c478bdstevel@tonic-gate	stha	%o3, [%o1]ASI_USER
21847c478bdstevel@tonic-gate	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
21857c478bdstevel@tonic-gate	  add	%o1, 2, %o1		! advance DST by 2
21867c478bdstevel@tonic-gate	addcc	%o2, 7, %o2		! restore count
21877c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_exit
21887c478bdstevel@tonic-gate	  deccc	%o2
21897c478bdstevel@tonic-gate	bz,pt	%ncc, .co_sm_byte
21907c478bdstevel@tonic-gate	  nop
21917c478bdstevel@tonic-gate	ba,pt	%ncc, .co_sm_half
21927c478bdstevel@tonic-gate	  nop
/*
21957c478bdstevel@tonic-gate * We got here because of a fault during short copyout.
21967c478bdstevel@tonic-gate * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
21977c478bdstevel@tonic-gate */
	! (.sm_copyout_err label not visible in this view)
21997c478bdstevel@tonic-gate	membar	#Sync
22007c478bdstevel@tonic-gate	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
22017c478bdstevel@tonic-gate	mov	SM_SAVE_SRC, %o0
22027c478bdstevel@tonic-gate	mov	SM_SAVE_DST, %o1
22037c478bdstevel@tonic-gate	mov	SM_SAVE_COUNT, %o2
22047c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
22057c478bdstevel@tonic-gate	tst	%o3
22067c478bdstevel@tonic-gate	bz,pt	%ncc, 3f			! if not, return error
22077c478bdstevel@tonic-gate	  nop
22087c478bdstevel@tonic-gate	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
22097c478bdstevel@tonic-gate	jmp	%o5				! original arguments
22107c478bdstevel@tonic-gate	  nop
	! (label 3: expected here in the full source)
22127c478bdstevel@tonic-gate	retl
22137c478bdstevel@tonic-gate	  or	%g0, -1, %o0		! return error value
22157c478bdstevel@tonic-gate	SET_SIZE(copyout)
22187c478bdstevel@tonic-gate * The _more entry points are not intended to be used directly by
22197c478bdstevel@tonic-gate * any caller from outside this file.  They are provided to allow
22207c478bdstevel@tonic-gate * profiling and dtrace of the portions of the copy code that uses
22217c478bdstevel@tonic-gate * the floating point registers.
22227c478bdstevel@tonic-gate * This entry is particularly important as DTRACE (at least as of
22237c478bdstevel@tonic-gate * 4/2004) does not support leaf functions.
22247c478bdstevel@tonic-gate */
22267c478bdstevel@tonic-gate	ENTRY(copyout_more)
22287c478bdstevel@tonic-gate	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
22297c478bdstevel@tonic-gate	set	.copyout_err, REAL_LOFAULT
22327c478bdstevel@tonic-gate * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
22337c478bdstevel@tonic-gate */
22357c478bdstevel@tonic-gate        set     copyio_fault, %l7		! .copyio_fault is lofault val
22377c478bdstevel@tonic-gate	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
22387c478bdstevel@tonic-gate	membar	#Sync				! sync error barrier
22397c478bdstevel@tonic-gate	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
22417c478bdstevel@tonic-gate	mov	%i0, SAVE_SRC
22427c478bdstevel@tonic-gate	mov	%i1, SAVE_DST
22437c478bdstevel@tonic-gate	mov	%i2, SAVE_COUNT
22457c478bdstevel@tonic-gate	FP_NOMIGRATE(6, 7)
22477c478bdstevel@tonic-gate	rd	%fprs, %o2		! check for unused fp
22487c478bdstevel@tonic-gate	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
22497c478bdstevel@tonic-gate	btst	FPRS_FEF, %o2
22507c478bdstevel@tonic-gate	bz,a,pt	%icc, .do_blockcopyout
22517c478bdstevel@tonic-gate	  wr	%g0, FPRS_FEF, %fprs
22537c478bdstevel@tonic-gate	BST_FPQ2Q4_TOSTACK(%o2)
22567c478bdstevel@tonic-gate	rd	%gsr, %o2
22577c478bdstevel@tonic-gate	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
22587c478bdstevel@tonic-gate	or	%l6, FPUSED_FLAG, %l6
22607c478bdstevel@tonic-gate	andcc	DST, VIS_BLOCKSIZE - 1, TMP
22617c478bdstevel@tonic-gate	mov	ASI_USER, %asi
22627c478bdstevel@tonic-gate	bz,pt	%ncc, 2f
22637c478bdstevel@tonic-gate	  neg	TMP
22647c478bdstevel@tonic-gate	add	TMP, VIS_BLOCKSIZE, TMP
22667c478bdstevel@tonic-gate	! TMP = bytes required to align DST on FP_BLOCK boundary
22677c478bdstevel@tonic-gate	! Using SRC as a tmp here
22687c478bdstevel@tonic-gate	cmp	TMP, 3
22697c478bdstevel@tonic-gate	bleu,pt	%ncc, 1f
22707c478bdstevel@tonic-gate	  sub	CNT,TMP,CNT		! adjust main count
22717c478bdstevel@tonic-gate	sub	TMP, 3, TMP		! adjust for end of loop test
22737c478bdstevel@tonic-gate	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
22747c478bdstevel@tonic-gate	stba	SRC, [DST]%asi
22757c478bdstevel@tonic-gate	subcc	TMP, 4, TMP
22767c478bdstevel@tonic-gate	ldub	[REALSRC + 1], SRC
22777c478bdstevel@tonic-gate	add	REALSRC, 4, REALSRC
22787c478bdstevel@tonic-gate	stba	SRC, [DST + 1]%asi
22797c478bdstevel@tonic-gate	ldub	[REALSRC - 2], SRC
22807c478bdstevel@tonic-gate	add	DST, 4, DST
22817c478bdstevel@tonic-gate	stba	SRC, [DST - 2]%asi
22827c478bdstevel@tonic-gate	ldub	[REALSRC - 1], SRC
22837c478bdstevel@tonic-gate	bgu,pt	%ncc, .co_blkalign
22847c478bdstevel@tonic-gate	  stba	SRC, [DST - 1]%asi
22867c478bdstevel@tonic-gate	addcc	TMP, 3, TMP		! restore count adjustment
22877c478bdstevel@tonic-gate	bz,pt	%ncc, 2f		! no bytes left?
22887c478bdstevel@tonic-gate	  nop
22897c478bdstevel@tonic-gate1:	ldub	[REALSRC], SRC
22907c478bdstevel@tonic-gate	inc	REALSRC
22917c478bdstevel@tonic-gate	inc	DST
22927c478bdstevel@tonic-gate	deccc	TMP
22937c478bdstevel@tonic-gate	bgu	%ncc, 1b
22947c478bdstevel@tonic-gate	  stba	SRC, [DST - 1]%asi
22977c478bdstevel@tonic-gate	andn	REALSRC, 0x7, SRC
22987c478bdstevel@tonic-gate	alignaddr REALSRC, %g0, %g0
23007c478bdstevel@tonic-gate	! SRC - 8-byte aligned
23017c478bdstevel@tonic-gate	! DST - 64-byte aligned
23027c478bdstevel@tonic-gate	prefetch [SRC], #one_read