125cf1a30Sjl/* 225cf1a30Sjl * CDDL HEADER START 325cf1a30Sjl * 425cf1a30Sjl * The contents of this file are subject to the terms of the 525cf1a30Sjl * Common Development and Distribution License (the "License"). 625cf1a30Sjl * You may not use this file except in compliance with the License. 725cf1a30Sjl * 825cf1a30Sjl * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 925cf1a30Sjl * or http://www.opensolaris.org/os/licensing. 1025cf1a30Sjl * See the License for the specific language governing permissions 1125cf1a30Sjl * and limitations under the License. 1225cf1a30Sjl * 1325cf1a30Sjl * When distributing Covered Code, include this CDDL HEADER in each 1425cf1a30Sjl * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 1525cf1a30Sjl * If applicable, add the following below this CDDL HEADER, with the 1625cf1a30Sjl * fields enclosed by brackets "[]" replaced with your own identifying 1725cf1a30Sjl * information: Portions Copyright [yyyy] [name of copyright owner] 1825cf1a30Sjl * 1925cf1a30Sjl * CDDL HEADER END 2025cf1a30Sjl */ 2125cf1a30Sjl/* 22e64c6c3fSMichael Bergknoff * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 2325cf1a30Sjl * Use is subject to license terms. 2425cf1a30Sjl */ 2525cf1a30Sjl 2625cf1a30Sjl#include <sys/param.h> 2725cf1a30Sjl#include <sys/errno.h> 2825cf1a30Sjl#include <sys/asm_linkage.h> 2925cf1a30Sjl#include <sys/vtrace.h> 3025cf1a30Sjl#include <sys/machthread.h> 3125cf1a30Sjl#include <sys/clock.h> 3225cf1a30Sjl#include <sys/asi.h> 3325cf1a30Sjl#include <sys/fsr.h> 3425cf1a30Sjl#include <sys/privregs.h> 3525cf1a30Sjl 3625cf1a30Sjl#include "assym.h" 3725cf1a30Sjl 3825cf1a30Sjl/* 3925cf1a30Sjl * Pseudo-code to aid in understanding the control flow of the 4025cf1a30Sjl * bcopy/copyin/copyout routines. 4125cf1a30Sjl * 4225cf1a30Sjl * On entry: 4325cf1a30Sjl * 4425cf1a30Sjl * ! Determine whether to use the FP register version 4525cf1a30Sjl * ! or the leaf routine version depending on size 4625cf1a30Sjl * ! 
of copy and flags. Set up error handling accordingly. 4725cf1a30Sjl * ! The transition point depends on whether the src and 4825cf1a30Sjl * ! dst addresses can be aligned to long word, word, 4925cf1a30Sjl * ! half word, or byte boundaries. 5025cf1a30Sjl * ! 5125cf1a30Sjl * ! WARNING: <Register usage convention> 5225cf1a30Sjl * ! For FP version, %l6 holds previous error handling and 5325cf1a30Sjl * ! a flag: TRAMP_FLAG (low bits) 5425cf1a30Sjl * ! for leaf routine version, %o4 holds those values. 5525cf1a30Sjl * ! So either %l6 or %o4 is reserved and not available for 5625cf1a30Sjl * ! any other use. 5725cf1a30Sjl * 5825cf1a30Sjl * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test 5925cf1a30Sjl * go to small_copy; ! to speed short copies 6025cf1a30Sjl * 6125cf1a30Sjl * ! src, dst long word alignable 6225cf1a30Sjl * if (hw_copy_limit_8 == 0) ! hw_copy disabled 6325cf1a30Sjl * go to small_copy; 6425cf1a30Sjl * if (length <= hw_copy_limit_8) 6525cf1a30Sjl * go to small_copy; 6625cf1a30Sjl * go to FPBLK_copy; 6725cf1a30Sjl * } 6825cf1a30Sjl * if (src,dst not alignable) { 6925cf1a30Sjl * if (hw_copy_limit_1 == 0) ! hw_copy disabled 7025cf1a30Sjl * go to small_copy; 7125cf1a30Sjl * if (length <= hw_copy_limit_1) 7225cf1a30Sjl * go to small_copy; 7325cf1a30Sjl * go to FPBLK_copy; 7425cf1a30Sjl * } 7525cf1a30Sjl * if (src,dst halfword alignable) { 7625cf1a30Sjl * if (hw_copy_limit_2 == 0) ! hw_copy disabled 7725cf1a30Sjl * go to small_copy; 7825cf1a30Sjl * if (length <= hw_copy_limit_2) 7925cf1a30Sjl * go to small_copy; 8025cf1a30Sjl * go to FPBLK_copy; 8125cf1a30Sjl * } 8225cf1a30Sjl * if (src,dst word alignable) { 8325cf1a30Sjl * if (hw_copy_limit_4 == 0) ! hw_copy disabled 8425cf1a30Sjl * go to small_copy; 8525cf1a30Sjl * if (length <= hw_copy_limit_4) 8625cf1a30Sjl * go to small_copy; 8725cf1a30Sjl * go to FPBLK_copy; 8825cf1a30Sjl * } 8925cf1a30Sjl * 9025cf1a30Sjl * small_copy: 9125cf1a30Sjl * Setup_leaf_rtn_error_handler; ! 
diffs for each entry point 9225cf1a30Sjl * 9325cf1a30Sjl * if (count <= 3) ! fast path for tiny copies 9425cf1a30Sjl * go to sm_left; ! special finish up code 9525cf1a30Sjl * else 9625cf1a30Sjl * if (count > CHKSIZE) ! medium sized copies 9725cf1a30Sjl * go to sm_med ! tuned by alignment 9825cf1a30Sjl * if(src&dst not both word aligned) { 9925cf1a30Sjl * sm_movebytes: 10025cf1a30Sjl * move byte by byte in 4-way unrolled loop 10125cf1a30Sjl * fall into sm_left; 10225cf1a30Sjl * sm_left: 10325cf1a30Sjl * move 0-3 bytes byte at a time as needed. 10425cf1a30Sjl * restore error handler and exit. 10525cf1a30Sjl * 10625cf1a30Sjl * } else { ! src&dst are word aligned 10725cf1a30Sjl * check for at least 8 bytes left, 10825cf1a30Sjl * move word at a time, unrolled by 2 10925cf1a30Sjl * when fewer than 8 bytes left, 11025cf1a30Sjl * sm_half: move half word at a time while 2 or more bytes left 11125cf1a30Sjl * sm_byte: move final byte if necessary 11225cf1a30Sjl * sm_exit: 11325cf1a30Sjl * restore error handler and exit. 11425cf1a30Sjl * } 11525cf1a30Sjl * 11625cf1a30Sjl * ! Medium length cases with at least CHKSIZE bytes available 11725cf1a30Sjl * ! method: line up src and dst as best possible, then 11825cf1a30Sjl * ! move data in 4-way unrolled loops. 11925cf1a30Sjl * 12025cf1a30Sjl * sm_med: 12125cf1a30Sjl * if(src&dst unalignable) 12225cf1a30Sjl * go to sm_movebytes 12325cf1a30Sjl * if(src&dst halfword alignable) 12425cf1a30Sjl * go to sm_movehalf 12525cf1a30Sjl * if(src&dst word alignable) 12625cf1a30Sjl * go to sm_moveword 12725cf1a30Sjl * ! 
fall into long word movement 12825cf1a30Sjl * move bytes until src is word aligned 12925cf1a30Sjl * if not long word aligned, move a word 13025cf1a30Sjl * move long words in 4-way unrolled loop until < 32 bytes left 13125cf1a30Sjl * move long words in 1-way unrolled loop until < 8 bytes left 13225cf1a30Sjl * if zero bytes left, goto sm_exit 13325cf1a30Sjl * if one byte left, go to sm_byte 13425cf1a30Sjl * else go to sm_half 13525cf1a30Sjl * 13625cf1a30Sjl * sm_moveword: 13725cf1a30Sjl * move bytes until src is word aligned 13825cf1a30Sjl * move words in 4-way unrolled loop until < 16 bytes left 13925cf1a30Sjl * move words in 1-way unrolled loop until < 4 bytes left 14025cf1a30Sjl * if zero bytes left, goto sm_exit 14125cf1a30Sjl * if one byte left, go to sm_byte 14225cf1a30Sjl * else go to sm_half 14325cf1a30Sjl * 14425cf1a30Sjl * sm_movehalf: 14525cf1a30Sjl * move a byte if needed to align src on halfword 14625cf1a30Sjl * move halfwords in 4-way unrolled loop until < 8 bytes left 14725cf1a30Sjl * if zero bytes left, goto sm_exit 14825cf1a30Sjl * if one byte left, go to sm_byte 14925cf1a30Sjl * else go to sm_half 15025cf1a30Sjl * 15125cf1a30Sjl * 15225cf1a30Sjl * FPBLK_copy: 15325cf1a30Sjl * %l6 = curthread->t_lofault; 15425cf1a30Sjl * if (%l6 != NULL) { 15525cf1a30Sjl * membar #Sync 15625cf1a30Sjl * curthread->t_lofault = .copyerr; 15725cf1a30Sjl * caller_error_handler = TRUE ! %l6 |= 2 15825cf1a30Sjl * } 15925cf1a30Sjl * 16025cf1a30Sjl * ! for FPU testing we must not migrate cpus 16125cf1a30Sjl * if (curthread->t_lwp == NULL) { 16225cf1a30Sjl * ! Kernel threads do not have pcb's in which to store 16325cf1a30Sjl * ! the floating point state, so disallow preemption during 16425cf1a30Sjl * ! the copy. This also prevents cpu migration. 
16525cf1a30Sjl * kpreempt_disable(curthread); 16625cf1a30Sjl * } else { 16725cf1a30Sjl * thread_nomigrate(); 16825cf1a30Sjl * } 16925cf1a30Sjl * 17025cf1a30Sjl * old_fprs = %fprs; 17125cf1a30Sjl * old_gsr = %gsr; 17225cf1a30Sjl * if (%fprs.fef) { 17325cf1a30Sjl * %fprs.fef = 1; 17425cf1a30Sjl * save current fpregs on stack using blockstore 17525cf1a30Sjl * } else { 17625cf1a30Sjl * %fprs.fef = 1; 17725cf1a30Sjl * } 17825cf1a30Sjl * 17925cf1a30Sjl * 18025cf1a30Sjl * do_blockcopy_here; 18125cf1a30Sjl * 18225cf1a30Sjl * In lofault handler: 18325cf1a30Sjl * curthread->t_lofault = .copyerr2; 18425cf1a30Sjl * Continue on with the normal exit handler 18525cf1a30Sjl * 18625cf1a30Sjl * On normal exit: 18725cf1a30Sjl * %gsr = old_gsr; 18825cf1a30Sjl * if (old_fprs & FPRS_FEF) 18925cf1a30Sjl * restore fpregs from stack using blockload 19025cf1a30Sjl * else 19125cf1a30Sjl * zero fpregs 19225cf1a30Sjl * %fprs = old_fprs; 19325cf1a30Sjl * membar #Sync 19425cf1a30Sjl * curthread->t_lofault = (%l6 & ~3); 19525cf1a30Sjl * ! following test omitted from copyin/copyout as they 19625cf1a30Sjl * ! will always have a current thread 19725cf1a30Sjl * if (curthread->t_lwp == NULL) 19825cf1a30Sjl * kpreempt_enable(curthread); 19925cf1a30Sjl * else 20025cf1a30Sjl * thread_allowmigrate(); 20125cf1a30Sjl * return (0) 20225cf1a30Sjl * 20325cf1a30Sjl * In second lofault handler (.copyerr2): 20425cf1a30Sjl * We've tried to restore fp state from the stack and failed. To 20525cf1a30Sjl * prevent from returning with a corrupted fp state, we will panic. 20625cf1a30Sjl */ 20725cf1a30Sjl 20825cf1a30Sjl/* 20925cf1a30Sjl * Comments about optimization choices 21025cf1a30Sjl * 21125cf1a30Sjl * The initial optimization decision in this code is to determine 21225cf1a30Sjl * whether to use the FP registers for a copy or not. If we don't 21325cf1a30Sjl * use the FP registers, we can execute the copy as a leaf routine, 21425cf1a30Sjl * saving a register save and restore. 
Also, less elaborate setup
21525cf1a30Sjl * is required, allowing short copies to be completed more quickly.
21625cf1a30Sjl * For longer copies, especially unaligned ones (where the src and
21725cf1a30Sjl * dst do not align to allow simple ldx,stx operation), the FP
21825cf1a30Sjl * registers allow much faster copy operations.
21925cf1a30Sjl *
22025cf1a30Sjl * The estimated extra cost of the FP path will vary depending on
22125cf1a30Sjl * src/dst alignment, dst offset from the next 64 byte FPblock store
22225cf1a30Sjl * boundary, remaining src data after the last full dst cache line is
22325cf1a30Sjl * moved, whether the FP registers need to be saved, and some other
22425cf1a30Sjl * minor issues. The average additional overhead is estimated to be
22525cf1a30Sjl * 400 clocks. Since each non-repeated/predicted tst and branch costs
22625cf1a30Sjl * around 10 clocks, elaborate calculation would slow down all
22725cf1a30Sjl * longer copies and only benefit a small portion of medium sized
22825cf1a30Sjl * copies. Rather than incur such cost, we chose fixed transition
22925cf1a30Sjl * points for each of the alignment choices.
23025cf1a30Sjl *
23125cf1a30Sjl * For the inner loop, here is a comparison of the per cache line
23225cf1a30Sjl * costs for each alignment when src&dst are in cache:
23325cf1a30Sjl *
23425cf1a30Sjl * byte aligned: 108 clocks slower for non-FPBLK
23525cf1a30Sjl * half aligned: 44 clocks slower for non-FPBLK
23625cf1a30Sjl * word aligned: 12 clocks slower for non-FPBLK
23725cf1a30Sjl * long aligned: 4 clocks >>faster<< for non-FPBLK
23825cf1a30Sjl *
23925cf1a30Sjl * The long aligned loop runs faster because it does no prefetching.
24025cf1a30Sjl * That wins if the data is not in cache or there is too little
24125cf1a30Sjl * data to gain much benefit from prefetching. But when there
24225cf1a30Sjl * is more data and that data is not in cache, failing to prefetch
24325cf1a30Sjl * can run much slower.
In addition, there is a 2 Kbyte store queue 24425cf1a30Sjl * which will cause the non-FPBLK inner loop to slow for larger copies. 24525cf1a30Sjl * The exact tradeoff is strongly load and application dependent, with 24625cf1a30Sjl * increasing risk of a customer visible performance regression if the 24725cf1a30Sjl * non-FPBLK code is used for larger copies. Studies of synthetic in-cache 24825cf1a30Sjl * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe 24925cf1a30Sjl * upper limit for the non-FPBLK code. To minimize performance regression 25025cf1a30Sjl * risk while still gaining the primary benefits of the improvements to 25125cf1a30Sjl * the non-FPBLK code, we set an upper bound of 1024 bytes for the various 25225cf1a30Sjl * hw_copy_limit_*. Later experimental studies using different values 25325cf1a30Sjl * of hw_copy_limit_* can be used to make further adjustments if 25425cf1a30Sjl * appropriate. 25525cf1a30Sjl * 25625cf1a30Sjl * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned 25725cf1a30Sjl * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned 25825cf1a30Sjl * hw_copy_limit_4 = src and dst are word aligned but not longword aligned 25925cf1a30Sjl * hw_copy_limit_8 = src and dst are longword aligned 26025cf1a30Sjl * 26125cf1a30Sjl * To say that src and dst are word aligned means that after 26225cf1a30Sjl * some initial alignment activity of moving 0 to 3 bytes, 26325cf1a30Sjl * both the src and dst will be on word boundaries so that 26425cf1a30Sjl * word loads and stores may be used. 26525cf1a30Sjl * 26625cf1a30Sjl * Default values at May,2005 are: 26725cf1a30Sjl * hw_copy_limit_1 = 256 26825cf1a30Sjl * hw_copy_limit_2 = 512 26925cf1a30Sjl * hw_copy_limit_4 = 1024 27025cf1a30Sjl * hw_copy_limit_8 = 1024 (or 1536 on some systems) 27125cf1a30Sjl * 27225cf1a30Sjl * 27325cf1a30Sjl * If hw_copy_limit_? is set to zero, then use of FPBLK copy is 27425cf1a30Sjl * disabled for that alignment choice. 
27525cf1a30Sjl * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256) 27625cf1a30Sjl * the value of VIS_COPY_THRESHOLD is used. 27725cf1a30Sjl * It is not envisioned that hw_copy_limit_? will be changed in the field 27825cf1a30Sjl * It is provided to allow for disabling FPBLK copies and to allow 27925cf1a30Sjl * easy testing of alternate values on future HW implementations 28025cf1a30Sjl * that might have different cache sizes, clock rates or instruction 28125cf1a30Sjl * timing rules. 28225cf1a30Sjl * 28325cf1a30Sjl * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum 28425cf1a30Sjl * threshold to speedup all shorter copies (less than 256). That 28525cf1a30Sjl * saves an alignment test, memory reference, and enabling test 28625cf1a30Sjl * for all short copies, or an estimated 24 clocks. 28725cf1a30Sjl * 28825cf1a30Sjl * The order in which these limits are checked does matter since each 28925cf1a30Sjl * non-predicted tst and branch costs around 10 clocks. 29025cf1a30Sjl * If src and dst are randomly selected addresses, 29125cf1a30Sjl * 4 of 8 will not be alignable. 29225cf1a30Sjl * 2 of 8 will be half word alignable. 29325cf1a30Sjl * 1 of 8 will be word alignable. 29425cf1a30Sjl * 1 of 8 will be long word alignable. 29525cf1a30Sjl * But, tests on running kernels show that src and dst to copy code 29625cf1a30Sjl * are typically not on random alignments. Structure copies and 29725cf1a30Sjl * copies of larger data sizes are often on long word boundaries. 29825cf1a30Sjl * So we test the long word alignment case first, then 29925cf1a30Sjl * the byte alignment, then halfword, then word alignment. 30025cf1a30Sjl * 30125cf1a30Sjl * Several times, tests for length are made to split the code 30225cf1a30Sjl * into subcases. These tests often allow later tests to be 30325cf1a30Sjl * avoided. For example, within the non-FPBLK copy, we first 30425cf1a30Sjl * check for tiny copies of 3 bytes or less. 
That allows us
30525cf1a30Sjl * to use a 4-way unrolled loop for the general byte copy case
30625cf1a30Sjl * without a test on loop entry.
30725cf1a30Sjl * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
30825cf1a30Sjl * vs longer cases. For the really short case, we don't attempt to
30925cf1a30Sjl * align src and dst. We try to minimize special case tests in
31025cf1a30Sjl * the shortest loops as each test adds a significant percentage
31125cf1a30Sjl * to the total time.
31225cf1a30Sjl *
31325cf1a30Sjl * For the medium sized cases, we allow ourselves to adjust the
31425cf1a30Sjl * src and dst alignment and provide special cases for each of
31525cf1a30Sjl * the four adjusted alignment cases. The CHKSIZE that was used
31625cf1a30Sjl * to decide between short and medium size was chosen to be 39
31725cf1a30Sjl * as that allows for the worst case of 7 bytes of alignment
31825cf1a30Sjl * shift and 4 times 8 bytes for the first long word unrolling.
31925cf1a30Sjl * That knowledge saves an initial test for length on entry into
32025cf1a30Sjl * the medium cases. If the general loop unrolling factor were
32125cf1a30Sjl * to be increased, this number would also need to be adjusted.
32225cf1a30Sjl *
32325cf1a30Sjl * For all cases in the non-FPBLK code where it is known that at
32425cf1a30Sjl * least 4 chunks of data are available for movement, the
32525cf1a30Sjl * loop is unrolled by four. This 4-way loop runs in 8 clocks
32625cf1a30Sjl * or 2 clocks per data element.
32725cf1a30Sjl *
32825cf1a30Sjl * Instruction alignment is forced by use of .align 16 directives
32925cf1a30Sjl * and nops which are not executed in the code. This
33025cf1a30Sjl * combination of operations shifts the alignment of following
33125cf1a30Sjl * loops to ensure that loops are aligned so that their instructions
33225cf1a30Sjl * fall within the minimum number of 4 instruction fetch groups.
33325cf1a30Sjl * If instructions are inserted or removed between the .align 33425cf1a30Sjl * instruction and the unrolled loops, then the alignment needs 33525cf1a30Sjl * to be readjusted. Misaligned loops can add a clock per loop 33625cf1a30Sjl * iteration to the loop timing. 33725cf1a30Sjl * 33825cf1a30Sjl * In a few cases, code is duplicated to avoid a branch. Since 33925cf1a30Sjl * a non-predicted tst and branch takes 10 clocks, this savings 34025cf1a30Sjl * is judged an appropriate time-space tradeoff. 34125cf1a30Sjl * 34225cf1a30Sjl * Within the FPBLK-code, the prefetch method in the inner 34325cf1a30Sjl * loop needs to be explained as it is not standard. Two 34425cf1a30Sjl * prefetches are issued for each cache line instead of one. 34525cf1a30Sjl * The primary one is at the maximum reach of 8 cache lines. 34625cf1a30Sjl * Most of the time, that maximum prefetch reach gives the 34725cf1a30Sjl * cache line more time to reach the processor for systems with 34825cf1a30Sjl * higher processor clocks. But, sometimes memory interference 34925cf1a30Sjl * can cause that prefetch to be dropped. Putting a second 35025cf1a30Sjl * prefetch at a reach of 5 cache lines catches the drops 35125cf1a30Sjl * three iterations later and shows a measured improvement 35225cf1a30Sjl * in performance over any similar loop with a single prefetch. 35325cf1a30Sjl * The prefetches are placed in the loop so they overlap with 35425cf1a30Sjl * non-memory instructions, so that there is no extra cost 35525cf1a30Sjl * when the data is already in-cache. 35625cf1a30Sjl * 35725cf1a30Sjl */ 35825cf1a30Sjl 35925cf1a30Sjl/* 36025cf1a30Sjl * Notes on preserving existing fp state and on membars. 36125cf1a30Sjl * 36225cf1a30Sjl * When a copyOP decides to use fp we may have to preserve existing 36325cf1a30Sjl * floating point state. 
It is not the caller's state that we need to
36425cf1a30Sjl * preserve - the rest of the kernel does not use fp and, anyway, fp
36525cf1a30Sjl * registers are volatile across a call. Some examples:
36625cf1a30Sjl *
36725cf1a30Sjl * - userland has fp state and is interrupted (device interrupt
36825cf1a30Sjl * or trap) and within the interrupt/trap handling we use
36925cf1a30Sjl * bcopy()
37025cf1a30Sjl * - another (higher level) interrupt or trap handler uses bcopy
37125cf1a30Sjl * while a bcopy from an earlier interrupt is still active
37225cf1a30Sjl * - an asynchronous error trap occurs while fp state exists (in
37325cf1a30Sjl * userland or in kernel copy) and the tl0 component of the handling
37425cf1a30Sjl * uses bcopy
37525cf1a30Sjl * - a user process with fp state incurs a copy-on-write fault and
37625cf1a30Sjl * hwblkpagecopy always uses fp
37725cf1a30Sjl *
37825cf1a30Sjl * We therefore need a per-call place in which to preserve fp state -
37925cf1a30Sjl * using our stack is ideal (and since fp copy cannot be leaf optimized
38025cf1a30Sjl * because of calls it makes, this is no hardship).
38125cf1a30Sjl *
38225cf1a30Sjl * When we have finished fp copy (with its repeated block stores)
38325cf1a30Sjl * we must membar #Sync so that our block stores may complete before
38425cf1a30Sjl * we either restore the original fp state into the fp registers or
38525cf1a30Sjl * return to a caller which may initiate other fp operations that could
38625cf1a30Sjl * modify the fp regs we used before the block stores complete.
38725cf1a30Sjl *
38825cf1a30Sjl * Synchronous faults (eg, unresolvable DMMU miss) that occur while
38925cf1a30Sjl * t_lofault is not NULL will not panic but will instead trampoline
39025cf1a30Sjl * to the registered lofault handler. There is no need for any
39125cf1a30Sjl * membars for these - eg, our store to t_lofault will always be visible to
39225cf1a30Sjl * ourselves and it is our cpu which will take any trap.
39325cf1a30Sjl * 39425cf1a30Sjl * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur 39525cf1a30Sjl * while t_lofault is not NULL will also not panic. Since we're copying 39625cf1a30Sjl * to or from userland the extent of the damage is known - the destination 39725cf1a30Sjl * buffer is incomplete. So trap handlers will trampoline to the lofault 39825cf1a30Sjl * handler in this case which should take some form of error action to 39925cf1a30Sjl * avoid using the incomplete buffer. The trap handler also flags the 40025cf1a30Sjl * fault so that later return-from-trap handling (for the trap that brought 40125cf1a30Sjl * this thread into the kernel in the first place) can notify the process 40225cf1a30Sjl * and reboot the system (or restart the service with Greenline/Contracts). 40325cf1a30Sjl * 40425cf1a30Sjl * Asynchronous faults (eg, uncorrectable ECC error from memory) can 40525cf1a30Sjl * result in deferred error traps - the trap is taken sometime after 40625cf1a30Sjl * the event and the trap PC may not be the PC of the faulting access. 40725cf1a30Sjl * Delivery of such pending traps can be forced by a membar #Sync, acting 40825cf1a30Sjl * as an "error barrier" in this role. To accurately apply the user/kernel 40925cf1a30Sjl * separation described in the preceding paragraph we must force delivery 41025cf1a30Sjl * of deferred traps affecting kernel state before we install a lofault 41125cf1a30Sjl * handler (if we interpose a new lofault handler on an existing one there 41225cf1a30Sjl * is no need to repeat this), and we must force delivery of deferred 41325cf1a30Sjl * errors affecting the lofault-protected region before we clear t_lofault. 41425cf1a30Sjl * Failure to do so results in lost kernel state being interpreted as 41525cf1a30Sjl * affecting a copyin/copyout only, or of an error that really only 41625cf1a30Sjl * affects copy data being interpreted as losing kernel state. 
41725cf1a30Sjl * 41825cf1a30Sjl * Since the copy operations may preserve and later restore floating 41925cf1a30Sjl * point state that does not belong to the caller (see examples above), 42025cf1a30Sjl * we must be careful in how we do this in order to prevent corruption 42125cf1a30Sjl * of another program. 42225cf1a30Sjl * 42325cf1a30Sjl * To make sure that floating point state is always saved and restored 42425cf1a30Sjl * correctly, the following "big rules" must be followed when the floating 42525cf1a30Sjl * point registers will be used: 42625cf1a30Sjl * 42725cf1a30Sjl * 1. %l6 always holds the caller's lofault handler. Also in this register, 42825cf1a30Sjl * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in 42925cf1a30Sjl * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a 43025cf1a30Sjl * lofault handler was set coming in. 43125cf1a30Sjl * 43225cf1a30Sjl * 2. The FPUSED flag indicates that all FP state has been successfully stored 43325cf1a30Sjl * on the stack. It should not be set until this save has been completed. 43425cf1a30Sjl * 43525cf1a30Sjl * 3. The FPUSED flag should not be cleared on exit until all FP state has 43625cf1a30Sjl * been restored from the stack. If an error occurs while restoring 43725cf1a30Sjl * data from the stack, the error handler can check this flag to see if 43825cf1a30Sjl * a restore is necessary. 43925cf1a30Sjl * 44025cf1a30Sjl * 4. Code run under the new lofault handler must be kept to a minimum. In 44125cf1a30Sjl * particular, any calls to FP_ALLOWMIGRATE, which could result in a call 44225cf1a30Sjl * to kpreempt(), should not be made until after the lofault handler has 44325cf1a30Sjl * been restored. 44425cf1a30Sjl */ 44525cf1a30Sjl 44625cf1a30Sjl/* 44725cf1a30Sjl * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed 44825cf1a30Sjl * to "break even" using FP/VIS-accelerated memory operations. 
44925cf1a30Sjl * The FPBLK code assumes a minimum number of bytes are available
45025cf1a30Sjl * to be moved on entry. Check that code carefully before
45125cf1a30Sjl * reducing VIS_COPY_THRESHOLD below 256.
45225cf1a30Sjl */
45325cf1a30Sjl/*
45425cf1a30Sjl * This shadows sys/machsystm.h which can't be included due to the lack of
45525cf1a30Sjl * _ASM guards in include files it references. Change it here, change it there.
45625cf1a30Sjl */
45725cf1a30Sjl#define VIS_COPY_THRESHOLD 256
45825cf1a30Sjl
45925cf1a30Sjl/*
46025cf1a30Sjl * TEST for very short copies
46125cf1a30Sjl * Be aware that the maximum unroll for the short unaligned case
46225cf1a30Sjl * is SHORTCOPY+1
46325cf1a30Sjl */
46425cf1a30Sjl#define SHORTCOPY 3
46525cf1a30Sjl#define CHKSIZE 39
46625cf1a30Sjl
46725cf1a30Sjl/*
46825cf1a30Sjl * Indicates that we're to trampoline to the error handler.
46925cf1a30Sjl * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
47025cf1a30Sjl * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
47125cf1a30Sjl */
47225cf1a30Sjl#define FPUSED_FLAG 1
47325cf1a30Sjl#define TRAMP_FLAG 2
47425cf1a30Sjl#define MASK_FLAGS 3
47525cf1a30Sjl
47625cf1a30Sjl/*
47725cf1a30Sjl * Number of outstanding prefetches.
478c8a722abSpm * first prefetch moves data from L2 to L1 (n_reads)
479c8a722abSpm * second prefetch moves data from memory to L2 (one_read)
48025cf1a30Sjl */
481c8a722abSpm#define OLYMPUS_C_PREFETCH 24
482c8a722abSpm#define OLYMPUS_C_2ND_PREFETCH 12
48325cf1a30Sjl
48425cf1a30Sjl#define VIS_BLOCKSIZE 64
48525cf1a30Sjl
48625cf1a30Sjl/*
48725cf1a30Sjl * Size of stack frame in order to accommodate a 64-byte aligned
48825cf1a30Sjl * floating-point register save area and 2 64-bit temp locations.
48925cf1a30Sjl * All copy functions use two quadrants of fp registers; to assure a
49025cf1a30Sjl * block-aligned two block buffer in which to save we must reserve
49125cf1a30Sjl * three blocks on stack. Not all functions preserve %fprs on stack
49225cf1a30Sjl * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
49325cf1a30Sjl *
49425cf1a30Sjl * _______________________________________ <-- %fp + STACK_BIAS
49525cf1a30Sjl * | We may need to preserve 2 quadrants |
49625cf1a30Sjl * | of fp regs, but since we do so with |
49725cf1a30Sjl * | BST/BLD we need room in which to |
49825cf1a30Sjl * | align to VIS_BLOCKSIZE bytes. So |
49925cf1a30Sjl * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET
50025cf1a30Sjl * |-------------------------------------|
50125cf1a30Sjl * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET
50225cf1a30Sjl * |-------------------------------------|
50325cf1a30Sjl * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET
50425cf1a30Sjl * ---------------------------------------
50525cf1a30Sjl */
50625cf1a30Sjl#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
50725cf1a30Sjl#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3)
50825cf1a30Sjl#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1)
50925cf1a30Sjl#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8)
51025cf1a30Sjl#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8)
51125cf1a30Sjl
51225cf1a30Sjl/*
51325cf1a30Sjl * Common macros used by the various versions of the block copy
51425cf1a30Sjl * routines in this file.
51525cf1a30Sjl */
51625cf1a30Sjl
51725cf1a30Sjl/*
51825cf1a30Sjl * In FP copies if we do not have preserved data to restore over
51925cf1a30Sjl * the fp regs we used then we must zero those regs to avoid
52025cf1a30Sjl * exposing portions of the data to later threads (data security).
52125cf1a30Sjl *
52225cf1a30Sjl * Copy functions use either quadrants 1 and 3 or 2 and 4.
52325cf1a30Sjl *
52425cf1a30Sjl * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
52525cf1a30Sjl * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
52625cf1a30Sjl *
52725cf1a30Sjl * The instructions below are quicker than repeated fzero instructions
52825cf1a30Sjl * since they can dispatch down two fp pipelines.
 * NOTE(review): FZEROQ2Q4 uses %f0 (quadrant 1) as the fmovd source
 * rather than the freshly zeroed %f16, so it only truly zeroes Q2/Q4
 * if %f0 is zero (or don't-care) at every call site -- confirm against
 * the call sites; otherwise %f16 would be the symmetric choice.
52925cf1a30Sjl */
53025cf1a30Sjl#define FZEROQ1Q3 \
53125cf1a30Sjl fzero %f0 ;\
53225cf1a30Sjl fmovd %f0, %f2 ;\
53325cf1a30Sjl fmovd %f0, %f4 ;\
53425cf1a30Sjl fmovd %f0, %f6 ;\
53525cf1a30Sjl fmovd %f0, %f8 ;\
53625cf1a30Sjl fmovd %f0, %f10 ;\
53725cf1a30Sjl fmovd %f0, %f12 ;\
53825cf1a30Sjl fmovd %f0, %f14 ;\
53925cf1a30Sjl fmovd %f0, %f32 ;\
54025cf1a30Sjl fmovd %f0, %f34 ;\
54125cf1a30Sjl fmovd %f0, %f36 ;\
54225cf1a30Sjl fmovd %f0, %f38 ;\
54325cf1a30Sjl fmovd %f0, %f40 ;\
54425cf1a30Sjl fmovd %f0, %f42 ;\
54525cf1a30Sjl fmovd %f0, %f44 ;\
54625cf1a30Sjl fmovd %f0, %f46
54725cf1a30Sjl
54825cf1a30Sjl#define FZEROQ2Q4 \
54925cf1a30Sjl fzero %f16 ;\
55025cf1a30Sjl fmovd %f0, %f18 ;\
55125cf1a30Sjl fmovd %f0, %f20 ;\
55225cf1a30Sjl fmovd %f0, %f22 ;\
55325cf1a30Sjl fmovd %f0, %f24 ;\
55425cf1a30Sjl fmovd %f0, %f26 ;\
55525cf1a30Sjl fmovd %f0, %f28 ;\
55625cf1a30Sjl fmovd %f0, %f30 ;\
55725cf1a30Sjl fmovd %f0, %f48 ;\
55825cf1a30Sjl fmovd %f0, %f50 ;\
55925cf1a30Sjl fmovd %f0, %f52 ;\
56025cf1a30Sjl fmovd %f0, %f54 ;\
56125cf1a30Sjl fmovd %f0, %f56 ;\
56225cf1a30Sjl fmovd %f0, %f58 ;\
56325cf1a30Sjl fmovd %f0, %f60 ;\
56425cf1a30Sjl fmovd %f0, %f62
56525cf1a30Sjl
56625cf1a30Sjl/*
56725cf1a30Sjl * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
56825cf1a30Sjl * Used to save and restore in-use fp registers when we want to use FP
56925cf1a30Sjl * and find fp already in use and copy size still large enough to justify
57025cf1a30Sjl * the additional overhead of this save and restore.
57125cf1a30Sjl * 57225cf1a30Sjl * A membar #Sync is needed before save to sync fp ops initiated before 57325cf1a30Sjl * the call to the copy function (by whoever has fp in use); for example 57425cf1a30Sjl * an earlier block load to the quadrant we are about to save may still be 57525cf1a30Sjl * "in flight". A membar #Sync is required at the end of the save to 57625cf1a30Sjl * sync our block store (the copy code is about to begin ldd's to the 57725cf1a30Sjl * first quadrant). 57825cf1a30Sjl * 57925cf1a30Sjl * Similarly: a membar #Sync before restore allows the block stores of 58025cf1a30Sjl * the copy operation to complete before we fill the quadrants with their 58125cf1a30Sjl * original data, and a membar #Sync after restore lets the block loads 58225cf1a30Sjl * of the restore complete before we return to whoever has the fp regs 58325cf1a30Sjl * in use. To avoid repeated membar #Sync we make it the responsibility 58425cf1a30Sjl * of the copy code to membar #Sync immediately after copy is complete 58525cf1a30Sjl * and before using the BLD_*_FROMSTACK macro. 
58625cf1a30Sjl */ 58725cf1a30Sjl#define BST_FPQ1Q3_TOSTACK(tmp1) \ 58825cf1a30Sjl /* membar #Sync */ ;\ 58925cf1a30Sjl add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 59025cf1a30Sjl and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 59125cf1a30Sjl stda %f0, [tmp1]ASI_BLK_P ;\ 59225cf1a30Sjl add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 59325cf1a30Sjl stda %f32, [tmp1]ASI_BLK_P ;\ 59425cf1a30Sjl membar #Sync 59525cf1a30Sjl 59625cf1a30Sjl#define BLD_FPQ1Q3_FROMSTACK(tmp1) \ 59725cf1a30Sjl /* membar #Sync - provided at copy completion */ ;\ 59825cf1a30Sjl add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 59925cf1a30Sjl and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 60025cf1a30Sjl ldda [tmp1]ASI_BLK_P, %f0 ;\ 60125cf1a30Sjl add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 60225cf1a30Sjl ldda [tmp1]ASI_BLK_P, %f32 ;\ 60325cf1a30Sjl membar #Sync 60425cf1a30Sjl 60525cf1a30Sjl#define BST_FPQ2Q4_TOSTACK(tmp1) \ 60625cf1a30Sjl /* membar #Sync */ ;\ 60725cf1a30Sjl add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 60825cf1a30Sjl and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 60925cf1a30Sjl stda %f16, [tmp1]ASI_BLK_P ;\ 61025cf1a30Sjl add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 61125cf1a30Sjl stda %f48, [tmp1]ASI_BLK_P ;\ 61225cf1a30Sjl membar #Sync 61325cf1a30Sjl 61425cf1a30Sjl#define BLD_FPQ2Q4_FROMSTACK(tmp1) \ 61525cf1a30Sjl /* membar #Sync - provided at copy completion */ ;\ 61625cf1a30Sjl add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 61725cf1a30Sjl and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 61825cf1a30Sjl ldda [tmp1]ASI_BLK_P, %f16 ;\ 61925cf1a30Sjl add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 62025cf1a30Sjl ldda [tmp1]ASI_BLK_P, %f48 ;\ 62125cf1a30Sjl membar #Sync 62225cf1a30Sjl 62325cf1a30Sjl/* 62425cf1a30Sjl * FP_NOMIGRATE and FP_ALLOWMIGRATE. 
Prevent migration (or, stronger, 62525cf1a30Sjl * prevent preemption if there is no t_lwp to save FP state to on context 62625cf1a30Sjl * switch) before commencing a FP copy, and reallow it on completion or 62725cf1a30Sjl * in error trampoline paths when we were using FP copy. 62825cf1a30Sjl * 62925cf1a30Sjl * Both macros may call other functions, so be aware that all outputs are 63025cf1a30Sjl * forfeit after using these macros. For this reason we do not pass registers 63125cf1a30Sjl * to use - we just use any outputs we want. 63225cf1a30Sjl * 63325cf1a30Sjl * Pseudo code: 63425cf1a30Sjl * 63525cf1a30Sjl * FP_NOMIGRATE: 63625cf1a30Sjl * 63725cf1a30Sjl * if (curthread->t_lwp) { 63825cf1a30Sjl * thread_nomigrate(); 63925cf1a30Sjl * } else { 64025cf1a30Sjl * kpreempt_disable(); 64125cf1a30Sjl * } 64225cf1a30Sjl * 64325cf1a30Sjl * FP_ALLOWMIGRATE: 64425cf1a30Sjl * 64525cf1a30Sjl * if (curthread->t_lwp) { 64625cf1a30Sjl * thread_allowmigrate(); 64725cf1a30Sjl * } else { 64825cf1a30Sjl * kpreempt_enable(); 64925cf1a30Sjl * } 65025cf1a30Sjl */ 65125cf1a30Sjl 65225cf1a30Sjl#define FP_NOMIGRATE(label1, label2) \ 65325cf1a30Sjl ldn [THREAD_REG + T_LWP], %o0 ;\ 65425cf1a30Sjl brz,a,pn %o0, label1/**/f ;\ 65525cf1a30Sjl ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ 65625cf1a30Sjl call thread_nomigrate ;\ 65725cf1a30Sjl nop ;\ 65825cf1a30Sjl ba label2/**/f ;\ 65925cf1a30Sjl nop ;\ 66025cf1a30Sjllabel1: ;\ 66125cf1a30Sjl inc %o1 ;\ 66225cf1a30Sjl stb %o1, [THREAD_REG + T_PREEMPT] ;\ 66325cf1a30Sjllabel2: /* no-lwp case: disable preemption by incrementing t_preempt directly */ 66425cf1a30Sjl 66525cf1a30Sjl#define FP_ALLOWMIGRATE(label1, label2) \ 66625cf1a30Sjl ldn [THREAD_REG + T_LWP], %o0 ;\ 66725cf1a30Sjl brz,a,pn %o0, label1/**/f ;\ 66825cf1a30Sjl ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ 66925cf1a30Sjl call thread_allowmigrate ;\ 67025cf1a30Sjl nop ;\ 67125cf1a30Sjl ba label2/**/f ;\ 67225cf1a30Sjl nop ;\ 67325cf1a30Sjllabel1: ;\ 67425cf1a30Sjl dec %o1 ;\ 67525cf1a30Sjl brnz,pn %o1, label2/**/f ;\ 67625cf1a30Sjl stb %o1, [THREAD_REG + T_PREEMPT] ;\ 67725cf1a30Sjl ldn [THREAD_REG + T_CPU], %o0 ;\ 67825cf1a30Sjl ldub [%o0 + CPU_KPRUNRUN], %o0 ;\ 67925cf1a30Sjl brz,pt %o0, label2/**/f ;\ 68025cf1a30Sjl nop ;\ 68125cf1a30Sjl call kpreempt ;\ 68225cf1a30Sjl rdpr %pil, %o0 ;\ 68325cf1a30Sjllabel2: /* no-lwp case: on t_preempt hitting 0, honor a pending kpreempt (cpu_kprunrun) */ 68425cf1a30Sjl 68525cf1a30Sjl/* 68625cf1a30Sjl * Copy a block of storage, returning an error code if `from' or 68725cf1a30Sjl * `to' takes a kernel pagefault which cannot be resolved. 68825cf1a30Sjl * Returns errno value on pagefault error, 0 if all ok 68925cf1a30Sjl */ 69025cf1a30Sjl 69125cf1a30Sjl .seg ".text" 69225cf1a30Sjl .align 4 69325cf1a30Sjl /* kcopy(from, to, len): %o0 = src, %o1 = dst, %o2 = count; routes to leaf (.kcopy_small) or FP (.kcopy_more) path by size, mutual alignment, and the per-alignment hw_copy_limit_* tunables */ 69425cf1a30Sjl ENTRY(kcopy) 69525cf1a30Sjl 69625cf1a30Sjl cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 69725cf1a30Sjl bleu,pt %ncc, .kcopy_small ! go to larger cases 69825cf1a30Sjl xor %o0, %o1, %o3 ! are src, dst alignable? 69925cf1a30Sjl btst 7, %o3 ! 70025cf1a30Sjl bz,pt %ncc, .kcopy_8 ! check for longword alignment 70125cf1a30Sjl nop 70225cf1a30Sjl btst 1, %o3 ! 70325cf1a30Sjl bz,pt %ncc, .kcopy_2 ! check for half-word 70425cf1a30Sjl nop 70525cf1a30Sjl sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 70625cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_1)], %o3 70725cf1a30Sjl tst %o3 70825cf1a30Sjl bz,pn %icc, .kcopy_small ! if zero, disable HW copy 70925cf1a30Sjl cmp %o2, %o3 ! if length <= limit 71025cf1a30Sjl bleu,pt %ncc, .kcopy_small ! go to small copy 71125cf1a30Sjl nop 71225cf1a30Sjl ba,pt %ncc, .kcopy_more ! otherwise go to large copy 71325cf1a30Sjl nop 71425cf1a30Sjl.kcopy_2: 71525cf1a30Sjl btst 3, %o3 ! 71625cf1a30Sjl bz,pt %ncc, .kcopy_4 ! check for word alignment 71725cf1a30Sjl nop 71825cf1a30Sjl sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 71925cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_2)], %o3 72025cf1a30Sjl tst %o3 72125cf1a30Sjl bz,pn %icc, .kcopy_small ! if zero, disable HW copy 72225cf1a30Sjl cmp %o2, %o3 ! if length <= limit 72325cf1a30Sjl bleu,pt %ncc, .kcopy_small ! go to small copy 72425cf1a30Sjl nop 72525cf1a30Sjl ba,pt %ncc, .kcopy_more ! otherwise go to large copy 72625cf1a30Sjl nop 72725cf1a30Sjl.kcopy_4: 72825cf1a30Sjl ! already checked longword, must be word aligned 72925cf1a30Sjl sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 73025cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_4)], %o3 73125cf1a30Sjl tst %o3 73225cf1a30Sjl bz,pn %icc, .kcopy_small ! if zero, disable HW copy 73325cf1a30Sjl cmp %o2, %o3 ! if length <= limit 73425cf1a30Sjl bleu,pt %ncc, .kcopy_small ! go to small copy 73525cf1a30Sjl nop 73625cf1a30Sjl ba,pt %ncc, .kcopy_more ! otherwise go to large copy 73725cf1a30Sjl nop 73825cf1a30Sjl.kcopy_8: 73925cf1a30Sjl sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 74025cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_8)], %o3 74125cf1a30Sjl tst %o3 74225cf1a30Sjl bz,pn %icc, .kcopy_small ! if zero, disable HW copy 74325cf1a30Sjl cmp %o2, %o3 ! if length <= limit 74425cf1a30Sjl bleu,pt %ncc, .kcopy_small ! go to small copy 74525cf1a30Sjl nop 74625cf1a30Sjl ba,pt %ncc, .kcopy_more ! otherwise go to large copy 74725cf1a30Sjl nop 74825cf1a30Sjl /* .kcopy_small always installs .sm_copyerr in t_lofault (leaf, no window); .kcopy_more installs .copyerr in a new register window for the FP path */ 74925cf1a30Sjl.kcopy_small: 75025cf1a30Sjl sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value 75125cf1a30Sjl or %o5, %lo(.sm_copyerr), %o5 75225cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 75325cf1a30Sjl membar #Sync ! sync error barrier 75425cf1a30Sjl ba,pt %ncc, .sm_do_copy ! common code 75525cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 75625cf1a30Sjl 75725cf1a30Sjl.kcopy_more: 75825cf1a30Sjl save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 75925cf1a30Sjl sethi %hi(.copyerr), %l7 ! copyerr is lofault value 76025cf1a30Sjl or %l7, %lo(.copyerr), %l7 76125cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 76225cf1a30Sjl membar #Sync ! sync error barrier 76325cf1a30Sjl ba,pt %ncc, .do_copy ! common code 76425cf1a30Sjl stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 76525cf1a30Sjl 76625cf1a30Sjl 76725cf1a30Sjl/* 76825cf1a30Sjl * We got here because of a fault during bcopy_more, called from kcopy or bcopy. 76925cf1a30Sjl * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3. 77025cf1a30Sjl */ 77125cf1a30Sjl.copyerr: 77225cf1a30Sjl set .copyerr2, %l0 77325cf1a30Sjl membar #Sync ! sync error barrier 77425cf1a30Sjl stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault 77525cf1a30Sjl btst FPUSED_FLAG, %l6 77625cf1a30Sjl bz %ncc, 1f 77725cf1a30Sjl and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0 77825cf1a30Sjl /* FP was in use: restore %gsr, then either reload the caller's FP regs (FPRS_FEF was set) or zero quadrants 1/3 to avoid leaking copy data */ 77925cf1a30Sjl ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 78025cf1a30Sjl wr %o2, 0, %gsr 78125cf1a30Sjl 78225cf1a30Sjl ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 78325cf1a30Sjl btst FPRS_FEF, %o3 78425cf1a30Sjl bz,pt %icc, 4f 78525cf1a30Sjl nop 78625cf1a30Sjl 78725cf1a30Sjl BLD_FPQ1Q3_FROMSTACK(%o2) 78825cf1a30Sjl 78925cf1a30Sjl ba,pt %ncc, 1f 79025cf1a30Sjl wr %o3, 0, %fprs ! restore fprs 79125cf1a30Sjl 79225cf1a30Sjl4: 79325cf1a30Sjl FZEROQ1Q3 79425cf1a30Sjl wr %o3, 0, %fprs ! restore fprs 79525cf1a30Sjl 79625cf1a30Sjl ! 79725cf1a30Sjl ! Need to cater for the different expectations of kcopy 79825cf1a30Sjl ! and bcopy. kcopy will *always* set a t_lofault handler 79925cf1a30Sjl ! If it fires, we're expected to just return the error code 80025cf1a30Sjl ! and *not* to invoke any existing error handler. As far as 80125cf1a30Sjl ! bcopy is concerned, we only set t_lofault if there was an 80225cf1a30Sjl ! existing lofault handler. In that case we're expected to 80325cf1a30Sjl ! invoke the previously existing handler after resetting the 80425cf1a30Sjl ! t_lofault value. 80525cf1a30Sjl ! 80625cf1a30Sjl1: 80725cf1a30Sjl andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off 80825cf1a30Sjl membar #Sync ! sync error barrier 80925cf1a30Sjl stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 81025cf1a30Sjl FP_ALLOWMIGRATE(5, 6) 81125cf1a30Sjl 81225cf1a30Sjl btst TRAMP_FLAG, %l0 81325cf1a30Sjl bnz,pn %ncc, 3f 81425cf1a30Sjl nop 81525cf1a30Sjl ret 81625cf1a30Sjl restore %g1, 0, %o0 81725cf1a30Sjl 81825cf1a30Sjl3: 81925cf1a30Sjl ! 82025cf1a30Sjl ! We're here via bcopy. There *must* have been an error handler 82125cf1a30Sjl ! in place otherwise we would have died a nasty death already. 82225cf1a30Sjl ! 82325cf1a30Sjl jmp %l6 ! goto real handler 82425cf1a30Sjl restore %g0, 0, %o0 ! dispose of copy window 82525cf1a30Sjl 82625cf1a30Sjl/* 82725cf1a30Sjl * We got here because of a fault in .copyerr. We can't safely restore fp 82825cf1a30Sjl * state, so we panic. 82925cf1a30Sjl */ 83025cf1a30Sjlfp_panic_msg: 83125cf1a30Sjl .asciz "Unable to restore fp state after copy operation" 83225cf1a30Sjl 83325cf1a30Sjl .align 4 83425cf1a30Sjl.copyerr2: 83525cf1a30Sjl set fp_panic_msg, %o0 83625cf1a30Sjl call panic 83725cf1a30Sjl nop 83825cf1a30Sjl 83925cf1a30Sjl/* 84025cf1a30Sjl * We got here because of a fault during a small kcopy or bcopy. 84125cf1a30Sjl * No floating point registers are used by the small copies. 84225cf1a30Sjl * Errno value is in %g1. 84325cf1a30Sjl */ 84425cf1a30Sjl.sm_copyerr: 84525cf1a30Sjl1: 84625cf1a30Sjl btst TRAMP_FLAG, %o4 84725cf1a30Sjl membar #Sync 84825cf1a30Sjl andn %o4, TRAMP_FLAG, %o4 84925cf1a30Sjl bnz,pn %ncc, 3f 85025cf1a30Sjl stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 85125cf1a30Sjl retl 85225cf1a30Sjl mov %g1, %o0 85325cf1a30Sjl3: 85425cf1a30Sjl jmp %o4 ! goto real handler 85525cf1a30Sjl mov %g0, %o0 ! 85625cf1a30Sjl 85725cf1a30Sjl SET_SIZE(kcopy) 85825cf1a30Sjl 85925cf1a30Sjl 86025cf1a30Sjl/* 86125cf1a30Sjl * Copy a block of storage - must not overlap (from + len <= to). 86225cf1a30Sjl * Registers: l6 - saved t_lofault 86325cf1a30Sjl * (for short copies, o4 - saved t_lofault) 86425cf1a30Sjl * 86525cf1a30Sjl * Copy a page of memory. 86625cf1a30Sjl * Assumes double word alignment and a count >= 256. 86725cf1a30Sjl */ 86825cf1a30Sjl 86925cf1a30Sjl ENTRY(bcopy) 87025cf1a30Sjl 87125cf1a30Sjl cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 87225cf1a30Sjl bleu,pt %ncc, .bcopy_small ! go to larger cases 87325cf1a30Sjl xor %o0, %o1, %o3 !
are src, dst alignable? 87425cf1a30Sjl btst 7, %o3 ! 87525cf1a30Sjl bz,pt %ncc, .bcopy_8 ! check for longword alignment 87625cf1a30Sjl nop 87725cf1a30Sjl btst 1, %o3 ! 87825cf1a30Sjl bz,pt %ncc, .bcopy_2 ! check for half-word 87925cf1a30Sjl nop 88025cf1a30Sjl sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 88125cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_1)], %o3 88225cf1a30Sjl tst %o3 88325cf1a30Sjl bz,pn %icc, .bcopy_small ! if zero, disable HW copy 88425cf1a30Sjl cmp %o2, %o3 ! if length <= limit 88525cf1a30Sjl bleu,pt %ncc, .bcopy_small ! go to small copy 88625cf1a30Sjl nop 88725cf1a30Sjl ba,pt %ncc, .bcopy_more ! otherwise go to large copy 88825cf1a30Sjl nop 88925cf1a30Sjl.bcopy_2: 89025cf1a30Sjl btst 3, %o3 ! 89125cf1a30Sjl bz,pt %ncc, .bcopy_4 ! check for word alignment 89225cf1a30Sjl nop 89325cf1a30Sjl sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 89425cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_2)], %o3 89525cf1a30Sjl tst %o3 89625cf1a30Sjl bz,pn %icc, .bcopy_small ! if zero, disable HW copy 89725cf1a30Sjl cmp %o2, %o3 ! if length <= limit 89825cf1a30Sjl bleu,pt %ncc, .bcopy_small ! go to small copy 89925cf1a30Sjl nop 90025cf1a30Sjl ba,pt %ncc, .bcopy_more ! otherwise go to large copy 90125cf1a30Sjl nop 90225cf1a30Sjl.bcopy_4: 90325cf1a30Sjl ! already checked longword, must be word aligned 90425cf1a30Sjl sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 90525cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_4)], %o3 90625cf1a30Sjl tst %o3 90725cf1a30Sjl bz,pn %icc, .bcopy_small ! if zero, disable HW copy 90825cf1a30Sjl cmp %o2, %o3 ! if length <= limit 90925cf1a30Sjl bleu,pt %ncc, .bcopy_small ! go to small copy 91025cf1a30Sjl nop 91125cf1a30Sjl ba,pt %ncc, .bcopy_more ! otherwise go to large copy 91225cf1a30Sjl nop 91325cf1a30Sjl.bcopy_8: 91425cf1a30Sjl sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 91525cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_8)], %o3 91625cf1a30Sjl tst %o3 91725cf1a30Sjl bz,pn %icc, .bcopy_small ! if zero, disable HW copy 91825cf1a30Sjl cmp %o2, %o3 ! if length <= limit 91925cf1a30Sjl bleu,pt %ncc, .bcopy_small ! go to small copy 92025cf1a30Sjl nop 92125cf1a30Sjl ba,pt %ncc, .bcopy_more ! otherwise go to large copy 92225cf1a30Sjl nop 92325cf1a30Sjl /* .bcopy_small: non-FP leaf path; installs .sm_copyerr (with TRAMP_FLAG in %o4) only when a previous t_lofault handler exists */ 92425cf1a30Sjl .align 16 92525cf1a30Sjl.bcopy_small: 92625cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault 92725cf1a30Sjl tst %o4 92825cf1a30Sjl bz,pt %icc, .sm_do_copy 92925cf1a30Sjl nop 93025cf1a30Sjl sethi %hi(.sm_copyerr), %o5 93125cf1a30Sjl or %o5, %lo(.sm_copyerr), %o5 93225cf1a30Sjl membar #Sync ! sync error barrier 93325cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector 93425cf1a30Sjl or %o4, TRAMP_FLAG, %o4 ! error should trampoline 93525cf1a30Sjl.sm_do_copy: 93625cf1a30Sjl cmp %o2, SHORTCOPY ! check for really short case 93725cf1a30Sjl bleu,pt %ncc, .bc_sm_left ! 93825cf1a30Sjl cmp %o2, CHKSIZE ! check for medium length cases 93925cf1a30Sjl bgu,pn %ncc, .bc_med ! 94025cf1a30Sjl or %o0, %o1, %o3 ! prepare alignment check 94125cf1a30Sjl andcc %o3, 0x3, %g0 ! test for alignment 94225cf1a30Sjl bz,pt %ncc, .bc_sm_word ! branch to word aligned case 94325cf1a30Sjl.bc_sm_movebytes: 94425cf1a30Sjl sub %o2, 3, %o2 ! adjust count to allow cc zero test 94525cf1a30Sjl.bc_sm_notalign4: 94625cf1a30Sjl ldub [%o0], %o3 ! read byte 94725cf1a30Sjl stb %o3, [%o1] ! write byte 94825cf1a30Sjl subcc %o2, 4, %o2 ! reduce count by 4 94925cf1a30Sjl ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 95025cf1a30Sjl add %o0, 4, %o0 ! advance SRC by 4 95125cf1a30Sjl stb %o3, [%o1 + 1] 95225cf1a30Sjl ldub [%o0 - 2], %o3 95325cf1a30Sjl add %o1, 4, %o1 ! advance DST by 4 95425cf1a30Sjl stb %o3, [%o1 - 2] 95525cf1a30Sjl ldub [%o0 - 1], %o3 95625cf1a30Sjl bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain 95725cf1a30Sjl stb %o3, [%o1 - 1] 95825cf1a30Sjl add %o2, 3, %o2 ! restore count 95925cf1a30Sjl.bc_sm_left: 96025cf1a30Sjl tst %o2 96125cf1a30Sjl bz,pt %ncc, .bc_sm_exit ! check for zero length 96225cf1a30Sjl deccc %o2 ! reduce count for cc test 96325cf1a30Sjl ldub [%o0], %o3 ! move one byte 96425cf1a30Sjl bz,pt %ncc, .bc_sm_exit 96525cf1a30Sjl stb %o3, [%o1] 96625cf1a30Sjl ldub [%o0 + 1], %o3 ! move another byte 96725cf1a30Sjl deccc %o2 ! check for more 96825cf1a30Sjl bz,pt %ncc, .bc_sm_exit 96925cf1a30Sjl stb %o3, [%o1 + 1] 97025cf1a30Sjl ldub [%o0 + 2], %o3 ! move final byte 971e64c6c3fSMichael Bergknoff ba,pt %ncc, .bc_sm_exit 972e64c6c3fSMichael Bergknoff stb %o3, [%o1 + 2] 97325cf1a30Sjl .align 16 97425cf1a30Sjl nop ! instruction alignment 97525cf1a30Sjl ! see discussion at start of file 97625cf1a30Sjl.bc_sm_words: 97725cf1a30Sjl lduw [%o0], %o3 ! read word 97825cf1a30Sjl.bc_sm_wordx: 97925cf1a30Sjl subcc %o2, 8, %o2 ! update count 98025cf1a30Sjl stw %o3, [%o1] ! write word 98125cf1a30Sjl add %o0, 8, %o0 ! update SRC 98225cf1a30Sjl lduw [%o0 - 4], %o3 ! read word 98325cf1a30Sjl add %o1, 8, %o1 ! update DST 98425cf1a30Sjl bgt,pt %ncc, .bc_sm_words ! loop til done 98525cf1a30Sjl stw %o3, [%o1 - 4] ! write word 98625cf1a30Sjl addcc %o2, 7, %o2 ! restore count 98725cf1a30Sjl bz,pt %ncc, .bc_sm_exit 98825cf1a30Sjl deccc %o2 98925cf1a30Sjl bz,pt %ncc, .bc_sm_byte 99025cf1a30Sjl.bc_sm_half: 99125cf1a30Sjl subcc %o2, 2, %o2 ! reduce count by 2 99225cf1a30Sjl add %o0, 2, %o0 ! advance SRC by 2 99325cf1a30Sjl lduh [%o0 - 2], %o3 ! read half word 99425cf1a30Sjl add %o1, 2, %o1 ! advance DST by 2 99525cf1a30Sjl bgt,pt %ncc, .bc_sm_half ! loop til done 99625cf1a30Sjl sth %o3, [%o1 - 2] ! write half word 99725cf1a30Sjl addcc %o2, 1, %o2 ! restore count 99825cf1a30Sjl bz,pt %ncc, .bc_sm_exit 99925cf1a30Sjl nop 100025cf1a30Sjl.bc_sm_byte: 100125cf1a30Sjl ldub [%o0], %o3 1002e64c6c3fSMichael Bergknoff ba,pt %ncc, .bc_sm_exit 1003e64c6c3fSMichael Bergknoff stb %o3, [%o1] 100425cf1a30Sjl 100525cf1a30Sjl.bcopy_small: 100625cf1a30Sjl subcc %o2, 4, %o2 ! update count 100725cf1a30Sjl bgt,pt %ncc, .bc_sm_wordx 100825cf1a30Sjl lduw [%o0], %o3 ! read word 100925cf1a30Sjl addcc %o2, 3, %o2 ! restore count 101025cf1a30Sjl bz,pt %ncc, .bc_sm_exit 101125cf1a30Sjl stw %o3, [%o1] ! write word 101225cf1a30Sjl deccc %o2 ! reduce count for cc test 101325cf1a30Sjl ldub [%o0 + 4], %o3 ! load one byte 101425cf1a30Sjl bz,pt %ncc, .bc_sm_exit 101525cf1a30Sjl stb %o3, [%o1 + 4] ! store one byte 101625cf1a30Sjl ldub [%o0 + 5], %o3 ! load second byte 101725cf1a30Sjl deccc %o2 101825cf1a30Sjl bz,pt %ncc, .bc_sm_exit 101925cf1a30Sjl stb %o3, [%o1 + 5] ! store second byte 102025cf1a30Sjl ldub [%o0 + 6], %o3 ! load third byte 102125cf1a30Sjl stb %o3, [%o1 + 6] ! store third byte 102225cf1a30Sjl.bc_sm_exit: 1023e64c6c3fSMichael Bergknoff ldn [THREAD_REG + T_LOFAULT], %o3 1024e64c6c3fSMichael Bergknoff brz,pt %o3, .bc_sm_done 10250090fbabSkm nop 102625cf1a30Sjl membar #Sync ! sync error barrier 102725cf1a30Sjl andn %o4, TRAMP_FLAG, %o4 102825cf1a30Sjl stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 10290090fbabSkm.bc_sm_done: 103025cf1a30Sjl retl 103125cf1a30Sjl mov %g0, %o0 ! return 0 103225cf1a30Sjl /* .bc_med: medium-length copies; dispatch on mutual src/dst alignment to byte, halfword, word, or longword unrolled loops */ 103325cf1a30Sjl .align 16 103425cf1a30Sjl.bc_med: 103525cf1a30Sjl xor %o0, %o1, %o3 ! setup alignment check 103625cf1a30Sjl btst 1, %o3 103725cf1a30Sjl bnz,pt %ncc, .bc_sm_movebytes ! unaligned 103825cf1a30Sjl nop 103925cf1a30Sjl btst 3, %o3 104025cf1a30Sjl bnz,pt %ncc, .bc_med_half ! halfword aligned 104125cf1a30Sjl nop 104225cf1a30Sjl btst 7, %o3 104325cf1a30Sjl bnz,pt %ncc, .bc_med_word ! word aligned 104425cf1a30Sjl nop 104525cf1a30Sjl.bc_med_long: 104625cf1a30Sjl btst 3, %o0 ! check for 104725cf1a30Sjl bz,pt %ncc, .bc_med_long1 ! word alignment 104825cf1a30Sjl nop 104925cf1a30Sjl.bc_med_long0: 105025cf1a30Sjl ldub [%o0], %o3 ! load one byte 105125cf1a30Sjl inc %o0 105225cf1a30Sjl stb %o3,[%o1] ! store byte 105325cf1a30Sjl inc %o1 105425cf1a30Sjl btst 3, %o0 105525cf1a30Sjl bnz,pt %ncc, .bc_med_long0 105625cf1a30Sjl dec %o2 105725cf1a30Sjl.bc_med_long1: ! word aligned 105825cf1a30Sjl btst 7, %o0 ! check for long word 105925cf1a30Sjl bz,pt %ncc, .bc_med_long2 106025cf1a30Sjl nop 106125cf1a30Sjl lduw [%o0], %o3 ! load word 106225cf1a30Sjl add %o0, 4, %o0 ! advance SRC by 4 106325cf1a30Sjl stw %o3, [%o1] ! store word 106425cf1a30Sjl add %o1, 4, %o1 ! advance DST by 4 106525cf1a30Sjl sub %o2, 4, %o2 ! reduce count by 4 106625cf1a30Sjl! 106725cf1a30Sjl! Now long word aligned and have at least 32 bytes to move 106825cf1a30Sjl! 106925cf1a30Sjl.bc_med_long2: 107025cf1a30Sjl sub %o2, 31, %o2 ! adjust count to allow cc zero test 107125cf1a30Sjl.bc_med_lmove: 107225cf1a30Sjl ldx [%o0], %o3 ! read long word 107325cf1a30Sjl stx %o3, [%o1] ! write long word 107425cf1a30Sjl subcc %o2, 32, %o2 ! reduce count by 32 107525cf1a30Sjl ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words 107625cf1a30Sjl add %o0, 32, %o0 ! advance SRC by 32 107725cf1a30Sjl stx %o3, [%o1 + 8] 107825cf1a30Sjl ldx [%o0 - 16], %o3 107925cf1a30Sjl add %o1, 32, %o1 ! advance DST by 32 108025cf1a30Sjl stx %o3, [%o1 - 16] 108125cf1a30Sjl ldx [%o0 - 8], %o3 108225cf1a30Sjl bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left 108325cf1a30Sjl stx %o3, [%o1 - 8] 108425cf1a30Sjl addcc %o2, 24, %o2 ! restore count to long word offset 108525cf1a30Sjl ble,pt %ncc, .bc_med_lextra ! check for more long words to move 108625cf1a30Sjl nop 108725cf1a30Sjl.bc_med_lword: 108825cf1a30Sjl ldx [%o0], %o3 ! read long word 108925cf1a30Sjl subcc %o2, 8, %o2 ! reduce count by 8 109025cf1a30Sjl stx %o3, [%o1] ! write long word 109125cf1a30Sjl add %o0, 8, %o0 ! advance SRC by 8 109225cf1a30Sjl bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left 109325cf1a30Sjl add %o1, 8, %o1 ! advance DST by 8 109425cf1a30Sjl.bc_med_lextra: 109525cf1a30Sjl addcc %o2, 7, %o2 ! restore rest of count 109625cf1a30Sjl bz,pt %ncc, .bc_sm_exit ! if zero, then done 109725cf1a30Sjl deccc %o2 109825cf1a30Sjl bz,pt %ncc, .bc_sm_byte 109925cf1a30Sjl nop 110025cf1a30Sjl ba,pt %ncc, .bc_sm_half 110125cf1a30Sjl nop 110225cf1a30Sjl 110325cf1a30Sjl .align 16 110425cf1a30Sjl.bc_med_word: 110525cf1a30Sjl btst 3, %o0 ! check for 110625cf1a30Sjl bz,pt %ncc, .bc_med_word1 ! word alignment 110725cf1a30Sjl nop 110825cf1a30Sjl.bc_med_word0: 110925cf1a30Sjl ldub [%o0], %o3 ! load one byte 111025cf1a30Sjl inc %o0 111125cf1a30Sjl stb %o3,[%o1] ! store byte 111225cf1a30Sjl inc %o1 111325cf1a30Sjl btst 3, %o0 111425cf1a30Sjl bnz,pt %ncc, .bc_med_word0 111525cf1a30Sjl dec %o2 111625cf1a30Sjl! 111725cf1a30Sjl! Now word aligned and have at least 36 bytes to move 111825cf1a30Sjl! 111925cf1a30Sjl.bc_med_word1: 112025cf1a30Sjl sub %o2, 15, %o2 ! adjust count to allow cc zero test 112125cf1a30Sjl.bc_med_wmove: 112225cf1a30Sjl lduw [%o0], %o3 ! read word 112325cf1a30Sjl stw %o3, [%o1] ! write word 112425cf1a30Sjl subcc %o2, 16, %o2 ! reduce count by 16 112525cf1a30Sjl lduw [%o0 + 4], %o3 ! repeat for a total for 4 words 112625cf1a30Sjl add %o0, 16, %o0 ! advance SRC by 16 112725cf1a30Sjl stw %o3, [%o1 + 4] 112825cf1a30Sjl lduw [%o0 - 8], %o3 112925cf1a30Sjl add %o1, 16, %o1 ! advance DST by 16 113025cf1a30Sjl stw %o3, [%o1 - 8] 113125cf1a30Sjl lduw [%o0 - 4], %o3 113225cf1a30Sjl bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left 113325cf1a30Sjl stw %o3, [%o1 - 4] 113425cf1a30Sjl addcc %o2, 12, %o2 ! restore count to word offset 113525cf1a30Sjl ble,pt %ncc, .bc_med_wextra ! check for more words to move 113625cf1a30Sjl nop 113725cf1a30Sjl.bc_med_word2: 113825cf1a30Sjl lduw [%o0], %o3 ! read word 113925cf1a30Sjl subcc %o2, 4, %o2 ! reduce count by 4 114025cf1a30Sjl stw %o3, [%o1] ! write word 114125cf1a30Sjl add %o0, 4, %o0 ! advance SRC by 4 114225cf1a30Sjl bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left 114325cf1a30Sjl add %o1, 4, %o1 ! advance DST by 4 114425cf1a30Sjl.bc_med_wextra: 114525cf1a30Sjl addcc %o2, 3, %o2 ! restore rest of count 114625cf1a30Sjl bz,pt %ncc, .bc_sm_exit ! if zero, then done 114725cf1a30Sjl deccc %o2 114825cf1a30Sjl bz,pt %ncc, .bc_sm_byte 114925cf1a30Sjl nop 115025cf1a30Sjl ba,pt %ncc, .bc_sm_half 115125cf1a30Sjl nop 115225cf1a30Sjl 115325cf1a30Sjl .align 16 115425cf1a30Sjl.bc_med_half: 115525cf1a30Sjl btst 1, %o0 ! check for 115625cf1a30Sjl bz,pt %ncc, .bc_med_half1 ! half word alignment 115725cf1a30Sjl nop 115825cf1a30Sjl ldub [%o0], %o3 ! load one byte 115925cf1a30Sjl inc %o0 116025cf1a30Sjl stb %o3,[%o1] ! store byte 116125cf1a30Sjl inc %o1 116225cf1a30Sjl dec %o2 116325cf1a30Sjl! 116425cf1a30Sjl! Now half word aligned and have at least 38 bytes to move 116525cf1a30Sjl! 116625cf1a30Sjl.bc_med_half1: 116725cf1a30Sjl sub %o2, 7, %o2 ! adjust count to allow cc zero test 116825cf1a30Sjl.bc_med_hmove: 116925cf1a30Sjl lduh [%o0], %o3 ! read half word 117025cf1a30Sjl sth %o3, [%o1] ! write half word 117125cf1a30Sjl subcc %o2, 8, %o2 ! reduce count by 8 117225cf1a30Sjl lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords 117325cf1a30Sjl add %o0, 8, %o0 ! advance SRC by 8 117425cf1a30Sjl sth %o3, [%o1 + 2] 117525cf1a30Sjl lduh [%o0 - 4], %o3 117625cf1a30Sjl add %o1, 8, %o1 ! advance DST by 8 117725cf1a30Sjl sth %o3, [%o1 - 4] 117825cf1a30Sjl lduh [%o0 - 2], %o3 117925cf1a30Sjl bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left 118025cf1a30Sjl sth %o3, [%o1 - 2] 118125cf1a30Sjl addcc %o2, 7, %o2 ! restore count 118225cf1a30Sjl bz,pt %ncc, .bc_sm_exit 118325cf1a30Sjl deccc %o2 118425cf1a30Sjl bz,pt %ncc, .bc_sm_byte 118525cf1a30Sjl nop 118625cf1a30Sjl ba,pt %ncc, .bc_sm_half 118725cf1a30Sjl nop 118825cf1a30Sjl 118925cf1a30Sjl SET_SIZE(bcopy) 119025cf1a30Sjl 119125cf1a30Sjl/* 119225cf1a30Sjl * The _more entry points are not intended to be used directly by 119325cf1a30Sjl * any caller from outside this file.
They are provided to allow 119425cf1a30Sjl * profiling and dtrace of the portions of the copy code that uses 119525cf1a30Sjl * the floating point registers. 119625cf1a30Sjl * This entry is particularly important as DTRACE (at least as of 119725cf1a30Sjl * 4/2004) does not support leaf functions. 119825cf1a30Sjl */ 119925cf1a30Sjl /* bcopy_more: large-copy path; takes a new register window, installs .copyerr when a handler existed (TRAMP_FLAG in %l6), and uses FP quadrants 1 and 3 */ 120025cf1a30Sjl ENTRY(bcopy_more) 120125cf1a30Sjl.bcopy_more: 120225cf1a30Sjl prefetch [%o0], #n_reads 120325cf1a30Sjl save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 120425cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault 120525cf1a30Sjl tst %l6 120625cf1a30Sjl bz,pt %ncc, .do_copy 120725cf1a30Sjl nop 120825cf1a30Sjl sethi %hi(.copyerr), %o2 120925cf1a30Sjl or %o2, %lo(.copyerr), %o2 121025cf1a30Sjl membar #Sync ! sync error barrier 121125cf1a30Sjl stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector 121225cf1a30Sjl ! 121325cf1a30Sjl ! We've already captured whether t_lofault was zero on entry. 121425cf1a30Sjl ! We need to mark ourselves as being from bcopy since both 121525cf1a30Sjl ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set 121625cf1a30Sjl ! and the saved lofault was zero, we won't reset lofault on 121725cf1a30Sjl ! returning. 121825cf1a30Sjl ! 121925cf1a30Sjl or %l6, TRAMP_FLAG, %l6 122025cf1a30Sjl 122125cf1a30Sjl/* 122225cf1a30Sjl * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes 122325cf1a30Sjl * Also, use of FP registers has been tested to be enabled 122425cf1a30Sjl */ 122525cf1a30Sjl.do_copy: 122625cf1a30Sjl FP_NOMIGRATE(6, 7) 122725cf1a30Sjl 122825cf1a30Sjl rd %fprs, %o2 ! check for unused fp 122925cf1a30Sjl st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 123025cf1a30Sjl btst FPRS_FEF, %o2 123125cf1a30Sjl bz,a,pt %icc, .do_blockcopy 123225cf1a30Sjl wr %g0, FPRS_FEF, %fprs 123325cf1a30Sjl 123425cf1a30Sjl BST_FPQ1Q3_TOSTACK(%o2) 123525cf1a30Sjl 123625cf1a30Sjl.do_blockcopy: 123725cf1a30Sjl rd %gsr, %o2 123825cf1a30Sjl stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 123925cf1a30Sjl or %l6, FPUSED_FLAG, %l6 124025cf1a30Sjl 124125cf1a30Sjl#define REALSRC %i0 124225cf1a30Sjl#define DST %i1 124325cf1a30Sjl#define CNT %i2 124425cf1a30Sjl#define SRC %i3 124525cf1a30Sjl#define TMP %i5 124625cf1a30Sjl 124725cf1a30Sjl andcc DST, VIS_BLOCKSIZE - 1, TMP 124825cf1a30Sjl bz,pt %ncc, 2f 124925cf1a30Sjl neg TMP 125025cf1a30Sjl add TMP, VIS_BLOCKSIZE, TMP 125125cf1a30Sjl 125225cf1a30Sjl ! TMP = bytes required to align DST on FP_BLOCK boundary 125325cf1a30Sjl ! Using SRC as a tmp here 125425cf1a30Sjl cmp TMP, 3 125525cf1a30Sjl bleu,pt %ncc, 1f 125625cf1a30Sjl sub CNT,TMP,CNT ! adjust main count 125725cf1a30Sjl sub TMP, 3, TMP ! adjust for end of loop test 125825cf1a30Sjl.bc_blkalign: 125925cf1a30Sjl ldub [REALSRC], SRC ! move 4 bytes per loop iteration 126025cf1a30Sjl stb SRC, [DST] 126125cf1a30Sjl subcc TMP, 4, TMP 126225cf1a30Sjl ldub [REALSRC + 1], SRC 126325cf1a30Sjl add REALSRC, 4, REALSRC 126425cf1a30Sjl stb SRC, [DST + 1] 126525cf1a30Sjl ldub [REALSRC - 2], SRC 126625cf1a30Sjl add DST, 4, DST 126725cf1a30Sjl stb SRC, [DST - 2] 126825cf1a30Sjl ldub [REALSRC - 1], SRC 126925cf1a30Sjl bgu,pt %ncc, .bc_blkalign 127025cf1a30Sjl stb SRC, [DST - 1] 127125cf1a30Sjl 127225cf1a30Sjl addcc TMP, 3, TMP ! restore count adjustment 127325cf1a30Sjl bz,pt %ncc, 2f ! no bytes left? 127425cf1a30Sjl nop 127525cf1a30Sjl1: ldub [REALSRC], SRC 127625cf1a30Sjl inc REALSRC 127725cf1a30Sjl inc DST 127825cf1a30Sjl deccc TMP 127925cf1a30Sjl bgu %ncc, 1b 128025cf1a30Sjl stb SRC, [DST - 1] 128125cf1a30Sjl 128225cf1a30Sjl2: 128325cf1a30Sjl membar #StoreLoad 128425cf1a30Sjl andn REALSRC, 0x7, SRC 128525cf1a30Sjl 128625cf1a30Sjl ! SRC - 8-byte aligned 128725cf1a30Sjl ! DST - 64-byte aligned 128825cf1a30Sjl ldd [SRC], %f0 128925cf1a30Sjl prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 129025cf1a30Sjl alignaddr REALSRC, %g0, %g0 129125cf1a30Sjl ldd [SRC + 0x08], %f2 129225cf1a30Sjl prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 129325cf1a30Sjl faligndata %f0, %f2, %f32 129425cf1a30Sjl ldd [SRC + 0x10], %f4 1295c8a722abSpm prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 129625cf1a30Sjl faligndata %f2, %f4, %f34 129725cf1a30Sjl ldd [SRC + 0x18], %f6 129825cf1a30Sjl prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 129925cf1a30Sjl faligndata %f4, %f6, %f36 130025cf1a30Sjl ldd [SRC + 0x20], %f8 1301c8a722abSpm prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read 130225cf1a30Sjl faligndata %f6, %f8, %f38 130325cf1a30Sjl ldd [SRC + 0x28], %f10 1304c8a722abSpm prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read 130525cf1a30Sjl faligndata %f8, %f10, %f40 130625cf1a30Sjl ldd [SRC + 0x30], %f12 1307c8a722abSpm prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read 130825cf1a30Sjl faligndata %f10, %f12, %f42 130925cf1a30Sjl ldd [SRC + 0x38], %f14 131025cf1a30Sjl ldd [SRC + VIS_BLOCKSIZE], %f0 131125cf1a30Sjl sub CNT, VIS_BLOCKSIZE, CNT 131225cf1a30Sjl add SRC, VIS_BLOCKSIZE, SRC 1313c8a722abSpm prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read 131425cf1a30Sjl add REALSRC, VIS_BLOCKSIZE, REALSRC 131525cf1a30Sjl ba,pt %ncc, 1f 1316c8a722abSpm prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read /* main loop below moves one 64-byte block per iteration: ldd + faligndata into %f32-%f46, then stda block store */ 131725cf1a30Sjl .align 32 131825cf1a30Sjl1: 131925cf1a30Sjl ldd [SRC + 0x08], %f2 132025cf1a30Sjl faligndata %f12, %f14, %f44 132125cf1a30Sjl ldd [SRC + 0x10], %f4 132225cf1a30Sjl faligndata %f14, %f0, %f46 132325cf1a30Sjl stda %f32, [DST]ASI_BLK_P 132425cf1a30Sjl ldd [SRC + 0x18], %f6 132525cf1a30Sjl faligndata %f0, %f2, %f32 132625cf1a30Sjl ldd [SRC + 0x20], %f8 132725cf1a30Sjl faligndata %f2, %f4, %f34 132825cf1a30Sjl ldd [SRC + 0x28], %f10 132925cf1a30Sjl faligndata %f4, %f6, %f36 133025cf1a30Sjl ldd [SRC + 0x30], %f12 133125cf1a30Sjl faligndata %f6, %f8, %f38 1332c8a722abSpm sub CNT, VIS_BLOCKSIZE, CNT 133325cf1a30Sjl ldd [SRC + 0x38], %f14 133425cf1a30Sjl faligndata %f8, %f10, %f40 1335c8a722abSpm add DST, VIS_BLOCKSIZE, DST 133625cf1a30Sjl ldd [SRC + VIS_BLOCKSIZE], %f0 133725cf1a30Sjl faligndata %f10, %f12, %f42 133825cf1a30Sjl add REALSRC, VIS_BLOCKSIZE, REALSRC 1339c8a722abSpm prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 1340c8a722abSpm add SRC, VIS_BLOCKSIZE, SRC 1341c8a722abSpm prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read 134225cf1a30Sjl cmp CNT, VIS_BLOCKSIZE + 8 134325cf1a30Sjl bgu,pt %ncc, 1b 1344c8a722abSpm prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 134525cf1a30Sjl 134625cf1a30Sjl ! only if REALSRC & 0x7 is 0 134725cf1a30Sjl cmp CNT, VIS_BLOCKSIZE 134825cf1a30Sjl bne %ncc, 3f 134925cf1a30Sjl andcc REALSRC, 0x7, %g0 135025cf1a30Sjl bz,pt %ncc, 2f 135125cf1a30Sjl nop 135225cf1a30Sjl3: 135325cf1a30Sjl faligndata %f12, %f14, %f44 135425cf1a30Sjl faligndata %f14, %f0, %f46 135525cf1a30Sjl stda %f32, [DST]ASI_BLK_P 135625cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 135725cf1a30Sjl ba,pt %ncc, 3f 135825cf1a30Sjl nop 135925cf1a30Sjl2: 136025cf1a30Sjl ldd [SRC + 0x08], %f2 136125cf1a30Sjl fsrc1 %f12, %f44 136225cf1a30Sjl ldd [SRC + 0x10], %f4 136325cf1a30Sjl fsrc1 %f14, %f46 136425cf1a30Sjl stda %f32, [DST]ASI_BLK_P 136525cf1a30Sjl ldd [SRC + 0x18], %f6 136625cf1a30Sjl fsrc1 %f0, %f32 136725cf1a30Sjl ldd [SRC + 0x20], %f8 136825cf1a30Sjl fsrc1 %f2, %f34 136925cf1a30Sjl ldd [SRC + 0x28], %f10 137025cf1a30Sjl fsrc1 %f4, %f36 137125cf1a30Sjl ldd [SRC + 0x30], %f12 137225cf1a30Sjl fsrc1 %f6, %f38 137325cf1a30Sjl ldd [SRC + 0x38], %f14 137425cf1a30Sjl fsrc1 %f8, %f40 137525cf1a30Sjl sub CNT, VIS_BLOCKSIZE, CNT 137625cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 137725cf1a30Sjl add SRC, VIS_BLOCKSIZE, SRC 137825cf1a30Sjl add REALSRC, VIS_BLOCKSIZE, REALSRC 137925cf1a30Sjl fsrc1 %f10, %f42 138025cf1a30Sjl fsrc1 %f12, %f44 138125cf1a30Sjl fsrc1 %f14, %f46 138225cf1a30Sjl stda %f32, [DST]ASI_BLK_P 138325cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 138425cf1a30Sjl ba,a,pt %ncc, .bcb_exit 138525cf1a30Sjl nop 138625cf1a30Sjl 138725cf1a30Sjl3: tst CNT 138825cf1a30Sjl bz,a,pt %ncc, .bcb_exit 138925cf1a30Sjl nop 139025cf1a30Sjl 139125cf1a30Sjl5: ldub [REALSRC], TMP 139225cf1a30Sjl inc REALSRC 139325cf1a30Sjl inc DST 139425cf1a30Sjl deccc CNT 139525cf1a30Sjl bgu %ncc, 5b 139625cf1a30Sjl stb TMP, [DST - 1] 139725cf1a30Sjl.bcb_exit: 139825cf1a30Sjl membar #Sync 139925cf1a30Sjl 140025cf1a30Sjl ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 140125cf1a30Sjl wr %o2, 0, %gsr 140225cf1a30Sjl 140325cf1a30Sjl ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 140425cf1a30Sjl btst FPRS_FEF, %o3 140525cf1a30Sjl bz,pt %icc, 4f 140625cf1a30Sjl nop 140725cf1a30Sjl 140825cf1a30Sjl BLD_FPQ1Q3_FROMSTACK(%o2) 140925cf1a30Sjl 141025cf1a30Sjl ba,pt %ncc, 2f 141125cf1a30Sjl wr %o3, 0, %fprs ! restore fprs 141225cf1a30Sjl4: 141325cf1a30Sjl FZEROQ1Q3 141425cf1a30Sjl wr %o3, 0, %fprs ! restore fprs 141525cf1a30Sjl2: 141625cf1a30Sjl membar #Sync ! sync error barrier 141725cf1a30Sjl andn %l6, MASK_FLAGS, %l6 141825cf1a30Sjl stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 141925cf1a30Sjl FP_ALLOWMIGRATE(5, 6) 142025cf1a30Sjl ret 142125cf1a30Sjl restore %g0, 0, %o0 142225cf1a30Sjl 142325cf1a30Sjl SET_SIZE(bcopy_more) 142425cf1a30Sjl 142525cf1a30Sjl/* 142625cf1a30Sjl * Block copy with possibly overlapped operands. 142725cf1a30Sjl */ 142825cf1a30Sjl 142925cf1a30Sjl ENTRY(ovbcopy) 143025cf1a30Sjl tst %o2 ! check count 143125cf1a30Sjl bgu,a %ncc, 1f ! nothing to do or bad arguments 143225cf1a30Sjl subcc %o0, %o1, %o3 ! difference of from and to address 143325cf1a30Sjl 143425cf1a30Sjl retl ! return 143525cf1a30Sjl nop 143625cf1a30Sjl1: 143725cf1a30Sjl bneg,a %ncc, 2f 143825cf1a30Sjl neg %o3 ! if < 0, make it positive 143925cf1a30Sjl2: cmp %o2, %o3 ! cmp size and abs(from - to) 144025cf1a30Sjl bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, 144125cf1a30Sjl .empty ! no overlap 144225cf1a30Sjl cmp %o0, %o1 ! compare from and to addresses 144325cf1a30Sjl blu %ncc, .ov_bkwd ! if from < to, copy backwards 144425cf1a30Sjl nop 144525cf1a30Sjl ! 144625cf1a30Sjl ! Copy forwards. 144725cf1a30Sjl ! 144825cf1a30Sjl.ov_fwd: 144925cf1a30Sjl ldub [%o0], %o3 ! read from address 145025cf1a30Sjl inc %o0 ! inc from address 145125cf1a30Sjl stb %o3, [%o1] ! write to address 145225cf1a30Sjl deccc %o2 ! dec count 145325cf1a30Sjl bgu %ncc, .ov_fwd ! loop till done 145425cf1a30Sjl inc %o1 ! inc to address 145525cf1a30Sjl 145625cf1a30Sjl retl ! return 145725cf1a30Sjl nop 145825cf1a30Sjl ! 145925cf1a30Sjl ! Copy backwards. 146025cf1a30Sjl ! 146125cf1a30Sjl.ov_bkwd: 146225cf1a30Sjl deccc %o2 ! dec count 146325cf1a30Sjl ldub [%o0 + %o2], %o3 ! get byte at end of src 146425cf1a30Sjl bgu %ncc, .ov_bkwd ! loop till done 146525cf1a30Sjl stb %o3, [%o1 + %o2] ! delay slot, store at end of dst 146625cf1a30Sjl 146725cf1a30Sjl retl ! return 146825cf1a30Sjl nop 146925cf1a30Sjl 147025cf1a30Sjl SET_SIZE(ovbcopy) 147125cf1a30Sjl /* ovbcopy: overlap-tolerant byte copy; when regions don't overlap it tail-calls bcopy, otherwise it copies forward or backward based on address order */ 147225cf1a30Sjl 147325cf1a30Sjl/* 147425cf1a30Sjl * hwblkpagecopy() 147525cf1a30Sjl * 147625cf1a30Sjl * Copies exactly one page. This routine assumes the caller (ppcopy) 147725cf1a30Sjl * has already disabled kernel preemption and has checked 147825cf1a30Sjl * use_hw_bcopy. Preventing preemption also prevents cpu migration. 147925cf1a30Sjl */ 148025cf1a30Sjl ENTRY(hwblkpagecopy) 148125cf1a30Sjl ! get another window w/space for three aligned blocks of saved fpregs 148225cf1a30Sjl prefetch [%o0], #n_reads 148325cf1a30Sjl save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 148425cf1a30Sjl 148525cf1a30Sjl ! %i0 - source address (arg) 148625cf1a30Sjl ! %i1 - destination address (arg) 148725cf1a30Sjl ! %i2 - length of region (not arg) 148825cf1a30Sjl ! %l0 - saved fprs 148925cf1a30Sjl ! %l1 - pointer to saved fpregs 149025cf1a30Sjl 149125cf1a30Sjl rd %fprs, %l0 !
check for unused fp 149225cf1a30Sjl btst FPRS_FEF, %l0 149325cf1a30Sjl bz,a,pt %icc, 1f 149425cf1a30Sjl wr %g0, FPRS_FEF, %fprs 149525cf1a30Sjl 149625cf1a30Sjl BST_FPQ1Q3_TOSTACK(%l1) 149725cf1a30Sjl 149825cf1a30Sjl1: set PAGESIZE, CNT 149925cf1a30Sjl mov REALSRC, SRC 150025cf1a30Sjl 150125cf1a30Sjl ldd [SRC], %f0 150225cf1a30Sjl prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 150325cf1a30Sjl ldd [SRC + 0x08], %f2 150425cf1a30Sjl prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 150525cf1a30Sjl fmovd %f0, %f32 150625cf1a30Sjl ldd [SRC + 0x10], %f4 1507c8a722abSpm prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 150825cf1a30Sjl fmovd %f2, %f34 150925cf1a30Sjl ldd [SRC + 0x18], %f6 151025cf1a30Sjl prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 151125cf1a30Sjl fmovd %f4, %f36 151225cf1a30Sjl ldd [SRC + 0x20], %f8 1513c8a722abSpm prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read 151425cf1a30Sjl fmovd %f6, %f38 151525cf1a30Sjl ldd [SRC + 0x28], %f10 1516c8a722abSpm prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read 151725cf1a30Sjl fmovd %f8, %f40 151825cf1a30Sjl ldd [SRC + 0x30], %f12 1519c8a722abSpm prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read 152025cf1a30Sjl fmovd %f10, %f42 152125cf1a30Sjl ldd [SRC + 0x38], %f14 152225cf1a30Sjl ldd [SRC + VIS_BLOCKSIZE], %f0 152325cf1a30Sjl sub CNT, VIS_BLOCKSIZE, CNT 152425cf1a30Sjl add SRC, VIS_BLOCKSIZE, SRC 1525c8a722abSpm prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read 152625cf1a30Sjl ba,pt %ncc, 2f 1527c8a722abSpm prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read 152825cf1a30Sjl .align 32 152925cf1a30Sjl2: 153025cf1a30Sjl ldd [SRC + 0x08], %f2 153125cf1a30Sjl fmovd %f12, %f44 153225cf1a30Sjl ldd [SRC + 0x10], %f4 153325cf1a30Sjl fmovd %f14, %f46 153425cf1a30Sjl stda %f32, [DST]ASI_BLK_P 153525cf1a30Sjl ldd [SRC + 0x18], %f6 153625cf1a30Sjl fmovd %f0, %f32 153725cf1a30Sjl ldd [SRC + 0x20], %f8 153825cf1a30Sjl fmovd %f2, %f34 153925cf1a30Sjl ldd [SRC + 0x28], %f10 154025cf1a30Sjl fmovd %f4, %f36 154125cf1a30Sjl ldd [SRC + 0x30], %f12 
154225cf1a30Sjl fmovd %f6, %f38 154325cf1a30Sjl ldd [SRC + 0x38], %f14 154425cf1a30Sjl fmovd %f8, %f40 154525cf1a30Sjl ldd [SRC + VIS_BLOCKSIZE], %f0 154625cf1a30Sjl fmovd %f10, %f42 154725cf1a30Sjl sub CNT, VIS_BLOCKSIZE, CNT 1548c8a722abSpm prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 154925cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 1550c8a722abSpm prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read 1551c8a722abSpm add SRC, VIS_BLOCKSIZE, SRC 155225cf1a30Sjl cmp CNT, VIS_BLOCKSIZE + 8 155325cf1a30Sjl bgu,pt %ncc, 2b 1554c8a722abSpm prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 155525cf1a30Sjl 155625cf1a30Sjl ! trailing block 155725cf1a30Sjl ldd [SRC + 0x08], %f2 155825cf1a30Sjl fsrc1 %f12, %f44 155925cf1a30Sjl ldd [SRC + 0x10], %f4 156025cf1a30Sjl fsrc1 %f14, %f46 156125cf1a30Sjl stda %f32, [DST]ASI_BLK_P 156225cf1a30Sjl ldd [SRC + 0x18], %f6 156325cf1a30Sjl fsrc1 %f0, %f32 156425cf1a30Sjl ldd [SRC + 0x20], %f8 156525cf1a30Sjl fsrc1 %f2, %f34 156625cf1a30Sjl ldd [SRC + 0x28], %f10 156725cf1a30Sjl fsrc1 %f4, %f36 156825cf1a30Sjl ldd [SRC + 0x30], %f12 156925cf1a30Sjl fsrc1 %f6, %f38 157025cf1a30Sjl ldd [SRC + 0x38], %f14 157125cf1a30Sjl fsrc1 %f8, %f40 157225cf1a30Sjl sub CNT, VIS_BLOCKSIZE, CNT 157325cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 157425cf1a30Sjl add SRC, VIS_BLOCKSIZE, SRC 157525cf1a30Sjl fsrc1 %f10, %f42 157625cf1a30Sjl fsrc1 %f12, %f44 157725cf1a30Sjl fsrc1 %f14, %f46 157825cf1a30Sjl stda %f32, [DST]ASI_BLK_P 157925cf1a30Sjl 158025cf1a30Sjl membar #Sync 158125cf1a30Sjl 158225cf1a30Sjl btst FPRS_FEF, %l0 158325cf1a30Sjl bz,pt %icc, 2f 158425cf1a30Sjl nop 158525cf1a30Sjl 158625cf1a30Sjl BLD_FPQ1Q3_FROMSTACK(%l3) 158725cf1a30Sjl ba 3f 158825cf1a30Sjl nop 158925cf1a30Sjl 159025cf1a30Sjl2: FZEROQ1Q3 159125cf1a30Sjl 159225cf1a30Sjl3: wr %l0, 0, %fprs ! 
restore fprs 159325cf1a30Sjl ret 159425cf1a30Sjl restore %g0, 0, %o0 159525cf1a30Sjl 159625cf1a30Sjl SET_SIZE(hwblkpagecopy) 159725cf1a30Sjl 159825cf1a30Sjl 159925cf1a30Sjl/* 160025cf1a30Sjl * Transfer data to and from user space - 160125cf1a30Sjl * Note that these routines can cause faults 160225cf1a30Sjl * It is assumed that the kernel has nothing at 160325cf1a30Sjl * less than KERNELBASE in the virtual address space. 160425cf1a30Sjl * 160525cf1a30Sjl * Note that copyin(9F) and copyout(9F) are part of the 160625cf1a30Sjl * DDI/DKI which specifies that they return '-1' on "errors." 160725cf1a30Sjl * 160825cf1a30Sjl * Sigh. 160925cf1a30Sjl * 161025cf1a30Sjl * So there's two extremely similar routines - xcopyin() and xcopyout() 161125cf1a30Sjl * which return the errno that we've faithfully computed. This 161225cf1a30Sjl * allows other callers (e.g. uiomove(9F)) to work correctly. 161325cf1a30Sjl * Given that these are used pretty heavily, we expand the calling 161425cf1a30Sjl * sequences inline for all flavours (rather than making wrappers). 161525cf1a30Sjl * 161625cf1a30Sjl * There are also stub routines for xcopyout_little and xcopyin_little, 161725cf1a30Sjl * which currently are intended to handle requests of <= 16 bytes from 161825cf1a30Sjl * do_unaligned. Future enhancement to make them handle 8k pages efficiently 161925cf1a30Sjl * is left as an exercise... 162025cf1a30Sjl */ 162125cf1a30Sjl 162225cf1a30Sjl/* 162325cf1a30Sjl * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 162425cf1a30Sjl * 162525cf1a30Sjl * General theory of operation: 162625cf1a30Sjl * 162725cf1a30Sjl * The only difference between copy{in,out} and 162825cf1a30Sjl * xcopy{in,out} is in the error handling routine they invoke 162925cf1a30Sjl * when a memory access error occurs. xcopyOP returns the errno 163025cf1a30Sjl * while copyOP returns -1 (see above). 
copy{in,out}_noerr set 163125cf1a30Sjl * a special flag (by oring the TRAMP_FLAG into the fault handler address) 163225cf1a30Sjl * if they are called with a fault handler already in place. That flag 163325cf1a30Sjl * causes the default handlers to trampoline to the previous handler 163425cf1a30Sjl * upon an error. 163525cf1a30Sjl * 163625cf1a30Sjl * None of the copyops routines grab a window until it's decided that 163725cf1a30Sjl * we need to do a HW block copy operation. This saves a window 163825cf1a30Sjl * spill/fill when we're called during socket ops. The typical IO 163925cf1a30Sjl * path won't cause spill/fill traps. 164025cf1a30Sjl * 164125cf1a30Sjl * This code uses a set of 4 limits for the maximum size that will 164225cf1a30Sjl * be copied given a particular input/output address alignment. 164325cf1a30Sjl * If the value for a particular limit is zero, the copy will be performed 164425cf1a30Sjl * by the plain copy loops rather than FPBLK. 164525cf1a30Sjl * 164625cf1a30Sjl * See the description of bcopy above for more details of the 164725cf1a30Sjl * data copying algorithm and the default limits. 164825cf1a30Sjl * 164925cf1a30Sjl */ 165025cf1a30Sjl 165125cf1a30Sjl/* 165225cf1a30Sjl * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 165325cf1a30Sjl */ 165425cf1a30Sjl 165525cf1a30Sjl/* 165625cf1a30Sjl * We save the arguments in the following registers in case of a fault: 165725cf1a30Sjl * kaddr - %l1 165825cf1a30Sjl * uaddr - %l2 165925cf1a30Sjl * count - %l3 166025cf1a30Sjl */ 166125cf1a30Sjl#define SAVE_SRC %l1 166225cf1a30Sjl#define SAVE_DST %l2 166325cf1a30Sjl#define SAVE_COUNT %l3 166425cf1a30Sjl 166525cf1a30Sjl#define SM_SAVE_SRC %g4 166625cf1a30Sjl#define SM_SAVE_DST %g5 166725cf1a30Sjl#define SM_SAVE_COUNT %o5 166825cf1a30Sjl#define ERRNO %l5 166925cf1a30Sjl 167025cf1a30Sjl 167125cf1a30Sjl#define REAL_LOFAULT %l4 167225cf1a30Sjl/* 167325cf1a30Sjl * Generic copyio fault handler. 
This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout. In order for this to function
 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
 * This allows us to share common code for all the flavors of the copy
 * operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 */
	ENTRY(copyio_fault)
	membar	#Sync
	mov	%g1,ERRNO		! save errno in ERRNO
	btst	FPUSED_FLAG, %l6	! did the copy path use the fp regs?
	bz	%ncc, 1f
	nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	! caller had live fp state: reload the saved quadrants
	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	! fp was unused on entry: zero the quadrants we dirtied
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	! restore the original (SRC, DST, COUNT) arguments so the real
	! handler can retry/vector via t_copyop with what the caller passed
	mov	SAVE_SRC, %i0
	mov	SAVE_DST, %i1
	jmp	REAL_LOFAULT
	mov	SAVE_COUNT, %i2

	SET_SIZE(copyio_fault)


	ENTRY(copyout)

	! Dispatch: pick leaf (small) vs FP-block (large) copy based on
	! length and on how well src/dst can be mutually aligned.
	! %o0 = kernel src, %o1 = user dst, %o2 = count.
	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyout_small	! go to small copy cases
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .copyout_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .copyout_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_2:
	btst	3, %o3			!
	bz,pt	%ncc, .copyout_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
	! Leaf-routine copy path.  NOTE: %o4 holds the saved t_lofault for
	! the whole leaf path (see register usage warning at top of file);
	! user-side stores go through ASI_USER.
.copyout_small:
	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
	or	%o5, %lo(.sm_copyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
.sm_do_copyout:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .co_sm_left	!
	mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .co_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
.co_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.co_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stba	%o3, [%o1]ASI_USER	! write byte
	inc	%o1			! advance DST by 1
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 2], %o3
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 1], %o3
	stba	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
	inc	%o1			! advance DST by 1
	add	%o2, 3, %o2		! restore count
.co_sm_left:
	tst	%o2
	bz,pt	%ncc, .co_sm_exit	! check for zero length
	nop
	ldub	[%o0], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .co_sm_exit
	stba	%o3,[%o1]ASI_USER	! store one byte
	ldub	[%o0 + 1], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3,[%o1]ASI_USER	! store second byte
	ldub	[%o0 + 2], %o3		! load third byte
	inc	%o1
	stba	%o3,[%o1]ASI_USER	! store third byte
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.co_sm_words:
	lduw	[%o0], %o3		! read word
.co_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 4, %o1		! update DST
	stwa	%o3, [%o1]ASI_USER	! write word
	bgt,pt	%ncc, .co_sm_words	! loop til done
	add	%o1, 4, %o1		! update DST
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
.co_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduh	[%o0], %o3		! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	stha	%o3, [%o1]ASI_USER	! write half word
	bgt,pt	%ncc, .co_sm_half	! loop til done
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
.co_sm_byte:
	ldub	[%o0], %o3
	stba	%o3, [%o1]ASI_USER
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align 16
	! Medium-length path: dispatch on the best mutual alignment of
	! src and dst (byte / half / word / long word).
.co_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .co_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .co_med_word	! word aligned
	nop
.co_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_long1	! word alignment
	nop
.co_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_long0
	dec	%o2
.co_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .co_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stwa	%o3, [%o1]ASI_USER	! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
	sub	%o1, 8, %o1		! adjust pointer to allow store in
					! branch delay slot instead of add
.co_med_lmove:
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 16], %o3
	add	%o1, 8, %o1		! advance DST by 8
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 8], %o3
	add	%o1, 8, %o1		! advance DST by 8
	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
	stxa	%o3, [%o1]ASI_USER
	add	%o1, 8, %o1		! advance DST by 8
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
	nop
.co_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.co_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align 16
	nop				! instruction alignment
					! see discussion at start of file
.co_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_word1	! word alignment
	nop
.co_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.co_med_wmove:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 8], %o3
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 4], %o3
	stwa	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .co_med_wextra	! check for more words to move
	nop
.co_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.co_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align 16
	nop				! instruction alignment
	nop				! see discussion at start of file
	nop
.co_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .co_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3,[%o1]ASI_USER	! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.co_med_hmove:
	lduh	[%o0], %o3		! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	stha	%o3, [%o1]ASI_USER	! write half word
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 4], %o3
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 2], %o3
	stha	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

/*
 * We got here because of a fault during short copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyout_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f		! if not, return error
	nop
	ldn	[%o3 + CP_COPYOUT], %o5	! if handler, invoke it with
	jmp	%o5			! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0		! return error value

	SET_SIZE(copyout)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that uses
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(copyout_more)
.copyout_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyout_err, REAL_LOFAULT

/*
 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyout:
	set	copyio_fault, %l7	! .copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	!
save orig %fprs 212525cf1a30Sjl btst FPRS_FEF, %o2 212625cf1a30Sjl bz,a,pt %icc, .do_blockcopyout 212725cf1a30Sjl wr %g0, FPRS_FEF, %fprs 212825cf1a30Sjl 212925cf1a30Sjl BST_FPQ2Q4_TOSTACK(%o2) 213025cf1a30Sjl 213125cf1a30Sjl.do_blockcopyout: 213225cf1a30Sjl rd %gsr, %o2 213325cf1a30Sjl stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 213425cf1a30Sjl or %l6, FPUSED_FLAG, %l6 213525cf1a30Sjl 213625cf1a30Sjl andcc DST, VIS_BLOCKSIZE - 1, TMP 213725cf1a30Sjl mov ASI_USER, %asi 213825cf1a30Sjl bz,pt %ncc, 2f 213925cf1a30Sjl neg TMP 214025cf1a30Sjl add TMP, VIS_BLOCKSIZE, TMP 214125cf1a30Sjl 214225cf1a30Sjl ! TMP = bytes required to align DST on FP_BLOCK boundary 214325cf1a30Sjl ! Using SRC as a tmp here 214425cf1a30Sjl cmp TMP, 3 214525cf1a30Sjl bleu,pt %ncc, 1f 214625cf1a30Sjl sub CNT,TMP,CNT ! adjust main count 214725cf1a30Sjl sub TMP, 3, TMP ! adjust for end of loop test 214825cf1a30Sjl.co_blkalign: 214925cf1a30Sjl ldub [REALSRC], SRC ! move 4 bytes per loop iteration 215025cf1a30Sjl stba SRC, [DST]%asi 215125cf1a30Sjl subcc TMP, 4, TMP 215225cf1a30Sjl ldub [REALSRC + 1], SRC 215325cf1a30Sjl add REALSRC, 4, REALSRC 215425cf1a30Sjl stba SRC, [DST + 1]%asi 215525cf1a30Sjl ldub [REALSRC - 2], SRC 215625cf1a30Sjl add DST, 4, DST 215725cf1a30Sjl stba SRC, [DST - 2]%asi 215825cf1a30Sjl ldub [REALSRC - 1], SRC 215925cf1a30Sjl bgu,pt %ncc, .co_blkalign 216025cf1a30Sjl stba SRC, [DST - 1]%asi 216125cf1a30Sjl 216225cf1a30Sjl addcc TMP, 3, TMP ! restore count adjustment 216325cf1a30Sjl bz,pt %ncc, 2f ! no bytes left? 216425cf1a30Sjl nop 216525cf1a30Sjl1: ldub [REALSRC], SRC 216625cf1a30Sjl inc REALSRC 216725cf1a30Sjl inc DST 216825cf1a30Sjl deccc TMP 216925cf1a30Sjl bgu %ncc, 1b 217025cf1a30Sjl stba SRC, [DST - 1]%asi 217125cf1a30Sjl 217225cf1a30Sjl2: 217325cf1a30Sjl membar #StoreLoad 217425cf1a30Sjl andn REALSRC, 0x7, SRC 217525cf1a30Sjl 217625cf1a30Sjl ! SRC - 8-byte aligned 217725cf1a30Sjl ! 
DST - 64-byte aligned 217825cf1a30Sjl ldd [SRC], %f16 217925cf1a30Sjl prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 218025cf1a30Sjl alignaddr REALSRC, %g0, %g0 218125cf1a30Sjl ldd [SRC + 0x08], %f18 218225cf1a30Sjl prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 218325cf1a30Sjl faligndata %f16, %f18, %f48 218425cf1a30Sjl ldd [SRC + 0x10], %f20 2185c8a722abSpm prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 218625cf1a30Sjl faligndata %f18, %f20, %f50 218725cf1a30Sjl ldd [SRC + 0x18], %f22 218825cf1a30Sjl prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 218925cf1a30Sjl faligndata %f20, %f22, %f52 219025cf1a30Sjl ldd [SRC + 0x20], %f24 2191c8a722abSpm prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read 219225cf1a30Sjl faligndata %f22, %f24, %f54 219325cf1a30Sjl ldd [SRC + 0x28], %f26 2194c8a722abSpm prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read 219525cf1a30Sjl faligndata %f24, %f26, %f56 219625cf1a30Sjl ldd [SRC + 0x30], %f28 2197c8a722abSpm prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read 219825cf1a30Sjl faligndata %f26, %f28, %f58 219925cf1a30Sjl ldd [SRC + 0x38], %f30 220025cf1a30Sjl ldd [SRC + VIS_BLOCKSIZE], %f16 220125cf1a30Sjl sub CNT, VIS_BLOCKSIZE, CNT 220225cf1a30Sjl add SRC, VIS_BLOCKSIZE, SRC 2203c8a722abSpm prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read 220425cf1a30Sjl add REALSRC, VIS_BLOCKSIZE, REALSRC 220525cf1a30Sjl ba,pt %ncc, 1f 2206c8a722abSpm prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read 220725cf1a30Sjl .align 32 220825cf1a30Sjl1: 220925cf1a30Sjl ldd [SRC + 0x08], %f18 221025cf1a30Sjl faligndata %f28, %f30, %f60 221125cf1a30Sjl ldd [SRC + 0x10], %f20 221225cf1a30Sjl faligndata %f30, %f16, %f62 221325cf1a30Sjl stda %f48, [DST]ASI_BLK_AIUS 221425cf1a30Sjl ldd [SRC + 0x18], %f22 221525cf1a30Sjl faligndata %f16, %f18, %f48 221625cf1a30Sjl ldd [SRC + 0x20], %f24 221725cf1a30Sjl faligndata %f18, %f20, %f50 221825cf1a30Sjl ldd [SRC + 0x28], %f26 221925cf1a30Sjl faligndata %f20, %f22, %f52 222025cf1a30Sjl ldd [SRC + 0x30], %f28 222125cf1a30Sjl 
faligndata %f22, %f24, %f54 2222c8a722abSpm sub CNT, VIS_BLOCKSIZE, CNT 222325cf1a30Sjl ldd [SRC + 0x38], %f30 222425cf1a30Sjl faligndata %f24, %f26, %f56 2225c8a722abSpm add DST, VIS_BLOCKSIZE, DST 222625cf1a30Sjl ldd [SRC + VIS_BLOCKSIZE], %f16 222725cf1a30Sjl faligndata %f26, %f28, %f58 222825cf1a30Sjl add REALSRC, VIS_BLOCKSIZE, REALSRC 2229c8a722abSpm prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 2230c8a722abSpm add SRC, VIS_BLOCKSIZE, SRC 2231c8a722abSpm prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read 223225cf1a30Sjl cmp CNT, VIS_BLOCKSIZE + 8 223325cf1a30Sjl bgu,pt %ncc, 1b 2234c8a722abSpm prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 223525cf1a30Sjl 223625cf1a30Sjl ! only if REALSRC & 0x7 is 0 223725cf1a30Sjl cmp CNT, VIS_BLOCKSIZE 223825cf1a30Sjl bne %ncc, 3f 223925cf1a30Sjl andcc REALSRC, 0x7, %g0 224025cf1a30Sjl bz,pt %ncc, 2f 224125cf1a30Sjl nop 224225cf1a30Sjl3: 224325cf1a30Sjl faligndata %f28, %f30, %f60 224425cf1a30Sjl faligndata %f30, %f16, %f62 224525cf1a30Sjl stda %f48, [DST]ASI_BLK_AIUS 224625cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 224725cf1a30Sjl ba,pt %ncc, 3f 224825cf1a30Sjl nop 224925cf1a30Sjl2: 225025cf1a30Sjl ldd [SRC + 0x08], %f18 225125cf1a30Sjl fsrc1 %f28, %f60 225225cf1a30Sjl ldd [SRC + 0x10], %f20 225325cf1a30Sjl fsrc1 %f30, %f62 225425cf1a30Sjl stda %f48, [DST]ASI_BLK_AIUS 225525cf1a30Sjl ldd [SRC + 0x18], %f22 225625cf1a30Sjl fsrc1 %f16, %f48 225725cf1a30Sjl ldd [SRC + 0x20], %f24 225825cf1a30Sjl fsrc1 %f18, %f50 225925cf1a30Sjl ldd [SRC + 0x28], %f26 226025cf1a30Sjl fsrc1 %f20, %f52 226125cf1a30Sjl ldd [SRC + 0x30], %f28 226225cf1a30Sjl fsrc1 %f22, %f54 226325cf1a30Sjl ldd [SRC + 0x38], %f30 226425cf1a30Sjl fsrc1 %f24, %f56 226525cf1a30Sjl sub CNT, VIS_BLOCKSIZE, CNT 226625cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 226725cf1a30Sjl add SRC, VIS_BLOCKSIZE, SRC 226825cf1a30Sjl add REALSRC, VIS_BLOCKSIZE, REALSRC 226925cf1a30Sjl fsrc1 %f26, %f58 227025cf1a30Sjl fsrc1 %f28, %f60 227125cf1a30Sjl fsrc1 
%f30, %f62 227225cf1a30Sjl stda %f48, [DST]ASI_BLK_AIUS 227325cf1a30Sjl add DST, VIS_BLOCKSIZE, DST 227425cf1a30Sjl ba,a,pt %ncc, 4f 227525cf1a30Sjl nop 227625cf1a30Sjl 227725cf1a30Sjl3: tst CNT 227825cf1a30Sjl bz,a %ncc, 4f 227925cf1a30Sjl nop 228025cf1a30Sjl 228125cf1a30Sjl5: ldub [REALSRC], TMP 228225cf1a30Sjl inc REALSRC 228325cf1a30Sjl inc DST 228425cf1a30Sjl deccc CNT 228525cf1a30Sjl bgu %ncc, 5b 228625cf1a30Sjl stba TMP, [DST - 1]%asi 228725cf1a30Sjl4: 228825cf1a30Sjl 228925cf1a30Sjl.copyout_exit: 229025cf1a30Sjl membar #Sync 229125cf1a30Sjl 229225cf1a30Sjl ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 229325cf1a30Sjl wr %o2, 0, %gsr ! restore gsr 229425cf1a30Sjl 229525cf1a30Sjl ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 229625cf1a30Sjl btst FPRS_FEF, %o3 229725cf1a30Sjl bz,pt %icc, 4f 229825cf1a30Sjl nop 229925cf1a30Sjl 230025cf1a30Sjl BLD_FPQ2Q4_FROMSTACK(%o2) 230125cf1a30Sjl 230225cf1a30Sjl ba,pt %ncc, 1f 230325cf1a30Sjl wr %o3, 0, %fprs ! restore fprs 230425cf1a30Sjl 230525cf1a30Sjl4: 230625cf1a30Sjl FZEROQ2Q4 230725cf1a30Sjl wr %o3, 0, %fprs ! restore fprs 230825cf1a30Sjl 230925cf1a30Sjl1: 231025cf1a30Sjl membar #Sync 231125cf1a30Sjl andn %l6, FPUSED_FLAG, %l6 231225cf1a30Sjl stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 231325cf1a30Sjl FP_ALLOWMIGRATE(5, 6) 231425cf1a30Sjl ret 231525cf1a30Sjl restore %g0, 0, %o0 231625cf1a30Sjl 231725cf1a30Sjl/* 231825cf1a30Sjl * We got here because of a fault during copyout. 231925cf1a30Sjl * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 232025cf1a30Sjl */ 232125cf1a30Sjl.copyout_err: 232225cf1a30Sjl ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 232325cf1a30Sjl tst %o4 232425cf1a30Sjl bz,pt %ncc, 2f ! if not, return error 232525cf1a30Sjl nop 232625cf1a30Sjl ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with 232725cf1a30Sjl jmp %g2 ! original arguments 232825cf1a30Sjl restore %g0, 0, %g0 ! 
dispose of copy window
232925cf1a30Sjl 2:
233025cf1a30Sjl	ret
233125cf1a30Sjl	restore %g0, -1, %o0		! return error value
233225cf1a30Sjl
233325cf1a30Sjl
233425cf1a30Sjl	SET_SIZE(copyout_more)
233525cf1a30Sjl
233625cf1a30Sjl
/*
 * NOTE(review): xcopyout() is the copyout() variant that, on fault,
 * returns the saved errno value (see "restore ERRNO, 0, %o0" at
 * .xcopyout_err below) rather than -1.  Entry dispatch mirrors the
 * pseudo-code at the top of this file: lengths <= VIS_COPY_THRESHOLD
 * take the leaf path (.xcopyout_small); otherwise the mutual
 * alignment of src/dst (xor in %o3) selects one of the
 * hw_copy_limit_{1,2,4,8} tunables, where a zero limit disables the
 * FP-block path and length <= limit also falls back to the leaf path.
 */
233725cf1a30Sjl	ENTRY(xcopyout)
233825cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
233925cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small	! go to larger cases
234025cf1a30Sjl	xor	%o0, %o1, %o3		! are src, dst alignable?
234125cf1a30Sjl	btst	7, %o3			!
234225cf1a30Sjl	bz,pt	%ncc, .xcopyout_8	!
234325cf1a30Sjl	nop
234425cf1a30Sjl	btst	1, %o3			!
234525cf1a30Sjl	bz,pt	%ncc, .xcopyout_2	! check for half-word
234625cf1a30Sjl	nop
234725cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
234825cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
234925cf1a30Sjl	tst	%o3
235025cf1a30Sjl	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
235125cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
235225cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small	! go to small copy
235325cf1a30Sjl	nop
235425cf1a30Sjl	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
235525cf1a30Sjl	nop
235625cf1a30Sjl.xcopyout_2:
235725cf1a30Sjl	btst	3, %o3			!
235825cf1a30Sjl	bz,pt	%ncc, .xcopyout_4	! check for word alignment
235925cf1a30Sjl	nop
236025cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
236125cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
236225cf1a30Sjl	tst	%o3
236325cf1a30Sjl	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
236425cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
236525cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small	! go to small copy
236625cf1a30Sjl	nop
236725cf1a30Sjl	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
236825cf1a30Sjl	nop
236925cf1a30Sjl.xcopyout_4:
237025cf1a30Sjl	! already checked longword, must be word aligned
237125cf1a30Sjl	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
237225cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
237325cf1a30Sjl	tst	%o3
237425cf1a30Sjl	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
237525cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
237625cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small	! go to small copy
237725cf1a30Sjl	nop
237825cf1a30Sjl	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
237925cf1a30Sjl	nop
238025cf1a30Sjl.xcopyout_8:
238125cf1a30Sjl	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
238225cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
238325cf1a30Sjl	tst	%o3
238425cf1a30Sjl	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
238525cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
238625cf1a30Sjl	bleu,pt	%ncc, .xcopyout_small	! go to small copy
238725cf1a30Sjl	nop
238825cf1a30Sjl	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
238925cf1a30Sjl	nop
239025cf1a30Sjl
	! Leaf path: %o4 carries the saved t_lofault (see the register-usage
	! warning at the top of the file); faults vector to .sm_xcopyout_err.
239125cf1a30Sjl.xcopyout_small:
239225cf1a30Sjl	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
239325cf1a30Sjl	or	%o5, %lo(.sm_xcopyout_err), %o5
239425cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
239525cf1a30Sjl	membar	#Sync			! sync error barrier
239625cf1a30Sjl	ba,pt	%ncc, .sm_do_copyout	! common code
239725cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
239825cf1a30Sjl
239925cf1a30Sjl.xcopyout_more:
240025cf1a30Sjl	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
240125cf1a30Sjl	sethi	%hi(.xcopyout_err), REAL_LOFAULT
240225cf1a30Sjl	ba,pt	%ncc, .do_copyout	! common code
240325cf1a30Sjl	or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
240425cf1a30Sjl
240525cf1a30Sjl/*
240625cf1a30Sjl * We got here because of fault during xcopyout
240725cf1a30Sjl * Errno value is in ERRNO
240825cf1a30Sjl */
240925cf1a30Sjl.xcopyout_err:
241025cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
241125cf1a30Sjl	tst	%o4
241225cf1a30Sjl	bz,pt	%ncc, 2f		! if not, return error
241325cf1a30Sjl	nop
241425cf1a30Sjl	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
241525cf1a30Sjl	jmp	%g2			! original arguments
241625cf1a30Sjl	restore %g0, 0, %g0		! dispose of copy window
241725cf1a30Sjl 2:
241825cf1a30Sjl	ret
241925cf1a30Sjl	restore ERRNO, 0, %o0		! return errno value
242025cf1a30Sjl
	! Leaf-path fault handler: restore t_lofault, reconstruct the original
	! (src, dst, count) arguments, then tail-call the copyops handler if
	! one is installed; otherwise return the errno left in %g1 by the trap.
242125cf1a30Sjl.sm_xcopyout_err:
242225cf1a30Sjl
242325cf1a30Sjl	membar	#Sync
242425cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
242525cf1a30Sjl	mov	SM_SAVE_SRC, %o0
242625cf1a30Sjl	mov	SM_SAVE_DST, %o1
242725cf1a30Sjl	mov	SM_SAVE_COUNT, %o2
242825cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
242925cf1a30Sjl	tst	%o3
243025cf1a30Sjl	bz,pt	%ncc, 3f		! if not, return error
243125cf1a30Sjl	nop
243225cf1a30Sjl	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
243325cf1a30Sjl	jmp	%o5			! original arguments
243425cf1a30Sjl	nop
243525cf1a30Sjl3:
243625cf1a30Sjl	retl
243725cf1a30Sjl	or	%g1, 0, %o0		! return errno value
243825cf1a30Sjl
243925cf1a30Sjl	SET_SIZE(xcopyout)
244025cf1a30Sjl
/*
 * NOTE(review): xcopyout_little() copies %o2 bytes to user space in
 * reverse byte order: the pointer arithmetic below starts at the last
 * source byte ("start w/last byte") and the negative index in %o3
 * walks the destination forward while the source walks backward, with
 * stores through the ASI_AIUSL alternate space.  Faults vector to
 * .xcopyio_err, which is not visible in this section of the file.
 */
244125cf1a30Sjl	ENTRY(xcopyout_little)
244225cf1a30Sjl	sethi	%hi(.xcopyio_err), %o5
244325cf1a30Sjl	or	%o5, %lo(.xcopyio_err), %o5
244425cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4
244525cf1a30Sjl	membar	#Sync			! sync error barrier
244625cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]
244725cf1a30Sjl	mov	%o4, %o5
244825cf1a30Sjl
244925cf1a30Sjl	subcc	%g0, %o2, %o3
245025cf1a30Sjl	add	%o0, %o2, %o0
245125cf1a30Sjl	bz,pn	%ncc, 2f		! check for zero bytes
245225cf1a30Sjl	sub	%o2, 1, %o4
245325cf1a30Sjl	add	%o0, %o4, %o0		! start w/last byte
245425cf1a30Sjl	add	%o1, %o2, %o1
245525cf1a30Sjl	ldub	[%o0 + %o3], %o4
245625cf1a30Sjl
245725cf1a30Sjl1:	stba	%o4, [%o1 + %o3]ASI_AIUSL
245825cf1a30Sjl	inccc	%o3
245925cf1a30Sjl	sub	%o0, 2, %o0		! get next byte
246025cf1a30Sjl	bcc,a,pt %ncc, 1b
246125cf1a30Sjl	ldub	[%o0 + %o3], %o4
246225cf1a30Sjl
246325cf1a30Sjl2:
246425cf1a30Sjl	membar	#Sync			! sync error barrier
246525cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
246625cf1a30Sjl	retl
246725cf1a30Sjl	mov	%g0, %o0		! return (0)
246825cf1a30Sjl
246925cf1a30Sjl	SET_SIZE(xcopyout_little)
247025cf1a30Sjl
247125cf1a30Sjl/*
247225cf1a30Sjl * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
247325cf1a30Sjl */
247425cf1a30Sjl
/*
 * NOTE(review): copyin() entry dispatch is the mirror image of
 * copyout()/xcopyout() above: leaf path for small or hw-copy-disabled
 * cases, FP-block path (.copyin_more) otherwise, selected by the
 * src^dst alignment and the hw_copy_limit_{1,2,4,8} tunables.
 * Returns 0 on success, -1 on fault (see .sm_copyin_err /
 * .copyin_err).
 */
247525cf1a30Sjl	ENTRY(copyin)
247625cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
247725cf1a30Sjl	bleu,pt	%ncc, .copyin_small	! go to larger cases
247825cf1a30Sjl	xor	%o0, %o1, %o3		! are src, dst alignable?
247925cf1a30Sjl	btst	7, %o3			!
248025cf1a30Sjl	bz,pt	%ncc, .copyin_8		! check for longword alignment
248125cf1a30Sjl	nop
248225cf1a30Sjl	btst	1, %o3			!
248325cf1a30Sjl	bz,pt	%ncc, .copyin_2		! check for half-word
248425cf1a30Sjl	nop
248525cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
248625cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
248725cf1a30Sjl	tst	%o3
248825cf1a30Sjl	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
248925cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
249025cf1a30Sjl	bleu,pt	%ncc, .copyin_small	! go to small copy
249125cf1a30Sjl	nop
249225cf1a30Sjl	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
249325cf1a30Sjl	nop
249425cf1a30Sjl.copyin_2:
249525cf1a30Sjl	btst	3, %o3			!
249625cf1a30Sjl	bz,pt	%ncc, .copyin_4		! check for word alignment
249725cf1a30Sjl	nop
249825cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
249925cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
250025cf1a30Sjl	tst	%o3
250125cf1a30Sjl	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
250225cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
250325cf1a30Sjl	bleu,pt	%ncc, .copyin_small	! go to small copy
250425cf1a30Sjl	nop
250525cf1a30Sjl	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
250625cf1a30Sjl	nop
250725cf1a30Sjl.copyin_4:
250825cf1a30Sjl	! already checked longword, must be word aligned
250925cf1a30Sjl	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
251025cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
251125cf1a30Sjl	tst	%o3
251225cf1a30Sjl	bz,pn	%icc, .copyin_small	!
if zero, disable HW copy
251325cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
251425cf1a30Sjl	bleu,pt	%ncc, .copyin_small	! go to small copy
251525cf1a30Sjl	nop
251625cf1a30Sjl	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
251725cf1a30Sjl	nop
251825cf1a30Sjl.copyin_8:
251925cf1a30Sjl	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
252025cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
252125cf1a30Sjl	tst	%o3
252225cf1a30Sjl	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
252325cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
252425cf1a30Sjl	bleu,pt	%ncc, .copyin_small	! go to small copy
252525cf1a30Sjl	nop
252625cf1a30Sjl	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
252725cf1a30Sjl	nop
252825cf1a30Sjl
252925cf1a30Sjl	.align	16
253025cf1a30Sjl	nop				! instruction alignment
253125cf1a30Sjl					! see discussion at start of file
	! Leaf path for short copyin()s.  Per the register-usage warning at
	! the top of the file, %o4 holds the saved t_lofault (no trampoline
	! flag here); user-side loads use the ASI_USER alternate space.
253225cf1a30Sjl.copyin_small:
253325cf1a30Sjl	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
253425cf1a30Sjl	or	%o5, %lo(.sm_copyin_err), %o5
253525cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
253625cf1a30Sjl	membar	#Sync			! sync error barrier
253725cf1a30Sjl	stn	%o5, [THREAD_REG + T_LOFAULT]
253825cf1a30Sjl.sm_do_copyin:
253925cf1a30Sjl	mov	%o0, SM_SAVE_SRC
254025cf1a30Sjl	mov	%o1, SM_SAVE_DST
254125cf1a30Sjl	cmp	%o2, SHORTCOPY		! check for really short case
254225cf1a30Sjl	bleu,pt	%ncc, .ci_sm_left	!
254325cf1a30Sjl	mov	%o2, SM_SAVE_COUNT
254425cf1a30Sjl	cmp	%o2, CHKSIZE		! check for medium length cases
254525cf1a30Sjl	bgu,pn	%ncc, .ci_med		!
254625cf1a30Sjl	or	%o0, %o1, %o3		! prepare alignment check
254725cf1a30Sjl	andcc	%o3, 0x3, %g0		! test for alignment
254825cf1a30Sjl	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
	! Byte-at-a-time copy, unrolled 4x per iteration.
254925cf1a30Sjl.ci_sm_movebytes:
255025cf1a30Sjl	sub	%o2, 3, %o2		! adjust count to allow cc zero test
255125cf1a30Sjl.ci_sm_notalign4:
255225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! read byte
255325cf1a30Sjl	subcc	%o2, 4, %o2		! reduce count by 4
255425cf1a30Sjl	stb	%o3, [%o1]		! write byte
255525cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
255625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
255725cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
255825cf1a30Sjl	stb	%o3, [%o1 + 1]
255925cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
256025cf1a30Sjl	lduba	[%o0]ASI_USER, %o3
256125cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
256225cf1a30Sjl	stb	%o3, [%o1 - 2]
256325cf1a30Sjl	lduba	[%o0]ASI_USER, %o3
256425cf1a30Sjl	add	%o0, 1, %o0		! advance SRC by 1
256525cf1a30Sjl	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
256625cf1a30Sjl	stb	%o3, [%o1 - 1]
256725cf1a30Sjl	add	%o2, 3, %o2		! restore count
	! Mop up the final 0-3 bytes, returning 0 inline when done.
256825cf1a30Sjl.ci_sm_left:
256925cf1a30Sjl	tst	%o2
257025cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
257125cf1a30Sjl	nop
257225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
257325cf1a30Sjl	deccc	%o2			! reduce count for cc test
257425cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
257525cf1a30Sjl	stb	%o3,[%o1]		! store one byte
257625cf1a30Sjl	inc	%o0
257725cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load second byte
257825cf1a30Sjl	deccc	%o2
257925cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
258025cf1a30Sjl	stb	%o3,[%o1 + 1]		! store second byte
258125cf1a30Sjl	inc	%o0
258225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load third byte
258325cf1a30Sjl	stb	%o3,[%o1 + 2]		! store third byte
258425cf1a30Sjl	membar	#Sync			! sync error barrier
258525cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
258625cf1a30Sjl	retl
258725cf1a30Sjl	mov	%g0, %o0		! return 0
258825cf1a30Sjl	.align	16
	! Word-at-a-time copy, unrolled 2x (8 bytes per iteration).
258925cf1a30Sjl.ci_sm_words:
259025cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
259125cf1a30Sjl.ci_sm_wordx:
259225cf1a30Sjl	subcc	%o2, 8, %o2		! update count
259325cf1a30Sjl	stw	%o3, [%o1]		! write word
259425cf1a30Sjl	add	%o0, 4, %o0		! update SRC
259525cf1a30Sjl	add	%o1, 8, %o1		! update DST
259625cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
259725cf1a30Sjl	add	%o0, 4, %o0		! update SRC
259825cf1a30Sjl	bgt,pt	%ncc, .ci_sm_words	! loop til done
259925cf1a30Sjl	stw	%o3, [%o1 - 4]		! write word
260025cf1a30Sjl	addcc	%o2, 7, %o2		! restore count
260125cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
260225cf1a30Sjl	nop
260325cf1a30Sjl	deccc	%o2
260425cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
260525cf1a30Sjl.ci_sm_half:
260625cf1a30Sjl	subcc	%o2, 2, %o2		! reduce count by 2
260725cf1a30Sjl	lduha	[%o0]ASI_USER, %o3	! read half word
260825cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
260925cf1a30Sjl	add	%o1, 2, %o1		! advance DST by 2
261025cf1a30Sjl	bgt,pt	%ncc, .ci_sm_half	! loop til done
261125cf1a30Sjl	sth	%o3, [%o1 - 2]		! write half word
261225cf1a30Sjl	addcc	%o2, 1, %o2		! restore count
261325cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
261425cf1a30Sjl	nop
261525cf1a30Sjl.ci_sm_byte:
261625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3
261725cf1a30Sjl	stb	%o3, [%o1]
261825cf1a30Sjl	membar	#Sync			! sync error barrier
261925cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
262025cf1a30Sjl	retl
262125cf1a30Sjl	mov	%g0, %o0		! return 0
262225cf1a30Sjl	.align	16
262325cf1a30Sjl.ci_sm_word:
262425cf1a30Sjl	subcc	%o2, 4, %o2		! update count
262525cf1a30Sjl	bgt,pt	%ncc, .ci_sm_wordx
262625cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
262725cf1a30Sjl	addcc	%o2, 3, %o2		! restore count
262825cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
262925cf1a30Sjl	stw	%o3, [%o1]		! write word
263025cf1a30Sjl	deccc	%o2			! reduce count for cc test
263125cf1a30Sjl	add	%o0, 4, %o0
263225cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
263325cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
263425cf1a30Sjl	stb	%o3, [%o1 + 4]		! store one byte
263525cf1a30Sjl	inc	%o0
263625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load second byte
263725cf1a30Sjl	deccc	%o2
263825cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
263925cf1a30Sjl	stb	%o3, [%o1 + 5]		! store second byte
264025cf1a30Sjl	inc	%o0
264125cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load third byte
264225cf1a30Sjl	stb	%o3, [%o1 + 6]		! store third byte
	! Common leaf-path success exit: restore t_lofault and return 0.
264325cf1a30Sjl.ci_sm_exit:
264425cf1a30Sjl	membar	#Sync			! sync error barrier
264525cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
264625cf1a30Sjl	retl
264725cf1a30Sjl	mov	%g0, %o0		! return 0
264825cf1a30Sjl
264925cf1a30Sjl	.align 16
	! Medium-length copies: dispatch on the mutual alignment of src and
	! dst (byte / halfword / word / longword paths below).
265025cf1a30Sjl.ci_med:
265125cf1a30Sjl	xor	%o0, %o1, %o3		! setup alignment check
265225cf1a30Sjl	btst	1, %o3
265325cf1a30Sjl	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
265425cf1a30Sjl	nop
265525cf1a30Sjl	btst	3, %o3
265625cf1a30Sjl	bnz,pt	%ncc, .ci_med_half	! halfword aligned
265725cf1a30Sjl	nop
265825cf1a30Sjl	btst	7, %o3
265925cf1a30Sjl	bnz,pt	%ncc, .ci_med_word	! word aligned
266025cf1a30Sjl	nop
266125cf1a30Sjl.ci_med_long:
266225cf1a30Sjl	btst	3, %o0			! check for
266325cf1a30Sjl	bz,pt	%ncc, .ci_med_long1	! word alignment
266425cf1a30Sjl	nop
266525cf1a30Sjl.ci_med_long0:
266625cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
266725cf1a30Sjl	inc	%o0
266825cf1a30Sjl	stb	%o3,[%o1]		! store byte
266925cf1a30Sjl	inc	%o1
267025cf1a30Sjl	btst	3, %o0
267125cf1a30Sjl	bnz,pt	%ncc, .ci_med_long0
267225cf1a30Sjl	dec	%o2
267325cf1a30Sjl.ci_med_long1:			! word aligned
267425cf1a30Sjl	btst	7, %o0			! check for long word
267525cf1a30Sjl	bz,pt	%ncc, .ci_med_long2
267625cf1a30Sjl	nop
267725cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! load word
267825cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
267925cf1a30Sjl	stw	%o3, [%o1]		! store word
268025cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
268125cf1a30Sjl	sub	%o2, 4, %o2		! reduce count by 4
268225cf1a30Sjl!
268325cf1a30Sjl!  Now long word aligned and have at least 32 bytes to move
268425cf1a30Sjl!
268525cf1a30Sjl.ci_med_long2:
268625cf1a30Sjl	sub	%o2, 31, %o2		! adjust count to allow cc zero test
268725cf1a30Sjl.ci_med_lmove:
268825cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3	! read long word
268925cf1a30Sjl	subcc	%o2, 32, %o2		! reduce count by 32
269025cf1a30Sjl	stx	%o3, [%o1]		! write long word
269125cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
269225cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
269325cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
269425cf1a30Sjl	stx	%o3, [%o1 + 8]
269525cf1a30Sjl	add	%o1, 32, %o1		! advance DST by 32
269625cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3
269725cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
269825cf1a30Sjl	stx	%o3, [%o1 - 16]
269925cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3
270025cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
270125cf1a30Sjl	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
270225cf1a30Sjl	stx	%o3, [%o1 - 8]
270325cf1a30Sjl	addcc	%o2, 24, %o2		! restore count to long word offset
270425cf1a30Sjl	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
270525cf1a30Sjl	nop
270625cf1a30Sjl.ci_med_lword:
270725cf1a30Sjl	ldxa	[%o0]ASI_USER, %o3	! read long word
270825cf1a30Sjl	subcc	%o2, 8, %o2		! reduce count by 8
270925cf1a30Sjl	stx	%o3, [%o1]		! write long word
271025cf1a30Sjl	add	%o0, 8, %o0		! advance SRC by 8
271125cf1a30Sjl	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
271225cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
271325cf1a30Sjl.ci_med_lextra:
271425cf1a30Sjl	addcc	%o2, 7, %o2		! restore rest of count
271525cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
271625cf1a30Sjl	deccc	%o2
271725cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
271825cf1a30Sjl	nop
271925cf1a30Sjl	ba,pt	%ncc, .ci_sm_half
272025cf1a30Sjl	nop
272125cf1a30Sjl
272225cf1a30Sjl	.align 16
272325cf1a30Sjl	nop				! instruction alignment
272425cf1a30Sjl					! see discussion at start of file
272525cf1a30Sjl.ci_med_word:
272625cf1a30Sjl	btst	3, %o0			! check for
272725cf1a30Sjl	bz,pt	%ncc, .ci_med_word1	! word alignment
272825cf1a30Sjl	nop
272925cf1a30Sjl.ci_med_word0:
273025cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
273125cf1a30Sjl	inc	%o0
273225cf1a30Sjl	stb	%o3,[%o1]		! store byte
273325cf1a30Sjl	inc	%o1
273425cf1a30Sjl	btst	3, %o0
273525cf1a30Sjl	bnz,pt	%ncc, .ci_med_word0
273625cf1a30Sjl	dec	%o2
273725cf1a30Sjl!
273825cf1a30Sjl!  Now word aligned and have at least 36 bytes to move
273925cf1a30Sjl!
274025cf1a30Sjl.ci_med_word1:
274125cf1a30Sjl	sub	%o2, 15, %o2		! adjust count to allow cc zero test
274225cf1a30Sjl.ci_med_wmove:
274325cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
274425cf1a30Sjl	subcc	%o2, 16, %o2		! reduce count by 16
274525cf1a30Sjl	stw	%o3, [%o1]		! write word
274625cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
274725cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
274825cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
274925cf1a30Sjl	stw	%o3, [%o1 + 4]
275025cf1a30Sjl	add	%o1, 16, %o1		! advance DST by 16
275125cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3
275225cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
275325cf1a30Sjl	stw	%o3, [%o1 - 8]
275425cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3
275525cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
275625cf1a30Sjl	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
275725cf1a30Sjl	stw	%o3, [%o1 - 4]
275825cf1a30Sjl	addcc	%o2, 12, %o2		! restore count to word offset
275925cf1a30Sjl	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
276025cf1a30Sjl	nop
276125cf1a30Sjl.ci_med_word2:
276225cf1a30Sjl	lduwa	[%o0]ASI_USER, %o3	! read word
276325cf1a30Sjl	subcc	%o2, 4, %o2		! reduce count by 4
276425cf1a30Sjl	stw	%o3, [%o1]		! write word
276525cf1a30Sjl	add	%o0, 4, %o0		! advance SRC by 4
276625cf1a30Sjl	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
276725cf1a30Sjl	add	%o1, 4, %o1		! advance DST by 4
276825cf1a30Sjl.ci_med_wextra:
276925cf1a30Sjl	addcc	%o2, 3, %o2		! restore rest of count
277025cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
277125cf1a30Sjl	deccc	%o2
277225cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
277325cf1a30Sjl	nop
277425cf1a30Sjl	ba,pt	%ncc, .ci_sm_half
277525cf1a30Sjl	nop
277625cf1a30Sjl
277725cf1a30Sjl	.align 16
277825cf1a30Sjl	nop				! instruction alignment
277925cf1a30Sjl					! see discussion at start of file
278025cf1a30Sjl.ci_med_half:
278125cf1a30Sjl	btst	1, %o0			! check for
278225cf1a30Sjl	bz,pt	%ncc, .ci_med_half1	! half word alignment
278325cf1a30Sjl	nop
278425cf1a30Sjl	lduba	[%o0]ASI_USER, %o3	! load one byte
278525cf1a30Sjl	inc	%o0
278625cf1a30Sjl	stb	%o3,[%o1]		! store byte
278725cf1a30Sjl	inc	%o1
278825cf1a30Sjl	dec	%o2
278925cf1a30Sjl!
279025cf1a30Sjl!  Now half word aligned and have at least 38 bytes to move
279125cf1a30Sjl!
279225cf1a30Sjl.ci_med_half1:
279325cf1a30Sjl	sub	%o2, 7, %o2		! adjust count to allow cc zero test
279425cf1a30Sjl.ci_med_hmove:
279525cf1a30Sjl	lduha	[%o0]ASI_USER, %o3	! read half word
279625cf1a30Sjl	subcc	%o2, 8, %o2		! reduce count by 8
279725cf1a30Sjl	sth	%o3, [%o1]		! write half word
279825cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
279925cf1a30Sjl	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
280025cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
280125cf1a30Sjl	sth	%o3, [%o1 + 2]
280225cf1a30Sjl	add	%o1, 8, %o1		! advance DST by 8
280325cf1a30Sjl	lduha	[%o0]ASI_USER, %o3
280425cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
280525cf1a30Sjl	sth	%o3, [%o1 - 4]
280625cf1a30Sjl	lduha	[%o0]ASI_USER, %o3
280725cf1a30Sjl	add	%o0, 2, %o0		! advance SRC by 2
280825cf1a30Sjl	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
280925cf1a30Sjl	sth	%o3, [%o1 - 2]
281025cf1a30Sjl	addcc	%o2, 7, %o2		! restore count
281125cf1a30Sjl	bz,pt	%ncc, .ci_sm_exit
281225cf1a30Sjl	deccc	%o2
281325cf1a30Sjl	bz,pt	%ncc, .ci_sm_byte
281425cf1a30Sjl	nop
281525cf1a30Sjl	ba,pt	%ncc, .ci_sm_half
281625cf1a30Sjl	nop
281725cf1a30Sjl
	! Leaf-path fault handler: restore t_lofault, reconstruct the
	! original (src, dst, count) arguments, and tail-call the copyops
	! handler if installed; otherwise return -1 per DDI convention.
281825cf1a30Sjl.sm_copyin_err:
281925cf1a30Sjl	membar	#Sync
282025cf1a30Sjl	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
282125cf1a30Sjl	mov	SM_SAVE_SRC, %o0
282225cf1a30Sjl	mov	SM_SAVE_DST, %o1
282325cf1a30Sjl	mov	SM_SAVE_COUNT, %o2
282425cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
282525cf1a30Sjl	tst	%o3
282625cf1a30Sjl	bz,pt	%ncc, 3f		! if not, return error
282725cf1a30Sjl	nop
282825cf1a30Sjl	ldn	[%o3 + CP_COPYIN], %o5	! if handler, invoke it with
282925cf1a30Sjl	jmp	%o5			! original arguments
283025cf1a30Sjl	nop
283125cf1a30Sjl3:
283225cf1a30Sjl	retl
283325cf1a30Sjl	or	%g0, -1, %o0		! return errno value
283425cf1a30Sjl
283525cf1a30Sjl	SET_SIZE(copyin)
283625cf1a30Sjl
283725cf1a30Sjl
283825cf1a30Sjl/*
283925cf1a30Sjl * The _more entry points are not intended to be used directly by
284025cf1a30Sjl * any caller from outside this file.  They are provided to allow
284125cf1a30Sjl * profiling and dtrace of the portions of the copy code that uses
284225cf1a30Sjl * the floating point registers.
284325cf1a30Sjl * This entry is particularly important as DTRACE (at least as of
284425cf1a30Sjl * 4/2004) does not support leaf functions.
284525cf1a30Sjl */
284625cf1a30Sjl
284725cf1a30Sjl	ENTRY(copyin_more)
284825cf1a30Sjl.copyin_more:
284925cf1a30Sjl	prefetch [%o0], #n_reads
285025cf1a30Sjl	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
285125cf1a30Sjl	set	.copyin_err, REAL_LOFAULT
285225cf1a30Sjl
285325cf1a30Sjl/*
285425cf1a30Sjl * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
285525cf1a30Sjl */
285625cf1a30Sjl.do_copyin:
285725cf1a30Sjl	set	copyio_fault, %l7	! .copyio_fault is lofault val
285825cf1a30Sjl
285925cf1a30Sjl	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
286025cf1a30Sjl	membar	#Sync			! sync error barrier
286125cf1a30Sjl	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
286225cf1a30Sjl
286325cf1a30Sjl	mov	%i0, SAVE_SRC
286425cf1a30Sjl	mov	%i1, SAVE_DST
286525cf1a30Sjl	mov	%i2, SAVE_COUNT
286625cf1a30Sjl
286725cf1a30Sjl	FP_NOMIGRATE(6, 7)
286825cf1a30Sjl
	! Save the caller's FP state (%fprs, and the Q2/Q4 FP registers if
	! the FPU was already live) so it can be restored at .copyin_exit.
286925cf1a30Sjl	rd	%fprs, %o2		! check for unused fp
287025cf1a30Sjl	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
287125cf1a30Sjl	btst	FPRS_FEF, %o2
287225cf1a30Sjl	bz,a,pt	%icc, .do_blockcopyin
287325cf1a30Sjl	wr	%g0, FPRS_FEF, %fprs
287425cf1a30Sjl
287525cf1a30Sjl	BST_FPQ2Q4_TOSTACK(%o2)
287625cf1a30Sjl
287725cf1a30Sjl.do_blockcopyin:
287825cf1a30Sjl	rd	%gsr, %o2
287925cf1a30Sjl	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	! Tag the saved lofault value in %l6 so the fault/exit paths know
	! FP state must be restored (FPUSED_FLAG is stripped at .copyin_exit).
288025cf1a30Sjl	or	%l6, FPUSED_FLAG, %l6
288125cf1a30Sjl
288225cf1a30Sjl	andcc	DST, VIS_BLOCKSIZE - 1, TMP
288325cf1a30Sjl	mov	ASI_USER, %asi
288425cf1a30Sjl	bz,pt	%ncc, 2f
288525cf1a30Sjl	neg	TMP
288625cf1a30Sjl	add	TMP, VIS_BLOCKSIZE, TMP
288725cf1a30Sjl
288825cf1a30Sjl	! TMP = bytes required to align DST on FP_BLOCK boundary
288925cf1a30Sjl	! Using SRC as a tmp here
289025cf1a30Sjl	cmp	TMP, 3
289125cf1a30Sjl	bleu,pt	%ncc, 1f
289225cf1a30Sjl	sub	CNT,TMP,CNT		! adjust main count
289325cf1a30Sjl	sub	TMP, 3, TMP		! adjust for end of loop test
289425cf1a30Sjl.ci_blkalign:
289525cf1a30Sjl	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
289625cf1a30Sjl	stb	SRC, [DST]
289725cf1a30Sjl	subcc	TMP, 4, TMP
289825cf1a30Sjl	lduba	[REALSRC + 1]%asi, SRC
289925cf1a30Sjl	add	REALSRC, 4, REALSRC
290025cf1a30Sjl	stb	SRC, [DST + 1]
290125cf1a30Sjl	lduba	[REALSRC - 2]%asi, SRC
290225cf1a30Sjl	add	DST, 4, DST
290325cf1a30Sjl	stb	SRC, [DST - 2]
290425cf1a30Sjl	lduba	[REALSRC - 1]%asi, SRC
290525cf1a30Sjl	bgu,pt	%ncc, .ci_blkalign
290625cf1a30Sjl	stb	SRC, [DST - 1]
290725cf1a30Sjl
290825cf1a30Sjl	addcc	TMP, 3, TMP		! restore count adjustment
290925cf1a30Sjl	bz,pt	%ncc, 2f		! no bytes left?
291025cf1a30Sjl	nop
291125cf1a30Sjl1:	lduba	[REALSRC]%asi, SRC
291225cf1a30Sjl	inc	REALSRC
291325cf1a30Sjl	inc	DST
291425cf1a30Sjl	deccc	TMP
291525cf1a30Sjl	bgu	%ncc, 1b
291625cf1a30Sjl	stb	SRC, [DST - 1]
291725cf1a30Sjl
291825cf1a30Sjl2:
291925cf1a30Sjl	membar	#StoreLoad
292025cf1a30Sjl	andn	REALSRC, 0x7, SRC
292125cf1a30Sjl
292225cf1a30Sjl	! SRC - 8-byte aligned
292325cf1a30Sjl	! DST - 64-byte aligned
	! Prime the software pipeline: load the first 64-byte block from
	! user space (ldda via %asi = ASI_USER), set the GSR alignment with
	! alignaddr, and pre-compute the first realigned block in %f48-%f62.
292425cf1a30Sjl	ldda	[SRC]%asi, %f16
292525cf1a30Sjl	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
292625cf1a30Sjl	alignaddr REALSRC, %g0, %g0
292725cf1a30Sjl	ldda	[SRC + 0x08]%asi, %f18
292825cf1a30Sjl	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
292925cf1a30Sjl	faligndata %f16, %f18, %f48
293025cf1a30Sjl	ldda	[SRC + 0x10]%asi, %f20
2931c8a722abSpm	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
293225cf1a30Sjl	faligndata %f18, %f20, %f50
293325cf1a30Sjl	ldda	[SRC + 0x18]%asi, %f22
293425cf1a30Sjl	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
293525cf1a30Sjl	faligndata %f20, %f22, %f52
293625cf1a30Sjl	ldda	[SRC + 0x20]%asi, %f24
2937c8a722abSpm	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
293825cf1a30Sjl	faligndata %f22, %f24, %f54
293925cf1a30Sjl	ldda	[SRC + 0x28]%asi, %f26
2940c8a722abSpm	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
294125cf1a30Sjl	faligndata %f24, %f26, %f56
294225cf1a30Sjl	ldda	[SRC + 0x30]%asi, %f28
2943c8a722abSpm	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
294425cf1a30Sjl	faligndata %f26, %f28, %f58
294525cf1a30Sjl	ldda	[SRC + 0x38]%asi, %f30
294625cf1a30Sjl	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
294725cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
294825cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
2949c8a722abSpm	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
295025cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
295125cf1a30Sjl	ba,pt	%ncc, 1f
2952c8a722abSpm	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
295325cf1a30Sjl	.align	32
	! Main software-pipelined loop: load the next block from user space,
	! realign it with faligndata, and store the previously realigned
	! block to the kernel buffer with a block store (stda ASI_BLK_P).
295425cf1a30Sjl1:
295525cf1a30Sjl	ldda	[SRC + 0x08]%asi, %f18
295625cf1a30Sjl	faligndata %f28, %f30, %f60
295725cf1a30Sjl	ldda	[SRC + 0x10]%asi, %f20
295825cf1a30Sjl	faligndata %f30, %f16, %f62
295925cf1a30Sjl	stda	%f48, [DST]ASI_BLK_P
296025cf1a30Sjl	ldda	[SRC + 0x18]%asi, %f22
296125cf1a30Sjl	faligndata %f16, %f18, %f48
296225cf1a30Sjl	ldda	[SRC + 0x20]%asi, %f24
296325cf1a30Sjl	faligndata %f18, %f20, %f50
296425cf1a30Sjl	ldda	[SRC + 0x28]%asi, %f26
296525cf1a30Sjl	faligndata %f20, %f22, %f52
296625cf1a30Sjl	ldda	[SRC + 0x30]%asi, %f28
296725cf1a30Sjl	faligndata %f22, %f24, %f54
2968c8a722abSpm	sub	CNT, VIS_BLOCKSIZE, CNT
296925cf1a30Sjl	ldda	[SRC + 0x38]%asi, %f30
297025cf1a30Sjl	faligndata %f24, %f26, %f56
2971c8a722abSpm	add	DST, VIS_BLOCKSIZE, DST
297225cf1a30Sjl	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
297325cf1a30Sjl	faligndata %f26, %f28, %f58
297425cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2975c8a722abSpm	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
2976c8a722abSpm	add	SRC, VIS_BLOCKSIZE, SRC
2977c8a722abSpm	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
297825cf1a30Sjl	cmp	CNT, VIS_BLOCKSIZE + 8
297925cf1a30Sjl	bgu,pt	%ncc, 1b
2980c8a722abSpm	prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
298125cf1a30Sjl
298225cf1a30Sjl	! only if REALSRC & 0x7 is 0
298325cf1a30Sjl	cmp	CNT, VIS_BLOCKSIZE
298425cf1a30Sjl	bne	%ncc, 3f
298525cf1a30Sjl	andcc	REALSRC, 0x7, %g0
298625cf1a30Sjl	bz,pt	%ncc, 2f
298725cf1a30Sjl	nop
	! Drain the pipeline: flush the final realigned block (misaligned
	! source case), leaving any sub-block remainder for the byte loop.
298825cf1a30Sjl3:
298925cf1a30Sjl	faligndata %f28, %f30, %f60
299025cf1a30Sjl	faligndata %f30, %f16, %f62
299125cf1a30Sjl	stda	%f48, [DST]ASI_BLK_P
299225cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
299325cf1a30Sjl	ba,pt	%ncc, 3f
299425cf1a30Sjl	nop
	! 8-byte-aligned source with exactly one block left: copy the last
	! two blocks with fsrc1 moves (no realignment needed).
299525cf1a30Sjl2:
299625cf1a30Sjl	ldda	[SRC + 0x08]%asi, %f18
299725cf1a30Sjl	fsrc1	%f28, %f60
299825cf1a30Sjl	ldda	[SRC + 0x10]%asi, %f20
299925cf1a30Sjl	fsrc1	%f30, %f62
300025cf1a30Sjl	stda	%f48, [DST]ASI_BLK_P
300125cf1a30Sjl	ldda	[SRC + 0x18]%asi, %f22
300225cf1a30Sjl	fsrc1	%f16, %f48
300325cf1a30Sjl	ldda	[SRC + 0x20]%asi, %f24
300425cf1a30Sjl	fsrc1	%f18, %f50
300525cf1a30Sjl	ldda	[SRC + 0x28]%asi, %f26
300625cf1a30Sjl	fsrc1	%f20, %f52
300725cf1a30Sjl	ldda	[SRC + 0x30]%asi, %f28
300825cf1a30Sjl	fsrc1	%f22, %f54
300925cf1a30Sjl	ldda	[SRC + 0x38]%asi, %f30
301025cf1a30Sjl	fsrc1	%f24, %f56
301125cf1a30Sjl	sub	CNT, VIS_BLOCKSIZE, CNT
301225cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
301325cf1a30Sjl	add	SRC, VIS_BLOCKSIZE, SRC
301425cf1a30Sjl	add	REALSRC, VIS_BLOCKSIZE, REALSRC
301525cf1a30Sjl	fsrc1	%f26, %f58
301625cf1a30Sjl	fsrc1	%f28, %f60
301725cf1a30Sjl	fsrc1	%f30, %f62
301825cf1a30Sjl	stda	%f48, [DST]ASI_BLK_P
301925cf1a30Sjl	add	DST, VIS_BLOCKSIZE, DST
302025cf1a30Sjl	ba,a,pt	%ncc, 4f
302125cf1a30Sjl	nop
302225cf1a30Sjl
302325cf1a30Sjl3:	tst	CNT
302425cf1a30Sjl	bz,a	%ncc, 4f
302525cf1a30Sjl	nop
302625cf1a30Sjl
	! Trailing bytes (< one block): plain byte loop from user space.
302725cf1a30Sjl5:	lduba	[REALSRC]ASI_USER, TMP
302825cf1a30Sjl	inc	REALSRC
302925cf1a30Sjl	inc	DST
303025cf1a30Sjl	deccc	CNT
303125cf1a30Sjl	bgu	%ncc, 5b
303225cf1a30Sjl	stb	TMP, [DST - 1]
303325cf1a30Sjl4:
303425cf1a30Sjl
	! Success exit: restore %gsr, the caller's FP registers (or zero
	! Q2/Q4 if the FPU was previously unused), %fprs, and t_lofault,
	! then return 0.
303525cf1a30Sjl.copyin_exit:
303625cf1a30Sjl	membar	#Sync
303725cf1a30Sjl
303825cf1a30Sjl	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
303925cf1a30Sjl	wr	%o2, 0, %gsr
304025cf1a30Sjl
304125cf1a30Sjl	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
304225cf1a30Sjl	btst	FPRS_FEF, %o3
304325cf1a30Sjl	bz,pt	%icc, 4f
304425cf1a30Sjl	nop
304525cf1a30Sjl
304625cf1a30Sjl	BLD_FPQ2Q4_FROMSTACK(%o2)
304725cf1a30Sjl
304825cf1a30Sjl	ba,pt	%ncc, 1f
304925cf1a30Sjl	wr	%o3, 0, %fprs		! restore fprs
305025cf1a30Sjl
305125cf1a30Sjl4:
305225cf1a30Sjl	FZEROQ2Q4
305325cf1a30Sjl	wr	%o3, 0, %fprs		! restore fprs
305425cf1a30Sjl
305525cf1a30Sjl1:
305625cf1a30Sjl	membar	#Sync			! sync error barrier
305725cf1a30Sjl	andn	%l6, FPUSED_FLAG, %l6
305825cf1a30Sjl	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
305925cf1a30Sjl	FP_ALLOWMIGRATE(5, 6)
306025cf1a30Sjl	ret
306125cf1a30Sjl	restore	%g0, 0, %o0
306225cf1a30Sjl/*
306325cf1a30Sjl * We got here because of a fault during copyin
306425cf1a30Sjl * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
306525cf1a30Sjl */
306625cf1a30Sjl.copyin_err:
306725cf1a30Sjl	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
306825cf1a30Sjl	tst	%o4
306925cf1a30Sjl	bz,pt	%ncc, 2f		! if not, return error
307025cf1a30Sjl	nop
307125cf1a30Sjl	ldn	[%o4 + CP_COPYIN], %g2	! if handler, invoke it with
307225cf1a30Sjl	jmp	%g2			! original arguments
307325cf1a30Sjl	restore %g0, 0, %g0		! dispose of copy window
307425cf1a30Sjl2:
307525cf1a30Sjl	ret
307625cf1a30Sjl	restore %g0, -1, %o0		! return error value
307725cf1a30Sjl
307825cf1a30Sjl
307925cf1a30Sjl	SET_SIZE(copyin_more)
308025cf1a30Sjl
308125cf1a30Sjl	ENTRY(xcopyin)
308225cf1a30Sjl
308325cf1a30Sjl	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
308425cf1a30Sjl	bleu,pt	%ncc, .xcopyin_small	! go to larger cases
308525cf1a30Sjl	xor	%o0, %o1, %o3		! are src, dst alignable?
308625cf1a30Sjl	btst	7, %o3			!
308725cf1a30Sjl	bz,pt	%ncc, .xcopyin_8	! check for longword alignment
308825cf1a30Sjl	nop
308925cf1a30Sjl	btst	1, %o3			!
309025cf1a30Sjl	bz,pt	%ncc, .xcopyin_2	! check for half-word
309125cf1a30Sjl	nop
309225cf1a30Sjl	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
309325cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
309425cf1a30Sjl	tst	%o3
309525cf1a30Sjl	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
309625cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
309725cf1a30Sjl	bleu,pt	%ncc, .xcopyin_small	! go to small copy
309825cf1a30Sjl	nop
309925cf1a30Sjl	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
310025cf1a30Sjl	nop
310125cf1a30Sjl.xcopyin_2:
310225cf1a30Sjl	btst	3, %o3			!
310325cf1a30Sjl	bz,pt	%ncc, .xcopyin_4	! check for word alignment
310425cf1a30Sjl	nop
310525cf1a30Sjl	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
310625cf1a30Sjl	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
310725cf1a30Sjl	tst	%o3
310825cf1a30Sjl	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
310925cf1a30Sjl	cmp	%o2, %o3		! if length <= limit
311025cf1a30Sjl	bleu,pt	%ncc, .xcopyin_small	! go to small copy
311125cf1a30Sjl	nop
311225cf1a30Sjl	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
311325cf1a30Sjl	nop
311425cf1a30Sjl.xcopyin_4:
311525cf1a30Sjl	! already checked longword, must be word aligned
311625cf1a30Sjl	sethi	%hi(hw_copy_limit_4), %o3	!
Check copy limit 311725cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_4)], %o3 311825cf1a30Sjl tst %o3 311925cf1a30Sjl bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 312025cf1a30Sjl cmp %o2, %o3 ! if length <= limit 312125cf1a30Sjl bleu,pt %ncc, .xcopyin_small ! go to small copy 312225cf1a30Sjl nop 312325cf1a30Sjl ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 312425cf1a30Sjl nop 312525cf1a30Sjl.xcopyin_8: 312625cf1a30Sjl sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 312725cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_8)], %o3 312825cf1a30Sjl tst %o3 312925cf1a30Sjl bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 313025cf1a30Sjl cmp %o2, %o3 ! if length <= limit 313125cf1a30Sjl bleu,pt %ncc, .xcopyin_small ! go to small copy 313225cf1a30Sjl nop 313325cf1a30Sjl ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 313425cf1a30Sjl nop 313525cf1a30Sjl 313625cf1a30Sjl.xcopyin_small: 313725cf1a30Sjl sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value 313825cf1a30Sjl or %o5, %lo(.sm_xcopyin_err), %o5 313925cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul 314025cf1a30Sjl membar #Sync ! sync error barrier 314125cf1a30Sjl ba,pt %ncc, .sm_do_copyin ! common code 314225cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] 314325cf1a30Sjl 314425cf1a30Sjl.xcopyin_more: 314525cf1a30Sjl save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 314625cf1a30Sjl sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value 314725cf1a30Sjl ba,pt %ncc, .do_copyin 314825cf1a30Sjl or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 314925cf1a30Sjl 315025cf1a30Sjl/* 315125cf1a30Sjl * We got here because of fault during xcopyin 315225cf1a30Sjl * Errno value is in ERRNO 315325cf1a30Sjl */ 315425cf1a30Sjl.xcopyin_err: 315525cf1a30Sjl ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 315625cf1a30Sjl tst %o4 315725cf1a30Sjl bz,pt %ncc, 2f ! if not, return error 315825cf1a30Sjl nop 315925cf1a30Sjl ldn [%o4 + CP_XCOPYIN], %g2 ! 
if handler, invoke it with 316025cf1a30Sjl jmp %g2 ! original arguments 316125cf1a30Sjl restore %g0, 0, %g0 ! dispose of copy window 316225cf1a30Sjl2: 316325cf1a30Sjl ret 316425cf1a30Sjl restore ERRNO, 0, %o0 ! return errno value 316525cf1a30Sjl 316625cf1a30Sjl.sm_xcopyin_err: 316725cf1a30Sjl 316825cf1a30Sjl membar #Sync 316925cf1a30Sjl stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 317025cf1a30Sjl mov SM_SAVE_SRC, %o0 317125cf1a30Sjl mov SM_SAVE_DST, %o1 317225cf1a30Sjl mov SM_SAVE_COUNT, %o2 317325cf1a30Sjl ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 317425cf1a30Sjl tst %o3 317525cf1a30Sjl bz,pt %ncc, 3f ! if not, return error 317625cf1a30Sjl nop 317725cf1a30Sjl ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with 317825cf1a30Sjl jmp %o5 ! original arguments 317925cf1a30Sjl nop 318025cf1a30Sjl3: 318125cf1a30Sjl retl 318225cf1a30Sjl or %g1, 0, %o0 ! return errno value 318325cf1a30Sjl 318425cf1a30Sjl SET_SIZE(xcopyin) 318525cf1a30Sjl 318625cf1a30Sjl ENTRY(xcopyin_little) 318725cf1a30Sjl sethi %hi(.xcopyio_err), %o5 318825cf1a30Sjl or %o5, %lo(.xcopyio_err), %o5 318925cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %o4 319025cf1a30Sjl membar #Sync ! sync error barrier 319125cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] 319225cf1a30Sjl mov %o4, %o5 319325cf1a30Sjl 319425cf1a30Sjl subcc %g0, %o2, %o3 319525cf1a30Sjl add %o0, %o2, %o0 319625cf1a30Sjl bz,pn %ncc, 2f ! check for zero bytes 319725cf1a30Sjl sub %o2, 1, %o4 319825cf1a30Sjl add %o0, %o4, %o0 ! start w/last byte 319925cf1a30Sjl add %o1, %o2, %o1 320025cf1a30Sjl lduba [%o0 + %o3]ASI_AIUSL, %o4 320125cf1a30Sjl 320225cf1a30Sjl1: stb %o4, [%o1 + %o3] 320325cf1a30Sjl inccc %o3 320425cf1a30Sjl sub %o0, 2, %o0 ! get next byte 320525cf1a30Sjl bcc,a,pt %ncc, 1b 320625cf1a30Sjl lduba [%o0 + %o3]ASI_AIUSL, %o4 320725cf1a30Sjl 320825cf1a30Sjl2: 320925cf1a30Sjl membar #Sync ! sync error barrier 321025cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault 321125cf1a30Sjl retl 321225cf1a30Sjl mov %g0, %o0 ! return (0) 321325cf1a30Sjl 321425cf1a30Sjl.xcopyio_err: 321525cf1a30Sjl membar #Sync ! sync error barrier 321625cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 321725cf1a30Sjl retl 321825cf1a30Sjl mov %g1, %o0 321925cf1a30Sjl 322025cf1a30Sjl SET_SIZE(xcopyin_little) 322125cf1a30Sjl 322225cf1a30Sjl 322325cf1a30Sjl/* 322425cf1a30Sjl * Copy a block of storage - must not overlap (from + len <= to). 322525cf1a30Sjl * No fault handler installed (to be called under on_fault()) 322625cf1a30Sjl */ 322725cf1a30Sjl ENTRY(copyin_noerr) 322825cf1a30Sjl 322925cf1a30Sjl cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 323025cf1a30Sjl bleu,pt %ncc, .copyin_ne_small ! go to larger cases 323125cf1a30Sjl xor %o0, %o1, %o3 ! are src, dst alignable? 323225cf1a30Sjl btst 7, %o3 ! 323325cf1a30Sjl bz,pt %ncc, .copyin_ne_8 ! check for longword alignment 323425cf1a30Sjl nop 323525cf1a30Sjl btst 1, %o3 ! 323625cf1a30Sjl bz,pt %ncc, .copyin_ne_2 ! check for half-word 323725cf1a30Sjl nop 323825cf1a30Sjl sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 323925cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_1)], %o3 324025cf1a30Sjl tst %o3 324125cf1a30Sjl bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 324225cf1a30Sjl cmp %o2, %o3 ! if length <= limit 324325cf1a30Sjl bleu,pt %ncc, .copyin_ne_small ! go to small copy 324425cf1a30Sjl nop 324525cf1a30Sjl ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 324625cf1a30Sjl nop 324725cf1a30Sjl.copyin_ne_2: 324825cf1a30Sjl btst 3, %o3 ! 324925cf1a30Sjl bz,pt %ncc, .copyin_ne_4 ! check for word alignment 325025cf1a30Sjl nop 325125cf1a30Sjl sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 325225cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_2)], %o3 325325cf1a30Sjl tst %o3 325425cf1a30Sjl bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 325525cf1a30Sjl cmp %o2, %o3 ! if length <= limit 325625cf1a30Sjl bleu,pt %ncc, .copyin_ne_small ! 
go to small copy 325725cf1a30Sjl nop 325825cf1a30Sjl ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 325925cf1a30Sjl nop 326025cf1a30Sjl.copyin_ne_4: 326125cf1a30Sjl ! already checked longword, must be word aligned 326225cf1a30Sjl sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 326325cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_4)], %o3 326425cf1a30Sjl tst %o3 326525cf1a30Sjl bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 326625cf1a30Sjl cmp %o2, %o3 ! if length <= limit 326725cf1a30Sjl bleu,pt %ncc, .copyin_ne_small ! go to small copy 326825cf1a30Sjl nop 326925cf1a30Sjl ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 327025cf1a30Sjl nop 327125cf1a30Sjl.copyin_ne_8: 327225cf1a30Sjl sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 327325cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_8)], %o3 327425cf1a30Sjl tst %o3 327525cf1a30Sjl bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 327625cf1a30Sjl cmp %o2, %o3 ! if length <= limit 327725cf1a30Sjl bleu,pt %ncc, .copyin_ne_small ! go to small copy 327825cf1a30Sjl nop 327925cf1a30Sjl ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 328025cf1a30Sjl nop 328125cf1a30Sjl 328225cf1a30Sjl.copyin_ne_small: 328325cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %o4 328425cf1a30Sjl tst %o4 328525cf1a30Sjl bz,pn %ncc, .sm_do_copyin 328625cf1a30Sjl nop 328725cf1a30Sjl sethi %hi(.sm_copyio_noerr), %o5 328825cf1a30Sjl or %o5, %lo(.sm_copyio_noerr), %o5 328925cf1a30Sjl membar #Sync ! sync error barrier 329025cf1a30Sjl ba,pt %ncc, .sm_do_copyin 329125cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] ! 
set/save t_lofault 329225cf1a30Sjl 329325cf1a30Sjl.copyin_noerr_more: 329425cf1a30Sjl save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 329525cf1a30Sjl sethi %hi(.copyio_noerr), REAL_LOFAULT 329625cf1a30Sjl ba,pt %ncc, .do_copyin 329725cf1a30Sjl or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 329825cf1a30Sjl 329925cf1a30Sjl.copyio_noerr: 330025cf1a30Sjl jmp %l6 330125cf1a30Sjl restore %g0,0,%g0 330225cf1a30Sjl 330325cf1a30Sjl.sm_copyio_noerr: 330425cf1a30Sjl membar #Sync 330525cf1a30Sjl stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault 330625cf1a30Sjl jmp %o4 330725cf1a30Sjl nop 330825cf1a30Sjl 330925cf1a30Sjl SET_SIZE(copyin_noerr) 331025cf1a30Sjl 331125cf1a30Sjl/* 331225cf1a30Sjl * Copy a block of storage - must not overlap (from + len <= to). 331325cf1a30Sjl * No fault handler installed (to be called under on_fault()) 331425cf1a30Sjl */ 331525cf1a30Sjl 331625cf1a30Sjl ENTRY(copyout_noerr) 331725cf1a30Sjl 331825cf1a30Sjl cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 331925cf1a30Sjl bleu,pt %ncc, .copyout_ne_small ! go to larger cases 332025cf1a30Sjl xor %o0, %o1, %o3 ! are src, dst alignable? 332125cf1a30Sjl btst 7, %o3 ! 332225cf1a30Sjl bz,pt %ncc, .copyout_ne_8 ! check for longword alignment 332325cf1a30Sjl nop 332425cf1a30Sjl btst 1, %o3 ! 332525cf1a30Sjl bz,pt %ncc, .copyout_ne_2 ! check for half-word 332625cf1a30Sjl nop 332725cf1a30Sjl sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 332825cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_1)], %o3 332925cf1a30Sjl tst %o3 333025cf1a30Sjl bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 333125cf1a30Sjl cmp %o2, %o3 ! if length <= limit 333225cf1a30Sjl bleu,pt %ncc, .copyout_ne_small ! go to small copy 333325cf1a30Sjl nop 333425cf1a30Sjl ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 333525cf1a30Sjl nop 333625cf1a30Sjl.copyout_ne_2: 333725cf1a30Sjl btst 3, %o3 ! 333825cf1a30Sjl bz,pt %ncc, .copyout_ne_4 ! 
check for word alignment 333925cf1a30Sjl nop 334025cf1a30Sjl sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 334125cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_2)], %o3 334225cf1a30Sjl tst %o3 334325cf1a30Sjl bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 334425cf1a30Sjl cmp %o2, %o3 ! if length <= limit 334525cf1a30Sjl bleu,pt %ncc, .copyout_ne_small ! go to small copy 334625cf1a30Sjl nop 334725cf1a30Sjl ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 334825cf1a30Sjl nop 334925cf1a30Sjl.copyout_ne_4: 335025cf1a30Sjl ! already checked longword, must be word aligned 335125cf1a30Sjl sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 335225cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_4)], %o3 335325cf1a30Sjl tst %o3 335425cf1a30Sjl bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 335525cf1a30Sjl cmp %o2, %o3 ! if length <= limit 335625cf1a30Sjl bleu,pt %ncc, .copyout_ne_small ! go to small copy 335725cf1a30Sjl nop 335825cf1a30Sjl ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 335925cf1a30Sjl nop 336025cf1a30Sjl.copyout_ne_8: 336125cf1a30Sjl sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 336225cf1a30Sjl ld [%o3 + %lo(hw_copy_limit_8)], %o3 336325cf1a30Sjl tst %o3 336425cf1a30Sjl bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 336525cf1a30Sjl cmp %o2, %o3 ! if length <= limit 336625cf1a30Sjl bleu,pt %ncc, .copyout_ne_small ! go to small copy 336725cf1a30Sjl nop 336825cf1a30Sjl ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 336925cf1a30Sjl nop 337025cf1a30Sjl 337125cf1a30Sjl.copyout_ne_small: 337225cf1a30Sjl ldn [THREAD_REG + T_LOFAULT], %o4 337325cf1a30Sjl tst %o4 337425cf1a30Sjl bz,pn %ncc, .sm_do_copyout 337525cf1a30Sjl nop 337625cf1a30Sjl sethi %hi(.sm_copyio_noerr), %o5 337725cf1a30Sjl or %o5, %lo(.sm_copyio_noerr), %o5 337825cf1a30Sjl membar #Sync ! sync error barrier 337925cf1a30Sjl ba,pt %ncc, .sm_do_copyout 338025cf1a30Sjl stn %o5, [THREAD_REG + T_LOFAULT] ! 
set/save t_lofault 338125cf1a30Sjl 338225cf1a30Sjl.copyout_noerr_more: 338325cf1a30Sjl save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 338425cf1a30Sjl sethi %hi(.copyio_noerr), REAL_LOFAULT 338525cf1a30Sjl ba,pt %ncc, .do_copyout 338625cf1a30Sjl or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 338725cf1a30Sjl 338825cf1a30Sjl SET_SIZE(copyout_noerr) 338925cf1a30Sjl 339025cf1a30Sjl 339125cf1a30Sjl/* 339225cf1a30Sjl * hwblkclr - clears block-aligned, block-multiple-sized regions that are 339325cf1a30Sjl * longer than 256 bytes in length using spitfire's block stores. If 339425cf1a30Sjl * the criteria for using this routine are not met then it calls bzero 339525cf1a30Sjl * and returns 1. Otherwise 0 is returned indicating success. 339625cf1a30Sjl * Caller is responsible for ensuring use_hw_bzero is true and that 339725cf1a30Sjl * kpreempt_disable() has been called. 339825cf1a30Sjl */ 339925cf1a30Sjl ! %i0 - start address 340025cf1a30Sjl ! %i1 - length of region (multiple of 64) 340125cf1a30Sjl ! %l0 - saved fprs 340225cf1a30Sjl ! %l1 - pointer to saved %d0 block 340325cf1a30Sjl ! %l2 - saved curthread->t_lwp 340425cf1a30Sjl 340525cf1a30Sjl ENTRY(hwblkclr) 340625cf1a30Sjl ! get another window w/space for one aligned block of saved fpregs 340725cf1a30Sjl save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp 340825cf1a30Sjl 340925cf1a30Sjl ! Must be block-aligned 341025cf1a30Sjl andcc %i0, (VIS_BLOCKSIZE-1), %g0 341125cf1a30Sjl bnz,pn %ncc, 1f 341225cf1a30Sjl nop 341325cf1a30Sjl 341425cf1a30Sjl ! ... and must be 256 bytes or more 341525cf1a30Sjl cmp %i1, 256 341625cf1a30Sjl blu,pn %ncc, 1f 341725cf1a30Sjl nop 341825cf1a30Sjl 341925cf1a30Sjl ! ... and length must be a multiple of VIS_BLOCKSIZE 342025cf1a30Sjl andcc %i1, (VIS_BLOCKSIZE-1), %g0 342125cf1a30Sjl bz,pn %ncc, 2f 342225cf1a30Sjl nop 342325cf1a30Sjl 342425cf1a30Sjl1: ! 
punt, call bzero but notify the caller that bzero was used 342525cf1a30Sjl mov %i0, %o0 342625cf1a30Sjl call bzero 342725cf1a30Sjl mov %i1, %o1 342825cf1a30Sjl ret 342925cf1a30Sjl restore %g0, 1, %o0 ! return (1) - did not use block operations 343025cf1a30Sjl 343125cf1a30Sjl2: rd %fprs, %l0 ! check for unused fp 343225cf1a30Sjl btst FPRS_FEF, %l0 343325cf1a30Sjl bz,pt %icc, 1f 343425cf1a30Sjl nop 343525cf1a30Sjl 343625cf1a30Sjl ! save in-use fpregs on stack 343725cf1a30Sjl membar #Sync 343825cf1a30Sjl add %fp, STACK_BIAS - 65, %l1 343925cf1a30Sjl and %l1, -VIS_BLOCKSIZE, %l1 344025cf1a30Sjl stda %d0, [%l1]ASI_BLK_P 344125cf1a30Sjl 344225cf1a30Sjl1: membar #StoreStore|#StoreLoad|#LoadStore 344325cf1a30Sjl wr %g0, FPRS_FEF, %fprs 344425cf1a30Sjl wr %g0, ASI_BLK_P, %asi 344525cf1a30Sjl 344625cf1a30Sjl ! Clear block 344725cf1a30Sjl fzero %d0 344825cf1a30Sjl fzero %d2 344925cf1a30Sjl fzero %d4 345025cf1a30Sjl fzero %d6 345125cf1a30Sjl fzero %d8 345225cf1a30Sjl fzero %d10 345325cf1a30Sjl fzero %d12 345425cf1a30Sjl fzero %d14 345525cf1a30Sjl 345625cf1a30Sjl mov 256, %i3 345725cf1a30Sjl ba,pt %ncc, .pz_doblock 345825cf1a30Sjl nop 345925cf1a30Sjl 346025cf1a30Sjl.pz_blkstart: 346125cf1a30Sjl ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here 346225cf1a30Sjl stda %d0, [%i0 + 128]%asi 346325cf1a30Sjl stda %d0, [%i0 + 64]%asi 346425cf1a30Sjl stda %d0, [%i0]%asi 346525cf1a30Sjl.pz_zinst: 346625cf1a30Sjl add %i0, %i3, %i0 346725cf1a30Sjl sub %i1, %i3, %i1 346825cf1a30Sjl.pz_doblock: 346925cf1a30Sjl cmp %i1, 256 347025cf1a30Sjl bgeu,a %ncc, .pz_blkstart 347125cf1a30Sjl stda %d0, [%i0 + 192]%asi 347225cf1a30Sjl 347325cf1a30Sjl cmp %i1, 64 347425cf1a30Sjl blu %ncc, .pz_finish 347525cf1a30Sjl 347625cf1a30Sjl andn %i1, (64-1), %i3 347725cf1a30Sjl srl %i3, 4, %i2 ! 
using blocks, 1 instr / 16 words 347825cf1a30Sjl set .pz_zinst, %i4 347925cf1a30Sjl sub %i4, %i2, %i4 348025cf1a30Sjl jmp %i4 348125cf1a30Sjl nop 348225cf1a30Sjl 348325cf1a30Sjl.pz_finish: 348425cf1a30Sjl membar #Sync 348525cf1a30Sjl btst FPRS_FEF, %l0 348625cf1a30Sjl bz,a .pz_finished 348725cf1a30Sjl wr %l0, 0, %fprs ! restore fprs 348825cf1a30Sjl 348925cf1a30Sjl ! restore fpregs from stack 349025cf1a30Sjl ldda [%l1]ASI_BLK_P, %d0 349125cf1a30Sjl membar #Sync 349225cf1a30Sjl wr %l0, 0, %fprs ! restore fprs 349325cf1a30Sjl 349425cf1a30Sjl.pz_finished: 349525cf1a30Sjl ret 349625cf1a30Sjl restore %g0, 0, %o0 ! return (bzero or not) 349725cf1a30Sjl 349825cf1a30Sjl SET_SIZE(hwblkclr) 34999b0bb795SJohn Levon 350025cf1a30Sjl /* 350125cf1a30Sjl * Copy 32 bytes of data from src (%o0) to dst (%o1) 350225cf1a30Sjl * using physical addresses. 350325cf1a30Sjl */ 350425cf1a30Sjl ENTRY_NP(hw_pa_bcopy32) 350525cf1a30Sjl rdpr %pstate, %g1 350625cf1a30Sjl andn %g1, PSTATE_IE, %g2 350725cf1a30Sjl wrpr %g0, %g2, %pstate 350825cf1a30Sjl 350925cf1a30Sjl rdpr %pstate, %g0 351025cf1a30Sjl ldxa [%o0]ASI_MEM, %o2 351125cf1a30Sjl add %o0, 8, %o0 351225cf1a30Sjl ldxa [%o0]ASI_MEM, %o3 351325cf1a30Sjl add %o0, 8, %o0 351425cf1a30Sjl ldxa [%o0]ASI_MEM, %o4 351525cf1a30Sjl add %o0, 8, %o0 351625cf1a30Sjl ldxa [%o0]ASI_MEM, %o5 351725cf1a30Sjl membar #Sync 351825cf1a30Sjl 351925cf1a30Sjl stxa %o2, [%o1]ASI_MEM 352025cf1a30Sjl add %o1, 8, %o1 352125cf1a30Sjl stxa %o3, [%o1]ASI_MEM 352225cf1a30Sjl add %o1, 8, %o1 352325cf1a30Sjl stxa %o4, [%o1]ASI_MEM 352425cf1a30Sjl add %o1, 8, %o1 352525cf1a30Sjl stxa %o5, [%o1]ASI_MEM 352625cf1a30Sjl 352725cf1a30Sjl retl 352825cf1a30Sjl wrpr %g0, %g1, %pstate 352925cf1a30Sjl 353025cf1a30Sjl SET_SIZE(hw_pa_bcopy32) 353125cf1a30Sjl 353225cf1a30Sjl DGDEF(use_hw_bcopy) 353325cf1a30Sjl .word 1 353425cf1a30Sjl DGDEF(use_hw_bzero) 353525cf1a30Sjl .word 1 353625cf1a30Sjl DGDEF(hw_copy_limit_1) 353725cf1a30Sjl .word 0 353825cf1a30Sjl DGDEF(hw_copy_limit_2) 
353925cf1a30Sjl .word 0 354025cf1a30Sjl DGDEF(hw_copy_limit_4) 354125cf1a30Sjl .word 0 354225cf1a30Sjl DGDEF(hw_copy_limit_8) 354325cf1a30Sjl .word 0 354425cf1a30Sjl 354525cf1a30Sjl .align 64 354625cf1a30Sjl .section ".text" 3547