1/*-
2 * Copyright (c) 2001 Jake Burkholder.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <machine/asm.h>
28__FBSDID("$FreeBSD$");
29
30#include "opt_kstack_pages.h"
31
32#include <sys/errno.h>
33
34#include <machine/asi.h>
35#include <machine/asmacros.h>
36#include <machine/fsr.h>
37#include <machine/intr_machdep.h>
38#include <machine/pcb.h>
39#include <machine/pstate.h>
40#include <machine/wstate.h>
41
42#include "assym.inc"
43
44	.register %g2, #ignore
45	.register %g3, #ignore
46	.register %g6, #ignore
47
48/*
49 * Common code for copy routines.
50 *
51 * We use large macros to generate functions for each of the copy routines.
52 * This allows the load and store instructions to be generated for the right
53 * operation, asi or not.  It is possible to write an asi independent function
54 * but this would require 2 expensive wrs in the main loop to switch %asi.
55 * It would also screw up profiling (if we ever get it), but may save some I$.
56 * We assume that either one of dasi and sasi is empty, or that they are both
57 * the same (empty or non-empty).  It is up to the caller to set %asi.
58 */
59
/*
 * ASI independent implementation of copystr(9).
 * Used to implement copyinstr() and copystr().
 *
 * Copy at most len bytes from src to dst, stopping once the nul
 * terminator has been copied.  If done is non-NULL, the number of
 * bytes transferred (including the nul on success) is stored there.
 *
 * Return value is in %g1: 0 on success (the nul byte just copied),
 * ENAMETOOLONG if len was exhausted before a nul was seen.
 * Clobbers %g1 and %g2.
 */
#define	_COPYSTR(src, dst, len, done, sa, sasi, da, dasi) \
	brz	len, 4f ; \
	 mov	src, %g2 ; \
1:	deccc	1, len ; \
	bl,a,pn	%xcc, 2f ; \
	 nop ; \
	LD(ub, sa) [src] sasi, %g1 ; \
	ST(b, da) %g1, [dst] dasi ; \
	brz,pn	%g1, 3f ; \
	 inc	src ; \
	ba	%xcc, 1b ; \
	 inc	dst ; \
2:	mov	ENAMETOOLONG, %g1 ; \
3:	sub	src, %g2, %g2 ; \
	brnz,a	done, 4f ; \
	 stx	%g2, [done] ; \
4:
83
/*
 * ASI independent implementation of memset(3).
 * Used to implement bzero(), memset() and aszero().
 *
 * If the pattern is non-zero, duplicate it to fill 64 bits.
 * Store bytes until dst is 8-byte aligned, then store 8 bytes.
 * It has yet to be determined how much unrolling is beneficial.
 * Could also read and compare before writing to minimize snoop traffic.
 *
 * Local labels: 1: byte stores until dst is 8-byte aligned;
 * 2: unrolled 4 x 8-byte stores; 3: single 8-byte stores;
 * 4: trailing byte stores; 5: done.  Clobbers %g1.
 *
 * XXX bzero() should be implemented as
 * #define bzero(dst, len) (void)memset((dst), 0, (len))
 * if at all.
 */
#define	_MEMSET(dst, pat, len, da, dasi) \
	brlez,pn len, 5f ; \
	 and	pat, 0xff, pat ; \
	brz,pt	pat, 1f ; \
	 sllx	pat, 8, %g1 ; \
	/* Replicate the low byte of pat into all 8 byte lanes. */ \
	or	pat, %g1, pat ; \
	sllx	pat, 16, %g1 ; \
	or	pat, %g1, pat ; \
	sllx	pat, 32, %g1 ; \
	or	pat, %g1, pat ; \
	.align 16 ; \
1:	deccc	1, len ; \
	bl,pn	%xcc, 5f ; \
	 btst	7, dst ; \
	bz,a,pt	%xcc, 2f ; \
	 inc	1, len ; \
	ST(b, da) pat, [dst] dasi ; \
	ba	%xcc, 1b ; \
	 inc	dst ; \
	.align 16 ; \
2:	deccc	32, len ; \
	bl,a,pn	%xcc, 3f ; \
	 inc	32, len ; \
	ST(x, da) pat, [dst] dasi ; \
	ST(x, da) pat, [dst + 8] dasi ; \
	ST(x, da) pat, [dst + 16] dasi ; \
	ST(x, da) pat, [dst + 24] dasi ; \
	ba	%xcc, 2b ; \
	 inc	32, dst ; \
	.align 16 ; \
3:	deccc	8, len ; \
	bl,a,pn	%xcc, 4f ; \
	 inc	8, len ; \
	ST(x, da) pat, [dst] dasi ; \
	ba	%xcc, 3b ; \
	 inc	8, dst ; \
	.align 16 ; \
4:	deccc	1, len ; \
	bl,a,pn	%xcc, 5f ; \
	 nop ; \
	ST(b, da) pat, [dst] dasi ; \
	ba	%xcc, 4b ; \
	 inc	1, dst ; \
5:
141
/*
 * ASI independent implementation of memcpy(3).
 * Used to implement bcopy(), copyin(), copyout(), memcpy(), ascopy(),
 * ascopyfrom() and ascopyto().
 *
 * Transfer bytes until dst is 8-byte aligned.  If src is then also 8 byte
 * aligned, transfer 8 bytes, otherwise finish with bytes.  The unaligned
 * case could be optimized, but it is expected that this is the uncommon
 * case and of questionable value.  The code to do so is also rather large
 * and ugly.  It has yet to be determined how much unrolling is beneficial.
 *
 * Local labels: 1: byte copies until dst is 8-byte aligned; 2: dispatch
 * on src alignment; 3: unrolled 4 x 8-byte copies; 4: single 8-byte
 * copies; 5: trailing (or unaligned-src) byte copies; 6: done.
 * Clobbers %g1-%g4.
 *
 * XXX bcopy() must also check for overlap.  This is stupid.
 * XXX bcopy() should be implemented as
 * #define bcopy(src, dst, len) (void)memcpy((dst), (src), (len))
 * if at all.
 */
#define	_MEMCPY(dst, src, len, da, dasi, sa, sasi) \
1:	deccc	1, len ; \
	bl,pn	%xcc, 6f ; \
	 btst	7, dst ; \
	bz,a,pt	%xcc, 2f ; \
	 inc	1, len ; \
	LD(ub, sa) [src] sasi, %g1 ; \
	ST(b, da) %g1, [dst] dasi ; \
	inc	1, src ; \
	ba	%xcc, 1b ; \
	 inc	1, dst ; \
	.align 16 ; \
2:	btst	7, src ; \
	bz,a,pt	%xcc, 3f ; \
	 nop ; \
	ba,a	%xcc, 5f ; \
	.align 16 ; \
3:	deccc	32, len ; \
	bl,a,pn	%xcc, 4f ; \
	 inc	32, len ; \
	LD(x, sa) [src] sasi, %g1 ; \
	LD(x, sa) [src + 8] sasi, %g2 ; \
	LD(x, sa) [src + 16] sasi, %g3 ; \
	LD(x, sa) [src + 24] sasi, %g4 ; \
	ST(x, da) %g1, [dst] dasi ; \
	ST(x, da) %g2, [dst + 8] dasi ; \
	ST(x, da) %g3, [dst + 16] dasi ; \
	ST(x, da) %g4, [dst + 24] dasi ; \
	inc	32, src ; \
	ba	%xcc, 3b ; \
	 inc	32, dst ; \
	.align 16 ; \
4:	deccc	8, len ; \
	bl,a,pn	%xcc, 5f ; \
	 inc	8, len ; \
	LD(x, sa) [src] sasi, %g1 ; \
	ST(x, da) %g1, [dst] dasi ; \
	inc	8, src ; \
	ba	%xcc, 4b ; \
	 inc	8, dst ; \
	.align 16 ; \
5:	deccc	1, len ; \
	bl,a,pn	%xcc, 6f ; \
	 nop ; \
	LD(ub, sa) [src] sasi, %g1 ; \
	ST(b, da) %g1, [dst] dasi ; \
	inc	src ; \
	ba	%xcc, 5b ; \
	 inc	dst ; \
6:
208
/*
 * Extension of _MEMCPY dealing with overlap, but unaware of ASIs.
 * Used for bcopy() and memmove().
 *
 * If dst - src (unsigned) is smaller than len, a forward copy would
 * overwrite source bytes that have not been read yet, so copy
 * backwards one byte at a time; otherwise use the fast forward copy.
 * Clobbers %g1-%g4 (via _MEMCPY).
 */
#define	_MEMMOVE(dst, src, len) \
	/* Check for overlap, and copy backwards if so. */ \
	sub	dst, src, %g1 ; \
	cmp	%g1, len ; \
	bgeu,a,pt %xcc, 2f ; \
	 nop ; \
	/* Copy backwards. */ \
	add	src, len, src ; \
	add	dst, len, dst ; \
1:	deccc	1, len ; \
	bl,pn	%xcc, 3f ; \
	 dec	1, src ; \
	ldub	[src], %g1 ; \
	dec	1, dst ; \
	ba	%xcc, 1b ; \
	 stb	%g1, [dst] ; \
2:	/* Do the fast version. */ \
	_MEMCPY(dst, src, len, EMPTY, EMPTY, EMPTY, EMPTY) ; \
3:
232
/*
 * void ascopy(u_long asi, vm_offset_t src, vm_offset_t dst, size_t len)
 *
 * Copy len bytes from src to dst with both the loads and the stores
 * performed through the caller-supplied ASI.
 */
ENTRY(ascopy)
	wr	%o0, 0, %asi
	_MEMCPY(%o2, %o1, %o3, a, %asi, a, %asi)
	retl
	 nop
END(ascopy)

/*
 * void ascopyfrom(u_long sasi, vm_offset_t src, caddr_t dst, size_t len)
 *
 * Copy len bytes from src (loaded through sasi) to the normally
 * addressed buffer dst.
 */
ENTRY(ascopyfrom)
	wr	%o0, 0, %asi
	_MEMCPY(%o2, %o1, %o3, EMPTY, EMPTY, a, %asi)
	retl
	 nop
END(ascopyfrom)

/*
 * void ascopyto(caddr_t src, u_long dasi, vm_offset_t dst, size_t len)
 *
 * Copy len bytes from the normally addressed buffer src to dst
 * (stored through dasi).
 */
ENTRY(ascopyto)
	wr	%o1, 0, %asi
	_MEMCPY(%o2, %o0, %o3, a, %asi, EMPTY, EMPTY)
	retl
	 nop
END(ascopyto)

/*
 * void aszero(u_long asi, vm_offset_t pa, size_t len)
 *
 * Zero len bytes at pa with the stores performed through the given ASI.
 */
ENTRY(aszero)
	wr	%o0, 0, %asi
	_MEMSET(%o1, %g0, %o2, a, %asi)
	retl
	 nop
END(aszero)
272
/*
 * int bcmp(const void *b1, const void *b2, size_t len)
 *
 * Byte-wise comparison.  Returns 0 if the buffers are equal; on a
 * mismatch the (non-zero) count of bytes not yet compared is returned.
 */
ENTRY(bcmp)
	brz,pn	%o2, 2f			! len == 0: return 0
	 clr	%o3			! %o3 = byte index
1:	ldub	[%o0 + %o3], %o4
	ldub	[%o1 + %o3], %o5
	cmp	%o4, %o5
	bne,pn	%xcc, 2f		! mismatch: %o2 still non-zero here
	 inc	%o3
	deccc	%o2
	bne,pt	%xcc, 1b
	 nop
2:	retl
	 mov	%o2, %o0
END(bcmp)
290
/*
 * void bcopy(const void *src, void *dst, size_t len)
 *
 * Overlap-safe copy.  Note the (src, dst) argument order, the reverse
 * of memcpy()/memmove().
 */
ENTRY(bcopy)
	_MEMMOVE(%o1, %o0, %o2)
	retl
	 nop
END(bcopy)

/*
 * void bzero(void *b, size_t len)
 */
ENTRY(bzero)
	_MEMSET(%o0, %g0, %o1, EMPTY, EMPTY)
	retl
	 nop
END(bzero)

/*
 * int copystr(const void *src, void *dst, size_t len, size_t *done)
 *
 * Kernel-to-kernel string copy; the status _COPYSTR leaves in %g1 is
 * returned in %o0.
 */
ENTRY(copystr)
	_COPYSTR(%o0, %o1, %o2, %o3, EMPTY, EMPTY, EMPTY, EMPTY)
	retl
	 mov	%g1, %o0
END(copystr)
317
/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * The copy works on %o3 so that %o0 (the return value, dst) is
 * preserved across the _MEMCPY expansion.
 */
ENTRY(memcpy)
	mov	%o0, %o3
	_MEMCPY(%o3, %o1, %o2, EMPTY, EMPTY, EMPTY, EMPTY)
	retl
	 nop
END(memcpy)

/*
 * void *memmove(void *dst, const void *src, size_t len)
 *
 * Overlap-safe; %o0 is preserved as the return value.
 */
ENTRY(memmove)
	mov	%o0, %o3
	_MEMMOVE(%o3, %o1, %o2)
	retl
	 nop
END(memmove)

/*
 * void *memset(void *b, int c, size_t len)
 *
 * %o0 is preserved as the return value.
 */
ENTRY(memset)
	mov	%o0, %o3
	_MEMSET(%o3, %o1, %o2, EMPTY, EMPTY)
	retl
	 nop
END(memset)
347
/*
 * copy_nofault_begin/copy_nofault_end bracket the user copy routines.
 * NOTE(review): presumably the trap code checks whether a faulting %pc
 * lies in this range and redirects to copy_fault — the wiring is not
 * visible in this file; confirm in the trap handler.
 */
	.globl	copy_nofault_begin
copy_nofault_begin:
	nop

/*
 * int copyin(const void *uaddr, void *kaddr, size_t len)
 *
 * Loads from userspace are done through ASI_AIUP; returns 0 on
 * success.
 */
ENTRY(copyin)
	wr	%g0, ASI_AIUP, %asi
	_MEMCPY(%o1, %o0, %o2, EMPTY, EMPTY, a, %asi)
	retl
	 clr	%o0
END(copyin)
361
/*
 * int copyinstr(const void *uaddr, void *kaddr, size_t len, size_t *done)
 *
 * String copy from userspace (loads via ASI_AIUP); the status left in
 * %g1 by _COPYSTR is returned in %o0.
 */
ENTRY(copyinstr)
	wr	%g0, ASI_AIUP, %asi
	_COPYSTR(%o0, %o1, %o2, %o3, a, %asi, EMPTY, EMPTY)
	retl
	 mov	%g1, %o0
END(copyinstr)
371
/*
 * int copyout(const void *kaddr, void *uaddr, size_t len)
 *
 * Stores to userspace are done through ASI_AIUP; returns 0 on success.
 */
ENTRY(copyout)
	wr	%g0, ASI_AIUP, %asi
	_MEMCPY(%o1, %o0, %o2, a, %asi, EMPTY, EMPTY)
	retl
	 clr	%o0
END(copyout)

	.globl	copy_nofault_end
copy_nofault_end:
	nop

/*
 * Common fault landing point for the copy routines above: returns
 * EFAULT.  NOTE(review): entered from the trap code, not by a direct
 * call — confirm in the trap handler.
 */
ENTRY(copy_fault)
	retl
	 mov	EFAULT, %o0
END(copy_fault)
390
/*
 * fs_nofault_begin/fs_nofault_end bracket the fetch/store routines;
 * NOTE(review): presumably faults with %pc in this range are redirected
 * to fs_fault by the trap code, which is not visible in this file.
 */
	.globl	fs_nofault_begin
fs_nofault_begin:
	nop

/*
 * Chatty aliases for fetch, store functions.
 */
	.globl	fubyte, fusword, fuword, subyte, susword, suword
	.set	fubyte, fuword8
	.set	fusword, fuword16
	.set	fuword, fuword64	! word-sized aliases map to the
	.set	subyte, suword8
	.set	susword, suword16
	.set	suword, suword64	! 64-bit variants

	.globl	casuword32_int, casuword64_int, fuptr, suptr
	.set	fuptr, fuword64		! pointers are 64-bit as well
	.set	suptr, suword64
409
/*
 * int32_t casuword32(volatile int32_t *p, int32_t e, int32_t s)
 *
 * Compare-and-swap a 32-bit word at a user address (ASI_AIUP): if
 * *p == e then *p = s; the previous value of *p is returned either way.
 */
ENTRY(casuword32_int)
	casa	[%o0] ASI_AIUP, %o1, %o2
	retl
	 mov	%o2, %o0
END(casuword32_int)

/*
 * int64_t casuword64(volatile int64_t *p, int64_t e, int64_t s)
 *
 * 64-bit variant of the above.
 */
ENTRY(casuword64_int)
	casxa	[%o0] ASI_AIUP, %o1, %o2
	retl
	 mov	%o2, %o0
END(casuword64_int)
427
/*
 * int fuword8(const void *base)
 *
 * Fetch an unsigned byte from userspace (ASI_AIUP).
 */
ENTRY(fuword8)
	retl
	 lduba	[%o0] ASI_AIUP, %o0
END(fuword8)

/*
 * int fuword16(const void *base)
 *
 * Fetch an unsigned halfword from userspace.
 */
ENTRY(fuword16)
	retl
	 lduha	[%o0] ASI_AIUP, %o0
END(fuword16)

/*
 * int32_t fuword32(const void *base)
 *
 * Fetch a 32-bit word from userspace.
 */
ENTRY(fuword32)
	retl
	 lduwa	[%o0] ASI_AIUP, %o0
END(fuword32)

/*
 * int64_t fuword64(const void *base)
 *
 * Fetch a 64-bit word from userspace.
 */
ENTRY(fuword64)
	retl
	 ldxa	[%o0] ASI_AIUP, %o0
END(fuword64)
459
/*
 * int suword8(const void *base, int word)
 *
 * Store a byte to userspace (ASI_AIUP); returns 0 on success.
 */
ENTRY(suword8)
	stba	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword8)

/*
 * int suword16(const void *base, int word)
 *
 * Store a halfword to userspace; returns 0 on success.
 */
ENTRY(suword16)
	stha	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword16)

/*
 * int suword32(const void *base, int32_t word)
 *
 * Store a 32-bit word to userspace; returns 0 on success.
 */
ENTRY(suword32)
	stwa	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword32)

/*
 * int suword64(const void *base, int64_t word)
 *
 * Store a 64-bit word to userspace; returns 0 on success.
 */
ENTRY(suword64)
	stxa	%o1, [%o0] ASI_AIUP
	retl
	 clr	%o0
END(suword64)
495
	.globl	fs_nofault_end
fs_nofault_end:
	nop

/*
 * Common fault landing point for the fetch/store routines above:
 * returns -1.  NOTE(review): entered from the trap code, not by a
 * direct call.
 */
ENTRY(fs_fault)
	retl
	 mov	-1, %o0
END(fs_fault)
504
/*
 * fas_nofault_begin/fas_nofault_end bracket the alternate-space fetch
 * routines; NOTE(review): faults in this range are presumably redirected
 * to fas_fault (below, returns -1) by the trap code.
 */
	.globl	fas_nofault_begin
fas_nofault_begin:

/*
 * int fasword8(u_long asi, uint64_t addr, uint8_t *val)
 *
 * Fetch a byte from addr through the caller-supplied ASI and store it
 * to *val; returns 0 on success.  The membars serialize the
 * alternate-space access.
 */
ENTRY(fasword8)
	wr	%o0, 0, %asi
	membar	#Sync
	lduba	[%o1] %asi, %o3
	membar	#Sync
	stb	%o3, [%o2]
	retl
	 clr	%o0
END(fasword8)

/*
 * int fasword16(u_long asi, uint64_t addr, uint16_t *val)
 *
 * Halfword variant of fasword8().
 */
ENTRY(fasword16)
	wr	%o0, 0, %asi
	membar	#Sync
	lduha	[%o1] %asi, %o3
	membar	#Sync
	sth	%o3, [%o2]
	retl
	 clr	%o0
END(fasword16)

/*
 * int fasword32(u_long asi, uint64_t addr, uint32_t *val)
 *
 * Word variant of fasword8().
 */
ENTRY(fasword32)
	wr	%o0, 0, %asi
	membar	#Sync
	lduwa	[%o1] %asi, %o3
	membar	#Sync
	stw	%o3, [%o2]
	retl
	 clr	%o0
END(fasword32)

	.globl	fas_nofault_end
fas_nofault_end:
	nop

	.globl	fas_fault
ENTRY(fas_fault)
	retl
	 mov	-1, %o0
END(fas_fault)
556
/*
 * fpu_fault_begin/fpu_fault_end bracket the FPU block operations;
 * NOTE(review): presumably used by the trap code to recognize faults in
 * these routines — not visible in this file.
 */
	.globl	fpu_fault_begin
fpu_fault_begin:
	nop

/*
 * void spitfire_block_copy(void *src, void *dst, size_t len)
 *
 * Copy using 64-byte (VIS_BLOCKSIZE) block loads/stores via ASI_BLK_S.
 * If the trapframe shows live user FP state (FPRS_FEF set), that state
 * is saved into the PCB first and PCB_FEF is flagged so it gets
 * restored later.  The main loop is software-pipelined across two FP
 * register banks (%f0-%f14 and %f16-%f30).
 * NOTE(review): len appears to be assumed a non-zero multiple of
 * VIS_BLOCKSIZE — confirm at the call sites.
 */
ENTRY(spitfire_block_copy)
	rdpr	%pstate, %o3			! save %pstate
	wrpr	%g0, PSTATE_NORMAL, %pstate

	wr	%g0, ASI_BLK_S, %asi
	wr	%g0, FPRS_FEF, %fprs		! enable the FPU

	sub	PCB_REG, TF_SIZEOF, %o4		! %o4 = trapframe
	ldx	[%o4 + TF_FPRS], %o5
	andcc	%o5, FPRS_FEF, %g0		! user FP state live?
	bz,a,pt	%xcc, 1f
	 nop
	/* Save the user FP registers into the PCB. */
	stda	%f0, [PCB_REG + PCB_UFP + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f16, [PCB_REG + PCB_UFP + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f32, [PCB_REG + PCB_UFP + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f48, [PCB_REG + PCB_UFP + (3 * VIS_BLOCKSIZE)] %asi
	membar	#Sync

	andn	%o5, FPRS_FEF, %o5		! FP no longer live in the tf,
	stx	%o5, [%o4 + TF_FPRS]
	ldx	[PCB_REG + PCB_FLAGS], %o4
	or	%o4, PCB_FEF, %o4		! but must be restored from pcb
	stx	%o4, [PCB_REG + PCB_FLAGS]

1:	wrpr	%o3, 0, %pstate			! restore %pstate

	ldda	[%o0] %asi, %f0			! prime the pipeline
	add	%o0, VIS_BLOCKSIZE, %o0
	sub	%o2, VIS_BLOCKSIZE, %o2

2:	ldda	[%o0] %asi, %f16		! load next while storing prev
	fsrc1	%f0, %f32
	fsrc1	%f2, %f34
	fsrc1	%f4, %f36
	fsrc1	%f6, %f38
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o1] %asi
	add	%o0, VIS_BLOCKSIZE, %o0
	subcc	%o2, VIS_BLOCKSIZE, %o2
	bz,pn	%xcc, 3f
	 add	%o1, VIS_BLOCKSIZE, %o1
	ldda	[%o0] %asi, %f0
	fsrc1	%f16, %f32
	fsrc1	%f18, %f34
	fsrc1	%f20, %f36
	fsrc1	%f22, %f38
	fsrc1	%f24, %f40
	fsrc1	%f26, %f42
	fsrc1	%f28, %f44
	fsrc1	%f30, %f46
	stda	%f32, [%o1] %asi
	add	%o0, VIS_BLOCKSIZE, %o0
	sub	%o2, VIS_BLOCKSIZE, %o2
	ba,pt	%xcc, 2b
	 add	%o1, VIS_BLOCKSIZE, %o1

3:	membar	#Sync

	stda	%f16, [%o1] %asi		! drain the last block
	membar	#Sync

	retl
	 wr	%g0, 0, %fprs			! disable the FPU again
END(spitfire_block_copy)
631
/*
 * void zeus_block_copy(void *src, void *dst, size_t len)
 *
 * Variant of spitfire_block_copy() with the same user-FP-state saving
 * scheme, but using 8-byte ldd loads interleaved with aggressive
 * prefetching, fmovd/fsrc1 bank shuffling and block stores.
 * NOTE(review): len is assumed to be a non-zero multiple of
 * VIS_BLOCKSIZE — confirm at the call sites.
 */
ENTRY(zeus_block_copy)
	prefetch [%o0 + (0 * VIS_BLOCKSIZE)], 0

	rdpr	%pstate, %o3			! save %pstate
	wrpr	%g0, PSTATE_NORMAL, %pstate

	wr	%g0, ASI_BLK_S, %asi
	wr	%g0, FPRS_FEF, %fprs		! enable the FPU

	sub	PCB_REG, TF_SIZEOF, %o4		! %o4 = trapframe
	ldx	[%o4 + TF_FPRS], %o5
	andcc	%o5, FPRS_FEF, %g0		! user FP state live?
	bz,a,pt	%xcc, 1f
	 nop
	/* Save the user FP registers into the PCB. */
	stda	%f0, [PCB_REG + PCB_UFP + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f16, [PCB_REG + PCB_UFP + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f32, [PCB_REG + PCB_UFP + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f48, [PCB_REG + PCB_UFP + (3 * VIS_BLOCKSIZE)] %asi
	membar	#Sync

	andn	%o5, FPRS_FEF, %o5		! FP no longer live in the tf,
	stx	%o5, [%o4 + TF_FPRS]
	ldx	[PCB_REG + PCB_FLAGS], %o4
	or	%o4, PCB_FEF, %o4		! but must be restored from pcb
	stx	%o4, [PCB_REG + PCB_FLAGS]

1:	wrpr	%o3, 0, %pstate			! restore %pstate

	/* Prime the software pipeline: load the first block. */
	ldd	[%o0 + (0 * 8)], %f0
	prefetch [%o0 + (1 * VIS_BLOCKSIZE)], 0
	ldd	[%o0 + (1 * 8)], %f2
	prefetch [%o0 + (2 * VIS_BLOCKSIZE)], 0
	fmovd	%f0, %f32
	ldd	[%o0 + (2 * 8)], %f4
	prefetch [%o0 + (3 * VIS_BLOCKSIZE)], 0
	fmovd	%f2, %f34
	ldd	[%o0 + (3 * 8)], %f6
	prefetch [%o0 + (4 * VIS_BLOCKSIZE)], 1
	fmovd	%f4, %f36
	ldd	[%o0 + (4 * 8)], %f8
	prefetch [%o0 + (8 * VIS_BLOCKSIZE)], 1
	fmovd	%f6, %f38
	ldd	[%o0 + (5 * 8)], %f10
	prefetch [%o0 + (12 * VIS_BLOCKSIZE)], 1
	fmovd	%f8, %f40
	ldd	[%o0 + (6 * 8)], %f12
	prefetch [%o0 + (16 * VIS_BLOCKSIZE)], 1
	fmovd	%f10, %f42
	ldd	[%o0 + (7 * 8)], %f14
	ldd	[%o0 + (8 * 8)], %f0
	sub	%o2, VIS_BLOCKSIZE, %o2
	add	%o0, VIS_BLOCKSIZE, %o0
	prefetch [%o0 + (19 * VIS_BLOCKSIZE)], 1
	ba,pt	%xcc, 2f
	 prefetch [%o0 + (23 * VIS_BLOCKSIZE)], 1
	.align	32

	/* Steady state: load block n+1 while storing block n. */
2:	ldd	[%o0 + (1 * 8)], %f2
	fmovd	%f12, %f44
	ldd	[%o0 + (2 * 8)], %f4
	fmovd	%f14, %f46
	stda	%f32, [%o1] %asi
	ldd	[%o0 + (3 * 8)], %f6
	fmovd	%f0, %f32
	ldd	[%o0 + (4 * 8)], %f8
	fmovd	%f2, %f34
	ldd	[%o0 + (5 * 8)], %f10
	fmovd	%f4, %f36
	ldd	[%o0 + (6 * 8)], %f12
	fmovd	%f6, %f38
	ldd	[%o0 + (7 * 8)], %f14
	fmovd	%f8, %f40
	ldd	[%o0 + (8 * 8)], %f0
	fmovd	%f10, %f42
	sub	%o2, VIS_BLOCKSIZE, %o2
	prefetch [%o0 + (3 * VIS_BLOCKSIZE)], 0
	add	%o1, VIS_BLOCKSIZE, %o1
	prefetch [%o0 + (24 * VIS_BLOCKSIZE)], 1
	add	%o0, VIS_BLOCKSIZE, %o0
	cmp	%o2, VIS_BLOCKSIZE + 8
	bgu,pt	%xcc, 2b
	 prefetch [%o0 + (12 * VIS_BLOCKSIZE)], 1
	/* Epilogue: drain the last two blocks. */
	ldd	[%o0 + (1 * 8)], %f2
	fsrc1	%f12, %f44
	ldd	[%o0 + (2 * 8)], %f4
	fsrc1	%f14, %f46
	stda	%f32, [%o1] %asi
	ldd	[%o0 + (3 * 8)], %f6
	fsrc1	%f0, %f32
	ldd	[%o0 + (4 * 8)], %f8
	fsrc1	%f2, %f34
	ldd	[%o0 + (5 * 8)], %f10
	fsrc1	%f4, %f36
	ldd	[%o0 + (6 * 8)], %f12
	fsrc1	%f6, %f38
	ldd	[%o0 + (7 * 8)], %f14
	fsrc1	%f8, %f40
	add	%o1, VIS_BLOCKSIZE, %o1
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o1] %asi
	membar	#Sync

	retl
	 wr	%g0, 0, %fprs			! disable the FPU again
END(zeus_block_copy)
742
/*
 * void spitfire_block_zero(void *dst, size_t len)
 * void zeus_block_zero(void *dst, size_t len)
 *
 * Zero len bytes with 64-byte VIS block stores of an all-zero FP
 * register bank, saving live user FP state to the PCB first (same
 * scheme as spitfire_block_copy()).
 * NOTE(review): the loop only terminates when the count reaches
 * exactly zero, so len is assumed to be a non-zero multiple of
 * (4 * VIS_BLOCKSIZE) — confirm at the call sites.
 */
ALTENTRY(zeus_block_zero)
ENTRY(spitfire_block_zero)
	rdpr	%pstate, %o3			! save %pstate
	wrpr	%g0, PSTATE_NORMAL, %pstate

	wr	%g0, ASI_BLK_S, %asi
	wr	%g0, FPRS_FEF, %fprs		! enable the FPU

	sub	PCB_REG, TF_SIZEOF, %o4		! %o4 = trapframe
	ldx	[%o4 + TF_FPRS], %o5
	andcc	%o5, FPRS_FEF, %g0		! user FP state live?
	bz,a,pt	%xcc, 1f
	 nop
	/* Save the user FP registers into the PCB. */
	stda	%f0, [PCB_REG + PCB_UFP + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f16, [PCB_REG + PCB_UFP + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f32, [PCB_REG + PCB_UFP + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f48, [PCB_REG + PCB_UFP + (3 * VIS_BLOCKSIZE)] %asi
	membar	#Sync

	andn	%o5, FPRS_FEF, %o5		! FP no longer live in the tf,
	stx	%o5, [%o4 + TF_FPRS]
	ldx	[PCB_REG + PCB_FLAGS], %o4
	or	%o4, PCB_FEF, %o4		! but must be restored from pcb
	stx	%o4, [PCB_REG + PCB_FLAGS]

1:	wrpr	%o3, 0, %pstate			! restore %pstate

	/* Build an all-zero 64-byte source bank in %f0-%f14. */
	fzero	%f0
	fzero	%f2
	fzero	%f4
	fzero	%f6
	fzero	%f8
	fzero	%f10
	fzero	%f12
	fzero	%f14

1:	stda	%f0, [%o0 + (0 * VIS_BLOCKSIZE)] %asi
	stda	%f0, [%o0 + (1 * VIS_BLOCKSIZE)] %asi
	stda	%f0, [%o0 + (2 * VIS_BLOCKSIZE)] %asi
	stda	%f0, [%o0 + (3 * VIS_BLOCKSIZE)] %asi
	sub	%o1, (4 * VIS_BLOCKSIZE), %o1
	brnz,pt	%o1, 1b
	 add	%o0, (4 * VIS_BLOCKSIZE), %o0
	membar	#Sync

	retl
	 wr	%g0, 0, %fprs			! disable the FPU again
END(spitfire_block_zero)

	.globl	fpu_fault_end
fpu_fault_end:
	nop

	.globl	fpu_fault_size
	.set	fpu_fault_size, fpu_fault_end - fpu_fault_begin
802
/*
 * Kernel longjmp(env, val): unwind register windows until %fp matches
 * the frame pointer saved in the jmp_buf, then restore %sp and return
 * through the saved %pc.  Returns val, or 1 if val is 0, as longjmp
 * semantics require.  Panics if the jmp_buf does not correspond to a
 * frame on the current stack.
 */
ENTRY(longjmp)
	set	1, %g3
	movrnz	%o1, %o1, %g3		! %g3 = (val != 0) ? val : 1
	mov	%o0, %g1
	ldx	[%g1 + _JB_FP], %g2
1:	cmp	%fp, %g2		! unwind until the saved frame
	bl,a,pt	%xcc, 1b
	 restore
	bne,pn	%xcc, 2f		! overshot: stale jmp_buf
	 ldx	[%g1 + _JB_SP], %o2
	cmp	%o2, %sp
	blt,pn	%xcc, 2f
	 movge	%xcc, %o2, %sp		! restore the saved %sp
	ldx	[%g1 + _JB_PC], %o7
	retl				! return to the saved %pc
	 mov	%g3, %o0
2:	PANIC("longjmp botch", %l1)
END(longjmp)
821
/*
 * Kernel setjmp(env): record %sp, the return address (%o7) and %fp in
 * the jmp_buf; returns 0.  A later longjmp() to the same jmp_buf
 * resumes at our caller with a non-zero return value.
 */
ENTRY(setjmp)
	stx	%sp, [%o0 + _JB_SP]
	stx	%o7, [%o0 + _JB_PC]
	stx	%fp, [%o0 + _JB_FP]
	retl
	 clr	%o0
END(setjmp)
829
/*
 * void ofw_entry(cell_t args[])
 *
 * Call into Open Firmware through the client interface vector ofw_vec.
 * PSTATE_AM and PSTATE_IE are cleared for the duration of the call and,
 * if the kernel has taken over the trap table (tba_taken_over != 0),
 * %wstate is switched to the PROM-mixed window state around the call.
 */
ENTRY(ofw_entry)
	save	%sp, -CCFSZ, %sp
	SET(ofw_vec, %l7, %l6)
	ldx	[%l6], %l6			! %l6 = OFW entry point
	rdpr	%pstate, %l7
	andn	%l7, PSTATE_AM | PSTATE_IE, %l5
	wrpr	%l5, 0, %pstate			! clear AM, disable interrupts
	SET(tba_taken_over, %l5, %l4)
	ldx	[%l4], %l4			! %l4 = tba_taken_over value
	brz,pn	%l4, 1f				! PROM trap table still active?
	 rdpr	%wstate, %l5
	andn	%l5, WSTATE_PROM_MASK, %l3
	wrpr	%l3, WSTATE_PROM_KMIX, %wstate
1:	call	%l6
	 mov	%i0, %o0
	brz,pn	%l4, 1f
	 nop
	wrpr	%g0, %l5, %wstate		! restore %wstate
1:	wrpr	%l7, 0, %pstate			! restore %pstate
	ret
	 restore %o0, %g0, %o0			! propagate OFW return value
END(ofw_entry)
854
/*
 * void ofw_exit(cell_t args[])
 *
 * Hand control back to Open Firmware for good: restore the OFW trap
 * table and PROM window state, switch to a stack in kstack0, reset the
 * primary DMMU context and trap level, then call the OFW vector.  This
 * function does not return.
 */
ENTRY(ofw_exit)
	save	%sp, -CCFSZ, %sp
	flushw					! spill all register windows
	SET(ofw_tba, %l7, %l5)
	ldx	[%l5], %l5			! %l5 = saved OFW %tba
	rdpr	%pstate, %l7
	andn	%l7, PSTATE_AM | PSTATE_IE, %l7
	wrpr	%l7, 0, %pstate			! clear AM, disable interrupts
	rdpr	%wstate, %l7
	andn	%l7, WSTATE_PROM_MASK, %l7
	wrpr	%l7, WSTATE_PROM_KMIX, %wstate	! PROM window state
	wrpr	%l5, 0, %tba			! restore the OFW trap table
	SET(ofw_vec, %l7, %l6)
	ldx	[%l6], %l6			! %l6 = OFW entry point
	SET(kstack0 + KSTACK_PAGES * PAGE_SIZE - PCB_SIZEOF, %l7, %l0)
	sub	%l0, SPOFF, %fp			! setup a stack in a locked page
	sub	%l0, SPOFF + CCFSZ, %sp
	mov	AA_DMMU_PCXR, %l3		! force primary DMMU context 0
	sethi	%hi(KERNBASE), %l5
	stxa	%g0, [%l3] ASI_DMMU
	flush	%l5
	wrpr	%g0, 0, %tl			! force trap level 0
	call	%l6
	 mov	%i0, %o0
	! never to return
END(ofw_exit)
884
#ifdef GPROF

/*
 * NOTE(review): the ENTRYs below have no END and contain only a nop;
 * presumably they serve as address boundary labels for the profiling
 * code (user / trap / interrupt ranges) — confirm against the gprof
 * support code, which is not visible in this file.
 */
ENTRY(user)
	nop

ENTRY(btrap)
	nop

ENTRY(etrap)
	nop

ENTRY(bintr)
	nop

ENTRY(eintr)
	nop

/*
 * XXX including sys/gmon.h in genassym.c is not possible due to uintfptr_t
 * badness.
 */
#define	GM_STATE	0x0
#define	GMON_PROF_OFF	3
#define	GMON_PROF_HIRES	4

	.globl	_mcount
	.set	_mcount, __cyg_profile_func_enter

/*
 * Compiler-inserted entry hook: tail-call mcount unless profiling is
 * off (_gmonparam.state == GMON_PROF_OFF).
 */
ENTRY(__cyg_profile_func_enter)
	SET(_gmonparam, %o3, %o2)
	lduw	[%o2 + GM_STATE], %o3
	cmp	%o3, GMON_PROF_OFF
	be,a,pn %icc, 1f
	 nop
	SET(mcount, %o3, %o2)
	jmpl	%o2, %g0		! tail call; %o7 still names our caller
	 nop
1:	retl
	 nop
END(__cyg_profile_func_enter)

#ifdef GUPROF

/*
 * Compiler-inserted exit hook: tail-call mexitcount only when
 * high-resolution profiling (GMON_PROF_HIRES) is enabled.
 */
ENTRY(__cyg_profile_func_exit)
	SET(_gmonparam, %o3, %o2)
	lduw	[%o2 + GM_STATE], %o3
	cmp	%o3, GMON_PROF_HIRES
	be,a,pn %icc, 1f
	 nop
	SET(mexitcount, %o3, %o2)
	jmpl	%o2, %g0		! tail call
	 nop
1:	retl
	 nop
END(__cyg_profile_func_exit)

#endif /* GUPROF */

#endif /* GPROF */
944