/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include "assym.h"

/*
 * General assembly language routines.
 * It is the intent of this file to contain routines that are
 * specific to cpu architecture.
 */

/*
 * WARNING: If you add a fast trap handler which can be invoked by a
 * non-privileged user, you may have to use the FAST_TRAP_DONE macro
 * instead of "done" instruction to return back to the user mode. See
 * comments for the "fast_trap_done" entry point for more information.
 */
#define	FAST_TRAP_DONE	\
	ba,a	fast_trap_done

/*
 * Override GET_NATIVE_TIME for the cpu module code.  This is not
 * guaranteed to be exactly one instruction; be careful about using
 * the macro in delay slots.
 *
 * Do not use any instruction that modifies condition codes as the
 * caller may depend on these to remain unchanged across the macro.
 */
#if defined(CHEETAH) || defined(OLYMPUS_C)

#define	GET_NATIVE_TIME(out, scr1, scr2) \
	rd	STICK, out
#define	DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \
	rd	STICK, reg;		\
	add	reg, delta, reg;	\
	wr	reg, STICK
#define	RD_TICKCMPR(out, scr)		\
	rd	STICK_COMPARE, out
#define	WR_TICKCMPR(in, scr1, scr2, label) \
	wr	in, STICK_COMPARE

#elif defined(HUMMINGBIRD)
#include <sys/spitregs.h>

/*
 * the current hummingbird version of %stick and %stick_cmp
 * were both implemented as (2) 32-bit locations in ASI_IO space;
 * the hdwr should support atomic r/w; meanwhile: ugly alert! ...
 *
 * 64-bit opcodes are required, but move only 32-bits:
 *
 * ldxa [phys]ASI_IO, %dst 	reads  the low 32-bits from phys into %dst
 * stxa %src, [phys]ASI_IO 	writes the low 32-bits from %src into phys
 *
 * reg equivalent		[phys]ASI_IO
 * ------------------		---------------
 * %stick_cmp  low-32		0x1FE.0000.F060
 * %stick_cmp high-32		0x1FE.0000.F068
 * %stick      low-32		0x1FE.0000.F070
 * %stick     high-32		0x1FE.0000.F078
 */
#define	HSTC_LOW	0x60			/* stick_cmp low  32-bits */
#define	HSTC_HIGH	0x68			/* stick_cmp high 32-bits */
#define	HST_LOW		0x70			/* stick low  32-bits */
#define	HST_HIGH	0x78			/* stick high 32-bits */
#define	HST_DIFF	0x08			/* low<-->high diff */

/*
 * Any change in the number of instructions in SETL41()
 * will affect SETL41_OFF
 */
#define	SETL41(reg, byte) \
	sethi	%hi(0x1FE00000), reg;		/* 0000.0000.1FE0.0000 */ \
	or	reg, 0xF, reg;			/* 0000.0000.1FE0.000F */ \
	sllx	reg, 12, reg;			/* 0000.01FE.0000.F000 */ \
	or	reg, byte, reg;			/* 0000.01FE.0000.F0xx */

/*
 * SETL41_OFF is used to calculate the relative PC value when a
 * branch instruction needs to branch over the SETL41() macro
 */
#define SETL41_OFF  16
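/*
 * (SETL41() expands to four 4-byte instructions -- sethi/or/sllx/or --
 * so a branch that skips the whole macro must step over 4 * 4 = 16 bytes.)
 */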

/*
 * reading stick requires 2 loads, and there could be an intervening
 * low-to-high 32-bit rollover resulting in a return value that is
 * off by about (2 ^ 32); this rare case is prevented by re-reading
 * the low-32 bits after the high-32 and verifying the "after" value
 * is >= the "before" value; if not, increment the high-32 value.
 *
 * this method is limited to 1 rollover, and based on the fixed
 * stick-frequency (5555555), requires the loads to complete within
 * 773 seconds; incrementing the high-32 value will not overflow for
 * about 52644 years.
 *
 * writing stick requires 2 stores; if the old/new low-32 value is
 * near 0xffffffff, there could be another rollover (also rare).
 * to prevent this, we first write a 0 to the low-32, then write
 * new values to the high-32 then the low-32.
 *
 * When we detect a carry in the lower %stick register, we need to
 * read HST_HIGH again. However, at the point where we detect this,
 * we need to rebuild the register address HST_HIGH. This involves more
 * than one instruction and a branch is unavoidable. However, most of
 * the time there is no carry, so we take the penalty of a branch
 * instruction only when there is a carry (less frequent).
 *
 * For GET_NATIVE_TIME(), we start afresh and branch to SETL41().
 * For DELTA_NATIVE_TIME(), we branch to just after SETL41() since
 * addr already points to HST_LOW.
 *
 * NOTE: this method requires disabling interrupts before using
 * DELTA_NATIVE_TIME.
 */
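
/*
 * For illustration only: a rough C sketch of the rollover-safe read that
 * GET_NATIVE_TIME() implements below (read_io32() is a made-up helper
 * standing in for the 32-bit ASI_IO loads, not a real kernel interface):
 *
 *	do {
 *		lo  = read_io32(HST_LOW);	// first low-32 read
 *		hi  = read_io32(HST_HIGH);	// high-32 read
 *		lo2 = read_io32(HST_LOW);	// re-read the low-32
 *	} while (lo2 < lo);			// rollover seen: start afresh
 *	out = ((uint64_t)hi << 32) | lo2;
 */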
#define	GET_NATIVE_TIME(out, scr, tmp)	\
	SETL41(scr, HST_LOW);		\
	ldxa	[scr]ASI_IO, tmp;	\
	inc	HST_DIFF, scr;		\
	ldxa	[scr]ASI_IO, out;	\
	dec	HST_DIFF, scr;		\
	ldxa	[scr]ASI_IO, scr;	\
	sub	scr, tmp, tmp;		\
	brlz,pn tmp, .-(SETL41_OFF+24); \
	sllx	out, 32, out;		\
	or	out, scr, out
#define	DELTA_NATIVE_TIME(delta, addr, high, low, tmp) \
	SETL41(addr, HST_LOW);		\
	ldxa	[addr]ASI_IO, tmp;	\
	inc	HST_DIFF, addr;		\
	ldxa	[addr]ASI_IO, high;	\
	dec	HST_DIFF, addr;		\
	ldxa	[addr]ASI_IO, low;	\
	sub	low, tmp, tmp;		\
	brlz,pn tmp, .-24;		\
	sllx	high, 32, high;		\
	or	high, low, high;	\
	add	high, delta, high;	\
	srl	high, 0, low;		\
	srlx	high, 32, high;		\
	stxa	%g0, [addr]ASI_IO;	\
	inc	HST_DIFF, addr;		\
	stxa	high, [addr]ASI_IO;	\
	dec	HST_DIFF, addr;		\
	stxa	low, [addr]ASI_IO
#define RD_TICKCMPR(out, scr)		\
	SETL41(scr, HSTC_LOW);		\
	ldxa	[scr]ASI_IO, out;	\
	inc	HST_DIFF, scr;		\
	ldxa	[scr]ASI_IO, scr;	\
	sllx	scr, 32, scr;		\
	or	scr, out, out
#define WR_TICKCMPR(in, scra, scrd, label) \
	SETL41(scra, HSTC_HIGH);	\
	srlx	in, 32, scrd;		\
	stxa	scrd, [scra]ASI_IO;	\
	dec	HST_DIFF, scra;		\
	stxa	in, [scra]ASI_IO

#else	/* !CHEETAH && !HUMMINGBIRD */

#define	GET_NATIVE_TIME(out, scr1, scr2) \
	rdpr	%tick, out
#define	DELTA_NATIVE_TIME(delta, reg, scr1, scr2, scr3) \
	rdpr	%tick, reg;		\
	add	reg, delta, reg;	\
	wrpr	reg, %tick
#define	RD_TICKCMPR(out, scr)		\
	rd	TICK_COMPARE, out
#ifdef BB_ERRATA_1 /* writes to TICK_COMPARE may fail */
/*
 * Writes to the TICK_COMPARE register sometimes fail on blackbird modules.
 * The failure occurs only when the following instruction decodes to wr or
 * wrpr.  The workaround is to immediately follow writes to TICK_COMPARE
 * with a read, thus stalling the pipe and keeping following instructions
 * from causing data corruption.  Aligning to a quadword will ensure these
 * two instructions are not split due to i$ misses.
 */
#define WR_TICKCMPR(cmpr,scr1,scr2,label)	\
	ba,a	.bb_errata_1.label		;\
	.align	64				;\
.bb_errata_1.label:				;\
	wr	cmpr, TICK_COMPARE		;\
	rd	TICK_COMPARE, %g0
#else	/* BB_ERRATA_1 */
#define	WR_TICKCMPR(in,scr1,scr2,label)		\
	wr	in, TICK_COMPARE
#endif	/* BB_ERRATA_1 */

#endif	/* !CHEETAH && !HUMMINGBIRD */

#include <sys/clock.h>


#include <sys/asm_linkage.h>
#include <sys/privregs.h>
#include <sys/machparam.h>	/* To get SYSBASE and PAGESIZE */
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/intreg.h>
#include <sys/psr_compat.h>
#include <sys/isa_defs.h>
#include <sys/dditypes.h>
#include <sys/intr.h>

#include "assym.h"

	ENTRY(get_impl)
	GET_CPU_IMPL(%o0)
	retl
	nop
	SET_SIZE(get_impl)

/*
 * Softint generated when counter field of tick reg matches value field
 * of tick_cmpr reg
 */
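
/*
 * Roughly, in illustrative C pseudocode (not actual kernel code, helper
 * names are made up), the loop below does:
 *
 *	target = when;
 *	step = 8;
 *	for (;;) {
 *		write_tick_cmpr(target);	// WR_TICKCMPR()
 *		now = read_tick();		// %tick with the NPT bit cleared
 *		if (target > now)
 *			break;			// the write landed in the future
 *		step <<= 1;			// otherwise double the step
 *		target = now + step;		// and aim a bit further out
 *	}
 */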
	ENTRY_NP(tickcmpr_set)
	! get 64-bit clock_cycles interval
	mov	%o0, %o2
	mov	8, %o3			! A reasonable initial step size
1:
	WR_TICKCMPR(%o2,%o4,%o5,__LINE__)	! Write to TICK_CMPR

	GET_NATIVE_TIME(%o0, %o4, %o5)	! Read %tick to confirm the
	sllx	%o0, 1, %o0		!   value we wrote was in the future.
	srlx	%o0, 1, %o0

	cmp	%o2, %o0		! If the value we wrote was in the
	bg,pt	%xcc, 2f		!   future, then blow out of here.
	sllx	%o3, 1, %o3		! If not, then double our step size,
	ba,pt	%xcc, 1b		!   and take another lap.
	add	%o0, %o3, %o2		!
2:
	retl
	nop
	SET_SIZE(tickcmpr_set)

	ENTRY_NP(tickcmpr_disable)
	mov	1, %g1
	sllx	%g1, TICKINT_DIS_SHFT, %o0
	WR_TICKCMPR(%o0,%o4,%o5,__LINE__)	! Write to TICK_CMPR
	retl
	nop
	SET_SIZE(tickcmpr_disable)

#ifdef DEBUG
	.seg	".text"
tick_write_panic:
	.asciz	"tick_write_delta: interrupts already disabled on entry"
#endif	/* DEBUG */

/*
 * tick_write_delta() increments %tick by the specified delta.  This should
 * only be called after a CPR event to assure that gethrtime() continues to
 * increase monotonically.  Obviously, writing %tick needs to be done very
 * carefully to avoid introducing unnecessary %tick skew across CPUs.  For
 * this reason, we make sure we're i-cache hot before actually writing to
 * %tick.
 */
	ENTRY_NP(tick_write_delta)
	rdpr	%pstate, %g1
#ifdef DEBUG
	andcc	%g1, PSTATE_IE, %g0	! If DEBUG, check that interrupts
	bnz	0f			! aren't already disabled.
	sethi	%hi(tick_write_panic), %o1
	save	%sp, -SA(MINFRAME), %sp	! get a new window to preserve caller
	call	panic
	or	%i1, %lo(tick_write_panic), %o0
#endif	/* DEBUG */
0:	wrpr	%g1, PSTATE_IE, %pstate	! Disable interrupts
	mov	%o0, %o2
	ba	0f			! Branch to cache line-aligned instr.
	nop
	.align	16
0:	nop				! The next 3 instructions are now hot.
	DELTA_NATIVE_TIME(%o2, %o3, %o4, %o5, %g2)	! read/inc/write %tick

	retl				! Return
	wrpr	%g0, %g1, %pstate	!     delay: Re-enable interrupts
	SET_SIZE(tick_write_delta)

	ENTRY_NP(tickcmpr_disabled)
	RD_TICKCMPR(%g1, %o0)
	retl
	srlx	%g1, TICKINT_DIS_SHFT, %o0
	SET_SIZE(tickcmpr_disabled)

/*
 * Get current tick
 */

	ENTRY(gettick)
	ALTENTRY(randtick)
	GET_NATIVE_TIME(%o0, %o2, %o3)
	retl
	nop
	SET_SIZE(randtick)
	SET_SIZE(gettick)


/*
 * Return the counter portion of the tick register.
 */

	ENTRY_NP(gettick_counter)
	rdpr	%tick, %o0
	sllx	%o0, 1, %o0
	retl
	srlx	%o0, 1, %o0		! shake off npt bit
	SET_SIZE(gettick_counter)

/*
 * Provide a C callable interface to the trap that reads the hi-res timer.
 * Returns 64-bit nanosecond timestamp in %o0 and %o1.
 */

	ENTRY_NP(gethrtime)
	GET_HRTIME(%g1, %o0, %o1, %o2, %o3, %o4, %o5, %g2)
							! %g1 = hrtime
	retl
	mov	%g1, %o0
	SET_SIZE(gethrtime)

	ENTRY_NP(gethrtime_unscaled)
	GET_NATIVE_TIME(%g1, %o2, %o3)			! %g1 = native time
	retl
	mov	%g1, %o0
	SET_SIZE(gethrtime_unscaled)

	ENTRY_NP(gethrtime_waitfree)
	ALTENTRY(dtrace_gethrtime)
	GET_NATIVE_TIME(%g1, %o2, %o3)			! %g1 = native time
	NATIVE_TIME_TO_NSEC(%g1, %o2, %o3)
	retl
	mov	%g1, %o0
	SET_SIZE(dtrace_gethrtime)
	SET_SIZE(gethrtime_waitfree)

	ENTRY(gethrtime_max)
	NATIVE_TIME_MAX(%g1)
	NATIVE_TIME_TO_NSEC(%g1, %o0, %o1)

	! hrtime_t's are signed, max hrtime_t must be positive
	mov	-1, %o2
	brlz,a	%g1, 1f
	srlx	%o2, 1, %g1
1:
	retl
	mov	%g1, %o0
	SET_SIZE(gethrtime_max)

	ENTRY(scalehrtime)
	ldx	[%o0], %o1
	NATIVE_TIME_TO_NSEC(%o1, %o2, %o3)
	retl
	stx	%o1, [%o0]
	SET_SIZE(scalehrtime)

/*
 * Fast trap to return a timestamp, uses trap window, leaves traps
 * disabled.  Returns a 64-bit nanosecond timestamp in %o0 and %o1.
 *
 * This is the handler for the ST_GETHRTIME trap.
 */

	ENTRY_NP(get_timestamp)
	GET_HRTIME(%g1, %g2, %g3, %g4, %g5, %o0, %o1, %o2)	! %g1 = hrtime
	srlx	%g1, 32, %o0				! %o0 = hi32(%g1)
	srl	%g1, 0, %o1				! %o1 = lo32(%g1)
	FAST_TRAP_DONE
	SET_SIZE(get_timestamp)

/*
 * Macro to convert GET_HRESTIME() bits into a timestamp.
 *
 * We use two separate macros so that the platform-dependent GET_HRESTIME()
 * can be as small as possible; CONV_HRESTIME() implements the generic part.
 */
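
/*
 * Illustrative C equivalent of CONV_HRESTIME() (pseudocode only; "adj" is
 * hrestime_adj and "nslt" is the time since the last tick in nanoseconds):
 *
 *	hrestnsec += nslt;
 *	if (adj != 0) {
 *		max = nslt >> ADJ_SHIFT;	// largest step this tick
 *		if (adj > 0)
 *			hrestnsec += MIN(adj, max);
 *		else
 *			hrestnsec += MAX(adj, -max);
 *	}
 *	while (hrestnsec >= NANOSEC) {
 *		hrestsec++;
 *		hrestnsec -= NANOSEC;
 *	}
 */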
#define	CONV_HRESTIME(hrestsec, hrestnsec, adj, nslt, nano) \
	brz,pt	adj, 3f;		/* no adjustments, it's easy */	\
	add	hrestnsec, nslt, hrestnsec; /* hrest.tv_nsec += nslt */	\
	brlz,pn	adj, 2f;		/* if hrestime_adj negative */	\
	srlx	nslt, ADJ_SHIFT, nslt;	/* delay: nslt >>= 4 */		\
	subcc	adj, nslt, %g0;		/* hrestime_adj - nslt/16 */	\
	movg	%xcc, nslt, adj;	/* adj by min(adj, nslt/16) */	\
	ba	3f;			/* go convert to sec/nsec */	\
	add	hrestnsec, adj, hrestnsec; /* delay: apply adjustment */ \
2:	addcc	adj, nslt, %g0;		/* hrestime_adj + nslt/16 */	\
	bge,a,pt %xcc, 3f;		/* is adj less negative? */	\
	add	hrestnsec, adj, hrestnsec; /* yes: hrest.nsec += adj */	\
	sub	hrestnsec, nslt, hrestnsec; /* no: hrest.nsec -= nslt/16 */ \
3:	cmp	hrestnsec, nano;	/* more than a billion? */	\
	bl,pt	%xcc, 4f;		/* if not, we're done */	\
	nop;				/* delay: do nothing :( */	\
	add	hrestsec, 1, hrestsec;	/* hrest.tv_sec++; */		\
	sub	hrestnsec, nano, hrestnsec; /* hrest.tv_nsec -= NANOSEC; */ \
	ba,a	3b;			/* check >= billion again */	\
4:

	ENTRY_NP(gethrestime)
	GET_HRESTIME(%o1, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4)
	CONV_HRESTIME(%o1, %o2, %o3, %o4, %o5)
	stn	%o1, [%o0]
	retl
	stn	%o2, [%o0 + CLONGSIZE]
	SET_SIZE(gethrestime)

/*
 * Similar to gethrestime(), but gethrestime_sec() returns current hrestime
 * seconds.
 */
	ENTRY_NP(gethrestime_sec)
	GET_HRESTIME(%o0, %o2, %o3, %o4, %o5, %g1, %g2, %g3, %g4)
	CONV_HRESTIME(%o0, %o2, %o3, %o4, %o5)
	retl					! %o0 current hrestime seconds
	nop
	SET_SIZE(gethrestime_sec)

/*
 * Returns the hrestime on the last tick.  This is simpler than gethrestime()
 * and gethrestime_sec():  no conversion is required.  gethrestime_lasttick()
 * follows the same locking algorithm as GET_HRESTIME and GET_HRTIME,
 * outlined in detail in clock.h.  (Unlike GET_HRESTIME/GET_HRTIME, we don't
 * rely on load dependencies to effect the membar #LoadLoad, instead declaring
 * it explicitly.)
 */
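
/*
 * The retry loop below, in rough C pseudocode (illustrative only):
 *
 *	do {
 *		lock = hres_lock & ~1;		// expected (unlocked) value
 *		membar #LoadLoad;
 *		sec  = hrestime.tv_sec;
 *		nsec = hrestime.tv_nsec;
 *		membar #LoadLoad;
 *	} while (hres_lock != lock);		// writer raced us: retry
 *
 * and the result is stored through the caller's pointer (%o0).
 */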
	ENTRY_NP(gethrestime_lasttick)
	sethi	%hi(hres_lock), %o1
0:
	lduw	[%o1 + %lo(hres_lock)], %o2	! Load lock value
	membar	#LoadLoad			! Load of lock must complete
	andn	%o2, 1, %o2			! Mask off lowest bit
	ldn	[%o1 + %lo(hrestime)], %g1	! Seconds.
	add	%o1, %lo(hrestime), %o4
	ldn	[%o4 + CLONGSIZE], %g2		! Nanoseconds.
	membar	#LoadLoad			! All loads must complete
	lduw	[%o1 + %lo(hres_lock)], %o3	! Reload lock value
	cmp	%o3, %o2			! If lock is locked or has
	bne	0b				!   changed, retry.
	stn	%g1, [%o0]			! Delay: store seconds
	retl
	stn	%g2, [%o0 + CLONGSIZE]		! Delay: store nanoseconds
	SET_SIZE(gethrestime_lasttick)

/*
 * Fast trap for gettimeofday().  Returns a timestruc_t in %o0 and %o1.
 *
 * This is the handler for the ST_GETHRESTIME trap.
 */

	ENTRY_NP(get_hrestime)
	GET_HRESTIME(%o0, %o1, %g1, %g2, %g3, %g4, %g5, %o2, %o3)
	CONV_HRESTIME(%o0, %o1, %g1, %g2, %g3)
	FAST_TRAP_DONE
	SET_SIZE(get_hrestime)

/*
 * Fast trap to return lwp virtual time, uses trap window, leaves traps
 * disabled.  Returns a 64-bit number in %o0:%o1, which is the number
 * of nanoseconds consumed.
 *
 * This is the handler for the ST_GETHRVTIME trap.
 *
 * Register usage:
 *	%o0, %o1 = return lwp virtual time
 * 	%o2 = CPU/thread
 * 	%o3 = lwp
 * 	%g1 = scratch
 * 	%g5 = scratch
 */
	ENTRY_NP(get_virtime)
	GET_NATIVE_TIME(%g5, %g1, %g2)	! %g5 = native time in ticks
	CPU_ADDR(%g2, %g3)			! CPU struct ptr to %g2
	ldn	[%g2 + CPU_THREAD], %g2		! thread pointer to %g2
	ldn	[%g2 + T_LWP], %g3		! lwp pointer to %g3

	/*
	 * Subtract start time of current microstate from time
	 * of day to get increment for lwp virtual time.
	 */
	ldx	[%g3 + LWP_STATE_START], %g1	! ms_state_start
	sub	%g5, %g1, %g5

	/*
	 * Add current value of ms_acct[LMS_USER]
	 */
	ldx	[%g3 + LWP_ACCT_USER], %g1	! ms_acct[LMS_USER]
	add	%g5, %g1, %g5
	NATIVE_TIME_TO_NSEC(%g5, %g1, %o0)

	srl	%g5, 0, %o1			! %o1 = lo32(%g5)
	srlx	%g5, 32, %o0			! %o0 = hi32(%g5)

	FAST_TRAP_DONE
	SET_SIZE(get_virtime)



	.seg	".text"
hrtime_base_panic:
	.asciz	"hrtime_base stepping back"

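/*
 * hres_tick() performs the once-per-clock-tick update of the hi-res time
 * bookkeeping.  In rough, illustrative C pseudocode (the inline comments
 * below are authoritative):
 *
 *	lock(hres_lock);			// ldstub spin loop
 *	now  = GET_NATIVE_TIME();
 *	hres_last_tick = now;
 *	nsec = scale(now, nsec_scale);
 *	nslt = nsec - hrtime_base;		// nsec since last tick
 *	if (nsec < hrtime_base)
 *		panic("hrtime_base stepping back");
 *	hrtime_base = nsec;
 *	adj = clamp(hrestime_adj, +/- (nslt >> ADJ_SHIFT));
 *	timedelta -= adj;
 *	hrestime_adj = timedelta;
 *	hrestime.tv_nsec += nslt + adj;
 *	if (hrestime.tv_nsec >= NANOSEC) {
 *		hrestime.tv_sec++;
 *		hrestime.tv_nsec -= NANOSEC;
 *		one_sec = 1;
 *	}
 *	unlock(hres_lock);
 */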
	ENTRY_NP(hres_tick)
	save	%sp, -SA(MINFRAME), %sp	! get a new window

	sethi	%hi(hrestime), %l4
	ldstub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5	! try locking
7:	tst	%l5
	bz,pt	%xcc, 8f			! if we got it, drive on
	ld	[%l4 + %lo(nsec_scale)], %l5	! delay: %l5 = scaling factor
	ldub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
9:	tst	%l5
	bz,a,pn	%xcc, 7b
	ldstub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
	ba,pt	%xcc, 9b
	ldub	[%l4 + %lo(hres_lock + HRES_LOCK_OFFSET)], %l5
8:
	membar	#StoreLoad|#StoreStore

	!
	! update hres_last_tick.  %l5 has the scaling factor (nsec_scale).
	!
	ldx	[%l4 + %lo(hrtime_base)], %g1	! load current hrtime_base
	GET_NATIVE_TIME(%l0, %l3, %l6)		! current native time
	stx	%l0, [%l4 + %lo(hres_last_tick)]! prev = current
	! convert native time to nsecs
	NATIVE_TIME_TO_NSEC_SCALE(%l0, %l5, %l2, NSEC_SHIFT)

	sub	%l0, %g1, %i1			! get accurate nsec delta

	ldx	[%l4 + %lo(hrtime_base)], %l1
	cmp	%l1, %l0
	bg,pn	%xcc, 9f
	nop

	stx	%l0, [%l4 + %lo(hrtime_base)]	! update hrtime_base

	!
	! apply adjustment, if any
	!
	ldx	[%l4 + %lo(hrestime_adj)], %l0	! %l0 = hrestime_adj
	brz	%l0, 2f
						! hrestime_adj == 0 ?
						! yes, skip adjustments
	clr	%l5				! delay: set adj to zero
	tst	%l0				! is hrestime_adj >= 0 ?
	bge,pt	%xcc, 1f			! yes, go handle positive case
	srl	%i1, ADJ_SHIFT, %l5		! delay: %l5 = adj

	addcc	%l0, %l5, %g0			! hrestime_adj < -adj ?
	bl,pt	%xcc, 2f			! yes, use current adj
	neg	%l5				! delay: %l5 = -adj
	ba,pt	%xcc, 2f
	mov	%l0, %l5			! no, so set adj = hrestime_adj
1:
	subcc	%l0, %l5, %g0			! hrestime_adj < adj ?
	bl,a,pt	%xcc, 2f			! yes, set adj = hrestime_adj
	mov	%l0, %l5			! delay: adj = hrestime_adj
2:
	ldx	[%l4 + %lo(timedelta)], %l0	! %l0 = timedelta
	sub	%l0, %l5, %l0			! timedelta -= adj

	stx	%l0, [%l4 + %lo(timedelta)]	! store new timedelta
	stx	%l0, [%l4 + %lo(hrestime_adj)]	! hrestime_adj = timedelta

	or	%l4, %lo(hrestime), %l2
	ldn	[%l2], %i2			! %i2:%i3 = hrestime sec:nsec
	ldn	[%l2 + CLONGSIZE], %i3
	add	%i3, %l5, %i3			! hrestime.nsec += adj
	add	%i3, %i1, %i3			! hrestime.nsec += nslt

	set	NANOSEC, %l5			! %l5 = NANOSEC
	cmp	%i3, %l5
	bl,pt	%xcc, 5f			! if hrestime.tv_nsec < NANOSEC
	sethi	%hi(one_sec), %i1		! delay
	add	%i2, 0x1, %i2			! hrestime.tv_sec++
	sub	%i3, %l5, %i3			! hrestime.tv_nsec - NANOSEC
	mov	0x1, %l5
	st	%l5, [%i1 + %lo(one_sec)]
5:
	stn	%i2, [%l2]
	stn	%i3, [%l2 + CLONGSIZE]		! store the new hrestime

	membar	#StoreStore

	ld	[%l4 + %lo(hres_lock)], %i1
	inc	%i1				! release lock
	st	%i1, [%l4 + %lo(hres_lock)]	! clear hres_lock

	ret
	restore

9:
	!
	! release hres_lock
	!
	ld	[%l4 + %lo(hres_lock)], %i1
	inc	%i1
	st	%i1, [%l4 + %lo(hres_lock)]

	sethi	%hi(hrtime_base_panic), %o0
	call	panic
	or	%o0, %lo(hrtime_base_panic), %o0

	SET_SIZE(hres_tick)

	.seg	".text"
kstat_q_panic_msg:
	.asciz	"kstat_q_exit: qlen == 0"

	ENTRY(kstat_q_panic)
	save	%sp, -SA(MINFRAME), %sp
	sethi	%hi(kstat_q_panic_msg), %o0
	call	panic
	or	%o0, %lo(kstat_q_panic_msg), %o0
	/*NOTREACHED*/
	SET_SIZE(kstat_q_panic)

#define	BRZPN	brz,pn
#define	BRZPT	brz,pt

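/*
 * KSTAT_Q_UPDATE() maintains the standard kstat I/O queue statistics; %g1
 * holds the current native time on entry.  In rough, illustrative C terms
 * (the field names are shorthand for the QTYPE##CNT/TIME/LENTIME/LASTUPDATE
 * offsets):
 *
 *	if (cnt != 0) {				// QBR skips this if qlen == 0
 *		delta    = now - lastupdate;
 *		time    += delta;		// total time the queue was busy
 *		lentime += cnt * delta;		// integral of qlen over time
 *	}
 *	cnt += 1 (enter) or cnt -= 1 (exit);
 *	lastupdate = now;
 *
 * The DEBUG exit paths branch to kstat_q_panic() instead when the queue
 * length is already zero.
 */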
#define	KSTAT_Q_UPDATE(QOP, QBR, QZERO, QRETURN, QTYPE) \
	ld	[%o0 + QTYPE##CNT], %o1;	/* %o1 = old qlen */	\
	QOP	%o1, 1, %o2;			/* %o2 = new qlen */	\
	QBR	%o1, QZERO;			/* done if qlen == 0 */	\
	st	%o2, [%o0 + QTYPE##CNT];	/* delay: save qlen */	\
	ldx	[%o0 + QTYPE##LASTUPDATE], %o3;			\
	ldx	[%o0 + QTYPE##TIME], %o4;	/* %o4 = old time */	\
	ldx	[%o0 + QTYPE##LENTIME], %o5;	/* %o5 = old lentime */	\
	sub	%g1, %o3, %o2;			/* %o2 = time delta */	\
	mulx	%o1, %o2, %o3;			/* %o3 = cur lentime */	\
	add	%o4, %o2, %o4;			/* %o4 = new time */	\
	add	%o5, %o3, %o5;			/* %o5 = new lentime */	\
	stx	%o4, [%o0 + QTYPE##TIME];	/* save time */		\
	stx	%o5, [%o0 + QTYPE##LENTIME];	/* save lentime */	\
QRETURN;								\
	stx	%g1, [%o0 + QTYPE##LASTUPDATE]; /* lastupdate = now */

#if !defined(DEBUG)
/*
 * same as KSTAT_Q_UPDATE but without:
 * QBR     %o1, QZERO;
 * to be used only with non-debug builds.  Mimics ASSERT() behaviour.
 */
#define	KSTAT_Q_UPDATE_ND(QOP, QRETURN, QTYPE) \
	ld	[%o0 + QTYPE##CNT], %o1;	/* %o1 = old qlen */	\
	QOP	%o1, 1, %o2;			/* %o2 = new qlen */	\
	st	%o2, [%o0 + QTYPE##CNT];	/* delay: save qlen */	\
	ldx	[%o0 + QTYPE##LASTUPDATE], %o3;			\
	ldx	[%o0 + QTYPE##TIME], %o4;	/* %o4 = old time */	\
	ldx	[%o0 + QTYPE##LENTIME], %o5;	/* %o5 = old lentime */	\
	sub	%g1, %o3, %o2;			/* %o2 = time delta */	\
	mulx	%o1, %o2, %o3;			/* %o3 = cur lentime */	\
	add	%o4, %o2, %o4;			/* %o4 = new time */	\
	add	%o5, %o3, %o5;			/* %o5 = new lentime */	\
	stx	%o4, [%o0 + QTYPE##TIME];	/* save time */		\
	stx	%o5, [%o0 + QTYPE##LENTIME];	/* save lentime */	\
QRETURN;								\
	stx	%g1, [%o0 + QTYPE##LASTUPDATE]; /* lastupdate = now */
#endif

	.align 16
	ENTRY(kstat_waitq_enter)
	GET_NATIVE_TIME(%g1, %g2, %g3)
	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W)
	SET_SIZE(kstat_waitq_enter)

	.align 16
	ENTRY(kstat_waitq_exit)
	GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_W)
#else
	KSTAT_Q_UPDATE_ND(sub, retl, KSTAT_IO_W)
#endif
	SET_SIZE(kstat_waitq_exit)

	.align 16
	ENTRY(kstat_runq_enter)
	GET_NATIVE_TIME(%g1, %g2, %g3)
	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R)
	SET_SIZE(kstat_runq_enter)

	.align 16
	ENTRY(kstat_runq_exit)
	GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, retl, KSTAT_IO_R)
#else
	KSTAT_Q_UPDATE_ND(sub, retl, KSTAT_IO_R)
#endif
	SET_SIZE(kstat_runq_exit)

	.align 16
	ENTRY(kstat_waitq_to_runq)
	GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_W)
#else
	KSTAT_Q_UPDATE_ND(sub, 1:, KSTAT_IO_W)
#endif
	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_R)
	SET_SIZE(kstat_waitq_to_runq)

	.align 16
	ENTRY(kstat_runq_back_to_waitq)
	GET_NATIVE_TIME(%g1, %g2, %g3)
#if defined(DEBUG)
	KSTAT_Q_UPDATE(sub, BRZPN, kstat_q_panic, 1:, KSTAT_IO_R)
#else
	KSTAT_Q_UPDATE_ND(sub, 1:, KSTAT_IO_R)
#endif
	KSTAT_Q_UPDATE(add, BRZPT, 1f, 1:retl, KSTAT_IO_W)
	SET_SIZE(kstat_runq_back_to_waitq)

	/*
	 *  -- WARNING --
	 *
	 * The following variables MUST be together on a 128-byte boundary.
	 * In addition to the primary performance motivation (having them all
	 * on the same cache line(s)), code here and in the GET*TIME() macros
	 * assumes that they all have the same high 22 address bits (so
	 * there's only one sethi).
	 */
	.seg	".data"
	.global	timedelta, hres_last_tick, hrestime, hrestime_adj
	.global	hres_lock, nsec_scale, hrtime_base, traptrace_use_stick
	.global	nsec_shift, adj_shift

	/* XXX - above comment claims 128-bytes is necessary */
	.align	64
timedelta:
	.word	0, 0		/* int64_t */
hres_last_tick:
	.word	0, 0		/* hrtime_t */
hrestime:
	.nword	0, 0		/* 2 longs */
hrestime_adj:
	.word	0, 0		/* int64_t */
hres_lock:
	.word	0
nsec_scale:
	.word	0
hrtime_base:
	.word	0, 0
traptrace_use_stick:
	.word	0
nsec_shift:
	.word	NSEC_SHIFT
adj_shift:
	.word	ADJ_SHIFT


/*
 * drv_usecwait(clock_t n)	[DDI/DKI - section 9F]
 * usec_delay(int n)		[compatibility - should go one day]
 * Delay by spinning.
 *
 * delay for n microseconds.  numbers <= 0 delay 1 usec
 *
 * With UltraSPARC-III the combination of supporting mixed-speed CPUs
 * and variable clock rate for power management requires that we
 * use %stick to implement this routine.
 *
 * For OPL platforms that support the "sleep" instruction, we
 * conditionally (ifdef'ed) insert a "sleep" instruction in
 * the loop.  Note that theoretically we should have moved (duplicated)
 * the code down to the spitfire/us3/opl specific asm files - but this
 * is a lot of code duplication just to add one "sleep" instruction.
 * We chose less code duplication for this.
 */
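
/*
 * Illustrative C equivalent of the spin loop below (pseudocode only):
 *
 *	if (n <= 0)
 *		n = 1;
 *	end = gettick() + n * sticks_per_usec + 1;	// +1: not on a tick edge
 *	while (gettick() <= end)
 *		continue;				// (optional "sleep" on OPL)
 */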

	ENTRY(drv_usecwait)
	ALTENTRY(usec_delay)
	brlez,a,pn %o0, 0f
	mov	1, %o0
0:
	sethi	%hi(sticks_per_usec), %o1
	lduw	[%o1 + %lo(sticks_per_usec)], %o1
	mulx	%o1, %o0, %o1		! Scale usec to ticks
	inc	%o1			! We don't start on a tick edge
	GET_NATIVE_TIME(%o2, %o3, %o4)
	add	%o1, %o2, %o1

1:
#ifdef	_OPL
	.word 0x81b01060		! insert "sleep" instruction
#endif /* _OPL */			! use byte code for now
	cmp	%o1, %o2
	GET_NATIVE_TIME(%o2, %o3, %o4)
	bgeu,pt	%xcc, 1b
	nop
	retl
	nop
	SET_SIZE(usec_delay)
	SET_SIZE(drv_usecwait)

/*
 * Level-14 interrupt prologue.
 */
	ENTRY_NP(pil14_interrupt)
	CPU_ADDR(%g1, %g2)
	rdpr	%pil, %g6			! %g6 = interrupted PIL
	stn	%g6, [%g1 + CPU_PROFILE_PIL]	! record interrupted PIL
	rdpr	%tstate, %g6
	rdpr	%tpc, %g5
	btst	TSTATE_PRIV, %g6		! trap from supervisor mode?
	bnz,a,pt %xcc, 1f
	stn	%g5, [%g1 + CPU_PROFILE_PC]	! if so, record kernel PC
	stn	%g5, [%g1 + CPU_PROFILE_UPC]	! if not, record user PC
	ba	pil_interrupt_common		! must be large-disp branch
	stn	%g0, [%g1 + CPU_PROFILE_PC]	! zero kernel PC
1:	ba	pil_interrupt_common		! must be large-disp branch
	stn	%g0, [%g1 + CPU_PROFILE_UPC]	! zero user PC
	SET_SIZE(pil14_interrupt)

	ENTRY_NP(tick_rtt)
	!
	! Load TICK_COMPARE into %o5; if bit 63 is set, then TICK_COMPARE is
	! disabled.  If TICK_COMPARE is enabled, we know that we need to
	! reenqueue the interrupt request structure.  We'll then check TICKINT
	! in SOFTINT; if it's set, then we know that we were in a TICK_COMPARE
	! interrupt.  In this case, TICK_COMPARE may have been rewritten
	! recently; we'll compare %o5 to the current time to verify that it's
	! in the future.
	!
	! Note that %o5 is live until after 1f.
	! XXX - there is a subroutine call while %o5 is live!
	!
	RD_TICKCMPR(%o5, %g1)
	srlx	%o5, TICKINT_DIS_SHFT, %g1
	brnz,pt	%g1, 2f
	nop

	rdpr 	%pstate, %g5
	andn	%g5, PSTATE_IE, %g1
	wrpr	%g0, %g1, %pstate		! Disable vec interrupts

	sethi	%hi(cbe_level14_inum), %o1
	ldx	[%o1 + %lo(cbe_level14_inum)], %o1
	call	intr_enqueue_req		! preserves %o5 and %g5
	mov	PIL_14, %o0

	! Check SOFTINT for TICKINT/STICKINT
	rd	SOFTINT, %o4
	set	(TICK_INT_MASK | STICK_INT_MASK), %o0
	andcc	%o4, %o0, %g0
	bz,a,pn	%icc, 2f
	wrpr	%g0, %g5, %pstate		! Enable vec interrupts

	! clear TICKINT/STICKINT
	wr	%o0, CLEAR_SOFTINT

	!
	! Now that we've cleared TICKINT, we can reread %tick and confirm
	! that the value we programmed is still in the future.  If it isn't,
	! we need to reprogram TICK_COMPARE to fire as soon as possible.
	!
	GET_NATIVE_TIME(%o0, %g1, %g2)		! %o0 = tick
	sllx	%o0, 1, %o0			! Clear the DIS bit
	srlx	%o0, 1, %o0
	cmp	%o5, %o0			! In the future?
	bg,a,pt	%xcc, 2f			! Yes, drive on.
	wrpr	%g0, %g5, %pstate		!   delay: enable vec intr

	!
	! If we're here, then we have programmed TICK_COMPARE with a %tick
	! which is in the past; we'll now load an initial step size, and loop
	! until we've managed to program TICK_COMPARE to fire in the future.
	!
	mov	8, %o4				! 8 = arbitrary initial step
1:	add	%o0, %o4, %o5			! Add the step
	WR_TICKCMPR(%o5,%g1,%g2,__LINE__)	! Write to TICK_CMPR
	GET_NATIVE_TIME(%o0, %g1, %g2)		! %o0 = tick
	sllx	%o0, 1, %o0			! Clear the DIS bit
	srlx	%o0, 1, %o0
	cmp	%o5, %o0			! In the future?
	bg,a,pt	%xcc, 2f			! Yes, drive on.
	wrpr	%g0, %g5, %pstate		!    delay: enable vec intr
	ba	1b				! No, try again.
	sllx	%o4, 1, %o4			!    delay: double step size

2:	ba	current_thread_complete
	nop
	SET_SIZE(tick_rtt)

/*
 * Level-15 interrupt prologue.
 */
	ENTRY_NP(pil15_interrupt)
	CPU_ADDR(%g1, %g2)
	rdpr	%tstate, %g6
	rdpr	%tpc, %g5
	btst	TSTATE_PRIV, %g6		! trap from supervisor mode?
	bnz,a,pt %xcc, 1f
	stn	%g5, [%g1 + CPU_CPCPROFILE_PC]	! if so, record kernel PC
	stn	%g5, [%g1 + CPU_CPCPROFILE_UPC] ! if not, record user PC
	ba	pil15_epilogue			! must be large-disp branch
	stn	%g0, [%g1 + CPU_CPCPROFILE_PC]	! zero kernel PC
1:	ba	pil15_epilogue			! must be large-disp branch
	stn	%g0, [%g1 + CPU_CPCPROFILE_UPC] ! zero user PC
	SET_SIZE(pil15_interrupt)

#ifdef DEBUG
	.seg	".text"
find_cpufreq_panic:
	.asciz	"find_cpufrequency: interrupts already disabled on entry"
#endif	/* DEBUG */

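/*
 * find_cpufrequency() is passed the address of a location holding the
 * current TOD "seconds" count and returns the number of native ticks that
 * elapse in one second.  Roughly (illustrative pseudocode only):
 *
 *	wait for *%o0 to change;		// align to a seconds boundary
 *	t0 = native time;
 *	wait for *%o0 to change again;
 *	t1 = native time;
 *	return (t1 - t0);
 *
 * If the seconds count wraps to zero (a minute rollover), the measurement
 * is restarted, since the last second of a minute may be inaccurate.
 * Interrupts are disabled for the duration of the measurement.
 */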
	ENTRY_NP(find_cpufrequency)
	rdpr	%pstate, %g1

#ifdef DEBUG
	andcc	%g1, PSTATE_IE, %g0	! If DEBUG, check that interrupts
	bnz	0f			! are currently enabled
	sethi	%hi(find_cpufreq_panic), %o1
	call	panic
	or	%o1, %lo(find_cpufreq_panic), %o0
#endif	/* DEBUG */

0:
	wrpr	%g1, PSTATE_IE, %pstate	! Disable interrupts
3:
	ldub	[%o0], %o1		! Read the number of seconds
	mov	%o1, %o2		! remember initial value in %o2
1:
	GET_NATIVE_TIME(%o3, %g4, %g5)
	cmp	%o1, %o2		! did the seconds register roll over?
	be,pt	%icc, 1b		! branch back if unchanged
	ldub	[%o0], %o2		!   delay: load the new seconds val

	brz,pn	%o2, 3b			! if the minutes just rolled over,
					! the last second could have been
					! inaccurate; try again.
	mov	%o2, %o4		!   delay: store init. val. in %o2
2:
	GET_NATIVE_TIME(%o5, %g4, %g5)
	cmp	%o2, %o4		! did the seconds register roll over?
	be,pt	%icc, 2b		! branch back if unchanged
	ldub	[%o0], %o4		!   delay: load the new seconds val

	brz,pn	%o4, 0b			! if the minutes just rolled over,
					! the last second could have been
					! inaccurate; try again.
	wrpr	%g0, %g1, %pstate	!   delay: re-enable interrupts

	retl
	sub	%o5, %o3, %o0		! return the difference in ticks
	SET_SIZE(find_cpufrequency)

#if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \
	defined(SERRANO)
	!
	! On US-III, the prefetch instruction queue is 8 entries deep.
	! Also, prefetches for write put data in the E$, which has
	! lines of 512 bytes for an 8MB cache. Each E$ line is further
	! subblocked into 64 byte chunks.
	!
	! Since prefetch can only bring in 64 bytes at a time (See Sparc
	! v9 Architecture Manual pp.204) and a page_t is 128 bytes,
	! 2 prefetches are required in order to bring an entire
	! page into the E$.
	!
	! Since the prefetch queue is 8 entries deep, we currently can
	! only have 4 prefetches for page_t's outstanding. Thus, we
	! prefetch n+4 ahead of where we are now:
	!
	!      4 * sizeof(page_t)     -> 512
	!      4 * sizeof(page_t) +64 -> 576
	!
	! Example
	! =======
	! contiguous page array in memory...
	!
	! |AAA1|AAA2|BBB1|BBB2|CCC1|CCC2|DDD1|DDD2|XXX1|XXX2|YYY1|YYY2|...
	! ^         ^         ^         ^         ^    ^
	! pp                                      |    pp+4*sizeof(page)+64
	!                                         |
	!                                         pp+4*sizeof(page)
	!
	!  Prefetch
	!   Queue
	! +-------+<--- In this iteration, we're working with pp (AAA1),
	! |Preftch|     but we enqueue prefetch for addr = XXX1
	! | XXX1  |
	! +-------+<--- this queue slot will be a prefetch instruction
	! |Preftch|     for addr = pp + 4*sizeof(page_t) + 64 (or second
	! | XXX2  |     half of page XXX)
	! +-------+
	! |Preftch|<-+- The next time around this function, we'll be
	! | YYY1  |  |  working with pp = BBB1, but will be enqueueing
	! +-------+  |  prefetches for both halves of page YYY,
	! |Preftch|  |  while both halves of page XXX are still in transit,
	! | YYY2  |<-+  making their way into the E$.
	! +-------+
	! |Preftch|
	! | ZZZ1  |
	! +-------+
	! .       .
	! :       :
	!
	!  E$
	! +============================================...
	! | XXX1 | XXX2 | YYY1 | YYY2 | ZZZ1 | ZZZ2 |
	! +============================================...
	! |      |      |      |      |      |      |
	! +============================================...
	! .
	! :
	!
	! So we should expect the first four page accesses to stall
	! while we warm up the cache, after which most of the pages
	! will have their pp ready in the E$.
	!
	! Also note that if sizeof(page_t) grows beyond 128, then
	! we'll need an additional prefetch to get an entire page
	! into the E$, thus reducing the number of outstanding page
	! prefetches to 2 (ie. 3 prefetches/page = 6 queue slots)
	! etc.
	!
	! Cheetah+
	! ========
	! On Cheetah+ we use "#n_write" prefetches as these avoid
	! unnecessary RTS->RTO bus transaction state change, and
	! just issue an RTO transaction. (See pp.77 of Cheetah+ Delta
	! PRM). On Cheetah, #n_write prefetches are reflected with
	! RTS->RTO state transition regardless.
	!
#define STRIDE1 512
#define STRIDE2 576

#if	STRIDE1 != (PAGE_SIZE * 4)
#error	"STRIDE1 != (PAGE_SIZE * 4)"
#endif	/* STRIDE1 != (PAGE_SIZE * 4) */

/*
 * Prefetch a page_t for write or read, this assumes a linear
 * scan of sequential page_t's.
 */
	ENTRY(prefetch_page_w)
	prefetch	[%o0+STRIDE1], #n_writes
	retl
	prefetch	[%o0+STRIDE2], #n_writes
	SET_SIZE(prefetch_page_w)

	!
	! Note on CHEETAH to prefetch for read, we really use #one_write.
	! This fetches to E$ (general use) rather than P$ (floating point use).
	!
	ENTRY(prefetch_page_r)
	prefetch	[%o0+STRIDE1], #one_write
	retl
	prefetch	[%o0+STRIDE2], #one_write
	SET_SIZE(prefetch_page_r)

#elif defined(SPITFIRE) || defined(HUMMINGBIRD)

	!
	! UltraSparcII can have up to 3 prefetches outstanding.
	! A page_t is 128 bytes (2 prefetches of 64 bytes each)
	! So prefetch for pp + 1, which is
	!
	!       pp + sizeof(page_t)
	! and
	!       pp + sizeof(page_t) + 64
	!
#define STRIDE1	128
#define STRIDE2	192

#if	STRIDE1 != PAGE_SIZE
#error	"STRIDE1 != PAGE_SIZE"
#endif	/* STRIDE1 != PAGE_SIZE */

	ENTRY(prefetch_page_w)
	prefetch	[%o0+STRIDE1], #n_writes
	retl
	prefetch	[%o0+STRIDE2], #n_writes
	SET_SIZE(prefetch_page_w)

	ENTRY(prefetch_page_r)
	prefetch	[%o0+STRIDE1], #n_reads
	retl
	prefetch	[%o0+STRIDE2], #n_reads
	SET_SIZE(prefetch_page_r)

#elif defined(OLYMPUS_C)
	!
	! Prefetch strides for Olympus-C
	!

#define STRIDE1	0x440
#define STRIDE2	0x640

	ENTRY(prefetch_page_w)
	prefetch	[%o0+STRIDE1], #n_writes
	retl
	prefetch	[%o0+STRIDE2], #n_writes
	SET_SIZE(prefetch_page_w)

	ENTRY(prefetch_page_r)
	prefetch	[%o0+STRIDE1], #n_writes
	retl
	prefetch	[%o0+STRIDE2], #n_writes
	SET_SIZE(prefetch_page_r)
#else	/* OLYMPUS_C */

#error "You need to fix this for your new cpu type."

#endif	/* OLYMPUS_C */

#if defined(CHEETAH) || defined(CHEETAH_PLUS) || defined(JALAPENO) || \
	defined(SERRANO)

#define	PREFETCH_Q_LEN 8

#elif defined(SPITFIRE) || defined(HUMMINGBIRD)

#define	PREFETCH_Q_LEN 3

#elif defined(OLYMPUS_C)
	!
	! Use length of one for now.
	!
#define	PREFETCH_Q_LEN	1

#else 	/* OLYMPUS_C */

#error You need to fix this for your new cpu type.

#endif	/* OLYMPUS_C */

#include <vm/kpm.h>

#ifdef	SEGKPM_SUPPORT

#define	SMAP_SIZE 72
#define SMAP_STRIDE (((PREFETCH_Q_LEN * 64) / SMAP_SIZE) * 64)

#else	/* SEGKPM_SUPPORT */

	!
	! The hardware will prefetch the 64 byte cache aligned block
	! that contains the address specified in the prefetch instruction.
	! Since the size of the smap struct is 48 bytes, issuing 1 prefetch
	! per pass will suffice as long as we prefetch far enough ahead to
	! make sure we don't stall for the cases where the smap object
	! spans multiple hardware prefetch blocks.  Let's prefetch as far
	! ahead as the hardware will allow.
	!
	! The smap array is processed with decreasing address pointers.
	!
#define	SMAP_SIZE 48
#define	SMAP_STRIDE (PREFETCH_Q_LEN * SMAP_SIZE)

#endif	/* SEGKPM_SUPPORT */

/*
 * Prefetch struct smap for write.
 */
	ENTRY(prefetch_smap_w)
	retl
	prefetch	[%o0-SMAP_STRIDE], #n_writes
	SET_SIZE(prefetch_smap_w)

	ENTRY_NP(getidsr)
	retl
	ldxa	[%g0]ASI_INTR_DISPATCH_STATUS, %o0
	SET_SIZE(getidsr)
1190