/* xref: /illumos-gate/usr/src/uts/sun4u/sys/cheetahasm.h (revision 7c478bd95313f5f23a4c958a745db2134aa03244) */
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef	_CHEETAHASM_H
28 #define	_CHEETAHASM_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #ifdef	__cplusplus
33 extern "C" {
34 #endif
35 
36 #ifdef _ASM
37 /* BEGIN CSTYLED */
38 
/*
 * ASM_LD: load the 32-bit word at "symbol" into "reg".
 * "reg" is first used as scratch for the high address bits, then
 * overwritten with the loaded value.
 */
#define	ASM_LD(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ld	[reg + %lo(symbol)], reg;				\

/*
 * ASM_LDX: load the 64-bit doubleword at "symbol" into "reg".
 * Same pattern as ASM_LD but uses ldx for an extended (64-bit) load.
 */
#define	ASM_LDX(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ldx	[reg + %lo(symbol)], reg;				\

/*
 * ASM_JMP: jump to "symbol", clobbering "reg" with the target address.
 * The trailing nop fills the branch delay slot.
 */
#define	ASM_JMP(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	jmp	reg + %lo(symbol);					\
	nop
51 
52 /*
53  * Macro for getting to offset from 'cpu_private' ptr.  The 'cpu_private'
54  * ptr is in the machcpu structure.
55  *  off_reg:  Register offset from 'cpu_private' ptr.
56  *  scr1:    Scratch, ptr is returned in this register.
57  *  scr2:    Scratch
58  *  label:   Label to branch to if cpu_private ptr is null/zero.
59  */
60 #define	GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label)			\
61 	CPU_ADDR(scr1, scr2);						\
62 	ldn	[scr1 + CPU_PRIVATE], scr1;				\
63 	cmp	scr1, 0;						\
64 	be	label;							\
65 	  nop;								\
66 	add	scr1, off_reg, scr1
67 
68 /*
69  * Macro version of get_dcache_dtag.  We use this macro in the
70  * CPU logout code. Since the Dcache is virtually indexed, only
71  * bits [12:5] of the AFAR can be used so we need to search through
72  * 8 indexes (4 ways + bit 13) in order to find the tag we want.
73  *   afar:  input AFAR, not modified.
74  *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
75  *   scr1:  scratch.
76  *   scr2:  scratch, will hold tag to look for.
77  *   scr3:  used for Dcache index, loops through 4 ways.
78  */
79 #define	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
80 	set	CH_DCACHE_IDX_MASK, scr3;				\
81 	and	afar, scr3, scr3;					\
82 	srlx	afar, CH_DCTAG_PA_SHIFT, scr2;				\
83 	b	1f;							\
84 	  or	scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */	\
85 	.align	128;							\
86 1:									\
87 	ldxa	[scr3]ASI_DC_TAG, scr1;		/* read tag */		\
88 	cmp	scr1, scr2;						\
89 	bne	4f;				/* not found? */	\
90 	  nop;								\
91 	stxa	scr3, [datap + CH_DC_IDX]%asi;	/* store index */	\
92 	stxa	scr1, [datap + CH_DC_TAG]%asi;	/* store tag */		\
93 	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
94 	ldxa	[scr3]ASI_DC_UTAG, scr1;	/* read utag */		\
95 	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
96 	stxa	scr1, [datap + CH_DC_UTAG]%asi;				\
97 	ldxa	[scr3]ASI_DC_SNP_TAG, scr1;	/* read snoop tag */	\
98 	stxa	scr1, [datap + CH_DC_SNTAG]%asi;			\
99 	add	datap, CH_DC_DATA, datap;				\
100 	clr	scr2;							\
101 2:									\
102 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
103 	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read data */		\
104 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
105 	stxa	scr1, [datap]%asi;					\
106 	add	datap, 8, datap;					\
107 	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
108 	blt	2b;							\
109 	  add	scr2, 8, scr2;						\
110 									\
111 	GET_CPU_IMPL(scr2);	/* Parity bits are elsewhere for */	\
112 	cmp	scr2, PANTHER_IMPL;	/* panther processors. */	\
113 	bne,a	5f;			/* Done if not panther. */	\
114 	  add	datap, 8, datap; /* Skip to the end of the struct. */	\
115 	clr	scr2;							\
116 	add	datap, 7, datap; /* offset of the last parity byte */	\
117 	mov	1, scr1;						\
118 	sll	scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1;		\
119 	or	scr3, scr1, scr3; /* add DC_data_parity bit to index */	\
120 3:									\
121 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
122 	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read parity bits */	\
123 	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
124 	stba	scr1, [datap]%asi;					\
125 	dec	datap;							\
126 	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
127 	blt	3b;							\
128 	  add	scr2, 8, scr2;						\
129 	b	5f;							\
130 	  add	datap, 5, datap; /* set pointer to end of our struct */	\
131 4:									\
132 	set	CH_DCACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
133 	add	scr3, scr1, scr3;					\
134 	set	CH_DCACHE_IDX_LIMIT, scr1;	/* done? */		\
135 	cmp	scr3, scr1;						\
136 	blt	1b;							\
137 	  nop;								\
138 	add	datap, CH_DC_DATA_SIZE, datap;				\
139 5:
140 
141 /*
142  * Macro version of get_icache_dtag.  We use this macro in the CPU
143  * logout code. If the Icache is on, we don't want to capture the data.
144  *   afar:  input AFAR, not modified.
145  *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
146  *   scr1:  scratch.
147  *   scr2:  scratch, will hold tag to look for.
148  *   scr3:  used for Icache index, loops through 4 ways.
149  * Note: For Panther, the Icache is virtually indexed and increases in
150  * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
151  * of 32). This means the IC_addr index bits[14:7] for Panther now
152  * correspond to VA bits[13:6]. But since it is virtually indexed, we
153  * still mask out only bits[12:5] from the AFAR (we have to manually
154  * check bit 13). In order to make this code work for all processors,
155  * we end up checking twice as many indexes (8 instead of 4) as required
156  * for non-Panther CPUs and saving off twice as much data (16 instructions
157  * instead of just 8).
158  */
159 #define	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
160 	ldxa	[%g0]ASI_DCU, scr1;					\
161 	btst	DCU_IC, scr1;		/* is Icache enabled? */	\
162 	bne,a	6f;			/* yes, don't capture */	\
163 	  add	datap, CH_IC_DATA_SIZE, datap;	/* anul if no branch */	\
164 	GET_CPU_IMPL(scr2);	/* Panther only uses VA[13:6] */	\
165 	cmp	scr2, PANTHER_IMPL;	/* and we also want to mask */	\
166 	be	1f;			/* out bit 13 since the */	\
167 	  nop;				/* Panther I$ is VIPT. */	\
168 	set	CH_ICACHE_IDX_MASK, scr3;				\
169 	b	2f;							\
170 	  nop;								\
171 1:									\
172 	set	PN_ICACHE_VA_IDX_MASK, scr3;				\
173 2:									\
174 	and	afar, scr3, scr3;					\
175 	sllx	scr3, CH_ICACHE_IDX_SHIFT, scr3;			\
176 	srlx	afar, CH_ICPATAG_SHIFT, scr2;	/* pa tag we want */	\
177 	andn	scr2, CH_ICPATAG_LBITS, scr2;	/* mask off lower */	\
178 	b	3f;							\
179 	  nop;								\
180 	.align	128;							\
181 3:									\
182 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read pa tag */	\
183 	andn	scr1, CH_ICPATAG_LBITS, scr1;	/* mask off lower */	\
184 	cmp	scr1, scr2;						\
185 	bne	5f;				/* not found? */	\
186 	  nop;								\
187 	stxa	scr3, [datap + CH_IC_IDX]%asi;	/* store index */	\
188 	stxa	scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */	\
189 	add	scr3, CH_ICTAG_UTAG, scr3;	/* read utag */		\
190 	ldxa	[scr3]ASI_IC_TAG, scr1;					\
191 	add	scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3;		\
192 	stxa	scr1, [datap + CH_IC_UTAG]%asi;				\
193 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read upper tag */	\
194 	add	scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3;		\
195 	stxa	scr1, [datap + CH_IC_UPPER]%asi;			\
196 	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read lower tag */	\
197 	andn	scr3, CH_ICTAG_TMASK, scr3;				\
198 	stxa	scr1, [datap + CH_IC_LOWER]%asi;			\
199 	ldxa	[scr3]ASI_IC_SNP_TAG, scr1;	/* read snoop tag */	\
200 	stxa	scr1, [datap + CH_IC_SNTAG]%asi;			\
201 	add	datap, CH_IC_DATA, datap;				\
202 	clr	scr2;							\
203 4:									\
204 	ldxa	[scr3 + scr2]ASI_IC_DATA, scr1;	/* read ins. data */	\
205 	stxa	scr1, [datap]%asi;					\
206 	add	datap, 8, datap;					\
207 	cmp	scr2, PN_IC_DATA_REG_SIZE - 8;				\
208 	blt	4b;							\
209 	  add	scr2, 8, scr2;						\
210 	b	6f;							\
211 	  nop;								\
212 5:									\
213 	set	CH_ICACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
214 	add	scr3, scr1, scr3;					\
215 	set	PN_ICACHE_IDX_LIMIT, scr1;	/* done? */		\
216 	cmp	scr3, scr1;						\
217 	blt	3b;							\
218 	  nop;								\
219 	add	datap, CH_IC_DATA_SIZE, datap;				\
220 6:
221 
222 #if defined(JALAPENO) || defined(SERRANO)
223 /*
224  * Macro version of get_ecache_dtag.  We use this macro in the
225  * CPU logout code.
226  *   afar:	input AFAR, not modified
227  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
228  *   ec_way:	Constant value (way number)
229  *   scr1:      Scratch
230  *   scr2:	Scratch.
231  *   scr3:	Scratch.
232  */
233 #define	GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3)		\
234 	mov	ec_way, scr1;						\
235 	and	scr1, JP_ECACHE_NWAY - 1, scr1;	/* mask E$ way bits */	\
236 	sllx	scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1;			\
237 	set	((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2;	\
238 	and	afar, scr2, scr3;		/* get set offset */	\
239 	andn	scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3; /* VA<5:0>=0 */	\
240 	or	scr3, scr1, scr3;		/* or WAY bits */	\
241 	b	1f;							\
242 	  stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
243 	.align	64;							\
244 1:									\
245 	JP_EC_DIAG_ACCESS_MEMBAR;					\
246 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
247 	JP_EC_DIAG_ACCESS_MEMBAR;					\
248 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
249 	add	datap, CH_EC_DATA, datap;				\
250 2:									\
251 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
252 	clr	scr1;							\
253 3:						/* loop thru 5 regs */	\
254 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
255 	stxa	scr2, [datap]%asi;					\
256 	add	datap, 8, datap;					\
257 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
258 	bne	3b;							\
259 	   add	scr1, 8, scr1;						\
260 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
261 	beq	2b;							\
262 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3
263 
/*
 * Jalapeno/Serrano: capture all four E$ ways for the AFAR line, then
 * advance datap past the remaining (unused) ch_ec_data_t slots so it
 * ends up just past the E$ portion of the logout structure.
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\

271 
272 /*
273  * Jalapeno does not have cores so these macros are null.
274  */
275 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
276 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
277 
278 #if defined(JALAPENO)
279 /*
280  * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
281  * the fatal error bits are cleared.
282  *	datap:		pointer to cpu logout structure.
283  *	afar:		returned primary AFAR value.
284  *	scr1:		scratch
285  *	scr2:		scratch
286  */
287 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
288 	ldxa	[%g0]ASI_AFAR, afar;					\
289 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
290 	ldxa	[%g0]ASI_AFSR, scr2;					\
291 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
292 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
293 	sllx	scr1, 32, scr1;						\
294 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
295 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
296 	membar	#Sync
297 
298 /*
299  * Jalapeno has no shadow AFAR, null operation.
300  */
301 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)
302 
303 #elif defined(SERRANO)
304 /*
305  * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
306  * the fatal error bits are cleared.  For Serrano, we also save the
307  * AFAR2 register.
308  *	datap:	pointer to cpu logout structure.
309  *	afar:	returned primary AFAR value.
310  *	scr1:	scratch
311  *	scr2:	scratch
312  */
313 #define GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
314 	set	ASI_MCU_AFAR2_VA, scr1;					\
315 	ldxa	[scr1]ASI_MCU_CTRL, afar;				\
316 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi;	\
317 	ldxa	[%g0]ASI_AFAR, afar;					\
318 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
319 	ldxa	[%g0]ASI_AFSR, scr2;					\
320 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
321 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
322 	sllx	scr1, 32, scr1;						\
323 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
324 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ 	\
325 	membar	#Sync
326 
327 /*
328  * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
329  *      afar:   scratch, holds afar2.
330  *      datap:  pointer to cpu logout structure
331  *      scr1:   scratch
332  *      scr2:   scratch
333  *      scr3:   scratch
334  */
335 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
336 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar;	\
337 	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;		\
338 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
339 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
340 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
341 	sub	datap, CH_CPU_LOGOUT_SIZE, datap
342 #endif /* SERRANO */
343 
344 #elif defined(CHEETAH_PLUS)
345 /*
346  * Macro version of get_ecache_dtag.  We use this macro in the
347  * CPU logout code.
348  *   afar:	input AFAR, not modified.
349  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
350  *   pn_way:	ecache way for panther (value = 0-3). For non-panther
351  *		cpus, this macro will be called with pn_way = 0.
352  *   scr1:	Scratch.
353  *   scr2:	Scratch.
354  *   scr3:	Scratch.
355  */
356 #define	GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3)		\
357 	mov	afar, scr3;						\
358 	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
359 	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
360 	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
361 	mov	pn_way, scr1;	/* panther L3$ is 4-way so we ...    */	\
362 	sllx	scr1, PN_L3_WAY_SHIFT, scr1;	/* need to mask...   */	\
363 	or	scr3, scr1, scr3;	/* in the way bits <24:23>.  */	\
364 	b	1f;							\
365 	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
366 	.align	64;							\
367 1:									\
368 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
369 	stxa     scr1, [datap + CH_EC_TAG]%asi;				\
370 	set	CHP_ECACHE_IDX_TAG_ECC, scr1;				\
371 	or	scr3, scr1, scr1;					\
372 	ldxa    [scr1]ASI_EC_DIAG, scr1;	/* get E$ tag ECC */	\
373 	stxa	scr1, [datap + CH_EC_TAG_ECC]%asi;			\
374 	add	datap, CH_EC_DATA, datap;				\
375 2:									\
376 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
377 	clr	scr1;							\
378 3:						/* loop thru 5 regs */	\
379 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
380 	stxa	scr2, [datap]%asi;					\
381 	add	datap, 8, datap;					\
382 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
383 	bne	3b;							\
384 	   add	scr1, 8, scr1;						\
385 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
386 	beq	2b;							\
387 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3
388 
389 /*
390  * If this is a panther, we need to make sure the sibling core is
391  * parked so that we avoid any race conditions during diagnostic
392  * accesses to the shared L2 and L3 caches.
393  * dcucr_reg:	This register will be used to keep track of whether
394  *		or not we need to unpark the core later.
395  *		It just so happens that we also use this same register
396  *		to keep track of our saved DCUCR value so we only touch
397  *		bit 4 of the register (which is a "reserved" bit in the
398  *		DCUCR) for keeping track of core parking.
399  * scr1:	Scratch register.
400  * scr2:	Scratch register.
401  */
402 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
403 	GET_CPU_IMPL(scr1);						\
404 	cmp	scr1, PANTHER_IMPL;	/* only park for panthers */	\
405 	bne,a	%xcc, 2f;						\
406 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
407 	set	ASI_CORE_RUNNING_STATUS, scr1;	/* check other core */	\
408 	ldxa	[scr1]ASI_CMP_SHARED, scr2;	/* is it running?   */	\
409 	cmp	scr2, PN_BOTH_CORES_RUNNING;				\
410 	bne,a	%xcc, 2f;	/* if not running, we are done */	\
411 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
412 	or	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
413 	set	ASI_CORE_ID, scr1;					\
414 	ldxa	[scr1]ASI_CMP_PER_CORE, scr2;				\
415 	and	scr2, COREID_MASK, scr2;				\
416 	or	%g0, 1, scr1;		/* find out which core... */	\
417 	sll	scr1, scr2, scr2;	/* ... we need to park... */	\
418 1:									\
419 	set	ASI_CORE_RUNNING_RW, scr1;				\
420 	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ... and park it. */	\
421 	membar	#Sync;				/* spin until the... */	\
422 	ldxa	[scr1]ASI_CMP_SHARED, scr1;	/* ... the other...  */	\
423 	cmp	scr1, scr2;	/* ...core is parked according to... */	\
424 	bne,a	%xcc, 1b;	/* ...the core running status reg.  */	\
425 	  nop;								\
426 2:
427 
428 /*
429  * The core running this code will unpark its sibling core if the
430  * sibling core had been parked by the current core earlier in this
431  * trap handler.
432  * dcucr_reg:	This register is used to keep track of whether or not
433  *		we need to unpark our sibling core.
434  *		It just so happens that we also use this same register
435  *		to keep track of our saved DCUCR value so we only touch
436  *		bit 4 of the register (which is a "reserved" bit in the
437  *		DCUCR) for keeping track of core parking.
438  * scr1:	Scratch register.
439  * scr2:	Scratch register.
440  */
441 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
442 	btst	PN_PARKED_OTHER_CORE, dcucr_reg;			\
443 	bz,pt	%xcc, 1f;	/* if nothing to unpark, we are done */	\
444 	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
445 	set	ASI_CORE_RUNNING_RW, scr1;				\
446 	set	PN_BOTH_CORES_RUNNING, scr2;	/* we want both...   */	\
447 	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ...cores running. */	\
448 	membar	#Sync;							\
449 1:
450 
451 /*
452  * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
453  * in the primary AFSR are cleared except the fatal error bits.  For Panther,
454  * we also have to read and clear the AFSR_EXT, again leaving the fatal
455  * error bits alone.
456  *	datap:		pointer to cpu logout structure.
457  *	afar:		returned primary AFAR value.
458  *	scr1:		scratch
459  *	scr2:		scratch
460  */
461 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
462 	set	ASI_SHADOW_REG_VA, scr1;				\
463 	ldxa	[scr1]ASI_AFAR, scr2;					\
464 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi;	\
465 	ldxa	[scr1]ASI_AFSR, scr2;					\
466 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi;	\
467 	ldxa	[%g0]ASI_AFAR, afar;					\
468 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
469 	ldxa	[%g0]ASI_AFSR, scr2;					\
470 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
471 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
472 	sllx	scr1, 32, scr1;						\
473 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ 	\
474 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
475 	membar	#Sync;							\
476 	GET_CPU_IMPL(scr1);						\
477 	cmp	scr1, PANTHER_IMPL;					\
478 	bne	%xcc, 1f;						\
479 	   nop;								\
480 	set	ASI_SHADOW_AFSR_EXT_VA, scr1;	/* shadow AFSR_EXT */	\
481 	ldxa	[scr1]ASI_AFSR, scr2;					\
482 	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
483 	set	ASI_AFSR_EXT_VA, scr1;		/* primary AFSR_EXT */	\
484 	ldxa	[scr1]ASI_AFSR, scr2;					\
485 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi;	\
486 	set	C_AFSR_EXT_FATAL_ERRS, scr1;				\
487 	bclr	scr1, scr2;	/* Clear fatal error bits here, */	\
488 	set	ASI_AFSR_EXT_VA, scr1;	/* so they're left */		\
489 	stxa	scr2, [scr1]ASI_AFSR;	/* as is in AFSR_EXT */		\
490 	membar	#Sync;							\
491 1:
492 
493 /*
494  * This macro is used in the CPU logout code to capture diagnostic
495  * information from the L2 cache on panther processors.
496  *   afar:	input AFAR, not modified.
497  *   datap:	Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
498  *   scr1:	Scratch.
499  *   scr2:	Scratch.
500  *   scr3:	Scratch.
501  */
502 #define	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3)		\
503 	mov	afar, scr3;						\
504 	set	PN_L2_INDEX_MASK, scr1;					\
505 	and	scr3, scr1, scr3;					\
506 	b	1f;	/* code to read tags and data should be ...  */	\
507 	   nop;		/* ...on the same cache line if possible.    */	\
508 	.align	128;	/* update this line if you add lines below. */	\
509 1:									\
510 	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store L2$ index  */	\
511 	ldxa	[scr3]ASI_L2_TAG, scr1;		/* read the L2$ tag */	\
512 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
513 	add	datap, CH_EC_DATA, datap;				\
514 	clr	scr1;							\
515 2:									\
516 	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
517 	stxa	scr2, [datap]%asi;		/* <511:256> of L2  */	\
518 	add	datap, 8, datap;		/* data and record  */	\
519 	cmp	scr1, (PN_L2_LINESIZE / 2) - 8;	/* it in the cpu    */	\
520 	bne	2b;				/* logout struct.   */	\
521 	  add	scr1, 8, scr1;						\
522 	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
523 	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
524 	stxa	scr2, [datap]%asi;		/* ecc of <511:256> */	\
525 	add	datap, 8, datap;					\
526 3:									\
527 	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
528 	stxa	scr2, [datap]%asi;		/* <255:0> of L2    */	\
529 	add	datap, 8, datap;		/* data and record  */	\
530 	cmp	scr1, PN_L2_LINESIZE - 8;	/* it in the cpu    */	\
531 	bne	3b;				/* logout struct.   */	\
532 	  add	scr1, 8, scr1;						\
533 	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
534 	add	scr2, PN_L2_ECC_LO_REG, scr2;				\
535 	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
536 	stxa	scr2, [datap]%asi;		/* ecc of <255:0>.  */	\
537 	add	datap, 8, datap;		/* Advance pointer  */	\
538 	set	PN_L2_SET_SIZE, scr2;					\
539 	set	PN_L2_MAX_SET, scr1;					\
540 	cmp	scr1, scr3;	/* more ways to try for this line? */	\
541 	bg,a	%xcc, 1b;	/* if so, start over with next way */	\
542 	  add	scr3, scr2, scr3
543 
544 /*
545  * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
546  *	afar:	AFAR from access.
547  *	datap:	pointer to cpu logout structure.
548  *	scr1:	scratch
549  *	scr2:	scratch
550  *	scr3:	scratch
551  */
552 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
553 	GET_CPU_IMPL(scr1);						\
554 	cmp	scr1, PANTHER_IMPL;					\
555 	bne	%xcc, 4f;						\
556 	  nop;								\
557 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
558 	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
559 	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
560 	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
561 	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
562 	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
563 	b	5f;							\
564 	  nop;								\
565 4:									\
566 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
567 	GET_ECACHE_WAY_BIT(scr1, scr2);					\
568 	xor	afar, scr1, afar;					\
569 	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
570 	GET_ECACHE_WAY_BIT(scr1, scr2);		/* restore AFAR */	\
571 	xor	afar, scr1, afar;					\
572 	add	datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap;	\
573 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\
574 5:
575 
576 /*
577  * Cheetah+ needs to capture E$, D$ and I$ lines associated with
578  * shadow afar.
579  *	afar:	scratch, holds shadow afar.
580  *	datap:	pointer to cpu logout structure
581  *	scr1:	scratch
582  *	scr2:	scratch
583  *	scr3:	scratch
584  */
585 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)		\
586 	ldxa	[datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar;	\
587 	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;	\
588 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
589 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
590 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
591 	sub	datap, CH_CPU_LOGOUT_SIZE, datap
592 
593 /*
594  * Compute the "Way" bit for 2-way Ecache for Cheetah+.
595  */
596 #define	GET_ECACHE_WAY_BIT(scr1, scr2)					\
597 	CPU_INDEX(scr1, scr2);						\
598 	mulx	scr1, CPU_NODE_SIZE, scr1;				\
599 	add	scr1, ECACHE_SIZE, scr1;				\
600 	set	cpunodes, scr2;						\
601 	ld	[scr1 + scr2], scr1;					\
602 	srlx	scr1, 1, scr1
603 
604 #else /* CHEETAH_PLUS */
605 /*
606  * Macro version of get_ecache_dtag.  We use this macro in the
607  * CPU logout code.
608  *   afar:	input AFAR, not modified.
609  *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
610  *   scr1:      Scratch.
611  *   scr2:	Scratch.
612  *   scr3:	Scratch.
613  */
614 #define	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
615 	mov	afar, scr3;						\
616 	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
617 	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
618 	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
619 	b	1f;							\
620 	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
621 	.align	64;							\
622 1:									\
623 	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
624 	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
625 	add	datap, CH_EC_DATA, datap;				\
626 2:									\
627 	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
628 	clr	scr1;							\
629 3:						/* loop thru 5 regs */	\
630 	ldxa	[scr1]ASI_EC_DATA, scr2;				\
631 	stxa	scr2, [datap]%asi;					\
632 	add	datap, 8, datap;					\
633 	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
634 	bne	3b;							\
635 	   add	scr1, 8, scr1;						\
636 	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
637 	beq	2b;							\
638 	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3
639 
640 /*
641  * Cheetah does not have cores so these macros are null.
642  */
643 #define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
644 #define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
645 
646 /*
647  * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
648  * fatal error bits.
649  *	datap:		pointer to cpu logout structure.
650  *	afar:		returned primary AFAR value.
651  *	scr1:		scratch
652  *	scr2:		scratch
653  */
654 #define	GET_AFSR_AFAR(datap, afar, scr1, scr2)	\
655 	ldxa	[%g0]ASI_AFAR, afar;					\
656 	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
657 	ldxa	[%g0]ASI_AFSR, scr2;					\
658 	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
659 	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
660 	sllx	scr1, 32, scr1;						\
661 	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
662 	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
663 	membar	#Sync
664 
665 /*
666  * Cheetah E$ is direct-mapped, so we grab line data and skip second line.
667  *	afar:	AFAR from access.
668  *	datap:	pointer to cpu logout structure.
669  *	scr1:	scratch
670  *	scr2:	scratch
671  *	scr3:	scratch
672  */
673 #define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
674 	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
675 	add	datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap;	\
676 	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\
677 
678 /*
679  * Cheetah has no shadow AFAR, null operation.
680  */
681 #define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)
682 
683 #endif	/* CHEETAH_PLUS */
684 
685 /*
686  * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
687  * logout data at TL>0. r_val is a register that returns the "failure count"
688  * to the caller, and may be used as a scratch register until the end of
689  * the macro.  afar is used to return the primary AFAR value to the caller
690  * and it too can be used as a scratch register until the end. r_or_s is
691  * a reg or symbol that has the offset within the "cpu_private" data area
692  * to deposit the logout data.  t_flags is a register that has the
693  * trap-type/trap-level/CEEN info. This t_flags register may be used after
694  * the GET_AFSR_AFAR macro.
695  *
696  * The CPU logout operation will fail (r_val > 0) if the logout
697  * structure in question is already being used. Otherwise, the CPU
698  * logout operation will succeed (r_val = 0). For failures, r_val
699  * returns the busy count (# of times we tried using this CPU logout
700  * structure when it was busy.)
701  *
702  *   Register usage:
703  *	%asi:   Must be set to either ASI_MEM if the address in datap
704  *		is a physical address or to ASI_N if the address in
705  *		datap is a virtual address.
706  *	r_val:	This register is the return value which tells the
707  *		caller whether or not the LOGOUT operation was successful.
708  *		For failures, r_val returns the fail count (i.e. number of
709  *		times we have tried to use this logout structure when it was
710  *		already being used.
711  *	afar:	output: contains AFAR on exit
712  *	t_flags: input trap type info, may be used as scratch after stored
713  *		to cpu log out structure.
714  *	datap:	Points to log out data area.
715  *	scr1:	Scratch
716  *	scr2:	Scratch (may be r_val)
717  *	scr3:   Scratch (may be t_flags)
718  */
719 #define	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
720 	setx	LOGOUT_INVALID, scr2, scr1;				\
721 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2;	\
722 	cmp	scr2, scr1;						\
723 	bne	8f;							\
724 	  nop;								\
725 	stxa	t_flags, [datap + CH_CLO_FLAGS]%asi;			\
726 	GET_AFSR_AFAR(datap, afar, scr1, scr2);				\
727 	add	datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap;		\
728 	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
729 	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
730 	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
731 	sub	datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap;		\
732 	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3);			\
733 	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar;	\
734 	set	0, r_val;	/* return value for success */		\
735 	ba	9f;							\
736 	  nop;								\
737 8:									\
738 	ldxa	[%g0]ASI_AFAR, afar;					\
739 	ldxa	[datap + CH_CLO_NEST_CNT]%asi, r_val;			\
740 	inc	r_val;		/* return value for failure */		\
741 	stxa	r_val, [datap + CH_CLO_NEST_CNT]%asi;			\
742 	membar	#Sync;							\
743 9:
744 
745 /*
746  * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
747  * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
748  * up the expected data pointer in the scr1 register and sets the %asi
749  * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
750  * is used at TL>0.
751  *
752  * The CPU logout operation will fail (r_val > 0) if the logout
753  * structure in question is already being used. Otherwise, the CPU
754  * logout operation will succeed (r_val = 0). For failures, r_val
755  * returns the busy count (# of times we tried using this CPU logout
756  * structure when it was busy.)
757  *
758  *   Register usage:
759  *	r_val:	This register is the return value which tells the
760  *		caller whether or not the LOGOUT operation was successful.
761  *		For failures, r_val returns the fail count (i.e. number of
762  *		times we have tried to use this logout structure when it was
763  *		already being used.
764  *	afar:	returns AFAR, used internally as afar value.
765  *		output: if the cpu_private struct has not been initialized,
766  *		        then we return the t_flags value listed below.
767  *	r_or_s:	input offset, either register or constant (symbol).  It's
768  *		OK for r_or_s to be a register as long as it's not scr1 or
769  *		scr3.
770  *	t_flags: input trap type info, may be used as scratch after stored
771  *		to cpu log out structure.
772  *	scr1:	Scratch, points to log out data area.
773  *	scr2:	Scratch (may be r_or_s)
774  *	scr3:	Scratch (may be r_val)
775  *	scr4:   Scratch (may be t_flags)
776  */
#define	DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f); /* can't use scr2/4 */ \
	wr	%g0, ASI_N, %asi;	/* TL=0: kernel nucleus VA, not ASI_MEM */ \
	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4)	\
	ba	6f;							\
	  nop;								\
7:	/* cpu_private ptr was null/zero: no logout area available */	\
	mov	t_flags, afar;		/* depends on afar = %g2  */	\
	set	0, r_val;		/* success in this case.  */	\
6:
787 
788 /*
789  * The P$ is flushed as a side effect of writing to the Primary
790  * or Secondary Context Register. After writing to a context
791  * register, every line of the P$ in the Valid state is invalidated,
792  * regardless of which context it belongs to.
793  * This routine simply touches the Primary context register by
794  * reading the current value and writing it back. The Primary
795  * context is not changed.
796  */
#define	PCACHE_FLUSHALL(tmp1, tmp2, tmp3)				\
	sethi	%hi(FLUSH_ADDR), tmp1	/* flush target for the IC */	;\
	set	MMU_PCONTEXT, tmp2					;\
	ldxa	[tmp2]ASI_DMMU, tmp3	/* read Primary Context reg */	;\
	stxa	tmp3, [tmp2]ASI_DMMU	/* write back unchanged: P$ flushed */ ;\
	flush	tmp1	/* See Cheetah PRM 8.10.2 */
803 
804 /*
805  * Macro that flushes the entire Dcache.
806  *
807  * arg1 = dcache size
808  * arg2 = dcache linesize
809  */
#define	CH_DCACHE_FLUSHALL(arg1, arg2, tmp1)				\
	sub	arg1, arg2, tmp1;	/* tmp1 = addr of last D$ line */ \
1:									\
	stxa	%g0, [tmp1]ASI_DC_TAG;	/* invalidate the tag */	\
	membar	#Sync;							\
	cmp	%g0, tmp1;		/* done when tmp1 reaches 0 */	\
	bne,pt	%icc, 1b;						\
	  sub	tmp1, arg2, tmp1;	/* delay: step down one line */
818 
819 /*
820  * Macro that flushes the entire Icache.
821  *
822  * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
823  * because accesses to ASI 0x67 interfere with Icache coherency.  We
824  * must make sure the Icache is off, then turn it back on after the entire
825  * cache has been invalidated.  If the Icache is originally off, we'll just
826  * clear the tags but not turn the Icache on.
827  *
828  * arg1 = icache size
829  * arg2 = icache linesize
830  */
#define	CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)			\
	ldxa	[%g0]ASI_DCU, tmp2;	/* save DCU (original I$ state) */ \
	andn	tmp2, DCU_IC, tmp1;	/* clear the IC enable bit */	\
	stxa	tmp1, [%g0]ASI_DCU;	/* I$ off for ASI_IC_TAG use */	\
	flush	%g0;	/* flush required after changing the IC bit */	\
	sllx	arg2, 1, arg2;		/* arg2 = linesize * 2 */	\
	sllx	arg1, 1, arg1;		/* arg1 = size * 2 */		\
	sub	arg1, arg2, arg1;	/* addr of last tag entry */	\
	or	arg1, CH_ICTAG_LOWER, arg1;	/* "write" tag */	\
1:									\
	stxa	%g0, [arg1]ASI_IC_TAG;	/* invalidate the tag */	\
	membar	#Sync;				/* Cheetah PRM 8.9.3 */	\
	cmp	arg1, CH_ICTAG_LOWER;	/* done at the lowest tag */	\
	bne,pt	%icc, 1b;						\
	  sub	arg1, arg2, arg1;	/* delay: step down one line */	\
	stxa	tmp2, [%g0]ASI_DCU;	/* restore original I$ state */	\
	flush	%g0;	/* flush required after changing the IC bit */
848 
849 
850 #if defined(JALAPENO) || defined(SERRANO)
851 
852 /*
853  * ASI access to the L2 tag or L2 flush can hang the cpu when interacting
854  * with combinations of L2 snoops, victims and stores.
855  *
856  * A possible workaround is to surround each L2 ASI access with membars
857  * and make sure that the code is hitting in the Icache.  This requires
858  * aligning code sequence at E$ boundary and forcing I$ fetch by
859  * jumping to selected offsets so that we don't take any I$ misses
860  * during ASI access to the L2 tag or L2 flush.  This also requires
861  * making sure that we don't take any interrupts or traps (such as
862  * fast ECC trap, I$/D$ tag parity error) which can result in eviction
863  * of this code sequence from I$, thus causing a miss.
864  *
865  * Because of the complexity/risk, we have decided to do a partial fix
866  * of adding membar around each ASI access to the L2 tag or L2 flush.
867  */
868 
#define	JP_EC_DIAG_ACCESS_MEMBAR	\
	membar	#Sync	/* fence around each L2 diag ASI access; see above */
871 
872 /*
873  * Jalapeno version of macro that flushes the entire Ecache.
874  *
875  * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
876  *
877  * arg1 = ecache size
878  * arg2 = ecache linesize - not modified; can be an immediate constant.
879  */
#define	ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)	\
	CPU_INDEX(tmp1, tmp2);		/* tmp1 = this cpu's index */	\
	set	JP_ECACHE_IDX_DISP_FLUSH, tmp2;				\
	sllx	tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1;			\
	or	tmp1, tmp2, tmp1;	/* flush base for this cpu */	\
	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2;	/* set size */	\
1:									\
	subcc	tmp2, arg2, tmp2;	/* next line; neg => set done */ \
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa	[tmp1 + tmp2]ASI_EC_DIAG, %g0;	/* disp-flush line */	\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	bg,pt	%xcc, 1b;						\
	  nop;								\
	mov	1, tmp2;						\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	add	tmp1, tmp2, tmp1;	/* advance to the next way */	\
	mov	(JP_ECACHE_NWAY-1), tmp2;				\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	andcc	tmp1, tmp2, tmp2;	/* any way bits still set? */	\
	bnz,pt	%xcc, 1b;		/* yes: flush that way too */	\
	  srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2	/* delay: reset count */
901 
902 #else	/* JALAPENO || SERRANO */
903 
904 /*
905  * Cheetah version of macro that flushes the entire Ecache.
906  *
907  *  Need to displacement flush 2x ecache size from Ecache flush area.
908  *
909  * arg1 = ecache size
910  * arg2 = ecache linesize
911  * arg3 = ecache flush address - for cheetah only
912  */
#define	CH_ECACHE_FLUSHALL(arg1, arg2, arg3)				\
	sllx	arg1, 1, arg1;	/* must flush 2x E$ size (see above) */	\
1:									\
	subcc	arg1, arg2, arg1;	/* step down one linesize */	\
	bg,pt	%xcc, 1b;						\
	  ldxa	[arg1 + arg3]ASI_MEM, %g0;	/* delay: load from flush area */
919 
920 /*
921  * Cheetah+ version of macro that flushes the entire Ecache.
922  *
923  * Uses the displacement flush feature.
924  *
925  * arg1 = ecache size
926  * arg2 = ecache linesize
927  * impl = CPU implementation as returned from GET_CPU_IMPL()
928  *        The value in this register is destroyed during execution
929  *        of the macro.
930  */
#if defined(CHEETAH_PLUS)
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)				\
	cmp	impl, PANTHER_IMPL;	/* pick per-impl flush ASI VA */ \
	bne	%xcc, 1f;						\
	  nop;								\
	set	PN_L3_IDX_DISP_FLUSH, impl;	/* Panther: L3 flush */	\
	b	2f;							\
	  nop;								\
1:									\
	set	CHP_ECACHE_IDX_DISP_FLUSH, impl;	/* Cheetah+ */	\
2:									\
	subcc	arg1, arg2, arg1;	/* step down one linesize */	\
	bg,pt	%xcc, 2b;						\
	  ldxa	[arg1 + impl]ASI_EC_DIAG, %g0;	/* delay: disp flush */
#else	/* CHEETAH_PLUS */
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)	/* no-op if !CHEETAH_PLUS */
#endif	/* CHEETAH_PLUS */
948 
949 /*
950  * Macro that flushes the entire Ecache.
951  *
952  * arg1 = ecache size
953  * arg2 = ecache linesize
954  * arg3 = ecache flush address - for cheetah only
955  */
#define	ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1)				\
	GET_CPU_IMPL(tmp1);		/* dispatch on implementation */ \
	cmp	tmp1, CHEETAH_IMPL;					\
	bne	%xcc, 2f;						\
	  nop;								\
	CH_ECACHE_FLUSHALL(arg1, arg2, arg3);	/* Cheetah: disp area */ \
	ba	3f;							\
	  nop;								\
2:									\
	CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1);	/* Cheetah+/Panther */	\
3:
967 
968 #endif	/* JALAPENO || SERRANO */
969 
970 /*
971  * Macro that flushes the Panther L2 cache.
972  */
#if defined(CHEETAH_PLUS)
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)				\
	GET_CPU_IMPL(scr3);						\
	cmp	scr3, PANTHER_IMPL;	/* no-op on non-Panther cpus */	\
	bne	%xcc, 2f;						\
	  nop;								\
	set	PN_L2_SIZE, scr1;					\
	set	PN_L2_LINESIZE, scr2;					\
	set	PN_L2_IDX_DISP_FLUSH, scr3;				\
1:									\
	subcc	scr1, scr2, scr1;	/* step down one linesize */	\
	bg,pt	%xcc, 1b;						\
	  ldxa	[scr1 + scr3]ASI_L2_TAG, %g0;	/* delay: disp flush */	\
2:
#else	/* CHEETAH_PLUS */
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)	/* no-op if !CHEETAH_PLUS */
#endif	/* CHEETAH_PLUS */
990 
991 /*
992  * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
993  * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB. Aside from the 16 entry fully associative
995  * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
996  *
997  * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
998  * mask out all but the lower 8 bits because:
999  *
1000  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for   8K
1001  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for  64K
1002  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
1003  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for   4M
1004  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for  32M
1005  *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
1006  *
1007  * and
1008  *
1009  *    array index for   8K pages = VA[20:13]
1010  *    array index for  64K pages = VA[23:16]
1011  *    array index for 512K pages = VA[26:19]
1012  *    array index for   4M pages = VA[29:22]
1013  *    array index for  32M pages = VA[32:25]
1014  *    array index for 256M pages = VA[35:28]
1015  *
1016  * Inputs:
1017  *
1018  *    va	- Register.
1019  *		  Input: Virtual address in which we are interested.
1020  *		  Output: TLB index value.
1021  *    pg_sz	- Register. Page Size of the TLB in question as encoded
1022  *		  in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
1023  */
#if defined(CHEETAH_PLUS)
#define	PN_GET_TLB_INDEX(va, pg_sz)					\
	srlx	va, 13, va;	/* first shift the 13 bits and then */	\
	srlx	va, pg_sz, va;	/* shift by pg_sz three times. */	\
	srlx	va, pg_sz, va;	/* net shift is 13 + 3 * pg_sz */	\
	srlx	va, pg_sz, va;						\
	and	va, 0xff, va;	/* mask out all but the lower 8 bits */
#endif	/* CHEETAH_PLUS */
1032 
1033 /*
1034  * The following macros are for error traps at TL>0.
1035  * The issue with error traps at TL>0 is that there are no safely
1036  * available global registers.  So we use the trick of generating a
1037  * software trap, then using the %tpc, %tnpc and %tstate registers to
1038  * temporarily save the values of %g1 and %g2.
1039  */
1040 
1041 /*
1042  * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
1043  * Does the following steps:
1044  *	1. membar #Sync - required for USIII family errors.
1045  *	2. Specified software trap.
1046  * NB: Must be 8 instructions or less to fit in trap table and code must
1047  *     be relocatable.
1048  */
#define	CH_ERR_TL1_TRAPENTRY(trapno)		\
	membar	#Sync;	/* required for USIII family errors */	\
	ta	trapno;	/* sw trap saves %g1/%g2, see SWTRAPENTRY */	\
	nop; nop; nop; nop; nop; nop	/* pad the 8-insn table slot */
1053 
1054 /*
1055  * Macro to generate 8-instruction trap table entry for TL>0 software trap.
1056  * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
1057  * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
1058  * we need to put the low-order two bits of %g1 and %g2 in %tstate).
1059  * Note that %tstate has a reserved hole from bits 3-7, so we put the
1060  * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
1062  * state bits).  Note that we must do a jmp instruction, since this
1063  * is moved into the trap table entry.
1064  * NB: Must be 8 instructions or less to fit in trap table and code must
1065  *     be relocatable.
1066  */
#define	CH_ERR_TL1_SWTRAPENTRY(label)		\
	wrpr	%g1, %tpc;	/* stash %g1 (bits 1:0 read as zero) */	\
	and	%g1, 3, %g1;			\
	wrpr	%g2, %tnpc;	/* stash %g2 (bits 1:0 read as zero) */	\
	sllx	%g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
	or	%g1, %g2, %g2;	/* low 2 bits of %g1/%g2 for %tstate */	\
	sethi	%hi(label), %g1;		\
	jmp	%g1+%lo(label);	/* jmp: code is moved into trap table */ \
	  wrpr	%g2, %tstate	/* delay: save the low bits */
1076 
1077 /*
1078  * Macro to get ptr to ch_err_tl1_data.
1079  * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
1080  * will point to a kernel nucleus virtual address with ASI_N in %asi.
1081  * This allows us to:
1082  *   1. Avoid getting MMU misses.  We may have gotten the original
1083  *	Fast ECC error in an MMU handler and if we get an MMU trap
1084  *	in the TL>0 handlers, we'll scribble on the MMU regs.
1085  *   2. Allows us to use the same code in the TL>0 handlers whether
1086  *	we're accessing kernel nucleus virtual addresses or physical
1087  *	addresses.
1088  * pseudo-code:
1089  *	reg1 <- ch_err_tl1_paddrs[CPUID];
1090  *	if (reg1 == NULL) {
1091  *		reg1 <- &ch_err_tl1_data
1092  *		%asi <- ASI_N
1093  *	} else {
1094  *		reg1 <- reg1 + offset +
1095  *		    sizeof (ch_err_tl1_data) * (%tl - 3)
1096  *		%asi <- ASI_MEM
1097  *	}
1098  */
#define	GET_CH_ERR_TL1_PTR(reg1, reg2, offset)	\
	CPU_INDEX(reg1, reg2);			\
	sllx	reg1, 3, reg1;	/* 8-byte stride into paddrs[] */	\
	set	ch_err_tl1_paddrs, reg2;	\
	ldx	[reg1+reg2], reg1;	/* reg1 = paddrs[CPUID] */	\
	brnz	reg1, 1f;			\
	add	reg1, offset, reg1;	/* delay: reg1 += offset */	\
	set	ch_err_tl1_data, reg1;	/* null: fall back to static */	\
	ba	2f;				\
	wr	%g0, ASI_N, %asi;	/* delay: nucleus VA access */	\
1:	rdpr	%tl, reg2;			\
	sub	reg2, 3, reg2;	/* per-%tl slot index: (%tl - 3) */	\
	mulx	reg2, CH_ERR_TL1_DATA_SIZE, reg2;	\
	add	reg1, reg2, reg1;		\
	wr	%g0, ASI_MEM, %asi;	/* physical address access */	\
2:
1115 
1116 /*
1117  * Macro to generate entry code for TL>0 error handlers.
1118  * At the end of this macro, %g1 will point to the ch_err_tl1_data
1119  * structure and %g2 will have the original flags in the ch_err_tl1_data
1120  * structure and %g5 will have the value of %tstate where the Fast ECC
1121  * routines will save the state of the D$ in Bit2 CH_ERR_TSTATE_DC_ON.
1122  * All %g registers except for %g1, %g2 and %g5 will be available after
1123  * this macro.
1124  * Does the following steps:
1125  *   1. Compute physical address of per-cpu/per-tl save area using
1126  *	only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
1127  *	leaving address in %g1 and updating the %asi register.
1128  *	If there is no data area available, we branch to label.
1129  *   2. Save %g3-%g7 in save area.
1130  *   3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
1131  *	original %g1+%g2 values (because we're going to change %tl).
1132  *   4. set %tl <- %tl - 1.  We do this ASAP to make window of
1133  *	running at %tl+1 as small as possible.
1134  *   5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
1135  *	%tstate (%g5) and save in save area, carefully preserving %g5
1136  *	because it has the CH_ERR_TSTATE_DC_ON value.
1137  *   6. Load existing ch_err_tl1_data flags in %g2
1138  *   7. Compute the new flags
1139  *   8. If %g2 is non-zero (the structure was busy), shift the new
1140  *	flags by CH_ERR_ME_SHIFT and or them with the old flags.
1141  *   9. Store the updated flags into ch_err_tl1_data flags.
1142  *   10. If %g2 is non-zero, read the %tpc and store it in
1143  *	ch_err_tl1_data.
1144  */
#define	CH_ERR_TL1_ENTER(flags)			\
	GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA);	\
	stxa	%g3, [%g1 + CH_ERR_TL1_G3]%asi;	/* step 2: save %g3-%g7 */ \
	stxa	%g4, [%g1 + CH_ERR_TL1_G4]%asi;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_G5]%asi;	\
	stxa	%g6, [%g1 + CH_ERR_TL1_G6]%asi;	\
	stxa	%g7, [%g1 + CH_ERR_TL1_G7]%asi;	\
	rdpr	%tpc, %g3;	/* original %g1 (sans low 2 bits) */	\
	rdpr	%tnpc, %g4;	/* original %g2 (sans low 2 bits) */	\
	rdpr	%tstate, %g5;	/* low bits of %g1/%g2 + D$/I$ state */	\
	rdpr	%tl, %g6;			\
	sub	%g6, 1, %g6;			\
	wrpr	%g6, %tl;	/* step 4: drop to %tl - 1 ASAP */	\
	and	%g5, 3, %g6;	/* %g1's low 2 bits from %tstate */	\
	andn	%g3, 3, %g3;			\
	or	%g3, %g6, %g3;	/* reconstituted original %g1 */	\
	stxa	%g3, [%g1 + CH_ERR_TL1_G1]%asi;	\
	srlx	%g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6;	\
	and	%g6, 3, %g6;	/* %g2's low 2 bits from %tstate */	\
	andn	%g4, 3, %g4;			\
	or	%g6, %g4, %g4;	/* reconstituted original %g2 */	\
	stxa	%g4, [%g1 + CH_ERR_TL1_G2]%asi;	\
	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	/* old flags */	\
	set	flags | CH_ERR_TL, %g3;	/* the new flags */	\
	brz	%g2, 9f;	/* not busy: store new flags as-is */	\
	sllx	%g3, CH_ERR_ME_SHIFT, %g4;	/* delay: shifted flags */ \
	or	%g2, %g4, %g3;	/* busy: merge as multi-error flags */	\
9:	stxa	%g3, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	brnz	%g2, 8f;	/* busy: keep the first error's %tpc */	\
	rdpr	%tpc, %g4;			\
	stxa	%g4, [%g1 + CH_ERR_TL1_TPC]%asi;	\
8:
1177 
1178 /*
1179  * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
1180  * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
1181  * at TL>0 handlers because the D$ may have corrupted data and we need to
1182  * turn off the I$ to allow for diagnostic accesses.  We then invoke
1183  * the normal entry macro and after it is done we save the values of
1184  * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
1185  * CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
1186  */
#define	CH_ERR_TL1_FECC_ENTER			\
	ldxa	[%g0]ASI_DCU, %g1;	/* current D$/I$ enable bits */	\
	andn	%g1, DCU_DC + DCU_IC, %g2;	\
	stxa	%g2, [%g0]ASI_DCU;	/* turn D$ and I$ off */	\
	flush	%g0;	/* DCU_IC need flush */	\
	rdpr	%tstate, %g2;			\
	and	%g1, DCU_DC + DCU_IC, %g1;	\
	sllx	%g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1;	\
	or	%g1, %g2, %g2;	/* stash old D$/I$ state in %tstate */	\
	wrpr	%g2, %tstate;			\
	CH_ERR_TL1_ENTER(CH_ERR_FECC);	/* %g5 = saved %tstate */	\
	and	%g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_TMP]%asi	/* record D$/I$ state */
1200 
1201 /*
1202  * Macro to generate exit code for TL>0 error handlers.
1203  * We fall into this macro if we've successfully logged the error in
1204  * the ch_err_tl1_data structure and want the PIL15 softint to pick
1205  * it up and log it.
1206  * Does the following steps:
1207  *   1.	Set pending flag for this cpu in ch_err_tl1_pending.
1208  *   2.	Write %set_softint with (1<<pil) to cause a pil level trap
1209  *   3.	Restore registers from ch_err_tl1_data, which is pointed to
1210  *	by %g1, last register to restore is %g1 since it's pointing
1211  *	to the save area.
1212  *   4. Execute retry
1213  */
#define	CH_ERR_TL1_EXIT				\
	CPU_INDEX(%g2, %g3);			\
	set	ch_err_tl1_pending, %g3;	\
	set	-1, %g4;			\
	stb	%g4, [%g2 + %g3];	/* mark this cpu as pending */	\
	mov	1, %g2;				\
	sll	%g2, PIL_15, %g2;		\
	wr	%g2, SET_SOFTINT;	/* post the PIL15 softint */	\
	ldxa	[%g1 + CH_ERR_TL1_G7]%asi, %g7;	/* restore %g7..%g2 */	\
	ldxa	[%g1 + CH_ERR_TL1_G6]%asi, %g6;	\
	ldxa	[%g1 + CH_ERR_TL1_G5]%asi, %g5;	\
	ldxa	[%g1 + CH_ERR_TL1_G4]%asi, %g4;	\
	ldxa	[%g1 + CH_ERR_TL1_G3]%asi, %g3;	\
	ldxa	[%g1 + CH_ERR_TL1_G2]%asi, %g2;	\
	ldxa	[%g1 + CH_ERR_TL1_G1]%asi, %g1;	/* %g1 last: it's the ptr */ \
	retry
1230 
1231 /*
1232  * Generates unrecoverable error label for TL>0 handlers.
1233  * At label (Unrecoverable error routine)
1234  *   1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
1235  *	argument to cpu_tl1_err_panic).
1236  *   2.	Call cpu_tl1_err_panic via systrap at PIL 15
1237  */
#define	CH_ERR_TL1_PANIC_EXIT(label)		\
label:	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
	or	%g2, CH_ERR_TL | CH_ERR_PANIC, %g2;	/* flag the panic */ \
	stxa	%g2, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	set	cpu_tl1_err_panic, %g1;	/* handler for sys_trap */	\
	ba	sys_trap;			\
	  mov	PIL_15, %g4	/* delay: run handler at PIL 15 */
1245 
1246 
1247 
1248 /* END CSTYLED */
1249 #endif	/* _ASM */
1250 
1251 #ifdef	__cplusplus
1252 }
1253 #endif
1254 
1255 #endif /* _CHEETAHASM_H */
1256