/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	_CHEETAHASM_H
#define	_CHEETAHASM_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#ifdef	__cplusplus
extern "C" {
#endif

#ifdef _ASM
/* BEGIN CSTYLED */

#define	ASM_LD(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ld	[reg + %lo(symbol)], reg;				\

#define	ASM_LDX(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	ldx	[reg + %lo(symbol)], reg;				\

#define	ASM_JMP(reg, symbol)						\
	sethi	%hi(symbol), reg;					\
	jmp	reg + %lo(symbol);					\
	nop
/*
 * Macro for computing a pointer at an offset from the 'cpu_private'
 * ptr.  The 'cpu_private' ptr is in the machcpu structure.
 *  off_reg:  Register offset from 'cpu_private' ptr.
 *  scr1:    Scratch, ptr is returned in this register.
 *  scr2:    Scratch
 *  label:   Label to branch to if cpu_private ptr is null/zero.
 */
#define	GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label)			\
	CPU_ADDR(scr1, scr2);						\
	ldn	[scr1 + CPU_PRIVATE], scr1;				\
	cmp	scr1, 0;						\
	be	label;							\
	  nop;								\
	add	scr1, off_reg, scr1
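
/*
 * Illustrative use (a sketch only, not taken from any handler): fetch a
 * pointer to this CPU's TL>0 error data, bailing out to a local label
 * when cpu_private has not been set up yet:
 *
 *	GET_CPU_PRIVATE_PTR(CHPR_TL1_ERR_DATA, %g1, %g2, 1f)
 */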

/*
 * Macro version of get_dcache_dtag.  We use this macro in the
 * CPU logout code. Since the Dcache is virtually indexed, only
 * bits [12:5] of the AFAR can be used so we need to search through
 * 8 indexes (4 ways + bit 13) in order to find the tag we want.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Dcache index, loops through 4 ways.
 */
#define	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	set	CH_DCACHE_IDX_MASK, scr3;				\
	and	afar, scr3, scr3;					\
	srlx	afar, CH_DCTAG_PA_SHIFT, scr2;				\
	b	1f;							\
	  or	scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */	\
	.align	128;							\
1:									\
	ldxa	[scr3]ASI_DC_TAG, scr1;		/* read tag */		\
	cmp	scr1, scr2;						\
	bne	4f;				/* not found? */	\
	  nop;								\
	stxa	scr3, [datap + CH_DC_IDX]%asi;	/* store index */	\
	stxa	scr1, [datap + CH_DC_TAG]%asi;	/* store tag */		\
	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
	ldxa	[scr3]ASI_DC_UTAG, scr1;	/* read utag */		\
	membar	#Sync;			/* Cheetah PRM 10.6.3 */	\
	stxa	scr1, [datap + CH_DC_UTAG]%asi;				\
	ldxa	[scr3]ASI_DC_SNP_TAG, scr1;	/* read snoop tag */	\
	stxa	scr1, [datap + CH_DC_SNTAG]%asi;			\
	add	datap, CH_DC_DATA, datap;				\
	clr	scr2;							\
2:									\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read data */		\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	stxa	scr1, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
	blt	2b;							\
	  add	scr2, 8, scr2;						\
									\
	GET_CPU_IMPL(scr2);	/* Parity bits are elsewhere for */	\
	cmp	scr2, PANTHER_IMPL;	/* panther processors. */	\
	bne,a	5f;			/* Done if not panther. */	\
	  add	datap, 8, datap; /* Skip to the end of the struct. */	\
	clr	scr2;							\
	add	datap, 7, datap; /* offset of the last parity byte */	\
	mov	1, scr1;						\
	sll	scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1;		\
	or	scr3, scr1, scr3; /* add DC_data_parity bit to index */	\
3:									\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read parity bits */	\
	membar	#Sync;			/* Cheetah PRM 10.6.1 */	\
	stba	scr1, [datap]%asi;					\
	dec	datap;							\
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8;				\
	blt	3b;							\
	  add	scr2, 8, scr2;						\
	b	5f;							\
	  add	datap, 5, datap; /* set pointer to end of our struct */	\
4:									\
	set	CH_DCACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
	add	scr3, scr1, scr3;					\
	set	CH_DCACHE_IDX_LIMIT, scr1;	/* done? */		\
	cmp	scr3, scr1;						\
	blt	1b;							\
	  nop;								\
	add	datap, CH_DC_DATA_SIZE, datap;				\
5:
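
/*
 * The search above, restated as a C-like sketch (illustrative only;
 * read_dc_tag() is a hypothetical stand-in for the ldxa from ASI_DC_TAG).
 * Only AFAR bits [12:5] index the virtually indexed D$, so each of the
 * 4 ways is probed for both values of index bit 13:
 *
 *	idx = afar & CH_DCACHE_IDX_MASK;
 *	tag = (afar >> CH_DCTAG_PA_SHIFT) | CH_DCTAG_VALID_BIT;
 *	do {
 *		if (read_dc_tag(idx) == tag) {
 *			... capture tag, utag, snoop tag and data ...
 *			break;
 *		}
 *		idx += CH_DCACHE_IDX_INCR;
 *	} while (idx < CH_DCACHE_IDX_LIMIT);
 */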

/*
 * Macro version of get_icache_dtag.  We use this macro in the CPU
 * logout code. If the Icache is on, we don't want to capture the data.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Icache index, loops through 4 ways.
 * Note: For Panther, the Icache is virtually indexed and increases in
 * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
 * of 32). This means the IC_addr index bits[14:7] for Panther now
 * correspond to VA bits[13:6]. But since it is virtually indexed, we
 * still mask out only bits[12:5] from the AFAR (we have to manually
 * check bit 13). In order to make this code work for all processors,
 * we end up checking twice as many indexes (8 instead of 4) as required
 * for non-Panther CPUs and saving off twice as much data (16 instructions
 * instead of just 8).
 */
#define	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	ldxa	[%g0]ASI_DCU, scr1;					\
	btst	DCU_IC, scr1;		/* is Icache enabled? */	\
	bne,a	6f;			/* yes, don't capture */	\
	  add	datap, CH_IC_DATA_SIZE, datap;	/* annul if no branch */ \
	GET_CPU_IMPL(scr2);	/* Panther only uses VA[13:6] */	\
	cmp	scr2, PANTHER_IMPL;	/* and we also want to mask */	\
	be	1f;			/* out bit 13 since the */	\
	  nop;				/* Panther I$ is VIPT. */	\
	set	CH_ICACHE_IDX_MASK, scr3;				\
	b	2f;							\
	  nop;								\
1:									\
	set	PN_ICACHE_VA_IDX_MASK, scr3;				\
2:									\
	and	afar, scr3, scr3;					\
	sllx	scr3, CH_ICACHE_IDX_SHIFT, scr3;			\
	srlx	afar, CH_ICPATAG_SHIFT, scr2;	/* pa tag we want */	\
	andn	scr2, CH_ICPATAG_LBITS, scr2;	/* mask off lower */	\
	b	3f;							\
	  nop;								\
	.align	128;							\
3:									\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read pa tag */	\
	andn	scr1, CH_ICPATAG_LBITS, scr1;	/* mask off lower */	\
	cmp	scr1, scr2;						\
	bne	5f;				/* not found? */	\
	  nop;								\
	stxa	scr3, [datap + CH_IC_IDX]%asi;	/* store index */	\
	stxa	scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */	\
	add	scr3, CH_ICTAG_UTAG, scr3;	/* read utag */		\
	ldxa	[scr3]ASI_IC_TAG, scr1;					\
	add	scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3;		\
	stxa	scr1, [datap + CH_IC_UTAG]%asi;				\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read upper tag */	\
	add	scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3;		\
	stxa	scr1, [datap + CH_IC_UPPER]%asi;			\
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read lower tag */	\
	andn	scr3, CH_ICTAG_TMASK, scr3;				\
	stxa	scr1, [datap + CH_IC_LOWER]%asi;			\
	ldxa	[scr3]ASI_IC_SNP_TAG, scr1;	/* read snoop tag */	\
	stxa	scr1, [datap + CH_IC_SNTAG]%asi;			\
	add	datap, CH_IC_DATA, datap;				\
	clr	scr2;							\
4:									\
	ldxa	[scr3 + scr2]ASI_IC_DATA, scr1;	/* read ins. data */	\
	stxa	scr1, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr2, PN_IC_DATA_REG_SIZE - 8;				\
	blt	4b;							\
	  add	scr2, 8, scr2;						\
	b	6f;							\
	  nop;								\
5:									\
	set	CH_ICACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */	\
	add	scr3, scr1, scr3;					\
	set	PN_ICACHE_IDX_LIMIT, scr1;	/* done? */		\
	cmp	scr3, scr1;						\
	blt	3b;							\
	  nop;								\
	add	datap, CH_IC_DATA_SIZE, datap;				\
6:

#if defined(JALAPENO) || defined(SERRANO)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   ec_way:	Constant value (way number)
 *   scr1:      Scratch
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3)		\
	mov	ec_way, scr1;						\
	and	scr1, JP_ECACHE_NWAY - 1, scr1;	/* mask E$ way bits */	\
	sllx	scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1;			\
	set	((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2;	\
	and	afar, scr2, scr3;		/* get set offset */	\
	andn	scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3; /* VA<5:0>=0 */	\
	or	scr3, scr1, scr3;		/* or WAY bits */	\
	b	1f;							\
	  stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\

/*
 * Jalapeno does not have cores so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

#if defined(JALAPENO)
/*
 * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync

/*
 * Jalapeno has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#elif defined(SERRANO)
/*
 * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.  For Serrano, we also save the
 * AFAR2 register.
 *	datap:	pointer to cpu logout structure.
 *	afar:	returned primary AFAR value.
 *	scr1:	scratch
 *	scr2:	scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	set	ASI_MCU_AFAR2_VA, scr1;					\
	ldxa	[scr1]ASI_MCU_CTRL, afar;				\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi;	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync

/*
 * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
 *      afar:   scratch, holds afar2.
 *      datap:  pointer to cpu logout structure
 *      scr1:   scratch
 *      scr2:   scratch
 *      scr3:   scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)			\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar;	\
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;		\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CPU_LOGOUT_SIZE, datap
#endif /* SERRANO */

#elif defined(CHEETAH_PLUS)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   pn_way:	ecache way for panther (value = 0-3). For non-panther
 *		cpus, this macro will be called with pn_way = 0.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3)		\
	mov	afar, scr3;						\
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
	mov	pn_way, scr1;	/* panther L3$ is 4-way so we ...    */	\
	sllx	scr1, PN_L3_WAY_SHIFT, scr1;	/* need to mask...   */	\
	or	scr3, scr1, scr3;	/* in the way bits <24:23>.  */	\
	b	1f;							\
	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	stxa     scr1, [datap + CH_EC_TAG]%asi;				\
	set	CHP_ECACHE_IDX_TAG_ECC, scr1;				\
	or	scr3, scr1, scr1;					\
	ldxa    [scr1]ASI_EC_DIAG, scr1;	/* get E$ tag ECC */	\
	stxa	scr1, [datap + CH_EC_TAG_ECC]%asi;			\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * If this is a panther, we need to make sure the sibling core is
 * parked so that we avoid any race conditions during diagnostic
 * accesses to the shared L2 and L3 caches.
 * dcucr_reg:	This register will be used to keep track of whether
 *		or not we need to unpark the core later.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;	/* only park for panthers */	\
	bne,a	%xcc, 2f;						\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_RUNNING_STATUS, scr1;	/* check other core */	\
	ldxa	[scr1]ASI_CMP_SHARED, scr2;	/* is it running?   */	\
	cmp	scr2, PN_BOTH_CORES_RUNNING;				\
	bne,a	%xcc, 2f;	/* if not running, we are done */	\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	or	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_ID, scr1;					\
	ldxa	[scr1]ASI_CMP_PER_CORE, scr2;				\
	and	scr2, COREID_MASK, scr2;				\
	or	%g0, 1, scr1;		/* find out which core... */	\
	sll	scr1, scr2, scr2;	/* ... we need to park... */	\
1:									\
	set	ASI_CORE_RUNNING_RW, scr1;				\
	ldxa    [scr1]ASI_CMP_SHARED, scr1;	/* ...but are we? */	\
	btst    scr1, scr2;        /* check our own parked status */	\
	bz      %xcc, 1b;        /* if we are then go round again */	\
	nop;								\
	set	ASI_CORE_RUNNING_RW, scr1;	/* else proceed... */	\
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ... and park it. */	\
	membar	#Sync;							\
	set	ASI_CORE_RUNNING_STATUS, scr1;	/* spin until... */	\
	ldxa	[scr1]ASI_CMP_SHARED, scr1;	/* ... the other...  */	\
	cmp	scr1, scr2;	/* ...core is parked according to... */	\
	bne,a	%xcc, 1b;	/* ...the core running status reg.  */	\
	  nop;								\
2:
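
/*
 * The parking handshake above, restated as C-like pseudo-code (a sketch
 * only; the variable names are descriptive stand-ins for the registers):
 *
 *	if (impl == PANTHER_IMPL &&
 *	    core_running_status == PN_BOTH_CORES_RUNNING) {
 *		dcucr_reg |= PN_PARKED_OTHER_CORE;
 *		my_mask = 1 << my_coreid;
 *		while ((core_running_rw & my_mask) == 0)
 *			;			(spin while we ourselves
 *						 are marked parked)
 *		core_running_rw = my_mask;	(park the sibling)
 *		while (core_running_status != my_mask)
 *			;			(wait for the park to
 *						 take effect)
 *	} else {
 *		dcucr_reg &= ~PN_PARKED_OTHER_CORE;
 *	}
 */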

/*
 * The core running this code will unpark its sibling core if the
 * sibling core had been parked by the current core earlier in this
 * trap handler.
 * dcucr_reg:	This register is used to keep track of whether or not
 *		we need to unpark our sibling core.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
 */
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)			\
	btst	PN_PARKED_OTHER_CORE, dcucr_reg;			\
	bz,pt	%xcc, 1f;	/* if nothing to unpark, we are done */	\
	  andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg;		\
	set	ASI_CORE_RUNNING_RW, scr1;				\
	set	PN_BOTH_CORES_RUNNING, scr2;	/* we want both...   */	\
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ...cores running. */	\
	membar	#Sync;							\
1:

/*
 * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
 * in the primary AFSR are cleared except the fatal error bits.  For Panther,
 * we also have to read and clear the AFSR_EXT, again leaving the fatal
 * error bits alone.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)				\
	set	ASI_SHADOW_REG_VA, scr1;				\
	ldxa	[scr1]ASI_AFAR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi;	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync;							\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;					\
	bne	%xcc, 1f;						\
	   nop;								\
	set	ASI_SHADOW_AFSR_EXT_VA, scr1;	/* shadow AFSR_EXT */	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
	set	ASI_AFSR_EXT_VA, scr1;		/* primary AFSR_EXT */	\
	ldxa	[scr1]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi;	\
	set	C_AFSR_EXT_FATAL_ERRS, scr1;				\
	bclr	scr1, scr2;	/* Clear fatal error bits here, */	\
	set	ASI_AFSR_EXT_VA, scr1;	/* so they're left */		\
	stxa	scr2, [scr1]ASI_AFSR;	/* as is in AFSR_EXT */		\
	membar	#Sync;							\
1:

/*
 * This macro is used in the CPU logout code to capture diagnostic
 * information from the L2 cache on panther processors.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3)		\
	mov	afar, scr3;						\
	set	PN_L2_INDEX_MASK, scr1;					\
	and	scr3, scr1, scr3;					\
	b	1f;	/* code to read tags and data should be ...  */	\
	   nop;		/* ...on the same cache line if possible.    */	\
	.align	128;	/* update this line if you add lines below. */	\
1:									\
	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store L2$ index  */	\
	ldxa	[scr3]ASI_L2_TAG, scr1;		/* read the L2$ tag */	\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
	clr	scr1;							\
2:									\
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
	stxa	scr2, [datap]%asi;		/* <511:256> of L2  */	\
	add	datap, 8, datap;		/* data and record  */	\
	cmp	scr1, (PN_L2_LINESIZE / 2) - 8;	/* it in the cpu    */	\
	bne	2b;				/* logout struct.   */	\
	  add	scr1, 8, scr1;						\
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
	stxa	scr2, [datap]%asi;		/* ecc of <511:256> */	\
	add	datap, 8, datap;					\
3:									\
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through     */	\
	stxa	scr2, [datap]%asi;		/* <255:0> of L2    */	\
	add	datap, 8, datap;		/* data and record  */	\
	cmp	scr1, PN_L2_LINESIZE - 8;	/* it in the cpu    */	\
	bne	3b;				/* logout struct.   */	\
	  add	scr1, 8, scr1;						\
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit.     */	\
	add	scr2, PN_L2_ECC_LO_REG, scr2;				\
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record  */	\
	stxa	scr2, [datap]%asi;		/* ecc of <255:0>.  */	\
	add	datap, 8, datap;		/* Advance pointer  */	\
	set	PN_L2_SET_SIZE, scr2;					\
	set	PN_L2_MAX_SET, scr1;					\
	cmp	scr1, scr3;	/* more ways to try for this line? */	\
	bg,a	%xcc, 1b;	/* if so, start over with next way */	\
	  add	scr3, scr2, scr3

/*
 * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_CPU_IMPL(scr1);						\
	cmp	scr1, PANTHER_IMPL;					\
	bne	%xcc, 4f;						\
	  nop;								\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3);		\
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3);		\
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap;	\
	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	b	5f;							\
	  nop;								\
4:									\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_WAY_BIT(scr1, scr2);					\
	xor	afar, scr1, afar;					\
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3);		\
	GET_ECACHE_WAY_BIT(scr1, scr2);		/* restore AFAR */	\
	xor	afar, scr1, afar;					\
	add	datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\
5:

/*
 * Cheetah+ needs to capture E$, D$ and I$ lines associated with
 * shadow afar.
 *	afar:	scratch, holds shadow afar.
 *	datap:	pointer to cpu logout structure
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)			\
	ldxa	[datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar;	\
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap;		\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CPU_LOGOUT_SIZE, datap

/*
 * Compute the "Way" bit for 2-way Ecache for Cheetah+.
 */
#define	GET_ECACHE_WAY_BIT(scr1, scr2)					\
	CPU_INDEX(scr1, scr2);						\
	mulx	scr1, CPU_NODE_SIZE, scr1;				\
	add	scr1, ECACHE_SIZE, scr1;				\
	set	cpunodes, scr2;						\
	ld	[scr1 + scr2], scr1;					\
	srlx	scr1, 1, scr1
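
/*
 * In effect (a sketch; the field naming is illustrative): for a 2-way
 * E$ the per-way stride is half the total size, so the way bit is
 *
 *	way_bit = cpunodes[cpu_index].ecache_size / 2;
 *
 * and GET_ECACHE_DTAGS above XORs AFAR with it to reach the same set
 * in the other way.
 */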

#else /* CHEETAH_PLUS */
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   scr1:      Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3)			\
	mov	afar, scr3;						\
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */\
	set	(CH_ECACHE_8M_SIZE - 1), scr2;				\
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */	\
	b	1f;							\
	   stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */	\
	.align	64;							\
1:									\
	ldxa    [scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */	\
	stxa	scr1, [datap + CH_EC_TAG]%asi;				\
	add	datap, CH_EC_DATA, datap;				\
2:									\
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */	\
	clr	scr1;							\
3:						/* loop thru 5 regs */	\
	ldxa	[scr1]ASI_EC_DATA, scr2;				\
	stxa	scr2, [datap]%asi;					\
	add	datap, 8, datap;					\
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8;			\
	bne	3b;							\
	   add	scr1, 8, scr1;						\
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */		\
	beq	2b;							\
	   add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * Cheetah does not have cores so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

/*
 * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
 * fatal error bits.
 *	datap:		pointer to cpu logout structure.
 *	afar:		returned primary AFAR value.
 *	scr1:		scratch
 *	scr2:		scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2)	\
	ldxa	[%g0]ASI_AFAR, afar;					\
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi;	\
	ldxa	[%g0]ASI_AFSR, scr2;					\
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi;	\
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1;				\
	sllx	scr1, 32, scr1;						\
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */	\
	stxa	scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */	\
	membar	#Sync

/*
 * Cheetah E$ is direct-mapped, so we grab line data and skip second line.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3)			\
	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	add	datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap;	\
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap;		\

/*
 * Cheetah has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#endif	/* CHEETAH_PLUS */

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data at TL>0. r_val is a register that returns the "failure count"
 * to the caller, and may be used as a scratch register until the end of
 * the macro.  afar is used to return the primary AFAR value to the caller
 * and it too can be used as a scratch register until the end. r_or_s is
 * a reg or symbol that has the offset within the "cpu_private" data area
 * to deposit the logout data.  t_flags is a register that has the
 * trap-type/trap-level/CEEN info. This t_flags register may be used after
 * the GET_AFSR_AFAR macro.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used. Otherwise, the CPU
 * logout operation will succeed (r_val = 0). For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy).
 *
 *   Register usage:
 *	%asi:   Must be set to either ASI_MEM if the address in datap
 *		is a physical address or to ASI_N if the address in
 *		datap is a virtual address.
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. the number
 *		of times we have tried to use this logout structure when it
 *		was already being used).
 *	afar:	output: contains AFAR on exit
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	datap:	Points to log out data area.
 *	scr1:	Scratch
 *	scr2:	Scratch (may be r_val)
 *	scr3:   Scratch (may be t_flags)
 */
#define	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
	setx	LOGOUT_INVALID, scr2, scr1;				\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2;	\
	cmp	scr2, scr1;						\
	bne	8f;							\
	  nop;								\
	stxa	t_flags, [datap + CH_CLO_FLAGS]%asi;			\
	GET_AFSR_AFAR(datap, afar, scr1, scr2);				\
	add	datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap;		\
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3);		\
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3);			\
	sub	datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap;		\
	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3);			\
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar;	\
	set	0, r_val;	/* return value for success */		\
	ba	9f;							\
	  nop;								\
8:									\
	ldxa	[%g0]ASI_AFAR, afar;					\
	ldxa	[datap + CH_CLO_NEST_CNT]%asi, r_val;			\
	inc	r_val;		/* return value for failure */		\
	stxa	r_val, [datap + CH_CLO_NEST_CNT]%asi;			\
	membar	#Sync;							\
9:
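
/*
 * Caller-side view of the result (a sketch, not actual kernel code):
 *
 *	if (r_val == 0)
 *		... logout data was captured and may be processed ...
 *	else
 *		... the structure was busy; r_val is the nesting count
 *		    and only the raw AFAR left in afar is valid ...
 */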

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
 * up the expected data pointer in the scr1 register and sets the %asi
 * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
 * is used at TL>0.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used. Otherwise, the CPU
 * logout operation will succeed (r_val = 0). For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy).
 *
 *   Register usage:
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. the number
 *		of times we have tried to use this logout structure when it
 *		was already being used).
 *	afar:	returns AFAR, used internally as afar value.
 *		output: if the cpu_private struct has not been initialized,
 *		        then we return the t_flags value listed below.
 *	r_or_s:	input offset, either register or constant (symbol).  It's
 *		OK for r_or_s to be a register as long as it's not scr1 or
 *		scr3.
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	scr1:	Scratch, points to log out data area.
 *	scr2:	Scratch (may be r_or_s)
 *	scr3:	Scratch (may be r_val)
 *	scr4:   Scratch (may be t_flags)
 */
#define	DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f); /* can't use scr2/4 */ \
	wr	%g0, ASI_N, %asi;					\
	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4)	\
	ba	6f;							\
	  nop;								\
7:									\
	mov	t_flags, afar;		/* depends on afar = %g2  */	\
	set	0, r_val;		/* success in this case.  */	\
6:

/*
 * The P$ is flushed as a side effect of writing to the Primary
 * or Secondary Context Register. After writing to a context
 * register, every line of the P$ in the Valid state is invalidated,
 * regardless of which context it belongs to.
 * This routine simply touches the Primary context register by
 * reading the current value and writing it back. The Primary
 * context is not changed.
 */
#define	PCACHE_FLUSHALL(tmp1, tmp2, tmp3)				\
	sethi	%hi(FLUSH_ADDR), tmp1					;\
	set	MMU_PCONTEXT, tmp2					;\
	ldxa	[tmp2]ASI_DMMU, tmp3					;\
	stxa	tmp3, [tmp2]ASI_DMMU					;\
	flush	tmp1	/* See Cheetah PRM 8.10.2 */

/*
 * Macro that flushes the entire Dcache.
 *
 * arg1 = dcache size
 * arg2 = dcache linesize
 */
#define	CH_DCACHE_FLUSHALL(arg1, arg2, tmp1)				\
	sub	arg1, arg2, tmp1;					\
1:									\
	stxa	%g0, [tmp1]ASI_DC_TAG;					\
	membar	#Sync;							\
	cmp	%g0, tmp1;						\
	bne,pt	%icc, 1b;						\
	  sub	tmp1, arg2, tmp1;

/*
 * Macro that flushes the entire Icache.
 *
 * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
 * because accesses to ASI 0x67 interfere with Icache coherency.  We
 * must make sure the Icache is off, then turn it back on after the entire
 * cache has been invalidated.  If the Icache is originally off, we'll just
 * clear the tags but not turn the Icache on.
 *
 * arg1 = icache size
 * arg2 = icache linesize
 */
#define	CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)			\
	ldxa	[%g0]ASI_DCU, tmp2;					\
	andn	tmp2, DCU_IC, tmp1;					\
	stxa	tmp1, [%g0]ASI_DCU;					\
	flush	%g0;	/* flush required after changing the IC bit */	\
	sllx	arg2, 1, arg2;		/* arg2 = linesize * 2 */	\
	sllx	arg1, 1, arg1;		/* arg1 = size * 2 */		\
	sub	arg1, arg2, arg1;					\
	or	arg1, CH_ICTAG_LOWER, arg1;	/* "write" tag */	\
1:									\
	stxa	%g0, [arg1]ASI_IC_TAG;					\
	membar	#Sync;				/* Cheetah PRM 8.9.3 */	\
	cmp	arg1, CH_ICTAG_LOWER;					\
	bne,pt	%icc, 1b;						\
	  sub	arg1, arg2, arg1;					\
	stxa	tmp2, [%g0]ASI_DCU;					\
	flush	%g0;	/* flush required after changing the IC bit */


#if defined(JALAPENO) || defined(SERRANO)

/*
 * ASI access to the L2 tag or L2 flush can hang the cpu when interacting
 * with combinations of L2 snoops, victims and stores.
 *
 * A possible workaround is to surround each L2 ASI access with membars
 * and make sure that the code is hitting in the Icache.  This requires
 * aligning code sequence at E$ boundary and forcing I$ fetch by
 * jumping to selected offsets so that we don't take any I$ misses
 * during ASI access to the L2 tag or L2 flush.  This also requires
 * making sure that we don't take any interrupts or traps (such as
 * fast ECC trap, I$/D$ tag parity error) which can result in eviction
 * of this code sequence from I$, thus causing a miss.
 *
 * Because of the complexity/risk, we have decided to do a partial fix
 * of adding membar around each ASI access to the L2 tag or L2 flush.
 */

#define	JP_EC_DIAG_ACCESS_MEMBAR	\
	membar	#Sync

/*
 * Jalapeno version of macro that flushes the entire Ecache.
 *
 * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize - not modified; can be an immediate constant.
 */
#define	ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2)	\
	CPU_INDEX(tmp1, tmp2);						\
	set	JP_ECACHE_IDX_DISP_FLUSH, tmp2;				\
	sllx	tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1;			\
	or	tmp1, tmp2, tmp1;					\
	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2;			\
1:									\
	subcc	tmp2, arg2, tmp2;					\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	ldxa	[tmp1 + tmp2]ASI_EC_DIAG, %g0;				\
	JP_EC_DIAG_ACCESS_MEMBAR;					\
	bg,pt	%xcc, 1b;						\
	  nop;								\
	mov	1, tmp2;						\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	add	tmp1, tmp2, tmp1;					\
	mov	(JP_ECACHE_NWAY-1), tmp2;				\
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2;			\
	andcc	tmp1, tmp2, tmp2;					\
	bnz,pt	%xcc, 1b;						\
	  srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2

#else	/* JALAPENO || SERRANO */

/*
 * Cheetah version of macro that flushes the entire Ecache.
 *
 *  Need to displacement flush 2x ecache size from Ecache flush area.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define	CH_ECACHE_FLUSHALL(arg1, arg2, arg3)				\
	sllx	arg1, 1, arg1;						\
1:									\
	subcc	arg1, arg2, arg1;					\
	bg,pt	%xcc, 1b;						\
	  ldxa	[arg1 + arg3]ASI_MEM, %g0;
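
/*
 * Equivalent C-like sketch of the displacement flush above (illustrative
 * only; load_phys() is a hypothetical stand-in for the ldxa from ASI_MEM):
 *
 *	for (off = 2 * ec_size - ec_linesize; off >= 0; off -= ec_linesize)
 *		(void) load_phys(ecache_flushaddr + off);
 */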

/*
 * Cheetah+ version of macro that flushes the entire Ecache.
 *
 * Uses the displacement flush feature.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * impl = CPU implementation as returned from GET_CPU_IMPL()
 *        The value in this register is destroyed during execution
 *        of the macro.
 */
#if defined(CHEETAH_PLUS)
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)				\
	cmp	impl, PANTHER_IMPL;					\
	bne	%xcc, 1f;						\
	  nop;								\
	set	PN_L3_IDX_DISP_FLUSH, impl;				\
	b	2f;							\
	  nop;								\
1:									\
	set	CHP_ECACHE_IDX_DISP_FLUSH, impl;			\
2:									\
	subcc	arg1, arg2, arg1;					\
	bg,pt	%xcc, 2b;						\
	  ldxa	[arg1 + impl]ASI_EC_DIAG, %g0;
#else	/* CHEETAH_PLUS */
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)
#endif	/* CHEETAH_PLUS */

/*
 * Macro that flushes the entire Ecache.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define	ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1)				\
	GET_CPU_IMPL(tmp1);						\
	cmp	tmp1, CHEETAH_IMPL;					\
	bne	%xcc, 2f;						\
	  nop;								\
	CH_ECACHE_FLUSHALL(arg1, arg2, arg3);				\
	ba	3f;							\
	  nop;								\
2:									\
	CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1);				\
3:

#endif	/* JALAPENO || SERRANO */

/*
 * Macro that flushes the Panther L2 cache.
 */
#if defined(CHEETAH_PLUS)
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)				\
	GET_CPU_IMPL(scr3);						\
	cmp	scr3, PANTHER_IMPL;					\
	bne	%xcc, 2f;						\
	  nop;								\
	set	PN_L2_SIZE, scr1;					\
	set	PN_L2_LINESIZE, scr2;					\
	set	PN_L2_IDX_DISP_FLUSH, scr3;				\
1:									\
	subcc	scr1, scr2, scr1;					\
	bg,pt	%xcc, 1b;						\
	  ldxa	[scr1 + scr3]ASI_L2_TAG, %g0;				\
2:
#else	/* CHEETAH_PLUS */
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)
#endif	/* CHEETAH_PLUS */
/*
 * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
 * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB. Aside from the 16 entry fully associative
 * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
 *
 * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
 * mask out all but the lower 8 bits because:
 *
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for   8K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for  64K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for   4M
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for  32M
 *    ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
 *
 * and
 *
 *    array index for   8K pages = VA[20:13]
 *    array index for  64K pages = VA[23:16]
 *    array index for 512K pages = VA[26:19]
 *    array index for   4M pages = VA[29:22]
 *    array index for  32M pages = VA[32:25]
 *    array index for 256M pages = VA[35:28]
 *
 * Inputs:
 *
 *    va	- Register.
 *		  Input: Virtual address in which we are interested.
 *		  Output: TLB index value.
 *    pg_sz	- Register. Page Size of the TLB in question as encoded
 *		  in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
 */
#if defined(CHEETAH_PLUS)
#define	PN_GET_TLB_INDEX(va, pg_sz)					\
	srlx	va, 13, va;	/* first shift right by 13 and then */	\
	srlx	va, pg_sz, va;	/* shift by pg_sz three times. */	\
	srlx	va, pg_sz, va;						\
	srlx	va, pg_sz, va;						\
	and	va, 0xff, va;	/* mask out all but the lower 8 bits */
#endif	/* CHEETAH_PLUS */
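
/*
 * The same computation as a C sketch (illustrative only):
 *
 *	static inline uint64_t
 *	pn_tlb_index(uint64_t va, uint_t pg_sz)
 *	{
 *		return ((va >> (13 + 3 * pg_sz)) & 0xff);
 *	}
 *
 * For example, a 4M page (pg_sz = 3) yields VA[29:22].
 */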

/*
 * The following macros are for error traps at TL>0.
 * The issue with error traps at TL>0 is that there are no safely
 * available global registers.  So we use the trick of generating a
 * software trap, then using the %tpc, %tnpc and %tstate registers to
 * temporarily save the values of %g1 and %g2.
 */

/*
 * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
 * Does the following steps:
 *	1. membar #Sync - required for USIII family errors.
 *	2. Specified software trap.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_TRAPENTRY(trapno)		\
	membar	#Sync;				\
	ta	trapno;				\
	nop; nop; nop; nop; nop; nop

/*
 * Macro to generate 8-instruction trap table entry for TL>0 software trap.
 * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
 * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
 * we need to put the low-order two bits of %g1 and %g2 in %tstate).
 * Note that %tstate has a reserved hole from bits 3-7, so we put the
 * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
 * state bits).  Note that we must do a jmp instruction, since this
 * is moved into the trap table entry.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_SWTRAPENTRY(label)		\
	wrpr	%g1, %tpc;			\
	and	%g1, 3, %g1;			\
	wrpr	%g2, %tnpc;			\
	sllx	%g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
	or	%g1, %g2, %g2;			\
	sethi	%hi(label), %g1;		\
	jmp	%g1+%lo(label);			\
	  wrpr	%g2, %tstate
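
/*
 * The resulting %tstate packing (a sketch of the layout; see
 * CH_ERR_TL1_ENTER below, which reverses it):
 *
 *	tstate[1:0]   = %g1[1:0]  (bits lost by "wrpr %g1, %tpc")
 *	tstate[11:10] = %g2[1:0]  (bits lost by "wrpr %g2, %tnpc")
 *	tstate[9:8]   = 0         (reserved for the D$/I$ state bits)
 */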

/*
 * Macro to get ptr to ch_err_tl1_data.
 * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
 * will point to a kernel nucleus virtual address with ASI_N in %asi.
 * This allows us to:
 *   1. Avoid getting MMU misses.  We may have gotten the original
 *	Fast ECC error in an MMU handler and if we get an MMU trap
 *	in the TL>0 handlers, we'll scribble on the MMU regs.
 *   2. Allows us to use the same code in the TL>0 handlers whether
 *	we're accessing kernel nucleus virtual addresses or physical
 *	addresses.
 * pseudo-code:
 *	reg1 <- ch_err_tl1_paddrs[CPUID];
 *	if (reg1 == NULL) {
 *		reg1 <- &ch_err_tl1_data
 *		%asi <- ASI_N
 *	} else {
 *		reg1 <- reg1 + offset +
 *		    sizeof (ch_err_tl1_data) * (%tl - 3)
 *		%asi <- ASI_MEM
 *	}
 */
#define	GET_CH_ERR_TL1_PTR(reg1, reg2, offset)	\
	CPU_INDEX(reg1, reg2);			\
	sllx	reg1, 3, reg1;			\
	set	ch_err_tl1_paddrs, reg2;	\
	ldx	[reg1+reg2], reg1;		\
	brnz	reg1, 1f;			\
	add	reg1, offset, reg1;		\
	set	ch_err_tl1_data, reg1;		\
	ba	2f;				\
	wr	%g0, ASI_N, %asi;		\
1:	rdpr	%tl, reg2;			\
	sub	reg2, 3, reg2;			\
	mulx	reg2, CH_ERR_TL1_DATA_SIZE, reg2;	\
	add	reg1, reg2, reg1;		\
	wr	%g0, ASI_MEM, %asi;		\
2:

/*
 * Macro to generate entry code for TL>0 error handlers.
 * At the end of this macro, %g1 will point to the ch_err_tl1_data
 * structure and %g2 will have the original flags in the ch_err_tl1_data
 * structure and %g5 will have the value of %tstate where the Fast ECC
 * routines will save the state of the D$ in Bit2 CH_ERR_TSTATE_DC_ON.
 * All %g registers except for %g1, %g2 and %g5 will be available after
 * this macro.
 * Does the following steps:
 *   1. Compute physical address of per-cpu/per-tl save area using
 *	only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
 *	leaving address in %g1 and updating the %asi register.
 *	If there is no data area available, we branch to label.
 *   2. Save %g3-%g7 in save area.
 *   3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
 *	original %g1+%g2 values (because we're going to change %tl).
 *   4. set %tl <- %tl - 1.  We do this ASAP to make window of
 *	running at %tl+1 as small as possible.
 *   5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
 *	%tstate (%g5) and save in save area, carefully preserving %g5
 *	because it has the CH_ERR_TSTATE_DC_ON value.
 *   6. Load existing ch_err_tl1_data flags in %g2
 *   7. Compute the new flags
 *   8. If %g2 is non-zero (the structure was busy), shift the new
 *	flags by CH_ERR_ME_SHIFT and or them with the old flags.
 *   9. Store the updated flags into ch_err_tl1_data flags.
 *   10. If %g2 is non-zero, read the %tpc and store it in
 *	ch_err_tl1_data.
 */
#define	CH_ERR_TL1_ENTER(flags)			\
	GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA);	\
	stxa	%g3, [%g1 + CH_ERR_TL1_G3]%asi;	\
	stxa	%g4, [%g1 + CH_ERR_TL1_G4]%asi;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_G5]%asi;	\
	stxa	%g6, [%g1 + CH_ERR_TL1_G6]%asi;	\
	stxa	%g7, [%g1 + CH_ERR_TL1_G7]%asi;	\
	rdpr	%tpc, %g3;			\
	rdpr	%tnpc, %g4;			\
	rdpr	%tstate, %g5;			\
	rdpr	%tl, %g6;			\
	sub	%g6, 1, %g6;			\
	wrpr	%g6, %tl;			\
	and	%g5, 3, %g6;			\
	andn	%g3, 3, %g3;			\
	or	%g3, %g6, %g3;			\
	stxa	%g3, [%g1 + CH_ERR_TL1_G1]%asi;	\
	srlx	%g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6;	\
	and	%g6, 3, %g6;			\
	andn	%g4, 3, %g4;			\
	or	%g6, %g4, %g4;			\
	stxa	%g4, [%g1 + CH_ERR_TL1_G2]%asi;	\
	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
	set	flags | CH_ERR_TL, %g3;		\
	brz	%g2, 9f;			\
	sllx	%g3, CH_ERR_ME_SHIFT, %g4;	\
	or	%g2, %g4, %g3;			\
9:	stxa	%g3, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	brnz	%g2, 8f;			\
	rdpr	%tpc, %g4;			\
	stxa	%g4, [%g1 + CH_ERR_TL1_TPC]%asi;	\
8:

/*
 * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
 * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
 * at TL>0 handlers because the D$ may have corrupted data and we need to
 * turn off the I$ to allow for diagnostic accesses.  We then invoke
 * the normal entry macro and after it is done we save the values of
 * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
 * CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
 */
#define	CH_ERR_TL1_FECC_ENTER			\
	ldxa	[%g0]ASI_DCU, %g1;		\
	andn	%g1, DCU_DC + DCU_IC, %g2;	\
	stxa	%g2, [%g0]ASI_DCU;		\
	flush	%g0;	/* DCU_IC need flush */	\
	rdpr	%tstate, %g2;			\
	and	%g1, DCU_DC + DCU_IC, %g1;	\
	sllx	%g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1;	\
	or	%g1, %g2, %g2;			\
	wrpr	%g2, %tstate;			\
	CH_ERR_TL1_ENTER(CH_ERR_FECC);		\
	and	%g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5;	\
	stxa	%g5, [%g1 + CH_ERR_TL1_TMP]%asi

/*
 * Macro to generate exit code for TL>0 error handlers.
 * We fall into this macro if we've successfully logged the error in
 * the ch_err_tl1_data structure and want the PIL15 softint to pick
 * it up and log it.
 * Does the following steps:
 *   1.	Set pending flag for this cpu in ch_err_tl1_pending.
 *   2.	Write %set_softint with (1<<pil) to cause a pil level trap
 *   3.	Restore registers from ch_err_tl1_data, which is pointed to
 *	by %g1, last register to restore is %g1 since it's pointing
 *	to the save area.
 *   4. Execute retry
 */
#define	CH_ERR_TL1_EXIT				\
	CPU_INDEX(%g2, %g3);			\
	set	ch_err_tl1_pending, %g3;	\
	set	-1, %g4;			\
	stb	%g4, [%g2 + %g3];		\
	mov	1, %g2;				\
	sll	%g2, PIL_15, %g2;		\
	wr	%g2, SET_SOFTINT;		\
	ldxa	[%g1 + CH_ERR_TL1_G7]%asi, %g7;	\
	ldxa	[%g1 + CH_ERR_TL1_G6]%asi, %g6;	\
	ldxa	[%g1 + CH_ERR_TL1_G5]%asi, %g5;	\
	ldxa	[%g1 + CH_ERR_TL1_G4]%asi, %g4;	\
	ldxa	[%g1 + CH_ERR_TL1_G3]%asi, %g3;	\
	ldxa	[%g1 + CH_ERR_TL1_G2]%asi, %g2;	\
	ldxa	[%g1 + CH_ERR_TL1_G1]%asi, %g1;	\
	retry

/*
 * Generates unrecoverable error label for TL>0 handlers.
 * At label (Unrecoverable error routine)
 *   1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
 *	argument to cpu_tl1_err_panic).
 *   2.	Call cpu_tl1_err_panic via systrap at PIL 15
 */
#define	CH_ERR_TL1_PANIC_EXIT(label)		\
label:	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2;	\
	or	%g2, CH_ERR_TL | CH_ERR_PANIC, %g2;	\
	stxa	%g2, [%g1 + CH_ERR_TL1_FLAGS]%asi;	\
	set	cpu_tl1_err_panic, %g1;		\
	ba	sys_trap;			\
	  mov	PIL_15, %g4



/* END CSTYLED */
#endif	/* _ASM */

#ifdef	__cplusplus
}
#endif

#endif /* _CHEETAHASM_H */