1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 /*
28  * Support routines for managing per-Lxcache state.
29  */
30 
31 #include <sys/types.h>
32 #include <errno.h>
33 #include <strings.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <stropts.h>
38 #include <fm/fmd_api.h>
39 #include <sys/fm/protocol.h>
40 #include <sys/fm/cpu/UltraSPARC-III.h>
41 #include <sys/cpuvar.h>
42 #include <cmd_Lxcache.h>
43 #include <cmd_mem.h>
44 #include <cmd_cpu.h>
45 #include <cmd_state.h>
46 #include <cmd.h>
47 #define	_KERNEL
48 #include <sys/cheetahregs.h>
49 #include <sys/mem_cache.h>
50 #undef _KERNEL
51 #include <sys/errclassify.h>
52 #include <sys/fm/io/sun4upci.h>
53 
54 #include <fmd_adm.h>
55 #include <fmd_adm_impl.h>
56 #include <fmd_rpc_adm.h>
57 
58 #define	PN_CACHE_ERRORS (CMD_ERRCL_UCC | CMD_ERRCL_WDC | \
59 			    CMD_ERRCL_CPC | CMD_ERRCL_EDC | \
60 			    CMD_ERRCL_L3_UCC | CMD_ERRCL_L3_CPC |\
61 			    CMD_ERRCL_L3_WDC | CMD_ERRCL_L3_EDC)
62 
63 /* Note that these are the same for panther L2 and L3 (see prm) */
64 
65 #define	LX_INDEX_MASK		PN_L2_INDEX_MASK
66 #define	LX_INDEX_SHIFT		6
67 #define	PN_ECSTATE_NA	5
68 #define	PN_ECSTATE_INV	0
69 
70 #define	PN_L3_INDEX_MASK	PN_L3_TAG_RD_MASK
71 
72 static const errdata_t l3errdata =
73 	{ &cmd.cmd_l3data_serd, "l3cachedata", CMD_PTR_LxCACHE_CASE };
74 static const errdata_t l2errdata =
75 	{ &cmd.cmd_l2data_serd, "l2cachedata", CMD_PTR_LxCACHE_CASE };
76 
77 /* Macro for putting 64-bit onto stack as two 32-bit ints */
78 #define	PRTF_64_TO_32(x)	(uint32_t)((x)>>32), (uint32_t)(x)
79 
80 #define	LX_PA_MASK2_32BIT_CORRECT	16
81 #define	LX_PA_MASK3_32BIT_CORRECT	24
82 #define	LX_PA_MASK2 0x7fffff8
83 #define	LX_PA_MASK3 0x7ffff8
84 
85 
86 #define	MAX_RETRIES_FOR_ECC_MATCH	3
87 #define	PN_TAG_ECC_MASK 0x7fc0
88 #define	PN_L2_PTAG_SHIFT	19
89 #define	PN_L3_PTAG_SHIFT	24
90 #define	L2_PTAG_MASK		0xffffff
91 #define	L3_PTAG_MASK		0xfffff
92 #define	BIT_MASK		0x7f
93 #define	MSB_BIT			0x8000
94 #define	SET_MSB_BIT		0x8000
95 #define	CLEAR_MSB_BIT		0x7fff
96 #define	PN_LX_TAG_ECC_START_BIT	6
97 #define	PN_LX_TAG_ECC_END_BIT	14
98 #define	PN_LX_STATE_END_BIT	2
99 #define	PN_LX_NUM_OF_BITS_IN_ECC	9
100 
101 #define	LX_NWAYS		4
102 
int test_mode = 0;	/* should be 0 in production version. */
#define	FM_EREPORT_RECHECK_OF_TAGS "recheck_tags"
#define	RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO	3
/*
 * Delay applied before each successive re-read of the tags when the
 * computed syndrome is zero, indexed by retry number (0 = first try).
 * NOTE(review): units are not shown here — confirm against the timeout
 * scheduling code that consumes this table.
 */
uint32_t cmd_Lxcache_recheck_tags_delay
	[RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO + 1] = {0, 1, 2, 4};
108 
109 /*
110  * e (for ecctable) maps single bit positions (0-127, or 0-0x7F) to the
111  * corresponding ECC syndromes for an error in that position.
112  */
/*
 * e[i] is the 9-bit ECC syndrome generated when bit i of the protected
 * 128-bit datum is in error; the nine trailing entries are the syndromes
 * for the check bits C0-C8 themselves (single one-hot bits).
 */
int e[] = {
	/* From Table P-4, JPS1 US-III Supplement */
		/* 0	1	2	3	4	5	6	7 */
/* 00 */	0x03B,	0x127,	0x067,	0x097,	0x10F,	0x08F,	0x04F,	0x02C,
/* 08 */	0x147,	0x0C7,	0x02F,	0x01C,	0x117,	0x032,	0x08A,	0x04A,
/* 10 */	0x01F,	0x086,	0x046,	0x026,	0x09B,	0x08C,	0x0C1,	0x0A1,
/* 18 */	0x01A,	0x016,	0x061,	0x091,	0x052,	0x00E,	0x109,	0x029,
/* 20 */	0x02A,	0x019,	0x105,	0x085,	0x045,	0x025,	0x015,	0x103,
/* 28 */	0x031,	0x00D,	0x083,	0x043,	0x051,	0x089,	0x023,	0x007,
/* 30 */	0x0B9,	0x049,	0x013,	0x0A7,	0x057,	0x00B,	0x07A,	0x187,
/* 38 */	0x0F8,	0x11B,	0x079,	0x034,	0x178,	0x1D8,	0x05B,	0x04C,
/* 40 */	0x064,	0x1B4,	0x037,	0x03D,	0x058,	0x13C,	0x1B1,	0x03E,
/* 48 */	0x1C3,	0x0BC,	0x1A0,	0x1D4,	0x1CA,	0x190,	0x124,	0x13A,
/* 50 */	0x1C0,	0x188,	0x122,	0x114,	0x184,	0x182,	0x160,	0x118,
/* 58 */	0x181,	0x150,	0x148,	0x144,	0x142,	0x141,	0x130,	0x0A8,
/* 60 */	0x128,	0x121,	0x0E0,	0x094,	0x112,	0x10C,	0x0D0,	0x0B0,
/* 68 */	0x10A,	0x106,	0x062,	0x1B2,	0x0C8,	0x0C4,	0x0C2,	0x1F0,
/* 70 */	0x0A4,	0x0A2,	0x098,	0x1D1,	0x070,	0x1E8,	0x1C6,	0x1C5,
/* 78 */	0x068,	0x1E4,	0x1E2,	0x1E1,	0x1D2,	0x1CC,	0x1C9,	0x1B8,
	/* Now we have the check bits */
	/* C0	C1	C2	C3	C4	C5	C6	C7	C8 */
	0x001,	0x002,	0x004,	0x008,	0x010,	0x020,	0x040,	0x080,	0x100,
};
136 
137 #define	NBITS (sizeof (e)/sizeof (e[0]))
138 #define	NDATABITS (128)
139 /*
140  * This table is used to determine which bit(s) is(are) bad when an ECC
141  * error occurs.  The array is indexed by an 9-bit syndrome.  The entries
142  * of this array have the following semantics:
143  *
144  *      00-127  The number of the bad bit, when only one bit is bad.
145  *      128     ECC bit C0 is bad.
146  *      129     ECC bit C1 is bad.
147  *      130     ECC bit C2 is bad.
148  *      131     ECC bit C3 is bad.
149  *      132     ECC bit C4 is bad.
150  *      133     ECC bit C5 is bad.
151  *      134     ECC bit C6 is bad.
152  *      135     ECC bit C7 is bad.
153  *      136     ECC bit C8 is bad.
154  *	137-143 reserved for Mtag Data and ECC.
155  *      144(M2) Two bits are bad within a nibble.
156  *      145(M3) Three bits are bad within a nibble.
157  *      146(M3) Four bits are bad within a nibble.
158  *      147(M)  Multiple bits (5 or more) are bad.
159  *      148     NO bits are bad.
160  * Based on "Cheetah Programmer's Reference Manual" rev 1.1, Tables 11-4,11-5.
161  */
162 
163 #define	C0	128
164 #define	C1	129
165 #define	C2	130
166 #define	C3	131
167 #define	C4	132
168 #define	C5	133
169 #define	C6	134
170 #define	C7	135
171 #define	C8	136
172 #define	MT0	137	/* Mtag Data bit 0 */
173 #define	MT1	138
174 #define	MT2	139
175 #define	MTC0	140	/* Mtag Check bit 0 */
176 #define	MTC1	141
177 #define	MTC2	142
178 #define	MTC3	143
179 #define	M2	144
180 #define	M3	145
181 #define	M4	146
182 #define	M	147
183 #define	NA	148
184 #if defined(JALAPENO) || defined(SERRANO)
185 #define	S003	149	/* Syndrome 0x003 => likely from CPU/EDU:ST/FRU/BP */
186 #define	S003MEM	150	/* Syndrome 0x003 => likely from WDU/WBP */
187 #define	SLAST	S003MEM	/* last special syndrome */
188 #else /* JALAPENO || SERRANO */
189 #define	S003	149	/* Syndrome 0x003 => likely from EDU:ST */
190 #define	S071	150	/* Syndrome 0x071 => likely from WDU/CPU */
191 #define	S11C	151	/* Syndrome 0x11c => likely from BERR/DBERR */
192 #define	SLAST	S11C	/* last special syndrome */
193 #endif /* JALAPENO || SERRANO */
194 #if defined(JALAPENO) || defined(SERRANO)
195 #define	BPAR0	152	/* syndrom 152 through 167 for bus parity */
196 #define	BPAR15	167
197 #endif	/* JALAPENO || SERRANO */
198 
/*
 * Indexed by the 9-bit syndrome; entry semantics (bit numbers, C0-C8,
 * M2/M3/M4/M multi-bit codes, NA) are described in the comment above.
 */
static uint8_t ecc_syndrome_tab[] =
{
NA,  C0,  C1, S003, C2,  M2,  M3,  47,  C3,  M2,  M2,  53,  M2,  41,  29,   M,
C4,   M,   M,  50,  M2,  38,  25,  M2,  M2,  33,  24,  M2,  11,   M,  M2,  16,
C5,   M,   M,  46,  M2,  37,  19,  M2,   M,  31,  32,   M,   7,  M2,  M2,  10,
M2,  40,  13,  M2,  59,   M,  M2,  66,   M,  M2,  M2,   0,  M2,  67,  71,   M,
C6,   M,   M,  43,   M,  36,  18,   M,  M2,  49,  15,   M,  63,  M2,  M2,   6,
M2,  44,  28,  M2,   M,  M2,  M2,  52,  68,  M2,  M2,  62,  M2,  M3,  M3,  M4,
M2,  26, 106,  M2,  64,   M,  M2,   2, 120,   M,  M2,  M3,   M,  M3,  M3,  M4,
#if defined(JALAPENO) || defined(SERRANO)
116, M2,  M2,  M3,  M2,  M3,   M,  M4,  M2,  58,  54,  M2,   M,  M4,  M4,  M3,
#else	/* JALAPENO || SERRANO */
116, S071, M2,  M3,  M2,  M3,   M,  M4,  M2,  58,  54,  M2,   M,  M4,  M4,  M3,
#endif	/* JALAPENO || SERRANO */
C7,  M2,   M,  42,   M,  35,  17,  M2,   M,  45,  14,  M2,  21,  M2,  M2,   5,
M,   27,   M,   M,  99,   M,   M,   3, 114,  M2,  M2,  20,  M2,  M3,  M3,   M,
M2,  23, 113,  M2, 112,  M2,   M,  51,  95,   M,  M2,  M3,  M2,  M3,  M3,  M2,
103,  M,  M2,  M3,  M2,  M3,  M3,  M4,  M2,  48,   M,   M,  73,  M2,   M,  M3,
M2,  22, 110,  M2, 109,  M2,   M,   9, 108,  M2,   M,  M3,  M2,  M3,  M3,   M,
102, M2,   M,   M,  M2,  M3,  M3,   M,  M2,  M3,  M3,  M2,   M,  M4,   M,  M3,
98,   M,  M2,  M3,  M2,   M,  M3,  M4,  M2,  M3,  M3,  M4,  M3,   M,   M,   M,
M2,  M3,  M3,   M,  M3,   M,   M,   M,  56,  M4,   M,  M3,  M4,   M,   M,   M,
C8,   M,  M2,  39,   M,  34, 105,  M2,   M,  30, 104,   M, 101,   M,   M,   4,
#if defined(JALAPENO) || defined(SERRANO)
M,    M, 100,   M,  83,   M,  M2,  12,  87,   M,   M,  57,  M2,   M,  M3,   M,
#else	/* JALAPENO || SERRANO */
M,    M, 100,   M,  83,   M,  M2,  12,  87,   M,   M,  57, S11C,  M,  M3,   M,
#endif	/* JALAPENO || SERRANO */
M2,  97,  82,  M2,  78,  M2,  M2,   1,  96,   M,   M,   M,   M,   M,  M3,  M2,
94,   M,  M2,  M3,  M2,   M,  M3,   M,  M2,   M,  79,   M,  69,   M,  M4,   M,
M2,  93,  92,   M,  91,   M,  M2,   8,  90,  M2,  M2,   M,   M,   M,   M,  M4,
89,   M,   M,  M3,  M2,  M3,  M3,   M,   M,   M,  M3,  M2,  M3,  M2,   M,  M3,
86,   M,  M2,  M3,  M2,   M,  M3,   M,  M2,   M,  M3,   M,  M3,   M,   M,  M3,
M,    M,  M3,  M2,  M3,  M2,  M4,   M,  60,   M,  M2,  M3,  M4,   M,   M,  M2,
M2,  88,  85,  M2,  84,   M,  M2,  55,  81,  M2,  M2,  M3,  M2,  M3,  M3,  M4,
77,   M,   M,   M,  M2,  M3,   M,   M,  M2,  M3,  M3,  M4,  M3,  M2,   M,   M,
74,   M,  M2,  M3,   M,   M,  M3,   M,   M,   M,  M3,   M,  M3,   M,  M4,  M3,
M2,  70, 107,  M4,  65,  M2,  M2,   M, 127,   M,   M,   M,  M2,  M3,  M3,   M,
80,  M2,  M2,  72,   M, 119, 118,   M,  M2, 126,  76,   M, 125,   M,  M4,  M3,
M2, 115, 124,   M,  75,   M,   M,  M3,  61,   M,  M4,   M,  M4,   M,   M,   M,
M,  123, 122,  M4, 121,  M4,   M,  M3, 117,  M2,  M2,  M3,  M4,  M3,   M,   M,
111,  M,   M,   M,  M4,  M3,  M3,   M,   M,   M,  M3,   M,  M3,  M2,   M,   M
};
242 
243 #define	ESYND_TBL_SIZE	(sizeof (ecc_syndrome_tab) / sizeof (uint8_t))
244 
/*
 * Maps an L2 tag bit position (0-127) to the cache way (0-3) that owns
 * that bit, or -1 where the bit does not identify a single way
 * (see bit_to_way()).
 */
int8_t L2TAG_bit_to_way_map[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  0,  0,  0,  0,
/* 2 */ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 3 */ 0,  0,  0,  0,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
/* 4 */ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -1, -1, -1, -1,
/* 5 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
/* 6 */ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
/* 7 */ 1,  1,  1,  1,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* 8 */ 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -1, -1, -1, -1,
};
256 
/*
 * Maps an L2 tag bit position (0-127) to the bit number within the
 * owning way's tag; C0 entries are ECC check-bit positions
 * (see tag_bit_to_way_bit()).
 */
uint8_t L2TAG_bit_to_way_bit[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 0,  1,  2,  0,  1,  2,  0,  1,  2,  0,  1,  2,  19, 20, 21, 22,
/* 2 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 3 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 4 */31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, C0, C0, C0, C0,
/* 5 */C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, 19, 20, 21, 22,
/* 6 */23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
/* 7 */39, 40, 41, 42, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
/* 8 */31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, C0, C0, C0, C0,
};
268 
/*
 * Maps an L3 tag bit position (0-127) to the owning way (0-3), or -1
 * where the bit does not identify a single way.  L3 tag bits for the
 * way pairs (1,3) and (0,2) are interleaved (see gen_data_for_ecc()).
 */
int8_t L3TAG_bit_to_way_map[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,
/* 2 */ 1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,
/* 3 */ 1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3,  1,  3, -1, -1,
/* 4 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 5 */ 0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,
/* 6 */ 0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,
/* 7 */ 0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2,  0,  2, -1, -1,
/* 8 */-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
280 
/*
 * Maps an L3 tag bit position (0-127) to the bit number within the
 * owning way's tag; C0 entries are ECC check-bit positions
 * (see tag_bit_to_way_bit()).
 */
uint8_t L3TAG_bit_to_way_bit[128] = {
/*	1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16 */
/* 1 */ 0,  0,  1,  1,  2,  2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 2 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 3 */37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, C0, C0,
/* 4 */C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
/* 5 */ 0,  0,  1,  1,  2,  2, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
/* 6 */29, 29, 30, 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36,
/* 7 */37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, C0, C0,
/* 8 */C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0,
};
292 
293 uint16_t
calcecc(uint64_t chi,uint64_t clo)294 calcecc(uint64_t chi, uint64_t clo)
295 {
296 	int i;
297 	uint64_t syndrome = 0;
298 
299 	for (i = 0; i < (NDATABITS/2); i++) {
300 		syndrome ^= ((chi & 1) ? e[(NDATABITS/2) + i] : 0) ^
301 		    ((clo & 1) ? e[i] : 0);
302 		chi >>= 1;
303 		clo >>= 1;
304 	}
305 	return (uint16_t)(syndrome);
306 }
307 
308 uint64_t
calcsynd(uint64_t chi,uint64_t clo,uint64_t ecc)309 calcsynd(uint64_t chi, uint64_t clo, uint64_t ecc)
310 {
311 	return (calcecc(chi, clo) ^ ecc);
312 }
313 
314 static uint8_t
tag_bit_to_way_bit(cmd_ptrsubtype_t pstype,int16_t tag_bit)315 tag_bit_to_way_bit(cmd_ptrsubtype_t pstype, int16_t tag_bit)
316 {
317 	uint8_t way_bit = C0;
318 
319 	switch (pstype) {
320 		case CMD_PTR_CPU_L2TAG:
321 			way_bit = L2TAG_bit_to_way_bit[tag_bit];
322 			break;
323 		case CMD_PTR_CPU_L3TAG:
324 			way_bit = L3TAG_bit_to_way_bit[tag_bit];
325 			break;
326 	}
327 	return (way_bit);
328 }
329 
330 static int8_t
bit_to_way(cmd_ptrsubtype_t pstype,uint32_t bit)331 bit_to_way(cmd_ptrsubtype_t pstype, uint32_t bit)
332 {
333 	int8_t way = -1;
334 
335 	switch (pstype) {
336 		case CMD_PTR_CPU_L2TAG:
337 			way = L2TAG_bit_to_way_map[bit & BIT_MASK];
338 			break;
339 		case CMD_PTR_CPU_L3TAG:
340 			way = L3TAG_bit_to_way_map[bit & BIT_MASK];
341 			break;
342 	}
343 	return (way);
344 }
345 
346 static int32_t
get_index(cmd_ptrsubtype_t pstype,uint64_t tag_afar)347 get_index(cmd_ptrsubtype_t pstype, uint64_t tag_afar)
348 {
349 	int32_t	index = -1;
350 
351 	switch (pstype) {
352 		case CMD_PTR_CPU_L2TAG:
353 			index = (int32_t)((tag_afar & PN_L2_INDEX_MASK)
354 			    >> PN_CACHE_LINE_SHIFT);
355 			break;
356 		case CMD_PTR_CPU_L3TAG:
357 			index = (int32_t)((tag_afar & PN_L3_TAG_RD_MASK)
358 			    >> PN_CACHE_LINE_SHIFT);
359 			break;
360 	}
361 	return (index);
362 }
363 
364 static int
get_retired_ways(uint64_t * tag_data)365 get_retired_ways(uint64_t *tag_data)
366 {
367 	int		i, retired_ways;
368 
369 	retired_ways = 0;
370 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
371 		if ((tag_data[i] & CH_ECSTATE_MASK) ==
372 		    PN_ECSTATE_NA)
373 			retired_ways++;
374 	}
375 	return (retired_ways);
376 }
377 
/*
 * Pull the AFAR and the four ways' tag data for the faulting cache index
 * out of the ereport payload.  Falls back to reading the tags directly
 * through the mem_cache driver (get_tagdata) when this is a scheduled
 * recheck of the tags, when running in test_mode, or when the payload
 * does not carry the tag-data array.
 *
 * Returns CMD_EVD_OK with tag_data[] filled in and *afarp set, or
 * CMD_EVD_BAD when the AFAR is invalid or pstype is not an L2/L3 tag
 * type (or the fallback path's return value otherwise).
 */
static cmd_evdisp_t
extract_data_from_ereport_payload(fmd_hdl_t *hdl, nvlist_t *nvl,
    cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype,
    uint64_t *afarp, uint64_t *tag_data,
    const char *fltnm)
{
	ch_ec_data_t	*ec_data;
	char		*payload_namep;
	int		tag_afar_status;
	uint64_t	tag_afar;
	int		i;
	uint_t		sz;
	int32_t	index;
	int32_t		recheck_of_tags;

	tag_afar_status = cmd_afar_valid(hdl, nvl, 0, &tag_afar);
	if (tag_afar_status == -1) {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id = %d Invalid afar status in nvlist\n",
		    fltnm, cpu->cpu_cpuid);
		return (CMD_EVD_BAD);
	}
	*afarp = tag_afar;
	index = get_index(pstype, tag_afar);
	switch (pstype) {
		case CMD_PTR_CPU_L2TAG:
			payload_namep = FM_EREPORT_PAYLOAD_NAME_L2_DATA;
			break;
		case CMD_PTR_CPU_L3TAG:
			payload_namep = FM_EREPORT_PAYLOAD_NAME_L3_DATA;
			break;
		default:
			return (CMD_EVD_BAD);
	}
	/* Absence of the recheck marker means this is a first-time ereport. */
	if (nvlist_lookup_int32(nvl, FM_EREPORT_RECHECK_OF_TAGS,
	    &recheck_of_tags) != 0)
		recheck_of_tags = 0;
	/* On a recheck (or in test mode) always read fresh tags from HW. */
	if ((recheck_of_tags) || (test_mode))
		return (get_tagdata(cpu, pstype, index, tag_data));
	if (nvlist_lookup_uint64_array(nvl, payload_namep,
	    (uint64_t **)&ec_data, &sz) != 0) {
		fmd_hdl_debug(hdl,
		    "\n%s: cpu_id = %d index = %d could not find %s"
		    " in nvlist\n",
		    fltnm, cpu->cpu_cpuid, index, payload_namep);
		fmd_hdl_debug(hdl,
		    "\n%s: cpu_id = %d Reading tag data through"
		    " mem_cache driver.\n",
		    fltnm, cpu->cpu_cpuid);
		return (get_tagdata(cpu, pstype, index,
		    tag_data));
	}
	/* Copy one ec_tag per way out of the payload's ec_data array. */
	for (i = 0; i < PN_CACHE_NWAYS; i++) {
		tag_data[i] = ec_data[i].ec_tag;
	}
	return (CMD_EVD_OK);
}
436 
437 static void
print_ecc(fmd_hdl_t * hdl,cmd_cpu_t * cpu,const char * fltnm,uint64_t * tag_data)438 print_ecc(fmd_hdl_t *hdl, cmd_cpu_t *cpu, const char *fltnm, uint64_t *tag_data)
439 {
440 	int	i;
441 	uint16_t	tag_ecc[PN_CACHE_NWAYS];
442 
443 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
444 		tag_ecc[i] =
445 		    ((tag_data[i] & PN_TAG_ECC_MASK)
446 		    >> PN_LX_TAG_ECC_START_BIT);
447 	}
448 	fmd_hdl_debug(hdl,
449 	    "\n%s: cpu_id = %d ecc[0] = 0x%03x ecc[1] = 0x%03x"
450 	    " ecc[2] = 0x%03x ecc[3] = 0x%03x\n",
451 	    fltnm, cpu->cpu_cpuid, tag_ecc[0], tag_ecc[1], tag_ecc[2],
452 	    tag_ecc[3]);
453 
454 }
455 
456 static int
matching_ecc(uint64_t * tag_data)457 matching_ecc(uint64_t *tag_data)
458 {
459 	int	i;
460 	uint16_t	tag_ecc[PN_CACHE_NWAYS];
461 
462 	for (i = 0; i < PN_CACHE_NWAYS; i++) {
463 		tag_ecc[i] =
464 		    ((tag_data[i] & PN_TAG_ECC_MASK)
465 		    >> PN_LX_TAG_ECC_START_BIT);
466 		if (tag_ecc[i] != tag_ecc[0]) {
467 			return (1);
468 		}
469 	}
470 	return (0);
471 }
472 
/*
 * Rebuild the 128-bit quantity over which the hardware computes the tag
 * ECC, from the state and physical-tag fields of all four ways.  The
 * result is returned as two 64-bit halves in data_for_ecc_gen[0] (high)
 * and data_for_ecc_gen[1] (low).  L2 packs each way's fields contiguously;
 * L3 interleaves the bits of way pairs (1,3) and (0,2) — layouts per the
 * Panther PRM (see L3TAG_bit_to_way_map above for the same interleave).
 */
static void
gen_data_for_ecc(uint64_t *tag_data, uint64_t *data_for_ecc_gen,
    cmd_ptrsubtype_t pstype)
{
	uint64_t	ptag[PN_CACHE_NWAYS];
	uint8_t		state[PN_CACHE_NWAYS];
	int		i;
	uint8_t		bit_position;

	/* Split each way's raw tag into its state and physical-tag fields. */
	for (i = 0; i < PN_CACHE_NWAYS; i++) {
		state[i] = tag_data[i] & CH_ECSTATE_MASK;
		switch (pstype) {
			case CMD_PTR_CPU_L2TAG:
				ptag[i] = (tag_data[i] >> PN_L2_PTAG_SHIFT) &
				    L2_PTAG_MASK;
				break;
			case CMD_PTR_CPU_L3TAG:
				ptag[i] = (tag_data[i] >> PN_L3_PTAG_SHIFT) &
				    L3_PTAG_MASK;
				break;
		}
	}
	/*
	 * We now assemble the 128 bit data swizzling the Physical tags
	 * and states we obtained for all the 4 ways.
	 */
	data_for_ecc_gen[0] = 0;	/* high order 64 bits */
	data_for_ecc_gen[1] = 0;	/* low order 64 bits */
	switch (pstype) {
		case CMD_PTR_CPU_L2TAG:
			data_for_ecc_gen[1] = state[0];	/* way 0 state */
			data_for_ecc_gen[1] |=
			    (state[1] << 3); /* way 1 state */
			data_for_ecc_gen[1] |=
			    (state[2] << 6); /* way 2 state */
			data_for_ecc_gen[1] |=
			    (state[3] << 9); /* way 3 state */
			data_for_ecc_gen[1] |= (ptag[0] << 12); /* way 0 ptag */
			data_for_ecc_gen[1] |= (ptag[2] << 36); /* way 2 ptag */
			/* bits 63:60 of low order 64 bits are 0s */

			/*
			 * We now start with hig order 64 bits.
			 * the low 12 bits are 0s
			 */
			data_for_ecc_gen[0] |= (ptag[1] << 12); /* way 1 ptag */
			data_for_ecc_gen[0] |= (ptag[3] << 36); /* way 3 ptag */
			break;
		case CMD_PTR_CPU_L3TAG:
			bit_position = 0;
			/*
			 * Swizzle state bits for way 1 and way 3
			 * (3 state bits each, interleaved bit-by-bit).
			 */
			for (i = 0; i < 3; i++) {
				data_for_ecc_gen[1] |=
				    (((state[1] >> i) & 1) << bit_position);
				bit_position++;
				data_for_ecc_gen[1] |=
				    (((state[3] >> i) & 1) << bit_position);
				bit_position++;
			}
			/*
			 * Swizzle physical tag bits for way 1 and way 3
			 * (20 ptag bits each, interleaved bit-by-bit).
			 */
			for (i = 0; i < 20; i++) {
				data_for_ecc_gen[1] |=
				    (((ptag[1] >> i) & 1) << bit_position);
				bit_position++;
				data_for_ecc_gen[1] |=
				    (((ptag[3] >> i) & 1) << bit_position);
				bit_position++;
			}
			/*
			 * start the high order 64 bits.
			 */
			bit_position = 0;
			/*
			 * Swizzle state bits for way 0 and way 2
			 */
			for (i = 0; i < 3; i++) {
				data_for_ecc_gen[0] |=
				    (((state[0] >> i) & 1) << bit_position);
				bit_position++;
				data_for_ecc_gen[0] |=
				    (((state[2] >> i) & 1) << bit_position);
				bit_position++;
			}
			/*
			 * Swizzle physical tag bits for way 0 and way 2
			 */
			for (i = 0; i < 20; i++) {
				data_for_ecc_gen[0] |=
				    (((ptag[0] >> i) & 1) << bit_position);
				bit_position++;
				data_for_ecc_gen[0] |=
				    (((ptag[2] >> i) & 1) << bit_position);
				bit_position++;
			}
			break;
	}
}
574 
575 static uint16_t
compute_syndrome(uint64_t * tag_data,cmd_ptrsubtype_t pstype)576 compute_syndrome(uint64_t *tag_data, cmd_ptrsubtype_t pstype)
577 {
578 	uint64_t	tag_synd;
579 	uint64_t	data_for_ecc_gen[2];
580 	uint16_t	tag_ecc;
581 
582 	gen_data_for_ecc(tag_data, data_for_ecc_gen, pstype);
583 	tag_ecc = ((tag_data[0] & PN_TAG_ECC_MASK) >> PN_LX_TAG_ECC_START_BIT);
584 	tag_synd = calcsynd(data_for_ecc_gen[0], data_for_ecc_gen[1],
585 	    (uint64_t)tag_ecc);
586 	return (tag_synd);
587 }
588 
589 static int16_t
find_bit_stickiness(uint64_t * tag_data,int8_t way,int16_t bit)590 find_bit_stickiness(uint64_t *tag_data, int8_t way, int16_t bit)
591 {
592 	int16_t	sticky_bit;
593 
594 	sticky_bit = bit;
595 	if ((tag_data[way] & ((uint64_t)1 << bit)) != 0)
596 		sticky_bit |= MSB_BIT;
597 	return (sticky_bit);
598 }
599 
/*
 * Replace an Lxcache structure with a freshly created one carrying the
 * same type/index/way/bit: create the new Lxcache (which also takes over
 * the event ep via Lxcache->xr), open a case for it, then destroy the
 * original.  Returns the new Lxcache, or NULL (original left intact) if
 * creation failed.
 */
static cmd_Lxcache_t *
cmd_create_and_destroy_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *Lxcache)
{
	const char		*fltnm;
	cmd_Lxcache_t	*new_Lxcache;

	fltnm = cmd_type_to_str(Lxcache->Lxcache_type);

	/*
	 * We first create a new Lxcache and add the event ep
	 * that is in Lxcache to the new case we create.
	 * we then destroy the Lxcache that has the event ep in its SERD engine.
	 */
	new_Lxcache = cmd_Lxcache_create(hdl, Lxcache->xr, cpu,
	    cpu->cpu_asru_nvl,
	    Lxcache->Lxcache_type,
	    Lxcache->Lxcache_index, Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
	if (new_Lxcache == NULL) {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d:Failed to create a Lxcache for"
		    " index %d way %d bit %d\n",
		    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_index,
		    Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
		return (NULL);
	}
	(void) cmd_create_case_for_Lxcache(hdl, cpu, new_Lxcache);
	cmd_Lxcache_destroy(hdl, cpu, Lxcache);
	return (new_Lxcache);
}
630 
631 int
cmd_Lxcache_retire_as_reason(fmd_hdl_t * hdl,cmd_cpu_t * cpu,cmd_Lxcache_t * Lxcache,const char * fltnm,int32_t reason)632 cmd_Lxcache_retire_as_reason(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
633     cmd_Lxcache_t *Lxcache, const char *fltnm, int32_t reason)
634 {
635 	boolean_t	ret;
636 	uint_t		certainty;
637 
638 	if (reason == CMD_LXSUSPECT_0_TAG) {
639 		/*
640 		 * clear MSB bit to retire as SUSPECT_0_TAG
641 		 * We need to update the Lxcache asru to reflect
642 		 * the change in bit value.
643 		 */
644 		Lxcache->Lxcache_bit &= CLEAR_MSB_BIT;
645 		errno = nvlist_add_uint16(
646 		    Lxcache->Lxcache_asru_nvl,
647 		    FM_FMRI_CPU_CACHE_BIT,
648 		    Lxcache->Lxcache_bit);
649 		if (errno) {
650 			fmd_hdl_debug(hdl,
651 			    "\n%s:cpu_id %d: failed to update",
652 			    " CACHE_BIT in asru.\n",
653 			    fltnm, cpu->cpu_cpuid);
654 			return (CMD_EVD_BAD);
655 		}
656 	}
657 	if (reason == CMD_LXCONVICTED)
658 		certainty = HUNDRED_PERCENT;
659 	else
660 		certainty = SUSPECT_PERCENT;
661 	ret = cmd_Lxcache_retire(hdl, cpu, Lxcache, fltnm, certainty);
662 	if (reason == CMD_LXSUSPECT_0_TAG)
663 		Lxcache->Lxcache_bit |= SET_MSB_BIT;
664 	if (ret == B_FALSE)
665 		return (CMD_EVD_BAD);
666 	Lxcache->Lxcache_reason = reason;
667 	/*
668 	 * Update the persistence storage of
669 	 * Lxcache.
670 	 */
671 	fmd_hdl_debug(hdl,
672 	    "\n%s:cpu_id %d:reason = %s flags = %s\n",
673 	    fltnm, cpu->cpu_cpuid,
674 	    cmd_reason_to_str(Lxcache->Lxcache_reason),
675 	    cmd_flags_to_str(Lxcache->Lxcache_flags));
676 	cmd_Lxcache_write(hdl, Lxcache);
677 	return (CMD_EVD_OK);
678 }
679 
/*
 * Handle an anonymous (way-unknown) TAG or DATA error by retiring the
 * lowest still-retirable way at the faulting index as a SUSPECT.
 *
 * Ownership: this routine always consumes the anonymous_Lxcache passed
 * in — on success it is replaced by a new Lxcache bound to the chosen
 * way; on the fault-the-CPU paths and on failure it is simply destroyed.
 * Returns CMD_EVD_OK or CMD_EVD_BAD.
 */
int
retire_lowest_retirable_way_as_suspect(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *anonymous_Lxcache, const char *fltnm)
{
	/*
	 * This routine is called only when handling anonymous TAG or DATA
	 * errors. When we exit this routine we would have destroyed the
	 * anonymous_Lxcache structure that was passed to us and created
	 * a new Lxcache if we were successful in determining a way to retire.
	 */
	int8_t	lowest_retirable_way, ways_retired;
	int32_t	reason;
	cmd_ptrsubtype_t type;
	cmd_Lxcache_t *new_Lxcache;

	ways_retired = get_index_retired_ways(cpu,
	    anonymous_Lxcache->Lxcache_type,
	    anonymous_Lxcache->Lxcache_index);
	if (ways_retired == -1) {
		/*
		 * Couldn't determine how many ways have been retired at this
		 * index. Destroy the anonymous_Lxcache and return failure.
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		return (CMD_EVD_BAD);
	}
	/*
	 * Before retiring a way check if we have already
	 * retired 3 ways for this index.
	 * For TAG errors we will not perform this check because
	 * we could reretire cachlines retired for DATA errors.
	 * The get_lowest_retirable_way() will ensure that we do
	 * not end up retiring all 4 ways.
	 */
	if (!IS_TAG(anonymous_Lxcache->Lxcache_type)) {
		if (ways_retired >= 3) {
			fmd_hdl_debug(hdl,
			    "\n%s: cpu %d: num of ways retired for index %d"
			    " is %d will fault the CPU\n",
			    fltnm, cpu->cpu_cpuid,
			    anonymous_Lxcache->Lxcache_index, ways_retired);
			/* save type: Lxcache is freed before we fault */
			type = anonymous_Lxcache->Lxcache_type;
			/*
			 * destroy the anonymous_Lxcache
			 */
			cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
			cmd_fault_the_cpu(hdl, cpu, type, fltnm);
			return (CMD_EVD_OK);
		}
	}
	/*
	 * No ways have been retired as "SUSPECT" for this bit.
	 * We need to retire the lowest unretired way as suspect.
	 */
	fmd_hdl_debug(hdl,
	    "\n%s: cpu_id %d Checking for the lowest retirable"
	    " way at index %d\n",
	    fltnm, cpu->cpu_cpuid, anonymous_Lxcache->Lxcache_index);
	lowest_retirable_way = cmd_Lxcache_get_lowest_retirable_way(cpu,
	    anonymous_Lxcache->Lxcache_index, anonymous_Lxcache->Lxcache_type);
	if (lowest_retirable_way != -1) {
		fmd_hdl_debug(hdl,
		    "\n%s: cpu_id %d lowest retirable way is %d\n",
		    fltnm, cpu->cpu_cpuid, lowest_retirable_way);
		anonymous_Lxcache->Lxcache_way = lowest_retirable_way;
		new_Lxcache = cmd_create_and_destroy_Lxcache(hdl, cpu,
		    anonymous_Lxcache);
		if ((new_Lxcache == NULL) ||
		    (new_Lxcache->Lxcache_case.cc_cp == NULL)) {
			return (CMD_EVD_BAD);
		}
		if (IS_TAG(new_Lxcache->Lxcache_type))
			reason = CMD_LXSUSPECT_0_TAG;
		else
			reason = CMD_LXSUSPECT_DATA;
		return (cmd_Lxcache_retire_as_reason(hdl, cpu, new_Lxcache,
		    fltnm, reason));
	} else {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d we are unable to determine which"
		    " way is faulty at cache index %d."
		    " Will retire the CPU.\nRecommended-Action:"
		    " Service action required\n",
		    fltnm, cpu->cpu_cpuid, anonymous_Lxcache->Lxcache_index);
		/* save type: Lxcache is freed before we fault */
		type = anonymous_Lxcache->Lxcache_type;
		/*
		 * destroy the anonymous_Lxcache
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		cmd_fault_the_cpu(hdl, cpu, type, fltnm);
		return (CMD_EVD_OK);
	}
}
773 
/*
 * The previously retired suspect way did not stop the errors: unretire
 * it and retire the next retirable way at the same index as the new
 * suspect instead.
 *
 * Ownership: suspect_Lxcache is destroyed after a successful unretire;
 * anonymous_Lxcache is always consumed — replaced by a new Lxcache bound
 * to the next way on success, destroyed on the failure/fault paths.
 * Faults the CPU when no further way can be tried.  Returns CMD_EVD_OK
 * or CMD_EVD_BAD.
 */
int
unretire_suspect_and_retire_next_retirable_way(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *suspect_Lxcache, cmd_Lxcache_t *anonymous_Lxcache,
    const char *fltnm)
{
	int8_t	retired_way, next_retirable_way;
	int32_t	retired_index;
	cmd_ptrsubtype_t retired_type;
	int32_t	reason;
	cmd_Lxcache_t *new_Lxcache;

	/*
	 * This routine is called only when handling anonymous TAG or DATA
	 * errors. When we exit this routine we would have destroyed the
	 * anonymous_Lxcache structure that was passed to us.
	 */
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d found index %d way %d"
	    " bit %d retired as %s. Will unretire this now.\n",
	    fltnm, cpu->cpu_cpuid, suspect_Lxcache->Lxcache_index,
	    suspect_Lxcache->Lxcache_way, suspect_Lxcache->Lxcache_bit,
	    cmd_reason_to_str(suspect_Lxcache->Lxcache_reason));
	/*
	 * Save the way because we will destroy the
	 * suspect_Lxcache after we successfully unretire it.
	 */
	retired_way = suspect_Lxcache->Lxcache_way;
	retired_index = suspect_Lxcache->Lxcache_index;
	retired_type = suspect_Lxcache->Lxcache_type;
	/*
	 * unretire the retired_way.
	 */
	if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache,
	    fltnm)
	    == B_TRUE) {
		suspect_Lxcache->Lxcache_reason =
		    CMD_LXFUNCTIONING;
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d index %d way %d"
		    " successfully unretired. Will"
		    " destroy this Lxcache now.\n",
		    fltnm, cpu->cpu_cpuid, suspect_Lxcache->Lxcache_index,
		    suspect_Lxcache->Lxcache_way);
		cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
	} else {
		/*
		 * Unretire failed; give up and destroy the
		 * anonymous_Lxcache (suspect_Lxcache stays retired).
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		return (CMD_EVD_BAD);
	}
	/*
	 * retire the next retirable way
	 */
	next_retirable_way = cmd_Lxcache_get_next_retirable_way(cpu,
	    retired_index,
	    retired_type, retired_way);
	if (next_retirable_way == -1) {
		/*
		 * There is no retirable way that is next to the
		 * one we just retired. We need to offline the
		 * CPU since we are unable to determine which
		 * way is reporting the errors.
		 */
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d we are unable to determine"
		    " which way is faulty at cache index %d."
		    " It is likely that we have a leaky bit"
		    " that gets corrected.\n Will retire"
		    " the CPU.\nRecommended-Action: Service"
		    " action required\n",
		    fltnm, cpu->cpu_cpuid, retired_index);
		/*
		 * destroy the anonymous_Lxcache
		 */
		cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
		cmd_fault_the_cpu(hdl, cpu, retired_type, fltnm);
		return (CMD_EVD_OK);
	} else {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d found way %d at index %d to"
		    " retire as SUSPECT_0/SUSPECT_DATA\n",
		    fltnm, cpu->cpu_cpuid, next_retirable_way, retired_index);
		/*
		 * We need to create a new Lxcache struture.
		 * The existing Lxcache is for anonymous way.
		 */
		anonymous_Lxcache->Lxcache_way = next_retirable_way;
		new_Lxcache = cmd_create_and_destroy_Lxcache(hdl,
		    cpu, anonymous_Lxcache);
		if ((new_Lxcache == NULL) ||
		    (new_Lxcache->Lxcache_case.cc_cp == NULL)) {
			return (CMD_EVD_BAD);
		}
		if (IS_TAG(new_Lxcache->Lxcache_type))
			reason = CMD_LXSUSPECT_0_TAG;
		else
			reason = CMD_LXSUSPECT_DATA;
		return (cmd_Lxcache_retire_as_reason(hdl, cpu, new_Lxcache,
		    fltnm, reason));
	}
}
876 
877 void
find_and_destroy_anonymous_Lxcache(fmd_hdl_t * hdl,cmd_cpu_t * cpu,cmd_ptrsubtype_t pstype,int32_t index)878 find_and_destroy_anonymous_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
879     cmd_ptrsubtype_t pstype, int32_t index)
880 {
881 	cmd_Lxcache_t *anonymous_Lxcache;
882 	const char	*fltnm;
883 
884 	fltnm = cmd_type_to_str(pstype);
885 	anonymous_Lxcache =
886 	    cmd_Lxcache_lookup_by_type_index_way_bit(cpu,
887 	    pstype, index, -1, -1);
888 	if (anonymous_Lxcache != NULL) {
889 		fmd_hdl_debug(hdl,
890 		    "\n%s:cpu_id = %d index = %d We are destroying the"
891 		    " anonymous Lxcache now.\n",
892 		    fltnm, cpu->cpu_cpuid, index);
893 		/*
894 		 * Free the resources allocated to handle
895 		 * recheck_of_tags. Delete the Lxcache.
896 		 */
897 		cmd_Lxcache_destroy(hdl, cpu,
898 		    anonymous_Lxcache);
899 	}
900 }
901 
902 void
cmd_Lxcache_anonymous_tag_error_timeout(fmd_hdl_t * hdl,id_t id)903 cmd_Lxcache_anonymous_tag_error_timeout(fmd_hdl_t *hdl, id_t id)
904 {
905 	cmd_Lxcache_t	*Lxcache;
906 	const char	*class;
907 
908 
909 	/*
910 	 * We search thru the entire Lxcache structures to find
911 	 * a matching id.
912 	 */
913 	Lxcache = cmd_Lxcache_lookup_by_timeout_id(id);
914 	if (Lxcache == NULL) {
915 		fmd_hdl_debug(hdl,
916 		    "Could not find Lxcache for timeout_id 0x%x\n", id);
917 		return;
918 	}
919 	fmd_hdl_debug(hdl,
920 	    "\n%s:anonymous_tag_error_timeout:index = %d\n",
921 	    cmd_type_to_str(Lxcache->Lxcache_type),
922 	    Lxcache->Lxcache_index);
923 	/*
924 	 * Set timeout_id to -1 to indicate that we have processed the
925 	 * timeout.
926 	 */
927 	Lxcache->Lxcache_timeout_id = -1;
928 	switch (Lxcache->Lxcache_type) {
929 		case CMD_PTR_CPU_L2TAG:
930 			class = "ereport.cpu.ultraSPARC-IVplus.thce";
931 			(void) cmd_txce(hdl, Lxcache->Lxcache_ep,
932 			    Lxcache->Lxcache_nvl,
933 			    class, Lxcache->Lxcache_clcode);
934 			break;
935 		case CMD_PTR_CPU_L3TAG:
936 			class = "ereport.cpu.ultraSPARC-IVplus.l3-thce";
937 			(void) cmd_l3_thce(hdl, Lxcache->Lxcache_ep,
938 			    Lxcache->Lxcache_nvl,
939 			    class, Lxcache->Lxcache_clcode);
940 			break;
941 		default:
942 			fmd_hdl_debug(hdl,
943 			    "Unexpected pstype 0x%x found in"
944 			    " anonymous_tag_error_timeout: index = %d\n",
945 			    Lxcache->Lxcache_type,
946 			    Lxcache->Lxcache_index);
947 			return;
948 	}
949 }
950 
951 cmd_evdisp_t
cmd_us4plus_tag_err(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,cmd_cpu_t * cpu,cmd_ptrsubtype_t pstype,const char * serdn,const char * serdt,const char * fltnm,cmd_errcl_t clcode)952 cmd_us4plus_tag_err(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
953     cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
954     const char *serdn, const char *serdt,
955     const char *fltnm, cmd_errcl_t clcode)
956 {
957 	uint64_t	tag_afar;
958 	int32_t	index;
959 	int8_t		way;
960 	int16_t		tag_bit, bit, sticky_bit;
961 	cmd_Lxcache_t	*Lxcache, *suspect_Lxcache, *retired_Lxcache;
962 	cmd_Lxcache_t	*anonymous_Lxcache;
963 	uint64_t	tag_synd;
964 	uint64_t	tag_data[PN_CACHE_NWAYS];
965 	uint8_t		state;
966 	int		ways_retired, ret;
967 	int		retries_for_ecc_match;
968 	int32_t		recheck_of_tags;
969 	int		way_already_retired = 0;
970 
971 	/*
972 	 * We now extract physical tags and states
973 	 * and also look for matching ECC on all 4 ways.
974 	 */
975 	ret = extract_data_from_ereport_payload(hdl, nvl, cpu, pstype,
976 	    &tag_afar, tag_data, fltnm);
977 	if (ret != 0)
978 		return (ret);
979 	index = get_index(pstype, tag_afar);
980 	retries_for_ecc_match = 0;
981 	while (matching_ecc(tag_data) != 0) {
982 		if (retries_for_ecc_match >= MAX_RETRIES_FOR_ECC_MATCH)
983 			return (CMD_EVD_BAD);
984 		print_ecc(hdl, cpu, fltnm, tag_data);
985 		fmd_hdl_debug(hdl,
986 		    "\n%s:cpu_id = %d index = %d ECCs don't match.\n"
987 		    "Reading tag info again.\n",
988 		    fltnm, cpu->cpu_cpuid, index);
989 		(void) get_tagdata(cpu, pstype, index, tag_data);
990 		retries_for_ecc_match++;
991 	}
992 	ways_retired = get_retired_ways(tag_data);
993 	fmd_hdl_debug(hdl,
994 	    "\n%s:cpu_id %d: found %d ways retired at the index %d\n",
995 	    fltnm, cpu->cpu_cpuid, ways_retired, index);
996 	tag_synd = compute_syndrome(tag_data, pstype);
997 	ret = nvlist_lookup_int32(nvl, FM_EREPORT_RECHECK_OF_TAGS,
998 	    &recheck_of_tags);
999 	if (ret != CMD_EVD_OK) {
1000 		fmd_hdl_debug(hdl,
1001 		    "ret value = %d for nvlist_lookup of recheck_of_tags\n",
1002 		    ret);
1003 		recheck_of_tags = 0;
1004 	}
1005 	if (tag_synd == 0) {
1006 		/*
1007 		 * The bit has been corrected by writeback, we will
1008 		 * first check if we are processing the re-check of tags
1009 		 * that we scheduled thru the timeout call.
1010 		 * if so we will exit if we reached the max retries.
1011 		 * Else we start a timeout and exit.
1012 		 * We will create a Lxcache structure for this index with way
1013 		 * as -1 and bit as -1. We will also keep a count of
1014 		 * attempts we made to check the tag data at this index.
1015 		 *
1016 		 */
1017 		way = -1;
1018 		bit = -1;
1019 		Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(cpu, pstype,
1020 		    index, way, bit);
1021 		if (recheck_of_tags) {
1022 			/*
1023 			 * We are processing the re-read of tags scheduled by
1024 			 * timeout. Exit if retry limit has been
1025 			 * reached. Else start another timeout.
1026 			 */
1027 			if (Lxcache == NULL) {
1028 				/*
1029 				 * This shouldn't happen.
1030 				 */
1031 				fmd_hdl_debug(hdl,
1032 				    "\n%s: cpu_id = %d failed to lookup"
1033 				    " index = %d way %d bit %d\n",
1034 				    fltnm, cpu->cpu_cpuid, index, way, bit);
1035 				return (CMD_EVD_BAD);
1036 			}
1037 			fmd_hdl_debug(hdl,
1038 			    "\n%s: cpu_id = %d index = %d syndrome"
1039 			    " computed is 0 in attempt #%d.\n",
1040 			    fltnm, cpu->cpu_cpuid, index,
1041 			    Lxcache->Lxcache_retry_count);
1042 			if (Lxcache->Lxcache_retry_count >=
1043 			    RETRIES_TO_BE_DONE_WHEN_SYND_IS_ZERO) {
1044 				/*
1045 				 * We free only the nvl list here.
1046 				 * anonymous SERD engine will be freed
1047 				 * when the Lxcache gets destroyed.
1048 				 * We need the anonymous SERD engine still
1049 				 * because it has the event ep.
1050 				 * reset or destroy of SERD engine frees the
1051 				 * event ep.
1052 				 */
1053 				if (Lxcache->Lxcache_nvl != NULL) {
1054 					nvlist_free(Lxcache->Lxcache_nvl);
1055 					Lxcache->Lxcache_nvl = NULL;
1056 				}
1057 				fmd_hdl_debug(hdl,
1058 		    "\n%s:cpu_id %d Max retry count reached. Giving up.\n",
1059 				    fltnm, cpu->cpu_cpuid);
1060 				Lxcache->Lxcache_timeout_id = -1;
1061 				Lxcache->Lxcache_retry_count = 0;
1062 				goto process_after_finding_way_bit;
1063 			} else {
1064 				Lxcache->Lxcache_retry_count++;
1065 				Lxcache->Lxcache_timeout_id =
1066 				    fmd_timer_install(hdl,
1067 				    (void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR,
1068 				    NULL,
1069 				    (cmd_Lxcache_recheck_tags_delay[
1070 				    Lxcache->Lxcache_retry_count] * NANOSEC));
1071 				return (CMD_EVD_OK);
1072 			}
1073 		}
1074 		/*
1075 		 * Check if we already have a Lxcache structure
1076 		 * with anonymous way and bit created.
1077 		 */
1078 		if (Lxcache == NULL) {
1079 			Lxcache = cmd_Lxcache_create(hdl, 0, cpu,
1080 			    cpu->cpu_asru_nvl, pstype, index, way, bit);
1081 			if (Lxcache == NULL) {
1082 				fmd_hdl_debug(hdl,
1083 				    "\n%s:cpu_id %d Failed to create Lxcache"
1084 				    " for index=%d\n",
1085 				    fltnm, cpu->cpu_cpuid, index);
1086 				return (CMD_EVD_BAD);
1087 			}
1088 		}
1089 		if (Lxcache->Lxcache_timeout_id != -1) {
1090 			/*
1091 			 * We have another syndrome = 0 condition while we are
1092 			 * still in the process of retrying for the previous
1093 			 * condition.
1094 			 */
1095 			fmd_hdl_debug(hdl,
1096 			    "\n%s: cpu_id = %d index = %d We have another"
1097 			    " syndrome = 0 condition while we have already"
1098 			    " scheduled a timeout. We will ignore this"
1099 			    " event.\n",
1100 			    fltnm, cpu->cpu_cpuid, index);
1101 			return (CMD_EVD_OK);
1102 		}
1103 		fmd_hdl_debug(hdl,
1104 		    "\n%s: cpu_id = %d index = %d syndrome computed is 0."
1105 		    "Looks like the bit got corrected."
1106 		    " Will check later to see if it is OK.\n",
1107 		    fltnm, cpu->cpu_cpuid, index);
1108 		/*
1109 		 * We need to store the following arguments passed to
1110 		 * this function(tag_error_handler) so that we can
1111 		 * invoke this function from timeout routine.
1112 		 *
1113 		 * nvl, ep, clcode
1114 		 */
1115 		if (Lxcache->Lxcache_nvl == NULL) {
1116 			if (nvlist_dup(nvl, &Lxcache->Lxcache_nvl, 0) != 0) {
1117 				fmd_hdl_debug(hdl,
1118 				    "\n%s:cpu_id %d Failed to duplicate nvl"
1119 				    " for index=%d\n",
1120 				    fltnm, cpu->cpu_cpuid, index);
1121 				return (CMD_EVD_BAD);
1122 			}
1123 			if (nvlist_add_int32(Lxcache->Lxcache_nvl,
1124 			    FM_EREPORT_RECHECK_OF_TAGS, 1) != 0) {
1125 				fmd_hdl_debug(hdl,
1126 				    "\n%s:cpu_id %d Failed to add"
1127 				    " RECHECK_OF_TAGS in nvl for index=%d\n",
1128 				    fltnm, cpu->cpu_cpuid, index);
1129 				return (CMD_EVD_BAD);
1130 			}
1131 		}
1132 		/*
1133 		 * We are called with CMP_CPU_LEVEL_CORE masked out
1134 		 * from cmd_txce(), cmd_l3_thce() routines.
1135 		 * We need to set CMD_CPU_LEVEL_CORE because we want to handle
1136 		 * both the cores on the Chip as one single cpu_id.
1137 		 */
1138 		Lxcache->Lxcache_clcode = (clcode | CMD_CPU_LEVEL_CORE);
1139 		if (Lxcache->Lxcache_ep == NULL) {
1140 			Lxcache->Lxcache_ep = ep;
1141 			/*
1142 			 * we need to preserve the event ep so that it does
1143 			 * not get destroyed when we return from this call.
1144 			 * We do that by adding the event ep to the SERD engine.
1145 			 * The SERD engine we create is different from the one
1146 			 * we create when we handle the actual event at label
1147 			 * process_after_finding_way_bit.
1148 			 */
1149 			Lxcache->Lxcache_serdnm =
1150 			    cmd_Lxcache_anonymous_serdnm_create(hdl,
1151 			    cpu->cpu_cpuid, pstype, index,
1152 			    way, bit);
1153 			if (!fmd_serd_exists(hdl, Lxcache->Lxcache_serdnm)) {
1154 				fmd_serd_create(hdl, Lxcache->Lxcache_serdnm,
1155 				    fmd_prop_get_int32(hdl, serdn),
1156 				    fmd_prop_get_int64(hdl, serdt));
1157 				fmd_hdl_debug(hdl,
1158 				    "\n%s: cpu_id %d: created a SERD engine"
1159 				    " %s\n",
1160 				    fltnm, cpu->cpu_cpuid,
1161 				    Lxcache->Lxcache_serdnm);
1162 			}
1163 			(void) fmd_serd_record(hdl,
1164 			    Lxcache->Lxcache_serdnm,
1165 			    ep);
1166 		}
1167 		Lxcache->Lxcache_retry_count++;
1168 		Lxcache->Lxcache_timeout_id =
1169 		    fmd_timer_install(hdl,
1170 		    (void *)CMD_TIMERTYPE_ANONYMOUS_TAG_ERROR, NULL,
1171 		    (cmd_Lxcache_recheck_tags_delay[
1172 		    Lxcache->Lxcache_retry_count] * NANOSEC));
1173 		return (CMD_EVD_OK);
1174 
1175 	} else {
1176 		/*
1177 		 * tag_synd != 0
1178 		 * determine way and bit
1179 		 */
1180 		tag_bit = ecc_syndrome_tab[tag_synd & 0x1ff];
1181 		fmd_hdl_debug(hdl,
1182 		    "\n%s: cpu_id = %d index = %d tag_bit %03d is faulty.\n",
1183 		    fltnm, cpu->cpu_cpuid, index, tag_bit);
1184 		if ((tag_bit > C8)) {
1185 			fmd_hdl_debug(hdl, "%s: cpu_id = %d"
1186 			    " Unexpected MTAG or Multiple bit error detected\n",
1187 			    fltnm, cpu->cpu_cpuid);
1188 			find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype,
1189 			    index);
1190 			return (CMD_EVD_BAD);
1191 		}
1192 		if ((tag_bit >= C0) && (tag_bit <= C8)) {
1193 			/*
1194 			 * ECC bit is corrupted.
1195 			 * Need to offline the CPU
1196 			 */
1197 			bit = (tag_bit - C0) + PN_LX_TAG_ECC_START_BIT;
1198 			way = 0;
1199 			fmd_hdl_debug(hdl,
1200 			    "\n%s: cpu_id = %d ECC bit is faulty.\n",
1201 			    fltnm, cpu->cpu_cpuid);
1202 		} else {
1203 			bit = tag_bit_to_way_bit(pstype, tag_bit);
1204 			way = bit_to_way(pstype, tag_bit);
1205 			if (way < 0) {
1206 				fmd_hdl_debug(hdl,
1207 				    "\n%s: cpu_id = %d %d bit indicted is a"
1208 				    " meta bit  !!\n",
1209 				    fltnm, cpu->cpu_cpuid, bit);
1210 				find_and_destroy_anonymous_Lxcache(hdl, cpu,
1211 				    pstype,
1212 				    index);
1213 				return (CMD_EVD_BAD);
1214 			}
1215 		}
1216 	}	/* end of tag_synd != 0 */
1217 process_after_finding_way_bit:
1218 	if ((Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(cpu, pstype,
1219 	    index, way,
1220 	    bit)) != NULL &&
1221 	    Lxcache->Lxcache_case.cc_cp != NULL &&
1222 	    fmd_case_solved(hdl, Lxcache->Lxcache_case.cc_cp)) {
1223 		fmd_hdl_debug(hdl,
1224 		    "\n%s:cpu %d: the case for %s is already solved.\n",
1225 		    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_bufname);
1226 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1227 		return (CMD_EVD_REDUND);
1228 	}
1229 
1230 	if (Lxcache == NULL)
1231 		Lxcache = cmd_Lxcache_create(hdl, 0, cpu, cpu->cpu_asru_nvl,
1232 		    pstype, index, way, bit);
1233 	if (Lxcache == NULL) {
1234 		fmd_hdl_debug(hdl,
1235 		    "\n%s:cpu %d: Failed to create Lxcache for index %d",
1236 		    " way %d bit %d\n",
1237 		    fltnm, cpu->cpu_cpuid, index, way, bit);
1238 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1239 		return (CMD_EVD_BAD);
1240 	}
1241 	if (cmd_create_case_for_Lxcache(hdl, cpu, Lxcache) == B_FALSE) {
1242 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1243 		return (CMD_EVD_BAD);
1244 	}
1245 	if (Lxcache->Lxcache_case.cc_serdnm == NULL) {
1246 		Lxcache->Lxcache_case.cc_serdnm = cmd_Lxcache_serdnm_create(hdl,
1247 		    cpu->cpu_cpuid, pstype, index,
1248 		    way, bit);
1249 		if (!fmd_serd_exists(hdl, Lxcache->Lxcache_case.cc_serdnm)) {
1250 			fmd_serd_create(hdl, Lxcache->Lxcache_case.cc_serdnm,
1251 			    fmd_prop_get_int32(hdl, serdn),
1252 			    fmd_prop_get_int64(hdl, serdt));
1253 			fmd_hdl_debug(hdl,
1254 			    "\n%s: cpu_id %d: created a SERD engine %s\n",
1255 			    fltnm, cpu->cpu_cpuid,
1256 			    Lxcache->Lxcache_case.cc_serdnm);
1257 		}
1258 	}
1259 	fmd_hdl_debug(hdl,
1260 	    "\n%s:cpu_id %d: Checking if the SERD engine %s has fired.\n",
1261 	    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1262 
1263 	(void) fmd_serd_record(hdl, Lxcache->Lxcache_case.cc_serdnm, ep);
1264 	if (way >= 0) {
1265 		/*
1266 		 * Now that we have recorded the event ep we can do the
1267 		 * necessary cleanup of resources allocated for recheck of tags.
1268 		 */
1269 		find_and_destroy_anonymous_Lxcache(hdl, cpu, pstype, index);
1270 	}
1271 	if (fmd_serd_fired(hdl, Lxcache->Lxcache_case.cc_serdnm) ==
1272 	    FMD_B_FALSE)
1273 		return (CMD_EVD_OK);
1274 
1275 	fmd_hdl_debug(hdl, "\n%s: cpu_id = %d creating fault %s\n",
1276 	    fltnm, cpu->cpu_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1277 	fmd_case_add_serd(hdl, Lxcache->Lxcache_case.cc_cp,
1278 	    Lxcache->Lxcache_case.cc_serdnm);
1279 	fmd_serd_reset(hdl, Lxcache->Lxcache_case.cc_serdnm);
1280 	if (way == -1) {
1281 		/*
1282 		 * The assignment below is to make the code easier to maintain.
1283 		 * We need to destroy the anonymous_Lxcache after we have
1284 		 * identifed a way to retire. If we cannot detrmine a way to
1285 		 * retire we will destrory the anonymous_Lxcache and fault the
1286 		 * cpu.
1287 		 */
1288 		anonymous_Lxcache = Lxcache;
1289 		/*
1290 		 * Anonymous TAG way retirement.
1291 		 * - if a way at this index has already been retired as
1292 		 *   "suspect-1", unretire that way, and retire the next
1293 		 *   unretired way as "suspect-0", using a pattern of all zeros
1294 		 *   for the PA bits.
1295 		 * - if a way at this index has already been retired as
1296 		 *   "suspect-0", re-retire that way as "suspect-1", using a
1297 		 *   pattern of all ones for the PA bits.
1298 		 * - if no ways have been retired as "suspect" for this index,
1299 		 *   retire the lowest unretired way as "suspect-0" for this
1300 		 *   bit, using a pattern of all zeros for the PA bits.
1301 		 * - if there is no next retirable way, fault the CPU.
1302 		 */
1303 		suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1304 		    cpu, pstype, index, bit, CMD_LXSUSPECT_1_TAG);
1305 		anonymous_Lxcache->Lxcache_ep = ep;
1306 		if (suspect_Lxcache) {
1307 			ret = unretire_suspect_and_retire_next_retirable_way(
1308 			    hdl, cpu, suspect_Lxcache, anonymous_Lxcache,
1309 			    fltnm);
1310 			return (ret);
1311 		}	/* end SUSPECT_1_TAG */
1312 		suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1313 		    cpu, pstype, index, bit, CMD_LXSUSPECT_0_TAG);
1314 		if (suspect_Lxcache) {
1315 			fmd_hdl_debug(hdl,
1316 			    "\n%s:cpu_id %d found index %d way %d"
1317 			    " bit %d retired as SUSPECT_0_TAG. Will"
1318 			    " re-retire this now as SUSPECT_1_TAG.\n",
1319 			    fltnm, cpu->cpu_cpuid, index,
1320 			    suspect_Lxcache->Lxcache_way, bit);
1321 			/*
1322 			 * destroy the anonymous_Lxcache
1323 			 */
1324 			cmd_Lxcache_destroy(hdl, cpu, anonymous_Lxcache);
1325 			suspect_Lxcache->Lxcache_ep = ep;
1326 			/*
1327 			 * We need to update the FM_FMRI_CPU_CACHE_BIT entry
1328 			 * in the Lxcache_asru_nvl. This entry was last updated
1329 			 * when the cacheline was retired as SUSPECT_0.
1330 			 * Therefore the MSB of FM_FMRI_CPU_CACHE_BIT entry
1331 			 * value will be reset. To retire cacheline as
1332 			 * SUSPECT_1 the MSB has to be set.
1333 			 */
1334 			errno = nvlist_add_uint16(
1335 			    suspect_Lxcache->Lxcache_asru_nvl,
1336 			    FM_FMRI_CPU_CACHE_BIT,
1337 			    suspect_Lxcache->Lxcache_bit);
1338 			if (errno) {
1339 				fmd_hdl_debug(hdl,
1340 				    "\n%s:cpu_id %d: failed to update",
1341 				    " CACHE_BIT in asru.\n",
1342 				    fltnm, cpu->cpu_cpuid);
1343 			}
1344 			return (cmd_Lxcache_retire_as_reason(hdl, cpu,
1345 			    suspect_Lxcache, fltnm, CMD_LXSUSPECT_1_TAG));
1346 		}	/* end of SUSPECT_0_TAG */
1347 		/*
1348 		 * No ways have been retired as "SUSPECT_x" for this bit.
1349 		 * We need to retire the lowest unretired way as suspect.
1350 		 */
1351 		ret = retire_lowest_retirable_way_as_suspect(hdl, cpu,
1352 		    anonymous_Lxcache,
1353 		    fltnm);
1354 		return (ret);
1355 	}	/* End of Anonymous TAG retirement */
1356 	/*
1357 	 * Identified bit and way has fired.
1358 	 * - Destroy any anonymous SERD engine at that index.
1359 	 * - If the bad bit is an ECC bit, fault the CPU.
1360 	 * - If the way was already convicted due to tag errors, fault the CPU.
1361 	 * - If the bad bit is a state bit, then:
1362 	 * - if the stable value of the bad bit will hold the NA encoding,
1363 	 *   retire the containing way as "convicted".
1364 	 * - if the stable value of the bad bit will not hold the NA
1365 	 *   encoding, fault the CPU.
1366 	 */
1367 	cmd_Lxcache_destroy_anonymous_serd_engines(hdl, cpu, pstype, index, -1);
1368 	sticky_bit = find_bit_stickiness(tag_data, way, bit);
1369 	if ((bit >= PN_LX_TAG_ECC_START_BIT) &&
1370 	    (bit <= PN_LX_TAG_ECC_END_BIT)) {
1371 		fmd_hdl_debug(hdl,
1372 		    "\n%s:cpu_id %d Bad ECC bit %d at cache index %d way %d"
1373 		    " detected. Will offline the CPU.\n",
1374 		    fltnm, cpu->cpu_cpuid, bit, index, way);
1375 		cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1376 		return (CMD_EVD_OK);
1377 	}
1378 	/*
1379 	 * Check if a STATE bit is faulty.
1380 	 * If so we need to ensure that we will be able to
1381 	 * make the way NA, else fault the CPU.
1382 	 */
1383 	if (bit <= PN_LX_STATE_END_BIT) {
1384 		fmd_hdl_debug(hdl,
1385 		    "%s cpu_id = %d: STATE bit %d is faulty.\n",
1386 		    fltnm, cpu->cpu_cpuid, bit);
1387 		/*
1388 		 * If the stable value of bit will hold the NA encoding
1389 		 * retire the containing way Else fault the cpu.
1390 		 */
1391 		state = tag_data[way] & CH_ECSTATE_MASK;
1392 		if ((state & (1 << bit)) != (PN_ECSTATE_NA & (1 << bit))) {
1393 			/*
1394 			 * The stable value of the bad bit will not hold the
1395 			 * NA encoding. will fault the CPU.
1396 			 */
1397 			fmd_hdl_debug(hdl,
1398 			    "\n%s:cpu_id %d STATE bit %d is faulty at"
1399 			    " cache index %d way %d. STATE = 0x%x\n"
1400 			    " The bad bit will not hold the encoding we need"
1401 			    " to mark the cacheline as retired, so will offline"
1402 			    " the CPU.\n",
1403 			    fltnm, cpu->cpu_cpuid, bit, index, way, state);
1404 			cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1405 			return (CMD_EVD_OK);
1406 		}
1407 	}
1408 	/*
1409 	 * Check if we are getting fault on a way that is already retired.
1410 	 * if the way was already convicted due to tag errors, fault the CPU.
1411 	 * Note that the way could have previously been retired due to
1412 	 * data errors.  This is okay; we just re-retire it due to tag errors,
1413 	 * so that we can write the offending tag bit to a stable value.
1414 	 */
1415 	if ((tag_data[way] & CH_ECSTATE_MASK) == PN_ECSTATE_NA) {
1416 		/*
1417 		 * Looking for CONVICTED TAG fault first.
1418 		 * If found retire the CPU.
1419 		 */
1420 		retired_Lxcache = cmd_Lxcache_lookup_by_type_index_way_reason(
1421 		    cpu, pstype, index, way, CMD_LXCONVICTED);
1422 		if (retired_Lxcache) {
1423 			fmd_hdl_debug(hdl,
1424 			    "\n%s: cpu %d: The cache index %d way %d previously"
1425 			    " retired for %s fault at bit %d is reporting"
1426 			    " fault. Will fault the CPU\n",
1427 			    fltnm, cpu->cpu_cpuid, index, way,
1428 			    cmd_type_to_str(
1429 			    retired_Lxcache->Lxcache_type),
1430 			    retired_Lxcache->Lxcache_bit);
1431 			cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1432 			return (CMD_EVD_OK);
1433 		}
1434 		way_already_retired = 1;
1435 	}
1436 	/*
1437 	 * If any way(Including the current way) at this index is retired as
1438 	 * "suspect" due to tag errors, unretire it.  (If that suspect way
1439 	 * really was bad, it will start producing errors again and will
1440 	 * eventually be retired again.)
1441 	 */
1442 	suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1443 	    cpu, pstype, index,  -1,
1444 	    (CMD_LXSUSPECT_0_TAG | CMD_LXSUSPECT_1_TAG));
1445 	if (suspect_Lxcache) {
1446 		fmd_hdl_debug(hdl,
1447 		    "\n%s:cpu_id %d found index %d way %d"
1448 		    " bit %d retired as SUSPECT_x. Will"
1449 		    "  unretire this now.\n",
1450 		    fltnm, cpu->cpu_cpuid, index,
1451 		    suspect_Lxcache->Lxcache_way, -1);
1452 		/*
1453 		 * unretire the suspect_x retired_way.
1454 		 */
1455 		if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache, fltnm)
1456 		    == B_TRUE) {
1457 			suspect_Lxcache->Lxcache_reason =
1458 			    CMD_LXFUNCTIONING;
1459 			fmd_hdl_debug(hdl,
1460 			    "\n%s:cpu_id %d index %d way %d"
1461 			    " successfully unretired. Will"
1462 			    " destroy this Lxcache now.\n",
1463 			    fltnm, cpu->cpu_cpuid, index,
1464 			    suspect_Lxcache->Lxcache_way);
1465 			cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
1466 		} else {
1467 			/*
1468 			 * We are unable to unretire the previously retired
1469 			 * SUSPECT way at the fault index.
1470 			 * If the previously retired way is same as the way
1471 			 * we are attempting to retire then return failure.
1472 			 */
1473 			if (suspect_Lxcache->Lxcache_way ==
1474 			    Lxcache->Lxcache_way)
1475 				return (CMD_EVD_BAD);
1476 		}
1477 	}
1478 	ways_retired = get_index_retired_ways(cpu, pstype, index);
1479 	if (ways_retired == -1)
1480 		return (CMD_EVD_BAD);
1481 	/*
1482 	 * Before retiring a way check if we have already
1483 	 * retired 3 ways for this index.
1484 	 * If the way was already retired due to DATA error or
1485 	 * SUSPECT_X TAG error then we skip the check.
1486 	 */
1487 	if (!way_already_retired) {
1488 		if (ways_retired >= 3) {
1489 			fmd_hdl_debug(hdl,
1490 			    "\n%s: cpu %d: num of ways retired for index %d"
1491 			    " is %d will fault the CPU\n",
1492 			    fltnm, cpu->cpu_cpuid, index, ways_retired);
1493 			cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
1494 			return (CMD_EVD_OK);
1495 		}
1496 	}
1497 	fmd_hdl_debug(hdl,
1498 	    "\n%s: cpu %d: num of ways retired for index %d is %d\n",
1499 	    fltnm, cpu->cpu_cpuid, index, ways_retired);
1500 	if ((errno = nvlist_add_uint16(Lxcache->Lxcache_asru_nvl,
1501 	    FM_FMRI_CPU_CACHE_BIT,
1502 	    sticky_bit)) != 0 ||
1503 	    (errno = fmd_nvl_fmri_expand(hdl, Lxcache->Lxcache_asru_nvl)) != 0)
1504 		fmd_hdl_abort(hdl, "failed to build Lxcache fmri");
1505 	Lxcache->Lxcache_ep = ep;
1506 	return (cmd_Lxcache_retire_as_reason(hdl, cpu, Lxcache, fltnm,
1507 	    CMD_LXCONVICTED));
1508 }
1509 
1510 static boolean_t
pn_there_is_a_matching_synd(fmd_hdl_t * hdl,cmd_xr_t * xr)1511 pn_there_is_a_matching_synd(fmd_hdl_t *hdl, cmd_xr_t *xr)
1512 {
1513 	int ec_data_idx, i;
1514 	int8_t	way;
1515 	uint64_t ec_tag, data_hi, data_lo;
1516 	int ecc, calc_synd;
1517 	ec_data_elm_t *ecdptr = NULL;
1518 	uint8_t state;
1519 	ch_ec_data_t	*ecp;
1520 
1521 	ecp = (ch_ec_data_t *)(xr->xr_cache_data);
1522 	for (way = 0; way < xr->xr_num_ways; way++, ecp++) {
1523 		ec_tag = ecp->ec_tag;
1524 		/*
1525 		 * skip Retired and Invalid ways
1526 		 */
1527 		state = ec_tag & CH_ECSTATE_MASK;
1528 		if ((state == PN_ECSTATE_NA) ||
1529 		    (state == CH_ECSTATE_INV))
1530 			continue;
1531 		/*
1532 		 * Each 16 bytes of data are protected by 9-bit ECC field.
1533 		 */
1534 
1535 		for (i = 0; i < (CH_ECACHE_SUBBLK_SIZE/16); i++) {
1536 			ec_data_idx = (i/2);
1537 
1538 			ecdptr = &ecp->ec_data[ec_data_idx];
1539 			if ((i & 1) == 0) {
1540 				ecc = (ecdptr->ec_eccd >> 9) & 0x1ff;
1541 				data_hi = ecdptr->ec_d8[0];
1542 				data_lo = ecdptr->ec_d8[1];
1543 			} else {
1544 				ecc = ecdptr->ec_eccd & 0x1ff;
1545 				data_hi = ecdptr->ec_d8[2];
1546 				data_lo = ecdptr->ec_d8[3];
1547 			}
1548 
1549 			calc_synd = calcsynd(data_hi, data_lo, ecc);
1550 			if ((calc_synd != 0) &&
1551 			    (xr->xr_synd == calc_synd)) {
1552 				if (xr->xr_num_ways == 1) {
1553 					fmd_hdl_debug(hdl,
1554 			"\ncomputed syndrome matches with the reported syndrome"
1555 			" 0x%x index = %d way = %d\n",
1556 					    xr->xr_synd, xr->xr_error_index,
1557 					    xr->xr_error_way);
1558 				} else {
1559 					fmd_hdl_debug(hdl,
1560 					    "\ncomputed syndrome matches with"
1561 					    " the reported syndrome"
1562 					    " 0x%x index = %d way = %d\n",
1563 					    xr->xr_synd, xr->xr_error_index,
1564 					    way);
1565 					xr->xr_error_way = way;
1566 				}
1567 				return (B_TRUE);
1568 			}
1569 		}
1570 	}
1571 	return (B_FALSE);
1572 }
1573 
1574 /* add to cheetahregs.h */
1575 #define	CH_ECSTATE_NA	5
1576 
1577 static int32_t
pn_extract_index(int32_t type,uint64_t afar)1578 pn_extract_index(int32_t type, uint64_t afar)
1579 {
1580 	int32_t index = -1;
1581 
1582 	switch (type) {
1583 		case CMD_PTR_CPU_L2DATA:
1584 			index = (int32_t)((afar & PN_L2_INDEX_MASK)
1585 			    >> PN_CACHE_LINE_SHIFT);
1586 			break;
1587 		case CMD_PTR_CPU_L3DATA:
1588 			index = (int32_t)((afar & PN_L3_INDEX_MASK)
1589 			    >> PN_CACHE_LINE_SHIFT);
1590 			break;
1591 	}
1592 	return (index);
1593 }
1594 
1595 /*
1596  *	cmd_cache_ce_panther
1597  *
1598  *	This routine handles L2 and L3 cachedata errors for the Panther.
1599  *	It's called when the train processing for L2 and L3 correctable
1600  *	data errors are about to issue a fault.
1601  *
1602  *	This routine retrieves payload information gathered during the XR
1603  *	processing and generates a unique SERD engine and cache data
1604  *	associated with the CPU if one does not exist.
1605  *	If the SERD fires for the given engine it will initiate a cache
 *	line fault if the way is not anonymous.
 *	If the way is anonymous, it will attempt to choose a way for the
1608  *	given index to fault. If the maximum for the index has not been
1609  *	reached, it will attempt to unretire a different way previously retired
1610  *	under suspicion for the index prior to faulting
1611  *	the selected way.
1612  *	The routine will also fault the CPU if the maximum number of
1613  *	retired ways for the CPU has been exceeded based on the category.
1614  */
1615 /*ARGSUSED*/
1616 int
cmd_cache_ce_panther(fmd_hdl_t * hdl,fmd_event_t * ep,cmd_xr_t * xr)1617 cmd_cache_ce_panther(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_xr_t *xr)
1618 {
1619 	cmd_Lxcache_t *suspect_Lxcache, *Lxcache, *anonymous_Lxcache;
1620 	cmd_cpu_t *cpu = xr->xr_cpu;
1621 	cmd_case_t *cpu_cc;
1622 	cmd_ptrsubtype_t type;
1623 	const errdata_t *cache_ed;
1624 	uint16_t offset;
1625 	int16_t bit;
1626 	int	ways_retired;
1627 	int	ret;
1628 
1629 	/*
1630 	 * The caller of this routine cmd_xxc_hdlr() expects us to
1631 	 * return CMD_EVD_OK for success and CMD_EVD_BAD for failures.
1632 	 * If this is not a Panther or one of the Panther specific
1633 	 * errors that we handle here, then exit
1634 	 */
1635 
1636 	if (cpu->cpu_pers.cpup_type != CPU_ULTRASPARC_IVplus)
1637 		return (CMD_EVD_BAD);
1638 
1639 	if (!(xr->xr_clcode & (int)PN_CACHE_ERRORS))
1640 		return (CMD_EVD_BAD);
1641 
1642 
1643 	/* Set up Cache specific structs */
1644 
1645 	if (CMD_ERRCL_ISL2XXCU(xr->xr_clcode)) {
1646 		type = CMD_PTR_CPU_L2DATA;
1647 		cpu_cc = &cpu->cpu_l2data;
1648 		cache_ed = &l2errdata;
1649 	} else {
1650 		type = CMD_PTR_CPU_L3DATA;
1651 		cpu_cc = &cpu->cpu_l3data;
1652 		cache_ed = &l3errdata;
1653 	}
1654 
1655 	/* Ensure that our case is not solved */
1656 
1657 	if (cpu->cpu_faulting || (cpu_cc->cc_cp != NULL &&
1658 	    fmd_case_solved(hdl, cpu_cc->cc_cp)))
1659 			return (CMD_EVD_OK);
1660 
1661 	fmd_hdl_debug(hdl, "Processing Panther %s Error\n",
1662 	    cache_ed->ed_fltnm);
1663 
1664 	/* L3 errors arrive as mem scheme errors - convert to CPU */
1665 	if (type == CMD_PTR_CPU_L3DATA) {
1666 		cmd_fmri_init(hdl, &xr->xr_rsrc,
1667 		    xr->xr_detector_nvlist, "%s_rsrc",
1668 		    fmd_case_uuid(hdl, xr->xr_case));
1669 	}
1670 	bit = (uint8_t)ecc_syndrome_tab[xr->xr_synd];
1671 	offset = (uint16_t)xr->xr_afar & 0x3f;
1672 	if (bit > C8) {
1673 		fmd_hdl_debug(hdl, "xxC/LDxC dropped due to syndrome\n");
1674 		return (CMD_EVD_BAD);
1675 	}
1676 	if (bit < C0) {
1677 		/*
1678 		 * Data bit. Set bit in the range 0-511
1679 		 */
1680 		bit += ((3 - (offset/16)) * 128);
1681 	} else {
1682 		/*
1683 		 * ECC bit. Set bit in the range 512-547
1684 		 */
1685 		bit -= C0;
1686 		bit += 512 + ((3 - (offset/16)) * PN_LX_NUM_OF_BITS_IN_ECC);
1687 	}
1688 	xr->xr_error_index = pn_extract_index(type, xr->xr_afar);
1689 	if (xr->xr_error_index == 0xffffffff) {
1690 		fmd_hdl_debug(hdl, "xxC/LDxC dropped due to index\n");
1691 		return (CMD_EVD_BAD);
1692 	}
1693 	fmd_hdl_debug(hdl, "cpu_id: %d, syndrome: 0x%x, afar: 0x%llx\n",
1694 	    xr->xr_cpuid, xr->xr_synd, xr->xr_afar);
1695 	fmd_hdl_debug(hdl, "index: 0x%x(%d) bit: %d\n",
1696 	    xr->xr_error_index, xr->xr_error_index, bit);
1697 	/*
1698 	 * The payload information for the DATA errors are assembled
1699 	 * after first looking for a valid line that matches the fault AFAR.
1700 	 * If no match is found all 4 ways are logged and xr_num_ways
1701 	 * will be 4. If a matching way is found only that entry is logged
1702 	 * and xr_num_ways is set as 1.
1703 	 * The xr_error_way is set as -1 when xr_num_ways is 4, else
1704 	 * xr_error_way is set to the matching way.
1705 	 * what we do below is to force the xr_error_way to -1 for WDC/CPC
1706 	 * errors.
1707 	 * For UCC and EDC errors the xr_error_way will be set correctly.
1708 	 */
1709 
1710 	switch (xr->xr_clcode) {
1711 		case CMD_ERRCL_WDC:
1712 		case CMD_ERRCL_L3_WDC:
1713 			/*
1714 			 * WDC is a disrupting trap, and invalidates and
1715 			 * overwrites the problematic way.  Any match is due to
1716 			 * a refetch of the AFAR, which could have been to any
1717 			 * way. So these are treated as "anonymous".
1718 			 */
1719 			fmd_hdl_debug(hdl, "WDC fault detected\n");
1720 			xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1721 			break;
1722 		case CMD_ERRCL_CPC:
1723 		case CMD_ERRCL_L3_CPC:
1724 			/*
1725 			 * CPC is a disrupting trap, but since it happens due to
1726 			 * a snoop, the problematic way could become invalid,
1727 			 * overwritten by a different cache line, and then the
1728 			 * AFAR accessed and pulled into a different way,
1729 			 * causing a false positive match.  So it's best to not
1730 			 * look for a matching way and just ascribe these to
1731 			 *  the "anonymous" way.
1732 			 */
1733 			fmd_hdl_debug(hdl, "CPC fault detected\n");
1734 			xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1735 			break;
1736 		case CMD_ERRCL_UCC:
1737 		case CMD_ERRCL_L3_UCC:
1738 			/*
1739 			 * UCC is a precise trap, so, absent activity from the
1740 			 * other core, the tag address values read by the TL=1
1741 			 * trap handler are likely to be the same as those at
1742 			 * the time of the trap.
1743 			 * (A snoop from another CPU might cause a change in
1744 			 * state from valid to invalid, but the  tag address
1745 			 * won't change.) If we find a matching valid tag,
1746 			 * that identifies the way.
1747 			 */
1748 			fmd_hdl_debug(hdl, "UCC fault detected\n");
1749 			fmd_hdl_debug(hdl, "# of ways collected are %d\n",
1750 			    xr->xr_num_ways);
1751 			fmd_hdl_debug(hdl,
1752 			    "\n%s:cpu_id %d: error way = %d\n",
1753 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1754 			    xr->xr_error_way);
1755 			break;
1756 		case CMD_ERRCL_EDC:
1757 		case CMD_ERRCL_L3_EDC:
1758 			/*
1759 			 * EDC is a disrupting trap, but again if a matching
1760 			 * valid way is found, it is likely to be the correct
1761 			 * way.
1762 			 */
1763 			fmd_hdl_debug(hdl, "EDC fault detected\n");
1764 			fmd_hdl_debug(hdl, "# of ways collected are %d\n",
1765 			    xr->xr_num_ways);
1766 			fmd_hdl_debug(hdl,
1767 			    "\n%s:cpu_id %d: error way = %d\n",
1768 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1769 			    xr->xr_error_way);
1770 			break;
1771 		default:
1772 			fmd_hdl_debug(hdl, "Unexpected fault detected\n");
1773 			xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
1774 	}
1775 	if ((type == CMD_PTR_CPU_L2DATA) &&
1776 	    (xr->xr_cache_data != NULL) &&
1777 	    (!pn_there_is_a_matching_synd(hdl, xr))) {
1778 		fmd_hdl_debug(hdl, "No matching syndrome\n");
1779 	}
1780 	Lxcache = cmd_Lxcache_lookup_by_type_index_way_bit(xr->xr_cpu, type,
1781 	    xr->xr_error_index, xr->xr_error_way, bit);
1782 
1783 	if (Lxcache == NULL) {
1784 		fmd_hdl_debug(hdl,
1785 		    "\n%s: cpu %d: creating a case for index %d way %d"
1786 		    " bit %d\n",
1787 		    cache_ed->ed_fltnm, xr->xr_cpuid,
1788 		    xr->xr_error_index, xr->xr_error_way, bit);
1789 		Lxcache = cmd_Lxcache_create(hdl, xr, xr->xr_cpu,
1790 		    xr->xr_cpu->cpu_asru_nvl,
1791 		    type, xr->xr_error_index,
1792 		    xr->xr_error_way, bit);
1793 		if (Lxcache == NULL) {
1794 			fmd_hdl_debug(hdl,
1795 			    "\n%s:cpu_id %d:Failed to create a Lxcache for"
1796 			    " index %d way %d bit %d\n",
1797 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1798 			    Lxcache->Lxcache_index,
1799 			    Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
1800 			return (CMD_EVD_BAD);
1801 		}
1802 	}
1803 	if (cmd_create_case_for_Lxcache(hdl, cpu, Lxcache) == B_FALSE)
1804 		return (CMD_EVD_BAD);
1805 	if (Lxcache->Lxcache_case.cc_serdnm == NULL) {
1806 		Lxcache->Lxcache_case.cc_serdnm =
1807 		    cmd_Lxcache_serdnm_create(hdl, xr->xr_cpuid,
1808 		    type, xr->xr_error_index, xr->xr_error_way, bit);
1809 
1810 		if (!fmd_serd_exists(hdl,
1811 		    Lxcache->Lxcache_case.cc_serdnm)) {
1812 			fmd_serd_create(hdl,
1813 			    Lxcache->Lxcache_case.cc_serdnm,
1814 			    cache_ed->ed_serd->cs_n,
1815 			    cache_ed->ed_serd->cs_t);
1816 			fmd_hdl_debug(hdl,
1817 			    "\n%s: cpu_id %d: created a SERD engine %s\n",
1818 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1819 			    Lxcache->Lxcache_case.cc_serdnm);
1820 		}
1821 	}
1822 	/* Ensure that our case is not solved */
1823 	if ((Lxcache->Lxcache_case.cc_cp != NULL) &&
1824 	    fmd_case_solved(hdl, Lxcache->Lxcache_case.cc_cp)) {
1825 		fmd_hdl_debug(hdl,
1826 		    "\n%s:cpu %d: the case for %s is already solved.\n",
1827 		    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1828 		    Lxcache->Lxcache_bufname);
1829 		return (CMD_EVD_REDUND);
1830 	}
1831 
1832 	fmd_hdl_debug(hdl,
1833 	    "\n%s:cpu_id %d: checking if SERD engine %s has fired.\n",
1834 	    cache_ed->ed_fltnm, xr->xr_cpuid, Lxcache->Lxcache_case.cc_serdnm);
1835 
1836 	if (fmd_serd_record(hdl, Lxcache->Lxcache_case.cc_serdnm, ep)
1837 	    == FMD_B_FALSE)
1838 		return (CMD_EVD_OK); /* serd engine hasn't fired yet */
1839 
1840 	fmd_hdl_debug(hdl, "\n%s: cpu_id = %d creating fault %s\n",
1841 	    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1842 	    Lxcache->Lxcache_case.cc_serdnm);
1843 	fmd_case_add_serd(hdl, Lxcache->Lxcache_case.cc_cp,
1844 	    Lxcache->Lxcache_case.cc_serdnm);
1845 	fmd_serd_reset(hdl, Lxcache->Lxcache_case.cc_serdnm);
1846 	/*
1847 	 * Find out if there is a way at the fault index/bit that was retired
1848 	 * as suspect. We need this information for both anonymous way and
1849 	 * identified way handling. We store this info in suspect_Lxcache.
1850 	 */
1851 	fmd_hdl_debug(hdl,
1852 	    "\n%s:cpu_id %d checking if there is a way at"
1853 	    " index %d retired as suspect due to bit %d\n",
1854 	    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1855 	    Lxcache->Lxcache_index, Lxcache->Lxcache_bit);
1856 	suspect_Lxcache = cmd_Lxcache_lookup_by_type_index_bit_reason(
1857 	    cpu, type, Lxcache->Lxcache_index, Lxcache->Lxcache_bit,
1858 	    CMD_LXSUSPECT_DATA);
1859 	if (xr->xr_error_way != (uint32_t)CMD_ANON_WAY) {
1860 		/*
1861 		 * IDENTIFIED WAY DATA error handling.
1862 		 *
1863 		 * If there is a way at that index retired as suspect due
1864 		 * to that bit, unretire it.
1865 		 * retire the identified way, and mark the way as "convicted"
1866 		 * for this bit. Destroy any anonymous SERD engine named by
1867 		 * that index and bit.
1868 		 */
1869 		if (suspect_Lxcache != NULL) {
1870 			fmd_hdl_debug(hdl,
1871 			    "\n%s:cpu_id %d found index %d way %d"
1872 			    " bit %d retired on suspicion. Will"
1873 			    "  unretire this now.\n",
1874 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1875 			    suspect_Lxcache->Lxcache_index,
1876 			    suspect_Lxcache->Lxcache_way,
1877 			    suspect_Lxcache->Lxcache_bit);
1878 			/*
1879 			 * unretire the retired_way.
1880 			 */
1881 			if (cmd_Lxcache_unretire(hdl, cpu, suspect_Lxcache,
1882 			    cache_ed->ed_fltnm) == B_TRUE) {
1883 				suspect_Lxcache->Lxcache_reason =
1884 				    CMD_LXFUNCTIONING;
1885 				cmd_Lxcache_destroy(hdl, cpu, suspect_Lxcache);
1886 			}
1887 			/*
1888 			 * We proceed to retire the identified way even if
1889 			 * we are unable to unretire the suspect way.
1890 			 * We will not end up retiring all 4 ways because
1891 			 * we check the actual number of ways retired
1892 			 * at this index by reading the info from processor
1893 			 * directly. The call to get_index_retired_ways() does
1894 			 * that.
1895 			 */
1896 		}
1897 		/*
1898 		 * Before retiring a way check if we have already
1899 		 * retired 3 ways for this index.
1900 		 */
1901 		ways_retired = get_index_retired_ways(cpu, type,
1902 		    Lxcache->Lxcache_index);
1903 		if (ways_retired == -1) {
1904 			fmd_hdl_debug(hdl,
1905 			    "\n%s: cpu %d: We are unable to determine how many"
1906 			    " ways are retired at this index. We will not be"
1907 			    " retiring the identified cacheline at index %d"
1908 			    " way %d\n",
1909 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1910 			    Lxcache->Lxcache_index, Lxcache->Lxcache_way);
1911 			return (CMD_EVD_BAD);
1912 		}
1913 		if (ways_retired >= 3) {
1914 			fmd_hdl_debug(hdl,
1915 			    "\n%s: cpu %d: num of ways retired for index %d"
1916 			    " is %d. Will fault the CPU\n",
1917 			    cache_ed->ed_fltnm, cpu->cpu_cpuid,
1918 			    Lxcache->Lxcache_index, ways_retired);
1919 			cmd_fault_the_cpu(hdl, cpu, type, cache_ed->ed_fltnm);
1920 			return (CMD_EVD_OK);
1921 		}
1922 		/*
1923 		 * retire the cache line
1924 		 */
1925 		ret = cmd_Lxcache_retire_as_reason(hdl, cpu, Lxcache,
1926 		    cache_ed->ed_fltnm, CMD_LXCONVICTED);
1927 		if (ret != CMD_EVD_OK)
1928 			return (ret);
1929 		/*
1930 		 * anonymous serd engines for DATA faults will have valid bit
1931 		 * but way as -1.
1932 		 */
1933 		cmd_Lxcache_destroy_anonymous_serd_engines(hdl, cpu, type,
1934 		    Lxcache->Lxcache_index,
1935 		    bit);
1936 		return (CMD_EVD_OK);
1937 	}	/* end of IDENTIFIED WAY error handling */
1938 	/*
1939 	 * ANONYMOUS WAY DATA error handling.
1940 	 *
1941 	 * - if a way at this index has already been retired as "suspect"
1942 	 * for this bit, unretire that way, and retire the next retirable
1943 	 * way as "suspect" for this bit.
1944 	 * - if no ways have been retired as "suspect" for this bit,
1945 	 * retire the lowest unretired way as "suspect" for this bit.
1946 	 * - if there is no next retirable way, fault the CPU.
1947 	 */
1948 	/*
1949 	 * The assignment below is to make the code easier to maintain.
1950 	 * We need to destroy the anonymous_Lxcache after we have
	 * identified a way to retire. If we cannot determine a way to
	 * retire we will destroy the anonymous_Lxcache and fault the cpu.
1953 	 */
1954 	anonymous_Lxcache = Lxcache;
1955 	anonymous_Lxcache->Lxcache_ep = ep;
1956 	if (suspect_Lxcache != NULL) {
1957 		ret = unretire_suspect_and_retire_next_retirable_way(hdl,
1958 		    cpu, suspect_Lxcache, anonymous_Lxcache,
1959 		    cache_ed->ed_fltnm);
1960 	} else {
1961 		ret = retire_lowest_retirable_way_as_suspect(hdl, cpu,
1962 		    anonymous_Lxcache, cache_ed->ed_fltnm);
1963 	}
1964 	return (ret);
1965 }
1966 
1967 /* ARGSUSED */
1968 int
cmd_xr_pn_cache_fill(fmd_hdl_t * hdl,nvlist_t * nvl,cmd_xr_t * xr,cmd_cpu_t * cpu,cmd_errcl_t clcode)1969 cmd_xr_pn_cache_fill(fmd_hdl_t *hdl, nvlist_t *nvl, cmd_xr_t *xr,
1970     cmd_cpu_t *cpu, cmd_errcl_t clcode)
1971 {
1972 	struct ch_ec_data *data_ptr;
1973 	uint64_t *cache_data = NULL;
1974 	uint_t sz;
1975 
1976 	if (cpu->cpu_pers.cpup_type != CPU_ULTRASPARC_IVplus)
1977 		return (0);
1978 
1979 	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
1980 	    &xr->xr_detector_nvlist) != 0) {
1981 		fmd_hdl_debug(hdl, "look up for FM_EREPORT_DETECTOR failed\n");
1982 		return (-1);
1983 	}
1984 	if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_NAME_AFSR,
1985 	    &xr->xr_afsr) != 0) {
1986 		fmd_hdl_debug(hdl,
1987 		    "look up for FM_EREPORT_PAYLOAD_NAME_AFSR failed\n");
1988 		return (-1);
1989 	}
1990 
1991 	/* check clcode for l2/l3 first */
1992 	if (CMD_ERRCL_ISL3XXCU(clcode)) {
1993 		if (nvlist_lookup_uint8(nvl, FM_EREPORT_PAYLOAD_NAME_L3_WAYS,
1994 		    &xr->xr_num_ways) != 0) {
1995 			fmd_hdl_debug(hdl,
1996 		    "look up for FM_EREPORT_PAYLOAD_NAME_L3_WAYS failed\n");
1997 			return (-1);
1998 		}
1999 
2000 		if (nvlist_lookup_uint64_array(nvl,
2001 		    FM_EREPORT_PAYLOAD_NAME_L3_DATA, (uint64_t **)&cache_data,
2002 		    &sz) != 0) {
2003 			fmd_hdl_debug(hdl,
2004 		    "look up for FM_EREPORT_PAYLOAD_NAME_L3_DATA failed\n");
2005 		}
2006 	} else {
2007 		if (nvlist_lookup_uint8(nvl, FM_EREPORT_PAYLOAD_NAME_L2_WAYS,
2008 		    &xr->xr_num_ways) != 0) {
2009 			fmd_hdl_debug(hdl,
2010 		    "look up for FM_EREPORT_PAYLOAD_NAME_L2_WAYS failed\n");
2011 			return (-1);
2012 		}
2013 
2014 		if (nvlist_lookup_uint64_array(nvl,
2015 		    FM_EREPORT_PAYLOAD_NAME_L2_DATA, (uint64_t **)&cache_data,
2016 		    &sz) != 0) {
2017 			fmd_hdl_debug(hdl,
2018 		    "look up for FM_EREPORT_PAYLOAD_NAME_L2_DATA failed\n");
2019 		}
2020 	}
2021 	if (xr->xr_num_ways > PN_CACHE_NWAYS) {
2022 		fmd_hdl_debug(hdl,
2023 		    "xr_num_ways > PN_CACHE_WAYS\n");
2024 		return (-1);
2025 	}
2026 
2027 	xr->xr_cache_data = cache_data;
2028 	data_ptr = (struct ch_ec_data *)cache_data;
2029 	if (cache_data == NULL) {
2030 		xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
2031 		return (0);
2032 	}
2033 
2034 	/*
2035 	 * Our error handler checks for a matching valid way
2036 	 * If there is a match, there is only 1 data set, the set
2037 	 * associated with the cache-line/way that was "valid"
2038 	 * Otherwise, it stores all of the ways
2039 	 */
2040 	xr->xr_error_tag = data_ptr[0].ec_tag;
2041 	xr->xr_error_way = (uint32_t)data_ptr[0].ec_way;
2042 
2043 	/* If there is more than 1 way structure, set way to Anonymous */
2044 	if (xr->xr_num_ways > 1)
2045 		xr->xr_error_way = (uint32_t)CMD_ANON_WAY;
2046 
2047 	return (0);
2048 }
2049