p4_pcbe.c revision 843e19887f64dde75055cf8842fc4db2171eff45
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Performance Counter Back-End for Pentium 4.
 */

#include <sys/cpuvar.h>
#include <sys/param.h>
#include <sys/cpc_impl.h>
#include <sys/cpc_pcbe.h>
#include <sys/inttypes.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/x86_archext.h>
#include <sys/modctl.h>
#include <sys/sdt.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/privregs.h>

static int p4_pcbe_init(void);
static uint_t p4_pcbe_ncounters(void);
static const char *p4_pcbe_impl_name(void);
static const char *p4_pcbe_cpuref(void);
static char *p4_pcbe_list_events(uint_t picnum);
static char *p4_pcbe_list_attrs(void);
static uint64_t p4_pcbe_event_coverage(char *event);
static uint64_t p4_pcbe_overflow_bitmap(void);
static int p4_pcbe_configure(uint_t picnum, char *event, uint64_t preset,
    uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
    void *token);
static void p4_pcbe_program(void *token);
static void p4_pcbe_allstop(void);
static void p4_pcbe_sample(void *token);
static void p4_pcbe_free(void *config);

extern int cpuid_get_clogid(cpu_t *);

static pcbe_ops_t p4_pcbe_ops = {
	PCBE_VER_1,
	CPC_CAP_OVERFLOW_INTERRUPT | CPC_CAP_OVERFLOW_PRECISE,
	p4_pcbe_ncounters,
	p4_pcbe_impl_name,
	p4_pcbe_cpuref,
	p4_pcbe_list_events,
	p4_pcbe_list_attrs,
	p4_pcbe_event_coverage,
	p4_pcbe_overflow_bitmap,
	p4_pcbe_configure,
	p4_pcbe_program,
	p4_pcbe_allstop,
	p4_pcbe_sample,
	p4_pcbe_free
};

/*
 * P4 Configuration Flags.
 */
#define	P4_THIS_USR	0x1 /* HTT: Measure usr events on this logical CPU */
#define	P4_THIS_SYS	0x2 /* HTT: Measure os events on this logical CPU */
#define	P4_SIBLING_USR	0x4 /* HTT: Measure usr events on other logical CPU */
#define	P4_SIBLING_SYS	0x8 /* HTT: Measure os events on other logical CPU */
#define	P4_PMI		0x10 /* HTT: Set PMI bit for local logical CPU */

typedef struct _p4_pcbe_config {
	uint8_t		p4_flags;
	uint8_t		p4_picno;	/* From 0 to 17 */
	uint8_t		p4_escr_ndx;	/* Which ESCR to use */
	uint32_t	p4_escr;	/* Value to program in selected ESCR */
	uint32_t	p4_cccr;	/* Value to program in counter's CCCR */
	uint64_t	p4_rawpic;
} p4_pcbe_config_t;

typedef uint32_t cntr_map_t;

typedef struct _p4_escr {
	int		pe_num;
	uint32_t	pe_addr;
	uint32_t	pe_map; /* bitmap of counters; bit 0 means ctr 0 */
} p4_escr_t;

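/*
 * The P4's performance counters are 40 bits wide; MASK40 masks a raw
 * counter value down to that width.
 */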
#define	MASK40			UINT64_C(0xffffffffff)

/*
 * CCCR field definitions.
 *
 * Note that the Intel Developer's Manual states that the reserved field at
 * bit locations 16 and 17 (the active_thread field on HyperThreaded
 * processors) must be set to 11.
 */
#define	CCCR_ENABLE_SHIFT	12
#define	CCCR_ESCR_SEL_SHIFT	13
#define	CCCR_ACTV_THR_SHIFT	16
#define	CCCR_COMPARE_SHIFT	18
#define	CCCR_COMPLEMENT_SHIFT	19
#define	CCCR_THRESHOLD_SHIFT	20
#define	CCCR_EDGE_SHIFT		24
#define	CCCR_OVF_PMI_SHIFT	26
#define	CCCR_OVF_PMI_T0_SHIFT	26
#define	CCCR_OVF_PMI_T1_SHIFT	27
#define	CCCR_OVF_SHIFT		31
#define	CCCR_ACTV_THR_MASK	0x3
#define	CCCR_THRESHOLD_MAX	0xF
#define	CCCR_ENABLE		(1U << CCCR_ENABLE_SHIFT)
#define	CCCR_COMPARE		(1U << CCCR_COMPARE_SHIFT)
#define	CCCR_COMPLEMENT		(1U << CCCR_COMPLEMENT_SHIFT)
#define	CCCR_EDGE		(1U << CCCR_EDGE_SHIFT)
#define	CCCR_OVF_PMI		(1U << CCCR_OVF_PMI_SHIFT)
#define	CCCR_OVF_PMI_T0		(1U << CCCR_OVF_PMI_T0_SHIFT)
#define	CCCR_OVF_PMI_T1		(1U << CCCR_OVF_PMI_T1_SHIFT)
#define	CCCR_INIT		CCCR_ENABLE
#define	CCCR_OVF		(1U << CCCR_OVF_SHIFT)

#define	ESCR_EVSEL_SHIFT	25
#define	ESCR_EVMASK_SHIFT	9
#define	ESCR_TAG_VALUE_SHIFT	5
#define	ESCR_TAG_VALUE_MAX	0xF
#define	ESCR_TAG_ENABLE_SHIFT	4
#define	ESCR_USR_SHIFT		2
#define	ESCR_OS_SHIFT		3
#define	ESCR_USR		(1U << ESCR_USR_SHIFT)
#define	ESCR_OS			(1U << ESCR_OS_SHIFT)
#define	ESCR_TAG_ENABLE		(1U << ESCR_TAG_ENABLE_SHIFT)

/*
 * HyperThreaded ESCR fields.
 */
#define	ESCR_T0_OS_SHIFT	3
#define	ESCR_T0_USR_SHIFT	2
#define	ESCR_T1_OS_SHIFT	1
#define	ESCR_T1_USR_SHIFT	0
#define	ESCR_T0_OS		(1U << ESCR_T0_OS_SHIFT)
#define	ESCR_T0_USR		(1U << ESCR_T0_USR_SHIFT)
#define	ESCR_T1_OS		(1U << ESCR_T1_OS_SHIFT)
#define	ESCR_T1_USR		(1U << ESCR_T1_USR_SHIFT)

/*
 * ESCRs are grouped by counter; each group of ESCRs is associated with a
 * distinct group of counters. Use these macros to fill in the table below.
 */
#define	BPU0_map	(0x1 | 0x2)		/* Counters 0 and 1 */
#define	BPU2_map	(0x4 | 0x8)		/* Counters 2 and 3 */
#define	MS0_map		(0x10 | 0x20)		/* Counters 4 and 5 */
#define	MS2_map		(0x40 | 0x80)		/* Counters 6 and 7 */
#define	FLAME0_map	(0x100 | 0x200)		/* Counters 8 and 9 */
#define	FLAME2_map	(0x400 | 0x800)		/* Counters 10 and 11 */
#define	IQ0_map		(0x1000 | 0x2000 | 0x10000) /* Counters 12, 13, 16 */
#define	IQ2_map		(0x4000 | 0x8000 | 0x20000) /* Counters 14, 15, 17 */

/*
 * Table describing the 45 Event Selection and Control Registers (ESCRs).
 */
const p4_escr_t p4_escrs[] = {
#define	BPU0 (1)
	{ 0, 0x3B2, BPU0_map },		/* 0 */
#define	IS0 (1ULL << 1)
	{ 1, 0x3B4, BPU0_map },		/* 1 */
#define	MOB0 (1ULL << 2)
	{ 2, 0x3AA, BPU0_map },		/* 2 */
#define	ITLB0 (1ULL << 3)
	{ 3, 0x3B6, BPU0_map },		/* 3 */
#define	PMH0 (1ULL << 4)
	{ 4, 0x3AC, BPU0_map },		/* 4 */
#define	IX0 (1ULL << 5)
	{ 5, 0x3C8, BPU0_map },		/* 5 */
#define	FSB0 (1ULL << 6)
	{ 6, 0x3A2, BPU0_map },		/* 6 */
#define	BSU0 (1ULL << 7)
	{ 7, 0x3A0, BPU0_map },		/* 7 */
#define	BPU1 (1ULL << 8)
	{ 0, 0x3B3, BPU2_map },		/* 8 */
#define	IS1 (1ULL << 9)
	{ 1, 0x3B5, BPU2_map },		/* 9 */
#define	MOB1 (1ULL << 10)
	{ 2, 0x3AB, BPU2_map },		/* 10 */
#define	ITLB1 (1ULL << 11)
	{ 3, 0x3B7, BPU2_map },		/* 11 */
#define	PMH1 (1ULL << 12)
	{ 4, 0x3AD, BPU2_map },		/* 12 */
#define	IX1 (1ULL << 13)
	{ 5, 0x3C9, BPU2_map },		/* 13 */
#define	FSB1 (1ULL << 14)
	{ 6, 0x3A3, BPU2_map },		/* 14 */
#define	BSU1 (1ULL << 15)
	{ 7, 0x3A1, BPU2_map },		/* 15 */
#define	MS0 (1ULL << 16)
	{ 0, 0x3C0, MS0_map },		/* 16 */
#define	TC0 (1ULL << 17)
	{ 1, 0x3C4, MS0_map },		/* 17 */
#define	TBPU0 (1ULL << 18)
	{ 2, 0x3C2, MS0_map },		/* 18 */
#define	MS1 (1ULL << 19)
	{ 0, 0x3C1, MS2_map },		/* 19 */
#define	TC1 (1ULL << 20)
	{ 1, 0x3C5, MS2_map },		/* 20 */
#define	TBPU1 (1ULL << 21)
	{ 2, 0x3C3, MS2_map },		/* 21 */
#define	FLAME0 (1ULL << 22)
	{ 0, 0x3A6, FLAME0_map },	/* 22 */
#define	FIRM0 (1ULL << 23)
	{ 1, 0x3A4, FLAME0_map },	/* 23 */
#define	SAAT0 (1ULL << 24)
	{ 2, 0x3AE, FLAME0_map },	/* 24 */
#define	U2L0 (1ULL << 25)
	{ 3, 0x3B0, FLAME0_map },	/* 25 */
#define	DAC0 (1ULL << 26)
	{ 5, 0x3A8, FLAME0_map },	/* 26 */
#define	FLAME1 (1ULL << 27)
	{ 0, 0x3A7, FLAME2_map },	/* 27 */
#define	FIRM1 (1ULL << 28)
	{ 1, 0x3A5, FLAME2_map },	/* 28 */
#define	SAAT1 (1ULL << 29)
	{ 2, 0x3AF, FLAME2_map },	/* 29 */
#define	U2L1 (1ULL << 30)
	{ 3, 0x3B1, FLAME2_map },	/* 30 */
#define	DAC1 (1ULL << 31)
	{ 5, 0x3A9, FLAME2_map },	/* 31 */
#define	IQ0 (1ULL << 32)
	{ 0, 0x3BA, IQ0_map },		/* 32 */
#define	ALF0 (1ULL << 33)
	{ 1, 0x3CA, IQ0_map },		/* 33 */
#define	RAT0 (1ULL << 34)
	{ 2, 0x3BC, IQ0_map },		/* 34 */
#define	SSU0 (1ULL << 35)
	{ 3, 0x3BE, IQ0_map },		/* 35 */
#define	CRU0 (1ULL << 36)
	{ 4, 0x3B8, IQ0_map },		/* 36 */
#define	CRU2 (1ULL << 37)
	{ 5, 0x3CC, IQ0_map },		/* 37 */
#define	CRU4 (1ULL << 38)
	{ 6, 0x3E0, IQ0_map },		/* 38 */
#define	IQ1 (1ULL << 39)
	{ 0, 0x3BB, IQ2_map },		/* 39 */
#define	ALF1 (1ULL << 40)
	{ 1, 0x3CB, IQ2_map },		/* 40 */
#define	RAT1 (1ULL << 41)
	{ 2, 0x3BD, IQ2_map },		/* 41 */
#define	CRU1 (1ULL << 42)
	{ 4, 0x3B9, IQ2_map },		/* 42 */
#define	CRU3 (1ULL << 43)
	{ 5, 0x3CD, IQ2_map },		/* 43 */
#define	CRU5 (1ULL << 44)
	{ 6, 0x3E1, IQ2_map }		/* 44 */
};

#define	ESCR_MAX_INDEX 45

typedef struct _p4_ctr {
	uint32_t	pc_caddr;	/* counter MSR address */
	uint32_t	pc_ctladdr;	/* counter's CCCR MSR address */
	uint64_t	pc_map;		/* bitmap of ESCRs controlling ctr */
} p4_ctr_t;

const p4_ctr_t p4_ctrs[18] = {
{ /* BPU_COUNTER0 */ 0x300, 0x360, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0},
{ /* BPU_COUNTER1 */ 0x301, 0x361, BSU0|FSB0|MOB0|PMH0|BPU0|IS0|ITLB0|IX0},
{ /* BPU_COUNTER2 */ 0x302, 0x362, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1},
{ /* BPU_COUNTER3 */ 0x303, 0x363, BSU1|FSB1|MOB1|PMH1|BPU1|IS1|ITLB1|IX1},
{ /* MS_COUNTER0 */  0x304, 0x364, MS0|TBPU0|TC0 },
{ /* MS_COUNTER1 */  0x305, 0x365, MS0|TBPU0|TC0 },
{ /* MS_COUNTER2 */  0x306, 0x366, MS1|TBPU1|TC1 },
{ /* MS_COUNTER3 */  0x307, 0x367, MS1|TBPU1|TC1 },
{ /* FLAME_COUNTER0 */ 0x308, 0x368, FIRM0|FLAME0|DAC0|SAAT0|U2L0 },
{ /* FLAME_COUNTER1 */ 0x309, 0x369, FIRM0|FLAME0|DAC0|SAAT0|U2L0 },
{ /* FLAME_COUNTER2 */ 0x30A, 0x36A, FIRM1|FLAME1|DAC1|SAAT1|U2L1 },
{ /* FLAME_COUNTER3 */ 0x30B, 0x36B, FIRM1|FLAME1|DAC1|SAAT1|U2L1 },
{ /* IQ_COUNTER0 */  0x30C, 0x36C, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER1 */  0x30D, 0x36D, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER2 */  0x30E, 0x36E, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 },
{ /* IQ_COUNTER3 */  0x30F, 0x36F, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 },
{ /* IQ_COUNTER4 */  0x310, 0x370, CRU0|CRU2|CRU4|IQ0|RAT0|SSU0|ALF0 },
{ /* IQ_COUNTER5 */  0x311, 0x371, CRU1|CRU3|CRU5|IQ1|RAT1|ALF1 }
};

typedef struct _p4_event {
	char		*pe_name;	/* Name of event according to docs */
	uint64_t	pe_escr_map;	/* Bitmap of ESCRs capable of event */
	uint32_t	pe_escr_mask;	/* permissible ESCR event mask */
	uint8_t		pe_ev;		/* ESCR event select value */
	uint16_t	pe_cccr;	/* CCCR select value */
	uint32_t	pe_ctr_mask;	/* Bitmap of capable counters */
} p4_event_t;

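/*
 * Convenience macro: the bit for counter 'n' in a counter bitmap.
 */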
#define	C(n) (1 << n)

p4_event_t p4_events[] = {
{ "branch_retired", CRU2|CRU3, 0xF, 0x6, 0x5, C(12)|C(13)|C(14)|C(15)|C(16) },
{ "mispred_branch_retired", CRU0|CRU1, 0x1, 0x3, 0x4,
	C(12)|C(13)|C(14)|C(15)|C(16) },
{ "TC_deliver_mode", TC0|TC1, 0xFF, 0x1, 0x1, C(4)|C(5)|C(6)|C(7) },
{ "BPU_fetch_request", BPU0|BPU1, 0x1, 0x3, 0x0, C(0)|C(1)|C(2)|C(3) },
{ "ITLB_reference", ITLB0|ITLB1, 0x7, 0x18, 0x3, C(0)|C(1)|C(2)|C(3) },
{ "memory_cancel", DAC0|DAC1, 0x6, 0x2, 0x5, C(8)|C(9)|C(10)|C(11) },
{ "memory_complete", SAAT0|SAAT1, 0x3, 0x8, 0x2, C(8)|C(9)|C(10)|C(11) },
{ "load_port_replay", SAAT0|SAAT1, 0x1, 0x4, 0x2, C(8)|C(9)|C(10)|C(11) },
{ "store_port_replay", SAAT0|SAAT1, 0x1, 0x5, 0x2, C(8)|C(9)|C(10)|C(11) },
{ "MOB_load_replay", MOB0|MOB1, 0x35, 0x3, 0x2, C(0)|C(1)|C(2)|C(3) },
{ "page_walk_type", PMH0|PMH1, 0x3, 0x1, 0x4, C(0)|C(1)|C(2)|C(3) },
{ "BSQ_cache_reference", BSU0|BSU1, 0x73F, 0xC, 0x7, C(0)|C(1)|C(2)|C(3) },
{ "IOQ_allocation", FSB0, 0xEFFF, 0x3, 0x6, C(0)|C(1) },
{ "IOQ_active_entries", FSB1, 0xEFFF, 0x1A, 0x6, C(2)|C(3) },
{ "FSB_data_activity", FSB0|FSB1, 0x3F, 0x17, 0x6, C(0)|C(1)|C(2)|C(3) },
{ "BSQ_allocation", BSU0, 0x3FEF, 0x5, 0x7, C(0)|C(1) },
{ "bsq_active_entries", BSU1, 0x3FEF, 0x6, 0x7, C(2)|C(3) },
{ "x87_assist", CRU2|CRU3, 0x1F, 0x3, 0x5, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "SSE_input_assist", FIRM0|FIRM1, 0x8000, 0x34, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "packed_SP_uop", FIRM0|FIRM1, 0x8000, 0x8, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "packed_DP_uop", FIRM0|FIRM1, 0x8000, 0xC, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "scalar_SP_uop", FIRM0|FIRM1, 0x8000, 0xA, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "scalar_DP_uop", FIRM0|FIRM1, 0x8000, 0xE, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "64bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x2, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "128bit_MMX_uop", FIRM0|FIRM1, 0x8000, 0x1A, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "x87_FP_uop", FIRM0|FIRM1, 0x8000, 0x4, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "x87_SIMD_moves_uop", FIRM0|FIRM1, 0x18, 0x2E, 0x1, C(8)|C(9)|C(10)|C(11) },
{ "machine_clear", CRU2|CRU3, 0xD, 0x2, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "global_power_events", FSB0|FSB1, 0x1, 0x5, 0x6, C(0)|C(1)|C(2)|C(3) },
{ "tc_ms_xfer", MS0|MS1, 0x1, 0x5, 0x0, C(4)|C(5)|C(6)|C(7) },
{ "uop_queue_writes", MS0|MS1, 0x7, 0x9, 0x0, C(4)|C(5)|C(6)|C(7) },
{ "front_end_event", CRU2|CRU3, 0x3, 0x8, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "execution_event", CRU2|CRU3, 0xFF, 0xC, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "replay_event", CRU2|CRU3, 0x3, 0x9, 0x5,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "instr_retired", CRU0|CRU1, 0xF, 0x2, 0x4,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "uops_retired", CRU0|CRU1, 0x3, 0x1, 0x4,
	C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "uop_type", RAT0|RAT1, 0x3, 0x2, 0x2, C(12)|C(13)|C(14)|C(15)|C(16)|C(17)},
{ "retired_mispred_branch_type", TBPU0|TBPU1, 0x1F, 0x5, 0x2,
	C(4)|C(5)|C(6)|C(7)},
{ "retired_branch_type", TBPU0|TBPU1, 0x1F, 0x4, 0x2, C(4)|C(5)|C(6)|C(7) },
{ NULL, 0, 0, 0, 0 }
};

/*
 * Indicates whether the "rdpmc" instruction is available on this processor.
 */
static int p4_rdpmc_avail = 0;

static const uint64_t p4_cccrstop = 0;

static char *p4_eventlist[18];

/*
 * If set, this processor has HyperThreading.
 */
static int p4_htt = 0;

#define	P4_FAMILY	0xF

static int
p4_pcbe_init(void)
{
	int		i;
	size_t		size;
	p4_event_t	*ev;

	/*
	 * If we're not running on a P4, refuse to load.
	 */
	if (cpuid_getvendor(CPU) != X86_VENDOR_Intel ||
	    cpuid_getfamily(CPU) != P4_FAMILY)
		return (-1);

	/*
	 * Set up the event lists for each counter.
	 *
	 * First pass calculates the size of the event list, and the second
	 * pass copies each event name into the event list.
	 */
	for (i = 0; i < 18; i++) {
		size = 0;
		for (ev = p4_events; ev->pe_name != NULL; ev++) {
			if (ev->pe_ctr_mask & C(i))
				size += strlen(ev->pe_name) + 1;
		}

		/*
		 * We use 'size + 1' here to ensure room for the final
		 * strcat when it terminates the string.
		 */
		p4_eventlist[i] = (char *)kmem_alloc(size + 1, KM_SLEEP);
		*p4_eventlist[i] = '\0';

		for (ev = p4_events; ev->pe_name != NULL; ev++) {
			if (ev->pe_ctr_mask & C(i)) {
				(void) strcat(p4_eventlist[i], ev->pe_name);
				(void) strcat(p4_eventlist[i], ",");
			}
		}
		/*
		 * Remove trailing ','
		 */
		p4_eventlist[i][size - 1] = '\0';
	}

	if (x86_feature & X86_MMX)
		p4_rdpmc_avail = 1;
	/*
	 * The X86_HTT flag may disappear soon, so we'll isolate the impact of
	 * its demise to the following if().
	 */
	if (x86_feature & X86_HTT)
		p4_htt = 1;

	return (0);
}

static uint_t
p4_pcbe_ncounters(void)
{
	return (18);
}

static const char *
p4_pcbe_impl_name(void)
{
	if (p4_htt)
		return ("Pentium 4 with HyperThreading");
	return ("Pentium 4");
}

static const char *
p4_pcbe_cpuref(void)
{
	return ("See Appendix A.1 of the \"IA-32 Intel Architecture Software " \
	    "Developer's Manual Volume 3: System Programming Guide,\" "	       \
	    "Order # 245472-012, 2003");
}

static char *
p4_pcbe_list_events(uint_t picnum)
{
	ASSERT(picnum >= 0 && picnum < 18);

	return (p4_eventlist[picnum]);
}

#define	P4_ATTRS "emask,tag,compare,complement,threshold,edge"

static char *
p4_pcbe_list_attrs(void)
{
	if (p4_htt)
		return (P4_ATTRS ",active_thread,count_sibling_usr,"
		    "count_sibling_sys");
	return (P4_ATTRS);
}

static uint64_t
p4_pcbe_event_coverage(char *event)
{
	p4_event_t *ev;

	for (ev = p4_events; ev->pe_name != NULL; ev++) {
		if (strcmp(event, ev->pe_name) == 0)
			break;
	}

	return (ev->pe_ctr_mask);
}

static uint64_t
p4_pcbe_overflow_bitmap(void)
{
	extern int	kcpc_hw_overflow_intr_installed;
	uint64_t	ret = 0;
	int		i;

	/*
	 * The CCCR's OVF bit indicates that the corresponding counter has
	 * overflowed. Since the bit must be explicitly cleared by software,
	 * reading each CCCR here reliably tells us which counters have
	 * overflowed.
	 */
	for (i = 0; i < 18; i++) {
		if (rdmsr(p4_ctrs[i].pc_ctladdr) & CCCR_OVF)
			ret |= (1 << i);
	}

	/*
	 * Pentium 4 and Xeon turn off the CPC interrupt mask bit in the LVT at
	 * every overflow. Turn it back on here.
	 */
	ASSERT(kcpc_hw_overflow_intr_installed);
	(*kcpc_hw_enable_cpc_intr)();

	return (ret);
}

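/*
 * Return nonzero if the ESCR at index 'escr_ndx' is already claimed by one
 * of the configurations in 'cfgs'.
 */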
static int
p4_escr_inuse(p4_pcbe_config_t **cfgs, int escr_ndx)
{
	int i;

	for (i = 0; i < 18; i++) {
		if (cfgs[i] == NULL)
			continue;
		if (cfgs[i]->p4_escr_ndx == escr_ndx)
			return (1);
	}

	return (0);
}

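/*
 * Walk the configurations handed to us by the framework and index them (and,
 * optionally, their data addresses) by counter number.
 */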
static void
build_cfgs(p4_pcbe_config_t *cfgs[18], uint64_t *data[18], void *token)
{
	p4_pcbe_config_t	*cfg = NULL;
	uint64_t		*daddr;

	bzero(cfgs, 18 * sizeof (p4_pcbe_config_t *));

	do {
		cfg = (p4_pcbe_config_t *)kcpc_next_config(token, cfg, &daddr);

		if (cfg != NULL) {
			ASSERT(cfg->p4_picno < 18);
			cfgs[cfg->p4_picno] = cfg;
			if (data != NULL) {
				ASSERT(daddr != NULL);
				data[cfg->p4_picno] = daddr;
			}
		}
	} while (cfg != NULL);
}

/*
 * Programming a counter:
 *
 * Select event.
 * Choose an ESCR capable of counting that event.
 * Set up the ESCR with the desired parameters (usr, sys, tag).
 * Set up the CCCR to point to the selected ESCR.
 * Set the CCCR parameters (overflow, cascade, edge, etc).
 */
static int
p4_pcbe_configure(uint_t picnum, char *eventname, uint64_t preset,
    uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
    void *token)
{
	p4_pcbe_config_t	*cfgs[18];
	p4_pcbe_config_t	*cfg;
	p4_event_t		*ev;
	int			escr_ndx;
	int			i;
	uint16_t		emask = 0;
	uint8_t			tag;
	int			use_tag = 0;
	int			active_thread = 0x3; /* default is "any" */
	int			compare = 0;
	int			complement = 0;
	int			threshold = 0;
	int			edge = 0;
	int			sibling_usr = 0; /* count usr on other cpu */
	int			sibling_sys = 0; /* count sys on other cpu */

	/*
	 * If we've been handed an existing configuration, we need only preset
	 * the counter value.
	 */
	if (*data != NULL) {
		cfg = *data;
		cfg->p4_rawpic = preset & MASK40;
		return (0);
	}

	if (picnum < 0 || picnum >= 18)
		return (CPC_INVALID_PICNUM);

	for (ev = p4_events; ev->pe_name != NULL; ev++) {
		if (strcmp(eventname, ev->pe_name) == 0)
			break;
	}
	if (ev->pe_name == NULL)
		return (CPC_INVALID_EVENT);

	build_cfgs(cfgs, NULL, token);

	/*
	 * Find an ESCR capable of counting this event.
	 */
	for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) {
		if ((ev->pe_escr_map & (1ULL << escr_ndx)) &&
		    p4_escr_inuse(cfgs, escr_ndx) == 0)
			break;
	}

	/*
	 * All ESCRs capable of counting this event are already being
	 * used.
	 */
	if (escr_ndx == ESCR_MAX_INDEX)
		return (CPC_RESOURCE_UNAVAIL);

	/*
	 * At this point, ev points to the desired event and escr_ndx is the
	 * index of a capable and available ESCR.
	 *
	 * Now process and verify the attributes.
	 */
	for (i = 0; i < nattrs; i++) {
		if (strcmp("emask", attrs[i].ka_name) == 0) {
			if ((attrs[i].ka_val | ev->pe_escr_mask)
			    != ev->pe_escr_mask)
				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
			emask = attrs[i].ka_val;
			continue;
		} else if (strcmp("tag", attrs[i].ka_name) == 0) {
			if (attrs[i].ka_val > ESCR_TAG_VALUE_MAX)
				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
			tag = attrs[i].ka_val;
			use_tag = 1;
			continue;
		} else if (strcmp("compare", attrs[i].ka_name) == 0) {
			if (attrs[i].ka_val != 0)
				compare = 1;
			continue;
		} else if (strcmp("complement", attrs[i].ka_name) == 0) {
			if (attrs[i].ka_val != 0)
				complement = 1;
			continue;
		} else if (strcmp("threshold", attrs[i].ka_name) == 0) {
			if (attrs[i].ka_val > CCCR_THRESHOLD_MAX)
				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
			threshold = attrs[i].ka_val;
			continue;
		} else if (strcmp("edge", attrs[i].ka_name) == 0) {
			if (attrs[i].ka_val != 0)
				edge = 1;
			continue;
		}

		/*
		 * The remaining attributes are valid only on HyperThreaded P4s
		 * for processes with the "cpc_cpu" privilege.
		 */
		if (p4_htt == 0)
			return (CPC_INVALID_ATTRIBUTE);

		if (secpolicy_cpc_cpu(crgetcred()) != 0)
			return (CPC_ATTR_REQUIRES_PRIVILEGE);

		if (strcmp("active_thread", attrs[i].ka_name) == 0) {
			if ((attrs[i].ka_val | CCCR_ACTV_THR_MASK) !=
			    CCCR_ACTV_THR_MASK)
				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
			active_thread = (int)attrs[i].ka_val;
		} else if (strcmp("count_sibling_usr", attrs[i].ka_name) == 0) {
			if (attrs[i].ka_val != 0)
				sibling_usr = 1;
		} else if (strcmp("count_sibling_sys", attrs[i].ka_name) == 0) {
			if (attrs[i].ka_val != 0)
				sibling_sys = 1;
		} else
			return (CPC_INVALID_ATTRIBUTE);
	}

	/*
	 * Make sure the counter can count this event
	 */
	if ((ev->pe_ctr_mask & C(picnum)) == 0)
		return (CPC_PIC_NOT_CAPABLE);

	/*
	 * Find an ESCR that lines up with the event _and_ the counter.
	 */
	for (escr_ndx = 0; escr_ndx < ESCR_MAX_INDEX; escr_ndx++) {
		if ((ev->pe_escr_map & (1ULL << escr_ndx)) &&
		    (p4_escrs[escr_ndx].pe_map & (1 << picnum)) &&
		    p4_escr_inuse(cfgs, escr_ndx) == 0)
			break;
	}
	if (escr_ndx == ESCR_MAX_INDEX)
		return (CPC_RESOURCE_UNAVAIL);

	cfg = (p4_pcbe_config_t *)kmem_alloc(sizeof (p4_pcbe_config_t),
	    KM_SLEEP);

	cfg->p4_flags = 0;
	cfg->p4_picno = picnum;
	cfg->p4_escr_ndx = escr_ndx;
	cfg->p4_escr = (ev->pe_ev << ESCR_EVSEL_SHIFT) |
	    (emask << ESCR_EVMASK_SHIFT);

	if (use_tag == 1) {
		cfg->p4_escr |= tag << ESCR_TAG_VALUE_SHIFT;
		cfg->p4_escr |= ESCR_TAG_ENABLE;
	}

	if (p4_htt) {
		/*
		 * This is a HyperThreaded P4.  Since we don't know which
		 * logical CPU this configuration will eventually be programmed
		 * on, we can't yet decide which fields of the ESCR to select.
		 *
		 * Record the necessary information in the flags for later.
		 */
		if (flags & CPC_COUNT_USER)
			cfg->p4_flags |= P4_THIS_USR;
		if (flags & CPC_COUNT_SYSTEM)
			cfg->p4_flags |= P4_THIS_SYS;
		if (p4_htt && sibling_usr)
			cfg->p4_flags |= P4_SIBLING_USR;
		if (p4_htt && sibling_sys)
			cfg->p4_flags |= P4_SIBLING_SYS;
	} else {
		/*
		 * This is not HyperThreaded, so we can determine the exact
		 * ESCR value necessary now.
		 */
		if (flags & CPC_COUNT_USER)
			cfg->p4_escr |= ESCR_USR;
		if (flags & CPC_COUNT_SYSTEM)
			cfg->p4_escr |= ESCR_OS;
	}

	cfg->p4_rawpic = preset & MASK40;

	/*
	 * Even on non-HT P4s, Intel states the active_thread field (marked as
	 * "reserved" for the non-HT chips) must be set to all 1s.
	 */
	cfg->p4_cccr = CCCR_INIT | (active_thread << CCCR_ACTV_THR_SHIFT);
	if (compare)
		cfg->p4_cccr |= CCCR_COMPARE;
	if (complement)
		cfg->p4_cccr |= CCCR_COMPLEMENT;
	cfg->p4_cccr |= threshold << CCCR_THRESHOLD_SHIFT;
	if (edge)
		cfg->p4_cccr |= CCCR_EDGE;
	cfg->p4_cccr |= p4_escrs[cfg->p4_escr_ndx].pe_num
	    << CCCR_ESCR_SEL_SHIFT;
	if (flags & CPC_OVF_NOTIFY_EMT) {
		if (p4_htt)
			cfg->p4_flags |= P4_PMI;
		else {
			/*
			 * If the user has asked for notification of overflows,
			 * we automatically program the hardware to generate an
			 * interrupt on overflow.
			 *
			 * This can only be programmed now if this P4 doesn't
			 * have HyperThreading. If it does, we must wait until
			 * we know which logical CPU we'll be programming.
			 */
			cfg->p4_cccr |= CCCR_OVF_PMI;
		}
	}

	*data = cfg;

	return (0);
}

static void
p4_pcbe_program(void *token)
{
	int			i;
	uint64_t		cccr;
	p4_pcbe_config_t	*cfgs[18];

	p4_pcbe_allstop();

	build_cfgs(cfgs, NULL, token);

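	/*
	 * CR4.PCE controls whether the rdpmc instruction may be executed from
	 * user mode; enable it only when non-privileged access to the counters
	 * is allowed for this context.
	 */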
	if (p4_rdpmc_avail) {
		ulong_t curcr4 = getcr4();
		if (kcpc_allow_nonpriv(token))
			setcr4(curcr4 | CR4_PCE);
		else
			setcr4(curcr4 & ~CR4_PCE);
	}

	/*
	 * Ideally we would start all counters with a single operation, but in
	 * P4 each counter is enabled individually via its CCCR. To minimize the
	 * probe effect of enabling the counters, we do it in two passes: the
	 * first programs the counter and ESCR, and the second programs the
	 * CCCR (and thus enables the counter).
	 */
	if (p4_htt) {
		int	lid = cpuid_get_clogid(CPU); /* Logical ID of CPU */

		for (i = 0; i < 18; i++) {
			uint64_t escr;

			if (cfgs[i] == NULL)
				continue;
			escr = (uint64_t)cfgs[i]->p4_escr;

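			/*
			 * Select the USR/OS bits for the thread doing the
			 * counting (T0 or T1) and, if requested, for the
			 * sibling thread as well.
			 */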
			if (cfgs[i]->p4_flags & P4_THIS_USR)
				escr |= (lid == 0) ? ESCR_T0_USR : ESCR_T1_USR;
			if (cfgs[i]->p4_flags & P4_THIS_SYS)
				escr |= (lid == 0) ? ESCR_T0_OS : ESCR_T1_OS;
			if (cfgs[i]->p4_flags & P4_SIBLING_USR)
				escr |= (lid == 0) ? ESCR_T1_USR : ESCR_T0_USR;
			if (cfgs[i]->p4_flags & P4_SIBLING_SYS)
				escr |= (lid == 0) ? ESCR_T1_OS : ESCR_T0_OS;

			wrmsr(p4_ctrs[i].pc_caddr, cfgs[i]->p4_rawpic);
			wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr, escr);
		}

		for (i = 0; i < 18; i++) {
			if (cfgs[i] == NULL)
				continue;
			cccr = (uint64_t)cfgs[i]->p4_cccr;
			/*
			 * We always target the overflow interrupt at the
			 * logical CPU which is doing the counting.
			 */
			if (cfgs[i]->p4_flags & P4_PMI)
				cccr |= (lid == 0) ?
				    CCCR_OVF_PMI_T0 : CCCR_OVF_PMI_T1;
			wrmsr(p4_ctrs[i].pc_ctladdr, cccr);
		}
	} else {
		for (i = 0; i < 18; i++) {
			if (cfgs[i] == NULL)
				continue;
			wrmsr(p4_ctrs[i].pc_caddr, cfgs[i]->p4_rawpic);
			wrmsr(p4_escrs[cfgs[i]->p4_escr_ndx].pe_addr,
			    (uint64_t)cfgs[i]->p4_escr);
		}

		for (i = 0; i < 18; i++) {
			if (cfgs[i] == NULL)
				continue;
			wrmsr(p4_ctrs[i].pc_ctladdr,
			    (uint64_t)cfgs[i]->p4_cccr);
		}
	}
}

static void
p4_pcbe_allstop(void)
{
	int		i;

	for (i = 0; i < 18; i++)
		wrmsr(p4_ctrs[i].pc_ctladdr, 0ULL);

	setcr4(getcr4() & ~CR4_PCE);
}

static void
p4_pcbe_sample(void *token)
{
	p4_pcbe_config_t	*cfgs[18];
	uint64_t		*addrs[18];
	uint64_t		curpic[18];
	int64_t			diff;
	int			i;

	for (i = 0; i < 18; i++)
		curpic[i] = rdmsr(p4_ctrs[i].pc_caddr);

	build_cfgs(cfgs, addrs, token);

	for (i = 0; i < 18; i++) {
		if (cfgs[i] == NULL)
			continue;
		diff = curpic[i] - cfgs[i]->p4_rawpic;
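		/*
		 * The counters are only 40 bits wide; if the counter wrapped
		 * since the last sample, account for the wraparound.
		 */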
		if (diff < 0)
			diff += (1ll << 40);
		*addrs[i] += diff;
		DTRACE_PROBE4(p4__pcbe__sample, int, i, uint64_t, *addrs[i],
		    uint64_t, curpic[i], uint64_t, cfgs[i]->p4_rawpic);
		cfgs[i]->p4_rawpic = *addrs[i] & MASK40;
	}
}

static void
p4_pcbe_free(void *config)
{
	kmem_free(config, sizeof (p4_pcbe_config_t));
}

static struct modlpcbe modlpcbe = {
	&mod_pcbeops,
	"Pentium 4 Performance Counters v%I%",
	&p4_pcbe_ops
};

static struct modlinkage modl = {
	MODREV_1,
	&modlpcbe,
};

int
_init(void)
{
	if (p4_pcbe_init() != 0)
		return (ENOTSUP);
	return (mod_install(&modl));
}

int
_fini(void)
{
	return (mod_remove(&modl));
}

int
_info(struct modinfo *mi)
{
	return (mod_info(&modl, mi));
}