opteron_pcbe.c revision 31aa620247ae407b2bee2dccd71693d1938f54d6
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * This file contains preset event names from the Performance Application
28 * Programming Interface v3.5 which included the following notice:
29 *
30 *                             Copyright (c) 2005,6
31 *                           Innovative Computing Labs
32 *                         Computer Science Department,
33 *                            University of Tennessee,
34 *                                 Knoxville, TN.
35 *                              All Rights Reserved.
36 *
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions are met:
40 *
41 *    * Redistributions of source code must retain the above copyright notice,
42 *      this list of conditions and the following disclaimer.
43 *    * Redistributions in binary form must reproduce the above copyright
44 *	notice, this list of conditions and the following disclaimer in the
45 *	documentation and/or other materials provided with the distribution.
46 *    * Neither the name of the University of Tennessee nor the names of its
47 *      contributors may be used to endorse or promote products derived from
48 *	this software without specific prior written permission.
49 *
50 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
51 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
54 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
55 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
56 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
57 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
58 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
59 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
60 * POSSIBILITY OF SUCH DAMAGE.
61 *
62 *
63 * This open source software license conforms to the BSD License template.
64 */
65
66/*
67 * Portions Copyright 2009 Advanced Micro Devices, Inc.
68 * Copyright 2019 Joyent, Inc.
69 */
70
71/*
72 * Performance Counter Back-End for AMD Opteron and AMD Athlon 64 processors.
73 */
74
75#include <sys/cpuvar.h>
76#include <sys/param.h>
77#include <sys/systm.h>
78#include <sys/cpc_pcbe.h>
79#include <sys/kmem.h>
80#include <sys/sdt.h>
81#include <sys/modctl.h>
82#include <sys/errno.h>
83#include <sys/debug.h>
84#include <sys/archsystm.h>
85#include <sys/x86_archext.h>
86#include <sys/privregs.h>
87#include <sys/ddi.h>
88#include <sys/sunddi.h>
89
90#include "opteron_pcbe_table.h"
91#include <opteron_pcbe_cpcgen.h>
92
/*
 * Forward declarations for this backend's routines. opt_pcbe_init() is called
 * from _init(); the remaining routines are installed in opt_pcbe_ops.
 */
static int opt_pcbe_init(void);
static uint_t opt_pcbe_ncounters(void);
static const char *opt_pcbe_impl_name(void);
static const char *opt_pcbe_cpuref(void);
static char *opt_pcbe_list_events(uint_t picnum);
static char *opt_pcbe_list_attrs(void);
static uint64_t opt_pcbe_event_coverage(char *event);
static uint64_t opt_pcbe_overflow_bitmap(void);
static int opt_pcbe_configure(uint_t picnum, char *event, uint64_t preset,
    uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
    void *token);
static void opt_pcbe_program(void *token);
static void opt_pcbe_allstop(void);
static void opt_pcbe_sample(void *token);
static void opt_pcbe_free(void *config);
108
/*
 * Operations vector for this backend. It is handed to the module framework
 * through modlpcbe. The entries are positional, so their order must match the
 * layout of pcbe_ops_t.
 */
static pcbe_ops_t opt_pcbe_ops = {
	PCBE_VER_1,
	CPC_CAP_OVERFLOW_INTERRUPT,
	opt_pcbe_ncounters,
	opt_pcbe_impl_name,
	opt_pcbe_cpuref,
	opt_pcbe_list_events,
	opt_pcbe_list_attrs,
	opt_pcbe_event_coverage,
	opt_pcbe_overflow_bitmap,
	opt_pcbe_configure,
	opt_pcbe_program,
	opt_pcbe_allstop,
	opt_pcbe_sample,
	opt_pcbe_free
};
125
/*
 * Base MSR addresses for the default (non-extended) PerfEvtSel registers and
 * the counters themselves. Each bank is contiguous: add the counter number to
 * the base address to get the corresponding MSR address.
 */
#define	PES_BASE_ADDR	0xC0010000
#define	PIC_BASE_ADDR	0xC0010004

/*
 * Base MSR addresses for the extended PerfEvtSel registers and counters, used
 * when the X86FSET_AMD_PCEC feature is present. The counter and event select
 * registers are interleaved, so one needs to multiply the counter number by
 * two and add it to the base address to get the corresponding MSR address.
 */
#define	PES_EXT_BASE_ADDR	0xC0010200
#define	PIC_EXT_BASE_ADDR	0xC0010201
140
/*
 * The number of counters present depends on which CPU features are present:
 * the extended count applies when X86FSET_AMD_PCEC is set, the default count
 * otherwise (see opt_pcbe_init()).
 */
#define	OPT_PCBE_DEF_NCOUNTERS	4
#define	OPT_PCBE_EXT_NCOUNTERS	6
146
/*
 * Define offsets and masks for the fields in the Performance
 * Event-Select (PES) registers.
 *
 * The *_SHIFT values are bit positions within the 64-bit register. The
 * *_MASK values are the unshifted maximum for the corresponding field and
 * are used to range-check the "cmask" and "umask" attributes.
 */
#define	OPT_PES_HOST_SHIFT	41
#define	OPT_PES_GUEST_SHIFT	40
#define	OPT_PES_EVSELHI_SHIFT	32
#define	OPT_PES_CMASK_SHIFT	24
#define	OPT_PES_CMASK_MASK	0xFF
#define	OPT_PES_INV_SHIFT	23
#define	OPT_PES_ENABLE_SHIFT	22
#define	OPT_PES_INT_SHIFT	20
#define	OPT_PES_PC_SHIFT	19
#define	OPT_PES_EDGE_SHIFT	18
#define	OPT_PES_OS_SHIFT	17
#define	OPT_PES_USR_SHIFT	16
#define	OPT_PES_UMASK_SHIFT	8
#define	OPT_PES_UMASK_MASK	0xFF

/* Single-bit flags, pre-shifted into their register positions. */
#define	OPT_PES_INV		(1ULL << OPT_PES_INV_SHIFT)
#define	OPT_PES_ENABLE		(1ULL << OPT_PES_ENABLE_SHIFT)
#define	OPT_PES_INT		(1ULL << OPT_PES_INT_SHIFT)
#define	OPT_PES_PC		(1ULL << OPT_PES_PC_SHIFT)
#define	OPT_PES_EDGE		(1ULL << OPT_PES_EDGE_SHIFT)
#define	OPT_PES_OS		(1ULL << OPT_PES_OS_SHIFT)
#define	OPT_PES_USR		(1ULL << OPT_PES_USR_SHIFT)
#define	OPT_PES_HOST		(1ULL << OPT_PES_HOST_SHIFT)
#define	OPT_PES_GUEST		(1ULL << OPT_PES_GUEST_SHIFT)
175
/*
 * Configuration for a single counter. Instances are allocated by
 * opt_pcbe_configure() and released by opt_pcbe_free().
 */
typedef struct _opt_pcbe_config {
	uint8_t		opt_picno;	/* Counter number (0-based) */
	uint64_t	opt_evsel;	/* Event Selection register */
	uint64_t	opt_rawpic;	/* Raw counter value */
} opt_pcbe_config_t;
181
/*
 * Placeholder configurations, one per possible counter, with a zero event
 * select and a zero counter value. opt_pcbe_program() uses these for any
 * counter that has no configuration of its own.
 */
opt_pcbe_config_t nullcfgs[OPT_PCBE_EXT_NCOUNTERS] = {
	{ 0, 0, 0 },
	{ 1, 0, 0 },
	{ 2, 0, 0 },
	{ 3, 0, 0 },
	{ 4, 0, 0 },
	{ 5, 0, 0 },
};
190
/* Maps a counter number to the address of one of its MSRs. */
typedef uint64_t (*opt_pcbe_addr_f)(uint_t);

/*
 * Per-system counter layout, filled in by opt_pcbe_init().
 */
typedef struct opt_pcbe_data {
	uint_t		opd_ncounters;	/* number of counters in use */
	uint_t		opd_cmask;	/* bitmask with one bit per counter */
	opt_pcbe_addr_f	opd_pesf;	/* counter -> event select MSR */
	opt_pcbe_addr_f	opd_picf;	/* counter -> counter MSR */
} opt_pcbe_data_t;

opt_pcbe_data_t opd;
201
/* Low 48 bits; applied to counter presets and to sampled counter deltas. */
#define	MASK48		0xFFFFFFFFFFFF

/* Terminating entries for the event and generic event tables. */
#define	EV_END {NULL, 0}
#define	GEN_EV_END {NULL, NULL, 0 }
206
207/*
208 * The following Macros are used to define tables of events that are used by
209 * various families and some generic classes of events.
210 *
211 * When programming a performance counter there are two different values that we
212 * need to set:
213 *
214 *   o Event - Determines the general class of event that is being used.
215 *   o Unit  - A further breakdown that gives more specific value.
216 *
217 * Prior to the introduction of family 17h support, all family specific events
218 * were programmed based on their event. The generic events, which tried to
219 * provide PAPI mappings to events specified an additional unit mask.
220 *
221 * Starting with Family 17h, CPU performance counters default to using both the
222 * unit mask and the event select. Generic events are always aliases to a
223 * specific event/unit pair, hence why the units for them are always zero. In
224 * addition, the naming of events in family 17h has been changed to reflect
225 * AMD's guide. While this is a departure from what people are used to, it is
226 * believed that matching the more detailed literature that folks are told to
227 * reference is more valuable.
228 */
229
/*
 * Events shared by the family 0Fh, 10h and 11h tables. Each entry pairs an
 * event name with its event select code.
 */
#define	AMD_cmn_events						\
	{ "FP_dispatched_fpu_ops",			0x0 },	\
	{ "FP_cycles_no_fpu_ops_retired",		0x1 },	\
	{ "FP_dispatched_fpu_ops_ff",			0x2 },	\
	{ "LS_seg_reg_load",				0x20 },	\
	{ "LS_uarch_resync_self_modify",		0x21 },	\
	{ "LS_uarch_resync_snoop",			0x22 },	\
	{ "LS_buffer_2_full",				0x23 },	\
	{ "LS_locked_operation",			0x24 },	\
	{ "LS_retired_cflush",				0x26 },	\
	{ "LS_retired_cpuid",				0x27 },	\
	{ "DC_access",					0x40 },	\
	{ "DC_miss",					0x41 },	\
	{ "DC_refill_from_L2",				0x42 },	\
	{ "DC_refill_from_system",			0x43 },	\
	{ "DC_copyback",				0x44 },	\
	{ "DC_dtlb_L1_miss_L2_hit",			0x45 },	\
	{ "DC_dtlb_L1_miss_L2_miss",			0x46 },	\
	{ "DC_misaligned_data_ref",			0x47 },	\
	{ "DC_uarch_late_cancel_access",		0x48 },	\
	{ "DC_uarch_early_cancel_access",		0x49 },	\
	{ "DC_1bit_ecc_error_found",			0x4A },	\
	{ "DC_dispatched_prefetch_instr",		0x4B },	\
	{ "DC_dcache_accesses_by_locks",		0x4C },	\
	{ "BU_memory_requests",				0x65 },	\
	{ "BU_data_prefetch",				0x67 },	\
	{ "BU_system_read_responses",			0x6C },	\
	{ "BU_cpu_clk_unhalted",			0x76 },	\
	{ "BU_internal_L2_req",				0x7D },	\
	{ "BU_fill_req_missed_L2",			0x7E },	\
	{ "BU_fill_into_L2",				0x7F },	\
	{ "IC_fetch",					0x80 },	\
	{ "IC_miss",					0x81 },	\
	{ "IC_refill_from_L2",				0x82 },	\
	{ "IC_refill_from_system",			0x83 },	\
	{ "IC_itlb_L1_miss_L2_hit",			0x84 },	\
	{ "IC_itlb_L1_miss_L2_miss",			0x85 },	\
	{ "IC_uarch_resync_snoop",			0x86 },	\
	{ "IC_instr_fetch_stall",			0x87 },	\
	{ "IC_return_stack_hit",			0x88 },	\
	{ "IC_return_stack_overflow",			0x89 },	\
	{ "FR_retired_x86_instr_w_excp_intr",		0xC0 },	\
	{ "FR_retired_uops",				0xC1 },	\
	{ "FR_retired_branches_w_excp_intr",		0xC2 },	\
	{ "FR_retired_branches_mispred",		0xC3 },	\
	{ "FR_retired_taken_branches",			0xC4 },	\
	{ "FR_retired_taken_branches_mispred",		0xC5 },	\
	{ "FR_retired_far_ctl_transfer",		0xC6 },	\
	{ "FR_retired_resyncs",				0xC7 },	\
	{ "FR_retired_near_rets",			0xC8 },	\
	{ "FR_retired_near_rets_mispred",		0xC9 },	\
	{ "FR_retired_taken_branches_mispred_addr_miscomp",	0xCA },\
	{ "FR_retired_fastpath_double_op_instr",	0xCC },	\
	{ "FR_intr_masked_cycles",			0xCD },	\
	{ "FR_intr_masked_while_pending_cycles",	0xCE },	\
	{ "FR_taken_hardware_intrs",			0xCF },	\
	{ "FR_nothing_to_dispatch",			0xD0 },	\
	{ "FR_dispatch_stalls",				0xD1 },	\
	{ "FR_dispatch_stall_branch_abort_to_retire",	0xD2 },	\
	{ "FR_dispatch_stall_serialization",		0xD3 },	\
	{ "FR_dispatch_stall_segment_load",		0xD4 },	\
	{ "FR_dispatch_stall_reorder_buffer_full",	0xD5 },	\
	{ "FR_dispatch_stall_resv_stations_full",	0xD6 },	\
	{ "FR_dispatch_stall_fpu_full",			0xD7 },	\
	{ "FR_dispatch_stall_ls_full",			0xD8 },	\
	{ "FR_dispatch_stall_waiting_all_quiet",	0xD9 },	\
	{ "FR_dispatch_stall_far_ctl_trsfr_resync_branch_pend",	0xDA },\
	{ "FR_fpu_exception",				0xDB },	\
	{ "FR_num_brkpts_dr0",				0xDC },	\
	{ "FR_num_brkpts_dr1",				0xDD },	\
	{ "FR_num_brkpts_dr2",				0xDE },	\
	{ "FR_num_brkpts_dr3",				0xDF },	\
	{ "NB_mem_ctrlr_page_access",			0xE0 },	\
	{ "NB_mem_ctrlr_turnaround",			0xE3 },	\
	{ "NB_mem_ctrlr_bypass_counter_saturation",	0xE4 },	\
	{ "NB_cpu_io_to_mem_io",			0xE9 },	\
	{ "NB_cache_block_commands",			0xEA },	\
	{ "NB_sized_commands",				0xEB },	\
	{ "NB_ht_bus0_bandwidth",			0xF6 }
309
/* Events specific to family 0Fh; appended to AMD_cmn_events. */
#define	AMD_FAMILY_f_events					\
	{ "BU_quadwords_written_to_system",		0x6D },	\
	{ "FR_retired_fpu_instr",			0xCB },	\
	{ "NB_mem_ctrlr_page_table_overflow",		0xE1 },	\
	{ "NB_sized_blocks",				0xE5 },	\
	{ "NB_ECC_errors",				0xE8 },	\
	{ "NB_probe_result",				0xEC },	\
	{ "NB_gart_events",				0xEE },	\
	{ "NB_ht_bus1_bandwidth",			0xF7 },	\
	{ "NB_ht_bus2_bandwidth",			0xF8 }
320
/*
 * Events specific to family 10h; appended to AMD_cmn_events. Codes above 0xFF
 * carry the extended event select bits (mask 0x0f00), which
 * opt_pcbe_configure() places in the upper part of the register.
 */
#define	AMD_FAMILY_10h_events					\
	{ "FP_retired_sse_ops",				0x3 },	\
	{ "FP_retired_move_ops",			0x4 },	\
	{ "FP_retired_serialize_ops",			0x5 },	\
	{ "FP_serialize_ops_cycles",			0x6 },	\
	{ "LS_cancelled_store_to_load_fwd_ops",		0x2A },	\
	{ "LS_smi_received",				0x2B },	\
	{ "DC_dtlb_L1_hit",				0x4D },	\
	{ "LS_ineffective_prefetch",			0x52 },	\
	{ "LS_global_tlb_flush",			0x54 },	\
	{ "BU_octwords_written_to_system",		0x6D },	\
	{ "Page_size_mismatches",			0x165 },	\
	{ "IC_eviction",				0x8B },	\
	{ "IC_cache_lines_invalidate",			0x8C },	\
	{ "IC_itlb_reload",				0x99 },	\
	{ "IC_itlb_reload_aborted",			0x9A },	\
	{ "FR_retired_mmx_sse_fp_instr",		0xCB },	\
	{ "Retired_x87_fp_ops",				0x1C0 },	\
	{ "IBS_ops_tagged",				0x1CF },	\
	{ "LFENCE_inst_retired",			0x1D3 },	\
	{ "SFENCE_inst_retired",			0x1D4 },	\
	{ "MFENCE_inst_retired",			0x1D5 },	\
	{ "NB_mem_ctrlr_page_table_overflow",		0xE1 },	\
	{ "NB_mem_ctrlr_dram_cmd_slots_missed",		0xE2 },	\
	{ "NB_thermal_status",				0xE8 },	\
	{ "NB_probe_results_upstream_req",		0xEC },	\
	{ "NB_gart_events",				0xEE },	\
	{ "NB_mem_ctrlr_req",				0x1F0 },	\
	{ "CB_cpu_to_dram_req_to_target",		0x1E0 },	\
	{ "CB_io_to_dram_req_to_target",		0x1E1 },	\
	{ "CB_cpu_read_cmd_latency_to_target_0_to_3",	0x1E2 },	\
	{ "CB_cpu_read_cmd_req_to_target_0_to_3",	0x1E3 },	\
	{ "CB_cpu_read_cmd_latency_to_target_4_to_7",	0x1E4 },	\
	{ "CB_cpu_read_cmd_req_to_target_4_to_7",	0x1E5 },	\
	{ "CB_cpu_cmd_latency_to_target_0_to_7",	0x1E6 },	\
	{ "CB_cpu_req_to_target_0_to_7",		0x1E7 },	\
	{ "NB_ht_bus1_bandwidth",			0xF7 },	\
	{ "NB_ht_bus2_bandwidth",			0xF8 },	\
	{ "NB_ht_bus3_bandwidth",			0x1F9 },	\
	{ "L3_read_req",				0x4E0 },	\
	{ "L3_miss",					0x4E1 },	\
	{ "L3_l2_eviction_l3_fill",			0x4E2 },	\
	{ "L3_eviction",				0x4E3 }
364
/* Events specific to family 11h; appended to AMD_cmn_events. */
#define	AMD_FAMILY_11h_events					\
	{ "BU_quadwords_written_to_system",		0x6D },	\
	{ "FR_retired_mmx_fp_instr",			0xCB },	\
	{ "NB_mem_ctrlr_page_table_events",		0xE1 },	\
	{ "NB_thermal_status",				0xE8 },	\
	{ "NB_probe_results_upstream_req",		0xEC },	\
	{ "NB_dev_events",				0xEE },	\
	{ "NB_mem_ctrlr_req",				0x1F0 }
373
/*
 * Generic (PAPI) events common to the pre-17h families. Each entry gives the
 * generic name, the family-specific event it aliases, and the unit mask that
 * opt_pcbe_configure() ORs into the event select for that alias.
 */
#define	AMD_cmn_generic_events						\
	{ "PAPI_br_ins",	"FR_retired_branches_w_excp_intr", 0x0 },\
	{ "PAPI_br_msp",	"FR_retired_branches_mispred",	0x0 },	\
	{ "PAPI_br_tkn",	"FR_retired_taken_branches",	0x0 },	\
	{ "PAPI_fp_ops",	"FP_dispatched_fpu_ops",	0x3 },	\
	{ "PAPI_fad_ins",	"FP_dispatched_fpu_ops",	0x1 },	\
	{ "PAPI_fml_ins",	"FP_dispatched_fpu_ops",	0x2 },	\
	{ "PAPI_fpu_idl",	"FP_cycles_no_fpu_ops_retired",	0x0 },	\
	{ "PAPI_tot_cyc",	"BU_cpu_clk_unhalted",		0x0 },	\
	{ "PAPI_tot_ins",	"FR_retired_x86_instr_w_excp_intr", 0x0 }, \
	{ "PAPI_l1_dca",	"DC_access",			0x0 },	\
	{ "PAPI_l1_dcm",	"DC_miss",			0x0 },	\
	{ "PAPI_l1_ldm",	"DC_refill_from_L2",		0xe },	\
	{ "PAPI_l1_stm",	"DC_refill_from_L2",		0x10 },	\
	{ "PAPI_l1_ica",	"IC_fetch",			0x0 },	\
	{ "PAPI_l1_icm",	"IC_miss",			0x0 },	\
	{ "PAPI_l1_icr",	"IC_fetch",			0x0 },	\
	{ "PAPI_l2_dch",	"DC_refill_from_L2",		0x1e },	\
	{ "PAPI_l2_dcm",	"DC_refill_from_system",	0x1e },	\
	{ "PAPI_l2_dcr",	"DC_refill_from_L2",		0xe },	\
	{ "PAPI_l2_dcw",	"DC_refill_from_L2",		0x10 },	\
	{ "PAPI_l2_ich",	"IC_refill_from_L2",		0x0 },	\
	{ "PAPI_l2_icm",	"IC_refill_from_system",	0x0 },	\
	{ "PAPI_l2_ldm",	"DC_refill_from_system",	0xe },	\
	{ "PAPI_l2_stm",	"DC_refill_from_system",	0x10 },	\
	{ "PAPI_res_stl",	"FR_dispatch_stalls",		0x0 },	\
	{ "PAPI_stl_icy",	"FR_nothing_to_dispatch",	0x0 },	\
	{ "PAPI_hw_int",	"FR_taken_hardware_intrs",	0x0 }
402
/*
 * Additional generic events for the opt_generic_events table, which
 * opt_pcbe_init() selects for families 0Fh and 11h.
 */
#define	OPT_cmn_generic_events						\
	{ "PAPI_tlb_dm",	"DC_dtlb_L1_miss_L2_miss",	0x0 },	\
	{ "PAPI_tlb_im",	"IC_itlb_L1_miss_L2_miss",	0x0 },	\
	{ "PAPI_fp_ins",	"FR_retired_fpu_instr",		0xd },	\
	{ "PAPI_vec_ins",	"FR_retired_fpu_instr",		0x4 }
408
/* Additional generic events for the family 10h generic table. */
#define	AMD_FAMILY_10h_generic_events					\
	{ "PAPI_tlb_dm",	"DC_dtlb_L1_miss_L2_miss",	0x7 },	\
	{ "PAPI_tlb_im",	"IC_itlb_L1_miss_L2_miss",	0x3 },	\
	{ "PAPI_l3_dcr",	"L3_read_req",			0xf1 }, \
	{ "PAPI_l3_icr",	"L3_read_req",			0xf2 }, \
	{ "PAPI_l3_tcr",	"L3_read_req",			0xf7 }, \
	{ "PAPI_l3_stm",	"L3_miss",			0xf4 }, \
	{ "PAPI_l3_ldm",	"L3_miss",			0xf3 }, \
	{ "PAPI_l3_tcm",	"L3_miss",			0xf7 }
418
/*
 * Per-family event tables: the common events followed by the family's own
 * additions. Each table ends with EV_END, whose NULL name stops the lookup
 * and listing loops.
 */
static const amd_event_t family_f_events[] = {
	AMD_cmn_events,
	AMD_FAMILY_f_events,
	EV_END
};

static const amd_event_t family_10h_events[] = {
	AMD_cmn_events,
	AMD_FAMILY_10h_events,
	EV_END
};

static const amd_event_t family_11h_events[] = {
	AMD_cmn_events,
	AMD_FAMILY_11h_events,
	EV_END
};
436
/*
 * Generic (PAPI) event tables for the pre-17h families. opt_generic_events is
 * used for families 0Fh and 11h, family_10h_generic_events for family 10h.
 * Each table ends with GEN_EV_END.
 */
static const amd_generic_event_t opt_generic_events[] = {
	AMD_cmn_generic_events,
	OPT_cmn_generic_events,
	GEN_EV_END
};

static const amd_generic_event_t family_10h_generic_events[] = {
	AMD_cmn_generic_events,
	AMD_FAMILY_10h_generic_events,
	GEN_EV_END
};
448
449/*
450 * For Family 17h, the cpcgen utility generates all of our events including ones
451 * that need specific unit codes, therefore we leave all unit codes out of
452 * these. Zen 1 and Zen 2 have different event sets that they support.
453 */
454static const amd_generic_event_t family_17h_zen1_papi_events[] = {
455	{ "PAPI_br_cn",		"ExRetCond" },
456	{ "PAPI_br_ins",	"ExRetBrn" },
457	{ "PAPI_fpu_idl",	"FpSchedEmpty" },
458	{ "PAPI_tot_cyc",	"LsNotHaltedCyc" },
459	{ "PAPI_tot_ins",	"ExRetInstr" },
460	{ "PAPI_tlb_dm",	"LsL1DTlbMiss" },
461	{ "PAPI_tlb_im",	"BpL1TlbMissL2Miss" },
462	{ "PAPI_tot_cyc",	"LsNotHaltedCyc" },
463	GEN_EV_END
464};
465
466static const amd_generic_event_t family_17h_zen2_papi_events[] = {
467	{ "PAPI_br_cn",		"ExRetCond" },
468	{ "PAPI_br_ins",	"ExRetBrn" },
469	{ "PAPI_tot_cyc",	"LsNotHaltedCyc" },
470	{ "PAPI_tot_ins",	"ExRetInstr" },
471	{ "PAPI_tlb_dm",	"LsL1DTlbMiss" },
472	{ "PAPI_tlb_im",	"BpL1TlbMissL2Miss" },
473	{ "PAPI_tot_cyc",	"LsNotHaltedCyc" },
474	GEN_EV_END
475};
476
477
/*
 * Module state established by opt_pcbe_init().
 *
 * evlist is the comma-separated list of all event names returned by
 * opt_pcbe_list_events(). It is allocated with evlist_sz + 1 bytes, and the
 * same size is used when it is freed.
 */
static char	*evlist;
static size_t	evlist_sz;
/* Event tables selected for the running CPU's family and model. */
static const amd_event_t *amd_events = NULL;
static uint_t amd_family, amd_model;
static const amd_generic_event_t *amd_generic_events = NULL;
483
/*
 * Reference-documentation strings. opt_pcbe_init() points amd_pcbe_cpuref at
 * the one matching the running processor, and opt_pcbe_cpuref() returns it.
 */
static char amd_fam_f_rev_ae_bkdg[] = "See \"BIOS and Kernel Developer's "
"Guide for AMD Athlon 64 and AMD Opteron Processors\" (AMD publication 26094)";
static char amd_fam_f_NPT_bkdg[] = "See \"BIOS and Kernel Developer's Guide "
"for AMD NPT Family 0Fh Processors\" (AMD publication 32559)";
static char amd_fam_10h_bkdg[] = "See \"BIOS and Kernel Developer's Guide "
"(BKDG) For AMD Family 10h Processors\" (AMD publication 31116)";
static char amd_fam_11h_bkdg[] = "See \"BIOS and Kernel Developer's Guide "
"(BKDG) For AMD Family 11h Processors\" (AMD publication 41256)";
static char amd_fam_17h_zen1_reg[] = "See \"Open-Source Register Reference For "
"AMD Family 17h Processors Models 00h-2Fh\" (AMD publication 56255) and "
"amd_f17h_zen1_events(3CPC)";
static char amd_fam_17h_zen2_reg[] = "See \"Preliminary Processor Programming "
"Reference (PPR) for AMD Family 17h Model 31h, Revision B0 Processors\" "
"(AMD publication 55803), \"Processor Programming Reference (PPR) for AMD "
"Family 17h Model 71h, Revision B0 Processors\" (AMD publication 56176), and "
"amd_f17h_zen2_events(3CPC)";

/* Implementation name, formatted by opt_pcbe_init(). */
static char amd_pcbe_impl_name[64];
/* Reference string selected by opt_pcbe_init(). */
static char *amd_pcbe_cpuref;
503
504
/*
 * Extract bits [u:l] (inclusive) of v, shifted down to bit 0. The mask is
 * built from the int constant 1, so the field width (1 + u - l) must be
 * smaller than the width of an int.
 */
#define	BITS(v, u, l)   \
	(((v) >> (l)) & ((1 << (1 + (u) - (l))) - 1))
507
508static uint64_t
509opt_pcbe_pes_addr(uint_t counter)
510{
511	ASSERT3U(counter, <, opd.opd_ncounters);
512	return (PES_BASE_ADDR + counter);
513}
514
515static uint64_t
516opt_pcbe_pes_ext_addr(uint_t counter)
517{
518	ASSERT3U(counter, <, opd.opd_ncounters);
519	return (PES_EXT_BASE_ADDR + 2 * counter);
520}
521
522static uint64_t
523opt_pcbe_pic_addr(uint_t counter)
524{
525	ASSERT3U(counter, <, opd.opd_ncounters);
526	return (PIC_BASE_ADDR + 2 * counter);
527}
528
529static uint64_t
530opt_pcbe_pic_ext_addr(uint_t counter)
531{
532	ASSERT3U(counter, <, opd.opd_ncounters);
533	return (PIC_EXT_BASE_ADDR + 2 * counter);
534}
535
536static int
537opt_pcbe_init(void)
538{
539	const amd_event_t		*evp;
540	const amd_generic_event_t	*gevp;
541
542	amd_family = cpuid_getfamily(CPU);
543	amd_model = cpuid_getmodel(CPU);
544
545	/*
546	 * Make sure this really _is_ an Opteron or Athlon 64 system. The kernel
547	 * loads this module based on its name in the module directory, but it
548	 * could have been renamed.
549	 */
550	if (cpuid_getvendor(CPU) != X86_VENDOR_AMD || amd_family < 0xf)
551		return (-1);
552
553	if (amd_family == 0xf) {
554		/* Some tools expect this string for family 0fh */
555		(void) snprintf(amd_pcbe_impl_name, sizeof (amd_pcbe_impl_name),
556		    "AMD Opteron & Athlon64");
557	} else {
558		(void) snprintf(amd_pcbe_impl_name, sizeof (amd_pcbe_impl_name),
559		    "AMD Family %02xh", amd_family);
560	}
561
562	/*
563	 * Determine whether or not the extended counter set is supported on
564	 * this processor.
565	 */
566	if (is_x86_feature(x86_featureset, X86FSET_AMD_PCEC)) {
567		opd.opd_ncounters = OPT_PCBE_EXT_NCOUNTERS;
568		opd.opd_pesf = opt_pcbe_pes_ext_addr;
569		opd.opd_picf = opt_pcbe_pic_ext_addr;
570	} else {
571		opd.opd_ncounters = OPT_PCBE_DEF_NCOUNTERS;
572		opd.opd_pesf = opt_pcbe_pes_addr;
573		opd.opd_picf = opt_pcbe_pic_addr;
574	}
575	opd.opd_cmask = (1 << opd.opd_ncounters) - 1;
576
577	/*
578	 * Figure out processor revision here and assign appropriate
579	 * event configuration.
580	 */
581
582	if (amd_family == 0xf) {
583		uint32_t rev;
584
585		rev = cpuid_getchiprev(CPU);
586
587		if (X86_CHIPREV_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_F))
588			amd_pcbe_cpuref = amd_fam_f_NPT_bkdg;
589		else
590			amd_pcbe_cpuref = amd_fam_f_rev_ae_bkdg;
591		amd_events = family_f_events;
592		amd_generic_events = opt_generic_events;
593	} else if (amd_family == 0x10) {
594		amd_pcbe_cpuref = amd_fam_10h_bkdg;
595		amd_events = family_10h_events;
596		amd_generic_events = family_10h_generic_events;
597	} else if (amd_family == 0x11) {
598		amd_pcbe_cpuref = amd_fam_11h_bkdg;
599		amd_events = family_11h_events;
600		amd_generic_events = opt_generic_events;
601	} else if (amd_family == 0x17 && amd_model <= 0x2f) {
602		amd_pcbe_cpuref = amd_fam_17h_zen1_reg;
603		amd_events = opteron_pcbe_f17h_zen1_events;
604		amd_generic_events = family_17h_zen1_papi_events;
605	} else if (amd_family == 0x17 && amd_model >= 0x30 &&
606	    amd_model <= 0x7f) {
607		amd_pcbe_cpuref = amd_fam_17h_zen2_reg;
608		amd_events = opteron_pcbe_f17h_zen2_events;
609		amd_generic_events = family_17h_zen2_papi_events;
610	} else {
611		/*
612		 * Different families have different meanings on events and even
613		 * worse (like family 15h), different constraints around
614		 * programming these values.
615		 */
616		return (-1);
617	}
618
619	/*
620	 * Construct event list.
621	 *
622	 * First pass:  Calculate size needed. We'll need an additional byte
623	 *		for the NULL pointer during the last strcat.
624	 *
625	 * Second pass: Copy strings.
626	 */
627	for (evp = amd_events; evp->name != NULL; evp++)
628		evlist_sz += strlen(evp->name) + 1;
629
630	for (gevp = amd_generic_events; gevp->name != NULL; gevp++)
631		evlist_sz += strlen(gevp->name) + 1;
632
633	evlist = kmem_alloc(evlist_sz + 1, KM_SLEEP);
634	evlist[0] = '\0';
635
636	for (evp = amd_events; evp->name != NULL; evp++) {
637		(void) strcat(evlist, evp->name);
638		(void) strcat(evlist, ",");
639	}
640
641	for (gevp = amd_generic_events; gevp->name != NULL; gevp++) {
642		(void) strcat(evlist, gevp->name);
643		(void) strcat(evlist, ",");
644	}
645
646	/*
647	 * Remove trailing comma.
648	 */
649	evlist[evlist_sz - 1] = '\0';
650
651	return (0);
652}
653
654static uint_t
655opt_pcbe_ncounters(void)
656{
657	return (opd.opd_ncounters);
658}
659
660static const char *
661opt_pcbe_impl_name(void)
662{
663	return (amd_pcbe_impl_name);
664}
665
666static const char *
667opt_pcbe_cpuref(void)
668{
669
670	return (amd_pcbe_cpuref);
671}
672
673/*ARGSUSED*/
674static char *
675opt_pcbe_list_events(uint_t picnum)
676{
677	return (evlist);
678}
679
/*
 * Return the comma-separated list of attribute names that
 * opt_pcbe_configure() accepts.
 */
static char *
opt_pcbe_list_attrs(void)
{
	char *attr_names = "edge,pc,inv,cmask,umask";

	return (attr_names);
}
685
686static const amd_generic_event_t *
687find_generic_event(char *name)
688{
689	const amd_generic_event_t	*gevp;
690
691	for (gevp = amd_generic_events; gevp->name != NULL; gevp++)
692		if (strcmp(name, gevp->name) == 0)
693			return (gevp);
694
695	return (NULL);
696}
697
698static const amd_event_t *
699find_event(char *name)
700{
701	const amd_event_t	*evp;
702
703	for (evp = amd_events; evp->name != NULL; evp++)
704		if (strcmp(name, evp->name) == 0)
705			return (evp);
706
707	return (NULL);
708}
709
710/*ARGSUSED*/
711static uint64_t
712opt_pcbe_event_coverage(char *event)
713{
714	/*
715	 * Check whether counter event is supported
716	 */
717	if (find_event(event) == NULL && find_generic_event(event) == NULL)
718		return (0);
719
720	/*
721	 * Fortunately, all counters can count all events.
722	 */
723	return (opd.opd_cmask);
724}
725
726static uint64_t
727opt_pcbe_overflow_bitmap(void)
728{
729	/*
730	 * Unfortunately, this chip cannot detect which counter overflowed, so
731	 * we must act as if they all did.
732	 */
733	return (opd.opd_cmask);
734}
735
736/*ARGSUSED*/
737static int
738opt_pcbe_configure(uint_t picnum, char *event, uint64_t preset, uint32_t flags,
739    uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token)
740{
741	opt_pcbe_config_t		*cfg;
742	const amd_event_t		*evp;
743	amd_event_t			ev_raw = { "raw", 0};
744	const amd_generic_event_t	*gevp;
745	int				i;
746	uint64_t			evsel = 0, evsel_tmp = 0;
747
748	/*
749	 * If we've been handed an existing configuration, we need only preset
750	 * the counter value.
751	 */
752	if (*data != NULL) {
753		cfg = *data;
754		cfg->opt_rawpic = preset & MASK48;
755		return (0);
756	}
757
758	if (picnum >= opd.opd_ncounters)
759		return (CPC_INVALID_PICNUM);
760
761	if ((evp = find_event(event)) == NULL) {
762		if ((gevp = find_generic_event(event)) != NULL) {
763			evp = find_event(gevp->event);
764			ASSERT(evp != NULL);
765
766			if (nattrs > 0)
767				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
768
769			evsel |= gevp->umask << OPT_PES_UMASK_SHIFT;
770		} else {
771			long tmp;
772
773			/*
774			 * If ddi_strtol() likes this event, use it as a raw
775			 * event code.
776			 */
777			if (ddi_strtol(event, NULL, 0, &tmp) != 0)
778				return (CPC_INVALID_EVENT);
779
780			ev_raw.emask = tmp;
781			evp = &ev_raw;
782		}
783	}
784
785	/*
786	 * Configuration of EventSelect register. While on some families
787	 * certain bits might not be supported (e.g. Guest/Host on family
788	 * 11h), setting these bits is harmless
789	 */
790
791	/* Set GuestOnly bit to 0 and HostOnly bit to 1 */
792	evsel &= ~OPT_PES_HOST;
793	evsel &= ~OPT_PES_GUEST;
794
795	/* Set bits [35:32] for extended part of Event Select field */
796	evsel_tmp = evp->emask & 0x0f00;
797	evsel |= evsel_tmp << OPT_PES_EVSELHI_SHIFT;
798
799	evsel |= evp->emask & 0x00ff;
800	evsel |= evp->unit << OPT_PES_UMASK_SHIFT;
801
802	if (flags & CPC_COUNT_USER)
803		evsel |= OPT_PES_USR;
804	if (flags & CPC_COUNT_SYSTEM)
805		evsel |= OPT_PES_OS;
806	if (flags & CPC_OVF_NOTIFY_EMT)
807		evsel |= OPT_PES_INT;
808
809	for (i = 0; i < nattrs; i++) {
810		if (strcmp(attrs[i].ka_name, "edge") == 0) {
811			if (attrs[i].ka_val != 0)
812				evsel |= OPT_PES_EDGE;
813		} else if (strcmp(attrs[i].ka_name, "pc") == 0) {
814			if (attrs[i].ka_val != 0)
815				evsel |= OPT_PES_PC;
816		} else if (strcmp(attrs[i].ka_name, "inv") == 0) {
817			if (attrs[i].ka_val != 0)
818				evsel |= OPT_PES_INV;
819		} else if (strcmp(attrs[i].ka_name, "cmask") == 0) {
820			if ((attrs[i].ka_val | OPT_PES_CMASK_MASK) !=
821			    OPT_PES_CMASK_MASK)
822				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
823			evsel |= attrs[i].ka_val << OPT_PES_CMASK_SHIFT;
824		} else if (strcmp(attrs[i].ka_name, "umask") == 0) {
825			if ((attrs[i].ka_val | OPT_PES_UMASK_MASK) !=
826			    OPT_PES_UMASK_MASK)
827				return (CPC_ATTRIBUTE_OUT_OF_RANGE);
828			evsel |= attrs[i].ka_val << OPT_PES_UMASK_SHIFT;
829		} else
830			return (CPC_INVALID_ATTRIBUTE);
831	}
832
833	cfg = kmem_alloc(sizeof (*cfg), KM_SLEEP);
834
835	cfg->opt_picno = picnum;
836	cfg->opt_evsel = evsel;
837	cfg->opt_rawpic = preset & MASK48;
838
839	*data = cfg;
840	return (0);
841}
842
843static void
844opt_pcbe_program(void *token)
845{
846	opt_pcbe_config_t	*cfgs[OPT_PCBE_EXT_NCOUNTERS] = { &nullcfgs[0],
847						&nullcfgs[1], &nullcfgs[2],
848						&nullcfgs[3], &nullcfgs[4],
849						&nullcfgs[5] };
850	opt_pcbe_config_t	*pcfg = NULL;
851	int			i;
852	ulong_t			curcr4 = getcr4();
853
854	/*
855	 * Allow nonprivileged code to read the performance counters if desired.
856	 */
857	if (kcpc_allow_nonpriv(token))
858		setcr4(curcr4 | CR4_PCE);
859	else
860		setcr4(curcr4 & ~CR4_PCE);
861
862	/*
863	 * Query kernel for all configs which will be co-programmed.
864	 */
865	do {
866		pcfg = (opt_pcbe_config_t *)kcpc_next_config(token, pcfg, NULL);
867
868		if (pcfg != NULL) {
869			ASSERT(pcfg->opt_picno < opd.opd_ncounters);
870			cfgs[pcfg->opt_picno] = pcfg;
871		}
872	} while (pcfg != NULL);
873
874	/*
875	 * Program in two loops. The first configures and presets the counter,
876	 * and the second loop enables the counters. This ensures that the
877	 * counters are all enabled as closely together in time as possible.
878	 */
879
880	for (i = 0; i < opd.opd_ncounters; i++) {
881		wrmsr(opd.opd_pesf(i), cfgs[i]->opt_evsel);
882		wrmsr(opd.opd_picf(i), cfgs[i]->opt_rawpic);
883	}
884
885	for (i = 0; i < opd.opd_ncounters; i++) {
886		wrmsr(opd.opd_pesf(i), cfgs[i]->opt_evsel |
887		    (uint64_t)(uintptr_t)OPT_PES_ENABLE);
888	}
889}
890
891static void
892opt_pcbe_allstop(void)
893{
894	int		i;
895
896	for (i = 0; i < opd.opd_ncounters; i++)
897		wrmsr(opd.opd_pesf(i), 0ULL);
898
899	/*
900	 * Disable non-privileged access to the counter registers.
901	 */
902	setcr4(getcr4() & ~CR4_PCE);
903}
904
905static void
906opt_pcbe_sample(void *token)
907{
908	opt_pcbe_config_t	*cfgs[OPT_PCBE_EXT_NCOUNTERS] = { NULL, NULL,
909						NULL, NULL, NULL, NULL };
910	opt_pcbe_config_t	*pcfg = NULL;
911	int			i;
912	uint64_t		curpic[OPT_PCBE_EXT_NCOUNTERS];
913	uint64_t		*addrs[OPT_PCBE_EXT_NCOUNTERS];
914	uint64_t		*tmp;
915	int64_t			diff;
916
917	for (i = 0; i < opd.opd_ncounters; i++)
918		curpic[i] = rdmsr(opd.opd_picf(i));
919
920	/*
921	 * Query kernel for all configs which are co-programmed.
922	 */
923	do {
924		pcfg = (opt_pcbe_config_t *)kcpc_next_config(token, pcfg, &tmp);
925
926		if (pcfg != NULL) {
927			ASSERT3U(pcfg->opt_picno, <, opd.opd_ncounters);
928			cfgs[pcfg->opt_picno] = pcfg;
929			addrs[pcfg->opt_picno] = tmp;
930		}
931	} while (pcfg != NULL);
932
933	for (i = 0; i < opd.opd_ncounters; i++) {
934		if (cfgs[i] == NULL)
935			continue;
936
937		diff = (curpic[i] - cfgs[i]->opt_rawpic) & MASK48;
938		*addrs[i] += diff;
939		DTRACE_PROBE4(opt__pcbe__sample, int, i, uint64_t, *addrs[i],
940		    uint64_t, curpic[i], uint64_t, cfgs[i]->opt_rawpic);
941		cfgs[i]->opt_rawpic = *addrs[i] & MASK48;
942	}
943}
944
945static void
946opt_pcbe_free(void *config)
947{
948	kmem_free(config, sizeof (opt_pcbe_config_t));
949}
950
951
/*
 * Module linkage: ties this backend's operations vector (opt_pcbe_ops) to the
 * PCBE module type, under the description "AMD Performance Counters".
 */
static struct modlpcbe modlpcbe = {
	&mod_pcbeops,
	"AMD Performance Counters",
	&opt_pcbe_ops
};

static struct modlinkage modl = {
	MODREV_1,
	&modlpcbe,
};
962
963int
964_init(void)
965{
966	int ret;
967
968	if (opt_pcbe_init() != 0)
969		return (ENOTSUP);
970
971	if ((ret = mod_install(&modl)) != 0)
972		kmem_free(evlist, evlist_sz + 1);
973
974	return (ret);
975}
976
977int
978_fini(void)
979{
980	int ret;
981
982	if ((ret = mod_remove(&modl)) == 0)
983		kmem_free(evlist, evlist_sz + 1);
984	return (ret);
985}
986
987int
988_info(struct modinfo *mi)
989{
990	return (mod_info(&modl, mi));
991}
992