1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25/*
26 * Copyright 2019, Joyent, Inc.
27 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
28 */
29
30/*
31 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
32 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
33 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
34 * PSMI 1.5 extensions are supported in Solaris Nevada.
35 * PSMI 1.6 extensions are supported in Solaris Nevada.
36 * PSMI 1.7 extensions are supported in Solaris Nevada.
37 */
38#define	PSMI_1_7
39
40#include <sys/processor.h>
41#include <sys/time.h>
42#include <sys/psm.h>
43#include <sys/smp_impldefs.h>
44#include <sys/cram.h>
45#include <sys/acpi/acpi.h>
46#include <sys/acpica.h>
47#include <sys/psm_common.h>
48#include <sys/apic.h>
49#include <sys/pit.h>
50#include <sys/ddi.h>
51#include <sys/sunddi.h>
52#include <sys/ddi_impldefs.h>
53#include <sys/pci.h>
54#include <sys/promif.h>
55#include <sys/x86_archext.h>
56#include <sys/cpc_impl.h>
57#include <sys/uadmin.h>
58#include <sys/panic.h>
59#include <sys/debug.h>
60#include <sys/archsystm.h>
61#include <sys/trap.h>
62#include <sys/machsystm.h>
63#include <sys/sysmacros.h>
64#include <sys/cpuvar.h>
65#include <sys/rm_platter.h>
66#include <sys/privregs.h>
67#include <sys/note.h>
68#include <sys/pci_intr_lib.h>
69#include <sys/spl.h>
70#include <sys/clock.h>
71#include <sys/dditypes.h>
72#include <sys/sunddi.h>
73#include <sys/x_call.h>
74#include <sys/reboot.h>
75#include <sys/hpet.h>
76#include <sys/apic_common.h>
77#include <sys/apic_timer.h>
78
/*
 * Interrupt-remapping "record" hooks.  Both are installed in the default
 * apic_nointrmap_ops table later in this file; their definitions are not in
 * this part of the file.
 */
static void	apic_record_ioapic_rdt(void *intrmap_private,
		    ioapic_rdt_t *irdt);
static void	apic_record_msi(void *intrmap_private, msi_regs_t *mregs);

/*
 * Common routines between pcplusmp & apix (taken from apic.c).
 */

int	apic_clkinit(int);
hrtime_t apic_gethrtime(void);
void	apic_send_ipi(int, int);
void	apic_set_idlecpu(processorid_t);
void	apic_unset_idlecpu(processorid_t);
void	apic_shutdown(int, int);
void	apic_preshutdown(int, int);
processorid_t	apic_get_next_processorid(processorid_t);

/* Coarse time source; see the definition of apic_gettime() below. */
hrtime_t apic_gettime();
97
/*
 * IOAPIC EOI handling method for the apix module.  It is updated by
 * apic_ioapic_method_probe(); if it is still APIC_MUL_IOAPIC_PCPLUSMP after
 * probing, apix is disabled in favour of pcplusmp.
 */
enum apic_ioapic_method_type apix_mul_ioapic_method = APIC_MUL_IOAPIC_PCPLUSMP;

/* Now the ones for Dynamic Interrupt distribution */
int	apic_enable_dynamic_migration = 0;

/*
 * maximum loop count when sending Start IPIs.  apic_cpu_send_SIPI() polls
 * the ICR pending bit at most this many times after the INIT IPI.
 */
int apic_sipi_max_loop_count = 0x1000;

/*
 * These variables are frequently accessed in apic_intr_enter(),
 * apic_intr_exit and apic_setspl, so group them together
 */
volatile uint32_t *apicadr =  NULL;	/* virtual addr of local APIC	*/
int apic_setspl_delay = 1;		/* apic_setspl - delay enable	*/
int apic_clkvect;

/* vector at which error interrupts come in */
int apic_errvect;
int apic_enable_error_intr = 1;
/*
 * Number of tenmicrosec() delays apic_error_intr() performs after printing
 * an error.  The handler doubles it each time while it is below 500.
 */
int apic_error_display_delay = 100;

/* vector at which performance counter overflow interrupts come in */
int apic_cpcovf_vect;
int apic_enable_cpcovf_intr = 1;

/*
 * vector at which CMCI interrupts come in.  apic_cmci_enable() and
 * apic_cmci_disable() write it to the APIC_CMCI_VECT register.
 */
int apic_cmci_vect;
extern void cmi_cmci_trap(void);

lock_t apic_mode_switch_lock;

/* vector sent by apic_send_pir_ipi(); returned by apic_get_pir_ipivect() */
int apic_pir_vect;
130
/*
 * Patchable global variables.
 */
int	apic_forceload = 0;

int	apic_coarse_hrtime = 1;		/* 0 - use accurate slow gethrtime() */

int	apic_flat_model = 0;		/* 0 - clustered. 1 - flat */
/* nonzero - apic_nmi_intr() panics on NMI when nmi_action is unset */
int	apic_panic_on_nmi = 0;
/* nonzero - apic_error_intr() panics instead of just recording the error */
int	apic_panic_on_apic_error = 0;

int	apic_verbose = 0;	/* 0x1ff */

#ifdef DEBUG
/* nonzero - apic_error_intr() calls debug_enter() on an APIC error */
int	apic_debug = 0;
int	apic_restrict_vector = 0;

int	apic_debug_msgbuf[APIC_DEBUG_MSGBUFSIZE];
int	apic_debug_msgbufindex = 0;

#endif /* DEBUG */

uint_t apic_nticks = 0;
uint_t apic_skipped_redistribute = 0;

/*
 * State shared by apic_gettime() and apic_gethrtime().
 *
 * last_count_read	last timer count accepted by apic_gethrtime(); a
 *			larger count is treated as 0, and the value is reset
 *			to apic_hertz_count when a read has to be retried.
 * apic_gethrtime_lock	serializes apic_gethrtime() callers.
 * apic_hrtime_stamp	readers spin while its low bit is set and retry if it
 *			changes across a read of apic_nsec_since_boot.  The
 *			writer is not in this part of the file.
 * apic_nsec_since_boot	nanoseconds since boot as of the last clock interrupt.
 */
uint_t last_count_read = 0;
lock_t	apic_gethrtime_lock;
volatile int	apic_hrtime_stamp = 0;
volatile hrtime_t apic_nsec_since_boot = 0;

/* last value computed by apic_gethrtime(); used to keep it from going back */
static	hrtime_t	apic_last_hrtime = 0;
/* times apic_gethrtime() computed a value below apic_last_hrtime */
int		apic_hrtime_error = 0;
/* times the remote read of CPU 0's timer count came back invalid */
int		apic_remote_hrterr = 0;
/* NMIs handled by apic_nmi_intr() while holding apic_nmi_lock */
int		apic_num_nmis = 0;
/* accumulated (or'ed) APIC error status bits seen by apic_error_intr() */
int		apic_apic_error = 0;
/* APIC errors recorded by apic_error_intr(), checksum errors included */
int		apic_num_apic_errors = 0;
/* APIC errors whose status had no bits outside APIC_CS_ERRORS */
int		apic_num_cksum_errors = 0;

/* bitmask of APIC_ERR_* flags (APIC_ERR_APIC_ERROR, APIC_ERR_NMI) */
int	apic_error = 0;

/*
 * Set to 1 by apic_cpu_start() before the start IPI sequence, which writes
 * BIOS_SHUTDOWN to the CMOS SSB location.  Its consumer is not in this part
 * of the file.
 */
static	int	apic_cmos_ssb_set = 0;

/* use to make sure only one cpu handles the nmi */
lock_t	apic_nmi_lock;
/* use to make sure only one cpu handles the error interrupt */
lock_t	apic_error_lock;
177
/*
 * BMC watchdog command bytes for "aspen" systems.  Each entry pairs a
 * CC_SMS_* control value (cntl) with the data byte (data) that goes with it.
 * Per the per-entry comments the table holds two requests back to back:
 * SET_WATCHDOG_TIMER followed by RESET_WATCHDOG_TIMER.
 * NOTE(review): the code that walks this table is not in this part of the
 * file; presumably it is the shutdown path -- confirm.
 */
static	struct {
	uchar_t	cntl;
	uchar_t	data;
} aspen_bmc[] = {
	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_NEXT,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ CC_SMS_WR_NEXT,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ CC_SMS_WR_NEXT,	0x2 },		/* DataByte 2: Power Down */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 3: no pre-timeout */
	{ CC_SMS_WR_NEXT,	0x0 },		/* DataByte 4: timer expir. */
	{ CC_SMS_WR_NEXT,	0xa },		/* DataByte 5: init countdown */
	{ CC_SMS_WR_END,	0x0 },		/* DataByte 6: init countdown */

	{ CC_SMS_WR_START,	0x18 },		/* NetFn/LUN */
	{ CC_SMS_WR_END,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
194
/*
 * BMC watchdog command bytes for "sitka" systems.  Each entry names the SMS
 * register to write (port: SMS_COMMAND_REGISTER or SMS_DATA_REGISTER) and
 * the byte to write there (data).  As with aspen_bmc, the per-entry comments
 * describe a SET_WATCHDOG_TIMER request followed by a RESET_WATCHDOG_TIMER
 * request; here the start/end markers are separate command-register writes.
 * NOTE(review): the code that walks this table is not in this part of the
 * file -- confirm how "port" is used.
 */
static	struct {
	int	port;
	uchar_t	data;
} sitka_bmc[] = {
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_DATA_REGISTER,	0x24 },		/* Cmd SET_WATCHDOG_TIMER */
	{ SMS_DATA_REGISTER,	0x84 },		/* DataByte 1: SMS/OS no log */
	{ SMS_DATA_REGISTER,	0x2 },		/* DataByte 2: Power Down */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 3: no pre-timeout */
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 4: timer expir. */
	{ SMS_DATA_REGISTER,	0xa },		/* DataByte 5: init countdown */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x0 },		/* DataByte 6: init countdown */

	{ SMS_COMMAND_REGISTER,	SMS_WRITE_START },
	{ SMS_DATA_REGISTER,	0x18 },		/* NetFn/LUN */
	{ SMS_COMMAND_REGISTER,	SMS_WRITE_END },
	{ SMS_DATA_REGISTER,	0x22 }		/* Cmd RESET_WATCHDOG_TIMER */
};
215
/* Patchable global variables. */
int		apic_kmdb_on_nmi = 0;		/* 0 - no, 1 - yes enter kmdb */
/* written to APIC_DIVIDE_REG by apic_calibrate_impl() */
uint32_t	apic_divide_reg_init = 0;	/* 0 - divide by 2 */

/*
 * default apic ops without interrupt remapping
 *
 * The first five slots are stubbed with return_instr cast to the slot's
 * function type; only the two "record" hooks have real implementations.
 * NOTE(review): slot order must match the apic_intrmap_ops_t definition,
 * which is not visible here.
 */
static apic_intrmap_ops_t apic_nointrmap_ops = {
	(int (*)(int))return_instr,
	(void (*)(int))return_instr,
	(void (*)(void **, dev_info_t *, uint16_t, int, uchar_t))return_instr,
	(void (*)(void *, void *, uint16_t, int))return_instr,
	(void (*)(void **))return_instr,
	apic_record_ioapic_rdt,
	apic_record_msi,
};

/* active interrupt-remapping ops; defaults to the no-remapping table */
apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
/*
 * Per-CPU APIC information indexed by cpu id.  The code in this file uses
 * the aci_local_id, aci_local_ver, aci_processor_id and aci_status members.
 */
apic_cpus_info_t	*apic_cpus = NULL;
/* cpu ids whose apic_cpus[] slot is in use (see apic_cpu_add/remove) */
cpuset_t	apic_cpumask;
uint_t		apic_picinit_called;

/*
 * Flag to indicate that we need to shut down all processors.  While it is
 * set, apic_nmi_intr() only disables the local APIC and returns.
 */
static uint_t	apic_shutdown_processors;
238
239/*
240 * Probe the ioapic method for apix module. Called in apic_probe_common()
241 */
242int
243apic_ioapic_method_probe()
244{
245	if (apix_enable == 0)
246		return (PSM_SUCCESS);
247
248	/*
249	 * Set IOAPIC EOI handling method. The priority from low to high is:
250	 *	1. IOxAPIC: with EOI register
251	 *	2. IOMMU interrupt mapping
252	 *	3. Mask-Before-EOI method for systems without boot
253	 *	interrupt routing, such as systems with only one IOAPIC;
254	 *	NVIDIA CK8-04/MCP55 systems; systems with bridge solution
255	 *	which disables the boot interrupt routing already.
256	 *	4. Directed EOI
257	 */
258	if (apic_io_ver[0] >= 0x20)
259		apix_mul_ioapic_method = APIC_MUL_IOAPIC_IOXAPIC;
260	if ((apic_io_max == 1) || (apic_nvidia_io_max == apic_io_max))
261		apix_mul_ioapic_method = APIC_MUL_IOAPIC_MASK;
262	if (apic_directed_EOI_supported())
263		apix_mul_ioapic_method = APIC_MUL_IOAPIC_DEOI;
264
265	/* fall back to pcplusmp */
266	if (apix_mul_ioapic_method == APIC_MUL_IOAPIC_PCPLUSMP) {
267		/* make sure apix is after pcplusmp in /etc/mach */
268		apix_enable = 0; /* go ahead with pcplusmp install next */
269		return (PSM_FAILURE);
270	}
271
272	return (PSM_SUCCESS);
273}
274
275/*
276 * handler for APIC Error interrupt. Just print a warning and continue
277 */
278int
279apic_error_intr()
280{
281	uint_t	error0, error1, error;
282	uint_t	i;
283
284	/*
285	 * We need to write before read as per 7.4.17 of system prog manual.
286	 * We do both and or the results to be safe
287	 */
288	error0 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
289	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
290	error1 = apic_reg_ops->apic_read(APIC_ERROR_STATUS);
291	error = error0 | error1;
292
293	/*
294	 * Clear the APIC error status (do this on all cpus that enter here)
295	 * (two writes are required due to the semantics of accessing the
296	 * error status register.)
297	 */
298	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
299	apic_reg_ops->apic_write(APIC_ERROR_STATUS, 0);
300
301	/*
302	 * Prevent more than 1 CPU from handling error interrupt causing
303	 * double printing (interleave of characters from multiple
304	 * CPU's when using prom_printf)
305	 */
306	if (lock_try(&apic_error_lock) == 0)
307		return (error ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
308	if (error) {
309#if	DEBUG
310		if (apic_debug)
311			debug_enter("pcplusmp: APIC Error interrupt received");
312#endif /* DEBUG */
313		if (apic_panic_on_apic_error)
314			cmn_err(CE_PANIC,
315			    "APIC Error interrupt on CPU %d. Status = %x",
316			    psm_get_cpu_id(), error);
317		else {
318			if ((error & ~APIC_CS_ERRORS) == 0) {
319				/* cksum error only */
320				apic_error |= APIC_ERR_APIC_ERROR;
321				apic_apic_error |= error;
322				apic_num_apic_errors++;
323				apic_num_cksum_errors++;
324			} else {
325				/*
326				 * prom_printf is the best shot we have of
327				 * something which is problem free from
328				 * high level/NMI type of interrupts
329				 */
330				prom_printf("APIC Error interrupt on CPU %d. "
331				    "Status 0 = %x, Status 1 = %x\n",
332				    psm_get_cpu_id(), error0, error1);
333				apic_error |= APIC_ERR_APIC_ERROR;
334				apic_apic_error |= error;
335				apic_num_apic_errors++;
336				for (i = 0; i < apic_error_display_delay; i++) {
337					tenmicrosec();
338				}
339				/*
340				 * provide more delay next time limited to
341				 * roughly 1 clock tick time
342				 */
343				if (apic_error_display_delay < 500)
344					apic_error_display_delay *= 2;
345			}
346		}
347		lock_clear(&apic_error_lock);
348		return (DDI_INTR_CLAIMED);
349	} else {
350		lock_clear(&apic_error_lock);
351		return (DDI_INTR_UNCLAIMED);
352	}
353}
354
355/*
356 * Turn off the mask bit in the performance counter Local Vector Table entry.
357 */
358void
359apic_cpcovf_mask_clear(void)
360{
361	apic_reg_ops->apic_write(APIC_PCINT_VECT,
362	    (apic_reg_ops->apic_read(APIC_PCINT_VECT) & ~APIC_LVT_MASK));
363}
364
/*
 * Cross-call handler used by apic_cmci_setup(): writes apic_cmci_vect,
 * without AV_MASK or'ed in, to the APIC_CMCI_VECT register of the CPU it
 * runs on.  The three cross-call arguments are unused.  Always returns 0.
 */
/*ARGSUSED*/
static int
apic_cmci_enable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect);
	return (0);
}
372
/*
 * Cross-call handler used by apic_cmci_setup(): writes apic_cmci_vect with
 * AV_MASK set to the APIC_CMCI_VECT register of the CPU it runs on, masking
 * the CMCI interrupt there.  The three cross-call arguments are unused.
 * Always returns 0.
 */
/*ARGSUSED*/
static int
apic_cmci_disable(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
{
	apic_reg_ops->apic_write(APIC_CMCI_VECT, apic_cmci_vect | AV_MASK);
	return (0);
}
380
381void
382apic_cmci_setup(processorid_t cpuid, boolean_t enable)
383{
384	cpuset_t	cpu_set;
385
386	CPUSET_ONLY(cpu_set, cpuid);
387
388	if (enable) {
389		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
390		    (xc_func_t)apic_cmci_enable);
391	} else {
392		xc_call(0, 0, 0, CPUSET2BV(cpu_set),
393		    (xc_func_t)apic_cmci_disable);
394	}
395}
396
397static void
398apic_disable_local_apic(void)
399{
400	apic_reg_ops->apic_write_task_reg(APIC_MASK_ALL);
401	apic_reg_ops->apic_write(APIC_LOCAL_TIMER, AV_MASK);
402
403	/* local intr reg 0 */
404	apic_reg_ops->apic_write(APIC_INT_VECT0, AV_MASK);
405
406	/* disable NMI */
407	apic_reg_ops->apic_write(APIC_INT_VECT1, AV_MASK);
408
409	/* and error interrupt */
410	apic_reg_ops->apic_write(APIC_ERR_VECT, AV_MASK);
411
412	/* and perf counter intr */
413	apic_reg_ops->apic_write(APIC_PCINT_VECT, AV_MASK);
414
415	apic_reg_ops->apic_write(APIC_SPUR_INT_REG, APIC_SPUR_INTR);
416}
417
/*
 * Send the INIT IPI sequence to CPU cpun and, when its local APIC is an
 * integrated one, follow it with two STARTUP IPIs whose vector is derived
 * from the page number of rm_platter_pa.
 *
 * cpun		index into apic_cpus[] of the target CPU
 * start	B_TRUE when starting the CPU: BIOS_SHUTDOWN is first written
 *		to the CMOS SSB location.  apic_cpu_stop() passes B_FALSE.
 *
 * Interrupts are disabled on the calling CPU for the whole sequence, which
 * includes a 20ms and two 200us busy waits.
 */
static void
apic_cpu_send_SIPI(processorid_t cpun, boolean_t start)
{
	int		loop_count;
	uint32_t	vector;
	uint_t		apicid;
	ulong_t		iflag;

	apicid =  apic_cpus[cpun].aci_local_id;

	/*
	 * Interrupts on current CPU will be disabled during the
	 * steps in order to avoid unwanted side effects from
	 * executing interrupt handlers on a problematic BIOS.
	 */
	iflag = intr_clear();

	if (start) {
		outb(CMOS_ADDR, SSB);
		outb(CMOS_DATA, BIOS_SHUTDOWN);
	}

	/*
	 * According to X2APIC specification in section '2.3.5.1' of
	 * Interrupt Command Register Semantics, the semantics of
	 * programming the Interrupt Command Register to dispatch an interrupt
	 * is simplified. A single MSR write to the 64-bit ICR is required
	 * for dispatching an interrupt. Specifically, with the 64-bit MSR
	 * interface to ICR, system software is not required to check the
	 * status of the delivery status bit prior to writing to the ICR
	 * to send an IPI. With the removal of the Delivery Status bit,
	 * system software no longer has a reason to read the ICR. It remains
	 * readable only to aid in debugging.
	 */
#ifdef	DEBUG
	APIC_AV_PENDING_SET();
#else
	if (apic_mode == LOCAL_APIC) {
		APIC_AV_PENDING_SET();
	}
#endif /* DEBUG */

	/* for integrated - make sure there is one INIT IPI in buffer */
	/* for external - it will wake up the cpu */
	apic_reg_ops->apic_write_int_cmd(apicid, AV_ASSERT | AV_RESET);

	/*
	 * If only 1 CPU is installed, PENDING bit will not go low, so the
	 * wait is bounded by apic_sipi_max_loop_count.  The pending bit is
	 * only polled in LOCAL_APIC mode.
	 */
	for (loop_count = apic_sipi_max_loop_count; loop_count; loop_count--) {
		if (apic_mode == LOCAL_APIC &&
		    apic_reg_ops->apic_read(APIC_INT_CMD1) & AV_PENDING)
			apic_ret();
		else
			break;
	}

	apic_reg_ops->apic_write_int_cmd(apicid, AV_DEASSERT | AV_RESET);
	drv_usecwait(20000);		/* 20 milli sec */

	if (apic_cpus[cpun].aci_local_ver >= APIC_INTEGRATED_VERS) {
		/* integrated apic */

		vector = (rm_platter_pa >> MMU_PAGESHIFT) &
		    (APIC_VECTOR_MASK | APIC_IPL_MASK);

		/* to offset the INIT IPI queue up in the buffer */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);		/* 200 micro sec */

		/*
		 * send the second SIPI (Startup IPI) as recommended by Intel
		 * software development manual.
		 */
		apic_reg_ops->apic_write_int_cmd(apicid, vector | AV_STARTUP);
		drv_usecwait(200);	/* 200 micro sec */
	}

	intr_restore(iflag);
}
496
497/*ARGSUSED1*/
498int
499apic_cpu_start(processorid_t cpun, caddr_t arg)
500{
501	ASSERT(MUTEX_HELD(&cpu_lock));
502
503	if (!apic_cpu_in_range(cpun)) {
504		return (EINVAL);
505	}
506
507	/*
508	 * Switch to apic_common_send_ipi for safety during starting other CPUs.
509	 */
510	if (apic_mode == LOCAL_X2APIC) {
511		apic_switch_ipi_callback(B_TRUE);
512	}
513
514	apic_cmos_ssb_set = 1;
515	apic_cpu_send_SIPI(cpun, B_TRUE);
516
517	return (0);
518}
519
/*
 * Put CPU into halted state with interrupts disabled.
 *
 * The caller must hold cpu_lock, and the target CPU must already have
 * CPU_OFFLINE and CPU_QUIESCED set and CPU_ENABLE clear (see the ASSERTs).
 * arg is unused.
 *
 * Returns 0 on success, EINVAL if cpun is not a valid CPU id, ENOTSUP if the
 * CPU's local APIC version is below APIC_INTEGRATED_VERS, or the nonzero
 * result of xc_flush_cpu(), in which case CPU_READY is restored.
 */
/*ARGSUSED1*/
int
apic_cpu_stop(processorid_t cpun, caddr_t arg)
{
	int		rc;
	cpu_t		*cp;
	extern cpuset_t cpu_ready_set;
	extern void cpu_idle_intercept_cpu(cpu_t *cp);

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!apic_cpu_in_range(cpun)) {
		return (EINVAL);
	}
	if (apic_cpus[cpun].aci_local_ver < APIC_INTEGRATED_VERS) {
		return (ENOTSUP);
	}

	cp = cpu_get(cpun);
	ASSERT(cp != NULL);
	ASSERT((cp->cpu_flags & CPU_OFFLINE) != 0);
	ASSERT((cp->cpu_flags & CPU_QUIESCED) != 0);
	ASSERT((cp->cpu_flags & CPU_ENABLE) == 0);

	/* Clear CPU_READY flag to disable cross calls. */
	cp->cpu_flags &= ~CPU_READY;
	CPUSET_ATOMIC_DEL(cpu_ready_set, cpun);
	rc = xc_flush_cpu(cp);
	if (rc != 0) {
		/* Flush failed: undo the CPU_READY changes and bail out. */
		CPUSET_ATOMIC_ADD(cpu_ready_set, cpun);
		cp->cpu_flags |= CPU_READY;
		return (rc);
	}

	/* Intercept target CPU at a safe point before powering it off. */
	cpu_idle_intercept_cpu(cp);

	/* Same IPI sequence as a start, minus the CMOS shutdown byte. */
	apic_cpu_send_SIPI(cpun, B_FALSE);
	cp->cpu_flags &= ~CPU_RUNNING;

	return (0);
}
565
566int
567apic_cpu_ops(psm_cpu_request_t *reqp)
568{
569	if (reqp == NULL) {
570		return (EINVAL);
571	}
572
573	switch (reqp->pcr_cmd) {
574	case PSM_CPU_ADD:
575		return (apic_cpu_add(reqp));
576
577	case PSM_CPU_REMOVE:
578		return (apic_cpu_remove(reqp));
579
580	case PSM_CPU_STOP:
581		return (apic_cpu_stop(reqp->req.cpu_stop.cpuid,
582		    reqp->req.cpu_stop.ctx));
583
584	default:
585		return (ENOTSUP);
586	}
587}
588
/*
 * DEBUG-only tunables.  None of them is referenced in this part of the
 * file.  NOTE(review): document their users where they are consumed.
 */
#ifdef	DEBUG
int	apic_break_on_cpu = 9;
int	apic_stretch_interrupts = 0;
int	apic_stretch_ISR = 1 << 3;	/* IPL of 3 matches nothing now */
#endif /* DEBUG */
594
595/*
596 * generates an interprocessor interrupt to another CPU. Any changes made to
597 * this routine must be accompanied by similar changes to
598 * apic_common_send_ipi().
599 */
600void
601apic_send_ipi(int cpun, int ipl)
602{
603	int vector;
604	ulong_t flag;
605
606	vector = apic_resv_vector[ipl];
607
608	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
609
610	flag = intr_clear();
611
612	APIC_AV_PENDING_SET();
613
614	apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
615	    vector);
616
617	intr_restore(flag);
618}
619
620void
621apic_send_pir_ipi(processorid_t cpun)
622{
623	const int vector = apic_pir_vect;
624	ulong_t flag;
625
626	ASSERT((vector >= APIC_BASE_VECT) && (vector <= APIC_SPUR_INTR));
627
628	flag = intr_clear();
629
630	/* Self-IPI for inducing PIR makes no sense. */
631	if ((cpun != psm_get_cpu_id())) {
632		APIC_AV_PENDING_SET();
633		apic_reg_ops->apic_write_int_cmd(apic_cpus[cpun].aci_local_id,
634		    vector);
635	}
636
637	intr_restore(flag);
638}
639
/*
 * Return the vector that apic_send_pir_ipi() sends (apic_pir_vect).
 */
int
apic_get_pir_ipivect(void)
{
	return (apic_pir_vect);
}
645
/*
 * Intentionally empty: nothing is done when a CPU is marked idle.
 */
/*ARGSUSED*/
void
apic_set_idlecpu(processorid_t cpun)
{
}
651
/*
 * Intentionally empty: nothing is done when a CPU stops being idle.
 */
/*ARGSUSED*/
void
apic_unset_idlecpu(processorid_t cpun)
{
}
657
658
/*
 * Empty function.  It is called as the body of the busy-wait loops in this
 * file (for example while polling the ICR pending bit or the hrtime stamp).
 */
void
apic_ret()
{
}
663
664/*
665 * If apic_coarse_time == 1, then apic_gettime() is used instead of
666 * apic_gethrtime().  This is used for performance instead of accuracy.
667 */
668
669hrtime_t
670apic_gettime()
671{
672	int old_hrtime_stamp;
673	hrtime_t temp;
674
675	/*
676	 * In one-shot mode, we do not keep time, so if anyone
677	 * calls psm_gettime() directly, we vector over to
678	 * gethrtime().
679	 * one-shot mode MUST NOT be enabled if this psm is the source of
680	 * hrtime.
681	 */
682
683	if (apic_oneshot)
684		return (gethrtime());
685
686
687gettime_again:
688	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
689		apic_ret();
690
691	temp = apic_nsec_since_boot;
692
693	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
694		goto gettime_again;
695	}
696	return (temp);
697}
698
/*
 * Here we return the number of nanoseconds since booting.  Note every
 * clock interrupt increments apic_nsec_since_boot by the appropriate
 * amount.
 *
 * The result is apic_nsec_since_boot plus the time elapsed in the current
 * timer period, derived from the current count of CPU 0's local APIC timer.
 * The returned value never goes below the previously computed one
 * (apic_last_hrtime).  If a remote read of CPU 0's timer comes back invalid,
 * the previous value is returned unchanged.
 *
 * Runs with interrupts disabled and apic_gethrtime_lock held.
 */
hrtime_t
apic_gethrtime(void)
{
	int curr_timeval, countval, elapsed_ticks;
	int old_hrtime_stamp, status;
	hrtime_t temp;
	uint32_t cpun;
	ulong_t oflags;

	/*
	 * In one-shot mode, we do not keep time, so if anyone
	 * calls psm_gethrtime() directly, we vector over to
	 * gethrtime().
	 * one-shot mode MUST NOT be enabled if this psm is the source of
	 * hrtime.
	 */

	if (apic_oneshot)
		return (gethrtime());

	oflags = intr_clear();	/* prevent migration */

	/* Local APIC id of this CPU; shifted into place in LOCAL_APIC mode. */
	cpun = apic_reg_ops->apic_read(APIC_LID_REG);
	if (apic_mode == LOCAL_APIC)
		cpun >>= APIC_ID_BIT_OFFSET;

	lock_set(&apic_gethrtime_lock);

gethrtime_again:
	/* Spin for as long as the low bit of the stamp is set. */
	while ((old_hrtime_stamp = apic_hrtime_stamp) & 1)
		apic_ret();

	/*
	 * Check to see which CPU we are on.  Note the time is kept on
	 * the local APIC of CPU 0.  If on CPU 0, simply read the current
	 * counter.  If on another CPU, issue a remote read command to CPU 0.
	 */
	if (cpun == apic_cpus[0].aci_local_id) {
		countval = apic_reg_ops->apic_read(APIC_CURR_COUNT);
	} else {
#ifdef	DEBUG
		APIC_AV_PENDING_SET();
#else
		if (apic_mode == LOCAL_APIC)
			APIC_AV_PENDING_SET();
#endif /* DEBUG */

		apic_reg_ops->apic_write_int_cmd(
		    apic_cpus[0].aci_local_id, APIC_CURR_ADD | AV_REMOTE);

		/* Wait for the remote read to complete. */
		while ((status = apic_reg_ops->apic_read(APIC_INT_CMD1))
		    & AV_READ_PENDING) {
			apic_ret();
		}

		if (status & AV_REMOTE_STATUS)	/* 1 = valid */
			countval = apic_reg_ops->apic_read(APIC_REMOTE_READ);
		else {	/* 0 = invalid */
			apic_remote_hrterr++;
			/*
			 * return last hrtime right now, will need more
			 * testing if change to retry
			 */
			temp = apic_last_hrtime;

			lock_clear(&apic_gethrtime_lock);

			intr_restore(oflags);

			return (temp);
		}
	}
	/*
	 * A count above the last one accepted is replaced by 0, which makes
	 * elapsed_ticks below equal to the full apic_hertz_count.
	 */
	if (countval > last_count_read)
		countval = 0;
	else
		last_count_read = countval;

	elapsed_ticks = apic_hertz_count - countval;

	curr_timeval = APIC_TICKS_TO_NSECS(elapsed_ticks);
	temp = apic_nsec_since_boot + curr_timeval;

	if (apic_hrtime_stamp != old_hrtime_stamp) {	/* got an interrupt */
		/* we might have clobbered last_count_read. Restore it */
		last_count_read = apic_hertz_count;
		goto gethrtime_again;
	}

	if (temp < apic_last_hrtime) {
		/* return last hrtime if error occurs */
		apic_hrtime_error++;
		temp = apic_last_hrtime;
	}
	else
		apic_last_hrtime = temp;

	lock_clear(&apic_gethrtime_lock);
	intr_restore(oflags);

	return (temp);
}
805
806/* apic NMI handler */
807/*ARGSUSED*/
808void
809apic_nmi_intr(caddr_t arg, struct regs *rp)
810{
811	nmi_action_t action = nmi_action;
812
813	if (apic_shutdown_processors) {
814		apic_disable_local_apic();
815		return;
816	}
817
818	apic_error |= APIC_ERR_NMI;
819
820	if (!lock_try(&apic_nmi_lock))
821		return;
822	apic_num_nmis++;
823
824	/*
825	 * "nmi_action" always over-rides the older way of doing this, unless we
826	 * can't actually drop into kmdb when requested.
827	 */
828	if (action == NMI_ACTION_KMDB && !psm_debugger())
829		action = NMI_ACTION_UNSET;
830
831	if (action == NMI_ACTION_UNSET) {
832		if (apic_kmdb_on_nmi && psm_debugger())
833			action = NMI_ACTION_KMDB;
834		else if (apic_panic_on_nmi)
835			action = NMI_ACTION_PANIC;
836		else
837			action = NMI_ACTION_IGNORE;
838	}
839
840	switch (action) {
841	case NMI_ACTION_IGNORE:
842		/*
843		 * prom_printf is the best shot we have of something which is
844		 * problem free from high level/NMI type of interrupts
845		 */
846		prom_printf("NMI received\n");
847		break;
848
849	case NMI_ACTION_PANIC:
850		/* Keep panic from entering kmdb. */
851		nopanicdebug = 1;
852		panic("NMI received\n");
853		break;
854
855	case NMI_ACTION_KMDB:
856	default:
857		debug_enter("NMI received: entering kmdb\n");
858		break;
859	}
860
861	lock_clear(&apic_nmi_lock);
862}
863
864processorid_t
865apic_get_next_processorid(processorid_t cpu_id)
866{
867
868	int i;
869
870	if (cpu_id == -1)
871		return ((processorid_t)0);
872
873	for (i = cpu_id + 1; i < NCPU; i++) {
874		if (apic_cpu_in_range(i))
875			return (i);
876	}
877
878	return ((processorid_t)-1);
879}
880
881int
882apic_cpu_add(psm_cpu_request_t *reqp)
883{
884	int i, rv = 0;
885	ulong_t iflag;
886	boolean_t first = B_TRUE;
887	uchar_t localver = 0;
888	uint32_t localid, procid;
889	processorid_t cpuid = (processorid_t)-1;
890	mach_cpu_add_arg_t *ap;
891
892	ASSERT(reqp != NULL);
893	reqp->req.cpu_add.cpuid = (processorid_t)-1;
894
895	/* Check whether CPU hotplug is supported. */
896	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
897		return (ENOTSUP);
898	}
899
900	ap = (mach_cpu_add_arg_t *)reqp->req.cpu_add.argp;
901	switch (ap->type) {
902	case MACH_CPU_ARG_LOCAL_APIC:
903		localid = ap->arg.apic.apic_id;
904		procid = ap->arg.apic.proc_id;
905		if (localid >= 255 || procid > 255) {
906			cmn_err(CE_WARN,
907			    "!apic: apicid(%u) or procid(%u) is invalid.",
908			    localid, procid);
909			return (EINVAL);
910		}
911		break;
912
913	case MACH_CPU_ARG_LOCAL_X2APIC:
914		localid = ap->arg.apic.apic_id;
915		procid = ap->arg.apic.proc_id;
916		if (localid >= UINT32_MAX) {
917			cmn_err(CE_WARN,
918			    "!apic: x2apicid(%u) is invalid.", localid);
919			return (EINVAL);
920		} else if (localid >= 255 && apic_mode == LOCAL_APIC) {
921			cmn_err(CE_WARN, "!apic: system is in APIC mode, "
922			    "can't support x2APIC processor.");
923			return (ENOTSUP);
924		}
925		break;
926
927	default:
928		cmn_err(CE_WARN,
929		    "!apic: unknown argument type %d to apic_cpu_add().",
930		    ap->type);
931		return (EINVAL);
932	}
933
934	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
935	iflag = intr_clear();
936	lock_set(&apic_ioapic_lock);
937
938	/* Check whether local APIC id already exists. */
939	for (i = 0; i < apic_nproc; i++) {
940		if (!CPU_IN_SET(apic_cpumask, i))
941			continue;
942		if (apic_cpus[i].aci_local_id == localid) {
943			lock_clear(&apic_ioapic_lock);
944			intr_restore(iflag);
945			cmn_err(CE_WARN,
946			    "!apic: local apic id %u already exists.",
947			    localid);
948			return (EEXIST);
949		} else if (apic_cpus[i].aci_processor_id == procid) {
950			lock_clear(&apic_ioapic_lock);
951			intr_restore(iflag);
952			cmn_err(CE_WARN,
953			    "!apic: processor id %u already exists.",
954			    (int)procid);
955			return (EEXIST);
956		}
957
958		/*
959		 * There's no local APIC version number available in MADT table,
960		 * so assume that all CPUs are homogeneous and use local APIC
961		 * version number of the first existing CPU.
962		 */
963		if (first) {
964			first = B_FALSE;
965			localver = apic_cpus[i].aci_local_ver;
966		}
967	}
968	ASSERT(first == B_FALSE);
969
970	/*
971	 * Try to assign the same cpuid if APIC id exists in the dirty cache.
972	 */
973	for (i = 0; i < apic_max_nproc; i++) {
974		if (CPU_IN_SET(apic_cpumask, i)) {
975			ASSERT((apic_cpus[i].aci_status & APIC_CPU_FREE) == 0);
976			continue;
977		}
978		ASSERT(apic_cpus[i].aci_status & APIC_CPU_FREE);
979		if ((apic_cpus[i].aci_status & APIC_CPU_DIRTY) &&
980		    apic_cpus[i].aci_local_id == localid &&
981		    apic_cpus[i].aci_processor_id == procid) {
982			cpuid = i;
983			break;
984		}
985	}
986
987	/* Avoid the dirty cache and allocate fresh slot if possible. */
988	if (cpuid == (processorid_t)-1) {
989		for (i = 0; i < apic_max_nproc; i++) {
990			if ((apic_cpus[i].aci_status & APIC_CPU_FREE) &&
991			    (apic_cpus[i].aci_status & APIC_CPU_DIRTY) == 0) {
992				cpuid = i;
993				break;
994			}
995		}
996	}
997
998	/* Try to find any free slot as last resort. */
999	if (cpuid == (processorid_t)-1) {
1000		for (i = 0; i < apic_max_nproc; i++) {
1001			if (apic_cpus[i].aci_status & APIC_CPU_FREE) {
1002				cpuid = i;
1003				break;
1004			}
1005		}
1006	}
1007
1008	if (cpuid == (processorid_t)-1) {
1009		lock_clear(&apic_ioapic_lock);
1010		intr_restore(iflag);
1011		cmn_err(CE_NOTE,
1012		    "!apic: failed to allocate cpu id for processor %u.",
1013		    procid);
1014		rv = EAGAIN;
1015	} else if (ACPI_FAILURE(acpica_map_cpu(cpuid, procid))) {
1016		lock_clear(&apic_ioapic_lock);
1017		intr_restore(iflag);
1018		cmn_err(CE_NOTE,
1019		    "!apic: failed to build mapping for processor %u.",
1020		    procid);
1021		rv = EBUSY;
1022	} else {
1023		ASSERT(cpuid >= 0 && cpuid < NCPU);
1024		ASSERT(cpuid < apic_max_nproc && cpuid < max_ncpus);
1025		bzero(&apic_cpus[cpuid], sizeof (apic_cpus[0]));
1026		apic_cpus[cpuid].aci_processor_id = procid;
1027		apic_cpus[cpuid].aci_local_id = localid;
1028		apic_cpus[cpuid].aci_local_ver = localver;
1029		CPUSET_ATOMIC_ADD(apic_cpumask, cpuid);
1030		if (cpuid >= apic_nproc) {
1031			apic_nproc = cpuid + 1;
1032		}
1033		lock_clear(&apic_ioapic_lock);
1034		intr_restore(iflag);
1035		reqp->req.cpu_add.cpuid = cpuid;
1036	}
1037
1038	return (rv);
1039}
1040
1041int
1042apic_cpu_remove(psm_cpu_request_t *reqp)
1043{
1044	int i;
1045	ulong_t iflag;
1046	processorid_t cpuid;
1047
1048	/* Check whether CPU hotplug is supported. */
1049	if (!plat_dr_support_cpu() || apic_max_nproc == -1) {
1050		return (ENOTSUP);
1051	}
1052
1053	cpuid = reqp->req.cpu_remove.cpuid;
1054
1055	/* Use apic_ioapic_lock to sync with apic_get_next_bind_cpu. */
1056	iflag = intr_clear();
1057	lock_set(&apic_ioapic_lock);
1058
1059	if (!apic_cpu_in_range(cpuid)) {
1060		lock_clear(&apic_ioapic_lock);
1061		intr_restore(iflag);
1062		cmn_err(CE_WARN,
1063		    "!apic: cpuid %d doesn't exist in apic_cpus array.",
1064		    cpuid);
1065		return (ENODEV);
1066	}
1067	ASSERT((apic_cpus[cpuid].aci_status & APIC_CPU_FREE) == 0);
1068
1069	if (ACPI_FAILURE(acpica_unmap_cpu(cpuid))) {
1070		lock_clear(&apic_ioapic_lock);
1071		intr_restore(iflag);
1072		return (ENOENT);
1073	}
1074
1075	if (cpuid == apic_nproc - 1) {
1076		/*
1077		 * We are removing the highest numbered cpuid so we need to
1078		 * find the next highest cpuid as the new value for apic_nproc.
1079		 */
1080		for (i = apic_nproc; i > 0; i--) {
1081			if (CPU_IN_SET(apic_cpumask, i - 1)) {
1082				apic_nproc = i;
1083				break;
1084			}
1085		}
1086		/* at least one CPU left */
1087		ASSERT(i > 0);
1088	}
1089	CPUSET_ATOMIC_DEL(apic_cpumask, cpuid);
1090	/* mark slot as free and keep it in the dirty cache */
1091	apic_cpus[cpuid].aci_status = APIC_CPU_FREE | APIC_CPU_DIRTY;
1092
1093	lock_clear(&apic_ioapic_lock);
1094	intr_restore(iflag);
1095
1096	return (0);
1097}
1098
/*
 * Return the number of ticks the APIC decrements in SF nanoseconds.
 * The fixed-frequency PIT (aka 8254) is used for the measurement.
 *
 * The APIC timer is started from APIC_MAXVAL with the divider taken from
 * apic_divide_reg_init.  With interrupts disabled, the APIC current count
 * is sampled before and after the PIT counter has gone down by at least
 * APIC_TIME_COUNT ticks, and the two tick counts are converted to the
 * result.  This is a single measurement; no outlier filtering is done here.
 */
static uint64_t
apic_calibrate_impl()
{
	uint8_t		pit_tick_lo;
	uint16_t	pit_tick, target_pit_tick, pit_ticks_adj;
	uint32_t	pit_ticks;
	uint32_t	start_apic_tick, end_apic_tick, apic_ticks;
	ulong_t		iflag;

	apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
	apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);

	iflag = intr_clear();

	/*
	 * Read the PIT counter, low byte first, until it is at least
	 * APIC_TIME_MIN and its low byte lies strictly between APIC_LB_MIN
	 * and APIC_LB_MAX.
	 */
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick < APIC_TIME_MIN ||
	    pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);

	/*
	 * Wait for the PIT to decrement by 5 ticks to ensure
	 * we didn't start in the middle of a tick.
	 * Compare with 0x10 for the wrap around case.
	 */
	target_pit_tick = pit_tick - 5;
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

	start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	/*
	 * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
	 */
	target_pit_tick = pit_tick - APIC_TIME_COUNT;
	do {
		pit_tick_lo = inb(PITCTR0_PORT);
		pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
	} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);

	end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);

	intr_restore(iflag);

	/* The APIC timer counts down, so start minus end is the elapsed. */
	apic_ticks = start_apic_tick - end_apic_tick;

	/* The PIT might have decremented by more ticks than planned */
	pit_ticks_adj = target_pit_tick - pit_tick;
	/* total number of PIT ticks corresponding to apic_ticks */
	pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;

	/*
	 * Determine the number of nanoseconds per APIC clock tick
	 * and then determine how many APIC ticks to interrupt at the
	 * desired frequency
	 * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
	 * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
	 * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
	 * apic_ticks_per_SFns =
	 * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
	 */
	return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
}
1168
1169/*
1170 * It was found empirically that 5 measurements seem sufficient to give a good
1171 * accuracy. Most spurious measurements are higher than the target value thus
1172 * we eliminate up to 2/5 spurious measurements.
1173 */
1174#define	APIC_CALIBRATE_MEASUREMENTS		5
1175
1176#define	APIC_CALIBRATE_PERCENT_OFF_WARNING	10
1177
1178/*
1179 * Return the number of ticks the APIC decrements in SF nanoseconds.
1180 * Several measurements are taken to filter out outliers.
1181 */
1182uint64_t
1183apic_calibrate()
1184{
1185	uint64_t	measurements[APIC_CALIBRATE_MEASUREMENTS];
1186	int		median_idx;
1187	uint64_t	median;
1188
1189	/*
1190	 * When running under a virtual machine, the emulated PIT and APIC
1191	 * counters do not always return the right values and can roll over.
1192	 * Those spurious measurements are relatively rare but could
1193	 * significantly affect the calibration.
1194	 * Therefore we take several measurements and then keep the median.
1195	 * The median is preferred to the average here as we only want to
1196	 * discard outliers.
1197	 */
1198	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++)
1199		measurements[i] = apic_calibrate_impl();
1200
1201	/*
1202	 * sort results and retrieve median.
1203	 */
1204	for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
1205		for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
1206			if (measurements[j] < measurements[i]) {
1207				uint64_t tmp = measurements[i];
1208				measurements[i] = measurements[j];
1209				measurements[j] = tmp;
1210			}
1211		}
1212	}
1213	median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
1214	median = measurements[median_idx];
1215
1216#if (APIC_CALIBRATE_MEASUREMENTS >= 3)
1217	/*
1218	 * Check that measurements are consistent. Post a warning
1219	 * if the three middle values are not close to each other.
1220	 */
1221	uint64_t delta_warn = median *
1222	    APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
1223	if ((median - measurements[median_idx - 1]) > delta_warn ||
1224	    (measurements[median_idx + 1] - median) > delta_warn) {
1225		cmn_err(CE_WARN, "apic_calibrate measurements lack "
1226		    "precision: %llu, %llu, %llu.",
1227		    (u_longlong_t)measurements[median_idx - 1],
1228		    (u_longlong_t)median,
1229		    (u_longlong_t)measurements[median_idx + 1]);
1230	}
1231#endif
1232
1233	return (median);
1234}
1235
1236/*
1237 * Initialise the APIC timer on the local APIC of CPU 0 to the desired
1238 * frequency.  Note at this stage in the boot sequence, the boot processor
1239 * is the only active processor.
1240 * hertz value of 0 indicates a one-shot mode request.  In this case
1241 * the function returns the resolution (in nanoseconds) for the hardware
1242 * timer interrupt.  If one-shot mode capability is not available,
1243 * the return value will be 0. apic_enable_oneshot is a global switch
1244 * for disabling the functionality.
1245 * A non-zero positive value for hertz indicates a periodic mode request.
1246 * In this case the hardware will be programmed to generate clock interrupts
1247 * at hertz frequency and returns the resolution of interrupts in
1248 * nanosecond.
1249 */
1250
1251int
1252apic_clkinit(int hertz)
1253{
1254	int		ret;
1255
1256	apic_int_busy_mark = (apic_int_busy_mark *
1257	    apic_sample_factor_redistribution) / 100;
1258	apic_int_free_mark = (apic_int_free_mark *
1259	    apic_sample_factor_redistribution) / 100;
1260	apic_diff_for_redistribution = (apic_diff_for_redistribution *
1261	    apic_sample_factor_redistribution) / 100;
1262
1263	ret = apic_timer_init(hertz);
1264	return (ret);
1265
1266}
1267
/*
 * apic_preshutdown:
 * Called early in shutdown whilst we can still access filesystems to do
 * things like loading modules which will be required to complete shutdown
 * after filesystems are all unmounted.
 *
 * The only action taken here is a verbose power-off trace of cmd, fcn,
 * apic_poweroff_method and apic_enable_acpi via APIC_VERBOSE_POWEROFF.
 */
void
apic_preshutdown(int cmd, int fcn)
{
	APIC_VERBOSE_POWEROFF(("apic_preshutdown(%d,%d); m=%d a=%d\n",
	    cmd, fcn, apic_poweroff_method, apic_enable_acpi));
}
1280
/*
 * apic_shutdown:
 * Quiesce the interrupt hardware and, when requested, power the machine off.
 *
 * Unconditionally (with interrupts disabled for the APIC part):
 *  - hpet_acpi_fini() is called,
 *  - apic_shutdown_processors is set and an NMI is sent to all CPUs except
 *    the caller,
 *  - the CMOS shutdown status byte is cleared if apic_cmos_ssb_set,
 *  - I/O APIC redirection is disabled, the IMCR is switched back to PIC
 *    mode if apic_imcrp is set, and the local APIC is disabled.
 *
 * Only when cmd is A_SHUTDOWN:
 *  - ACPI is disabled unless fcn is AD_POWEROFF or AD_FASTREBOOT,
 *  - for AD_FASTREBOOT an AV_RESET IPI is sent to all other CPUs,
 *  - for AD_POWEROFF the method selected by apic_poweroff_method is tried
 *    and the function then waits seven seconds for power to drop.
 *
 * The function returns normally in every case; if power-off did not take
 * effect the caller is expected to continue to the halt path.
 */
void
apic_shutdown(int cmd, int fcn)
{
	int restarts, attempts;
	int i;
	uchar_t	byte;
	ulong_t iflag;

	hpet_acpi_fini();

	/* Send NMI to all CPUs except self to do per processor shutdown */
	iflag = intr_clear();
#ifdef	DEBUG
	APIC_AV_PENDING_SET();
#else
	if (apic_mode == LOCAL_APIC)
		APIC_AV_PENDING_SET();
#endif /* DEBUG */
	apic_shutdown_processors = 1;
	apic_reg_ops->apic_write(APIC_INT_CMD1,
	    AV_NMI | AV_LEVEL | AV_SH_ALL_EXCSELF);

	/* restore cmos shutdown byte before reboot */
	if (apic_cmos_ssb_set) {
		outb(CMOS_ADDR, SSB);
		outb(CMOS_DATA, 0);
	}

	ioapic_disable_redirection();

	/* disable apic mode if imcr present */
	if (apic_imcrp) {
		outb(APIC_IMCR_P1, (uchar_t)APIC_IMCR_SELECT);
		outb(APIC_IMCR_P2, (uchar_t)APIC_IMCR_PIC);
	}

	apic_disable_local_apic();

	intr_restore(iflag);

	/* remainder of function is for shutdown cases only */
	if (cmd != A_SHUTDOWN)
		return;

	/*
	 * Switch system back into Legacy-Mode if using ACPI and
	 * not powering-off.  Some BIOSes need to remain in ACPI-mode
	 * for power-off to succeed (Dell Dimension 4600)
	 * Do not disable ACPI while doing fastreboot
	 */
	if (apic_enable_acpi && fcn != AD_POWEROFF && fcn != AD_FASTREBOOT)
		(void) AcpiDisable();

	/* For fast reboot, send a reset IPI to every CPU except this one */
	if (fcn == AD_FASTREBOOT) {
		apic_reg_ops->apic_write(APIC_INT_CMD1,
		    AV_ASSERT | AV_RESET | AV_SH_ALL_EXCSELF);
	}

	/* remainder of function is for shutdown+poweroff case only */
	if (fcn != AD_POWEROFF)
		return;

	switch (apic_poweroff_method) {
		case APIC_POWEROFF_VIA_RTC:

			/* select the extended NVRAM bank in the RTC */
			outb(CMOS_ADDR, RTC_REGA);
			byte = inb(CMOS_DATA);
			outb(CMOS_DATA, (byte | EXT_BANK));

			outb(CMOS_ADDR, PFR_REG);

			/* for Predator must toggle the PAB bit */
			byte = inb(CMOS_DATA);

			/*
			 * clear power active bar, wakeup alarm and
			 * kickstart
			 */
			byte &= ~(PAB_CBIT | WF_FLAG | KS_FLAG);
			outb(CMOS_DATA, byte);

			/* delay before next write */
			drv_usecwait(1000);

			/* for S40 the following would suffice */
			byte = inb(CMOS_DATA);

			/* power active bar control bit */
			byte |= PAB_CBIT;
			outb(CMOS_DATA, byte);

			break;

		case APIC_POWEROFF_VIA_ASPEN_BMC:
			/*
			 * The whole command sequence is restarted from the
			 * label below whenever the busy flag fails to clear
			 * after a few 1ms waits; the pre-increment test gives
			 * up once the restart counter reaches 3.
			 */
			restarts = 0;
restart_aspen_bmc:
			if (++restarts == 3)
				break;
			attempts = 0;
			/* wait for the busy flag to clear */
			do {
				byte = inb(MISMIC_FLAG_REGISTER);
				byte &= MISMIC_BUSY_MASK;
				if (byte != 0) {
					drv_usecwait(1000);
					if (attempts >= 3)
						goto restart_aspen_bmc;
					++attempts;
				}
			} while (byte != 0);
			outb(MISMIC_CNTL_REGISTER, CC_SMS_GET_STATUS);
			byte = inb(MISMIC_FLAG_REGISTER);
			byte |= 0x1;
			outb(MISMIC_FLAG_REGISTER, byte);
			i = 0;
			/* send each control/data pair of the aspen_bmc table */
			for (; i < (sizeof (aspen_bmc)/sizeof (aspen_bmc[0]));
			    i++) {
				attempts = 0;
				do {
					byte = inb(MISMIC_FLAG_REGISTER);
					byte &= MISMIC_BUSY_MASK;
					if (byte != 0) {
						drv_usecwait(1000);
						if (attempts >= 3)
							goto restart_aspen_bmc;
						++attempts;
					}
				} while (byte != 0);
				outb(MISMIC_CNTL_REGISTER, aspen_bmc[i].cntl);
				outb(MISMIC_DATA_REGISTER, aspen_bmc[i].data);
				byte = inb(MISMIC_FLAG_REGISTER);
				byte |= 0x1;
				outb(MISMIC_FLAG_REGISTER, byte);
			}
			break;

		case APIC_POWEROFF_VIA_SITKA_BMC:
			/* same restart/attempt scheme as the Aspen case */
			restarts = 0;
restart_sitka_bmc:
			if (++restarts == 3)
				break;
			attempts = 0;
			/* wait until the SMS state is neither read nor write */
			do {
				byte = inb(SMS_STATUS_REGISTER);
				byte &= SMS_STATE_MASK;
				if ((byte == SMS_READ_STATE) ||
				    (byte == SMS_WRITE_STATE)) {
					drv_usecwait(1000);
					if (attempts >= 3)
						goto restart_sitka_bmc;
					++attempts;
				}
			} while ((byte == SMS_READ_STATE) ||
			    (byte == SMS_WRITE_STATE));
			outb(SMS_COMMAND_REGISTER, SMS_GET_STATUS);
			i = 0;
			/* send each port/data pair of the sitka_bmc table */
			for (; i < (sizeof (sitka_bmc)/sizeof (sitka_bmc[0]));
			    i++) {
				attempts = 0;
				/* wait for the SMS_IBF_MASK bits to clear */
				do {
					byte = inb(SMS_STATUS_REGISTER);
					byte &= SMS_IBF_MASK;
					if (byte != 0) {
						drv_usecwait(1000);
						if (attempts >= 3)
							goto restart_sitka_bmc;
						++attempts;
					}
				} while (byte != 0);
				outb(sitka_bmc[i].port, sitka_bmc[i].data);
			}
			break;

		case APIC_POWEROFF_NONE:

			/* If no APIC direct method, we will try using ACPI */
			if (apic_enable_acpi) {
				if (acpi_poweroff() == 1)
					return;
			} else
				return;

			break;
	}
	/*
	 * Wait a limited time here for power to go off.
	 * If the power does not go off, then there was a
	 * problem and we should continue to the halt which
	 * prints a message for the user to press a key to
	 * reboot.
	 */
	drv_usecwait(7000000); /* wait seven seconds */

}
1475
/*
 * Cyclic identifier owned by the APIC module.
 * NOTE(review): not referenced in the surrounding functions; confirm where
 * it is assigned and consumed.
 */
cyclic_id_t apic_cyclic_id;
1477
/*
 * The following functions are in the platform specific file so that they
 * can be different functions depending on whether we are running on
 * bare metal or a hypervisor.
 */

/*
 * map an apic for memory-mapped access
 *
 * addr, len and flags are handed unchanged to psm_map_phys(); its result is
 * returned as a pointer to 32-bit registers.
 */
uint32_t *
mapin_apic(uint32_t addr, size_t len, int flags)
{
	return ((void *)psm_map_phys(addr, len, flags));
}
1492
/*
 * map an I/O APIC for memory-mapped access; in this file it is identical
 * to mapin_apic().
 */
uint32_t *
mapin_ioapic(uint32_t addr, size_t len, int flags)
{
	return (mapin_apic(addr, len, flags));
}
1498
/*
 * unmap an apic
 *
 * addr and len are handed unchanged to psm_unmap_phys().
 */
void
mapout_apic(caddr_t addr, size_t len)
{
	psm_unmap_phys(addr, len);
}
1507
/*
 * unmap an I/O APIC; in this file it is identical to mapout_apic().
 */
void
mapout_ioapic(caddr_t addr, size_t len)
{
	mapout_apic(addr, len);
}
1513
1514uint32_t
1515ioapic_read(int ioapic_ix, uint32_t reg)
1516{
1517	volatile uint32_t *ioapic;
1518
1519	ioapic = apicioadr[ioapic_ix];
1520	ioapic[APIC_IO_REG] = reg;
1521	return (ioapic[APIC_IO_DATA]);
1522}
1523
1524void
1525ioapic_write(int ioapic_ix, uint32_t reg, uint32_t value)
1526{
1527	volatile uint32_t *ioapic;
1528
1529	ioapic = apicioadr[ioapic_ix];
1530	ioapic[APIC_IO_REG] = reg;
1531	ioapic[APIC_IO_DATA] = value;
1532}
1533
1534void
1535ioapic_write_eoi(int ioapic_ix, uint32_t value)
1536{
1537	volatile uint32_t *ioapic;
1538
1539	ioapic = apicioadr[ioapic_ix];
1540	ioapic[APIC_IO_EOI] = value;
1541}
1542
1543/*
1544 * Round-robin algorithm to find the next CPU with interrupts enabled.
1545 * It can't share the same static variable apic_next_bind_cpu with
1546 * apic_get_next_bind_cpu(), since that will cause all interrupts to be
1547 * bound to CPU1 at boot time.  During boot, only CPU0 is online with
1548 * interrupts enabled when apic_get_next_bind_cpu() and apic_find_cpu()
1549 * are called.  However, the pcplusmp driver assumes that there will be
1550 * boot_ncpus CPUs configured eventually so it tries to distribute all
1551 * interrupts among CPU0 - CPU[boot_ncpus - 1].  Thus to prevent all
1552 * interrupts being targetted at CPU1, we need to use a dedicated static
1553 * variable for find_next_cpu() instead of sharing apic_next_bind_cpu.
1554 */
1555
1556processorid_t
1557apic_find_cpu(int flag)
1558{
1559	int i;
1560	static processorid_t acid = 0;
1561
1562	/* Find the first CPU with the passed-in flag set */
1563	for (i = 0; i < apic_nproc; i++) {
1564		if (++acid >= apic_nproc) {
1565			acid = 0;
1566		}
1567		if (apic_cpu_in_range(acid) &&
1568		    (apic_cpus[acid].aci_status & flag)) {
1569			break;
1570		}
1571	}
1572
1573	ASSERT((apic_cpus[acid].aci_status & flag) != 0);
1574	return (acid);
1575}
1576
1577void
1578apic_intrmap_init(int apic_mode)
1579{
1580	int suppress_brdcst_eoi = 0;
1581
1582	/*
1583	 * Intel Software Developer's Manual 3A, 10.12.7:
1584	 *
1585	 * Routing of device interrupts to local APIC units operating in
1586	 * x2APIC mode requires use of the interrupt-remapping architecture
1587	 * specified in the Intel Virtualization Technology for Directed
1588	 * I/O, Revision 1.3.  Because of this, BIOS must enumerate support
1589	 * for and software must enable this interrupt remapping with
1590	 * Extended Interrupt Mode Enabled before it enabling x2APIC mode in
1591	 * the local APIC units.
1592	 *
1593	 *
1594	 * In other words, to use the APIC in x2APIC mode, we need interrupt
1595	 * remapping.  Since we don't start up the IOMMU by default, we
1596	 * won't be able to do any interrupt remapping and therefore have to
1597	 * use the APIC in traditional 'local APIC' mode with memory mapped
1598	 * I/O.
1599	 */
1600
1601	if (psm_vt_ops != NULL) {
1602		if (((apic_intrmap_ops_t *)psm_vt_ops)->
1603		    apic_intrmap_init(apic_mode) == DDI_SUCCESS) {
1604
1605			apic_vt_ops = psm_vt_ops;
1606
1607			/*
1608			 * We leverage the interrupt remapping engine to
1609			 * suppress broadcast EOI; thus we must send the
1610			 * directed EOI with the directed-EOI handler.
1611			 */
1612			if (apic_directed_EOI_supported() == 0) {
1613				suppress_brdcst_eoi = 1;
1614			}
1615
1616			apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);
1617
1618			if (apic_detect_x2apic()) {
1619				apic_enable_x2apic();
1620			}
1621
1622			if (apic_directed_EOI_supported() == 0) {
1623				apic_set_directed_EOI_handler();
1624			}
1625		}
1626	}
1627}
1628
/*
 * Finalise an I/O APIC redirection table entry: ir_hi is shifted left by
 * APIC_ID_BIT_OFFSET in place.  intrmap_private is not used.
 * NOTE(review): presumably ir_hi holds the destination APIC id on entry;
 * confirm against the callers.
 */
/*ARGSUSED*/
static void
apic_record_ioapic_rdt(void *intrmap_private, ioapic_rdt_t *irdt)
{
	irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
}
1635
/*
 * Finalise the MSI address/data pair in mregs, in place.
 *
 * mr_addr: the incoming value is shifted into the destination field
 * (MSI_ADDR_DEST_SHIFT) and combined with MSI_ADDR_HDR, the fixed
 * redirection hint and physical destination mode.
 * mr_data: the edge trigger mode bits are OR-ed into the incoming value.
 * intrmap_private is not used.
 * NOTE(review): presumably mr_addr holds the destination APIC id and mr_data
 * the vector on entry; confirm against the callers.
 */
/*ARGSUSED*/
static void
apic_record_msi(void *intrmap_private, msi_regs_t *mregs)
{
	mregs->mr_addr = MSI_ADDR_HDR |
	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
	    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
	mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
	    mregs->mr_data;
}
1647
/*
 * Functions from apic_introp.c
 *
 * These functions are used by apic_intr_ops().
 */

/*
 * MSI support flag:
 * reflects whether MSI is supported at APIC level
 * it can also be patched through /etc/system
 *
 *  0 = default value - don't know and need to call apic_check_msi_support()
 *      to find out then set it accordingly
 *  1 = supported
 * -1 = not supported
 */
int	apic_support_msi = 0;

/* Multiple vector support for MSI-X (enabled by default) */
int	apic_msix_enable = 1;

/* Multiple vector support for MSI (enabled by default) */
int	apic_multi_msi_enable = 1;
1671
1672/*
1673 * Check whether the system supports MSI.
1674 *
1675 * MSI is required for PCI-E and for PCI versions later than 2.2, so if we find
1676 * a PCI-E bus or we find a PCI bus whose version we know is >= 2.2, then we
1677 * return PSM_SUCCESS to indicate this system supports MSI.
1678 *
1679 * (Currently the only way we check whether a given PCI bus supports >= 2.2 is
1680 * by detecting if we are running inside the KVM hypervisor, which guarantees
1681 * this version number.)
1682 */
1683int
1684apic_check_msi_support()
1685{
1686	dev_info_t *cdip;
1687	char dev_type[16];
1688	int dev_len;
1689	int hwenv = get_hwenv();
1690
1691	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support:\n"));
1692
1693	/*
1694	 * check whether the first level children of root_node have
1695	 * PCI-E or PCI capability.
1696	 */
1697	for (cdip = ddi_get_child(ddi_root_node()); cdip != NULL;
1698	    cdip = ddi_get_next_sibling(cdip)) {
1699
1700		DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: cdip: 0x%p,"
1701		    " driver: %s, binding: %s, nodename: %s\n", (void *)cdip,
1702		    ddi_driver_name(cdip), ddi_binding_name(cdip),
1703		    ddi_node_name(cdip)));
1704		dev_len = sizeof (dev_type);
1705		if (ddi_getlongprop_buf(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1706		    "device_type", (caddr_t)dev_type, &dev_len)
1707		    != DDI_PROP_SUCCESS)
1708			continue;
1709		if (strcmp(dev_type, "pciex") == 0)
1710			return (PSM_SUCCESS);
1711		if (strcmp(dev_type, "pci") == 0 &&
1712		    (hwenv == HW_KVM || hwenv == HW_BHYVE))
1713			return (PSM_SUCCESS);
1714	}
1715
1716	/* MSI is not supported on this system */
1717	DDI_INTR_IMPLDBG((CE_CONT, "apic_check_msi_support: no 'pciex' "
1718	    "device_type found\n"));
1719	return (PSM_FAILURE);
1720}
1721
1722/*
1723 * apic_pci_msi_unconfigure:
1724 *
1725 * This and next two interfaces are copied from pci_intr_lib.c
1726 * Do ensure that these two files stay in sync.
1727 * These needed to be copied over here to avoid a deadlock situation on
1728 * certain mp systems that use MSI interrupts.
1729 *
1730 * IMPORTANT regards next three interfaces:
1731 * i) are called only for MSI/X interrupts.
1732 * ii) called with interrupts disabled, and must not block
1733 */
1734void
1735apic_pci_msi_unconfigure(dev_info_t *rdip, int type, int inum)
1736{
1737	ushort_t		msi_ctrl;
1738	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1739	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1740
1741	ASSERT((handle != NULL) && (cap_ptr != 0));
1742
1743	if (type == DDI_INTR_TYPE_MSI) {
1744		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1745		msi_ctrl &= (~PCI_MSI_MME_MASK);
1746		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1747		pci_config_put32(handle, cap_ptr + PCI_MSI_ADDR_OFFSET, 0);
1748
1749		if (msi_ctrl &  PCI_MSI_64BIT_MASK) {
1750			pci_config_put16(handle,
1751			    cap_ptr + PCI_MSI_64BIT_DATA, 0);
1752			pci_config_put32(handle,
1753			    cap_ptr + PCI_MSI_ADDR_OFFSET + 4, 0);
1754		} else {
1755			pci_config_put16(handle,
1756			    cap_ptr + PCI_MSI_32BIT_DATA, 0);
1757		}
1758
1759	} else if (type == DDI_INTR_TYPE_MSIX) {
1760		uintptr_t	off;
1761		uint32_t	mask;
1762		ddi_intr_msix_t	*msix_p = i_ddi_get_msix(rdip);
1763
1764		ASSERT(msix_p != NULL);
1765
1766		/* Offset into "inum"th entry in the MSI-X table & mask it */
1767		off = (uintptr_t)msix_p->msix_tbl_addr + (inum *
1768		    PCI_MSIX_VECTOR_SIZE) + PCI_MSIX_VECTOR_CTRL_OFFSET;
1769
1770		mask = ddi_get32(msix_p->msix_tbl_hdl, (uint32_t *)off);
1771
1772		ddi_put32(msix_p->msix_tbl_hdl, (uint32_t *)off, (mask | 1));
1773
1774		/* Offset into the "inum"th entry in the MSI-X table */
1775		off = (uintptr_t)msix_p->msix_tbl_addr +
1776		    (inum * PCI_MSIX_VECTOR_SIZE);
1777
1778		/* Reset the "data" and "addr" bits */
1779		ddi_put32(msix_p->msix_tbl_hdl,
1780		    (uint32_t *)(off + PCI_MSIX_DATA_OFFSET), 0);
1781		ddi_put64(msix_p->msix_tbl_hdl, (uint64_t *)off, 0);
1782	}
1783}
1784
1785/*
1786 * apic_pci_msi_disable_mode:
1787 */
1788void
1789apic_pci_msi_disable_mode(dev_info_t *rdip, int type)
1790{
1791	ushort_t		msi_ctrl;
1792	int			cap_ptr = i_ddi_get_msi_msix_cap_ptr(rdip);
1793	ddi_acc_handle_t	handle = i_ddi_get_pci_config_handle(rdip);
1794
1795	ASSERT((handle != NULL) && (cap_ptr != 0));
1796
1797	if (type == DDI_INTR_TYPE_MSI) {
1798		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSI_CTRL);
1799		if (!(msi_ctrl & PCI_MSI_ENABLE_BIT))
1800			return;
1801
1802		msi_ctrl &= ~PCI_MSI_ENABLE_BIT;	/* MSI disable */
1803		pci_config_put16(handle, cap_ptr + PCI_MSI_CTRL, msi_ctrl);
1804
1805	} else if (type == DDI_INTR_TYPE_MSIX) {
1806		msi_ctrl = pci_config_get16(handle, cap_ptr + PCI_MSIX_CTRL);
1807		if (msi_ctrl & PCI_MSIX_ENABLE_BIT) {
1808			msi_ctrl &= ~PCI_MSIX_ENABLE_BIT;
1809			pci_config_put16(handle, cap_ptr + PCI_MSIX_CTRL,
1810			    msi_ctrl);
1811		}
1812	}
1813}
1814
/*
 * Return the local APIC id recorded in apic_cpus[] for cpuid.
 * The caller must pass cpuid < apic_nproc; this is only checked by ASSERT.
 */
uint32_t
apic_get_localapicid(uint32_t cpuid)
{
	ASSERT(cpuid < apic_nproc && apic_cpus != NULL);

	return (apic_cpus[cpuid].aci_local_id);
}
1822
/*
 * Return the I/O APIC id recorded in apic_io_id[] for ioapicindex.
 * The caller must pass ioapicindex < MAX_IO_APIC; this is only checked by
 * ASSERT.
 */
uchar_t
apic_get_ioapicid(uchar_t ioapicindex)
{
	ASSERT(ioapicindex < MAX_IO_APIC);

	return (apic_io_id[ioapicindex]);
}
1830