/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 * Copyright 2016 PALO, Richard.
 */

/*
 * PSMI 1.1 extensions are supported only in 2.6 and later versions.
 * PSMI 1.2 extensions are supported only in 2.7 and later versions.
 * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
 * PSMI 1.5 extensions are supported in Solaris Nevada.
 * PSMI 1.6 extensions are supported in Solaris Nevada.
 * PSMI 1.7 extensions are supported in Solaris Nevada.
 */
#define	PSMI_1_7

#include <sys/processor.h>
#include <sys/time.h>
#include <sys/psm.h>
#include <sys/smp_impldefs.h>
#include <sys/cram.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/psm_common.h>
#include <sys/apic.h>
#include <sys/apic_common.h>
#include <sys/pit.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/pci.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
#include <sys/cpc_impl.h>
#include <sys/uadmin.h>
#include <sys/panic.h>
#include <sys/debug.h>
#include <sys/archsystm.h>
#include <sys/trap.h>
#include <sys/machsystm.h>
#include <sys/cpuvar.h>
#include <sys/rm_platter.h>
#include <sys/privregs.h>
#include <sys/cyclic.h>
#include <sys/note.h>
#include <sys/pci_intr_lib.h>
#include <sys/sunndi.h>


/*
 *      Local Function Prototypes
 */
static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
static void apic_xlate_vector_free_timeout_handler(void *arg);
static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep);
static int apic_setup_irq_table(dev_info_t *dip, int irqno,
    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
    int type);
static void apic_try_deferred_reprogram(int ipl, int vect);
static void delete_defer_repro_ent(int which_irq);
static void apic_ioapic_wait_pending_clear(int ioapicindex,
    int intin_no);

extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
    int ipin, int *pci_irqp, iflag_t *intr_flagp);
extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
    int child_ipin, struct apic_io_intr **intrp);
extern uchar_t acpi_find_ioapic(int irq);
extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
extern int apic_find_bus_id(int bustype);
extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);

extern	int apic_sci_vect;
extern	iflag_t apic_sci_flags;
extern	int	apic_intr_policy;
extern	char *psm_name;

/*
 * Maximum value of a uchar_t; NBBY (the number of bits per byte) comes
 * from <sys/param.h>.
 */
#define	UCHAR_MAX	((1 << NBBY) - 1)

/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
extern int apic_max_reps_clear_pending;

/*
 * apic_reprogram_info is indexed by IRQ number, NOT by vector number;
 * the irq # is implicit in the array index.  APIC_MAX_VECTOR + 1 is
 * the maximum # of IRQs as well.
 */
struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];

extern	int	apic_int_busy_mark;
extern	int	apic_int_free_mark;
extern	int	apic_diff_for_redistribution;
extern	int	apic_sample_factor_redistribution;
extern	int	apic_redist_cpu_skip;
extern	int	apic_num_imbalance;
extern	int	apic_num_rebind;

/* timeout for xlate_vector, mark_vector */
int	apic_revector_timeout = 16 * 10000; /* 160 millisec */

extern int	apic_defconf;
extern int	apic_irq_translate;

extern int	apic_use_acpi_madt_only;	/* 1=ONLY use MADT from ACPI */

extern	uchar_t	apic_io_vectbase[MAX_IO_APIC];

extern	boolean_t ioapic_mask_workaround[MAX_IO_APIC];

/*
 * First available slot to be used as IRQ index into the apic_irq_table
 * for those interrupts (like MSI/X) that don't have a physical IRQ.
 */
extern int apic_first_avail_irq;

/*
 * apic_defer_reprogram_lock ensures that only one processor is handling
 * deferred interrupt programming at *_intr_exit time.
 */
static	lock_t	apic_defer_reprogram_lock;

/*
 * The current number of deferred reprogrammings outstanding
 */
uint_t	apic_reprogram_outstanding = 0;

#ifdef DEBUG
/*
 * Counters that keep track of deferred reprogramming stats
 */
uint_t	apic_intr_deferrals = 0;
uint_t	apic_intr_deliver_timeouts = 0;
uint_t	apic_last_ditch_reprogram_failures = 0;
uint_t	apic_deferred_setup_failures = 0;
uint_t	apic_defer_repro_total_retries = 0;
uint_t	apic_defer_repro_successes = 0;
uint_t	apic_deferred_spurious_enters = 0;
#endif

extern	int	apic_io_max;
extern	struct apic_io_intr *apic_io_intrp;

uchar_t	apic_vector_to_irq[APIC_MAX_VECTOR+1];

extern	uint32_t	eisa_level_intr_mask;
	/* At least MSB will be set if EISA bus */

extern	int	apic_pci_bus_total;
extern	uchar_t	apic_single_pci_busid;

/*
 * Following declarations are for revectoring; used when ISRs at different
 * IPLs share an irq.
 */
static	lock_t	apic_revector_lock;
int	apic_revector_pending = 0;
static	uchar_t	*apic_oldvec_to_newvec;
static	uchar_t	*apic_newvec_to_oldvec;

/* ACPI Interrupt Source Override Structure ptr */
ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop;
extern	int acpi_iso_cnt;

/*
 * Auto-configuration routines
 */

/*
 * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable
 * are also set to NULL. vector->irq is set to a value which cannot map
 * to a real irq to show that it is free.
 */
void
apic_init_common(void)
{
	int	i, j, indx;
	int	*iptr;

	/*
	 * Initialize apic_ipls from apic_vectortoipl.  This array is
	 * used in apic_intr_enter to determine the IPL to use for the
	 * corresponding vector.  On some systems, due to hardware errata
	 * and interrupt sharing, the IPL may not correspond to the IPL listed
	 * in apic_vectortoipl (see apic_addspl and apic_delspl).
	 */
	for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) {
		indx = i * APIC_VECTOR_PER_IPL;

		for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++)
			apic_ipls[indx] = apic_vectortoipl[i];
	}

	/* cpu 0 is always up (for now) */
	apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;

	iptr = (int *)&apic_irq_table[0];
	for (i = 0; i <= APIC_MAX_VECTOR; i++) {
		apic_level_intr[i] = 0;
		*iptr++ = 0;
		apic_vector_to_irq[i] = APIC_RESV_IRQ;

		/* These *must* be initted to B_TRUE! */
		apic_reprogram_info[i].done = B_TRUE;
		apic_reprogram_info[i].irqp = NULL;
		apic_reprogram_info[i].tries = 0;
		apic_reprogram_info[i].bindcpu = 0;
	}

	/*
	 * Allocate a dummy irq table entry for the reserved entry.
	 * This takes care of the race between removing an irq and
	 * clock detecting a CPU in that irq during interrupt load
	 * sampling.
	 */
	apic_irq_table[APIC_RESV_IRQ] =
	    kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);

	mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
}

void
ioapic_init_intr(int mask_apic)
{
	int ioapic_ix;
	struct intrspec ispec;
	apic_irq_t *irqptr;
	int i, j;
	ulong_t iflag;

	LOCK_INIT_CLEAR(&apic_revector_lock);
	LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);

	/* mask interrupt vectors */
	for (j = 0; j < apic_io_max && mask_apic; j++) {
		int intin_max;

		ioapic_ix = j;
		/* Bits 23-16 define the maximum redirection entries */
		intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
		    & 0xff;
		for (i = 0; i <= intin_max; i++)
			ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
	}

	/*
	 * Hack alert: deal with ACPI SCI interrupt chicken/egg here
	 */
	if (apic_sci_vect > 0) {
		/*
		 * acpica has already done add_avintr(); we just need
		 * to finish the job by mimicking translate_irq().
		 *
		 * Fake up an intrspec and set up the tables.
		 */
		ispec.intrspec_vec = apic_sci_vect;
		ispec.intrspec_pri = SCI_IPL;

		if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
		    &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
			cmn_err(CE_WARN, "!apic: SCI setup failed");
			return;
		}
		irqptr = apic_irq_table[apic_sci_vect];

		iflag = intr_clear();
		lock_set(&apic_ioapic_lock);

		/* Program I/O APIC */
		(void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);

		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);

		irqptr->airq_share++;
	}
}

/*
 * Add mask bits to prevent the interrupt vector from firing at or
 * above the given IPL. In addition, remove mask bits to enable
 * interrupt vectors below the given IPL.
 *
 * Both addspl and delspl are complicated by the fact that different
 * interrupts may share IRQs. This can happen in two ways.
 * 1. The same H/W line is shared by more than 1 device
 * 1a. with interrupts at different IPLs
 * 1b. with interrupts at same IPL
 * 2. We ran out of vectors at a given IPL and started sharing vectors.
 * 1b and 2 should be handled gracefully, except for the fact that some
 * ISRs will get called often when no interrupt is pending for the device.
 * For 1a, we handle it at the higher IPL.
 */
/*ARGSUSED*/
int
apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	ulong_t iflag;
	apic_irq_t *irqptr, *irqheadptr;
	int irqindex;

	ASSERT(max_ipl <= UCHAR_MAX);
	irqindex = IRQINDEX(irqno);

	if ((irqindex == -1) || (!apic_irq_table[irqindex]))
		return (PSM_FAILURE);

	mutex_enter(&airq_mutex);
	irqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		irqptr = irqptr->airq_next;
	}
	irqptr->airq_share++;

	mutex_exit(&airq_mutex);

	/* return if it is not a hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	/* Or if there are more interrupts at a higher IPL */
	if (ipl != max_ipl)
		return (PSM_SUCCESS);

	/*
	 * if apic_picinit() has not been called yet, just return.
	 * At the end of apic_picinit(), we will call setup_io_intr().
	 */

	if (!apic_picinit_called)
		return (PSM_SUCCESS);

	/*
	 * Upgrade the vector if max_ipl is not the IRQ's current ipl.
	 * If we cannot allocate a new vector, return failure.
	 */
	if (irqptr->airq_ipl != max_ipl &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		vector = apic_allocate_vector(max_ipl, irqindex, 1);
		if (vector == 0) {
			irqptr->airq_share--;
			return (PSM_FAILURE);
		}
		irqptr = irqheadptr;
		apic_mark_vector(irqptr->airq_vector, vector);
		while (irqptr) {
			irqptr->airq_vector = vector;
			irqptr->airq_ipl = (uchar_t)max_ipl;
			/*
			 * reprogram irq being added and every one else
			 * who is not in the UNINIT state
			 */
			if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
			    irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
				apic_record_rdt_entry(irqptr, irqindex);

				iflag = intr_clear();
				lock_set(&apic_ioapic_lock);

				(void) apic_setup_io_intr(irqptr, irqindex,
				    B_FALSE);

				lock_clear(&apic_ioapic_lock);
				intr_restore(iflag);
			}
			irqptr = irqptr->airq_next;
		}
		return (PSM_SUCCESS);

	} else if (irqptr->airq_ipl != max_ipl &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		/*
		 * We cannot upgrade the vector, but we can change
		 * the IPL that this vector induces.
		 *
		 * Note that we subtract APIC_BASE_VECT from the vector
		 * here because this array is used in apic_intr_enter
		 * (no need to add APIC_BASE_VECT in that hot code
		 * path since we can do it in the rarely-executed path
		 * here).
		 */
		apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
		    (uchar_t)max_ipl;

		irqptr = irqheadptr;
		while (irqptr) {
			irqptr->airq_ipl = (uchar_t)max_ipl;
			irqptr = irqptr->airq_next;
		}

		return (PSM_SUCCESS);
	}

	ASSERT(irqptr);

	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	(void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);

	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	return (PSM_SUCCESS);
}

/*
 * Recompute mask bits for the given interrupt vector.
 * If there is no interrupt servicing routine for this
 * vector, this function should disable the interrupt
 * vector from firing at all IPLs. If there are still
 * handlers using the given vector, this function should
 * disable the given vector from firing below the lowest
 * IPL of the remaining handlers.
 */
/*ARGSUSED*/
int
apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
{
	uchar_t vector;
	uint32_t bind_cpu;
	int intin, irqindex;
	int ioapic_ix;
	apic_irq_t	*irqptr, *preirqptr, *irqheadptr, *irqp;
	ulong_t iflag;

	mutex_enter(&airq_mutex);
	irqindex = IRQINDEX(irqno);
	irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];

	DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
	    "vector=0x%x\n", (void *)irqptr->airq_dip,
	    irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));

	while (irqptr) {
		if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
			break;
		preirqptr = irqptr;
		irqptr = irqptr->airq_next;
	}
	ASSERT(irqptr);

	irqptr->airq_share--;

	mutex_exit(&airq_mutex);

	/*
	 * If there are more interrupts at a higher IPL, we don't need
	 * to disable anything.
	 */
	if (ipl < max_ipl)
		return (PSM_SUCCESS);

	/* return if it is not a hardware interrupt */
	if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
		return (PSM_SUCCESS);

	if (!apic_picinit_called) {
		/*
		 * Clear the irq struct. If two devices shared an interrupt
		 * line and one unloaded before picinit, we are hosed; we
		 * can only hope the machine survives.
		 */
		irqptr->airq_mps_intr_index = FREE_INDEX;
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		apic_free_vector(irqptr->airq_vector);
		return (PSM_SUCCESS);
	}
	/*
	 * Downgrade vector to new max_ipl if needed. If we cannot allocate,
	 * use old IPL. Not very elegant, but it should work.
	 */
	if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
	    !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
		if ((vector = apic_allocate_vector(max_ipl, irqno, 1)) != 0) {
			apic_mark_vector(irqheadptr->airq_vector, vector);
			irqp = irqheadptr;
			while (irqp) {
				irqp->airq_vector = vector;
				irqp->airq_ipl = (uchar_t)max_ipl;
				if (irqp->airq_temp_cpu != IRQ_UNINIT) {
					apic_record_rdt_entry(irqp, irqindex);

					iflag = intr_clear();
					lock_set(&apic_ioapic_lock);

					(void) apic_setup_io_intr(irqp,
					    irqindex, B_FALSE);

					lock_clear(&apic_ioapic_lock);
					intr_restore(iflag);
				}
				irqp = irqp->airq_next;
			}
		}

	} else if (irqptr->airq_ipl != max_ipl &&
	    max_ipl != PSM_INVALID_IPL &&
	    ioapic_mask_workaround[irqptr->airq_ioapicindex]) {

		/*
		 * We cannot downgrade the IPL of the vector below the
		 * vector's hardware priority. If we did, it would be
		 * possible for a higher-priority hardware vector to
		 * interrupt a CPU running at an IPL lower than the
		 * hardware priority of the interrupting vector (but
		 * higher than the soft IPL of this IRQ). When this
		 * happens, we would then try to drop the IPL BELOW what
		 * it was (effectively dropping below base_spl) which
		 * would be potentially catastrophic.
		 *
		 * (e.g. Suppose the hardware vector associated with this
		 * IRQ is 0x40 (hardware IPL of 4).  Further assume that
		 * the old IPL of this IRQ was 4, but the new IPL is 1.
		 * If we forced vector 0x40 to result in an IPL of 1, it
		 * would be possible for the processor to be executing at
		 * IPL 3 and for an interrupt to come in on vector 0x40,
		 * interrupting the currently-executing ISR.  When
		 * apic_intr_enter consults apic_irqs[], it will return 1,
		 * bringing the IPL of the CPU down to 1 so even though
		 * the processor was running at IPL 4, an IPL 1 interrupt
		 * will have interrupted it, which must not happen.)
		 *
		 * Effectively, this means that the hardware priority
		 * corresponding to the IRQ's IPL (in apic_ipls[]) cannot
		 * be lower than the vector's hardware priority.
		 *
		 * (In the above example, then, after removal of the IPL 4
		 * device's interrupt handler, the new IPL will continue
		 * to be 4 because the hardware priority that IPL 1
		 * implies is lower than the hardware priority of the
		 * vector used.)
		 */
		/* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
		const int apic_ipls_index = irqptr->airq_vector -
		    APIC_BASE_VECT;
		const int vect_inherent_hwpri = irqptr->airq_vector >>
		    APIC_IPL_SHIFT;

		/*
		 * If there are still devices using this IRQ, determine the
		 * new ipl to use.
		 */
		if (irqptr->airq_share) {
			int vect_desired_hwpri, hwpri;

			ASSERT(max_ipl < MAXIPL);
			vect_desired_hwpri = apic_ipltopri[max_ipl] >>
			    APIC_IPL_SHIFT;

			/*
			 * If the desired IPL's hardware priority is lower
			 * than that of the vector, use the hardware priority
			 * of the vector to determine the new IPL.
			 */
			hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
			    vect_inherent_hwpri : vect_desired_hwpri;

			/*
			 * Now, to get the right index for apic_vectortoipl,
			 * we need to subtract APIC_BASE_VECT from the
			 * hardware-vector-equivalent (in hwpri).  Since hwpri
			 * is already shifted, we shift APIC_BASE_VECT before
			 * doing the subtraction.
			 */
			hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);

			ASSERT(hwpri >= 0);
			ASSERT(hwpri < MAXIPL);
			max_ipl = apic_vectortoipl[hwpri];
			apic_ipls[apic_ipls_index] = max_ipl;

			irqp = irqheadptr;
			while (irqp) {
				irqp->airq_ipl = (uchar_t)max_ipl;
				irqp = irqp->airq_next;
			}
		} else {
			/*
			 * No more devices on this IRQ, so reset this vector's
			 * element in apic_ipls to the original IPL for this
			 * vector
			 */
			apic_ipls[apic_ipls_index] =
			    apic_vectortoipl[vect_inherent_hwpri];
		}
	}

	/*
	 * If there are still active interrupts, we are done.
	 */
	if (irqptr->airq_share)
		return (PSM_SUCCESS);

	iflag = intr_clear();
	lock_set(&apic_ioapic_lock);

	if (irqptr->airq_mps_intr_index == MSI_INDEX) {
		/*
		 * Disable the MSI vector.
		 * Make sure we only disable on the last of the
		 * multi-MSI vectors.
		 */
		if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
			apic_pci_msi_disable_mode(irqptr->airq_dip,
			    DDI_INTR_TYPE_MSI);
		}
	} else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
		/*
		 * Disable the MSI-X vector; the mask and addr/data
		 * need to be cleared for each MSI-X.
		 */
		apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
		    irqptr->airq_origirq);
		/*
		 * Make sure we only disable on the last MSI-X.
		 */
		if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
			apic_pci_msi_disable_mode(irqptr->airq_dip,
			    DDI_INTR_TYPE_MSIX);
		}
	} else {
		/*
		 * The assumption here is that this is safe, even for
		 * systems with IOAPICs that suffer from the hardware
		 * erratum because all devices have been quiesced before
		 * they unregister their interrupt handlers.  If that
		 * assumption turns out to be false, this mask operation
		 * can induce the same erratum result we're trying to
		 * avoid.
		 */
		ioapic_ix = irqptr->airq_ioapicindex;
		intin = irqptr->airq_intin_no;
		ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
	}

	/*
	 * This irq entry is the only one in the chain.
	 */
	if (irqheadptr->airq_next == NULL) {
		ASSERT(irqheadptr == irqptr);
		bind_cpu = irqptr->airq_temp_cpu;
		if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
		    ((uint32_t)bind_cpu != IRQ_UNINIT)) {
			ASSERT(apic_cpu_in_range(bind_cpu));
			if (bind_cpu & IRQ_USER_BOUND) {
				/* If hardbound, temp_cpu == cpu */
				bind_cpu &= ~IRQ_USER_BOUND;
				apic_cpus[bind_cpu].aci_bound--;
			} else
				apic_cpus[bind_cpu].aci_temp_bound--;
		}
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		irqptr->airq_mps_intr_index = FREE_INDEX;
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
		apic_free_vector(irqptr->airq_vector);
		return (PSM_SUCCESS);
	}

	/*
	 * If we get here, we are sharing the vector and there is more
	 * than one active irq entry in the chain.
	 */
	lock_clear(&apic_ioapic_lock);
	intr_restore(iflag);

	mutex_enter(&airq_mutex);
	/* Remove the irq entry from the chain */
	if (irqptr == irqheadptr) { /* The irq entry is at the head */
		apic_irq_table[irqindex] = irqptr->airq_next;
	} else {
		preirqptr->airq_next = irqptr->airq_next;
	}
	/* Free the irq entry */
	kmem_free(irqptr, sizeof (apic_irq_t));
	mutex_exit(&airq_mutex);

	return (PSM_SUCCESS);
}

/*
 * apic_introp_xlate() replaces apic_translate_irq() and is
 * called only from apic_intr_ops().  With the new ADII framework,
 * the priority can no longer be retrieved through i_ddi_get_intrspec().
 * It has to be passed in from the caller.
 *
 * Return value:
 *      Success: irqno for the given device
 *      Failure: -1
 */
int
apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
{
	char dev_type[16];
	int dev_len, pci_irq, newirq, bustype, devid, busid, i;
	int irqno = ispec->intrspec_vec;
	ddi_acc_handle_t cfg_handle;
	uchar_t ipin;
	struct apic_io_intr *intrp;
	iflag_t intr_flag;
	ACPI_SUBTABLE_HEADER	*hp;
	ACPI_MADT_INTERRUPT_OVERRIDE *isop;
	apic_irq_t *airqp;
	int parent_is_pci_or_pciex = 0;
	int child_is_pciex = 0;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
	    "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
	    irqno));

	dev_len = sizeof (dev_type);
	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
	    DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
	    &dev_len) == DDI_PROP_SUCCESS) {
		if ((strcmp(dev_type, "pci") == 0) ||
		    (strcmp(dev_type, "pciex") == 0))
			parent_is_pci_or_pciex = 1;
	}

	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
	    &dev_len) == DDI_PROP_SUCCESS) {
		if (strstr(dev_type, "pciex"))
			child_is_pciex = 1;
	}

	if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
		if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
			airqp->airq_iflag.bustype =
			    child_is_pciex ? BUS_PCIE : BUS_PCI;
			return (apic_vector_to_irq[airqp->airq_vector]);
		}
		return (apic_setup_irq_table(dip, irqno, NULL, ispec,
		    NULL, type));
	}

	bustype = 0;

	/* check if we have already translated this irq */
	mutex_enter(&airq_mutex);
	newirq = apic_min_device_irq;
	for (; newirq <= apic_max_device_irq; newirq++) {
		airqp = apic_irq_table[newirq];
		while (airqp) {
			if ((airqp->airq_dip == dip) &&
			    (airqp->airq_origirq == irqno) &&
			    (airqp->airq_mps_intr_index != FREE_INDEX)) {

				mutex_exit(&airq_mutex);
				return (VIRTIRQ(newirq, airqp->airq_share_id));
			}
			airqp = airqp->airq_next;
		}
	}
	mutex_exit(&airq_mutex);

	if (apic_defconf)
		goto defconf;

	if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
		goto nonpci;

	if (parent_is_pci_or_pciex) {
		/* pci device */
		if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
			goto nonpci;
		if (busid == 0 && apic_pci_bus_total == 1)
			busid = (int)apic_single_pci_busid;

		if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
			return (-1);
		ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
		pci_config_teardown(&cfg_handle);
		if (apic_enable_acpi && !apic_use_acpi_madt_only) {
			if (apic_acpi_translate_pci_irq(dip, busid, devid,
			    ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
				return (-1);

			intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
			return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
			    &intr_flag, type));
		} else {
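			/*
			 * No ACPI translation available; fall back to the
			 * MP-spec encoding of a PCI source bus IRQ:
			 * bits 0-1 are the INTx# pin and bits 2-6 the
			 * device number.
			 */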
			pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
			if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
			    == NULL) {
				if ((pci_irq = apic_handle_pci_pci_bridge(dip,
				    devid, ipin, &intrp)) == -1)
					return (-1);
			}
			return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
			    NULL, type));
		}
	} else if (strcmp(dev_type, "isa") == 0)
		bustype = BUS_ISA;
	else if (strcmp(dev_type, "eisa") == 0)
		bustype = BUS_EISA;

nonpci:
	if (apic_enable_acpi && !apic_use_acpi_madt_only) {
		/* search iso entries first */
		if (acpi_iso_cnt != 0) {
			hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
			i = 0;
			while (i < acpi_iso_cnt) {
				if (hp->Type ==
				    ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
					isop =
					    (ACPI_MADT_INTERRUPT_OVERRIDE *)hp;
					if (isop->Bus == 0 &&
					    isop->SourceIrq == irqno) {
						newirq = isop->GlobalIrq;
						intr_flag.intr_po =
						    isop->IntiFlags &
						    ACPI_MADT_POLARITY_MASK;
						intr_flag.intr_el =
						    (isop->IntiFlags &
						    ACPI_MADT_TRIGGER_MASK)
						    >> 2;
						intr_flag.bustype = BUS_ISA;

						return (apic_setup_irq_table(
						    dip, newirq, NULL, ispec,
						    &intr_flag, type));

					}
					i++;
				}
				hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
				    hp->Length);
			}
		}
		intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
		intr_flag.intr_el = INTR_EL_EDGE;
		intr_flag.bustype = BUS_ISA;
		return (apic_setup_irq_table(dip, irqno, NULL, ispec,
		    &intr_flag, type));
	} else {
		if (bustype == 0)	/* not initialized */
			bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
		for (i = 0; i < 2; i++) {
			if (((busid = apic_find_bus_id(bustype)) != -1) &&
			    ((intrp = apic_find_io_intr_w_busid(irqno, busid))
			    != NULL)) {
				if ((newirq = apic_setup_irq_table(dip, irqno,
				    intrp, ispec, NULL, type)) != -1) {
					return (newirq);
				}
				goto defconf;
			}
			bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA;
		}
	}

/* MPS default configuration */
defconf:
	newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
	if (newirq == -1)
		return (-1);
	ASSERT(IRQINDEX(newirq) == irqno);
	ASSERT(apic_irq_table[irqno]);
	return (newirq);
}

/*
 * Attempt to share vector with someone else
 */
static int
apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
    uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
{
#ifdef DEBUG
	apic_irq_t *tmpirqp = NULL;
#endif /* DEBUG */
	apic_irq_t *irqptr, dummyirq;
	int	newirq, chosen_irq = -1, share = 127;
	int	lowest, highest, i;
	uchar_t	share_id;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
	    "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));

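	/*
	 * Only vectors whose priority maps to the requested IPL are
	 * sharing candidates: scan from the lowest vector above the
	 * previous IPL's priority base up to the highest vector at
	 * this IPL's priority.
	 */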
	highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
	lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;

	if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
		lowest -= APIC_VECTOR_PER_IPL;
	dummyirq.airq_mps_intr_index = intr_index;
	dummyirq.airq_ioapicindex = ioapicindex;
	dummyirq.airq_intin_no = ipin;
	if (intr_flagp)
		dummyirq.airq_iflag = *intr_flagp;
	apic_record_rdt_entry(&dummyirq, irqno);
	for (i = lowest; i <= highest; i++) {
		newirq = apic_vector_to_irq[i];
		if (newirq == APIC_RESV_IRQ)
			continue;
		irqptr = apic_irq_table[newirq];

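		/*
		 * Compare everything but the vector: the high byte of
		 * the recorded RDT entry (delivery mode, polarity,
		 * trigger mode) must match for two interrupts to be
		 * able to share a vector.
		 */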
		if ((dummyirq.airq_rdt_entry & 0xFF00) !=
		    (irqptr->airq_rdt_entry & 0xFF00))
			/* not compatible */
			continue;

		if (irqptr->airq_share < share) {
			share = irqptr->airq_share;
			chosen_irq = newirq;
		}
	}
	if (chosen_irq != -1) {
		/*
		 * Assign a share id which is free or which is larger
		 * than the largest one.
		 */
		share_id = 1;
		mutex_enter(&airq_mutex);
		irqptr = apic_irq_table[chosen_irq];
		while (irqptr) {
			if (irqptr->airq_mps_intr_index == FREE_INDEX) {
				share_id = irqptr->airq_share_id;
				break;
			}
			if (share_id <= irqptr->airq_share_id)
				share_id = irqptr->airq_share_id + 1;
#ifdef DEBUG
			tmpirqp = irqptr;
#endif /* DEBUG */
			irqptr = irqptr->airq_next;
		}
		if (!irqptr) {
			irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
			irqptr->airq_temp_cpu = IRQ_UNINIT;
			irqptr->airq_next =
			    apic_irq_table[chosen_irq]->airq_next;
			apic_irq_table[chosen_irq]->airq_next = irqptr;
#ifdef	DEBUG
			tmpirqp = apic_irq_table[chosen_irq];
#endif /* DEBUG */
		}
		irqptr->airq_mps_intr_index = intr_index;
		irqptr->airq_ioapicindex = ioapicindex;
		irqptr->airq_intin_no = ipin;
		if (intr_flagp)
			irqptr->airq_iflag = *intr_flagp;
		irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector;
		irqptr->airq_share_id = share_id;
		apic_record_rdt_entry(irqptr, irqno);
		*irqptrp = irqptr;
#ifdef	DEBUG
		/* shuffle the pointers to test apic_delspl path */
		if (tmpirqp) {
			tmpirqp->airq_next = irqptr->airq_next;
			irqptr->airq_next = apic_irq_table[chosen_irq];
			apic_irq_table[chosen_irq] = irqptr;
		}
#endif /* DEBUG */
		mutex_exit(&airq_mutex);
		return (VIRTIRQ(chosen_irq, share_id));
	}
	return (-1);
}

/*
 * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
 * is used already, we will try to allocate a new irqno.
 *
 * Return value:
 *	Success: irqno
 *	Failure: -1
 */
static int
apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
    struct intrspec *ispec, iflag_t *intr_flagp, int type)
{
	int origirq = ispec->intrspec_vec;
	uchar_t ipl = ispec->intrspec_pri;
	int	newirq, intr_index;
	uchar_t	ipin, ioapic, ioapicindex, vector;
	apic_irq_t *irqptr;
	major_t	major;
	dev_info_t	*sdip;

	DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
	    "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));

	ASSERT(ispec != NULL);

	major = (dip != NULL) ? ddi_driver_major(dip) : 0;

	if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
		/* MSI/X doesn't need to set up I/O APIC entries */
		ioapicindex = 0xff;
		ioapic = 0xff;
		ipin = (uchar_t)0xff;
		intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
		    MSIX_INDEX;
		mutex_enter(&airq_mutex);
		if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
			mutex_exit(&airq_mutex);
			/* need an irq for MSI/X to index into autovect[] */
			cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
			    ddi_get_name(dip), ddi_get_instance(dip));
			return (-1);
		}
		mutex_exit(&airq_mutex);

	} else if (intrp != NULL) {
		intr_index = (int)(intrp - apic_io_intrp);
		ioapic = intrp->intr_destid;
		ipin = intrp->intr_destintin;
		/* Find ioapicindex. If destid was ALL, we will exit with 0. */
		for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
			if (apic_io_id[ioapicindex] == ioapic)
				break;
		ASSERT((ioapic == apic_io_id[ioapicindex]) ||
		    (ioapic == INTR_ALL_APIC));

		/* check whether this intin# has been used by another irqno */
		if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
			return (newirq);
		}

	} else if (intr_flagp != NULL) {
		/* ACPI case */
		intr_index = ACPI_INDEX;
		ioapicindex = acpi_find_ioapic(irqno);
		ASSERT(ioapicindex != 0xFF);
		ioapic = apic_io_id[ioapicindex];
		ipin = irqno - apic_io_vectbase[ioapicindex];
		if (apic_irq_table[irqno] &&
		    apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
			ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
			    apic_irq_table[irqno]->airq_ioapicindex ==
			    ioapicindex);
			return (irqno);
		}

	} else {
		/* default configuration */
		ioapicindex = 0;
		ioapic = apic_io_id[ioapicindex];
		ipin = (uchar_t)irqno;
		intr_index = DEFAULT_INDEX;
	}

	if (ispec == NULL) {
		APIC_VERBOSE_IOAPIC((CE_WARN, "No intrspec for irqno = %x\n",
		    irqno));
	} else if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
		if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
		    ipl, ioapicindex, ipin, &irqptr)) != -1) {
			irqptr->airq_ipl = ipl;
			irqptr->airq_origirq = (uchar_t)origirq;
			irqptr->airq_dip = dip;
			irqptr->airq_major = major;
			sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
			/* This is really OK to do */
			if (sdip == NULL) {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and SCI",
				    ddi_get_name(dip), ddi_get_instance(dip));
			} else {
				cmn_err(CE_WARN, "Sharing vectors: %s"
				    " instance %d and %s instance %d",
				    ddi_get_name(sdip), ddi_get_instance(sdip),
				    ddi_get_name(dip), ddi_get_instance(dip));
			}
			return (newirq);
		}
		/* try high priority allocation now that share has failed */
		if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
			cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
			    ddi_get_name(dip), ddi_get_instance(dip));
			return (-1);
		}
	}

	mutex_enter(&airq_mutex);
	if (apic_irq_table[irqno] == NULL) {
		irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
		irqptr->airq_temp_cpu = IRQ_UNINIT;
		apic_irq_table[irqno] = irqptr;
	} else {
		irqptr = apic_irq_table[irqno];
		if (irqptr->airq_mps_intr_index != FREE_INDEX) {
			/*
			 * The slot is used by another irqno, so allocate
			 * a free irqno for this interrupt
			 */
			newirq = apic_allocate_irq(apic_first_avail_irq);
			if (newirq == -1) {
				mutex_exit(&airq_mutex);
				return (-1);
			}
			irqno = newirq;
			irqptr = apic_irq_table[irqno];
			if (irqptr == NULL) {
				irqptr = kmem_zalloc(sizeof (apic_irq_t),
				    KM_SLEEP);
				irqptr->airq_temp_cpu = IRQ_UNINIT;
				apic_irq_table[irqno] = irqptr;
			}
			vector = apic_modify_vector(vector, newirq);
		}
	}
	apic_max_device_irq = max(irqno, apic_max_device_irq);
	apic_min_device_irq = min(irqno, apic_min_device_irq);
	mutex_exit(&airq_mutex);
	irqptr->airq_ioapicindex = ioapicindex;
	irqptr->airq_intin_no = ipin;
	irqptr->airq_ipl = ipl;
	irqptr->airq_vector = vector;
	irqptr->airq_origirq = (uchar_t)origirq;
	irqptr->airq_share_id = 0;
	irqptr->airq_mps_intr_index = (short)intr_index;
	irqptr->airq_dip = dip;
	irqptr->airq_major = major;
	irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
	if (intr_flagp)
		irqptr->airq_iflag = *intr_flagp;

	if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
		/* setup I/O APIC entry for non-MSI/X interrupts */
		apic_record_rdt_entry(irqptr, irqno);
	}
	return (irqno);
}

/*
 * return the cpu to which this intr should be bound.
 * Check properties or any other mechanism to see if user wants it
 * bound to a specific CPU. If so, return the cpu id with high bit set.
 * If not, use the policy to choose a cpu and return the id.
 */
uint32_t
apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
{
	int	instance, instno, prop_len, bind_cpu, count;
	uint_t	i, rc;
	uint32_t cpu;
	major_t	major;
	char	*name, *drv_name, *prop_val, *cptr;
	char	prop_name[32];
	ulong_t iflag;


	if (apic_intr_policy == INTR_LOWEST_PRIORITY)
		return (IRQ_UNBOUND);

	if (apic_nproc == 1)
		return (0);

	drv_name = NULL;
	rc = DDI_PROP_NOT_FOUND;
	major = (major_t)-1;
	if (dip != NULL) {
		name = ddi_get_name(dip);
		major = ddi_name_to_major(name);
		drv_name = ddi_major_to_name(major);
		instance = ddi_get_instance(dip);
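		/*
		 * Under round-robin-with-affinity, reuse the CPU to
		 * which another non-user-bound interrupt of the same
		 * driver is already bound, so that a driver's
		 * interrupts are kept together.
		 */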
		if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
			i = apic_min_device_irq;
			for (; i <= apic_max_device_irq; i++) {

				if ((i == irq) || (apic_irq_table[i] == NULL) ||
				    (apic_irq_table[i]->airq_mps_intr_index
				    == FREE_INDEX))
					continue;

				if ((apic_irq_table[i]->airq_major == major) &&
				    (!(apic_irq_table[i]->airq_cpu &
				    IRQ_USER_BOUND))) {

					cpu = apic_irq_table[i]->airq_cpu;

					cmn_err(CE_CONT,
					    "!%s: %s (%s) instance #%d "
					    "irq 0x%x vector 0x%x ioapic 0x%x "
					    "intin 0x%x is bound to cpu %d\n",
					    psm_name,
					    name, drv_name, instance, irq,
					    apic_irq_table[irq]->airq_vector,
					    ioapicid, intin, cpu);
					return (cpu);
				}
			}
		}
		/*
		 * Search for the "drvname"_intpt_bind_cpus property first.
		 * The syntax of the property should be "a[,b,c,...]" where
		 * instance 0 binds to cpu a, instance 1 binds to cpu b,
		 * instance 2 binds to cpu c, and so on.
		 * ddi_getlongprop() will search /option first, then /.
		 * If "drvname"_intpt_bind_cpus doesn't exist, then look for
		 * the generic intpt_bind_cpus property.  Its syntax is the
		 * same, and it applies to all devices for which a "drvname"
		 * specific property doesn't exist.
		 */
		(void) strcpy(prop_name, drv_name);
		(void) strcat(prop_name, "_intpt_bind_cpus");
		rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
		    (caddr_t)&prop_val, &prop_len);
		if (rc != DDI_PROP_SUCCESS) {
			rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
			    "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
		}
	}
	if (rc == DDI_PROP_SUCCESS) {
		for (i = count = 0; i < (prop_len - 1); i++)
			if (prop_val[i] == ',')
				count++;
		if (prop_val[i-1] != ',')
			count++;
		/*
		 * if somehow the binding instances defined in the
		 * property are not enough for this instno., then
		 * reuse the pattern for the next instance until
		 * it reaches the requested instno
		 */
		instno = instance % count;
		i = 0;
		cptr = prop_val;
		while (i < instno)
			if (*cptr++ == ',')
				i++;
		bind_cpu = stoi(&cptr);
		/* if specific CPU is bogus, then default to next cpu */
		if (!apic_cpu_in_range(bind_cpu)) {
			cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
			    psm_name, prop_name, prop_val, bind_cpu);
			rc = DDI_PROP_NOT_FOUND;
		} else {
			/* indicate that we are bound at user request */
			bind_cpu |= IRQ_USER_BOUND;
		}
		/* free only after prop_val is no longer referenced */
		kmem_free(prop_val, prop_len);
		/*
		 * no need to check apic_cpus[].aci_status, if specific CPU is
		 * not up, then post_cpu_start will handle it.
		 */
	}
	if (rc != DDI_PROP_SUCCESS) {
		iflag = intr_clear();
		lock_set(&apic_ioapic_lock);
		bind_cpu = apic_get_next_bind_cpu();
		lock_clear(&apic_ioapic_lock);
		intr_restore(iflag);
	}

	if (drv_name != NULL)
		cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x "
		    "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
		    psm_name, name, drv_name, instance, irq,
		    apic_irq_table[irq]->airq_vector, ioapicid, intin,
		    bind_cpu & ~IRQ_USER_BOUND);
	else
		cmn_err(CE_CONT, "!%s: irq 0x%x "
		    "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
		    psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid,
		    intin, bind_cpu & ~IRQ_USER_BOUND);

	return ((uint32_t)bind_cpu);
}

/*
 * Mark the vector as being in the process of being deleted. Interrupts
 * may still come in on some CPU. The moment an interrupt comes in with
 * the new vector, we know we can free the old one. Called only from
 * addspl and delspl with interrupts disabled. Because the vector may be
 * shared while no interrupt from any sharing device comes in, we also
 * use a timeout mechanism, which we arbitrarily set to
 * apic_revector_timeout microseconds.
 */
static void
apic_mark_vector(uchar_t oldvector, uchar_t newvector)
{
	ulong_t iflag;

	iflag = intr_clear();
	lock_set(&apic_revector_lock);
	if (!apic_oldvec_to_newvec) {
		apic_oldvec_to_newvec =
		    kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2,
		    KM_NOSLEEP);

		if (!apic_oldvec_to_newvec) {
			/*
			 * This failure is not catastrophic.
			 * But, the oldvec will never be freed.
			 */
			apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
			lock_clear(&apic_revector_lock);
			intr_restore(iflag);
			return;
		}
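		/* The second half of the allocation is the reverse map. */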
		apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
	}

	/* See if we already did this for drivers which do double addintrs */
	if (apic_oldvec_to_newvec[oldvector] != newvector) {
		apic_oldvec_to_newvec[oldvector] = newvector;
		apic_newvec_to_oldvec[newvector] = oldvector;
		apic_revector_pending++;
	}
	lock_clear(&apic_revector_lock);
	intr_restore(iflag);
	(void) timeout(apic_xlate_vector_free_timeout_handler,
	    (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
}

/*
 * xlate_vector is called from intr_enter if revector_pending is set.
 * It will xlate it if needed and mark the old vector as free.
 */
uchar_t
apic_xlate_vector(uchar_t vector)
{
	uchar_t	newvector, oldvector = 0;

	lock_set(&apic_revector_lock);
	/* Do we really need to do this ? */
	if (!apic_revector_pending) {
		lock_clear(&apic_revector_lock);
		return (vector);
	}
	if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
		oldvector = vector;
	else {
		/*
		 * The incoming vector is new.  See if a stale entry
		 * remains.
		 */
		if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
			newvector = vector;
	}

	if (oldvector) {
		apic_revector_pending--;
		apic_oldvec_to_newvec[oldvector] = 0;
		apic_newvec_to_oldvec[newvector] = 0;
		apic_free_vector(oldvector);
		lock_clear(&apic_revector_lock);
		/* There could have been more than one reprogramming! */
		return (apic_xlate_vector(newvector));
	}
	lock_clear(&apic_revector_lock);
	return (vector);
}

void
apic_xlate_vector_free_timeout_handler(void *arg)
{
	ulong_t iflag;
	uchar_t oldvector, newvector;

	oldvector = (uchar_t)(uintptr_t)arg;
	iflag = intr_clear();
	lock_set(&apic_revector_lock);
	if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
		apic_free_vector(oldvector);
		apic_oldvec_to_newvec[oldvector] = 0;
		apic_newvec_to_oldvec[newvector] = 0;
		apic_revector_pending--;
	}

	lock_clear(&apic_revector_lock);
	intr_restore(iflag);
}

/*
 * Bind interrupt corresponding to irq_ptr to bind_cpu.
 * Must be called with interrupts disabled and apic_ioapic_lock held
 */
int
apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
    struct ioapic_reprogram_data *drep)
{
	int			ioapicindex, intin_no;
	uint32_t		airq_temp_cpu;
	apic_cpus_info_t	*cpu_infop;
	uint32_t		rdt_entry;
	int			which_irq;
	ioapic_rdt_t		irdt;

	which_irq = apic_vector_to_irq[irq_ptr->airq_vector];

	intin_no = irq_ptr->airq_intin_no;
	ioapicindex = irq_ptr->airq_ioapicindex;
	airq_temp_cpu = irq_ptr->airq_temp_cpu;
	if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
		if (airq_temp_cpu & IRQ_USER_BOUND)
			/* Mask off high bit so it can be used as array index */
			airq_temp_cpu &= ~IRQ_USER_BOUND;

		ASSERT(apic_cpu_in_range(airq_temp_cpu));
	}

	/*
	 * Can't bind to a CPU that's not accepting interrupts:
	 */
	cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
	if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
		return (1);

	/*
	 * If we are about to change the interrupt vector for this interrupt,
	 * and this interrupt is level-triggered, attached to an IOAPIC,
	 * has been delivered to a CPU and that CPU has not handled it
	 * yet, we cannot reprogram the IOAPIC now.
	 */
	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {

		rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
		    intin_no);

		if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
		    apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
		    bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {

			return (0);
		}

		/*
		 * NOTE: We do not unmask the RDT here, as an interrupt MAY
		 * still come in before we have a chance to reprogram it below.
		 * The reprogramming below will simultaneously change and
		 * unmask the RDT entry.
		 */

		if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
			irdt.ir_lo = AV_LDEST | AV_LOPRI |
			    irq_ptr->airq_rdt_entry;

			WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
			    AV_TOALL);

			if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
			    IRQ_UNBOUND)
				apic_cpus[airq_temp_cpu].aci_temp_bound--;

			/*
			 * Write the vector, trigger, and polarity portion of
			 * the RDT
			 */
			WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
			    irdt.ir_lo);

			irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
			return (0);
		}
	}

	if (bind_cpu & IRQ_USER_BOUND) {
		cpu_infop->aci_bound++;
	} else {
		cpu_infop->aci_temp_bound++;
	}
	ASSERT(apic_cpu_in_range(bind_cpu));

	if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
		apic_cpus[airq_temp_cpu].aci_temp_bound--;
	}
	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {

		irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
		irdt.ir_hi = cpu_infop->aci_local_id;

		/* Write the RDT entry -- bind to a specific CPU: */
		WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
		    irdt.ir_hi << APIC_ID_BIT_OFFSET);

		/* Write the vector, trigger, and polarity portion of the RDT */
		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
		    irdt.ir_lo);

	} else {
		int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
		    DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
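		/*
		 * For MSI, airq_ioapicindex and airq_intin_no do not
		 * refer to an I/O APIC: they appear to track the MSI
		 * group (base IRQ and count), which is how the first
		 * and last vectors of the group are recognized below.
		 */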
		if (type == DDI_INTR_TYPE_MSI) {
			if (irq_ptr->airq_ioapicindex ==
			    irq_ptr->airq_origirq) {
				/* first one */
				DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
				    "apic_pci_msi_enable_vector\n"));
				apic_pci_msi_enable_vector(irq_ptr,
				    type, which_irq, irq_ptr->airq_vector,
				    irq_ptr->airq_intin_no,
				    cpu_infop->aci_local_id);
			}
			if ((irq_ptr->airq_ioapicindex +
			    irq_ptr->airq_intin_no - 1) ==
			    irq_ptr->airq_origirq) { /* last one */
				DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
				    "apic_pci_msi_enable_mode\n"));
				apic_pci_msi_enable_mode(irq_ptr->airq_dip,
				    type, which_irq);
			}
		} else { /* MSI-X */
			apic_pci_msi_enable_vector(irq_ptr, type,
			    irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
			    cpu_infop->aci_local_id);
			apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
			    irq_ptr->airq_origirq);
		}
	}
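	/*
	 * Record the new binding and clear this CPU's bit in
	 * apic_redist_cpu_skip so that interrupt redistribution will
	 * consider this CPU again.
	 */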
	irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
	apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
	return (0);
}

static void
apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
{
	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
	    & AV_REMOTE_IRR) != 0) {
		/*
		 * Trying to clear the bit through normal
		 * channels has failed.  So as a last-ditch
		 * effort, try to set the trigger mode to
		 * edge, then to level.  This has been
		 * observed to work on many systems.
		 */
		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no,
		    READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) & ~AV_LEVEL);

		WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no,
		    READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) | AV_LEVEL);

		/*
		 * If the bit's STILL set, this interrupt may
		 * be hosed.
		 */
		if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
		    intin_no) & AV_REMOTE_IRR) != 0) {

			prom_printf("%s: Remote IRR still "
			    "not clear for IOAPIC %d intin %d.\n"
			    "\tInterrupts to this pin may cease "
			    "functioning.\n", psm_name, ioapic_ix,
			    intin_no);
#ifdef DEBUG
			apic_last_ditch_reprogram_failures++;
#endif
		}
	}
}

/*
 * This function is protected by apic_ioapic_lock coupled with the
 * fact that interrupts are disabled.
 */
static void
delete_defer_repro_ent(int which_irq)
{
	ASSERT(which_irq >= 0);
	ASSERT(which_irq <= 255);
	ASSERT(LOCK_HELD(&apic_ioapic_lock));

	if (apic_reprogram_info[which_irq].done)
		return;

	apic_reprogram_info[which_irq].done = B_TRUE;

#ifdef DEBUG
	apic_defer_repro_total_retries +=
	    apic_reprogram_info[which_irq].tries;

	apic_defer_repro_successes++;
#endif

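	/*
	 * If this was the last outstanding deferred reprogramming,
	 * restore setlvlx to the plain *_intr_exit handler.
	 */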
1608 	if (--apic_reprogram_outstanding == 0) {
1609 
1610 		setlvlx = psm_intr_exit_fn();
1611 	}
1612 }
1613 
1614 
1615 /*
1616  * Interrupts must be disabled during this function to prevent
1617  * self-deadlock.  Interrupts are disabled because this function
1618  * is called from apic_check_stuck_interrupt(), which is called
1619  * from apic_rebind(), which requires its caller to disable interrupts.
1620  */
1621 static void
1622 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
1623 {
1624 	ASSERT(which_irq >= 0);
1625 	ASSERT(which_irq <= 255);
1626 	ASSERT(!interrupts_enabled());
1627 
1628 	/*
1629 	 * On the off-chance that there's already a deferred
1630 	 * reprogramming on this irq, check, and if so, just update the
1631 	 * CPU and irq pointer to which the interrupt is targeted, then return.
1632 	 */
1633 	if (!apic_reprogram_info[which_irq].done) {
1634 		apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1635 		apic_reprogram_info[which_irq].irqp = irq_ptr;
1636 		return;
1637 	}
1638 
1639 	apic_reprogram_info[which_irq].irqp = irq_ptr;
1640 	apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1641 	apic_reprogram_info[which_irq].tries = 0;
1642 	/*
1643 	 * This must be the last thing set, since we're not
1644 	 * grabbing any locks, apic_try_deferred_reprogram() will
1645 	 * make its decision about using this entry iff done
1646 	 * is false.
1647 	 */
1648 	apic_reprogram_info[which_irq].done = B_FALSE;
1649 
1650 	/*
1651 	 * If there were previously no deferred reprogrammings, change
1652 	 * setlvlx to call apic_try_deferred_reprogram()
1653 	 */
1654 	if (++apic_reprogram_outstanding == 1) {
1655 
1656 		setlvlx = apic_try_deferred_reprogram;
1657 	}
1658 }
1659 
1660 static void
1661 apic_try_deferred_reprogram(int prev_ipl, int irq)
1662 {
1663 	int reproirq;
1664 	ulong_t iflag;
1665 	struct ioapic_reprogram_data *drep;
1666 
1667 	(*psm_intr_exit_fn())(prev_ipl, irq);
1668 
1669 	if (!lock_try(&apic_defer_reprogram_lock)) {
1670 		return;
1671 	}
1672 
1673 	/*
1674 	 * Acquire the apic_ioapic_lock so that any other operations that
1675 	 * may affect the apic_reprogram_info state are serialized.
1676 	 * It's still possible for the last deferred reprogramming to clear
1677 	 * between the time we entered this function and the time we get to
1678 	 * the for loop below.  In that case, *setlvlx will have been set
1679 	 * back to *_intr_exit and drep will be NULL. (There's no way to
1680 	 * stop that from happening -- we would need to grab a lock before
1681 	 * calling *setlvlx, which is neither realistic nor prudent).
1682 	 */
1683 	iflag = intr_clear();
1684 	lock_set(&apic_ioapic_lock);
1685 
1686 	/*
1687 	 * For each deferred RDT entry, try to reprogram it now.  Note that
1688 	 * there is no lock acquisition to read apic_reprogram_info because
1689 	 * '.done' is set only after the other fields in the structure are set.
1690 	 */
1691 
1692 	drep = NULL;
1693 	for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) {
1694 		if (apic_reprogram_info[reproirq].done == B_FALSE) {
1695 			drep = &apic_reprogram_info[reproirq];
1696 			break;
1697 		}
1698 	}
1699 
1700 	/*
1701 	 * Either we found a deferred action to perform, or
1702 	 * we entered this function spuriously, after *setlvlx
1703 	 * was restored to point to *_intr_exit.  Any other
1704 	 * permutation is invalid.
1705 	 */
1706 	ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn());
1707 
1708 	/*
1709 	 * Though we can't really do anything about errors
1710 	 * at this point, keep track of them for reporting.
1711 	 * Note that it is very possible for apic_setup_io_intr
1712 	 * to re-register this very timeout if the Remote IRR bit
1713 	 * has not yet cleared.
1714 	 */
1715 
1716 #ifdef DEBUG
1717 	if (drep != NULL) {
1718 		if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) {
1719 			apic_deferred_setup_failures++;
1720 		}
1721 	} else {
1722 		apic_deferred_spurious_enters++;
1723 	}
1724 #else
1725 	if (drep != NULL)
1726 		(void) apic_setup_io_intr(drep, reproirq, B_TRUE);
1727 #endif
1728 
1729 	lock_clear(&apic_ioapic_lock);
1730 	intr_restore(iflag);
1731 
1732 	lock_clear(&apic_defer_reprogram_lock);
1733 }
1734 
1735 static void
1736 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no)
1737 {
1738 	int waited;
1739 
1740 	/*
1741 	 * Wait for the delivery pending bit to clear.
1742 	 */
1743 	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1744 	    (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) {
1745 
1746 		/*
1747 		 * If we're still waiting on the delivery of this interrupt,
1748 		 * continue to wait here until it is delivered (this should be
1749 		 * a very small amount of time, but include a timeout just in
1750 		 * case).
1751 		 */
1752 		for (waited = 0; waited < apic_max_reps_clear_pending;
1753 		    waited++) {
1754 			if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1755 			    intin_no) & AV_PENDING) == 0) {
1756 				break;
1757 			}
1758 		}
1759 	}
1760 }
1761 
1762 
1763 /*
1764  * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
1765  * bit set.  Calls functions that modify the function that setlvlx points to,
1766  * so that the reprogramming can be retried very shortly.
1767  *
1768  * This function will mask the RDT entry if the interrupt is level-triggered.
1769  * (The caller is responsible for unmasking the RDT entry.)
1770  *
1771  * Returns non-zero if the caller should defer IOAPIC reprogramming.
1772  */
1773 static int
1774 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
1775     int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
1776     struct ioapic_reprogram_data *drep)
1777 {
1778 	int32_t			rdt_entry;
1779 	int			waited;
1780 	int			reps = 0;
1781 
1782 	/*
1783 	 * Wait for the delivery pending bit to clear.
1784 	 */
1785 	do {
1786 		++reps;
1787 
1788 		apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);
1789 
1790 		/*
1791 		 * Mask the RDT entry, but only if it's a level-triggered
1792 		 * interrupt
1793 		 */
1794 		rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1795 		    intin_no);
1796 		if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {
1797 
1798 			/* Mask it */
1799 			WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
1800 			    AV_MASK | rdt_entry);
1801 		}
1802 
1803 		if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
1804 			/*
1805 			 * If there was a race and an interrupt was injected
1806 			 * just before we masked, check for that case here.
1807 			 * Then, unmask the RDT entry and try again.  If we're
1808 			 * on our last try, don't unmask (because we want the
1809 			 * RDT entry to remain masked for the rest of the
1810 			 * function).
1811 			 */
1812 			rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1813 			    intin_no);
1814 			if ((rdt_entry & AV_PENDING) &&
1815 			    (reps < apic_max_reps_clear_pending)) {
1816 				/* Unmask it */
1817 				WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1818 				    intin_no, rdt_entry & ~AV_MASK);
1819 			}
1820 		}
1821 
1822 	} while ((rdt_entry & AV_PENDING) &&
1823 	    (reps < apic_max_reps_clear_pending));
1824 
#ifdef DEBUG
	if (rdt_entry & AV_PENDING)
		apic_intr_deliver_timeouts++;
#endif
1829 
1830 	/*
1831 	 * If the remote IRR bit is set, then the interrupt has been sent
1832 	 * to a CPU for processing.  We have no choice but to wait for
1833 	 * that CPU to process the interrupt, at which point the remote IRR
1834 	 * bit will be cleared.
1835 	 */
1836 	if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1837 	    (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {
1838 
1839 		/*
1840 		 * If the CPU that this RDT is bound to is NOT the current
1841 		 * CPU, wait until that CPU handles the interrupt and ACKs
1842 		 * it.  If this interrupt is not bound to any CPU (that is,
1843 		 * if it's bound to the logical destination of "anyone"), it
1844 		 * may have been delivered to the current CPU so handle that
1845 		 * case by deferring the reprogramming (below).
1846 		 */
1847 		if ((old_bind_cpu != IRQ_UNBOUND) &&
1848 		    (old_bind_cpu != IRQ_UNINIT) &&
1849 		    (old_bind_cpu != psm_get_cpu_id())) {
1850 			for (waited = 0; waited < apic_max_reps_clear_pending;
1851 			    waited++) {
1852 				if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1853 				    intin_no) & AV_REMOTE_IRR) == 0) {
1854 
1855 					delete_defer_repro_ent(which_irq);
1856 
1857 					/* Remote IRR has cleared! */
1858 					return (0);
1859 				}
1860 			}
1861 		}
1862 
1863 		/*
1864 		 * If we waited and the Remote IRR bit is still not cleared,
1865 		 * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS
1866 		 * times for this interrupt, try the last-ditch workaround:
1867 		 */
1868 		if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {
1869 
1870 			apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);
1871 
1872 			/* Mark this one as reprogrammed: */
1873 			delete_defer_repro_ent(which_irq);
1874 
1875 			return (0);
1876 		} else {
1877 #ifdef DEBUG
1878 			apic_intr_deferrals++;
1879 #endif
1880 
1881 			/*
1882 			 * If waiting for the Remote IRR bit (above) didn't
1883 			 * allow it to clear, defer the reprogramming.
1884 			 * Add a new deferred-programming entry if the
1885 			 * caller passed a NULL one (and update the existing one
1886 			 * in case anything changed).
1887 			 */
1888 			add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
1889 			if (drep)
1890 				drep->tries++;
1891 
1892 			/* Inform caller to defer IOAPIC programming: */
1893 			return (1);
1894 		}
1895 
1896 	}
1897 
1898 	/* Remote IRR is clear */
1899 	delete_defer_repro_ent(which_irq);
1900 
1901 	return (0);
1902 }
1903 
1904 /*
1905  * Called to migrate all interrupts at an irq to another cpu.
1906  * Must be called with interrupts disabled and apic_ioapic_lock held
1907  */
1908 int
1909 apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
1910 {
1911 	apic_irq_t	*irqptr = irq_ptr;
1912 	int		retval = 0;
1913 
1914 	while (irqptr) {
1915 		if (irqptr->airq_temp_cpu != IRQ_UNINIT)
1916 			retval |= apic_rebind(irqptr, bind_cpu, NULL);
1917 		irqptr = irqptr->airq_next;
1918 	}
1919 
1920 	return (retval);
1921 }
1922 
1923 /*
1924  * apic_intr_redistribute does all the messy computations for identifying
1925  * which interrupt to move to which CPU. Currently we do just one interrupt
1926  * at a time. This reduces the time we spent doing all this within clock
1927  * interrupt. When it is done in idle, we could do more than 1.
1928  * First we find the most busy and the most free CPU (time in ISR only)
1929  * skipping those CPUs that has been identified as being ineligible (cpu_skip)
1930  * Then we look for IRQs which are closest to the difference between the
1931  * most busy CPU and the average ISR load. We try to find one whose load
1932  * is less than difference.If none exists, then we chose one larger than the
1933  * difference, provided it does not make the most idle CPU worse than the
1934  * most busy one. In the end, we clear all the busy fields for CPUs. For
1935  * IRQs, they are cleared as they are scanned.
1936  */
1937 void
1938 apic_intr_redistribute(void)
1939 {
1940 	int busiest_cpu, most_free_cpu;
1941 	int cpu_free, cpu_busy, max_busy, min_busy;
1942 	int min_free, diff;
1943 	int average_busy, cpus_online;
1944 	int i, busy;
1945 	ulong_t iflag;
1946 	apic_cpus_info_t *cpu_infop;
1947 	apic_irq_t *min_busy_irq = NULL;
1948 	apic_irq_t *max_busy_irq = NULL;
1949 
1950 	busiest_cpu = most_free_cpu = -1;
1951 	cpu_free = cpu_busy = max_busy = average_busy = 0;
1952 	min_free = apic_sample_factor_redistribution;
1953 	cpus_online = 0;
1954 	/*
1955 	 * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
1956 	 * without ioapic_lock. That is OK as we are just doing statistical
1957 	 * sampling anyway and any inaccuracy now will get corrected next time
1958 	 * The call to rebind which actually changes things will make sure
1959 	 * we are consistent.
1960 	 */
1961 	for (i = 0; i < apic_nproc; i++) {
1962 		if (apic_cpu_in_range(i) &&
1963 		    !(apic_redist_cpu_skip & (1 << i)) &&
1964 		    (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {
1965 
1966 			cpu_infop = &apic_cpus[i];
1967 			/*
1968 			 * If no unbound interrupts or only 1 total on this
1969 			 * CPU, skip
1970 			 */
1971 			if (!cpu_infop->aci_temp_bound ||
1972 			    (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
1973 			    == 1) {
1974 				apic_redist_cpu_skip |= 1 << i;
1975 				continue;
1976 			}
1977 
1978 			busy = cpu_infop->aci_busy;
1979 			average_busy += busy;
1980 			cpus_online++;
1981 			if (max_busy < busy) {
1982 				max_busy = busy;
1983 				busiest_cpu = i;
1984 			}
1985 			if (min_free > busy) {
1986 				min_free = busy;
1987 				most_free_cpu = i;
1988 			}
1989 			if (busy > apic_int_busy_mark) {
1990 				cpu_busy |= 1 << i;
1991 			} else {
1992 				if (busy < apic_int_free_mark)
1993 					cpu_free |= 1 << i;
1994 			}
1995 		}
1996 	}
1997 	if ((cpu_busy && cpu_free) ||
1998 	    (max_busy >= (min_free + apic_diff_for_redistribution))) {
1999 
2000 		apic_num_imbalance++;
2001 #ifdef	DEBUG
2002 		if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2003 			prom_printf(
2004 			    "redistribute busy=%x free=%x max=%x min=%x",
2005 			    cpu_busy, cpu_free, max_busy, min_free);
2006 		}
2007 #endif /* DEBUG */
2008 
2009 
2010 		average_busy /= cpus_online;
2011 
2012 		diff = max_busy - average_busy;
2013 		min_busy = max_busy; /* start with the max possible value */
2014 		max_busy = 0;
2015 		min_busy_irq = max_busy_irq = NULL;
2016 		i = apic_min_device_irq;
2017 		for (; i <= apic_max_device_irq; i++) {
2018 			apic_irq_t *irq_ptr;
2019 			/* Change to linked list per CPU ? */
2020 			if ((irq_ptr = apic_irq_table[i]) == NULL)
2021 				continue;
			/*
			 * Check airq_busy and decide which IRQ to move;
			 * also zero the busy counts for the next round.
			 */
2024 			if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
2025 			    irq_ptr->airq_busy) {
2026 				if (irq_ptr->airq_busy < diff) {
2027 					/*
2028 					 * Check for least busy CPU,
2029 					 * best fit or what ?
2030 					 */
2031 					if (max_busy < irq_ptr->airq_busy) {
2032 						/*
2033 						 * Most busy within the
2034 						 * required differential
2035 						 */
2036 						max_busy = irq_ptr->airq_busy;
2037 						max_busy_irq = irq_ptr;
2038 					}
2039 				} else {
2040 					if (min_busy > irq_ptr->airq_busy) {
2041 						/*
2042 						 * least busy, but more than
2043 						 * the reqd diff
2044 						 */
2045 						if (min_busy <
2046 						    (diff + average_busy -
2047 						    min_free)) {
2048 							/*
2049 							 * Making sure new cpu
2050 							 * will not end up
2051 							 * worse
2052 							 */
2053 							min_busy =
2054 							    irq_ptr->airq_busy;
2055 
2056 							min_busy_irq = irq_ptr;
2057 						}
2058 					}
2059 				}
2060 			}
2061 			irq_ptr->airq_busy = 0;
2062 		}
2063 
2064 		if (max_busy_irq != NULL) {
2065 #ifdef	DEBUG
2066 			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2067 				prom_printf("rebinding %x to %x",
2068 				    max_busy_irq->airq_vector, most_free_cpu);
2069 			}
2070 #endif /* DEBUG */
2071 			iflag = intr_clear();
2072 			if (lock_try(&apic_ioapic_lock)) {
2073 				if (apic_rebind_all(max_busy_irq,
2074 				    most_free_cpu) == 0) {
					/* Make the change permanent */
2076 					max_busy_irq->airq_cpu =
2077 					    (uint32_t)most_free_cpu;
2078 				}
2079 				lock_clear(&apic_ioapic_lock);
2080 			}
2081 			intr_restore(iflag);
2082 
2083 		} else if (min_busy_irq != NULL) {
2084 #ifdef	DEBUG
2085 			if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2086 				prom_printf("rebinding %x to %x",
2087 				    min_busy_irq->airq_vector, most_free_cpu);
2088 			}
2089 #endif /* DEBUG */
2090 
2091 			iflag = intr_clear();
2092 			if (lock_try(&apic_ioapic_lock)) {
2093 				if (apic_rebind_all(min_busy_irq,
2094 				    most_free_cpu) == 0) {
					/* Make the change permanent */
2096 					min_busy_irq->airq_cpu =
2097 					    (uint32_t)most_free_cpu;
2098 				}
2099 				lock_clear(&apic_ioapic_lock);
2100 			}
2101 			intr_restore(iflag);
2102 
2103 		} else {
2104 			if (cpu_busy != (1 << busiest_cpu)) {
2105 				apic_redist_cpu_skip |= 1 << busiest_cpu;
2106 				/*
2107 				 * We leave cpu_skip set so that next time we
2108 				 * can choose another cpu
2109 				 */
2110 			}
2111 		}
2112 		apic_num_rebind++;
2113 	} else {
2114 		/*
2115 		 * found nothing. Could be that we skipped over valid CPUs
2116 		 * or we have balanced everything. If we had a variable
2117 		 * ticks_for_redistribution, it could be increased here.
2118 		 * apic_int_busy, int_free etc would also need to be
2119 		 * changed.
2120 		 */
2121 		if (apic_redist_cpu_skip)
2122 			apic_redist_cpu_skip = 0;
2123 	}
2124 	for (i = 0; i < apic_nproc; i++) {
2125 		if (apic_cpu_in_range(i)) {
2126 			apic_cpus[i].aci_busy = 0;
2127 		}
2128 	}
2129 }
2130 
2131 void
2132 apic_cleanup_busy(void)
2133 {
2134 	int i;
2135 	apic_irq_t *irq_ptr;
2136 
2137 	for (i = 0; i < apic_nproc; i++) {
2138 		if (apic_cpu_in_range(i)) {
2139 			apic_cpus[i].aci_busy = 0;
2140 		}
2141 	}
2142 
2143 	for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
2144 		if ((irq_ptr = apic_irq_table[i]) != NULL)
2145 			irq_ptr->airq_busy = 0;
2146 	}
2147 }
2148 
2149 int
apic_ioapic_method_probe(void)
2151 {
2152 	return (PSM_SUCCESS);
2153 }
2154