xref: /illumos-gate/usr/src/uts/sun4/os/intr.c (revision 3aedfe0b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/sysmacros.h>
29 #include <sys/stack.h>
30 #include <sys/cpuvar.h>
31 #include <sys/ivintr.h>
32 #include <sys/intreg.h>
33 #include <sys/membar.h>
34 #include <sys/kmem.h>
35 #include <sys/intr.h>
36 #include <sys/sunndi.h>
37 #include <sys/cmn_err.h>
38 #include <sys/privregs.h>
39 #include <sys/systm.h>
40 #include <sys/archsystm.h>
41 #include <sys/machsystm.h>
42 #include <sys/x_call.h>
43 #include <vm/seg_kp.h>
44 #include <sys/debug.h>
45 #include <sys/cyclic.h>
46 #include <sys/kdi_impl.h>
47 
48 #include <sys/cpu_sgnblk_defs.h>
49 
50 /* Global locks which protect the interrupt distribution lists */
51 static kmutex_t intr_dist_lock;
52 static kmutex_t intr_dist_cpu_lock;
53 
54 /* Head of the interrupt distribution lists */
55 static struct intr_dist *intr_dist_head = NULL;
56 static struct intr_dist *intr_dist_whead = NULL;
57 
58 uint64_t siron_inum;
59 uint64_t *siron_cpu_inum = NULL;
60 uint64_t siron_poke_cpu_inum;
61 static int siron_cpu_setup(cpu_setup_t, int, void *);
62 extern uint_t softlevel1();
63 
64 uint64_t poke_cpu_inum;
65 uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
66 uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);
67 
68 /*
69  * Note:
70  * siron_pending was originally created to prevent a resource over-consumption
71  * bug in setsoftint() (exhaustion of the interrupt pool free list).
72  * Its original purpose is obsolete with the use of iv_pending in
73  * setsoftint(). However, siron_pending stayed around, acting as a second
74  * gatekeeper preventing soft interrupts from being queued. In this capacity,
75  * it can lead to hangs on MP systems, where due to global visibility issues
76  * it can end up set while iv_pending is reset, preventing soft interrupts from
77  * ever being processed. In addition to its gatekeeper role, intr_init() also
78  * uses it to flag the situation where siron() was called before siron_inum has
79  * been defined.
80  *
81  * siron() does not need an extra gatekeeper; any cpu that wishes should be
82  * allowed to queue a soft interrupt. It is softint()'s job to ensure
83  * correct handling of the queues. Therefore, siron_pending has been
84  * stripped of its gatekeeper task, retaining only its intr_init() job, where
85  * it indicates that there is a pending need to call siron().
86  */
87 int siron_pending;
88 
89 int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
90 int intr_dist_debug = 0;
91 int32_t intr_dist_weight_max = 1;
92 int32_t intr_dist_weight_maxmax = 1000;
93 int intr_dist_weight_maxfactor = 2;
94 #define	INTR_DEBUG(args) if (intr_dist_debug) cmn_err args
95 
96 /*
97  * intr_init() - Interrupt initialization
98  *	Initialize the system's interrupt vector table.
99  */
100 void
101 intr_init(cpu_t *cp)
102 {
103 	extern uint_t softlevel1();
104 
105 	init_ivintr();
106 	REGISTER_BBUS_INTR();
107 
108 	/*
109 	 * For now we just allocate memory for the per-cpu siron inums. The rest
110 	 * of the work is done when each CPU is configured.
111 	 */
112 	siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
113 	siron_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
114 	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
115 	siron_poke_cpu_inum = add_softintr(PIL_13,
116 	    siron_poke_cpu_intr, 0, SOFTINT_MT);
117 	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
118 
119 	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
120 	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
121 
122 	/*
123 	 * A soft interrupt may have been requested prior to the initialization
124 	 * of soft interrupts.  Soft interrupts can't be dispatched until after
125 	 * intr_init(), so we have to wait until now before we can dispatch the
126 	 * pending soft interrupt (if any).
127 	 */
128 	if (siron_pending) {
129 		siron_pending = 0;
130 		siron();
131 	}
132 }
133 
134 /*
135  * poke_cpu_intr - handler for the soft interrupt posted by poke_cpu()
136  */
137 /* ARGSUSED */
138 uint_t
139 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
140 {
141 	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
142 	membar_stld_stst();
143 	return (1);
144 }
145 
146 /*
147  * kmdb uses siron (and thus setsoftint) while the world is stopped in order to
148  * inform its driver component that there's work to be done.  We need to keep
149  * DTrace from instrumenting kmdb's siron and setsoftint.  We duplicate siron,
150  * giving kmdb's version a kdi_ prefix to keep DTrace at bay.  The
151  * implementation of setsoftint is complicated enough that we don't want to
152  * duplicate it, but at the same time we don't want to preclude tracing either.
153  * The meat of setsoftint() therefore goes into kdi_setsoftint, with
154  * setsoftint() implemented as a wrapper.  This allows tracing, while still
155  * providing a way for kmdb to sneak in unmolested.
156  */
157 void
158 kdi_siron(void)
159 {
160 	if (siron_inum != 0)
161 		kdi_setsoftint(siron_inum);
162 	else
163 		siron_pending = 1;
164 }
165 
166 void
167 setsoftint(uint64_t inum)
168 {
169 	kdi_setsoftint(inum);
170 }
171 
172 /*
173  * Generate a softlevel1 interrupt on the current CPU if one
174  * is not already pending.
175  */
176 void
177 siron(void)
178 {
179 	uint64_t inum;
180 
181 	if (siron_inum != 0) {
182 		if (siron_cpu_inum[CPU->cpu_id] != 0)
183 			inum = siron_cpu_inum[CPU->cpu_id];
184 		else
185 			inum = siron_inum;
186 
187 		setsoftint(inum);
188 	} else
189 		siron_pending = 1;
190 }
191 
192 /*
193  * This routine creates the per-CPU siron inums for CPUs which are
194  * configured during boot.
195  */
196 void
197 siron_mp_init()
198 {
199 	cpu_t *c;
200 
201 	mutex_enter(&cpu_lock);
202 	c = cpu_list;
203 	do {
204 		(void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
205 	} while ((c = c->cpu_next) != cpu_list);
206 
207 	register_cpu_setup_func(siron_cpu_setup, NULL);
208 	mutex_exit(&cpu_lock);
209 }
210 
211 /*
212  * siron_poke_cpu_intr - cross-call handler.
213  */
214 /* ARGSUSED */
215 uint_t
216 siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
217 {
218 	/* generate level1 softint */
219 	siron();
220 	return (1);
221 }
222 
223 /*
224  * This routine generates a cross-call on target CPU(s).
225  */
226 void
227 siron_poke_cpu(cpuset_t poke)
228 {
229 	int cpuid = CPU->cpu_id;
230 
231 	if (CPU_IN_SET(poke, cpuid)) {
232 		siron();
233 		CPUSET_DEL(poke, cpuid);
234 		if (CPUSET_ISNULL(poke))
235 			return;
236 	}
237 
238 	xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
239 }
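
/*
 * Illustrative sketch (not part of the original code): a caller that wants a
 * softlevel1 interrupt generated on a particular set of CPUs builds a
 * cpuset_t and hands it to siron_poke_cpu().  The function name and
 * target_cpu argument below are hypothetical; siron_poke_cpu() and the
 * CPUSET_* macros are the interfaces actually used in this file.
 *
 *	void
 *	example_poke_one_cpu(processorid_t target_cpu)
 *	{
 *		cpuset_t set;
 *
 *		CPUSET_ZERO(set);
 *		CPUSET_ADD(set, target_cpu);
 *		siron_poke_cpu(set);	(cross-calls any remote CPUs in set)
 *	}
 */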
240 
241 /*
242  * This callback function allows us to create and remove per-CPU siron inums.
243  */
244 /* ARGSUSED */
245 static int
246 siron_cpu_setup(cpu_setup_t what, int id, void *arg)
247 {
248 	cpu_t *cp = cpu[id];
249 
250 	ASSERT(MUTEX_HELD(&cpu_lock));
251 	ASSERT(cp != NULL);
252 
253 	switch (what) {
254 	case CPU_CONFIG:
255 		siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
256 		    (softintrfunc)softlevel1, 0, SOFTINT_ST);
257 		break;
258 	case CPU_UNCONFIG:
259 		(void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
260 		siron_cpu_inum[cp->cpu_id] = 0;
261 		break;
262 	default:
263 		break;
264 	}
265 
266 	return (0);
267 }
268 
269 /*
270  * no_ivintr()
271  * 	called by setvecint_tl1() through sys_trap()
272  *	vector interrupt received but not valid or not
273  *	registered in intr_vec_table
274  *	considered as a spurious mondo interrupt
275  */
276 /* ARGSUSED */
277 void
278 no_ivintr(struct regs *rp, int inum, int pil)
279 {
280 	cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
281 	    inum, pil);
282 
283 #ifdef DEBUG_VEC_INTR
284 	prom_enter_mon();
285 #endif /* DEBUG_VEC_INTR */
286 }
287 
288 void
289 intr_dequeue_req(uint_t pil, uint64_t inum)
290 {
291 	intr_vec_t	*iv, *next, *prev;
292 	struct machcpu	*mcpu;
293 	uint32_t	clr;
294 	processorid_t	cpu_id;
295 	extern uint_t	getpstate(void);
296 
297 	ASSERT((getpstate() & PSTATE_IE) == 0);
298 
299 	mcpu = &CPU->cpu_m;
300 	cpu_id = CPU->cpu_id;
301 
302 	iv = (intr_vec_t *)inum;
303 	prev = NULL;
304 	next = mcpu->intr_head[pil];
305 
306 	/* Find a matching entry in the list */
307 	while (next != NULL) {
308 		if (next == iv)
309 			break;
310 		prev = next;
311 		next = IV_GET_PIL_NEXT(next, cpu_id);
312 	}
313 
314 	if (next != NULL) {
315 		intr_vec_t	*next_iv = IV_GET_PIL_NEXT(next, cpu_id);
316 
317 		/* Remove entry from list */
318 		if (prev != NULL)
319 			IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
320 		else
321 			mcpu->intr_head[pil] = next_iv; /* head */
322 
323 		if (next_iv == NULL)
324 			mcpu->intr_tail[pil] = prev; /* tail */
325 	}
326 
327 	/* Clear pending interrupts at this level if the list is empty */
328 	if (mcpu->intr_head[pil] == NULL) {
329 		clr = 1 << pil;
330 		if (pil == PIL_14)
331 			clr |= (TICK_INT_MASK | STICK_INT_MASK);
332 		wr_clr_softint(clr);
333 	}
334 }
335 
336 
337 /*
338  * Send a directed interrupt of specified interrupt number id to a cpu.
339  */
340 void
341 send_dirint(
342 	int cpuix,		/* cpu to be interrupted */
343 	int intr_id)		/* interrupt number id */
344 {
345 	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
346 }
347 
348 /*
349  * Take the specified CPU out of participation in interrupts.
350  *	Called by p_online(2) when a processor is being taken off-line.
351  *	This allows interrupt threads being handled on the processor to
352  *	complete before the processor is idled.
353  */
354 int
355 cpu_disable_intr(struct cpu *cp)
356 {
357 	ASSERT(MUTEX_HELD(&cpu_lock));
358 
359 	/*
360 	 * Turn off the CPU_ENABLE flag before calling the redistribution
361 	 * function, since it checks for this in the cpu flags.
362 	 */
363 	cp->cpu_flags &= ~CPU_ENABLE;
364 
365 	intr_redist_all_cpus();
366 
367 	return (0);
368 }
369 
370 /*
371  * Allow the specified CPU to participate in interrupts.
372  *	Called by p_online(2) if a processor could not be taken off-line
373  *	because of bound threads, in order to resume processing interrupts.
374  *	Also called after starting a processor.
375  */
376 void
377 cpu_enable_intr(struct cpu *cp)
378 {
379 	ASSERT(MUTEX_HELD(&cpu_lock));
380 
381 	cp->cpu_flags |= CPU_ENABLE;
382 
383 	intr_redist_all_cpus();
384 }
385 
386 /*
387  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
388  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
389  * are issued to redirect interrupts of a specified weight, from heavy to
390  * light.  This allows all the interrupts of a given weight to be redistributed
391  * for all weighted nexus drivers prior to those of less weight.
392  */
393 static void
394 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
395 {
396 	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
397 	struct intr_dist *iptr;
398 	struct intr_dist **pptr;
399 
400 	ASSERT(func);
401 	new->func = func;
402 	new->arg = arg;
403 	new->next = NULL;
404 
405 	/* Add to tail so that redistribution occurs in original order. */
406 	mutex_enter(&intr_dist_lock);
407 	for (iptr = *phead, pptr = phead; iptr != NULL;
408 	    pptr = &iptr->next, iptr = iptr->next) {
409 		/* check for problems as we locate the tail */
410 		if ((iptr->func == func) && (iptr->arg == arg)) {
411 			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
412 			/*NOTREACHED*/
413 		}
414 	}
415 	*pptr = new;
416 
417 	mutex_exit(&intr_dist_lock);
418 }
419 
420 void
421 intr_dist_add(void (*func)(void *), void *arg)
422 {
423 	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
424 }
425 
426 void
427 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
428 {
429 	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
430 }
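
/*
 * Illustrative sketch (an assumption, not part of the original file): a nexus
 * driver typically registers its redistribution callback at attach time and
 * removes it at detach time.  The xx_redist_weighted() callback and xx_state
 * argument are hypothetical names; the intr_dist_add*()/intr_dist_rem*()
 * calls are the interfaces defined in this file.
 *
 *	static void xx_redist_weighted(void *arg, int32_t weight_max,
 *	    int32_t weight);
 *
 *	attach:		intr_dist_add_weighted(xx_redist_weighted, xx_state);
 *	detach:		intr_dist_rem_weighted(xx_redist_weighted, xx_state);
 *
 * A nexus that does not track per-device weights would use intr_dist_add()
 * and intr_dist_rem() with a void (*)(void *) callback instead.
 */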
431 
432 /*
433  * Search the specified interrupt distribution list for the entry with the
434  * matching callback function and argument. If a match is found,
435  * then delete the entry from the list. The caller is responsible for
436  * modifying the mondo vector registers.
437  */
438 static void
439 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
440 {
441 	struct intr_dist *iptr;
442 	struct intr_dist **vect;
443 
444 	mutex_enter(&intr_dist_lock);
445 	for (iptr = *headp, vect = headp;
446 	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
447 		if ((iptr->func == func) && (iptr->arg == arg)) {
448 			*vect = iptr->next;
449 			kmem_free(iptr, sizeof (struct intr_dist));
450 			mutex_exit(&intr_dist_lock);
451 			return;
452 		}
453 	}
454 
455 	if (!panicstr)
456 		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
457 	mutex_exit(&intr_dist_lock);
458 }
459 
460 void
461 intr_dist_rem(void (*func)(void *), void *arg)
462 {
463 	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
464 }
465 
466 void
467 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
468 {
469 	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
470 }
471 
472 /*
473  * Initiate interrupt redistribution.  Redistribution improves the isolation
474  * associated with interrupt weights by ordering operations from heavy weight
475  * to light weight.  When a CPU's orientation changes relative to interrupts,
476  * there is *always* a redistribution to accommodate this change (call to
477  * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
478  * that a redistribution could improve the quality of an initialization. For
479  * example, if you are not using a NIC it may not be attached with s10 (devfs).
480  * If you then configure the NIC (ifconfig), this may cause the NIC to attach
481  * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
482  * occurring late, so optimal "isolation" relative to weight is not occurring.
483  * The same applies to detach, although in this case doing the redistribution
484  * might improve "spread" for medium weight devices since the "isolation" of
485  * a higher weight device may no longer be present.
486  *
487  * NB: We should provide a utility to trigger redistribution (a la "intradm -r").
488  *
489  * NB: There is risk associated with automatically triggering execution of the
490  * redistribution code at arbitrary times. The risk comes from the fact that
491  * there is a lot of low-level hardware interaction associated with a
492  * redistribution.  At some point we may want this code to perform automatic
493  * redistribution (redistribution thread; trigger timeout when add/remove
494  * weight delta is large enough, and call cv_signal from timeout - causing
495  * thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
496  * risky at this time.
497  */
498 void
499 i_ddi_intr_redist_all_cpus()
500 {
501 	mutex_enter(&cpu_lock);
502 	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
503 	intr_redist_all_cpus();
504 	mutex_exit(&cpu_lock);
505 }
506 
507 /*
508  * Redistribute all interrupts
509  *
510  * This function redistributes all interrupting devices, running the
511  * parent callback functions for each node.
512  */
513 void
514 intr_redist_all_cpus(void)
515 {
516 	struct cpu *cp;
517 	struct intr_dist *iptr;
518 	int32_t weight, max_weight;
519 
520 	ASSERT(MUTEX_HELD(&cpu_lock));
521 	mutex_enter(&intr_dist_lock);
522 
523 	/*
524 	 * zero cpu_intr_weight on all cpus - it is safe to traverse
525 	 * cpu_list since we hold cpu_lock.
526 	 */
527 	cp = cpu_list;
528 	do {
529 		cp->cpu_intr_weight = 0;
530 	} while ((cp = cp->cpu_next) != cpu_list);
531 
532 	/*
533 	 * Assume that this redistribution may encounter a device weight
534 	 * via driver.conf tuning of "ddi-intr-weight" that is at most
535 	 * intr_dist_weight_maxfactor times larger.
536 	 */
537 	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
538 	if (max_weight > intr_dist_weight_maxmax)
539 		max_weight = intr_dist_weight_maxmax;
540 	intr_dist_weight_max = 1;
541 
542 	INTR_DEBUG((CE_CONT, "intr_dist: "
543 	    "intr_redist_all_cpus: %d-0\n", max_weight));
544 
545 	/*
546 	 * Redistribute weighted, from heavy to light.  The callback that
547 	 * specifies a weight equal to weight_max should redirect all
548 	 * interrupts of weight weight_max or greater [weight_max, inf.).
549 	 * Interrupts of lesser weight should be processed on the call with
550 	 * the matching weight. This allows all the heavier weight interrupts
551 	 * on all weighted busses (multiple pci busses) to be redirected prior
552 	 * to any lesser weight interrupts.
553 	 */
554 	for (weight = max_weight; weight >= 0; weight--)
555 		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
556 			((void (*)(void *, int32_t, int32_t))iptr->func)
557 			    (iptr->arg, max_weight, weight);
558 
559 	/* redistribute normal (non-weighted) interrupts */
560 	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
561 		((void (*)(void *))iptr->func)(iptr->arg);
562 	mutex_exit(&intr_dist_lock);
563 }
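
/*
 * Illustrative sketch (an assumption, not taken from the original file) of
 * the weighted callback contract described above: on the first call, where
 * weight == weight_max, the nexus retargets every interrupt of weight
 * weight_max or greater; on subsequent calls it retargets only interrupts
 * whose weight matches exactly.  The xx_softc/xx_ino structures and the
 * xx_retarget() helper are hypothetical.
 *
 *	static void
 *	xx_redist_weighted(void *arg, int32_t weight_max, int32_t weight)
 *	{
 *		struct xx_softc *softc = arg;
 *		struct xx_ino *ino;
 *
 *		for (ino = softc->ino_list; ino != NULL; ino = ino->next) {
 *			if ((weight == weight_max && ino->weight >= weight) ||
 *			    (weight < weight_max && ino->weight == weight))
 *				xx_retarget(softc, ino, intr_dist_cpuid());
 *		}
 *	}
 */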
564 
565 void
566 intr_redist_all_cpus_shutdown(void)
567 {
568 	intr_policy = INTR_CURRENT_CPU;
569 	intr_redist_all_cpus();
570 }
571 
572 /*
573  * Determine what CPU to target, based on interrupt policy.
574  *
575  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
576  *	advance through interrupt enabled cpus (round-robin).
577  *
578  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
579  *	cpu_intr_weight, round robin when all equal.
580  *
581  *	Weighted interrupt distribution provides two things: "spread" of weight
582  *	(associated with algorithm itself) and "isolation" (associated with a
583  *	particular device weight). A redistribution is what provides optimal
584  *	"isolation" of heavy weight interrupts, optimal "spread" of weight
585  *	(relative to what came before) is always occurring.
586  *
587  *	An interrupt weight is a subjective number that represents the
588  *	percentage of a CPU required to service a device's interrupts: the
589  *	default weight is 0% (however the algorithm still maintains
590  *	round-robin), a network interface controller (NIC) may have a large
591  *	weight (35%). Interrupt weight only has meaning relative to the
592  *	interrupt weight of other devices: a CPU can be weighted more than
593  *	100%, and a single device might consume more than 100% of a CPU.
594  *
595  *	A coarse interrupt weight can be defined by the parent nexus driver
596  *	based on bus specific information, like pci class codes. A nexus
597  *	driver that supports device interrupt weighting for its children
598  *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
599  *	and removes the weight of a device from the CPU that an interrupt
600  *	is directed at.  The quality of initialization improves when the
601  *	device interrupt weights more accuracy reflect actual run-time weights,
602  *	and as the assignments are ordered from is heavy to light.
603  *
604  *	The implementation also supports interrupt weight being specified in
605  *	driver.conf files via the property "ddi-intr-weight", which takes
606  *	precedence over the nexus supplied weight.  This support is added to
607  *	permit possible tweaking in the product in response to customer
608  *	problems. This is not a formal or committed interface.
609  *
610  *	While a weighted approach chooses the CPU providing the best spread
611  *	given past weights, less than optimal isolation can result in cases
612  *	where heavy weight devices show up last. The nexus driver's interrupt
613  *	redistribution logic should use intr_dist_add/rem_weighted so that
614  *	interrupts can be redistributed heavy first for optimal isolation.
615  */
616 uint32_t
617 intr_dist_cpuid(void)
618 {
619 	static struct cpu	*curr_cpu;
620 	struct cpu		*start_cpu;
621 	struct cpu		*new_cpu;
622 	struct cpu		*cp;
623 	int			cpuid = -1;
624 
625 	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
626 	mutex_enter(&intr_dist_cpu_lock);
627 
628 	switch (intr_policy) {
629 	case INTR_CURRENT_CPU:
630 		cpuid = CPU->cpu_id;
631 		break;
632 
633 	case INTR_BOOT_CPU:
634 		panic("INTR_BOOT_CPU no longer supported.");
635 		/*NOTREACHED*/
636 
637 	case INTR_FLAT_DIST:
638 	case INTR_WEIGHTED_DIST:
639 	default:
640 		/*
641 		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
642 		 * the cpu has been deleted (cpu structs are never freed).
643 		 */
644 		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
645 			curr_cpu = CPU;
646 
647 		/*
648 		 * Advance to online CPU after curr_cpu (round-robin). For
649 		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
650 		 * weight.  For a nexus that does not support weight the
651 		 * default weight of zero is used. We degrade to round-robin
652 		 * behavior among equal weights; with the default weight of zero,
653 		 * round-robin behavior continues.
654 		 *
655 		 * Disable preemption while traversing cpu_next_onln to
656 		 * ensure the list does not change.  This works because
657 		 * modifiers of this list and other lists in a struct cpu
658 		 * call pause_cpus() before making changes.
659 		 */
660 		kpreempt_disable();
661 		cp = start_cpu = curr_cpu->cpu_next_onln;
662 		new_cpu = NULL;
663 		do {
664 			/* Skip CPUs with interrupts disabled */
665 			if ((cp->cpu_flags & CPU_ENABLE) == 0)
666 				continue;
667 
668 			if (intr_policy == INTR_FLAT_DIST) {
669 				/* select CPU */
670 				new_cpu = cp;
671 				break;
672 			} else if ((new_cpu == NULL) ||
673 			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
674 				/* Choose if lighter weight */
675 				new_cpu = cp;
676 			}
677 		} while ((cp = cp->cpu_next_onln) != start_cpu);
678 		ASSERT(new_cpu);
679 		cpuid = new_cpu->cpu_id;
680 
681 		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
682 		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));
683 
684 		/* update static pointer for next round-robin */
685 		curr_cpu = new_cpu;
686 		kpreempt_enable();
687 		break;
688 	}
689 	mutex_exit(&intr_dist_cpu_lock);
690 	return (cpuid);
691 }
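
/*
 * Illustrative sketch (an assumption, not part of the original file): when a
 * weighted nexus targets a child's interrupt it asks intr_dist_cpuid() for a
 * CPU and then accounts for the device's weight on that CPU; at remove_intr
 * time it takes the weight off again.  The rdip variable and the
 * xx_nexus_weight_for() helper are hypothetical.
 *
 *	uint32_t cpuid = intr_dist_cpuid();
 *
 *	(program the mondo/vector registers to target cpuid, then:)
 *	intr_dist_cpuid_add_device_weight(cpuid, rdip,
 *	    xx_nexus_weight_for(rdip));
 *
 *	(at remove_intr time:)
 *	intr_dist_cpuid_rem_device_weight(cpuid, rdip);
 *
 * A "ddi-intr-weight" property in driver.conf, when present, takes precedence
 * over the nexus-supplied weight (see the block comment above).
 */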
692 
693 /*
694  * Add or remove the weight of a device from a CPU's interrupt weight.
695  *
696  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
697  * their children to improve the overall quality of interrupt initialization.
698  *
699  * If a nexus shares the CPU returned by a single intr_dist_cpuid() call
700  * among multiple devices (sharing ino) then the nexus should call
701  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
702  * that share must specify the same cpuid.
703  *
704  * If a nexus driver is unable to determine the cpu at remove_intr time
705  * for some of its interrupts, then it should not call add_device_weight -
706  * intr_dist_cpuid will still provide round-robin.
707  *
708  * An established device weight (from dev_info node) takes precedence over
709  * the weight passed in.  If a device weight is not already established
710  * then the passed in nexus weight is established.
711  */
712 void
713 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
714     dev_info_t *dip, int32_t nweight)
715 {
716 	int32_t		eweight;
717 
718 	/*
719 	 * For non-weighted policy everything has weight of zero (and we get
720 	 * round-robin distribution from intr_dist_cpuid).
721 	 * NB: intr_policy is limited to this file. A weighted nexus driver
722 	 * calls this routine even if intr_policy has been patched to
723 	 * INTR_FLAT_DIST.
724 	 */
725 	ASSERT(dip);
726 	if (intr_policy != INTR_WEIGHTED_DIST)
727 		return;
728 
729 	eweight = i_ddi_get_intr_weight(dip);
730 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
731 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
732 	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
733 	    ddi_get_instance(ddi_get_parent(dip)),
734 	    ddi_driver_name(dip), ddi_get_instance(dip)));
735 
736 	/* if no established weight, establish nexus weight */
737 	if (eweight < 0) {
738 		if (nweight > 0)
739 			(void) i_ddi_set_intr_weight(dip, nweight);
740 		else
741 			nweight = 0;
742 	} else
743 		nweight = eweight;	/* use established weight */
744 
745 	/* Establish exclusion for cpu_intr_weight manipulation */
746 	mutex_enter(&intr_dist_cpu_lock);
747 	cpu[cpuid]->cpu_intr_weight += nweight;
748 
749 	/* update intr_dist_weight_max */
750 	if (nweight > intr_dist_weight_max)
751 		intr_dist_weight_max = nweight;
752 	mutex_exit(&intr_dist_cpu_lock);
753 }
754 
755 void
756 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
757 {
758 	struct cpu	*cp;
759 	int32_t		weight;
760 
761 	ASSERT(dip);
762 	if (intr_policy != INTR_WEIGHTED_DIST)
763 		return;
764 
765 	/* remove weight of device from cpu */
766 	weight = i_ddi_get_intr_weight(dip);
767 	if (weight < 0)
768 		weight = 0;
769 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
770 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
771 	    ddi_driver_name(ddi_get_parent(dip)),
772 	    ddi_get_instance(ddi_get_parent(dip)),
773 	    ddi_driver_name(dip), ddi_get_instance(dip)));
774 
775 	/* Establish exclusion for cpu_intr_weight manipulation */
776 	mutex_enter(&intr_dist_cpu_lock);
777 	cp = cpu[cpuid];
778 	cp->cpu_intr_weight -= weight;
779 	if (cp->cpu_intr_weight < 0)
780 		cp->cpu_intr_weight = 0;	/* sanity */
781 	mutex_exit(&intr_dist_cpu_lock);
782 }
783