xref: /illumos-gate/usr/src/uts/sun4/os/intr.c (revision 100b72f4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/sysmacros.h>
29 #include <sys/stack.h>
30 #include <sys/cpuvar.h>
31 #include <sys/ivintr.h>
32 #include <sys/intreg.h>
33 #include <sys/membar.h>
34 #include <sys/kmem.h>
35 #include <sys/intr.h>
36 #include <sys/sunndi.h>
37 #include <sys/cmn_err.h>
38 #include <sys/privregs.h>
39 #include <sys/systm.h>
40 #include <sys/archsystm.h>
41 #include <sys/machsystm.h>
42 #include <sys/x_call.h>
43 #include <vm/seg_kp.h>
44 #include <sys/debug.h>
45 #include <sys/cyclic.h>
46 
47 #include <sys/cpu_sgnblk_defs.h>
48 
49 kmutex_t soft_iv_lock;	/* protect software interrupt vector table */
50 /* Global locks which protect the interrupt distribution lists */
51 static kmutex_t intr_dist_lock;
52 static kmutex_t intr_dist_cpu_lock;
53 
54 /* Head of the interrupt distribution lists */
55 static struct intr_dist *intr_dist_head = NULL;
56 static struct intr_dist *intr_dist_whead = NULL;
57 
58 uint_t swinum_base;
59 uint_t maxswinum;
60 uint_t siron_inum;
61 uint_t poke_cpu_inum;
62 /*
63  * Note:-
64  * siron_pending was originally created to prevent a resource over consumption
65  * bug in setsoftint(exhaustion of interrupt pool free list).
66  * It's original intention is obsolete with the use of iv_pending in
67  * setsoftint. However, siron_pending stayed around, acting as a second
68  * gatekeeper preventing soft interrupts from being queued. In this capacity,
69  * it can lead to hangs on MP systems, where due to global visibility issues
70  * it can end up set while iv_pending is reset, preventing soft interrupts from
71  * ever being processed. In addition to its gatekeeper role, init_intr also
72  * uses it to flag the situation where siron() was called before siron_inum has
73  * been defined.
74  *
75  * siron() does not need an extra gatekeeper; any cpu that wishes should be
76  * allowed to queue a soft interrupt. It is softint()'s job to ensure
77  * correct handling of the queues. Therefore, siron_pending has been
78  * stripped of its gatekeeper task, retaining only its intr_init job, where
79  * it indicates that there is a pending need to call siron().
80  */
81 int siron_pending;
82 
83 int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
84 int intr_dist_debug = 0;
85 int32_t intr_dist_weight_max = 1;
86 int32_t intr_dist_weight_maxmax = 1000;
87 int intr_dist_weight_maxfactor = 2;
/*
 * INTR_DEBUG((level, fmt, ...)) - emit a cmn_err() message only when
 * intr_dist_debug is non-zero.  The argument is a complete, parenthesized
 * cmn_err() argument list.
 *
 * The body is wrapped in do { } while (0) so that an invocation is a single
 * statement: without the wrapper the macro's bare "if" would capture the
 * "else" of any enclosing if/else written around an invocation.
 */
#define	INTR_DEBUG(args)	do {					\
	if (intr_dist_debug)						\
		cmn_err args;						\
} while (0)
89 
90 static void sw_ivintr_init(cpu_t *);
91 
92 /*
93  * intr_init() - interrupt initialization
94  *	Initialize the system's software interrupt vector table and
95  *	CPU's interrupt free list
96  */
97 void
98 intr_init(cpu_t *cp)
99 {
100 	init_ivintr();
101 	sw_ivintr_init(cp);
102 	init_intr_pool(cp);
103 
104 	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
105 	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
106 
107 	/*
108 	 * A soft interrupt may have been requested prior to the initialization
109 	 * of soft interrupts.  Soft interrupts can't be dispatched until after
110 	 * init_intr_pool, so we have to wait until now before we can dispatch
111 	 * the pending soft interrupt (if any).
112 	 */
113 	if (siron_pending) {
114 		siron_pending = 0;
115 		siron();
116 	}
117 }
118 
119 /*
120  * poke_cpu_intr - fall through when poke_cpu calls
121  */
122 
123 /* ARGSUSED */
124 uint_t
125 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
126 {
127 	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
128 	membar_stld_stst();
129 	return (1);
130 }
131 
132 /*
133  * sw_ivintr_init() - software interrupt vector initialization
134  *	called after CPU is active
135  *	the software interrupt vector table is part of the intr_vector[]
136  */
137 static void
138 sw_ivintr_init(cpu_t *cp)
139 {
140 	extern uint_t softlevel1();
141 
142 	mutex_init(&soft_iv_lock, NULL, MUTEX_DEFAULT, NULL);
143 
144 	swinum_base = SOFTIVNUM;
145 
146 	/*
147 	 * the maximum software interrupt == MAX_SOFT_INO
148 	 */
149 	maxswinum = swinum_base + MAX_SOFT_INO;
150 
151 	REGISTER_BBUS_INTR();
152 
153 	siron_inum = add_softintr(PIL_1, softlevel1, 0);
154 	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0);
155 	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
156 }
157 
158 cpuset_t intr_add_pools_inuse;
159 
160 /*
161  * cleanup_intr_pool()
162  *	Free up the extra intr request pool for this cpu.
163  */
164 void
165 cleanup_intr_pool(cpu_t *cp)
166 {
167 	extern struct intr_req *intr_add_head;
168 	int poolno;
169 	struct intr_req *pool;
170 
171 	poolno = cp->cpu_m.intr_pool_added;
172 	if (poolno >= 0) {
173 		cp->cpu_m.intr_pool_added = -1;
174 		pool = (poolno * INTR_PENDING_MAX * intr_add_pools) +
175 
176 			intr_add_head;	/* not byte arithmetic */
177 		bzero(pool, INTR_PENDING_MAX * intr_add_pools *
178 		    sizeof (struct intr_req));
179 
180 		CPUSET_DEL(intr_add_pools_inuse, poolno);
181 	}
182 }
183 
184 /*
185  * init_intr_pool()
186  *	initialize the intr request pool for the cpu
187  * 	should be called for each cpu
188  */
189 void
190 init_intr_pool(cpu_t *cp)
191 {
192 	extern struct intr_req *intr_add_head;
193 #ifdef	DEBUG
194 	extern struct intr_req *intr_add_tail;
195 #endif	/* DEBUG */
196 	int i, pool;
197 
198 	cp->cpu_m.intr_pool_added = -1;
199 
200 	for (i = 0; i < INTR_PENDING_MAX-1; i++) {
201 		cp->cpu_m.intr_pool[i].intr_next =
202 		    &cp->cpu_m.intr_pool[i+1];
203 	}
204 	cp->cpu_m.intr_pool[INTR_PENDING_MAX-1].intr_next = NULL;
205 
206 	cp->cpu_m.intr_head[0] = &cp->cpu_m.intr_pool[0];
207 	cp->cpu_m.intr_tail[0] = &cp->cpu_m.intr_pool[INTR_PENDING_MAX-1];
208 
209 	if (intr_add_pools != 0) {
210 
211 		/*
212 		 * If additional interrupt pools have been allocated,
213 		 * initialize those too and add them to the free list.
214 		 */
215 
216 		struct intr_req *trace;
217 
218 		for (pool = 0; pool < max_ncpus; pool++) {
219 			if (!(CPU_IN_SET(intr_add_pools_inuse, pool)))
220 			    break;
221 		}
222 		if (pool >= max_ncpus) {
223 			/*
224 			 * XXX - intr pools are alloc'd, just not as
225 			 * much as we would like.
226 			 */
227 			cmn_err(CE_WARN, "Failed to alloc all requested intr "
228 			    "pools for cpu%d", cp->cpu_id);
229 			return;
230 		}
231 		CPUSET_ADD(intr_add_pools_inuse, pool);
232 		cp->cpu_m.intr_pool_added = pool;
233 
234 		trace = (pool * INTR_PENDING_MAX * intr_add_pools) +
235 			intr_add_head;	/* not byte arithmetic */
236 
237 		cp->cpu_m.intr_pool[INTR_PENDING_MAX-1].intr_next = trace;
238 
239 		for (i = 1; i < intr_add_pools * INTR_PENDING_MAX; i++, trace++)
240 			trace->intr_next = trace + 1;
241 		trace->intr_next = NULL;
242 
243 		ASSERT(trace >= intr_add_head && trace <= intr_add_tail);
244 
245 		cp->cpu_m.intr_tail[0] = trace;
246 	}
247 }
248 
249 
250 /*
251  * siron - primitive for sun/os/softint.c
252  */
253 void
254 siron(void)
255 {
256 	if (siron_inum != 0)
257 		setsoftint(siron_inum);
258 	else
259 		siron_pending = 1;
260 }
261 
262 /*
263  * no_ivintr()
264  * 	called by vec_interrupt() through sys_trap()
265  *	vector interrupt received but not valid or not
266  *	registered in intr_vector[]
267  *	considered as a spurious mondo interrupt
268  */
269 /* ARGSUSED */
270 void
271 no_ivintr(struct regs *rp, int inum, int pil)
272 {
273 	cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
274 	    inum, pil);
275 
276 
277 #ifdef DEBUG_VEC_INTR
278 	prom_enter_mon();
279 #endif /* DEBUG_VEC_INTR */
280 }
281 
282 /*
283  * no_intr_pool()
284  * 	called by vec_interrupt() through sys_trap()
285  *	vector interrupt received but no intr_req entries
286  */
287 /* ARGSUSED */
288 void
289 no_intr_pool(struct regs *rp, int inum, int pil)
290 {
291 #ifdef DEBUG_VEC_INTR
292 	cmn_err(CE_WARN, "intr_req pool empty: num 0x%x, pil 0x%x",
293 		inum, pil);
294 	prom_enter_mon();
295 #else
296 	cmn_err(CE_PANIC, "intr_req pool empty: num 0x%x, pil 0x%x",
297 		inum, pil);
298 #endif /* DEBUG_VEC_INTR */
299 }
300 
301 void
302 intr_dequeue_req(uint_t pil, uint32_t inum)
303 {
304 	struct intr_req *ir, *prev;
305 	struct machcpu *mcpu;
306 	uint32_t clr;
307 	extern uint_t getpstate(void);
308 
309 	ASSERT((getpstate() & PSTATE_IE) == 0);
310 
311 	mcpu = &CPU->cpu_m;
312 
313 	/* Find a matching entry in the list */
314 	prev = NULL;
315 	ir = mcpu->intr_head[pil];
316 	while (ir != NULL) {
317 		if (ir->intr_number == inum)
318 			break;
319 		prev = ir;
320 		ir = ir->intr_next;
321 	}
322 	if (ir != NULL) {
323 		/*
324 		 * Remove entry from list
325 		 */
326 		if (prev != NULL)
327 			prev->intr_next = ir->intr_next;	/* non-head */
328 		else
329 			mcpu->intr_head[pil] = ir->intr_next;	/* head */
330 
331 		if (ir->intr_next == NULL)
332 			mcpu->intr_tail[pil] = prev;		/* tail */
333 
334 		/*
335 		 * Place on free list
336 		 */
337 		ir->intr_next = mcpu->intr_head[0];
338 		mcpu->intr_head[0] = ir;
339 	}
340 
341 	/*
342 	 * clear pending interrupts at this level if the list is empty
343 	 */
344 	if (mcpu->intr_head[pil] == NULL) {
345 		clr = 1 << pil;
346 		if (pil == PIL_14)
347 			clr |= (TICK_INT_MASK | STICK_INT_MASK);
348 		wr_clr_softint(clr);
349 	}
350 }
351 
352 
353 /*
354  * Send a directed interrupt of specified interrupt number id to a cpu.
355  */
356 void
357 send_dirint(
358 	int cpuix,		/* cpu to be interrupted */
359 	int intr_id)		/* interrupt number id */
360 {
361 	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
362 }
363 
364 /*
365  * Take the specified CPU out of participation in interrupts.
366  *	Called by p_online(2) when a processor is being taken off-line.
367  *	This allows interrupt threads being handled on the processor to
368  *	complete before the processor is idled.
369  */
370 int
371 cpu_disable_intr(struct cpu *cp)
372 {
373 	ASSERT(MUTEX_HELD(&cpu_lock));
374 
375 	/*
376 	 * Turn off the CPU_ENABLE flag before calling the redistribution
377 	 * function, since it checks for this in the cpu flags.
378 	 */
379 	cp->cpu_flags &= ~CPU_ENABLE;
380 
381 	intr_redist_all_cpus();
382 
383 	return (0);
384 }
385 
386 /*
387  * Allow the specified CPU to participate in interrupts.
388  *	Called by p_online(2) if a processor could not be taken off-line
389  *	because of bound threads, in order to resume processing interrupts.
390  *	Also called after starting a processor.
391  */
392 void
393 cpu_enable_intr(struct cpu *cp)
394 {
395 	ASSERT(MUTEX_HELD(&cpu_lock));
396 
397 	cp->cpu_flags |= CPU_ENABLE;
398 
399 	intr_redist_all_cpus();
400 }
401 
402 /*
403  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
404  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
405  * are issued to redirect interrupts of a specified weight, from heavy to
406  * light.  This allows all the interrupts of a given weight to be redistributed
407  * for all weighted nexus drivers prior to those of less weight.
408  */
409 static void
410 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
411 {
412 	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
413 	struct intr_dist *iptr;
414 	struct intr_dist **pptr;
415 
416 	ASSERT(func);
417 	new->func = func;
418 	new->arg = arg;
419 	new->next = NULL;
420 
421 	/* Add to tail so that redistribution occurs in original order. */
422 	mutex_enter(&intr_dist_lock);
423 	for (iptr = *phead, pptr = phead; iptr != NULL;
424 	    pptr = &iptr->next, iptr = iptr->next) {
425 		/* check for problems as we locate the tail */
426 		if ((iptr->func == func) && (iptr->arg == arg)) {
427 			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
428 			/*NOTREACHED*/
429 		}
430 	}
431 	*pptr = new;
432 
433 	mutex_exit(&intr_dist_lock);
434 }
435 
436 void
437 intr_dist_add(void (*func)(void *), void *arg)
438 {
439 	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
440 }
441 
442 void
443 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
444 {
445 	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
446 }
447 
448 /*
449  * Search for the interrupt distribution structure with the specified
450  * mondo vec reg in the interrupt distribution list. If a match is found,
451  * then delete the entry from the list. The caller is responsible for
452  * modifying the mondo vector registers.
453  */
454 static void
455 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
456 {
457 	struct intr_dist *iptr;
458 	struct intr_dist **vect;
459 
460 	mutex_enter(&intr_dist_lock);
461 	for (iptr = *headp, vect = headp;
462 	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
463 		if ((iptr->func == func) && (iptr->arg == arg)) {
464 			*vect = iptr->next;
465 			kmem_free(iptr, sizeof (struct intr_dist));
466 			mutex_exit(&intr_dist_lock);
467 			return;
468 		}
469 	}
470 
471 	if (!panicstr)
472 		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
473 	mutex_exit(&intr_dist_lock);
474 }
475 
476 void
477 intr_dist_rem(void (*func)(void *), void *arg)
478 {
479 	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
480 }
481 
482 void
483 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
484 {
485 	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
486 }
487 
488 /*
489  * Initiate interrupt redistribution.  Redistribution improves the isolation
490  * associated with interrupt weights by ordering operations from heavy weight
491  * to light weight.  When a CPUs orientation changes relative to interrupts,
492  * there is *always* a redistribution to accommodate this change (call to
493  * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
494  * that a redistribution could improve the quality of an initialization. For
495  * example, if you are not using a NIC it may not be attached with s10 (devfs).
496  * If you then configure the NIC (ifconfig), this may cause the NIC to attach
497  * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
498  * occurring late, so optimal "isolation" relative to weight is not occurring.
499  * The same applies to detach, although in this case doing the redistribution
500  * might improve "spread" for medium weight devices since the "isolation" of
501  * a higher weight device may no longer be present.
502  *
503  * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
504  *
505  * NB: There is risk associated with automatically triggering execution of the
506  * redistribution code at arbitrary times. The risk comes from the fact that
507  * there is a lot of low-level hardware interaction associated with a
508  * redistribution.  At some point we may want this code to perform automatic
509  * redistribution (redistribution thread; trigger timeout when add/remove
510  * weight delta is large enough, and call cv_signal from timeout - causing
511  * thead to call i_ddi_intr_redist_all_cpus()) but this is considered too
512  * risky at this time.
513  */
514 void
515 i_ddi_intr_redist_all_cpus()
516 {
517 	mutex_enter(&cpu_lock);
518 	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
519 	intr_redist_all_cpus();
520 	mutex_exit(&cpu_lock);
521 }
522 
523 /*
524  * Redistribute all interrupts
525  *
526  * This function redistributes all interrupting devices, running the
527  * parent callback functions for each node.
528  */
529 void
530 intr_redist_all_cpus(void)
531 {
532 	struct cpu *cp;
533 	struct intr_dist *iptr;
534 	int32_t weight, max_weight;
535 
536 	ASSERT(MUTEX_HELD(&cpu_lock));
537 	mutex_enter(&intr_dist_lock);
538 
539 	/*
540 	 * zero cpu_intr_weight on all cpus - it is safe to traverse
541 	 * cpu_list since we hold cpu_lock.
542 	 */
543 	cp = cpu_list;
544 	do {
545 		cp->cpu_intr_weight = 0;
546 	} while ((cp = cp->cpu_next) != cpu_list);
547 
548 	/*
549 	 * Assume that this redistribution may encounter a device weight
550 	 * via driver.conf tuning of "ddi-intr-weight" that is at most
551 	 * intr_dist_weight_maxfactor times larger.
552 	 */
553 	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
554 	if (max_weight > intr_dist_weight_maxmax)
555 		max_weight = intr_dist_weight_maxmax;
556 	intr_dist_weight_max = 1;
557 
558 	INTR_DEBUG((CE_CONT, "intr_dist: "
559 	    "intr_redist_all_cpus: %d-0\n", max_weight));
560 
561 	/*
562 	 * Redistribute weighted, from heavy to light.  The callback that
563 	 * specifies a weight equal to weight_max should redirect all
564 	 * interrupts of weight weight_max or greater [weight_max, inf.).
565 	 * Interrupts of lesser weight should be processed on the call with
566 	 * the matching weight. This allows all the heaver weight interrupts
567 	 * on all weighted busses (multiple pci busses) to be redirected prior
568 	 * to any lesser weight interrupts.
569 	 */
570 	for (weight = max_weight; weight >= 0; weight--)
571 		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
572 			((void (*)(void *, int32_t, int32_t))iptr->func)
573 			    (iptr->arg, max_weight, weight);
574 
575 	/* redistribute normal (non-weighted) interrupts */
576 	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
577 		((void (*)(void *))iptr->func)(iptr->arg);
578 	mutex_exit(&intr_dist_lock);
579 }
580 
581 void
582 intr_redist_all_cpus_shutdown(void)
583 {
584 	intr_policy = INTR_CURRENT_CPU;
585 	intr_redist_all_cpus();
586 }
587 
588 /*
589  * Determine what CPU to target, based on interrupt policy.
590  *
591  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
592  *	advance through interrupt enabled cpus (round-robin).
593  *
594  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
595  *	cpu_intr_weight, round robin when all equal.
596  *
597  *	Weighted interrupt distribution provides two things: "spread" of weight
598  *	(associated with algorithm itself) and "isolation" (associated with a
599  *	particular device weight). A redistribution is what provides optimal
600  *	"isolation" of heavy weight interrupts, optimal "spread" of weight
601  *	(relative to what came before) is always occurring.
602  *
603  *	An interrupt weight is a subjective number that represents the
604  *	percentage of a CPU required to service a device's interrupts: the
605  *	default weight is 0% (however the algorithm still maintains
606  *	round-robin), a network interface controller (NIC) may have a large
607  *	weight (35%). Interrupt weight only has meaning relative to the
608  *	interrupt weight of other devices: a CPU can be weighted more than
609  *	100%, and a single device might consume more than 100% of a CPU.
610  *
611  *	A coarse interrupt weight can be defined by the parent nexus driver
612  *	based on bus specific information, like pci class codes. A nexus
613  *	driver that supports device interrupt weighting for its children
614  *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
615  *	and removes the weight of a device from the CPU that an interrupt
616  *	is directed at.  The quality of initialization improves when the
617  *	device interrupt weights more accuracy reflect actual run-time weights,
618  *	and as the assignments are ordered from is heavy to light.
619  *
620  *	The implementation also supports interrupt weight being specified in
621  *	driver.conf files via the property "ddi-intr-weight", which takes
622  *	precedence over the nexus supplied weight.  This support is added to
623  *	permit possible tweaking in the product in response to customer
624  *	problems. This is not a formal or committed interface.
625  *
626  *	While a weighted approach chooses the CPU providing the best spread
627  *	given past weights, less than optimal isolation can result in cases
628  *	where heavy weight devices show up last. The nexus driver's interrupt
629  *	redistribution logic should use intr_dist_add/rem_weighted so that
630  *	interrupts can be redistributed heavy first for optimal isolation.
631  */
632 uint32_t
633 intr_dist_cpuid(void)
634 {
635 	static struct cpu	*curr_cpu;
636 	struct cpu		*start_cpu;
637 	struct cpu		*new_cpu;
638 	struct cpu		*cp;
639 	int			cpuid = -1;
640 
641 	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
642 	mutex_enter(&intr_dist_cpu_lock);
643 
644 	switch (intr_policy) {
645 	case INTR_CURRENT_CPU:
646 		cpuid = CPU->cpu_id;
647 		break;
648 
649 	case INTR_BOOT_CPU:
650 		panic("INTR_BOOT_CPU no longer supported.");
651 		/*NOTREACHED*/
652 
653 	case INTR_FLAT_DIST:
654 	case INTR_WEIGHTED_DIST:
655 	default:
656 		/*
657 		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
658 		 * the cpu has been deleted (cpu structs are never freed).
659 		 */
660 		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
661 			curr_cpu = CPU;
662 
663 		/*
664 		 * Advance to online CPU after curr_cpu (round-robin). For
665 		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
666 		 * weight.  For a nexus that does not support weight the
667 		 * default weight of zero is used. We degrade to round-robin
668 		 * behavior among equal weightes.  The default weight is zero
669 		 * and round-robin behavior continues.
670 		 *
671 		 * Disable preemption while traversing cpu_next_onln to
672 		 * ensure the list does not change.  This works because
673 		 * modifiers of this list and other lists in a struct cpu
674 		 * call pause_cpus() before making changes.
675 		 */
676 		kpreempt_disable();
677 		cp = start_cpu = curr_cpu->cpu_next_onln;
678 		new_cpu = NULL;
679 		do {
680 			/* Skip CPUs with interrupts disabled */
681 			if ((cp->cpu_flags & CPU_ENABLE) == 0)
682 				continue;
683 
684 			if (intr_policy == INTR_FLAT_DIST) {
685 				/* select CPU */
686 				new_cpu = cp;
687 				break;
688 			} else if ((new_cpu == NULL) ||
689 			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
690 				/* Choose if lighter weight */
691 				new_cpu = cp;
692 			}
693 		} while ((cp = cp->cpu_next_onln) != start_cpu);
694 		ASSERT(new_cpu);
695 		cpuid = new_cpu->cpu_id;
696 
697 		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
698 		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));
699 
700 		/* update static pointer for next round-robin */
701 		curr_cpu = new_cpu;
702 		kpreempt_enable();
703 		break;
704 	}
705 	mutex_exit(&intr_dist_cpu_lock);
706 	return (cpuid);
707 }
708 
709 /*
710  * Add or remove the the weight of a device from a CPUs interrupt weight.
711  *
712  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
713  * their children to improve the overall quality of interrupt initialization.
714  *
715  * If a nexues shares the CPU returned by a single intr_dist_cpuid() call
716  * among multiple devices (sharing ino) then the nexus should call
717  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
718  * that share must specify the same cpuid.
719  *
720  * If a nexus driver is unable to determine the cpu at remove_intr time
721  * for some of its interrupts, then it should not call add_device_weight -
722  * intr_dist_cpuid will still provide round-robin.
723  *
724  * An established device weight (from dev_info node) takes precedence over
725  * the weight passed in.  If a device weight is not already established
726  * then the passed in nexus weight is established.
727  */
728 void
729 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
730     dev_info_t *dip, int32_t nweight)
731 {
732 	int32_t		eweight;
733 
734 	/*
735 	 * For non-weighted policy everything has weight of zero (and we get
736 	 * round-robin distribution from intr_dist_cpuid).
737 	 * NB: intr_policy is limited to this file. A weighted nexus driver is
738 	 * calls this rouitne even if intr_policy has been patched to
739 	 * INTR_FLAG_DIST.
740 	 */
741 	ASSERT(dip);
742 	if (intr_policy != INTR_WEIGHTED_DIST)
743 		return;
744 
745 	eweight = i_ddi_get_intr_weight(dip);
746 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
747 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
748 	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
749 	    ddi_get_instance(ddi_get_parent(dip)),
750 	    ddi_driver_name(dip), ddi_get_instance(dip)));
751 
752 	/* if no establish weight, establish nexus weight */
753 	if (eweight < 0) {
754 		if (nweight > 0)
755 			(void) i_ddi_set_intr_weight(dip, nweight);
756 		else
757 			nweight = 0;
758 	} else
759 		nweight = eweight;	/* use established weight */
760 
761 	/* Establish exclusion for cpu_intr_weight manipulation */
762 	mutex_enter(&intr_dist_cpu_lock);
763 	cpu[cpuid]->cpu_intr_weight += nweight;
764 
765 	/* update intr_dist_weight_max */
766 	if (nweight > intr_dist_weight_max)
767 		intr_dist_weight_max = nweight;
768 	mutex_exit(&intr_dist_cpu_lock);
769 }
770 
771 void
772 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
773 {
774 	struct cpu	*cp;
775 	int32_t		weight;
776 
777 	ASSERT(dip);
778 	if (intr_policy != INTR_WEIGHTED_DIST)
779 		return;
780 
781 	/* remove weight of device from cpu */
782 	weight = i_ddi_get_intr_weight(dip);
783 	if (weight < 0)
784 		weight = 0;
785 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
786 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
787 	    ddi_driver_name(ddi_get_parent(dip)),
788 	    ddi_get_instance(ddi_get_parent(dip)),
789 	    ddi_driver_name(dip), ddi_get_instance(dip)));
790 
791 	/* Establish exclusion for cpu_intr_weight manipulation */
792 	mutex_enter(&intr_dist_cpu_lock);
793 	cp = cpu[cpuid];
794 	cp->cpu_intr_weight -= weight;
795 	if (cp->cpu_intr_weight < 0)
796 		cp->cpu_intr_weight = 0;	/* sanity */
797 	mutex_exit(&intr_dist_cpu_lock);
798 }
799