xref: /illumos-gate/usr/src/uts/sun4/os/intr.c (revision f8047eab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/sysmacros.h>
30 #include <sys/stack.h>
31 #include <sys/cpuvar.h>
32 #include <sys/ivintr.h>
33 #include <sys/intreg.h>
34 #include <sys/membar.h>
35 #include <sys/kmem.h>
36 #include <sys/intr.h>
37 #include <sys/sunndi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/privregs.h>
40 #include <sys/systm.h>
41 #include <sys/archsystm.h>
42 #include <sys/machsystm.h>
43 #include <sys/x_call.h>
44 #include <vm/seg_kp.h>
45 #include <sys/debug.h>
46 #include <sys/cyclic.h>
47 
48 #include <sys/cpu_sgnblk_defs.h>
49 
50 kmutex_t soft_iv_lock;	/* protect software interrupt vector table */
51 /* Global locks which protect the interrupt distribution lists */
52 static kmutex_t intr_dist_lock;
53 static kmutex_t intr_dist_cpu_lock;
54 
55 /* Head of the interrupt distribution lists */
56 static struct intr_dist *intr_dist_head = NULL;
57 static struct intr_dist *intr_dist_whead = NULL;
58 
59 uint_t swinum_base;
60 uint_t maxswinum;
61 uint_t siron_inum;
62 uint_t poke_cpu_inum;
/*
 * Note:-
 * siron_pending was originally created to prevent a resource over consumption
 * bug in setsoftint (exhaustion of the interrupt pool free list).
 * Its original intention is obsolete with the use of iv_pending in
 * setsoftint. However, siron_pending stayed around, acting as a second
 * gatekeeper preventing soft interrupts from being queued. In this capacity,
 * it can lead to hangs on MP systems, where due to global visibility issues
 * it can end up set while iv_pending is reset, preventing soft interrupts from
 * ever being processed. In addition to its gatekeeper role, intr_init also
 * uses it to flag the situation where siron() was called before siron_inum has
 * been defined.
 *
 * siron() does not need an extra gatekeeper; any cpu that wishes should be
 * allowed to queue a soft interrupt. It is softint()'s job to ensure
 * correct handling of the queues. Therefore, siron_pending has been
 * stripped of its gatekeeper task, retaining only its intr_init job, where
 * it indicates that there is a pending need to call siron().
 */
82 int siron_pending;
83 
84 int intr_policy = INTR_WEIGHTED_DIST;	/* interrupt distribution policy */
85 int intr_dist_debug = 0;
86 int32_t intr_dist_weight_max = 1;
87 int32_t intr_dist_weight_maxmax = 1000;
88 int intr_dist_weight_maxfactor = 2;
/*
 * Issue a cmn_err() message only when intr_dist_debug is nonzero.  'args'
 * is the complete, parenthesized cmn_err() argument list.  The expansion is
 * wrapped in do/while (0) so that it is a single statement and cannot
 * capture an 'else' that follows the invocation.
 */
#define	INTR_DEBUG(args) do { if (intr_dist_debug) cmn_err args; } while (0)
90 
91 static void sw_ivintr_init(cpu_t *);
92 
93 /*
94  * intr_init() - interrupt initialization
95  *	Initialize the system's software interrupt vector table and
96  *	CPU's interrupt free list
97  */
98 void
99 intr_init(cpu_t *cp)
100 {
101 	init_ivintr();
102 	sw_ivintr_init(cp);
103 	init_intr_pool(cp);
104 
105 	mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
106 	mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
107 
108 	/*
109 	 * A soft interrupt may have been requested prior to the initialization
110 	 * of soft interrupts.  Soft interrupts can't be dispatched until after
111 	 * init_intr_pool, so we have to wait until now before we can dispatch
112 	 * the pending soft interrupt (if any).
113 	 */
114 	if (siron_pending) {
115 		siron_pending = 0;
116 		siron();
117 	}
118 }
119 
120 /*
121  * poke_cpu_intr - fall through when poke_cpu calls
122  */
123 
124 /* ARGSUSED */
125 uint_t
126 poke_cpu_intr(caddr_t arg1, caddr_t arg2)
127 {
128 	CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
129 	membar_stld_stst();
130 	return (1);
131 }
132 
133 /*
134  * sw_ivintr_init() - software interrupt vector initialization
135  *	called after CPU is active
136  *	the software interrupt vector table is part of the intr_vector[]
137  */
138 static void
139 sw_ivintr_init(cpu_t *cp)
140 {
141 	extern uint_t softlevel1();
142 
143 	mutex_init(&soft_iv_lock, NULL, MUTEX_DEFAULT, NULL);
144 
145 	swinum_base = SOFTIVNUM;
146 
147 	/*
148 	 * the maximum software interrupt == MAX_SOFT_INO
149 	 */
150 	maxswinum = swinum_base + MAX_SOFT_INO;
151 
152 	REGISTER_BBUS_INTR();
153 
154 	siron_inum = add_softintr(PIL_1, softlevel1, 0);
155 	poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0);
156 	cp->cpu_m.poke_cpu_outstanding = B_FALSE;
157 }
158 
159 cpuset_t intr_add_pools_inuse;
160 
161 /*
162  * cleanup_intr_pool()
163  *	Free up the extra intr request pool for this cpu.
164  */
165 void
166 cleanup_intr_pool(cpu_t *cp)
167 {
168 	extern struct intr_req *intr_add_head;
169 	int poolno;
170 	struct intr_req *pool;
171 
172 	poolno = cp->cpu_m.intr_pool_added;
173 	if (poolno >= 0) {
174 		cp->cpu_m.intr_pool_added = -1;
175 		pool = (poolno * INTR_PENDING_MAX * intr_add_pools) +
176 
177 			intr_add_head;	/* not byte arithmetic */
178 		bzero(pool, INTR_PENDING_MAX * intr_add_pools *
179 		    sizeof (struct intr_req));
180 
181 		CPUSET_DEL(intr_add_pools_inuse, poolno);
182 	}
183 }
184 
185 /*
186  * init_intr_pool()
187  *	initialize the intr request pool for the cpu
188  * 	should be called for each cpu
189  */
190 void
191 init_intr_pool(cpu_t *cp)
192 {
193 	extern struct intr_req *intr_add_head;
194 #ifdef	DEBUG
195 	extern struct intr_req *intr_add_tail;
196 #endif	/* DEBUG */
197 	int i, pool;
198 
199 	cp->cpu_m.intr_pool_added = -1;
200 
201 	for (i = 0; i < INTR_PENDING_MAX-1; i++) {
202 		cp->cpu_m.intr_pool[i].intr_next =
203 		    &cp->cpu_m.intr_pool[i+1];
204 	}
205 	cp->cpu_m.intr_pool[INTR_PENDING_MAX-1].intr_next = NULL;
206 
207 	cp->cpu_m.intr_head[0] = &cp->cpu_m.intr_pool[0];
208 	cp->cpu_m.intr_tail[0] = &cp->cpu_m.intr_pool[INTR_PENDING_MAX-1];
209 
210 	if (intr_add_pools != 0) {
211 
212 		/*
213 		 * If additional interrupt pools have been allocated,
214 		 * initialize those too and add them to the free list.
215 		 */
216 
217 		struct intr_req *trace;
218 
219 		for (pool = 0; pool < max_ncpus; pool++) {
220 			if (!(CPU_IN_SET(intr_add_pools_inuse, pool)))
221 			    break;
222 		}
223 		if (pool >= max_ncpus) {
224 			/*
225 			 * XXX - intr pools are alloc'd, just not as
226 			 * much as we would like.
227 			 */
228 			cmn_err(CE_WARN, "Failed to alloc all requested intr "
229 			    "pools for cpu%d", cp->cpu_id);
230 			return;
231 		}
232 		CPUSET_ADD(intr_add_pools_inuse, pool);
233 		cp->cpu_m.intr_pool_added = pool;
234 
235 		trace = (pool * INTR_PENDING_MAX * intr_add_pools) +
236 			intr_add_head;	/* not byte arithmetic */
237 
238 		cp->cpu_m.intr_pool[INTR_PENDING_MAX-1].intr_next = trace;
239 
240 		for (i = 1; i < intr_add_pools * INTR_PENDING_MAX; i++, trace++)
241 			trace->intr_next = trace + 1;
242 		trace->intr_next = NULL;
243 
244 		ASSERT(trace >= intr_add_head && trace <= intr_add_tail);
245 
246 		cp->cpu_m.intr_tail[0] = trace;
247 	}
248 }
249 
250 
251 /*
252  * siron - primitive for sun/os/softint.c
253  */
254 void
255 siron(void)
256 {
257 	if (siron_inum != 0)
258 		setsoftint(siron_inum);
259 	else
260 		siron_pending = 1;
261 }
262 
263 /*
264  * no_ivintr()
265  * 	called by vec_interrupt() through sys_trap()
266  *	vector interrupt received but not valid or not
267  *	registered in intr_vector[]
268  *	considered as a spurious mondo interrupt
269  */
270 /* ARGSUSED */
271 void
272 no_ivintr(struct regs *rp, int inum, int pil)
273 {
274 	cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
275 	    inum, pil);
276 
277 
278 #ifdef DEBUG_VEC_INTR
279 	prom_enter_mon();
280 #endif /* DEBUG_VEC_INTR */
281 }
282 
283 /*
284  * no_intr_pool()
285  * 	called by vec_interrupt() through sys_trap()
286  *	vector interrupt received but no intr_req entries
287  */
288 /* ARGSUSED */
289 void
290 no_intr_pool(struct regs *rp, int inum, int pil)
291 {
292 #ifdef DEBUG_VEC_INTR
293 	cmn_err(CE_WARN, "intr_req pool empty: num 0x%x, pil 0x%x",
294 		inum, pil);
295 	prom_enter_mon();
296 #else
297 	cmn_err(CE_PANIC, "intr_req pool empty: num 0x%x, pil 0x%x",
298 		inum, pil);
299 #endif /* DEBUG_VEC_INTR */
300 }
301 
302 void
303 intr_dequeue_req(uint_t pil, uint32_t inum)
304 {
305 	struct intr_req *ir, *prev;
306 	struct machcpu *mcpu;
307 	uint32_t clr;
308 	extern uint_t getpstate(void);
309 
310 	ASSERT((getpstate() & PSTATE_IE) == 0);
311 
312 	mcpu = &CPU->cpu_m;
313 
314 	/* Find a matching entry in the list */
315 	prev = NULL;
316 	ir = mcpu->intr_head[pil];
317 	while (ir != NULL) {
318 		if (ir->intr_number == inum)
319 			break;
320 		prev = ir;
321 		ir = ir->intr_next;
322 	}
323 	if (ir != NULL) {
324 		/*
325 		 * Remove entry from list
326 		 */
327 		if (prev != NULL)
328 			prev->intr_next = ir->intr_next;	/* non-head */
329 		else
330 			mcpu->intr_head[pil] = ir->intr_next;	/* head */
331 
332 		if (ir->intr_next == NULL)
333 			mcpu->intr_tail[pil] = prev;		/* tail */
334 
335 		/*
336 		 * Place on free list
337 		 */
338 		ir->intr_next = mcpu->intr_head[0];
339 		mcpu->intr_head[0] = ir;
340 	}
341 
342 	/*
343 	 * clear pending interrupts at this level if the list is empty
344 	 */
345 	if (mcpu->intr_head[pil] == NULL) {
346 		clr = 1 << pil;
347 		if (pil == PIL_14)
348 			clr |= (TICK_INT_MASK | STICK_INT_MASK);
349 		wr_clr_softint(clr);
350 	}
351 }
352 
353 
354 /*
355  * Send a directed interrupt of specified interrupt number id to a cpu.
356  */
357 void
358 send_dirint(
359 	int cpuix,		/* cpu to be interrupted */
360 	int intr_id)		/* interrupt number id */
361 {
362 	xt_one(cpuix, setsoftint_tl1, intr_id, 0);
363 }
364 
365 void
366 init_intr_threads(struct cpu *cp)
367 {
368 	int i;
369 
370 	for (i = 0; i < NINTR_THREADS; i++)
371 		thread_create_intr(cp);
372 
373 	cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE,
374 		KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) +
375 		INTR_STACK_SIZE - SA(MINFRAME);
376 }
377 
378 /*
379  * Take the specified CPU out of participation in interrupts.
380  *	Called by p_online(2) when a processor is being taken off-line.
381  *	This allows interrupt threads being handled on the processor to
382  *	complete before the processor is idled.
383  */
384 int
385 cpu_disable_intr(struct cpu *cp)
386 {
387 	ASSERT(MUTEX_HELD(&cpu_lock));
388 
389 	/*
390 	 * Turn off the CPU_ENABLE flag before calling the redistribution
391 	 * function, since it checks for this in the cpu flags.
392 	 */
393 	cp->cpu_flags &= ~CPU_ENABLE;
394 
395 	intr_redist_all_cpus();
396 
397 	return (0);
398 }
399 
400 /*
401  * Allow the specified CPU to participate in interrupts.
402  *	Called by p_online(2) if a processor could not be taken off-line
403  *	because of bound threads, in order to resume processing interrupts.
404  *	Also called after starting a processor.
405  */
406 void
407 cpu_enable_intr(struct cpu *cp)
408 {
409 	ASSERT(MUTEX_HELD(&cpu_lock));
410 
411 	cp->cpu_flags |= CPU_ENABLE;
412 
413 	intr_redist_all_cpus();
414 }
415 
416 /*
417  * Add function to callback list for intr_redist_all_cpus.  We keep two lists,
418  * one for weighted callbacks and one for normal callbacks. Weighted callbacks
419  * are issued to redirect interrupts of a specified weight, from heavy to
420  * light.  This allows all the interrupts of a given weight to be redistributed
421  * for all weighted nexus drivers prior to those of less weight.
422  */
423 static void
424 intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
425 {
426 	struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
427 	struct intr_dist *iptr;
428 	struct intr_dist **pptr;
429 
430 	ASSERT(func);
431 	new->func = func;
432 	new->arg = arg;
433 	new->next = NULL;
434 
435 	/* Add to tail so that redistribution occurs in original order. */
436 	mutex_enter(&intr_dist_lock);
437 	for (iptr = *phead, pptr = phead; iptr != NULL;
438 	    pptr = &iptr->next, iptr = iptr->next) {
439 		/* check for problems as we locate the tail */
440 		if ((iptr->func == func) && (iptr->arg == arg)) {
441 			cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
442 			/*NOTREACHED*/
443 		}
444 	}
445 	*pptr = new;
446 
447 	mutex_exit(&intr_dist_lock);
448 }
449 
450 void
451 intr_dist_add(void (*func)(void *), void *arg)
452 {
453 	intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
454 }
455 
456 void
457 intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
458 {
459 	intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
460 }
461 
462 /*
463  * Search for the interrupt distribution structure with the specified
464  * mondo vec reg in the interrupt distribution list. If a match is found,
465  * then delete the entry from the list. The caller is responsible for
466  * modifying the mondo vector registers.
467  */
468 static void
469 intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
470 {
471 	struct intr_dist *iptr;
472 	struct intr_dist **vect;
473 
474 	mutex_enter(&intr_dist_lock);
475 	for (iptr = *headp, vect = headp;
476 	    iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
477 		if ((iptr->func == func) && (iptr->arg == arg)) {
478 			*vect = iptr->next;
479 			kmem_free(iptr, sizeof (struct intr_dist));
480 			mutex_exit(&intr_dist_lock);
481 			return;
482 		}
483 	}
484 
485 	if (!panicstr)
486 		cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
487 	mutex_exit(&intr_dist_lock);
488 }
489 
490 void
491 intr_dist_rem(void (*func)(void *), void *arg)
492 {
493 	intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
494 }
495 
496 void
497 intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
498 {
499 	intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
500 }
501 
502 /*
503  * Initiate interrupt redistribution.  Redistribution improves the isolation
504  * associated with interrupt weights by ordering operations from heavy weight
505  * to light weight.  When a CPUs orientation changes relative to interrupts,
506  * there is *always* a redistribution to accommodate this change (call to
507  * intr_redist_all_cpus()).  As devices (not CPUs) attach/detach it is possible
508  * that a redistribution could improve the quality of an initialization. For
509  * example, if you are not using a NIC it may not be attached with s10 (devfs).
510  * If you then configure the NIC (ifconfig), this may cause the NIC to attach
511  * and plumb interrupts.  The CPU assignment for the NIC's interrupts is
512  * occurring late, so optimal "isolation" relative to weight is not occurring.
513  * The same applies to detach, although in this case doing the redistribution
514  * might improve "spread" for medium weight devices since the "isolation" of
515  * a higher weight device may no longer be present.
516  *
517  * NB: We should provide a utility to trigger redistribution (ala "intradm -r").
518  *
519  * NB: There is risk associated with automatically triggering execution of the
520  * redistribution code at arbitrary times. The risk comes from the fact that
521  * there is a lot of low-level hardware interaction associated with a
522  * redistribution.  At some point we may want this code to perform automatic
523  * redistribution (redistribution thread; trigger timeout when add/remove
524  * weight delta is large enough, and call cv_signal from timeout - causing
525  * thead to call i_ddi_intr_redist_all_cpus()) but this is considered too
526  * risky at this time.
527  */
528 void
529 i_ddi_intr_redist_all_cpus()
530 {
531 	mutex_enter(&cpu_lock);
532 	INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
533 	intr_redist_all_cpus();
534 	mutex_exit(&cpu_lock);
535 }
536 
537 /*
538  * Redistribute all interrupts
539  *
540  * This function redistributes all interrupting devices, running the
541  * parent callback functions for each node.
542  */
543 void
544 intr_redist_all_cpus(void)
545 {
546 	struct cpu *cp;
547 	struct intr_dist *iptr;
548 	int32_t weight, max_weight;
549 
550 	ASSERT(MUTEX_HELD(&cpu_lock));
551 	mutex_enter(&intr_dist_lock);
552 
553 	/*
554 	 * zero cpu_intr_weight on all cpus - it is safe to traverse
555 	 * cpu_list since we hold cpu_lock.
556 	 */
557 	cp = cpu_list;
558 	do {
559 		cp->cpu_intr_weight = 0;
560 	} while ((cp = cp->cpu_next) != cpu_list);
561 
562 	/*
563 	 * Assume that this redistribution may encounter a device weight
564 	 * via driver.conf tuning of "ddi-intr-weight" that is at most
565 	 * intr_dist_weight_maxfactor times larger.
566 	 */
567 	max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
568 	if (max_weight > intr_dist_weight_maxmax)
569 		max_weight = intr_dist_weight_maxmax;
570 	intr_dist_weight_max = 1;
571 
572 	INTR_DEBUG((CE_CONT, "intr_dist: "
573 	    "intr_redist_all_cpus: %d-0\n", max_weight));
574 
575 	/*
576 	 * Redistribute weighted, from heavy to light.  The callback that
577 	 * specifies a weight equal to weight_max should redirect all
578 	 * interrupts of weight weight_max or greater [weight_max, inf.).
579 	 * Interrupts of lesser weight should be processed on the call with
580 	 * the matching weight. This allows all the heaver weight interrupts
581 	 * on all weighted busses (multiple pci busses) to be redirected prior
582 	 * to any lesser weight interrupts.
583 	 */
584 	for (weight = max_weight; weight >= 0; weight--)
585 		for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
586 			((void (*)(void *, int32_t, int32_t))iptr->func)
587 			    (iptr->arg, max_weight, weight);
588 
589 	/* redistribute normal (non-weighted) interrupts */
590 	for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
591 		((void (*)(void *))iptr->func)(iptr->arg);
592 	mutex_exit(&intr_dist_lock);
593 }
594 
595 void
596 intr_redist_all_cpus_shutdown(void)
597 {
598 	intr_policy = INTR_CURRENT_CPU;
599 	intr_redist_all_cpus();
600 }
601 
602 /*
603  * Determine what CPU to target, based on interrupt policy.
604  *
605  * INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
606  *	advance through interrupt enabled cpus (round-robin).
607  *
608  * INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
609  *	cpu_intr_weight, round robin when all equal.
610  *
611  *	Weighted interrupt distribution provides two things: "spread" of weight
612  *	(associated with algorithm itself) and "isolation" (associated with a
613  *	particular device weight). A redistribution is what provides optimal
614  *	"isolation" of heavy weight interrupts, optimal "spread" of weight
615  *	(relative to what came before) is always occurring.
616  *
617  *	An interrupt weight is a subjective number that represents the
618  *	percentage of a CPU required to service a device's interrupts: the
619  *	default weight is 0% (however the algorithm still maintains
620  *	round-robin), a network interface controller (NIC) may have a large
621  *	weight (35%). Interrupt weight only has meaning relative to the
622  *	interrupt weight of other devices: a CPU can be weighted more than
623  *	100%, and a single device might consume more than 100% of a CPU.
624  *
 *	A coarse interrupt weight can be defined by the parent nexus driver
 *	based on bus specific information, like pci class codes. A nexus
 *	driver that supports device interrupt weighting for its children
 *	should call intr_dist_cpuid_add/rem_device_weight(), which adds
 *	and removes the weight of a device from the CPU that an interrupt
 *	is directed at.  The quality of initialization improves when the
 *	device interrupt weights more accurately reflect actual run-time
 *	weights, and as the assignments are ordered from heavy to light.
633  *
634  *	The implementation also supports interrupt weight being specified in
635  *	driver.conf files via the property "ddi-intr-weight", which takes
636  *	precedence over the nexus supplied weight.  This support is added to
637  *	permit possible tweaking in the product in response to customer
638  *	problems. This is not a formal or committed interface.
639  *
640  *	While a weighted approach chooses the CPU providing the best spread
641  *	given past weights, less than optimal isolation can result in cases
642  *	where heavy weight devices show up last. The nexus driver's interrupt
643  *	redistribution logic should use intr_dist_add/rem_weighted so that
644  *	interrupts can be redistributed heavy first for optimal isolation.
645  */
646 uint32_t
647 intr_dist_cpuid(void)
648 {
649 	static struct cpu	*curr_cpu;
650 	struct cpu		*start_cpu;
651 	struct cpu		*new_cpu;
652 	struct cpu		*cp;
653 	int			cpuid = -1;
654 
655 	/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
656 	mutex_enter(&intr_dist_cpu_lock);
657 
658 	switch (intr_policy) {
659 	case INTR_CURRENT_CPU:
660 		cpuid = CPU->cpu_id;
661 		break;
662 
663 	case INTR_BOOT_CPU:
664 		panic("INTR_BOOT_CPU no longer supported.");
665 		/*NOTREACHED*/
666 
667 	case INTR_FLAT_DIST:
668 	case INTR_WEIGHTED_DIST:
669 	default:
670 		/*
671 		 * Ensure that curr_cpu is valid - cpu_next will be NULL if
672 		 * the cpu has been deleted (cpu structs are never freed).
673 		 */
674 		if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
675 			curr_cpu = CPU;
676 
677 		/*
678 		 * Advance to online CPU after curr_cpu (round-robin). For
679 		 * INTR_WEIGHTED_DIST we choose the cpu with the lightest
680 		 * weight.  For a nexus that does not support weight the
681 		 * default weight of zero is used. We degrade to round-robin
682 		 * behavior among equal weightes.  The default weight is zero
683 		 * and round-robin behavior continues.
684 		 *
685 		 * Disable preemption while traversing cpu_next_onln to
686 		 * ensure the list does not change.  This works because
687 		 * modifiers of this list and other lists in a struct cpu
688 		 * call pause_cpus() before making changes.
689 		 */
690 		kpreempt_disable();
691 		cp = start_cpu = curr_cpu->cpu_next_onln;
692 		new_cpu = NULL;
693 		do {
694 			/* Skip CPUs with interrupts disabled */
695 			if ((cp->cpu_flags & CPU_ENABLE) == 0)
696 				continue;
697 
698 			if (intr_policy == INTR_FLAT_DIST) {
699 				/* select CPU */
700 				new_cpu = cp;
701 				break;
702 			} else if ((new_cpu == NULL) ||
703 			    (cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
704 				/* Choose if lighter weight */
705 				new_cpu = cp;
706 			}
707 		} while ((cp = cp->cpu_next_onln) != start_cpu);
708 		ASSERT(new_cpu);
709 		cpuid = new_cpu->cpu_id;
710 
711 		INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
712 		    "targeted\n", cpuid, new_cpu->cpu_intr_weight));
713 
714 		/* update static pointer for next round-robin */
715 		curr_cpu = new_cpu;
716 		kpreempt_enable();
717 		break;
718 	}
719 	mutex_exit(&intr_dist_cpu_lock);
720 	return (cpuid);
721 }
722 
723 /*
724  * Add or remove the the weight of a device from a CPUs interrupt weight.
725  *
726  * We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
727  * their children to improve the overall quality of interrupt initialization.
728  *
729  * If a nexues shares the CPU returned by a single intr_dist_cpuid() call
730  * among multiple devices (sharing ino) then the nexus should call
731  * intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
732  * that share must specify the same cpuid.
733  *
734  * If a nexus driver is unable to determine the cpu at remove_intr time
735  * for some of its interrupts, then it should not call add_device_weight -
736  * intr_dist_cpuid will still provide round-robin.
737  *
738  * An established device weight (from dev_info node) takes precedence over
739  * the weight passed in.  If a device weight is not already established
740  * then the passed in nexus weight is established.
741  */
742 void
743 intr_dist_cpuid_add_device_weight(uint32_t cpuid,
744     dev_info_t *dip, int32_t nweight)
745 {
746 	int32_t		eweight;
747 
748 	/*
749 	 * For non-weighted policy everything has weight of zero (and we get
750 	 * round-robin distribution from intr_dist_cpuid).
751 	 * NB: intr_policy is limited to this file. A weighted nexus driver is
752 	 * calls this rouitne even if intr_policy has been patched to
753 	 * INTR_FLAG_DIST.
754 	 */
755 	ASSERT(dip);
756 	if (intr_policy != INTR_WEIGHTED_DIST)
757 		return;
758 
759 	eweight = i_ddi_get_intr_weight(dip);
760 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
761 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
762 	    nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
763 	    ddi_get_instance(ddi_get_parent(dip)),
764 	    ddi_driver_name(dip), ddi_get_instance(dip)));
765 
766 	/* if no establish weight, establish nexus weight */
767 	if (eweight < 0) {
768 		if (nweight > 0)
769 			(void) i_ddi_set_intr_weight(dip, nweight);
770 		else
771 			nweight = 0;
772 	} else
773 		nweight = eweight;	/* use established weight */
774 
775 	/* Establish exclusion for cpu_intr_weight manipulation */
776 	mutex_enter(&intr_dist_cpu_lock);
777 	cpu[cpuid]->cpu_intr_weight += nweight;
778 
779 	/* update intr_dist_weight_max */
780 	if (nweight > intr_dist_weight_max)
781 		intr_dist_weight_max = nweight;
782 	mutex_exit(&intr_dist_cpu_lock);
783 }
784 
785 void
786 intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
787 {
788 	struct cpu	*cp;
789 	int32_t		weight;
790 
791 	ASSERT(dip);
792 	if (intr_policy != INTR_WEIGHTED_DIST)
793 		return;
794 
795 	/* remove weight of device from cpu */
796 	weight = i_ddi_get_intr_weight(dip);
797 	if (weight < 0)
798 		weight = 0;
799 	INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d    for "
800 	    "%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
801 	    ddi_driver_name(ddi_get_parent(dip)),
802 	    ddi_get_instance(ddi_get_parent(dip)),
803 	    ddi_driver_name(dip), ddi_get_instance(dip)));
804 
805 	/* Establish exclusion for cpu_intr_weight manipulation */
806 	mutex_enter(&intr_dist_cpu_lock);
807 	cp = cpu[cpuid];
808 	cp->cpu_intr_weight -= weight;
809 	if (cp->cpu_intr_weight < 0)
810 		cp->cpu_intr_weight = 0;	/* sanity */
811 	mutex_exit(&intr_dist_cpu_lock);
812 }
813