xref: /illumos-gate/usr/src/uts/i86pc/os/x_call.c (revision bf73eaa5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/t_lock.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/x_call.h>
32 #include <sys/xc_levels.h>
33 #include <sys/cpu.h>
34 #include <sys/psw.h>
35 #include <sys/sunddi.h>
36 #include <sys/debug.h>
37 #include <sys/systm.h>
38 #include <sys/archsystm.h>
39 #include <sys/machsystm.h>
40 #include <sys/mutex_impl.h>
41 #include <sys/stack.h>
42 #include <sys/promif.h>
43 #include <sys/x86_archext.h>
44 
45 /*
46  * Implementation for cross-processor calls via interprocessor interrupts
47  *
48  * This implementation uses a message passing architecture to allow multiple
49  * concurrent cross calls to be in flight at any given time. We use the cmpxchg
50  * instruction, aka casptr(), to implement simple efficient work queues for
51  * message passing between CPUs with almost no need for regular locking.
52  * See xc_extract() and xc_insert() below.
53  *
54  * The general idea is that initiating a cross call means putting a message
55  * on a target(s) CPU's work queue. Any synchronization is handled by passing
56  * the message back and forth between initiator and target(s).
57  *
58  * Every CPU has xc_work_cnt, which indicates it has messages to process.
59  * This value is incremented as message traffic is initiated and decremented
60  * with every message that finishes all processing.
61  *
62  * The code needs no mfence or other membar_*() calls. The uses of
63  * casptr(), cas32() and atomic_dec_32() for the message passing are
64  * implemented with LOCK prefix instructions which are equivalent to mfence.
65  *
66  * One interesting aspect of this implementation is that it allows 2 or more
67  * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
68  * The cross call processing by the CPUs will happen in any order with only
69  * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
70  * from cross calls before all slaves have invoked the function.
71  *
72  * The reason for this asynchronous approach is to allow for fast global
73  * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
74  * on a different Virtual Address at the same time, the old code required
75  * N squared IPIs. With this method, depending on timing, it could happen
76  * with just N IPIs.
77  */
78 
79 /*
80  * The default is to not enable collecting counts of IPI information, since
81  * the updating of shared cachelines could cause excess bus traffic.
82  */
83 uint_t xc_collect_enable = 0;
84 uint64_t xc_total_cnt = 0;	/* total #IPIs sent for cross calls */
85 uint64_t xc_multi_cnt = 0;	/* # times we piggy backed on another IPI */
86 
/*
 * Message states and their normal transitions. A "->" transition is made
 * by the slave cpu and a "=>" transition is made by the master cpu as the
 * message is passed back and forth.
 *
 * FREE => ASYNC ->                       DONE => FREE
 * FREE => CALL ->                        DONE => FREE
 * FREE => SYNC -> WAITING => RELEASED -> DONE => FREE
 *
 * The interesting one above is ASYNC. You might ask, why not go directly
 * to FREE instead of DONE? If it did that, it might be possible to exhaust
 * the master's xc_free list if a master can generate ASYNC messages faster
 * than the slave can process them. That could be handled with more
 * complicated handling. However, since nothing important uses ASYNC, that
 * has not been done.
 */
#define	XC_MSG_FREE	(0)	/* msg in xc_free queue */
#define	XC_MSG_ASYNC	(1)	/* msg in slave xc_msgbox */
#define	XC_MSG_CALL	(2)	/* msg in slave xc_msgbox */
#define	XC_MSG_SYNC	(3)	/* msg in slave xc_msgbox */
#define	XC_MSG_WAITING	(4)	/* msg in master xc_msgbox or xc_waiters */
#define	XC_MSG_RELEASED	(5)	/* msg in slave xc_msgbox */
#define	XC_MSG_DONE	(6)	/* msg in master xc_msgbox */
109 
110 /*
111  * We allow for one high priority message at a time to happen in the system.
112  * This is used for panic, kmdb, etc., so no locking is done.
113  */
114 static volatile cpuset_t xc_priority_set_store;
115 static volatile ulong_t *xc_priority_set = CPUSET2BV(xc_priority_set_store);
116 static xc_data_t xc_priority_data;
117 
/*
 * The atomic bit operations don't accept volatile bit vectors, so these
 * wrappers cast the qualifier away to keep the C compiler from warning.
 */
#define	XC_BT_SET(bv, bit)	BT_ATOMIC_SET((ulong_t *)(bv), (bit))
#define	XC_BT_CLEAR(bv, bit)	BT_ATOMIC_CLEAR((ulong_t *)(bv), (bit))
124 
125 /*
126  * Decrement a CPU's work count
127  */
128 static void
129 xc_decrement(struct machcpu *mcpu)
130 {
131 	atomic_dec_32(&mcpu->xc_work_cnt);
132 }
133 
134 /*
135  * Increment a CPU's work count and return the old value
136  */
137 static int
138 xc_increment(struct machcpu *mcpu)
139 {
140 	int old;
141 	do {
142 		old = mcpu->xc_work_cnt;
143 	} while (cas32((uint32_t *)&mcpu->xc_work_cnt, old, old + 1) != old);
144 	return (old);
145 }
146 
147 /*
148  * Put a message into a queue. The insertion is atomic no matter
149  * how many different inserts/extracts to the same queue happen.
150  */
151 static void
152 xc_insert(void *queue, xc_msg_t *msg)
153 {
154 	xc_msg_t *old_head;
155 
156 	/*
157 	 * FREE messages should only ever be getting inserted into
158 	 * the xc_master CPUs xc_free queue.
159 	 */
160 	ASSERT(msg->xc_command != XC_MSG_FREE ||
161 	    cpu[msg->xc_master] == NULL || /* possible only during init */
162 	    queue == &cpu[msg->xc_master]->cpu_m.xc_free);
163 
164 	do {
165 		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
166 		msg->xc_next = old_head;
167 	} while (casptr(queue, old_head, msg) != old_head);
168 }
169 
170 /*
171  * Extract a message from a queue. The extraction is atomic only
172  * when just one thread does extractions from the queue.
173  * If the queue is empty, NULL is returned.
174  */
175 static xc_msg_t *
176 xc_extract(xc_msg_t **queue)
177 {
178 	xc_msg_t *old_head;
179 
180 	do {
181 		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
182 		if (old_head == NULL)
183 			return (old_head);
184 	} while (casptr(queue, old_head, old_head->xc_next) != old_head);
185 	old_head->xc_next = NULL;
186 	return (old_head);
187 }
188 
189 
190 /*
191  * Initialize the machcpu fields used for cross calls
192  */
193 static uint_t xc_initialized = 0;
194 void
195 xc_init_cpu(struct cpu *cpup)
196 {
197 	xc_msg_t *msg;
198 	int c;
199 
200 	/*
201 	 * add a new msg to each existing CPU's free list, as well as one for
202 	 * my list for each of them. ncpus has an inconsistent value when this
203 	 * function is called, so use cpup->cpu_id.
204 	 */
205 	for (c = 0; c < cpup->cpu_id; ++c) {
206 		if (cpu[c] == NULL)
207 			continue;
208 		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
209 		msg->xc_command = XC_MSG_FREE;
210 		msg->xc_master = c;
211 		xc_insert(&cpu[c]->cpu_m.xc_free, msg);
212 
213 		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
214 		msg->xc_command = XC_MSG_FREE;
215 		msg->xc_master = cpup->cpu_id;
216 		xc_insert(&cpup->cpu_m.xc_free, msg);
217 	}
218 
219 	/*
220 	 * Add one for self messages
221 	 */
222 	msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
223 	msg->xc_command = XC_MSG_FREE;
224 	msg->xc_master = cpup->cpu_id;
225 	xc_insert(&cpup->cpu_m.xc_free, msg);
226 
227 	if (!xc_initialized)
228 		xc_initialized = 1;
229 }
230 
231 /*
232  * X-call message processing routine. Note that this is used by both
233  * senders and recipients of messages.
234  *
235  * We're protected against changing CPUs by either being in a high-priority
236  * interrupt, having preemption disabled or by having a raised SPL.
237  */
238 /*ARGSUSED*/
239 uint_t
240 xc_serv(caddr_t arg1, caddr_t arg2)
241 {
242 	struct machcpu *mcpup = &(CPU->cpu_m);
243 	xc_msg_t *msg;
244 	xc_data_t *data;
245 	xc_msg_t *xc_waiters = NULL;
246 	uint32_t num_waiting = 0;
247 	xc_func_t func;
248 	xc_arg_t a1;
249 	xc_arg_t a2;
250 	xc_arg_t a3;
251 	uint_t rc = DDI_INTR_UNCLAIMED;
252 
253 	while (mcpup->xc_work_cnt != 0) {
254 		rc = DDI_INTR_CLAIMED;
255 
256 		/*
257 		 * We may have to wait for a message to arrive.
258 		 */
259 		for (msg = NULL; msg == NULL;
260 		    msg = xc_extract(&mcpup->xc_msgbox)) {
261 
262 			/*
263 			 * Alway check for and handle a priority message.
264 			 */
265 			if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
266 				func = xc_priority_data.xc_func;
267 				a1 = xc_priority_data.xc_a1;
268 				a2 = xc_priority_data.xc_a2;
269 				a3 = xc_priority_data.xc_a3;
270 				XC_BT_CLEAR(xc_priority_set, CPU->cpu_id);
271 				xc_decrement(mcpup);
272 				func(a1, a2, a3);
273 				if (mcpup->xc_work_cnt == 0)
274 					return (rc);
275 			}
276 
277 			/*
278 			 * wait for a message to arrive
279 			 */
280 			SMT_PAUSE();
281 		}
282 
283 
284 		/*
285 		 * process the message
286 		 */
287 		switch (msg->xc_command) {
288 
289 		/*
290 		 * ASYNC gives back the message immediately, then we do the
291 		 * function and return with no more waiting.
292 		 */
293 		case XC_MSG_ASYNC:
294 			data = &cpu[msg->xc_master]->cpu_m.xc_data;
295 			func = data->xc_func;
296 			a1 = data->xc_a1;
297 			a2 = data->xc_a2;
298 			a3 = data->xc_a3;
299 			msg->xc_command = XC_MSG_DONE;
300 			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
301 			if (func != NULL)
302 				(void) (*func)(a1, a2, a3);
303 			xc_decrement(mcpup);
304 			break;
305 
306 		/*
307 		 * SYNC messages do the call, then send it back to the master
308 		 * in WAITING mode
309 		 */
310 		case XC_MSG_SYNC:
311 			data = &cpu[msg->xc_master]->cpu_m.xc_data;
312 			if (data->xc_func != NULL)
313 				(void) (*data->xc_func)(data->xc_a1,
314 				    data->xc_a2, data->xc_a3);
315 			msg->xc_command = XC_MSG_WAITING;
316 			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
317 			break;
318 
319 		/*
320 		 * WAITING messsages are collected by the master until all
321 		 * have arrived. Once all arrive, we release them back to
322 		 * the slaves
323 		 */
324 		case XC_MSG_WAITING:
325 			xc_insert(&xc_waiters, msg);
326 			if (++num_waiting < mcpup->xc_wait_cnt)
327 				break;
328 			while ((msg = xc_extract(&xc_waiters)) != NULL) {
329 				msg->xc_command = XC_MSG_RELEASED;
330 				xc_insert(&cpu[msg->xc_slave]->cpu_m.xc_msgbox,
331 				    msg);
332 				--num_waiting;
333 			}
334 			if (num_waiting != 0)
335 				panic("wrong number waiting");
336 			mcpup->xc_wait_cnt = 0;
337 			break;
338 
339 		/*
340 		 * CALL messages do the function and then, like RELEASE,
341 		 * send the message is back to master as DONE.
342 		 */
343 		case XC_MSG_CALL:
344 			data = &cpu[msg->xc_master]->cpu_m.xc_data;
345 			if (data->xc_func != NULL)
346 				(void) (*data->xc_func)(data->xc_a1,
347 				    data->xc_a2, data->xc_a3);
348 			/*FALLTHROUGH*/
349 		case XC_MSG_RELEASED:
350 			msg->xc_command = XC_MSG_DONE;
351 			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
352 			xc_decrement(mcpup);
353 			break;
354 
355 		/*
356 		 * DONE means a slave has completely finished up.
357 		 * Once we collect all the DONE messages, we'll exit
358 		 * processing too.
359 		 */
360 		case XC_MSG_DONE:
361 			msg->xc_command = XC_MSG_FREE;
362 			xc_insert(&mcpup->xc_free, msg);
363 			xc_decrement(mcpup);
364 			break;
365 
366 		case XC_MSG_FREE:
367 			panic("free message 0x%p in msgbox", (void *)msg);
368 			break;
369 
370 		default:
371 			panic("bad message 0x%p in msgbox", (void *)msg);
372 			break;
373 		}
374 	}
375 	return (rc);
376 }
377 
378 /*
379  * Initiate cross call processing.
380  */
381 static void
382 xc_common(
383 	xc_func_t func,
384 	xc_arg_t arg1,
385 	xc_arg_t arg2,
386 	xc_arg_t arg3,
387 	ulong_t *set,
388 	uint_t command)
389 {
390 	int c;
391 	struct cpu *cpup;
392 	xc_msg_t *msg;
393 	xc_data_t *data;
394 	int cnt;
395 	int save_spl;
396 
397 	if (!xc_initialized) {
398 		if (BT_TEST(set, CPU->cpu_id) && (CPU->cpu_flags & CPU_READY) &&
399 		    func != NULL)
400 			(void) (*func)(arg1, arg2, arg3);
401 		return;
402 	}
403 
404 	save_spl = splr(ipltospl(XC_HI_PIL));
405 
406 	/*
407 	 * fill in cross call data
408 	 */
409 	data = &CPU->cpu_m.xc_data;
410 	data->xc_func = func;
411 	data->xc_a1 = arg1;
412 	data->xc_a2 = arg2;
413 	data->xc_a3 = arg3;
414 
415 	/*
416 	 * Post messages to all CPUs involved that are CPU_READY
417 	 */
418 	CPU->cpu_m.xc_wait_cnt = 0;
419 	for (c = 0; c < ncpus; ++c) {
420 		if (!BT_TEST(set, c))
421 			continue;
422 		cpup = cpu[c];
423 		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
424 			continue;
425 
426 		/*
427 		 * Fill out a new message.
428 		 */
429 		msg = xc_extract(&CPU->cpu_m.xc_free);
430 		if (msg == NULL)
431 			panic("Ran out of free xc_msg_t's");
432 		msg->xc_command = command;
433 		if (msg->xc_master != CPU->cpu_id)
434 			panic("msg %p has wrong xc_master", (void *)msg);
435 		msg->xc_slave = c;
436 
437 		/*
438 		 * Increment my work count for all messages that I'll
439 		 * transition from DONE to FREE.
440 		 * Also remember how many XC_MSG_WAITINGs to look for
441 		 */
442 		(void) xc_increment(&CPU->cpu_m);
443 		if (command == XC_MSG_SYNC)
444 			++CPU->cpu_m.xc_wait_cnt;
445 
446 		/*
447 		 * Increment the target CPU work count then insert the message
448 		 * in the target msgbox. If I post the first bit of work
449 		 * for the target to do, send an IPI to the target CPU.
450 		 */
451 		cnt = xc_increment(&cpup->cpu_m);
452 		xc_insert(&cpup->cpu_m.xc_msgbox, msg);
453 		if (cpup != CPU) {
454 			if (cnt == 0) {
455 				CPU_STATS_ADDQ(CPU, sys, xcalls, 1);
456 				send_dirint(c, XC_HI_PIL);
457 				if (xc_collect_enable)
458 					++xc_total_cnt;
459 			} else if (xc_collect_enable) {
460 				++xc_multi_cnt;
461 			}
462 		}
463 	}
464 
465 	/*
466 	 * Now drop into the message handler until all work is done
467 	 */
468 	(void) xc_serv(NULL, NULL);
469 	splx(save_spl);
470 }
471 
472 /*
473  * Push out a priority cross call.
474  */
475 static void
476 xc_priority_common(
477 	xc_func_t func,
478 	xc_arg_t arg1,
479 	xc_arg_t arg2,
480 	xc_arg_t arg3,
481 	ulong_t *set)
482 {
483 	int i;
484 	int c;
485 	struct cpu *cpup;
486 
487 	/*
488 	 * Wait briefly for any previous xc_priority to have finished.
489 	 */
490 	for (c = 0; c < ncpus; ++c) {
491 		cpup = cpu[c];
492 		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
493 			continue;
494 
495 		/*
496 		 * The value of 40000 here is from old kernel code. It
497 		 * really should be changed to some time based value, since
498 		 * under a hypervisor, there's no guarantee a remote CPU
499 		 * is even scheduled.
500 		 */
501 		for (i = 0; BT_TEST(xc_priority_set, c) && i < 40000; ++i)
502 			SMT_PAUSE();
503 
504 		/*
505 		 * Some CPU did not respond to a previous priority request. It's
506 		 * probably deadlocked with interrupts blocked or some such
507 		 * problem. We'll just erase the previous request - which was
508 		 * most likely a kmdb_enter that has already expired - and plow
509 		 * ahead.
510 		 */
511 		if (BT_TEST(xc_priority_set, c)) {
512 			XC_BT_CLEAR(xc_priority_set, c);
513 			if (cpup->cpu_m.xc_work_cnt > 0)
514 				xc_decrement(&cpup->cpu_m);
515 		}
516 	}
517 
518 	/*
519 	 * fill in cross call data
520 	 */
521 	xc_priority_data.xc_func = func;
522 	xc_priority_data.xc_a1 = arg1;
523 	xc_priority_data.xc_a2 = arg2;
524 	xc_priority_data.xc_a3 = arg3;
525 
526 	/*
527 	 * Post messages to all CPUs involved that are CPU_READY
528 	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
529 	 */
530 	for (c = 0; c < ncpus; ++c) {
531 		if (!BT_TEST(set, c))
532 			continue;
533 		cpup = cpu[c];
534 		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
535 		    cpup == CPU)
536 			continue;
537 		(void) xc_increment(&cpup->cpu_m);
538 		XC_BT_SET(xc_priority_set, c);
539 		send_dirint(c, XC_HI_PIL);
540 		for (i = 0; i < 10; ++i) {
541 			(void) casptr(&cpup->cpu_m.xc_msgbox,
542 			    cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
543 		}
544 	}
545 }
546 
547 /*
548  * Do cross call to all other CPUs with absolutely no waiting or handshaking.
549  * This should only be used for extraordinary operations, like panic(), which
550  * need to work, in some fashion, in a not completely functional system.
551  * All other uses that want minimal waiting should use xc_call_nowait().
552  */
553 void
554 xc_priority(
555 	xc_arg_t arg1,
556 	xc_arg_t arg2,
557 	xc_arg_t arg3,
558 	ulong_t *set,
559 	xc_func_t func)
560 {
561 	extern int IGNORE_KERNEL_PREEMPTION;
562 	int save_spl = splr(ipltospl(XC_HI_PIL));
563 	int save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;
564 
565 	IGNORE_KERNEL_PREEMPTION = 1;
566 	xc_priority_common((xc_func_t)func, arg1, arg2, arg3, set);
567 	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
568 	splx(save_spl);
569 }
570 
571 /*
572  * Wrapper for kmdb to capture other CPUs, causing them to enter the debugger.
573  */
574 void
575 kdi_xc_others(int this_cpu, void (*func)(void))
576 {
577 	extern int IGNORE_KERNEL_PREEMPTION;
578 	int save_kernel_preemption;
579 	cpuset_t set;
580 
581 	if (!xc_initialized)
582 		return;
583 
584 	save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;
585 	IGNORE_KERNEL_PREEMPTION = 1;
586 	CPUSET_ALL_BUT(set, this_cpu);
587 	xc_priority_common((xc_func_t)func, 0, 0, 0, CPUSET2BV(set));
588 	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
589 }
590 
591 
592 
593 /*
594  * Invoke function on specified processors. Remotes may continue after
595  * service with no waiting. xc_call_nowait() may return immediately too.
596  */
597 void
598 xc_call_nowait(
599 	xc_arg_t arg1,
600 	xc_arg_t arg2,
601 	xc_arg_t arg3,
602 	ulong_t *set,
603 	xc_func_t func)
604 {
605 	xc_common(func, arg1, arg2, arg3, set, XC_MSG_ASYNC);
606 }
607 
608 /*
609  * Invoke function on specified processors. Remotes may continue after
610  * service with no waiting. xc_call() returns only after remotes have finished.
611  */
612 void
613 xc_call(
614 	xc_arg_t arg1,
615 	xc_arg_t arg2,
616 	xc_arg_t arg3,
617 	ulong_t *set,
618 	xc_func_t func)
619 {
620 	xc_common(func, arg1, arg2, arg3, set, XC_MSG_CALL);
621 }
622 
623 /*
624  * Invoke function on specified processors. Remotes wait until all have
625  * finished. xc_sync() also waits until all remotes have finished.
626  */
627 void
628 xc_sync(
629 	xc_arg_t arg1,
630 	xc_arg_t arg2,
631 	xc_arg_t arg3,
632 	ulong_t *set,
633 	xc_func_t func)
634 {
635 	xc_common(func, arg1, arg2, arg3, set, XC_MSG_SYNC);
636 }
637