xref: /illumos-gate/usr/src/uts/sun4v/os/suspend.c (revision 02b4e56c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/mutex.h>
26 #include <sys/cpuvar.h>
27 #include <sys/cyclic.h>
28 #include <sys/disp.h>
29 #include <sys/ddi.h>
30 #include <sys/wdt.h>
31 #include <sys/callb.h>
32 #include <sys/cmn_err.h>
33 #include <sys/hypervisor_api.h>
34 #include <sys/membar.h>
35 #include <sys/x_call.h>
36 #include <sys/promif.h>
37 #include <sys/systm.h>
38 #include <sys/mach_descrip.h>
39 #include <sys/cpu_module.h>
40 #include <sys/pg.h>
41 #include <sys/lgrp.h>
42 #include <sys/sysmacros.h>
43 #include <sys/sunddi.h>
44 #include <sys/cpupart.h>
45 #include <sys/hsvc.h>
46 #include <sys/mpo.h>
47 #include <vm/hat_sfmmu.h>
48 #include <sys/time.h>
49 #include <sys/clock.h>
50 
51 /*
52  * Sun4v OS Suspend
53  *
54  * Provides a means to suspend a sun4v guest domain by pausing CPUs and then
55  * calling into the HV to initiate a suspension. Suspension is sequenced
56  * externally by calling suspend_pre, suspend_start, and suspend_post.
57  * suspend_pre and suspend_post are meant to perform any special operations
58  * that should be done before or after a suspend/resume operation. e.g.,
59  * callbacks to cluster software to disable heartbeat monitoring before the
60  * system is suspended. suspend_start prepares kernel services to be suspended
61  * and then suspends the domain by calling hv_guest_suspend.
62  *
63  * Special Handling for %tick and %stick Registers
64  *
65  * After a suspend/resume operation, the %tick and %stick registers may have
66  * jumped forwards or backwards. The delta is assumed to be consistent across
67  * all CPUs, within the negligible level of %tick and %stick variation
68  * acceptable on a cold boot. In order to maintain increasing %tick and %stick
69  * counter values without exposing large positive or negative jumps to kernel
70  * or user code, a %tick and %stick offset is used. Kernel reads of these
71  * counters return the sum of the hardware register counter and offset
72  * variable. After a suspend/resume operation, user reads of %tick or %stick
73  * are emulated. Suspend code enables emulation by setting the
74  * %{tick,stick}.NPT fields which trigger a privileged instruction access
75  * trap whenever the registers are read from user mode. If emulation has been
76  * enabled, the trap handler emulates the instruction. Emulation is only
77  * enabled during a successful suspend/resume operation. When emulation is
78  * enabled, CPUs that are DR'd into the system will have their
79  * %{tick,stick}.NPT bits set to 1 as well.
80  */
81 
82 extern u_longlong_t gettick(void);	/* returns %stick */
83 extern uint64_t gettick_counter(void);	/* returns %tick */
84 extern uint64_t gettick_npt(void);
85 extern uint64_t getstick_npt(void);
86 extern int mach_descrip_update(void);
87 extern cpuset_t cpu_ready_set;
88 extern uint64_t native_tick_offset;
89 extern uint64_t native_stick_offset;
90 extern uint64_t sys_tick_freq;
91 
92 /*
93  * Global Sun Cluster pre/post callbacks.
94  */
95 const char *(*cl_suspend_error_decode)(int);
96 int (*cl_suspend_pre_callback)(void);
97 int (*cl_suspend_post_callback)(void);
98 #define	SC_PRE_FAIL_STR_FMT	"Sun Cluster pre-suspend failure: %d"
99 #define	SC_POST_FAIL_STR_FMT	"Sun Cluster post-suspend failure: %d"
100 #define	SC_FAIL_STR_MAX		256
101 
102 /*
103  * The minimum major and minor version of the HSVC_GROUP_CORE API group
104  * required in order to use OS suspend.
105  */
106 #define	SUSPEND_CORE_MAJOR	1
107 #define	SUSPEND_CORE_MINOR	2
108 
109 /*
110  * By default, sun4v OS suspend is supported if the required HV version
111  * is present. suspend_disabled should be set on platforms that do not
112  * allow OS suspend regardless of whether or not the HV supports it.
113  * It can also be set in /etc/system.
114  */
115 static int suspend_disabled = 0;
116 
117 /*
118  * Controls whether or not user-land tick and stick register emulation
119  * will be enabled following a successful suspend operation.
120  */
121 static int enable_user_tick_stick_emulation = 1;
122 
123 /*
124  * Indicates whether or not tick and stick emulation is currently active.
125  * After a successful suspend operation, if emulation is enabled, this
126  * variable is set to B_TRUE. Global scope to allow emulation code to
127  * check if emulation is active.
128  */
129 boolean_t tick_stick_emulation_active = B_FALSE;
130 
131 /*
132  * When non-zero, after a successful suspend and resume, cpunodes, CPU HW
133  * sharing data structures, and processor groups will be updated using
134  * information from the updated MD.
135  */
136 static int suspend_update_cpu_mappings = 1;
137 
138 /*
139  * The maximum number of microseconds by which the %tick or %stick register
140  * can vary between any two CPUs in the system. To calculate the
141  * native_stick_offset and native_tick_offset, we measure the change in these
142  * registers on one CPU over a suspend/resume. Other CPUs may experience
143  * slightly larger or smaller changes. %tick and %stick should be synchronized
144  * between CPUs, but there may be some variation. So we add an additional value
145  * derived from this variable to ensure that these registers always increase
146  * over a suspend/resume operation, assuming all %tick and %stick registers
147  * are synchronized (within a certain limit) across CPUs in the system. The
148  * delta between %sticks on different CPUs should be a small number of cycles,
149  * not perceptible to readers of %stick that migrate between CPUs. We set this
150  * value to 1 millisecond which means that over a suspend/resume operation,
151  * all CPU's %tick and %stick will advance forwards as long as, across all
152  * CPUs, the %tick and %stick are synchronized to within 1 ms. This applies to
153  * CPUs before the suspend and CPUs after the resume. 1 ms is conservative,
154  * but small enough to not trigger TOD faults.
155  */
156 static uint64_t suspend_tick_stick_max_delta = 1000; /* microseconds */
157 
158 /*
159  * The number of times the system has been suspended and resumed.
160  */
161 static uint64_t suspend_count = 0;
162 
/*
 * DBG() and DBG_PROM() debug tracing macros.
 *
 * On DEBUG kernels each macro expands to an unbraced
 * "if (suspend_debug_flag)" followed by the name of an output routine, so
 * that the caller's parenthesized argument list completes the call. DBG()
 * routes its arguments to suspend_debug() and DBG_PROM() routes them to
 * prom_printf(). Because the expansion is a bare "if", do not use either
 * macro as the unbraced body of an if/else statement; keep it inside braces
 * or use it as a standalone statement.
 *
 * On non-DEBUG kernels both macros expand to nothing, which leaves the
 * caller's parenthesized argument list behind as an expression statement
 * whose value is discarded.
 */
#ifdef	DEBUG

static int suspend_debug_flag = 0;

#define	DBG_PROM		\
if (suspend_debug_flag)		\
	prom_printf

#define	DBG			\
if (suspend_debug_flag)		\
	suspend_debug

/*
 * Format a printf-style message into a local buffer and log it with
 * cmn_err() at CE_NOTE level. The formatting is bounded by the size of
 * the buffer, so an over-long message is truncated rather than written
 * past the end of the stack buffer.
 */
static void
suspend_debug(const char *fmt, ...)
{
	char	buf[512];
	va_list	ap;

	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
	va_end(ap);

	cmn_err(CE_NOTE, "%s", buf);
}

#else /* DEBUG */

#define	DBG_PROM
#define	DBG

#endif /* DEBUG */
197 
198 /*
199  * Return true if the HV supports OS suspend and if suspend has not been
200  * disabled on this platform.
201  */
202 boolean_t
203 suspend_supported(void)
204 {
205 	uint64_t major, minor;
206 
207 	if (suspend_disabled)
208 		return (B_FALSE);
209 
210 	if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0)
211 		return (B_FALSE);
212 
213 	return ((major == SUSPEND_CORE_MAJOR && minor >= SUSPEND_CORE_MINOR) ||
214 	    (major > SUSPEND_CORE_MAJOR));
215 }
216 
217 /*
218  * Memory DR is not permitted if the system has been suspended and resumed.
219  * It is the responsibility of the caller of suspend_start and the DR
220  * subsystem to serialize DR operations and suspend_memdr_allowed() checks.
221  */
222 boolean_t
223 suspend_memdr_allowed(void)
224 {
225 	return (suspend_count == 0);
226 }
227 
228 /*
229  * Given a source tick, stick, and tod value, set the tick and stick offsets
230  * such that the (current physical register value) + offset == (source value)
231  * and in addition account for some variation between the %tick/%stick on
232  * different CPUs. We account for this variation by adding in double the value
233  * of suspend_tick_stick_max_delta. The following is an explanation of why
 * suspend_tick_stick_max_delta must be multiplied by two and added to
235  * native_stick_offset.
236  *
237  * Consider a guest instance that is yet to be suspended with CPUs p0 and p1
238  * with physical "source" %stick values s0 and s1 respectively. When the guest
239  * is first resumed, the physical "target" %stick values are t0 and t1
240  * respectively. The virtual %stick values after the resume are v0 and v1
241  * respectively. Let x be the maximum difference between any two CPU's %stick
242  * register at a given point in time and let the %stick values be assigned
243  * such that
244  *
245  *     s1 = s0 + x and
246  *     t1 = t0 - x
247  *
248  * Let us assume that p0 is driving the suspend and resume. Then, we will
249  * calculate the stick offset f and the virtual %stick on p0 after the
250  * resume as follows.
251  *
252  *      f = s0 - t0 and
253  *     v0 = t0 + f
254  *
255  * We calculate the virtual %stick v1 on p1 after the resume as
256  *
257  *     v1 = t1 + f
258  *
259  * Substitution yields
260  *
261  *     v1 = t1 + (s0 - t0)
262  *     v1 = (t0 - x) + (s0 - t0)
263  *     v1 = -x + s0
264  *     v1 = s0 - x
265  *     v1 = (s1 - x) - x
266  *     v1 = s1 - 2x
267  *
268  * Therefore, in this scenario, without accounting for %stick variation in
269  * the calculation of the native_stick_offset f, the virtual %stick on p1
270  * is less than the value of the %stick on p1 before the suspend which is
271  * unacceptable. By adding 2x to v1, we guarantee it will be equal to s1
272  * which means the %stick on p1 after the resume will always be greater
273  * than or equal to the %stick on p1 before the suspend. Since v1 = t1 + f
274  * at any point in time, we can accomplish this by adding 2x to f. This
275  * guarantees any processes bound to CPU P0 or P1 will not see a %stick
276  * decrease across a suspend/resume. Hence, in the code below, we multiply
277  * suspend_tick_stick_max_delta by two in the calculation for
278  * native_stick_offset, native_tick_offset, and target_hrtime.
279  */
280 static void
281 set_tick_offsets(uint64_t source_tick, uint64_t source_stick, timestruc_t *tsp)
282 {
283 	uint64_t target_tick;
284 	uint64_t target_stick;
285 	hrtime_t source_hrtime;
286 	hrtime_t target_hrtime;
287 
288 	/*
289 	 * Temporarily set the offsets to zero so that the following reads
290 	 * of the registers will yield physical unadjusted counter values.
291 	 */
292 	native_tick_offset = 0;
293 	native_stick_offset = 0;
294 
295 	target_tick = gettick_counter();	/* returns %tick */
296 	target_stick = gettick();		/* returns %stick */
297 
298 	/*
299 	 * Calculate the new offsets. In addition to the delta observed on
300 	 * this CPU, add an additional value. Multiply the %tick/%stick
301 	 * frequency by suspend_tick_stick_max_delta (us). Then, multiply by 2
302 	 * to account for a delta between CPUs before the suspend and a
303 	 * delta between CPUs after the resume.
304 	 */
305 	native_tick_offset = (source_tick - target_tick) +
306 	    (CPU->cpu_curr_clock * suspend_tick_stick_max_delta * 2 / MICROSEC);
307 	native_stick_offset = (source_stick - target_stick) +
308 	    (sys_tick_freq * suspend_tick_stick_max_delta * 2 / MICROSEC);
309 
310 	/*
311 	 * We've effectively increased %stick and %tick by twice the value
312 	 * of suspend_tick_stick_max_delta to account for variation across
313 	 * CPUs. Now adjust the preserved TOD by the same amount.
314 	 */
315 	source_hrtime = ts2hrt(tsp);
316 	target_hrtime = source_hrtime +
317 	    (suspend_tick_stick_max_delta * 2 * (NANOSEC/MICROSEC));
318 	hrt2ts(target_hrtime, tsp);
319 }
320 
/*
 * Ask the hypervisor to set the %stick.NPT and then the %tick.NPT field
 * to 1 on the CPU executing this routine. The return values of both
 * hypervisor calls are deliberately ignored.
 */
static void
enable_tick_stick_npt(void)
{
	(void) hv_stick_set_npt(1);	/* %stick.NPT = 1 */
	(void) hv_tick_set_npt(1);	/* %tick.NPT = 1 */
}
330 
331 /*
332  * Synchronize a CPU's {tick,stick}.NPT fields with the current state
333  * of the system. This is used when a CPU is DR'd into the system.
334  */
335 void
336 suspend_sync_tick_stick_npt(void)
337 {
338 	if (tick_stick_emulation_active) {
339 		DBG("enabling {%%tick/%%stick}.NPT on CPU 0x%x", CPU->cpu_id);
340 		(void) hv_stick_set_npt(1);
341 		(void) hv_tick_set_npt(1);
342 	} else {
343 		ASSERT(gettick_npt() == 0);
344 		ASSERT(getstick_npt() == 0);
345 	}
346 }
347 
348 /*
349  * Obtain an updated MD from the hypervisor and update cpunodes, CPU HW
350  * sharing data structures, and processor groups.
351  */
352 static void
353 update_cpu_mappings(void)
354 {
355 	md_t		*mdp;
356 	processorid_t	id;
357 	cpu_t		*cp;
358 	cpu_pg_t	*pgps[NCPU];
359 
360 	if ((mdp = md_get_handle()) == NULL) {
361 		DBG("suspend: md_get_handle failed");
362 		return;
363 	}
364 
365 	DBG("suspend: updating CPU mappings");
366 
367 	mutex_enter(&cpu_lock);
368 
369 	setup_chip_mappings(mdp);
370 	setup_exec_unit_mappings(mdp);
371 	for (id = 0; id < NCPU; id++) {
372 		if ((cp = cpu_get(id)) == NULL)
373 			continue;
374 		cpu_map_exec_units(cp);
375 	}
376 
377 	/*
378 	 * Re-calculate processor groups.
379 	 *
380 	 * First tear down all PG information before adding any new PG
381 	 * information derived from the MD we just downloaded. We must
382 	 * call pg_cpu_inactive and pg_cpu_active with CPUs paused and
383 	 * we want to minimize the number of times pause_cpus is called.
384 	 * Inactivating all CPUs would leave PGs without any active CPUs,
385 	 * so while CPUs are paused, call pg_cpu_inactive and swap in the
386 	 * bootstrap PG structure saving the original PG structure to be
387 	 * fini'd afterwards. This prevents the dispatcher from encountering
388 	 * PGs in which all CPUs are inactive.
389 	 */
390 	pause_cpus(NULL);
391 	for (id = 0; id < NCPU; id++) {
392 		if ((cp = cpu_get(id)) == NULL)
393 			continue;
394 		pg_cpu_inactive(cp);
395 		pgps[id] = cp->cpu_pg;
396 		pg_cpu_bootstrap(cp);
397 	}
398 	start_cpus();
399 
400 	/*
401 	 * pg_cpu_fini* and pg_cpu_init* must be called while CPUs are
402 	 * not paused. Use two separate loops here so that we do not
403 	 * initialize PG data for CPUs until all the old PG data structures
404 	 * are torn down.
405 	 */
406 	for (id = 0; id < NCPU; id++) {
407 		if ((cp = cpu_get(id)) == NULL)
408 			continue;
409 		pg_cpu_fini(cp, pgps[id]);
410 		mpo_cpu_remove(id);
411 	}
412 
413 	/*
414 	 * Initialize PG data for each CPU, but leave the bootstrapped
415 	 * PG structure in place to avoid running with any PGs containing
416 	 * nothing but inactive CPUs.
417 	 */
418 	for (id = 0; id < NCPU; id++) {
419 		if ((cp = cpu_get(id)) == NULL)
420 			continue;
421 		mpo_cpu_add(mdp, id);
422 		pgps[id] = pg_cpu_init(cp, B_TRUE);
423 	}
424 
425 	/*
426 	 * Now that PG data has been initialized for all CPUs in the
427 	 * system, replace the bootstrapped PG structure with the
428 	 * initialized PG structure and call pg_cpu_active for each CPU.
429 	 */
430 	pause_cpus(NULL);
431 	for (id = 0; id < NCPU; id++) {
432 		if ((cp = cpu_get(id)) == NULL)
433 			continue;
434 		cp->cpu_pg = pgps[id];
435 		pg_cpu_active(cp);
436 	}
437 	start_cpus();
438 
439 	mutex_exit(&cpu_lock);
440 
441 	(void) md_fini_handle(mdp);
442 }
443 
444 /*
445  * Wrapper for the Sun Cluster error decoding function.
446  */
447 static int
448 cluster_error_decode(int error, char *error_reason, size_t max_reason_len)
449 {
450 	const char	*decoded;
451 	size_t		decoded_len;
452 
453 	ASSERT(error_reason != NULL);
454 	ASSERT(max_reason_len > 0);
455 
456 	max_reason_len = MIN(max_reason_len, SC_FAIL_STR_MAX);
457 
458 	if (cl_suspend_error_decode == NULL)
459 		return (-1);
460 
461 	if ((decoded = (*cl_suspend_error_decode)(error)) == NULL)
462 		return (-1);
463 
464 	/* Get number of non-NULL bytes */
465 	if ((decoded_len = strnlen(decoded, max_reason_len - 1)) == 0)
466 		return (-1);
467 
468 	bcopy(decoded, error_reason, decoded_len);
469 
470 	/*
471 	 * The error string returned from cl_suspend_error_decode
472 	 * should be NULL-terminated, but set the terminator here
473 	 * because we only copied non-NULL bytes. If the decoded
474 	 * string was not NULL-terminated, this guarantees that
475 	 * error_reason will be.
476 	 */
477 	error_reason[decoded_len] = '\0';
478 
479 	return (0);
480 }
481 
482 /*
483  * Wrapper for the Sun Cluster pre-suspend callback.
484  */
485 static int
486 cluster_pre_wrapper(char *error_reason, size_t max_reason_len)
487 {
488 	int rv = 0;
489 
490 	if (cl_suspend_pre_callback != NULL) {
491 		rv = (*cl_suspend_pre_callback)();
492 		DBG("suspend: cl_suspend_pre_callback returned %d", rv);
493 		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
494 			if (cluster_error_decode(rv, error_reason,
495 			    max_reason_len)) {
496 				(void) snprintf(error_reason, max_reason_len,
497 				    SC_PRE_FAIL_STR_FMT, rv);
498 			}
499 		}
500 	}
501 
502 	return (rv);
503 }
504 
505 /*
506  * Wrapper for the Sun Cluster post-suspend callback.
507  */
508 static int
509 cluster_post_wrapper(char *error_reason, size_t max_reason_len)
510 {
511 	int rv = 0;
512 
513 	if (cl_suspend_post_callback != NULL) {
514 		rv = (*cl_suspend_post_callback)();
515 		DBG("suspend: cl_suspend_post_callback returned %d", rv);
516 		if (rv != 0 && error_reason != NULL && max_reason_len > 0) {
517 			if (cluster_error_decode(rv, error_reason,
518 			    max_reason_len)) {
519 				(void) snprintf(error_reason,
520 				    max_reason_len, SC_POST_FAIL_STR_FMT, rv);
521 			}
522 		}
523 	}
524 
525 	return (rv);
526 }
527 
528 /*
529  * Execute pre-suspend callbacks preparing the system for a suspend operation.
530  * Returns zero on success, non-zero on failure. Sets the recovered argument
531  * to indicate whether or not callbacks could be undone in the event of a
532  * failure--if callbacks were successfully undone, *recovered is set to B_TRUE,
533  * otherwise *recovered is set to B_FALSE. Must be called successfully before
534  * suspend_start can be called. Callers should first call suspend_support to
535  * determine if OS suspend is supported.
536  */
537 int
538 suspend_pre(char *error_reason, size_t max_reason_len, boolean_t *recovered)
539 {
540 	int rv;
541 
542 	ASSERT(recovered != NULL);
543 
544 	/*
545 	 * Return an error if suspend_pre is erreoneously called
546 	 * when OS suspend is not supported.
547 	 */
548 	ASSERT(suspend_supported());
549 	if (!suspend_supported()) {
550 		DBG("suspend: suspend_pre called without suspend support");
551 		*recovered = B_TRUE;
552 		return (ENOTSUP);
553 	}
554 	DBG("suspend: %s", __func__);
555 
556 	rv = cluster_pre_wrapper(error_reason, max_reason_len);
557 
558 	/*
559 	 * At present, only one pre-suspend operation exists.
560 	 * If it fails, no recovery needs to be done.
561 	 */
562 	if (rv != 0 && recovered != NULL)
563 		*recovered = B_TRUE;
564 
565 	return (rv);
566 }
567 
/*
 * Run the post-suspend callbacks. Must be called after suspend_start has
 * been called, whether or not suspend_start succeeded.
 *
 * Returns zero on success, or the non-zero value returned by the
 * post-suspend callback wrapper, which may also fill in error_reason.
 */
int
suspend_post(char *error_reason, size_t max_reason_len)
{
	int rv;

	ASSERT(suspend_supported());
	DBG("suspend: %s", __func__);

	rv = cluster_post_wrapper(error_reason, max_reason_len);

	return (rv);
}
580 
581 /*
582  * Suspends the OS by pausing CPUs and calling into the HV to initiate
583  * the suspend. When the HV routine hv_guest_suspend returns, the system
584  * will be resumed. Must be called after a successful call to suspend_pre.
585  * suspend_post must be called after suspend_start, whether or not
586  * suspend_start returns an error.
587  */
588 /*ARGSUSED*/
589 int
590 suspend_start(char *error_reason, size_t max_reason_len)
591 {
592 	uint64_t	source_tick;
593 	uint64_t	source_stick;
594 	uint64_t	rv;
595 	timestruc_t	source_tod;
596 	int		spl;
597 
598 	ASSERT(suspend_supported());
599 	DBG("suspend: %s", __func__);
600 
601 	sfmmu_ctxdoms_lock();
602 
603 	mutex_enter(&cpu_lock);
604 
605 	/* Suspend the watchdog */
606 	watchdog_suspend();
607 
608 	/* Record the TOD */
609 	mutex_enter(&tod_lock);
610 	source_tod = tod_get();
611 	mutex_exit(&tod_lock);
612 
613 	/* Pause all other CPUs */
614 	pause_cpus(NULL);
615 	DBG_PROM("suspend: CPUs paused\n");
616 
617 	/* Suspend cyclics */
618 	cyclic_suspend();
619 	DBG_PROM("suspend: cyclics suspended\n");
620 
621 	/* Disable interrupts */
622 	spl = spl8();
623 	DBG_PROM("suspend: spl8()\n");
624 
625 	source_tick = gettick_counter();
626 	source_stick = gettick();
627 	DBG_PROM("suspend: source_tick: 0x%lx\n", source_tick);
628 	DBG_PROM("suspend: source_stick: 0x%lx\n", source_stick);
629 
630 	/*
631 	 * Call into the HV to initiate the suspend. hv_guest_suspend()
632 	 * returns after the guest has been resumed or if the suspend
633 	 * operation failed or was cancelled. After a successful suspend,
634 	 * the %tick and %stick registers may have changed by an amount
635 	 * that is not proportional to the amount of time that has passed.
636 	 * They may have jumped forwards or backwards. Some variation is
637 	 * allowed and accounted for using suspend_tick_stick_max_delta,
638 	 * but otherwise this jump must be uniform across all CPUs and we
639 	 * operate under the assumption that it is (maintaining two global
640 	 * offset variables--one for %tick and one for %stick.)
641 	 */
642 	DBG_PROM("suspend: suspending... \n");
643 	rv = hv_guest_suspend();
644 	if (rv != 0) {
645 		splx(spl);
646 		cyclic_resume();
647 		start_cpus();
648 		watchdog_resume();
649 		mutex_exit(&cpu_lock);
650 		sfmmu_ctxdoms_unlock();
651 		DBG("suspend: failed, rv: %ld\n", rv);
652 		return (rv);
653 	}
654 
655 	suspend_count++;
656 
657 	/* Update the global tick and stick offsets and the preserved TOD */
658 	set_tick_offsets(source_tick, source_stick, &source_tod);
659 
660 	/* Ensure new offsets are globally visible before resuming CPUs */
661 	membar_sync();
662 
663 	/* Enable interrupts */
664 	splx(spl);
665 
666 	/* Set the {%tick,%stick}.NPT bits on all CPUs */
667 	if (enable_user_tick_stick_emulation) {
668 		xc_all((xcfunc_t *)enable_tick_stick_npt, NULL, NULL);
669 		xt_sync(cpu_ready_set);
670 		ASSERT(gettick_npt() != 0);
671 		ASSERT(getstick_npt() != 0);
672 	}
673 
674 	/* If emulation is enabled, but not currently active, enable it */
675 	if (enable_user_tick_stick_emulation && !tick_stick_emulation_active) {
676 		tick_stick_emulation_active = B_TRUE;
677 	}
678 
679 	sfmmu_ctxdoms_remove();
680 
681 	/* Resume cyclics, unpause CPUs */
682 	cyclic_resume();
683 	start_cpus();
684 
685 	/* Set the TOD */
686 	mutex_enter(&tod_lock);
687 	tod_set(source_tod);
688 	mutex_exit(&tod_lock);
689 
690 	/* Re-enable the watchdog */
691 	watchdog_resume();
692 
693 	mutex_exit(&cpu_lock);
694 
695 	/* Download the latest MD */
696 	if ((rv = mach_descrip_update()) != 0)
697 		cmn_err(CE_PANIC, "suspend: mach_descrip_update failed: %ld",
698 		    rv);
699 
700 	sfmmu_ctxdoms_update();
701 	sfmmu_ctxdoms_unlock();
702 
703 	/* Get new MD, update CPU mappings/relationships */
704 	if (suspend_update_cpu_mappings)
705 		update_cpu_mappings();
706 
707 	DBG("suspend: target tick: 0x%lx", gettick_counter());
708 	DBG("suspend: target stick: 0x%llx", gettick());
709 	DBG("suspend: user %%tick/%%stick emulation is %d",
710 	    tick_stick_emulation_active);
711 	DBG("suspend: finished");
712 
713 	return (0);
714 }
715