xref: /illumos-gate/usr/src/uts/i86xpv/os/xen_machdep.c (revision 074bb90d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /* derived from netbsd's xen_machdep.c 1.1.2.1 */
28 
29 /*
30  *
31  * Copyright (c) 2004 Christian Limpach.
32  * All rights reserved.
33  *
34  * Redistribution and use in source and binary forms, with or without
35  * modification, are permitted provided that the following conditions
36  * are met:
37  * 1. Redistributions of source code must retain the above copyright
38  *    notice, this list of conditions and the following disclaimer.
39  * 2. Redistributions in binary form must reproduce the above copyright
40  *    notice, this list of conditions and the following disclaimer in the
41  *    documentation and/or other materials provided with the distribution.
42  * 3. This section intentionally left blank.
43  * 4. The name of the author may not be used to endorse or promote products
44  *    derived from this software without specific prior written permission.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
47  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
48  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
49  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
50  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
51  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
52  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
53  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
54  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
55  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56  */
57 /*
58  * Section 3 of the above license was updated in response to bug 6379571.
59  */
60 
61 #include <sys/xpv_user.h>
62 
63 /* XXX 3.3. TODO remove this include */
64 #include <xen/public/arch-x86/xen-mca.h>
65 
66 #include <sys/ctype.h>
67 #include <sys/types.h>
68 #include <sys/cmn_err.h>
69 #include <sys/trap.h>
70 #include <sys/segments.h>
71 #include <sys/hypervisor.h>
72 #include <sys/xen_mmu.h>
73 #include <sys/machsystm.h>
74 #include <sys/promif.h>
75 #include <sys/bootconf.h>
76 #include <sys/bootinfo.h>
77 #include <sys/cpr.h>
78 #include <sys/taskq.h>
79 #include <sys/uadmin.h>
80 #include <sys/evtchn_impl.h>
81 #include <sys/archsystm.h>
82 #include <xen/sys/xenbus_impl.h>
83 #include <sys/mach_mmu.h>
84 #include <vm/hat_i86.h>
85 #include <sys/gnttab.h>
86 #include <sys/reboot.h>
87 #include <sys/stack.h>
88 #include <sys/clock.h>
89 #include <sys/bitmap.h>
90 #include <sys/processor.h>
91 #include <sys/xen_errno.h>
92 #include <sys/xpv_panic.h>
93 #include <sys/smp_impldefs.h>
94 #include <sys/cpu.h>
95 #include <sys/balloon_impl.h>
96 #include <sys/ddi.h>
97 
/*
 * SUSPEND_DEBUG(fmt, ...): in DEBUG builds, pass the arguments to
 * xen_printf() when xen_suspend_debug is non-zero; in non-DEBUG builds the
 * invocation expands to nothing.
 *
 * Both variants are function-like.  The DEBUG variant is wrapped in
 * do { } while (0) so that an invocation is a single statement and cannot
 * capture an "else" that follows it.
 */
#ifdef DEBUG
#define	SUSPEND_DEBUG(...)				\
	do {						\
		if (xen_suspend_debug)			\
			xen_printf(__VA_ARGS__);	\
	} while (0)
#else
#define	SUSPEND_DEBUG(...)
#endif
103 
/*
 * Not referenced in the visible part of this file.
 * NOTE(review): presumably consumed by the common cpr code -- confirm.
 */
int cpr_debug;
/*
 * VCPUs found in the P_POWEROFF state by xen_suspend_domain(); these are
 * skipped by suspend_cpus() and resume_cpus().
 */
cpuset_t cpu_suspend_lost_set;
/*
 * Non-zero enables SUSPEND_DEBUG output (DEBUG builds); the value 1 also
 * makes xen_hypervisor_supports_solaris() accept any hypervisor version.
 */
static int xen_suspend_debug;

/* Count and array of physical CPU descriptions, set by startup_xen_mca(). */
uint_t xen_phys_ncpus;
xen_mc_logical_cpu_t *xen_phys_cpus;
/* Non-zero makes startup_xen_mca() log the physical CPU list it obtained. */
int xen_physinfo_debug = 0;
111 
112 /*
113  * Determine helpful version information.
114  *
115  * (And leave copies in the data segment so we can look at them later
116  * with e.g. kmdb.)
117  */
118 
/* Index into xenver[]: the version seen at boot and the current one. */
typedef enum xen_version {
	XENVER_BOOT_IDX,
	XENVER_CURRENT_IDX
} xen_version_t;

/* Hypervisor version details, filled in by xen_set_version(). */
struct xenver {
	ulong_t xv_major;		/* bits 31:16 of XENVER_version */
	ulong_t xv_minor;		/* bits 15:0 of XENVER_version */
	ulong_t xv_revision;		/* digit at xv_ver[1], else 0 */
	xen_extraversion_t xv_ver;	/* XENVER_extraversion string */
	ulong_t xv_is_xvm;		/* 1 if xv_ver ends in "-xvm" */
	xen_changeset_info_t xv_chgset;	/* XENVER_changeset result */
	xen_compile_info_t xv_build;	/* XENVER_compile_info result */
	xen_capabilities_info_t xv_caps; /* XENVER_capabilities result */
} xenver[2];

/* Access member m of the boot-time or of the current version record. */
#define	XENVER_BOOT(m)	(xenver[XENVER_BOOT_IDX].m)
#define	XENVER_CURRENT(m)	(xenver[XENVER_CURRENT_IDX].m)
137 
138 /*
139  * Update the xenver data. We maintain two copies, boot and
140  * current. If we are setting the boot, then also set current.
141  */
142 static void
143 xen_set_version(xen_version_t idx)
144 {
145 	ulong_t ver;
146 
147 	bzero(&xenver[idx], sizeof (xenver[idx]));
148 
149 	ver = HYPERVISOR_xen_version(XENVER_version, 0);
150 
151 	xenver[idx].xv_major = BITX(ver, 31, 16);
152 	xenver[idx].xv_minor = BITX(ver, 15, 0);
153 
154 	(void) HYPERVISOR_xen_version(XENVER_extraversion, &xenver[idx].xv_ver);
155 
156 	/*
157 	 * The revision is buried in the extraversion information that is
158 	 * maintained by the hypervisor. For our purposes we expect that
159 	 * the revision number is:
160 	 * 	- the second character in the extraversion information
161 	 *	- one character long
162 	 *	- numeric digit
163 	 * If it isn't then we can't extract the revision and we leave it
164 	 * set to 0.
165 	 */
166 	if (strlen(xenver[idx].xv_ver) > 1 && isdigit(xenver[idx].xv_ver[1]))
167 		xenver[idx].xv_revision = xenver[idx].xv_ver[1] - '0';
168 	else
169 		cmn_err(CE_WARN, "Cannot extract revision on this hypervisor "
170 		    "version: v%s, unexpected version format",
171 		    xenver[idx].xv_ver);
172 
173 	xenver[idx].xv_is_xvm = 0;
174 
175 	if (strlen(xenver[idx].xv_ver) >= 4 &&
176 	    strncmp(xenver[idx].xv_ver + strlen(xenver[idx].xv_ver) - 4,
177 	    "-xvm", 4) == 0)
178 		xenver[idx].xv_is_xvm = 1;
179 
180 	(void) HYPERVISOR_xen_version(XENVER_changeset,
181 	    &xenver[idx].xv_chgset);
182 
183 	(void) HYPERVISOR_xen_version(XENVER_compile_info,
184 	    &xenver[idx].xv_build);
185 	/*
186 	 * Capabilities are a set of space separated ascii strings
187 	 * e.g. 'xen-3.1-x86_32p' or 'hvm-3.2-x86_64'
188 	 */
189 	(void) HYPERVISOR_xen_version(XENVER_capabilities,
190 	    &xenver[idx].xv_caps);
191 
192 	cmn_err(CE_CONT, "?v%lu.%lu%s chgset '%s'\n", xenver[idx].xv_major,
193 	    xenver[idx].xv_minor, xenver[idx].xv_ver, xenver[idx].xv_chgset);
194 
195 	if (idx == XENVER_BOOT_IDX)
196 		bcopy(&xenver[XENVER_BOOT_IDX], &xenver[XENVER_CURRENT_IDX],
197 		    sizeof (xenver[XENVER_BOOT_IDX]));
198 }
199 
/*
 * Which requirement xen_hypervisor_supports_solaris() should test:
 * the ability to run at all, or the ability to suspend/resume.
 */
typedef enum xen_hypervisor_check {
	XEN_RUN_CHECK,
	XEN_SUSPEND_CHECK
} xen_hypervisor_check_t;
204 
205 /*
206  * To run the hypervisor must be 3.0.4 or better. To suspend/resume
207  * we need 3.0.4 or better and if it is 3.0.4. then it must be provided
208  * by the Solaris xVM project.
209  * Checking can be disabled for testing purposes by setting the
210  * xen_suspend_debug variable.
211  */
212 static int
213 xen_hypervisor_supports_solaris(xen_hypervisor_check_t check)
214 {
215 	if (xen_suspend_debug == 1)
216 		return (1);
217 	if (XENVER_CURRENT(xv_major) < 3)
218 		return (0);
219 	if (XENVER_CURRENT(xv_major) > 3)
220 		return (1);
221 	if (XENVER_CURRENT(xv_minor) > 0)
222 		return (1);
223 	if (XENVER_CURRENT(xv_revision) < 4)
224 		return (0);
225 	if (check == XEN_SUSPEND_CHECK && XENVER_CURRENT(xv_revision) == 4 &&
226 	    !XENVER_CURRENT(xv_is_xvm))
227 		return (0);
228 
229 	return (1);
230 }
231 
232 /*
233  * If the hypervisor is -xvm, or 3.1.2 or higher, we don't need the
234  * workaround.
235  */
236 static void
237 xen_pte_workaround(void)
238 {
239 #if defined(__amd64)
240 	extern int pt_kern;
241 
242 	if (XENVER_CURRENT(xv_major) != 3)
243 		return;
244 	if (XENVER_CURRENT(xv_minor) > 1)
245 		return;
246 	if (XENVER_CURRENT(xv_minor) == 1 &&
247 	    XENVER_CURRENT(xv_revision) > 1)
248 		return;
249 	if (XENVER_CURRENT(xv_is_xvm))
250 		return;
251 
252 	pt_kern = PT_USER;
253 #endif
254 }
255 
256 void
257 xen_set_callback(void (*func)(void), uint_t type, uint_t flags)
258 {
259 	struct callback_register cb;
260 
261 	bzero(&cb, sizeof (cb));
262 #if defined(__amd64)
263 	cb.address = (ulong_t)func;
264 #elif defined(__i386)
265 	cb.address.cs = KCS_SEL;
266 	cb.address.eip = (ulong_t)func;
267 #endif
268 	cb.type = type;
269 	cb.flags = flags;
270 
271 	/*
272 	 * XXPV always ignore return value for NMI
273 	 */
274 	if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 &&
275 	    type != CALLBACKTYPE_nmi)
276 		panic("HYPERVISOR_callback_op failed");
277 }
278 
/*
 * Register this domain's entry points with the hypervisor: the event,
 * failsafe and NMI callbacks, plus the syscall callback on amd64.
 */
void
xen_init_callbacks(void)
{
	/*
	 * register event (interrupt) handler.
	 */
	xen_set_callback(xen_callback, CALLBACKTYPE_event, 0);

	/*
	 * failsafe handler; registered with CALLBACKF_mask_events.
	 */
	xen_set_callback(xen_failsafe_callback, CALLBACKTYPE_failsafe,
	    CALLBACKF_mask_events);

	/*
	 * NMI handler.  xen_set_callback() tolerates failure for this type.
	 */
	xen_set_callback(nmiint, CALLBACKTYPE_nmi, 0);

	/*
	 * system call handler (amd64 only), also with CALLBACKF_mask_events.
	 * XXPV move to init_cpu_syscall?
	 */
#if defined(__amd64)
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);
#endif	/* __amd64 */
}
307 
308 
309 /*
310  * cmn_err() followed by a 1/4 second delay; this gives the
311  * logging service a chance to flush messages and helps avoid
312  * intermixing output from prom_printf().
313  * XXPV: doesn't exactly help us on UP though.
314  */
315 /*PRINTFLIKE2*/
316 void
317 cpr_err(int ce, const char *fmt, ...)
318 {
319 	va_list adx;
320 
321 	va_start(adx, fmt);
322 	vcmn_err(ce, fmt, adx);
323 	va_end(adx);
324 	drv_usecwait(MICROSEC >> 2);
325 }
326 
/*
 * Suspend every device below the root of the device tree; a failure to
 * do so is fatal.
 */
void
xen_suspend_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_suspend_devices\n");

	err = cpr_suspend_devices(ddi_root_node());
	if (err != 0)
		panic("failed to suspend devices: %d", err);
}
337 
/*
 * Resume every device below the root of the device tree; a failure to
 * do so is fatal.
 */
void
xen_resume_devices(void)
{
	int err;

	SUSPEND_DEBUG("xen_resume_devices\n");

	err = cpr_resume_devices(ddi_root_node(), 0);
	if (err != 0)
		panic("failed to resume devices: %d", err);
}
348 
349 /*
350  * The list of mfn pages is out of date.  Recompute it.
351  */
352 static void
353 rebuild_mfn_list(void)
354 {
355 	int i = 0;
356 	size_t sz;
357 	size_t off;
358 	pfn_t pfn;
359 
360 	SUSPEND_DEBUG("rebuild_mfn_list\n");
361 
362 	sz = ((mfn_count * sizeof (mfn_t)) + MMU_PAGEOFFSET) & MMU_PAGEMASK;
363 
364 	for (off = 0; off < sz; off += MMU_PAGESIZE) {
365 		size_t j = mmu_btop(off);
366 		if (((j * sizeof (mfn_t)) & MMU_PAGEOFFSET) == 0) {
367 			pfn = hat_getpfnum(kas.a_hat,
368 			    (caddr_t)&mfn_list_pages[j]);
369 			mfn_list_pages_page[i++] = pfn_to_mfn(pfn);
370 		}
371 
372 		pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list + off);
373 		mfn_list_pages[j] = pfn_to_mfn(pfn);
374 	}
375 
376 	pfn = hat_getpfnum(kas.a_hat, (caddr_t)mfn_list_pages_page);
377 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list
378 	    = pfn_to_mfn(pfn);
379 }
380 
381 static void
382 suspend_cpus(void)
383 {
384 	int i;
385 
386 	SUSPEND_DEBUG("suspend_cpus\n");
387 
388 	mp_enter_barrier();
389 
390 	for (i = 1; i < ncpus; i++) {
391 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
392 			SUSPEND_DEBUG("xen_vcpu_down %d\n", i);
393 			(void) xen_vcpu_down(i);
394 		}
395 
396 		mach_cpucontext_reset(cpu[i]);
397 	}
398 }
399 
400 static void
401 resume_cpus(void)
402 {
403 	int i;
404 
405 	for (i = 1; i < ncpus; i++) {
406 		if (cpu[i] == NULL)
407 			continue;
408 
409 		if (!CPU_IN_SET(cpu_suspend_lost_set, i)) {
410 			SUSPEND_DEBUG("xen_vcpu_up %d\n", i);
411 			mach_cpucontext_restore(cpu[i]);
412 			(void) xen_vcpu_up(i);
413 		}
414 	}
415 
416 	mp_leave_barrier();
417 }
418 
/*
 * Top level routine to direct suspend/resume of a domain.
 *
 * The sequence is:
 *   1. refuse (with a warning) if the hypervisor version check fails;
 *   2. suspend devices and xenbus;
 *   3. bind to CPU 0, disable preemption, take down the other VCPUs;
 *   4. suspend event channels, grant tables and time with interrupts off;
 *   5. call HYPERVISOR_suspend(); execution continues after it on resume;
 *   6. undo steps 4, 3 and 2 in reverse order and refresh the recorded
 *      hypervisor version.
 *
 * A hypercall failure, or an unexpected state, inside the window where
 * locks cannot be taken is reported with prom_printf() and followed by
 * HYPERVISOR_shutdown(SHUTDOWN_crash).
 */
void
xen_suspend_domain(void)
{
	extern void rtcsync(void);
	extern hrtime_t hres_last_tick;
	mfn_t start_info_mfn;
	ulong_t flags;
	pfn_t pfn;
	int i;

	/*
	 * Check that we are happy to suspend on this hypervisor.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0) {
		cpr_err(CE_WARN, "Cannot suspend on this hypervisor "
		    "version: v%lu.%lu%s, need at least version v3.0.4 or "
		    "-xvm based hypervisor", XENVER_CURRENT(xv_major),
		    XENVER_CURRENT(xv_minor), XENVER_CURRENT(xv_ver));
		return;
	}

	/*
	 * XXPV - Are we definitely OK to suspend by the time we've connected
	 * the handler?
	 */

	cpr_err(CE_NOTE, "Domain suspending for save/migrate");

	SUSPEND_DEBUG("xen_suspend_domain\n");

	/*
	 * suspend interrupts and devices
	 * XXPV - we use suspend/resume for both save/restore domains (like sun
	 * cpr) and for migration.  Would be nice to know the difference if
	 * possible.  For save/restore where down time may be a long time, we
	 * may want to do more of the things that cpr does.  (i.e. notify user
	 * processes, shrink memory footprint for faster restore, etc.)
	 */
	xen_suspend_devices();
	SUSPEND_DEBUG("xenbus_suspend\n");
	xenbus_suspend();

	/*
	 * Work out the MFN of the start info page now; it is the argument
	 * passed to HYPERVISOR_suspend() below.
	 */
	pfn = hat_getpfnum(kas.a_hat, (caddr_t)xen_info);
	start_info_mfn = pfn_to_mfn(pfn);

	/*
	 * XXPV: cpu hotplug can hold this under a xenbus watch. Are we safe
	 * wrt xenbus being suspended here?
	 */
	mutex_enter(&cpu_lock);

	/*
	 * Suspend must be done on vcpu 0, as no context for other CPUs is
	 * saved.
	 *
	 * XXPV - add to taskq API ?
	 */
	thread_affinity_set(curthread, 0);
	kpreempt_disable();

	SUSPEND_DEBUG("xen_start_migrate\n");
	xen_start_migrate();
	if (ncpus > 1)
		suspend_cpus();

	/*
	 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
	 * any holder would have dropped it to get through suspend_cpus().
	 */
	mutex_enter(&ec_lock);

	/*
	 * From here on in, we can't take locks.
	 */
	SUSPEND_DEBUG("ec_suspend\n");
	ec_suspend();
	SUSPEND_DEBUG("gnttab_suspend\n");
	gnttab_suspend();

	/* Interrupts stay off until the matching intr_restore() below. */
	flags = intr_clear();

	xpv_time_suspend();

	/*
	 * Currently, the hypervisor incorrectly fails to bring back
	 * powered-down VCPUs.  Thus we need to record any powered-down VCPUs
	 * to prevent any attempts to operate on them.  But we have to do this
	 * *after* the very first time we do ec_suspend().
	 */
	for (i = 1; i < ncpus; i++) {
		if (cpu[i] == NULL)
			continue;

		if (cpu_get_state(cpu[i]) == P_POWEROFF)
			CPUSET_ATOMIC_ADD(cpu_suspend_lost_set, i);
	}

	/*
	 * The dom0 save/migrate code doesn't automatically translate
	 * these into PFNs, but expects them to be, so we do it here.
	 * We don't use mfn_to_pfn() because so many OS services have
	 * been disabled at this point.
	 */
	xen_info->store_mfn = mfn_to_pfn_mapping[xen_info->store_mfn];
	xen_info->console.domU.mfn =
	    mfn_to_pfn_mapping[xen_info->console.domU.mfn];

	/* Event upcalls must be masked on this VCPU before we suspend. */
	if (CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0) {
		prom_printf("xen_suspend_domain(): "
		    "CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask not set\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Replace the shared info mapping with an empty (0) entry; it is
	 * re-established from xen_info->shared_info after the resume.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    0, UVMF_INVLPG)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_update_va_mapping() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	SUSPEND_DEBUG("HYPERVISOR_suspend\n");

	/*
	 * At this point we suspend and sometime later resume.
	 */
	if (HYPERVISOR_suspend(start_info_mfn)) {
		prom_printf("xen_suspend_domain(): "
		    "HYPERVISOR_suspend() failed\n");
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	/*
	 * Point HYPERVISOR_shared_info to its new value.
	 */
	if (HYPERVISOR_update_va_mapping((uintptr_t)HYPERVISOR_shared_info,
	    xen_info->shared_info | PT_NOCONSIST | PT_VALID | PT_WRITABLE,
	    UVMF_INVLPG))
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);

	/* The domain must come back with the same number of pages. */
	if (xen_info->nr_pages != mfn_count) {
		prom_printf("xen_suspend_domain(): number of pages"
		    " changed, was 0x%lx, now 0x%lx\n", mfn_count,
		    xen_info->nr_pages);
		(void) HYPERVISOR_shutdown(SHUTDOWN_crash);
	}

	xpv_time_resume();

	/*
	 * Discard the cached maximum MFN.
	 * NOTE(review): presumably recomputed on next use -- confirm.
	 */
	cached_max_mfn = 0;

	SUSPEND_DEBUG("gnttab_resume\n");
	gnttab_resume();

	/* XXPV: add a note that this must be lockless. */
	SUSPEND_DEBUG("ec_resume\n");
	ec_resume();

	intr_restore(flags);

	if (ncpus > 1)
		resume_cpus();

	mutex_exit(&ec_lock);
	xen_end_migrate();
	mutex_exit(&cpu_lock);

	/*
	 * Now we can take locks again.
	 */

	/*
	 * Force the tick value used for tv_nsec in hres_tick() to be up to
	 * date. rtcsync() will reset the hrestime value appropriately.
	 */
	hres_last_tick = xpv_gethrtime();

	/*
	 * XXPV: we need to have resumed the CPUs since this takes locks, but
	 * can remote CPUs see bad state? Presumably yes. Should probably nest
	 * taking of todlock inside of cpu_lock, or vice versa, then provide an
	 * unlocked version.  Probably need to call clkinitf to reset cpu freq
	 * and re-calibrate if we migrated to a different speed cpu.  Also need
	 * to make a (re)init_cpu_info call to update processor info structs
	 * and device tree info.  That remains to be written at the moment.
	 */
	rtcsync();

	rebuild_mfn_list();

	SUSPEND_DEBUG("xenbus_resume\n");
	xenbus_resume();
	SUSPEND_DEBUG("xenbus_resume_devices\n");
	xen_resume_devices();

	/* Undo the CPU 0 binding and preemption block taken before suspend. */
	thread_affinity_clear(curthread);
	kpreempt_enable();

	SUSPEND_DEBUG("finished xen_suspend_domain\n");

	/*
	 * We have restarted our suspended domain, update the hypervisor
	 * details. NB: This must be done at the end of this function,
	 * since we need the domain to be completely resumed before
	 * these functions will work correctly.
	 */
	xen_set_version(XENVER_CURRENT_IDX);

	/*
	 * We can check and report a warning, but we don't stop the
	 * process.
	 */
	if (xen_hypervisor_supports_solaris(XEN_SUSPEND_CHECK) == 0)
		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
		    "but need at least version v3.0.4",
		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
		    XENVER_CURRENT(xv_ver));

	cmn_err(CE_NOTE, "domain restore/migrate completed");
}
641 
642 /*ARGSUSED*/
643 int
644 xen_debug_handler(void *arg)
645 {
646 	debug_enter("External debug event received");
647 
648 	/*
649 	 * If we've not got KMDB loaded, output some stuff difficult to capture
650 	 * from a domain core.
651 	 */
652 	if (!(boothowto & RB_DEBUG)) {
653 		shared_info_t *si = HYPERVISOR_shared_info;
654 		int i;
655 
656 		prom_printf("evtchn_pending [ ");
657 		for (i = 0; i < 8; i++)
658 			prom_printf("%lx ", si->evtchn_pending[i]);
659 		prom_printf("]\nevtchn_mask [ ");
660 		for (i = 0; i < 8; i++)
661 			prom_printf("%lx ", si->evtchn_mask[i]);
662 		prom_printf("]\n");
663 
664 		for (i = 0; i < ncpus; i++) {
665 			vcpu_info_t *vcpu = &si->vcpu_info[i];
666 			if (cpu[i] == NULL)
667 				continue;
668 			prom_printf("CPU%d pending %d mask %d sel %lx\n",
669 			    i, vcpu->evtchn_upcall_pending,
670 			    vcpu->evtchn_upcall_mask,
671 			    vcpu->evtchn_pending_sel);
672 		}
673 	}
674 
675 	return (0);
676 }
677 
678 /*ARGSUSED*/
679 static void
680 xen_sysrq_handler(struct xenbus_watch *watch, const char **vec,
681     unsigned int len)
682 {
683 	xenbus_transaction_t xbt;
684 	char key = '\0';
685 	int ret;
686 
687 retry:
688 	if (xenbus_transaction_start(&xbt)) {
689 		cmn_err(CE_WARN, "failed to start sysrq transaction");
690 		return;
691 	}
692 
693 	if ((ret = xenbus_scanf(xbt, "control", "sysrq", "%c", &key)) != 0) {
694 		/*
695 		 * ENOENT happens in response to our own xenbus_rm.
696 		 * XXPV - this happens spuriously on boot?
697 		 */
698 		if (ret != ENOENT)
699 			cmn_err(CE_WARN, "failed to read sysrq: %d", ret);
700 		goto out;
701 	}
702 
703 	if ((ret = xenbus_rm(xbt, "control", "sysrq")) != 0) {
704 		cmn_err(CE_WARN, "failed to reset sysrq: %d", ret);
705 		goto out;
706 	}
707 
708 	if (xenbus_transaction_end(xbt, 0) == EAGAIN)
709 		goto retry;
710 
711 	/*
712 	 * Somewhat arbitrary - on Linux this means 'reboot'. We could just
713 	 * accept any key, but this might increase the risk of sending a
714 	 * harmless sysrq to the wrong domain...
715 	 */
716 	if (key == 'b')
717 		(void) xen_debug_handler(NULL);
718 	else
719 		cmn_err(CE_WARN, "Ignored sysrq %c", key);
720 	return;
721 
722 out:
723 	(void) xenbus_transaction_end(xbt, 1);
724 }
725 
/*
 * Task queue on which xen_shutdown() runs; created by xen_late_startup()
 * for domains other than the initial domain.
 */
taskq_t *xen_shutdown_tq;
727 
/*
 * Shutdown request codes.  SHUTDOWN_INVALID marks "no valid request";
 * SHUTDOWN_MAX is one past the last valid code and sizes cmd_strings[].
 */
enum {
	SHUTDOWN_INVALID = -1,
	SHUTDOWN_POWEROFF = 0,
	SHUTDOWN_REBOOT = 1,
	SHUTDOWN_SUSPEND = 2,
	SHUTDOWN_HALT = 3,
	SHUTDOWN_MAX = 4
};

/* How long a graceful shutdown may take before it is forced (5 minutes). */
#define	SHUTDOWN_TIMEOUT_SECS (60 * 5)

/* Request names, indexed by shutdown request code. */
static const char *cmd_strings[SHUTDOWN_MAX] = {
	"poweroff",	/* SHUTDOWN_POWEROFF */
	"reboot",	/* SHUTDOWN_REBOOT */
	"suspend",	/* SHUTDOWN_SUSPEND */
	"halt"		/* SHUTDOWN_HALT */
};
743 
744 static void
745 xen_dirty_shutdown(void *arg)
746 {
747 	int cmd = (uintptr_t)arg;
748 
749 	cmn_err(CE_WARN, "Externally requested shutdown failed or "
750 	    "timed out.\nShutting down.\n");
751 
752 	switch (cmd) {
753 	case SHUTDOWN_HALT:
754 	case SHUTDOWN_POWEROFF:
755 		(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
756 		break;
757 	case SHUTDOWN_REBOOT:
758 		(void) kadmin(A_REBOOT, AD_BOOT, NULL, kcred);
759 		break;
760 	}
761 }
762 
763 static void
764 xen_shutdown(void *arg)
765 {
766 	int cmd = (uintptr_t)arg;
767 	proc_t *initpp;
768 
769 	ASSERT(cmd > SHUTDOWN_INVALID && cmd < SHUTDOWN_MAX);
770 
771 	if (cmd == SHUTDOWN_SUSPEND) {
772 		xen_suspend_domain();
773 		return;
774 	}
775 
776 	switch (cmd) {
777 	case SHUTDOWN_POWEROFF:
778 		force_shutdown_method = AD_POWEROFF;
779 		break;
780 	case SHUTDOWN_HALT:
781 		force_shutdown_method = AD_HALT;
782 		break;
783 	case SHUTDOWN_REBOOT:
784 		force_shutdown_method = AD_BOOT;
785 		break;
786 	}
787 
788 	/*
789 	 * If we're still booting and init(1) isn't set up yet, simply halt.
790 	 */
791 	mutex_enter(&pidlock);
792 	initpp = prfind(P_INITPID);
793 	mutex_exit(&pidlock);
794 	if (initpp == NULL) {
795 		extern void halt(char *);
796 		halt("Power off the System");   /* just in case */
797 	}
798 
799 	/*
800 	 * else, graceful shutdown with inittab and all getting involved
801 	 */
802 	psignal(initpp, SIGPWR);
803 
804 	(void) timeout(xen_dirty_shutdown, arg,
805 	    SHUTDOWN_TIMEOUT_SECS * drv_usectohz(MICROSEC));
806 }
807 
808 /*ARGSUSED*/
809 static void
810 xen_shutdown_handler(struct xenbus_watch *watch, const char **vec,
811 	unsigned int len)
812 {
813 	char *str;
814 	xenbus_transaction_t xbt;
815 	int err, shutdown_code = SHUTDOWN_INVALID;
816 	unsigned int slen;
817 
818 again:
819 	err = xenbus_transaction_start(&xbt);
820 	if (err)
821 		return;
822 	if (xenbus_read(xbt, "control", "shutdown", (void *)&str, &slen)) {
823 		(void) xenbus_transaction_end(xbt, 1);
824 		return;
825 	}
826 
827 	SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU->cpu_id, str);
828 
829 	/*
830 	 * If this is a watch fired from our write below, check out early to
831 	 * avoid an infinite loop.
832 	 */
833 	if (strcmp(str, "") == 0) {
834 		(void) xenbus_transaction_end(xbt, 0);
835 		kmem_free(str, slen);
836 		return;
837 	} else if (strcmp(str, "poweroff") == 0) {
838 		shutdown_code = SHUTDOWN_POWEROFF;
839 	} else if (strcmp(str, "reboot") == 0) {
840 		shutdown_code = SHUTDOWN_REBOOT;
841 	} else if (strcmp(str, "suspend") == 0) {
842 		shutdown_code = SHUTDOWN_SUSPEND;
843 	} else if (strcmp(str, "halt") == 0) {
844 		shutdown_code = SHUTDOWN_HALT;
845 	} else {
846 		printf("Ignoring shutdown request: %s\n", str);
847 	}
848 
849 	/*
850 	 * XXPV	Should we check the value of xenbus_write() too, or are all
851 	 *	errors automatically folded into xenbus_transaction_end() ??
852 	 */
853 	(void) xenbus_write(xbt, "control", "shutdown", "");
854 	err = xenbus_transaction_end(xbt, 0);
855 	if (err == EAGAIN) {
856 		SUSPEND_DEBUG("%d: trying again\n", CPU->cpu_id);
857 		kmem_free(str, slen);
858 		goto again;
859 	}
860 
861 	kmem_free(str, slen);
862 	if (shutdown_code != SHUTDOWN_INVALID) {
863 		(void) taskq_dispatch(xen_shutdown_tq, xen_shutdown,
864 		    (void *)(intptr_t)shutdown_code, 0);
865 	}
866 }
867 
868 static struct xenbus_watch shutdown_watch;
869 static struct xenbus_watch sysrq_watch;
870 
871 void
872 xen_late_startup(void)
873 {
874 	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
875 		xen_shutdown_tq = taskq_create("shutdown_taskq", 1,
876 		    maxclsyspri - 1, 1, 1, TASKQ_PREPOPULATE);
877 		shutdown_watch.node = "control/shutdown";
878 		shutdown_watch.callback = xen_shutdown_handler;
879 		if (register_xenbus_watch(&shutdown_watch))
880 			cmn_err(CE_WARN, "Failed to set shutdown watcher");
881 
882 		sysrq_watch.node = "control/sysrq";
883 		sysrq_watch.callback = xen_sysrq_handler;
884 		if (register_xenbus_watch(&sysrq_watch))
885 			cmn_err(CE_WARN, "Failed to set sysrq watcher");
886 	}
887 	balloon_init(xen_info->nr_pages);
888 }
889 
#ifdef DEBUG
#define	XEN_PRINTF_BUFSIZE	1024

/* Formatting buffer shared by every xen_printf() call. */
char xen_printf_buffer[XEN_PRINTF_BUFSIZE];

/*
 * printf-style output written straight to the hypervisor console with
 * CONSOLEIO_write.  No I/O ring interaction is needed, so it can be used
 * at any time; for a DomU, output only appears when the xen hypervisor
 * was built with debug on.  Output longer than the buffer is truncated.
 */
/*PRINTFLIKE1*/
void
xen_printf(const char *fmt, ...)
{
	va_list	args;
	size_t	outlen;

	va_start(args, fmt);
	(void) vsnprintf(xen_printf_buffer, sizeof (xen_printf_buffer),
	    fmt, args);
	va_end(args);

	outlen = strlen(xen_printf_buffer);
	(void) HYPERVISOR_console_io(CONSOLEIO_write, outlen,
	    xen_printf_buffer);
}
#else
/* Non-DEBUG builds: xen_printf() does nothing. */
/*ARGSUSED*/
void
xen_printf(const char *fmt, ...)
{
}
#endif	/* DEBUG */
919 
920 void
921 startup_xen_version(void)
922 {
923 	xen_set_version(XENVER_BOOT_IDX);
924 	if (xen_hypervisor_supports_solaris(XEN_RUN_CHECK) == 0)
925 		cmn_err(CE_WARN, "Found hypervisor version: v%lu.%lu%s "
926 		    "but need at least version v3.0.4",
927 		    XENVER_CURRENT(xv_major), XENVER_CURRENT(xv_minor),
928 		    XENVER_CURRENT(xv_ver));
929 	xen_pte_workaround();
930 }
931 
932 int xen_mca_simulate_mc_physinfo_failure = 0;
933 
934 void
935 startup_xen_mca(void)
936 {
937 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
938 		return;
939 
940 	xen_phys_ncpus = 0;
941 	xen_phys_cpus = NULL;
942 
943 	if (xen_mca_simulate_mc_physinfo_failure ||
944 	    xen_get_mc_physcpuinfo(NULL, &xen_phys_ncpus) != 0) {
945 		cmn_err(CE_WARN,
946 		    "%sxen_get_mc_physinfo failure during xen MCA startup: "
947 		    "there will be no machine check support",
948 		    xen_mca_simulate_mc_physinfo_failure ? "(simulated) " : "");
949 		return;
950 	}
951 
952 	xen_phys_cpus = kmem_alloc(xen_phys_ncpus *
953 	    sizeof (xen_mc_logical_cpu_t), KM_NOSLEEP);
954 
955 	if (xen_phys_cpus == NULL) {
956 		cmn_err(CE_WARN,
957 		    "xen_get_mc_physinfo failure: can't allocate CPU array");
958 		return;
959 	}
960 
961 	if (xen_get_mc_physcpuinfo(xen_phys_cpus, &xen_phys_ncpus) != 0) {
962 		cmn_err(CE_WARN, "xen_get_mc_physinfo failure: no "
963 		    "physical CPU info");
964 		kmem_free(xen_phys_cpus,
965 		    xen_phys_ncpus * sizeof (xen_mc_logical_cpu_t));
966 		xen_phys_ncpus = 0;
967 		xen_phys_cpus = NULL;
968 	}
969 
970 	if (xen_physinfo_debug) {
971 		xen_mc_logical_cpu_t *xcp;
972 		unsigned i;
973 
974 		cmn_err(CE_NOTE, "xvm mca: %u physical cpus:\n",
975 		    xen_phys_ncpus);
976 		for (i = 0; i < xen_phys_ncpus; i++) {
977 			xcp = &xen_phys_cpus[i];
978 			cmn_err(CE_NOTE, "cpu%u: (%u, %u, %u) apid %u",
979 			    xcp->mc_cpunr, xcp->mc_chipid, xcp->mc_coreid,
980 			    xcp->mc_threadid, xcp->mc_apicid);
981 		}
982 	}
983 }
984 
985 /*
986  * Miscellaneous hypercall wrappers with slightly more verbose diagnostics.
987  */
988 
989 void
990 xen_set_gdt(ulong_t *frame_list, int entries)
991 {
992 	int err;
993 	if ((err = HYPERVISOR_set_gdt(frame_list, entries)) != 0) {
994 		/*
995 		 * X_EINVAL:	reserved entry or bad frames
996 		 * X_EFAULT:	bad address
997 		 */
998 		panic("xen_set_gdt(%p, %d): error %d",
999 		    (void *)frame_list, entries, -(int)err);
1000 	}
1001 }
1002 
1003 void
1004 xen_set_ldt(user_desc_t *ldt, uint_t nsels)
1005 {
1006 	struct mmuext_op	op;
1007 	long			err;
1008 
1009 	op.cmd = MMUEXT_SET_LDT;
1010 	op.arg1.linear_addr = (uintptr_t)ldt;
1011 	op.arg2.nr_ents = nsels;
1012 
1013 	if ((err = HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) != 0) {
1014 		panic("xen_set_ldt(%p, %d): error %d",
1015 		    (void *)ldt, nsels, -(int)err);
1016 	}
1017 }
1018 
1019 void
1020 xen_stack_switch(ulong_t ss, ulong_t esp)
1021 {
1022 	long err;
1023 
1024 	if ((err = HYPERVISOR_stack_switch(ss, esp)) != 0) {
1025 		/*
1026 		 * X_EPERM:	bad selector
1027 		 */
1028 		panic("xen_stack_switch(%lx, %lx): error %d", ss, esp,
1029 		    -(int)err);
1030 	}
1031 }
1032 
1033 long
1034 xen_set_trap_table(trap_info_t *table)
1035 {
1036 	long err;
1037 
1038 	if ((err = HYPERVISOR_set_trap_table(table)) != 0) {
1039 		/*
1040 		 * X_EFAULT:	bad address
1041 		 * X_EPERM:	bad selector
1042 		 */
1043 		panic("xen_set_trap_table(%p): error %d", (void *)table,
1044 		    -(int)err);
1045 	}
1046 	return (err);
1047 }
1048 
1049 #if defined(__amd64)
1050 void
1051 xen_set_segment_base(int reg, ulong_t value)
1052 {
1053 	long err;
1054 
1055 	if ((err = HYPERVISOR_set_segment_base(reg, value)) != 0) {
1056 		/*
1057 		 * X_EFAULT:	bad address
1058 		 * X_EINVAL:	bad type
1059 		 */
1060 		panic("xen_set_segment_base(%d, %lx): error %d",
1061 		    reg, value, -(int)err);
1062 	}
1063 }
1064 #endif	/* __amd64 */
1065 
1066 /*
1067  * Translate a hypervisor errcode to a Solaris error code.
1068  */
1069 int
1070 xen_xlate_errcode(int error)
1071 {
1072 	switch (-error) {
1073 
1074 	/*
1075 	 * Translate hypervisor errno's into native errno's
1076 	 */
1077 
1078 #define	CASE(num)	case X_##num: error = num; break
1079 
1080 	CASE(EPERM);	CASE(ENOENT);	CASE(ESRCH);
1081 	CASE(EINTR);	CASE(EIO);	CASE(ENXIO);
1082 	CASE(E2BIG);	CASE(ENOMEM);	CASE(EACCES);
1083 	CASE(EFAULT);	CASE(EBUSY);	CASE(EEXIST);
1084 	CASE(ENODEV);	CASE(EISDIR);	CASE(EINVAL);
1085 	CASE(ENOSPC);	CASE(ESPIPE);	CASE(EROFS);
1086 	CASE(ENOSYS);	CASE(ENOTEMPTY); CASE(EISCONN);
1087 	CASE(ENODATA);	CASE(EAGAIN);
1088 
1089 #undef CASE
1090 
1091 	default:
1092 		panic("xen_xlate_errcode: unknown error %d", error);
1093 	}
1094 
1095 	return (error);
1096 }
1097 
1098 /*
1099  * Raise PS_IOPL on current vcpu to user level.
1100  * Caller responsible for preventing kernel preemption.
1101  */
1102 void
1103 xen_enable_user_iopl(void)
1104 {
1105 	physdev_set_iopl_t set_iopl;
1106 	set_iopl.iopl = 3;		/* user ring 3 */
1107 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1108 }
1109 
1110 /*
1111  * Drop PS_IOPL on current vcpu to kernel level
1112  */
1113 void
1114 xen_disable_user_iopl(void)
1115 {
1116 	physdev_set_iopl_t set_iopl;
1117 	set_iopl.iopl = 1;		/* kernel pseudo ring 1 */
1118 	(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1119 }
1120 
1121 int
1122 xen_gdt_setprot(cpu_t *cp, uint_t prot)
1123 {
1124 	int err;
1125 #if defined(__amd64)
1126 	int pt_bits = PT_VALID;
1127 	if (prot & PROT_WRITE)
1128 		pt_bits |= PT_WRITABLE;
1129 #endif
1130 
1131 	if ((err = as_setprot(&kas, (caddr_t)cp->cpu_gdt,
1132 	    MMU_PAGESIZE, prot)) != 0)
1133 		goto done;
1134 
1135 #if defined(__amd64)
1136 	err = xen_kpm_page(mmu_btop(cp->cpu_m.mcpu_gdtpa), pt_bits);
1137 #endif
1138 
1139 done:
1140 	if (err) {
1141 		cmn_err(CE_WARN, "cpu%d: xen_gdt_setprot(%s) failed: error %d",
1142 		    cp->cpu_id, (prot & PROT_WRITE) ? "writable" : "read-only",
1143 		    err);
1144 	}
1145 
1146 	return (err);
1147 }
1148 
1149 int
1150 xen_ldt_setprot(user_desc_t *ldt, size_t lsize, uint_t prot)
1151 {
1152 	int err;
1153 	caddr_t	lva = (caddr_t)ldt;
1154 #if defined(__amd64)
1155 	int pt_bits = PT_VALID;
1156 	pgcnt_t npgs;
1157 	if (prot & PROT_WRITE)
1158 		pt_bits |= PT_WRITABLE;
1159 #endif	/* __amd64 */
1160 
1161 	if ((err = as_setprot(&kas, (caddr_t)ldt, lsize, prot)) != 0)
1162 		goto done;
1163 
1164 #if defined(__amd64)
1165 
1166 	ASSERT(IS_P2ALIGNED(lsize, PAGESIZE));
1167 	npgs = mmu_btop(lsize);
1168 	while (npgs--) {
1169 		if ((err = xen_kpm_page(hat_getpfnum(kas.a_hat, lva),
1170 		    pt_bits)) != 0)
1171 			break;
1172 		lva += PAGESIZE;
1173 	}
1174 #endif	/* __amd64 */
1175 
1176 done:
1177 	if (err) {
1178 		cmn_err(CE_WARN, "xen_ldt_setprot(%p, %s) failed: error %d",
1179 		    (void *)lva,
1180 		    (prot & PROT_WRITE) ? "writable" : "read-only", err);
1181 	}
1182 
1183 	return (err);
1184 }
1185 
1186 int
1187 xen_get_mc_physcpuinfo(xen_mc_logical_cpu_t *log_cpus, uint_t *ncpus)
1188 {
1189 	struct xen_mc_physcpuinfo cpi;
1190 
1191 	cpi.ncpus = *ncpus;
1192 	/*LINTED: constant in conditional context*/
1193 	set_xen_guest_handle(cpi.info, log_cpus);
1194 
1195 	if (HYPERVISOR_mca(XEN_MC_physcpuinfo, (xen_mc_arg_t *)&cpi) != 0)
1196 		return (-1);
1197 
1198 	*ncpus = cpi.ncpus;
1199 	return (0);
1200 }
1201 
/*
 * Write the panic message `str' out through xen_printf().
 *
 * The message is passed as a "%s" argument and never as the format
 * itself, so any '%' characters it contains are printed literally
 * instead of being interpreted as conversion specifications.
 */
void
print_panic(const char *str)
{
	xen_printf("%s", str);
}
1207 
1208 /*
1209  * Interfaces to iterate over real cpu information, but only that info
1210  * which we choose to expose here.  These are of interest to dom0
1211  * only (and the backing hypercall should not work for domu).
1212  */
1213 
1214 xen_mc_lcpu_cookie_t
1215 xen_physcpu_next(xen_mc_lcpu_cookie_t cookie)
1216 {
1217 	xen_mc_logical_cpu_t *xcp = (xen_mc_logical_cpu_t *)cookie;
1218 
1219 	if (!DOMAIN_IS_INITDOMAIN(xen_info))
1220 		return (NULL);
1221 
1222 	if (cookie == NULL)
1223 		return ((xen_mc_lcpu_cookie_t)xen_phys_cpus);
1224 
1225 	if (xcp == xen_phys_cpus + xen_phys_ncpus - 1)
1226 		return (NULL);
1227 	else
1228 		return ((xen_mc_lcpu_cookie_t)++xcp);
1229 }
1230 
1231 #define	COOKIE2XCP(c) ((xen_mc_logical_cpu_t *)(c))
1232 
1233 const char *
1234 xen_physcpu_vendorstr(xen_mc_lcpu_cookie_t cookie)
1235 {
1236 	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);
1237 
1238 	return ((const char *)&xcp->mc_vendorid[0]);
1239 }
1240 
1241 int
1242 xen_physcpu_family(xen_mc_lcpu_cookie_t cookie)
1243 {
1244 	return (COOKIE2XCP(cookie)->mc_family);
1245 }
1246 
1247 int
1248 xen_physcpu_model(xen_mc_lcpu_cookie_t cookie)
1249 {
1250 	return (COOKIE2XCP(cookie)->mc_model);
1251 }
1252 
1253 int
1254 xen_physcpu_stepping(xen_mc_lcpu_cookie_t cookie)
1255 {
1256 	return (COOKIE2XCP(cookie)->mc_step);
1257 }
1258 
1259 id_t
1260 xen_physcpu_chipid(xen_mc_lcpu_cookie_t cookie)
1261 {
1262 	return (COOKIE2XCP(cookie)->mc_chipid);
1263 }
1264 
1265 id_t
1266 xen_physcpu_coreid(xen_mc_lcpu_cookie_t cookie)
1267 {
1268 	return (COOKIE2XCP(cookie)->mc_coreid);
1269 }
1270 
1271 id_t
1272 xen_physcpu_strandid(xen_mc_lcpu_cookie_t cookie)
1273 {
1274 	return (COOKIE2XCP(cookie)->mc_threadid);
1275 }
1276 
1277 id_t
1278 xen_physcpu_initial_apicid(xen_mc_lcpu_cookie_t cookie)
1279 {
1280 	return (COOKIE2XCP(cookie)->mc_clusterid);
1281 }
1282 
1283 id_t
1284 xen_physcpu_logical_id(xen_mc_lcpu_cookie_t cookie)
1285 {
1286 	return (COOKIE2XCP(cookie)->mc_cpunr);
1287 }
1288 
1289 boolean_t
1290 xen_physcpu_is_cmt(xen_mc_lcpu_cookie_t cookie)
1291 {
1292 	return (COOKIE2XCP(cookie)->mc_nthreads > 1);
1293 }
1294 
1295 uint64_t
1296 xen_physcpu_mcg_cap(xen_mc_lcpu_cookie_t cookie)
1297 {
1298 	xen_mc_logical_cpu_t *xcp = COOKIE2XCP(cookie);
1299 
1300 	/*
1301 	 * Need to #define the indices, or search through the array.
1302 	 */
1303 	return (xcp->mc_msrvalues[0].value);
1304 }
1305 
1306 int
1307 xen_map_gref(uint_t cmd, gnttab_map_grant_ref_t *mapop, uint_t count,
1308     boolean_t uvaddr)
1309 {
1310 	long rc;
1311 	uint_t i;
1312 
1313 	ASSERT(cmd == GNTTABOP_map_grant_ref);
1314 
1315 #if !defined(_BOOT)
1316 	if (uvaddr == B_FALSE) {
1317 		for (i = 0; i < count; ++i) {
1318 			mapop[i].flags |= (PT_FOREIGN <<_GNTMAP_guest_avail0);
1319 		}
1320 	}
1321 #endif
1322 
1323 	rc = HYPERVISOR_grant_table_op(cmd, mapop, count);
1324 
1325 	return (rc);
1326 }
1327 
1328 static int
1329 xpv_get_physinfo(xen_sysctl_physinfo_t *pi)
1330 {
1331 	xen_sysctl_t op;
1332 	struct sp { void *p; } *sp = (struct sp *)&op.u.physinfo.cpu_to_node;
1333 	int ret;
1334 
1335 	bzero(&op, sizeof (op));
1336 	op.cmd = XEN_SYSCTL_physinfo;
1337 	op.interface_version = XEN_SYSCTL_INTERFACE_VERSION;
1338 	/*LINTED: constant in conditional context*/
1339 	set_xen_guest_handle(*sp, NULL);
1340 
1341 	ret = HYPERVISOR_sysctl(&op);
1342 
1343 	if (ret != 0)
1344 		return (xen_xlate_errcode(ret));
1345 
1346 	bcopy(&op.u.physinfo, pi, sizeof (op.u.physinfo));
1347 	return (0);
1348 }
1349 
1350 /*
1351  * On dom0, we can determine the number of physical cpus on the machine.
1352  * This number is important when figuring out what workarounds are
1353  * appropriate, so compute it now.
1354  */
1355 uint_t
1356 xpv_nr_phys_cpus(void)
1357 {
1358 	static uint_t nphyscpus = 0;
1359 
1360 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1361 
1362 	if (nphyscpus == 0) {
1363 		xen_sysctl_physinfo_t pi;
1364 		int ret;
1365 
1366 		if ((ret = xpv_get_physinfo(&pi)) != 0)
1367 			panic("xpv_get_physinfo() failed: %d\n", ret);
1368 		nphyscpus = pi.nr_cpus;
1369 	}
1370 	return (nphyscpus);
1371 }
1372 
1373 pgcnt_t
1374 xpv_nr_phys_pages(void)
1375 {
1376 	xen_sysctl_physinfo_t pi;
1377 	int ret;
1378 
1379 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1380 
1381 	if ((ret = xpv_get_physinfo(&pi)) != 0)
1382 		panic("xpv_get_physinfo() failed: %d\n", ret);
1383 
1384 	return ((pgcnt_t)pi.total_pages);
1385 }
1386 
1387 uint64_t
1388 xpv_cpu_khz(void)
1389 {
1390 	xen_sysctl_physinfo_t pi;
1391 	int ret;
1392 
1393 	ASSERT(DOMAIN_IS_INITDOMAIN(xen_info));
1394 
1395 	if ((ret = xpv_get_physinfo(&pi)) != 0)
1396 		panic("xpv_get_physinfo() failed: %d\n", ret);
1397 	return ((uint64_t)pi.cpu_khz);
1398 }
1399