1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright 2020 Joyent, Inc.
27 */
28
29
30#include <sys/types.h>
31#include <sys/machparam.h>
32#include <sys/x86_archext.h>
33#include <sys/systm.h>
34#include <sys/mach_mmu.h>
35#include <sys/multiboot.h>
36#include <sys/multiboot2.h>
37#include <sys/multiboot2_impl.h>
38#include <sys/sysmacros.h>
39#include <sys/framebuffer.h>
40#include <sys/sha1.h>
41#include <util/string.h>
42#include <util/strtolctype.h>
43#include <sys/efi.h>
44
45/*
46 * Compile time debug knob. We do not have any early mechanism to control it
47 * as the boot is the earliest mechanism we have, and we do not want to have
48 * it being switched on by default.
49 */
50int dboot_debug = 0;
51
52#if defined(__xpv)
53
54#include <sys/hypervisor.h>
55uintptr_t xen_virt_start;
56pfn_t *mfn_to_pfn_mapping;
57
58#else /* !__xpv */
59
60extern multiboot_header_t mb_header;
61extern uint32_t mb2_load_addr;
62extern int have_cpuid(void);
63
64#endif /* !__xpv */
65
66#include <sys/inttypes.h>
67#include <sys/bootinfo.h>
68#include <sys/mach_mmu.h>
69#include <sys/boot_console.h>
70
71#include "dboot_asm.h"
72#include "dboot_printf.h"
73#include "dboot_xboot.h"
74#include "dboot_elfload.h"
75
76#define	SHA1_ASCII_LENGTH	(SHA1_DIGEST_LENGTH * 2)
77
78/*
79 * This file contains code that runs to transition us from either a multiboot
80 * compliant loader (32 bit non-paging) or a XPV domain loader to
81 * regular kernel execution. Its task is to setup the kernel memory image
82 * and page tables.
83 *
84 * The code executes as:
85 *	- 32 bits under GRUB (for 32 or 64 bit Solaris)
86 *	- a 32 bit program for the 32-bit PV hypervisor
87 *	- a 64 bit program for the 64-bit PV hypervisor (at least for now)
88 *
89 * Under the PV hypervisor, we must create mappings for any memory beyond the
90 * initial start of day allocation (such as the kernel itself).
91 *
 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
 * Since we are running in real mode, all such memory is accessible.
94 */
95
96/*
97 * Standard bits used in PTE (page level) and PTP (internal levels)
98 */
99x86pte_t ptp_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_USER;
100x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST;
101
102/*
 * This is the target address (physical) where the kernel text and data
104 * nucleus pages will be unpacked. On the hypervisor this is actually a
105 * virtual address.
106 */
107paddr_t ktext_phys;
108uint32_t ksize = 2 * FOUR_MEG;	/* kernel nucleus is 8Meg */
109
110static uint64_t target_kernel_text;	/* value to use for KERNEL_TEXT */
111
112/*
113 * The stack is setup in assembler before entering startup_kernel()
114 */
115char stack_space[STACK_SIZE];
116
117/*
118 * Used to track physical memory allocation
119 */
120static paddr_t next_avail_addr = 0;
121
122#if defined(__xpv)
123/*
124 * Additional information needed for hypervisor memory allocation.
125 * Only memory up to scratch_end is mapped by page tables.
126 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
127 * to derive a pfn from a pointer, you subtract mfn_base.
128 */
129
130static paddr_t scratch_end = 0;	/* we can't write all of mem here */
131static paddr_t mfn_base;		/* addr corresponding to mfn_list[0] */
132start_info_t *xen_info;
133
134#else	/* __xpv */
135
136/*
137 * If on the metal, then we have a multiboot loader.
138 */
139uint32_t mb_magic;			/* magic from boot loader */
140uint32_t mb_addr;			/* multiboot info package from loader */
141int multiboot_version;
142multiboot_info_t *mb_info;
143multiboot2_info_header_t *mb2_info;
144multiboot_tag_mmap_t *mb2_mmap_tagp;
145int num_entries;			/* mmap entry count */
146boolean_t num_entries_set;		/* is mmap entry count set */
147uintptr_t load_addr;
148static boot_framebuffer_t framebuffer __aligned(16);
149static boot_framebuffer_t *fb;
150
151/* can not be automatic variables because of alignment */
152static efi_guid_t smbios3 = SMBIOS3_TABLE_GUID;
153static efi_guid_t smbios = SMBIOS_TABLE_GUID;
154static efi_guid_t acpi2 = EFI_ACPI_TABLE_GUID;
155static efi_guid_t acpi1 = ACPI_10_TABLE_GUID;
156#endif	/* __xpv */
157
158/*
159 * This contains information passed to the kernel
160 */
161struct xboot_info boot_info __aligned(16);
162struct xboot_info *bi;
163
164/*
165 * Page table and memory stuff.
166 */
167static paddr_t max_mem;			/* maximum memory address */
168
169/*
170 * Information about processor MMU
171 */
172int amd64_support = 0;
173int largepage_support = 0;
174int pae_support = 0;
175int pge_support = 0;
176int NX_support = 0;
177int PAT_support = 0;
178
179/*
180 * Low 32 bits of kernel entry address passed back to assembler.
181 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
182 */
183uint32_t entry_addr_low;
184
185/*
186 * Memlists for the kernel. We shouldn't need a lot of these.
187 */
188#define	MAX_MEMLIST (50)
189struct boot_memlist memlists[MAX_MEMLIST];
190uint_t memlists_used = 0;
191struct boot_memlist pcimemlists[MAX_MEMLIST];
192uint_t pcimemlists_used = 0;
193struct boot_memlist rsvdmemlists[MAX_MEMLIST];
194uint_t rsvdmemlists_used = 0;
195
196/*
197 * This should match what's in the bootloader.  It's arbitrary, but GRUB
198 * in particular has limitations on how much space it can use before it
199 * stops working properly.  This should be enough.
200 */
201struct boot_modules modules[MAX_BOOT_MODULES];
202uint_t modules_used = 0;
203
204#ifdef __xpv
205/*
206 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
207 * definition in Xen source.
208 */
typedef struct {
	uint32_t	base_addr_low;	/* low 32 bits of the region base */
	uint32_t	base_addr_high;	/* high 32 bits of the region base */
	uint32_t	length_low;	/* low 32 bits of the region length */
	uint32_t	length_high;	/* high 32 bits of the region length */
	uint32_t	type;		/* region type as reported by Xen */
} mmap_t;
216
217/*
218 * There is 512KB of scratch area after the boot stack page.
219 * We'll use that for everything except the kernel nucleus pages which are too
220 * big to fit there and are allocated last anyway.
221 */
222#define	MAXMAPS	100
223static mmap_t map_buffer[MAXMAPS];
224#else
225typedef mb_memory_map_t mmap_t;
226#endif
227
228/*
229 * Debugging macros
230 */
231uint_t prom_debug = 0;
232uint_t map_debug = 0;
233
234static char noname[2] = "-";
235
236/*
237 * Either hypervisor-specific or grub-specific code builds the initial
238 * memlists. This code does the sort/merge/link for final use.
239 */
240static void
241sort_physinstall(void)
242{
243	int i;
244#if !defined(__xpv)
245	int j;
246	struct boot_memlist tmp;
247
248	/*
249	 * Now sort the memlists, in case they weren't in order.
250	 * Yeah, this is a bubble sort; small, simple and easy to get right.
251	 */
252	DBG_MSG("Sorting phys-installed list\n");
253	for (j = memlists_used - 1; j > 0; --j) {
254		for (i = 0; i < j; ++i) {
255			if (memlists[i].addr < memlists[i + 1].addr)
256				continue;
257			tmp = memlists[i];
258			memlists[i] = memlists[i + 1];
259			memlists[i + 1] = tmp;
260		}
261	}
262
263	/*
264	 * Merge any memlists that don't have holes between them.
265	 */
266	for (i = 0; i <= memlists_used - 1; ++i) {
267		if (memlists[i].addr + memlists[i].size != memlists[i + 1].addr)
268			continue;
269
270		if (prom_debug)
271			dboot_printf(
272			    "merging mem segs %" PRIx64 "...%" PRIx64
273			    " w/ %" PRIx64 "...%" PRIx64 "\n",
274			    memlists[i].addr,
275			    memlists[i].addr + memlists[i].size,
276			    memlists[i + 1].addr,
277			    memlists[i + 1].addr + memlists[i + 1].size);
278
279		memlists[i].size += memlists[i + 1].size;
280		for (j = i + 1; j < memlists_used - 1; ++j)
281			memlists[j] = memlists[j + 1];
282		--memlists_used;
283		DBG(memlists_used);
284		--i;	/* after merging we need to reexamine, so do this */
285	}
286#endif	/* __xpv */
287
288	if (prom_debug) {
289		dboot_printf("\nFinal memlists:\n");
290		for (i = 0; i < memlists_used; ++i) {
291			dboot_printf("\t%d: addr=%" PRIx64 " size=%"
292			    PRIx64 "\n", i, memlists[i].addr, memlists[i].size);
293		}
294	}
295
296	/*
297	 * link together the memlists with native size pointers
298	 */
299	memlists[0].next = 0;
300	memlists[0].prev = 0;
301	for (i = 1; i < memlists_used; ++i) {
302		memlists[i].prev = (native_ptr_t)(uintptr_t)(memlists + i - 1);
303		memlists[i].next = 0;
304		memlists[i - 1].next = (native_ptr_t)(uintptr_t)(memlists + i);
305	}
306	bi->bi_phys_install = (native_ptr_t)(uintptr_t)memlists;
307	DBG(bi->bi_phys_install);
308}
309
310/*
311 * build bios reserved memlists
312 */
313static void
314build_rsvdmemlists(void)
315{
316	int i;
317
318	rsvdmemlists[0].next = 0;
319	rsvdmemlists[0].prev = 0;
320	for (i = 1; i < rsvdmemlists_used; ++i) {
321		rsvdmemlists[i].prev =
322		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i - 1);
323		rsvdmemlists[i].next = 0;
324		rsvdmemlists[i - 1].next =
325		    (native_ptr_t)(uintptr_t)(rsvdmemlists + i);
326	}
327	bi->bi_rsvdmem = (native_ptr_t)(uintptr_t)rsvdmemlists;
328	DBG(bi->bi_rsvdmem);
329}
330
331#if defined(__xpv)
332
333/*
334 * halt on the hypervisor after a delay to drain console output
335 */
336void
337dboot_halt(void)
338{
339	uint_t i = 10000;
340
341	while (--i)
342		(void) HYPERVISOR_yield();
343	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
344}
345
346/*
347 * From a machine address, find the corresponding pseudo-physical address.
348 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
349 * Machine addresses are the real underlying hardware addresses.
350 * These are needed for page table entries. Note that this routine is
351 * poorly protected. A bad value of "ma" will cause a page fault.
352 */
353paddr_t
354ma_to_pa(maddr_t ma)
355{
356	ulong_t pgoff = ma & MMU_PAGEOFFSET;
357	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
358	paddr_t pa;
359
360	if (pfn >= xen_info->nr_pages)
361		return (-(paddr_t)1);
362	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;
363#ifdef DEBUG
364	if (ma != pa_to_ma(pa))
365		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
366		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));
367#endif
368	return (pa);
369}
370
371/*
372 * From a pseudo-physical address, find the corresponding machine address.
373 */
374maddr_t
375pa_to_ma(paddr_t pa)
376{
377	pfn_t pfn;
378	ulong_t mfn;
379
380	pfn = mmu_btop(pa - mfn_base);
381	if (pa < mfn_base || pfn >= xen_info->nr_pages)
382		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
383	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];
384#ifdef DEBUG
385	if (mfn_to_pfn_mapping[mfn] != pfn)
386		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
387		    pfn, mfn, mfn_to_pfn_mapping[mfn]);
388#endif
389	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
390}
391
392#endif	/* __xpv */
393
394x86pte_t
395get_pteval(paddr_t table, uint_t index)
396{
397	if (pae_support)
398		return (((x86pte_t *)(uintptr_t)table)[index]);
399	return (((x86pte32_t *)(uintptr_t)table)[index]);
400}
401
402/*ARGSUSED*/
403void
404set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval)
405{
406#ifdef __xpv
407	mmu_update_t t;
408	maddr_t mtable = pa_to_ma(table);
409	int retcnt;
410
411	t.ptr = (mtable + index * pte_size) | MMU_NORMAL_PT_UPDATE;
412	t.val = pteval;
413	if (HYPERVISOR_mmu_update(&t, 1, &retcnt, DOMID_SELF) || retcnt != 1)
414		dboot_panic("HYPERVISOR_mmu_update() failed");
415#else /* __xpv */
416	uintptr_t tab_addr = (uintptr_t)table;
417
418	if (pae_support)
419		((x86pte_t *)tab_addr)[index] = pteval;
420	else
421		((x86pte32_t *)tab_addr)[index] = (x86pte32_t)pteval;
422	if (level == top_level && level == 2)
423		reload_cr3();
424#endif /* __xpv */
425}
426
427paddr_t
428make_ptable(x86pte_t *pteval, uint_t level)
429{
430	paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
431
432	if (level == top_level && level == 2)
433		*pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID;
434	else
435		*pteval = pa_to_ma((uintptr_t)new_table) | ptp_bits;
436
437#ifdef __xpv
438	/* Remove write permission to the new page table. */
439	if (HYPERVISOR_update_va_mapping(new_table,
440	    *pteval & ~(x86pte_t)PT_WRITABLE, UVMF_INVLPG | UVMF_LOCAL))
441		dboot_panic("HYP_update_va_mapping error");
442#endif
443
444	if (map_debug)
445		dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
446		    PRIx64 "\n", level, (ulong_t)new_table, *pteval);
447	return (new_table);
448}
449
450x86pte_t *
451map_pte(paddr_t table, uint_t index)
452{
453	return ((x86pte_t *)(uintptr_t)(table + index * pte_size));
454}
455
456/*
457 * dump out the contents of page tables...
458 */
459static void
460dump_tables(void)
461{
462	uint_t save_index[4];	/* for recursion */
463	char *save_table[4];	/* for recursion */
464	uint_t	l;
465	uint64_t va;
466	uint64_t pgsize;
467	int index;
468	int i;
469	x86pte_t pteval;
470	char *table;
471	static char *tablist = "\t\t\t";
472	char *tabs = tablist + 3 - top_level;
473	uint_t pa, pa1;
474#if !defined(__xpv)
475#define	maddr_t paddr_t
476#endif /* !__xpv */
477
478	dboot_printf("Finished pagetables:\n");
479	table = (char *)(uintptr_t)top_page_table;
480	l = top_level;
481	va = 0;
482	for (index = 0; index < ptes_per_table; ++index) {
483		pgsize = 1ull << shift_amt[l];
484		if (pae_support)
485			pteval = ((x86pte_t *)table)[index];
486		else
487			pteval = ((x86pte32_t *)table)[index];
488		if (pteval == 0)
489			goto next_entry;
490
491		dboot_printf("%s %p[0x%x] = %" PRIx64 ", va=%" PRIx64,
492		    tabs + l, (void *)table, index, (uint64_t)pteval, va);
493		pa = ma_to_pa(pteval & MMU_PAGEMASK);
494		dboot_printf(" physaddr=%x\n", pa);
495
496		/*
497		 * Don't try to walk hypervisor private pagetables
498		 */
499		if ((l > 1 || (l == 1 && (pteval & PT_PAGESIZE) == 0))) {
500			save_table[l] = table;
501			save_index[l] = index;
502			--l;
503			index = -1;
504			table = (char *)(uintptr_t)
505			    ma_to_pa(pteval & MMU_PAGEMASK);
506			goto recursion;
507		}
508
509		/*
510		 * shorten dump for consecutive mappings
511		 */
512		for (i = 1; index + i < ptes_per_table; ++i) {
513			if (pae_support)
514				pteval = ((x86pte_t *)table)[index + i];
515			else
516				pteval = ((x86pte32_t *)table)[index + i];
517			if (pteval == 0)
518				break;
519			pa1 = ma_to_pa(pteval & MMU_PAGEMASK);
520			if (pa1 != pa + i * pgsize)
521				break;
522		}
523		if (i > 2) {
524			dboot_printf("%s...\n", tabs + l);
525			va += pgsize * (i - 2);
526			index += i - 2;
527		}
528next_entry:
529		va += pgsize;
530		if (l == 3 && index == 256)	/* VA hole */
531			va = 0xffff800000000000ull;
532recursion:
533		;
534	}
535	if (l < top_level) {
536		++l;
537		index = save_index[l];
538		table = save_table[l];
539		goto recursion;
540	}
541}
542
543/*
544 * Add a mapping for the machine page at the given virtual address.
545 */
546static void
547map_ma_at_va(maddr_t ma, native_ptr_t va, uint_t level)
548{
549	x86pte_t *ptep;
550	x86pte_t pteval;
551
552	pteval = ma | pte_bits;
553	if (level > 0)
554		pteval |= PT_PAGESIZE;
555	if (va >= target_kernel_text && pge_support)
556		pteval |= PT_GLOBAL;
557
558	if (map_debug && ma != va)
559		dboot_printf("mapping ma=0x%" PRIx64 " va=0x%" PRIx64
560		    " pte=0x%" PRIx64 " l=%d\n",
561		    (uint64_t)ma, (uint64_t)va, pteval, level);
562
563#if defined(__xpv)
564	/*
565	 * see if we can avoid find_pte() on the hypervisor
566	 */
567	if (HYPERVISOR_update_va_mapping(va, pteval,
568	    UVMF_INVLPG | UVMF_LOCAL) == 0)
569		return;
570#endif
571
572	/*
573	 * Find the pte that will map this address. This creates any
574	 * missing intermediate level page tables
575	 */
576	ptep = find_pte(va, NULL, level, 0);
577
578	/*
579	 * When paravirtualized, we must use hypervisor calls to modify the
580	 * PTE, since paging is active. On real hardware we just write to
581	 * the pagetables which aren't in use yet.
582	 */
583#if defined(__xpv)
584	ptep = ptep;	/* shut lint up */
585	if (HYPERVISOR_update_va_mapping(va, pteval, UVMF_INVLPG | UVMF_LOCAL))
586		dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
587		    " l=%d ma=0x%" PRIx64 ", pte=0x%" PRIx64 "",
588		    (uint64_t)va, level, (uint64_t)ma, pteval);
589#else
590	if (va < 1024 * 1024)
591		pteval |= PT_NOCACHE;		/* for video RAM */
592	if (pae_support)
593		*ptep = pteval;
594	else
595		*((x86pte32_t *)ptep) = (x86pte32_t)pteval;
596#endif
597}
598
599/*
600 * Add a mapping for the physical page at the given virtual address.
601 */
602static void
603map_pa_at_va(paddr_t pa, native_ptr_t va, uint_t level)
604{
605	map_ma_at_va(pa_to_ma(pa), va, level);
606}
607
608/*
609 * This is called to remove start..end from the
610 * possible range of PCI addresses.
611 */
612const uint64_t pci_lo_limit = 0x00100000ul;
613const uint64_t pci_hi_limit = 0xfff00000ul;
614static void
615exclude_from_pci(uint64_t start, uint64_t end)
616{
617	int i;
618	int j;
619	struct boot_memlist *ml;
620
621	for (i = 0; i < pcimemlists_used; ++i) {
622		ml = &pcimemlists[i];
623
624		/* delete the entire range? */
625		if (start <= ml->addr && ml->addr + ml->size <= end) {
626			--pcimemlists_used;
627			for (j = i; j < pcimemlists_used; ++j)
628				pcimemlists[j] = pcimemlists[j + 1];
629			--i;	/* to revisit the new one at this index */
630		}
631
632		/* split a range? */
633		else if (ml->addr < start && end < ml->addr + ml->size) {
634
635			++pcimemlists_used;
636			if (pcimemlists_used > MAX_MEMLIST)
637				dboot_panic("too many pcimemlists");
638
639			for (j = pcimemlists_used - 1; j > i; --j)
640				pcimemlists[j] = pcimemlists[j - 1];
641			ml->size = start - ml->addr;
642
643			++ml;
644			ml->size = (ml->addr + ml->size) - end;
645			ml->addr = end;
646			++i;	/* skip on to next one */
647		}
648
649		/* cut memory off the start? */
650		else if (ml->addr < end && end < ml->addr + ml->size) {
651			ml->size -= end - ml->addr;
652			ml->addr = end;
653		}
654
655		/* cut memory off the end? */
656		else if (ml->addr <= start && start < ml->addr + ml->size) {
657			ml->size = start - ml->addr;
658		}
659	}
660}
661
662/*
663 * During memory allocation, find the highest address not used yet.
664 */
665static void
666check_higher(paddr_t a)
667{
668	if (a < next_avail_addr)
669		return;
670	next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
671	DBG(next_avail_addr);
672}
673
674static int
675dboot_loader_mmap_entries(void)
676{
677#if !defined(__xpv)
678	if (num_entries_set == B_TRUE)
679		return (num_entries);
680
681	switch (multiboot_version) {
682	case 1:
683		DBG(mb_info->flags);
684		if (mb_info->flags & 0x40) {
685			mb_memory_map_t *mmap;
686			caddr32_t mmap_addr;
687
688			DBG(mb_info->mmap_addr);
689			DBG(mb_info->mmap_length);
690			check_higher(mb_info->mmap_addr + mb_info->mmap_length);
691
692			for (mmap_addr = mb_info->mmap_addr;
693			    mmap_addr < mb_info->mmap_addr +
694			    mb_info->mmap_length;
695			    mmap_addr += mmap->size + sizeof (mmap->size)) {
696				mmap = (mb_memory_map_t *)(uintptr_t)mmap_addr;
697				++num_entries;
698			}
699
700			num_entries_set = B_TRUE;
701		}
702		break;
703	case 2:
704		num_entries_set = B_TRUE;
705		num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
706		    mb2_mmap_tagp);
707		break;
708	default:
709		dboot_panic("Unknown multiboot version: %d\n",
710		    multiboot_version);
711		break;
712	}
713	return (num_entries);
714#else
715	return (MAXMAPS);
716#endif
717}
718
719static uint32_t
720dboot_loader_mmap_get_type(int index)
721{
722#if !defined(__xpv)
723	mb_memory_map_t *mp, *mpend;
724	caddr32_t mmap_addr;
725	int i;
726
727	switch (multiboot_version) {
728	case 1:
729		mp = (mb_memory_map_t *)(uintptr_t)mb_info->mmap_addr;
730		mpend = (mb_memory_map_t *)(uintptr_t)
731		    (mb_info->mmap_addr + mb_info->mmap_length);
732
733		for (i = 0; mp < mpend && i != index; i++)
734			mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
735			    sizeof (mp->size));
736		if (mp >= mpend) {
737			dboot_panic("dboot_loader_mmap_get_type(): index "
738			    "out of bounds: %d\n", index);
739		}
740		return (mp->type);
741
742	case 2:
743		return (dboot_multiboot2_mmap_get_type(mb2_info,
744		    mb2_mmap_tagp, index));
745
746	default:
747		dboot_panic("Unknown multiboot version: %d\n",
748		    multiboot_version);
749		break;
750	}
751	return (0);
752#else
753	return (map_buffer[index].type);
754#endif
755}
756
757static uint64_t
758dboot_loader_mmap_get_base(int index)
759{
760#if !defined(__xpv)
761	mb_memory_map_t *mp, *mpend;
762	int i;
763
764	switch (multiboot_version) {
765	case 1:
766		mp = (mb_memory_map_t *)mb_info->mmap_addr;
767		mpend = (mb_memory_map_t *)
768		    (mb_info->mmap_addr + mb_info->mmap_length);
769
770		for (i = 0; mp < mpend && i != index; i++)
771			mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
772			    sizeof (mp->size));
773		if (mp >= mpend) {
774			dboot_panic("dboot_loader_mmap_get_base(): index "
775			    "out of bounds: %d\n", index);
776		}
777		return (((uint64_t)mp->base_addr_high << 32) +
778		    (uint64_t)mp->base_addr_low);
779
780	case 2:
781		return (dboot_multiboot2_mmap_get_base(mb2_info,
782		    mb2_mmap_tagp, index));
783
784	default:
785		dboot_panic("Unknown multiboot version: %d\n",
786		    multiboot_version);
787		break;
788	}
789	return (0);
790#else
791	return (((uint64_t)map_buffer[index].base_addr_high << 32) +
792	    (uint64_t)map_buffer[index].base_addr_low);
793#endif
794}
795
796static uint64_t
797dboot_loader_mmap_get_length(int index)
798{
799#if !defined(__xpv)
800	mb_memory_map_t *mp, *mpend;
801	int i;
802
803	switch (multiboot_version) {
804	case 1:
805		mp = (mb_memory_map_t *)mb_info->mmap_addr;
806		mpend = (mb_memory_map_t *)
807		    (mb_info->mmap_addr + mb_info->mmap_length);
808
809		for (i = 0; mp < mpend && i != index; i++)
810			mp = (mb_memory_map_t *)((uintptr_t)mp + mp->size +
811			    sizeof (mp->size));
812		if (mp >= mpend) {
813			dboot_panic("dboot_loader_mmap_get_length(): index "
814			    "out of bounds: %d\n", index);
815		}
816		return (((uint64_t)mp->length_high << 32) +
817		    (uint64_t)mp->length_low);
818
819	case 2:
820		return (dboot_multiboot2_mmap_get_length(mb2_info,
821		    mb2_mmap_tagp, index));
822
823	default:
824		dboot_panic("Unknown multiboot version: %d\n",
825		    multiboot_version);
826		break;
827	}
828	return (0);
829#else
830	return (((uint64_t)map_buffer[index].length_high << 32) +
831	    (uint64_t)map_buffer[index].length_low);
832#endif
833}
834
835static void
836build_pcimemlists(void)
837{
838	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
839	uint64_t start;
840	uint64_t end;
841	int i, num;
842
843	/*
844	 * initialize
845	 */
846	pcimemlists[0].addr = pci_lo_limit;
847	pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
848	pcimemlists_used = 1;
849
850	num = dboot_loader_mmap_entries();
851	/*
852	 * Fill in PCI memlists.
853	 */
854	for (i = 0; i < num; ++i) {
855		start = dboot_loader_mmap_get_base(i);
856		end = start + dboot_loader_mmap_get_length(i);
857
858		if (prom_debug)
859			dboot_printf("\ttype: %d %" PRIx64 "..%"
860			    PRIx64 "\n", dboot_loader_mmap_get_type(i),
861			    start, end);
862
863		/*
864		 * page align start and end
865		 */
866		start = (start + page_offset) & ~page_offset;
867		end &= ~page_offset;
868		if (end <= start)
869			continue;
870
871		exclude_from_pci(start, end);
872	}
873
874	/*
875	 * Finish off the pcimemlist
876	 */
877	if (prom_debug) {
878		for (i = 0; i < pcimemlists_used; ++i) {
879			dboot_printf("pcimemlist entry 0x%" PRIx64 "..0x%"
880			    PRIx64 "\n", pcimemlists[i].addr,
881			    pcimemlists[i].addr + pcimemlists[i].size);
882		}
883	}
884	pcimemlists[0].next = 0;
885	pcimemlists[0].prev = 0;
886	for (i = 1; i < pcimemlists_used; ++i) {
887		pcimemlists[i].prev =
888		    (native_ptr_t)(uintptr_t)(pcimemlists + i - 1);
889		pcimemlists[i].next = 0;
890		pcimemlists[i - 1].next =
891		    (native_ptr_t)(uintptr_t)(pcimemlists + i);
892	}
893	bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
894	DBG(bi->bi_pcimem);
895}
896
897#if defined(__xpv)
898/*
899 * Initialize memory allocator stuff from hypervisor-supplied start info.
900 */
901static void
902init_mem_alloc(void)
903{
904	int	local;	/* variables needed to find start region */
905	paddr_t	scratch_start;
906	xen_memory_map_t map;
907
908	DBG_MSG("Entered init_mem_alloc()\n");
909
910	/*
911	 * Free memory follows the stack. There's at least 512KB of scratch
912	 * space, rounded up to at least 2Mb alignment.  That should be enough
913	 * for the page tables we'll need to build.  The nucleus memory is
914	 * allocated last and will be outside the addressible range.  We'll
915	 * switch to new page tables before we unpack the kernel
916	 */
917	scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE);
918	DBG(scratch_start);
919	scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG);
920	DBG(scratch_end);
921
922	/*
923	 * For paranoia, leave some space between hypervisor data and ours.
924	 * Use 500 instead of 512.
925	 */
926	next_avail_addr = scratch_end - 500 * 1024;
927	DBG(next_avail_addr);
928
929	/*
930	 * The domain builder gives us at most 1 module
931	 */
932	DBG(xen_info->mod_len);
933	if (xen_info->mod_len > 0) {
934		DBG(xen_info->mod_start);
935		modules[0].bm_addr =
936		    (native_ptr_t)(uintptr_t)xen_info->mod_start;
937		modules[0].bm_size = xen_info->mod_len;
938		bi->bi_module_cnt = 1;
939		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
940	} else {
941		bi->bi_module_cnt = 0;
942		bi->bi_modules = (native_ptr_t)(uintptr_t)NULL;
943	}
944	DBG(bi->bi_module_cnt);
945	DBG(bi->bi_modules);
946
947	DBG(xen_info->mfn_list);
948	DBG(xen_info->nr_pages);
949	max_mem = (paddr_t)xen_info->nr_pages << MMU_PAGESHIFT;
950	DBG(max_mem);
951
952	/*
953	 * Using pseudo-physical addresses, so only 1 memlist element
954	 */
955	memlists[0].addr = 0;
956	DBG(memlists[0].addr);
957	memlists[0].size = max_mem;
958	DBG(memlists[0].size);
959	memlists_used = 1;
960	DBG(memlists_used);
961
962	/*
963	 * finish building physinstall list
964	 */
965	sort_physinstall();
966
967	/*
968	 * build bios reserved memlists
969	 */
970	build_rsvdmemlists();
971
972	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
973		/*
974		 * build PCI Memory list
975		 */
976		map.nr_entries = MAXMAPS;
977		/*LINTED: constant in conditional context*/
978		set_xen_guest_handle(map.buffer, map_buffer);
979		if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
980			dboot_panic("getting XENMEM_machine_memory_map failed");
981		build_pcimemlists();
982	}
983}
984
985#else	/* !__xpv */
986
987static void
988dboot_multiboot1_xboot_consinfo(void)
989{
990	fb->framebuffer = 0;
991}
992
993static void
994dboot_multiboot2_xboot_consinfo(void)
995{
996	multiboot_tag_framebuffer_t *fbtag;
997	fbtag = dboot_multiboot2_find_tag(mb2_info,
998	    MULTIBOOT_TAG_TYPE_FRAMEBUFFER);
999	fb->framebuffer = (uint64_t)(uintptr_t)fbtag;
1000}
1001
1002static int
1003dboot_multiboot_modcount(void)
1004{
1005	switch (multiboot_version) {
1006	case 1:
1007		return (mb_info->mods_count);
1008
1009	case 2:
1010		return (dboot_multiboot2_modcount(mb2_info));
1011
1012	default:
1013		dboot_panic("Unknown multiboot version: %d\n",
1014		    multiboot_version);
1015		break;
1016	}
1017	return (0);
1018}
1019
1020static uint32_t
1021dboot_multiboot_modstart(int index)
1022{
1023	switch (multiboot_version) {
1024	case 1:
1025		return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
1026
1027	case 2:
1028		return (dboot_multiboot2_modstart(mb2_info, index));
1029
1030	default:
1031		dboot_panic("Unknown multiboot version: %d\n",
1032		    multiboot_version);
1033		break;
1034	}
1035	return (0);
1036}
1037
1038static uint32_t
1039dboot_multiboot_modend(int index)
1040{
1041	switch (multiboot_version) {
1042	case 1:
1043		return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
1044
1045	case 2:
1046		return (dboot_multiboot2_modend(mb2_info, index));
1047
1048	default:
1049		dboot_panic("Unknown multiboot version: %d\n",
1050		    multiboot_version);
1051		break;
1052	}
1053	return (0);
1054}
1055
1056static char *
1057dboot_multiboot_modcmdline(int index)
1058{
1059	switch (multiboot_version) {
1060	case 1:
1061		return ((char *)((mb_module_t *)
1062		    mb_info->mods_addr)[index].mod_name);
1063
1064	case 2:
1065		return (dboot_multiboot2_modcmdline(mb2_info, index));
1066
1067	default:
1068		dboot_panic("Unknown multiboot version: %d\n",
1069		    multiboot_version);
1070		break;
1071	}
1072	return (0);
1073}
1074
1075/*
1076 * Find the modules used by console setup.
1077 * Since we need the console to print early boot messages, the console is set up
1078 * before anything else and therefore we need to pick up the needed modules.
1079 *
1080 * Note, we just will search for and if found, will pass the modules
1081 * to console setup, the proper module list processing will happen later.
1082 * Currently used modules are boot environment and console font.
1083 */
1084static void
1085dboot_find_console_modules(void)
1086{
1087	int i, modcount;
1088	uint32_t mod_start, mod_end;
1089	char *cmdline;
1090
1091	modcount = dboot_multiboot_modcount();
1092	bi->bi_module_cnt = 0;
1093	for (i = 0; i < modcount; ++i) {
1094		cmdline = dboot_multiboot_modcmdline(i);
1095		if (cmdline == NULL)
1096			continue;
1097
1098		if (strstr(cmdline, "type=console-font") != NULL)
1099			modules[bi->bi_module_cnt].bm_type = BMT_FONT;
1100		else if (strstr(cmdline, "type=environment") != NULL)
1101			modules[bi->bi_module_cnt].bm_type = BMT_ENV;
1102		else
1103			continue;
1104
1105		mod_start = dboot_multiboot_modstart(i);
1106		mod_end = dboot_multiboot_modend(i);
1107		modules[bi->bi_module_cnt].bm_addr =
1108		    (native_ptr_t)(uintptr_t)mod_start;
1109		modules[bi->bi_module_cnt].bm_size = mod_end - mod_start;
1110		modules[bi->bi_module_cnt].bm_name =
1111		    (native_ptr_t)(uintptr_t)NULL;
1112		modules[bi->bi_module_cnt].bm_hash =
1113		    (native_ptr_t)(uintptr_t)NULL;
1114		bi->bi_module_cnt++;
1115	}
1116	if (bi->bi_module_cnt != 0)
1117		bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1118}
1119
1120static boolean_t
1121dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
1122{
1123	boolean_t rv = B_FALSE;
1124
1125	switch (multiboot_version) {
1126	case 1:
1127		if (mb_info->flags & 0x01) {
1128			*lower = mb_info->mem_lower;
1129			*upper = mb_info->mem_upper;
1130			rv = B_TRUE;
1131		}
1132		break;
1133
1134	case 2:
1135		return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
1136
1137	default:
1138		dboot_panic("Unknown multiboot version: %d\n",
1139		    multiboot_version);
1140		break;
1141	}
1142	return (rv);
1143}
1144
/*
 * Convert one ASCII hexadecimal digit ('0'-'9', 'a'-'f' or 'A'-'F') to
 * its numeric value 0..15.  Any other character is rejected with a
 * panic, so a malformed hash string cannot silently turn into a bogus
 * digest value.
 */
static uint8_t
dboot_a2h(char v)
{
	if (v >= '0' && v <= '9')
		return (v - '0');
	if (v >= 'a' && v <= 'f')
		return (v - 'a' + 0xa);
	if (v >= 'A' && v <= 'F')
		return (v - 'A' + 0xa);

	dboot_panic("bad ASCII hex character %c\n", v);

	return (0);
}
1159
1160static void
1161digest_a2h(const char *ascii, uint8_t *digest)
1162{
1163	unsigned int i;
1164
1165	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1166		digest[i] = dboot_a2h(ascii[i * 2]) << 4;
1167		digest[i] |= dboot_a2h(ascii[i * 2 + 1]);
1168	}
1169}
1170
1171/*
1172 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1173 * the ASCII-format hash found in the 40-byte buffer at ascii.  If they
1174 * match, return 0, otherwise -1.  This works only for images smaller than
1175 * 4 GB, which should not be a problem.
1176 */
1177static int
1178check_image_hash(uint_t midx)
1179{
1180	const char *ascii;
1181	const void *image;
1182	size_t len;
1183	SHA1_CTX ctx;
1184	uint8_t digest[SHA1_DIGEST_LENGTH];
1185	uint8_t baseline[SHA1_DIGEST_LENGTH];
1186	unsigned int i;
1187
1188	ascii = (const char *)(uintptr_t)modules[midx].bm_hash;
1189	image = (const void *)(uintptr_t)modules[midx].bm_addr;
1190	len = (size_t)modules[midx].bm_size;
1191
1192	digest_a2h(ascii, baseline);
1193
1194	SHA1Init(&ctx);
1195	SHA1Update(&ctx, image, len);
1196	SHA1Final(digest, &ctx);
1197
1198	for (i = 0; i < SHA1_DIGEST_LENGTH; i++) {
1199		if (digest[i] != baseline[i])
1200			return (-1);
1201	}
1202
1203	return (0);
1204}
1205
1206static const char *
1207type_to_str(boot_module_type_t type)
1208{
1209	switch (type) {
1210	case BMT_ROOTFS:
1211		return ("rootfs");
1212	case BMT_FILE:
1213		return ("file");
1214	case BMT_HASH:
1215		return ("hash");
1216	case BMT_ENV:
1217		return ("environment");
1218	case BMT_FONT:
1219		return ("console-font");
1220	default:
1221		return ("unknown");
1222	}
1223}
1224
1225static void
1226check_images(void)
1227{
1228	uint_t i;
1229	char displayhash[SHA1_ASCII_LENGTH + 1];
1230
1231	for (i = 0; i < modules_used; i++) {
1232		if (prom_debug) {
1233			dboot_printf("module #%d: name %s type %s "
1234			    "addr %lx size %lx\n",
1235			    i, (char *)(uintptr_t)modules[i].bm_name,
1236			    type_to_str(modules[i].bm_type),
1237			    (ulong_t)modules[i].bm_addr,
1238			    (ulong_t)modules[i].bm_size);
1239		}
1240
1241		if (modules[i].bm_type == BMT_HASH ||
1242		    modules[i].bm_hash == (native_ptr_t)(uintptr_t)NULL) {
1243			DBG_MSG("module has no hash; skipping check\n");
1244			continue;
1245		}
1246		(void) memcpy(displayhash,
1247		    (void *)(uintptr_t)modules[i].bm_hash,
1248		    SHA1_ASCII_LENGTH);
1249		displayhash[SHA1_ASCII_LENGTH] = '\0';
1250		if (prom_debug) {
1251			dboot_printf("checking expected hash [%s]: ",
1252			    displayhash);
1253		}
1254
1255		if (check_image_hash(i) != 0)
1256			dboot_panic("hash mismatch!\n");
1257		else
1258			DBG_MSG("OK\n");
1259	}
1260}
1261
1262/*
1263 * Determine the module's starting address, size, name, and type, and fill the
1264 * boot_modules structure.  This structure is used by the bop code, except for
1265 * hashes which are checked prior to transferring control to the kernel.
1266 */
1267static void
1268process_module(int midx)
1269{
1270	uint32_t mod_start = dboot_multiboot_modstart(midx);
1271	uint32_t mod_end = dboot_multiboot_modend(midx);
1272	char *cmdline = dboot_multiboot_modcmdline(midx);
1273	char *p, *q;
1274
1275	check_higher(mod_end);
1276	if (prom_debug) {
1277		dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1278		    midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
1279	}
1280
1281	if (mod_start > mod_end) {
1282		dboot_panic("module #%d: module start address 0x%lx greater "
1283		    "than end address 0x%lx", midx,
1284		    (ulong_t)mod_start, (ulong_t)mod_end);
1285	}
1286
1287	/*
1288	 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1289	 * the address of the last valid byte in a module plus 1 as mod_end.
1290	 * This is of course a bug; the multiboot specification simply states
1291	 * that mod_start and mod_end "contain the start and end addresses of
1292	 * the boot module itself" which is pretty obviously not what GRUB is
1293	 * doing.  However, fixing it requires that not only this code be
1294	 * changed but also that other code consuming this value and values
1295	 * derived from it be fixed, and that the kernel and GRUB must either
1296	 * both have the bug or neither.  While there are a lot of combinations
1297	 * that will work, there are also some that won't, so for simplicity
1298	 * we'll just cope with the bug.  That means we won't actually hash the
1299	 * byte at mod_end, and we will expect that mod_end for the hash file
1300	 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1301	 * hash plus a newline for each module).  We set bm_size to the true
1302	 * correct number of bytes in each module, achieving exactly this.
1303	 */
1304
1305	modules[midx].bm_addr = (native_ptr_t)(uintptr_t)mod_start;
1306	modules[midx].bm_size = mod_end - mod_start;
1307	modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
1308	modules[midx].bm_hash = (native_ptr_t)(uintptr_t)NULL;
1309	modules[midx].bm_type = BMT_FILE;
1310
1311	if (cmdline == NULL) {
1312		modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
1313		return;
1314	}
1315
1316	p = cmdline;
1317	modules[midx].bm_name =
1318	    (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
1319
1320	while (p != NULL) {
1321		q = strsep(&p, " \t\f\n\r");
1322		if (strncmp(q, "name=", 5) == 0) {
1323			if (q[5] != '\0' && !isspace(q[5])) {
1324				modules[midx].bm_name =
1325				    (native_ptr_t)(uintptr_t)(q + 5);
1326			}
1327			continue;
1328		}
1329
1330		if (strncmp(q, "type=", 5) == 0) {
1331			if (q[5] == '\0' || isspace(q[5]))
1332				continue;
1333			q += 5;
1334			if (strcmp(q, "rootfs") == 0) {
1335				modules[midx].bm_type = BMT_ROOTFS;
1336			} else if (strcmp(q, "hash") == 0) {
1337				modules[midx].bm_type = BMT_HASH;
1338			} else if (strcmp(q, "environment") == 0) {
1339				modules[midx].bm_type = BMT_ENV;
1340			} else if (strcmp(q, "console-font") == 0) {
1341				modules[midx].bm_type = BMT_FONT;
1342			} else if (strcmp(q, "file") != 0) {
1343				dboot_printf("\tmodule #%d: unknown module "
1344				    "type '%s'; defaulting to 'file'\n",
1345				    midx, q);
1346			}
1347			continue;
1348		}
1349
1350		if (strncmp(q, "hash=", 5) == 0) {
1351			if (q[5] != '\0' && !isspace(q[5])) {
1352				modules[midx].bm_hash =
1353				    (native_ptr_t)(uintptr_t)(q + 5);
1354			}
1355			continue;
1356		}
1357
1358		dboot_printf("ignoring unknown option '%s'\n", q);
1359	}
1360}
1361
1362/*
1363 * Backward compatibility: if there are exactly one or two modules, both
1364 * of type 'file' and neither with an embedded hash value, we have been
1365 * given the legacy style modules.  In this case we need to treat the first
1366 * module as a rootfs and the second as a hash referencing that module.
1367 * Otherwise, even if the configuration is invalid, we assume that the
1368 * operator knows what he's doing or at least isn't being bitten by this
1369 * interface change.
1370 */
1371static void
1372fixup_modules(void)
1373{
1374	if (modules_used == 0 || modules_used > 2)
1375		return;
1376
1377	if (modules[0].bm_type != BMT_FILE ||
1378	    modules_used > 1 && modules[1].bm_type != BMT_FILE) {
1379		return;
1380	}
1381
1382	if (modules[0].bm_hash != (native_ptr_t)(uintptr_t)NULL ||
1383	    modules_used > 1 &&
1384	    modules[1].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1385		return;
1386	}
1387
1388	modules[0].bm_type = BMT_ROOTFS;
1389	if (modules_used > 1) {
1390		modules[1].bm_type = BMT_HASH;
1391		modules[1].bm_name = modules[0].bm_name;
1392	}
1393}
1394
1395/*
1396 * For modules that do not have assigned hashes but have a separate hash module,
1397 * find the assigned hash module and set the primary module's bm_hash to point
1398 * to the hash data from that module.  We will then ignore modules of type
1399 * BMT_HASH from this point forward.
1400 */
1401static void
1402assign_module_hashes(void)
1403{
1404	uint_t i, j;
1405
1406	for (i = 0; i < modules_used; i++) {
1407		if (modules[i].bm_type == BMT_HASH ||
1408		    modules[i].bm_hash != (native_ptr_t)(uintptr_t)NULL) {
1409			continue;
1410		}
1411
1412		for (j = 0; j < modules_used; j++) {
1413			if (modules[j].bm_type != BMT_HASH ||
1414			    strcmp((char *)(uintptr_t)modules[j].bm_name,
1415			    (char *)(uintptr_t)modules[i].bm_name) != 0) {
1416				continue;
1417			}
1418
1419			if (modules[j].bm_size < SHA1_ASCII_LENGTH) {
1420				dboot_printf("Short hash module of length "
1421				    "0x%lx bytes; ignoring\n",
1422				    (ulong_t)modules[j].bm_size);
1423			} else {
1424				modules[i].bm_hash = modules[j].bm_addr;
1425			}
1426			break;
1427		}
1428	}
1429}
1430
1431/*
1432 * Walk through the module information finding the last used address.
1433 * The first available address will become the top level page table.
1434 */
1435static void
1436dboot_process_modules(void)
1437{
1438	int i, modcount;
1439	extern char _end[];
1440
1441	DBG_MSG("\nFinding Modules\n");
1442	modcount = dboot_multiboot_modcount();
1443	if (modcount > MAX_BOOT_MODULES) {
1444		dboot_panic("Too many modules (%d) -- the maximum is %d.",
1445		    modcount, MAX_BOOT_MODULES);
1446	}
1447	/*
1448	 * search the modules to find the last used address
1449	 * we'll build the module list while we're walking through here
1450	 */
1451	check_higher((paddr_t)(uintptr_t)&_end);
1452	for (i = 0; i < modcount; ++i) {
1453		process_module(i);
1454		modules_used++;
1455	}
1456	bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
1457	DBG(bi->bi_modules);
1458	bi->bi_module_cnt = modcount;
1459	DBG(bi->bi_module_cnt);
1460
1461	fixup_modules();
1462	assign_module_hashes();
1463	check_images();
1464}
1465
1466/*
1467 * We then build the phys_install memlist from the multiboot information.
1468 */
1469static void
1470dboot_process_mmap(void)
1471{
1472	uint64_t start;
1473	uint64_t end;
1474	uint64_t page_offset = MMU_PAGEOFFSET;	/* needs to be 64 bits */
1475	uint32_t lower, upper;
1476	int i, mmap_entries;
1477
1478	/*
1479	 * Walk through the memory map from multiboot and build our memlist
1480	 * structures. Note these will have native format pointers.
1481	 */
1482	DBG_MSG("\nFinding Memory Map\n");
1483	num_entries = 0;
1484	num_entries_set = B_FALSE;
1485	max_mem = 0;
1486	if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
1487		for (i = 0; i < mmap_entries; i++) {
1488			uint32_t type = dboot_loader_mmap_get_type(i);
1489			start = dboot_loader_mmap_get_base(i);
1490			end = start + dboot_loader_mmap_get_length(i);
1491
1492			if (prom_debug)
1493				dboot_printf("\ttype: %d %" PRIx64 "..%"
1494				    PRIx64 "\n", type, start, end);
1495
1496			/*
1497			 * page align start and end
1498			 */
1499			start = (start + page_offset) & ~page_offset;
1500			end &= ~page_offset;
1501			if (end <= start)
1502				continue;
1503
1504			/*
1505			 * only type 1 is usable RAM
1506			 */
1507			switch (type) {
1508			case 1:
1509				if (end > max_mem)
1510					max_mem = end;
1511				memlists[memlists_used].addr = start;
1512				memlists[memlists_used].size = end - start;
1513				++memlists_used;
1514				if (memlists_used > MAX_MEMLIST)
1515					dboot_panic("too many memlists");
1516				break;
1517			case 2:
1518				rsvdmemlists[rsvdmemlists_used].addr = start;
1519				rsvdmemlists[rsvdmemlists_used].size =
1520				    end - start;
1521				++rsvdmemlists_used;
1522				if (rsvdmemlists_used > MAX_MEMLIST)
1523					dboot_panic("too many rsvdmemlists");
1524				break;
1525			default:
1526				continue;
1527			}
1528		}
1529		build_pcimemlists();
1530	} else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
1531		DBG(lower);
1532		memlists[memlists_used].addr = 0;
1533		memlists[memlists_used].size = lower * 1024;
1534		++memlists_used;
1535		DBG(upper);
1536		memlists[memlists_used].addr = 1024 * 1024;
1537		memlists[memlists_used].size = upper * 1024;
1538		++memlists_used;
1539
1540		/*
1541		 * Old platform - assume I/O space at the end of memory.
1542		 */
1543		pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
1544		pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
1545		pcimemlists[0].next = 0;
1546		pcimemlists[0].prev = 0;
1547		bi->bi_pcimem = (native_ptr_t)(uintptr_t)pcimemlists;
1548		DBG(bi->bi_pcimem);
1549	} else {
1550		dboot_panic("No memory info from boot loader!!!");
1551	}
1552
1553	/*
1554	 * finish processing the physinstall list
1555	 */
1556	sort_physinstall();
1557
1558	/*
1559	 * build bios reserved mem lists
1560	 */
1561	build_rsvdmemlists();
1562}
1563
1564/*
1565 * The highest address is used as the starting point for dboot's simple
1566 * memory allocator.
1567 *
1568 * Finding the highest address in case of Multiboot 1 protocol is
1569 * quite painful in the sense that some information provided by
1570 * the multiboot info structure points to BIOS data, and some to RAM.
1571 *
1572 * The module list was processed and checked already by dboot_process_modules(),
1573 * so we will check the command line string and the memory map.
1574 *
1575 * This list of to be checked items is based on our current knowledge of
1576 * allocations made by grub1 and will need to be reviewed if there
1577 * are updates about the information provided by Multiboot 1.
1578 *
1579 * In the case of the Multiboot 2, our life is much simpler, as the MB2
1580 * information tag list is one contiguous chunk of memory.
1581 */
1582static paddr_t
1583dboot_multiboot1_highest_addr(void)
1584{
1585	paddr_t addr = (paddr_t)(uintptr_t)NULL;
1586	char *cmdl = (char *)mb_info->cmdline;
1587
1588	if (mb_info->flags & MB_INFO_CMDLINE)
1589		addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1));
1590
1591	if (mb_info->flags & MB_INFO_MEM_MAP)
1592		addr = MAX(addr,
1593		    ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length)));
1594	return (addr);
1595}
1596
1597static void
1598dboot_multiboot_highest_addr(void)
1599{
1600	paddr_t addr;
1601
1602	switch (multiboot_version) {
1603	case 1:
1604		addr = dboot_multiboot1_highest_addr();
1605		if (addr != (paddr_t)(uintptr_t)NULL)
1606			check_higher(addr);
1607		break;
1608	case 2:
1609		addr = dboot_multiboot2_highest_addr(mb2_info);
1610		if (addr != (paddr_t)(uintptr_t)NULL)
1611			check_higher(addr);
1612		break;
1613	default:
1614		dboot_panic("Unknown multiboot version: %d\n",
1615		    multiboot_version);
1616		break;
1617	}
1618}
1619
/*
 * Walk the boot loader provided information and find the highest free
 * address.  Three passes are made, in this order: the boot modules, the
 * memory map, and finally the remaining multiboot information.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");

	/* Modules: builds modules[] and verifies their hashes. */
	dboot_process_modules();

	/* Memory map: builds the installed and reserved memory lists. */
	dboot_process_mmap();

	/* Whatever else the loader left in memory for us. */
	dboot_multiboot_highest_addr();
}
1631
1632static int
1633dboot_same_guids(efi_guid_t *g1, efi_guid_t *g2)
1634{
1635	int i;
1636
1637	if (g1->time_low != g2->time_low)
1638		return (0);
1639	if (g1->time_mid != g2->time_mid)
1640		return (0);
1641	if (g1->time_hi_and_version != g2->time_hi_and_version)
1642		return (0);
1643	if (g1->clock_seq_hi_and_reserved != g2->clock_seq_hi_and_reserved)
1644		return (0);
1645	if (g1->clock_seq_low != g2->clock_seq_low)
1646		return (0);
1647
1648	for (i = 0; i < 6; i++) {
1649		if (g1->node_addr[i] != g2->node_addr[i])
1650			return (0);
1651	}
1652	return (1);
1653}
1654
1655static void
1656process_efi32(EFI_SYSTEM_TABLE32 *efi)
1657{
1658	uint32_t entries;
1659	EFI_CONFIGURATION_TABLE32 *config;
1660	efi_guid_t VendorGuid;
1661	int i;
1662
1663	entries = efi->NumberOfTableEntries;
1664	config = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1665	    efi->ConfigurationTable;
1666
1667	for (i = 0; i < entries; i++) {
1668		(void) memcpy(&VendorGuid, &config[i].VendorGuid,
1669		    sizeof (VendorGuid));
1670		if (dboot_same_guids(&VendorGuid, &smbios3)) {
1671			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1672			    config[i].VendorTable;
1673		}
1674		if (bi->bi_smbios == 0 &&
1675		    dboot_same_guids(&VendorGuid, &smbios)) {
1676			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1677			    config[i].VendorTable;
1678		}
1679		if (dboot_same_guids(&VendorGuid, &acpi2)) {
1680			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1681			    config[i].VendorTable;
1682		}
1683		if (bi->bi_acpi_rsdp == 0 &&
1684		    dboot_same_guids(&VendorGuid, &acpi1)) {
1685			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1686			    config[i].VendorTable;
1687		}
1688	}
1689}
1690
1691static void
1692process_efi64(EFI_SYSTEM_TABLE64 *efi)
1693{
1694	uint64_t entries;
1695	EFI_CONFIGURATION_TABLE64 *config;
1696	efi_guid_t VendorGuid;
1697	int i;
1698
1699	entries = efi->NumberOfTableEntries;
1700	config = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1701	    efi->ConfigurationTable;
1702
1703	for (i = 0; i < entries; i++) {
1704		(void) memcpy(&VendorGuid, &config[i].VendorGuid,
1705		    sizeof (VendorGuid));
1706		if (dboot_same_guids(&VendorGuid, &smbios3)) {
1707			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1708			    config[i].VendorTable;
1709		}
1710		if (bi->bi_smbios == 0 &&
1711		    dboot_same_guids(&VendorGuid, &smbios)) {
1712			bi->bi_smbios = (native_ptr_t)(uintptr_t)
1713			    config[i].VendorTable;
1714		}
1715		/* Prefer acpi v2+ over v1. */
1716		if (dboot_same_guids(&VendorGuid, &acpi2)) {
1717			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1718			    config[i].VendorTable;
1719		}
1720		if (bi->bi_acpi_rsdp == 0 &&
1721		    dboot_same_guids(&VendorGuid, &acpi1)) {
1722			bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)
1723			    config[i].VendorTable;
1724		}
1725	}
1726}
1727
1728static void
1729dboot_multiboot_get_fwtables(void)
1730{
1731	multiboot_tag_new_acpi_t *nacpitagp;
1732	multiboot_tag_old_acpi_t *oacpitagp;
1733	multiboot_tag_efi64_t *efi64tagp = NULL;
1734	multiboot_tag_efi32_t *efi32tagp = NULL;
1735
1736	/* no fw tables from multiboot 1 */
1737	if (multiboot_version != 2)
1738		return;
1739
1740	efi64tagp = (multiboot_tag_efi64_t *)
1741	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_EFI64);
1742	if (efi64tagp != NULL) {
1743		bi->bi_uefi_arch = XBI_UEFI_ARCH_64;
1744		bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1745		    efi64tagp->mb_pointer;
1746		process_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
1747		    efi64tagp->mb_pointer);
1748	} else {
1749		efi32tagp = (multiboot_tag_efi32_t *)
1750		    dboot_multiboot2_find_tag(mb2_info,
1751		    MULTIBOOT_TAG_TYPE_EFI32);
1752		if (efi32tagp != NULL) {
1753			bi->bi_uefi_arch = XBI_UEFI_ARCH_32;
1754			bi->bi_uefi_systab = (native_ptr_t)(uintptr_t)
1755			    efi32tagp->mb_pointer;
1756			process_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
1757			    efi32tagp->mb_pointer);
1758		}
1759	}
1760
1761	/*
1762	 * The multiboot2 info contains a copy of the RSDP; stash a pointer to
1763	 * it (see find_rsdp() in fakebop).
1764	 */
1765	nacpitagp = (multiboot_tag_new_acpi_t *)
1766	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_NEW);
1767	oacpitagp = (multiboot_tag_old_acpi_t *)
1768	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_OLD);
1769
1770	if (nacpitagp != NULL) {
1771		bi->bi_acpi_rsdp_copy = (native_ptr_t)(uintptr_t)
1772		    &nacpitagp->mb_rsdp[0];
1773	} else if (oacpitagp != NULL) {
1774		bi->bi_acpi_rsdp_copy = (native_ptr_t)(uintptr_t)
1775		    &oacpitagp->mb_rsdp[0];
1776	}
1777}
1778
/*
 * Print an EFI revision value followed by a newline.  The output is
 * "major.N" where N is the minor field divided by ten; when the minor field
 * is not a multiple of ten its last digit is appended as a third component.
 */
static void
dboot_print_efi_version(uint32_t ver)
{
	int minor = EFI_REV_MINOR(ver);
	int tens = minor / 10;
	int ones = minor % 10;

	dboot_printf("%d.", EFI_REV_MAJOR(ver));

	if (ones == 0)
		dboot_printf("%d\n", tens);
	else
		dboot_printf("%d.%d\n", tens, ones);
}
1794
1795static void
1796print_efi32(EFI_SYSTEM_TABLE32 *efi)
1797{
1798	uint16_t *data;
1799	EFI_CONFIGURATION_TABLE32 *conf;
1800	int i;
1801
1802	dboot_printf("EFI32 signature: %llx\n",
1803	    (unsigned long long)efi->Hdr.Signature);
1804	dboot_printf("EFI system version: ");
1805	dboot_print_efi_version(efi->Hdr.Revision);
1806	dboot_printf("EFI system vendor: ");
1807	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1808	for (i = 0; data[i] != 0; i++)
1809		dboot_printf("%c", (char)data[i]);
1810	dboot_printf("\nEFI firmware revision: ");
1811	dboot_print_efi_version(efi->FirmwareRevision);
1812	dboot_printf("EFI system table number of entries: %d\n",
1813	    efi->NumberOfTableEntries);
1814	conf = (EFI_CONFIGURATION_TABLE32 *)(uintptr_t)
1815	    efi->ConfigurationTable;
1816	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1817		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1818		    conf[i].VendorGuid.time_low,
1819		    conf[i].VendorGuid.time_mid,
1820		    conf[i].VendorGuid.time_hi_and_version,
1821		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
1822		    conf[i].VendorGuid.clock_seq_low);
1823		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1824		    conf[i].VendorGuid.node_addr[0],
1825		    conf[i].VendorGuid.node_addr[1],
1826		    conf[i].VendorGuid.node_addr[2],
1827		    conf[i].VendorGuid.node_addr[3],
1828		    conf[i].VendorGuid.node_addr[4],
1829		    conf[i].VendorGuid.node_addr[5]);
1830	}
1831}
1832
1833static void
1834print_efi64(EFI_SYSTEM_TABLE64 *efi)
1835{
1836	uint16_t *data;
1837	EFI_CONFIGURATION_TABLE64 *conf;
1838	int i;
1839
1840	dboot_printf("EFI64 signature: %llx\n",
1841	    (unsigned long long)efi->Hdr.Signature);
1842	dboot_printf("EFI system version: ");
1843	dboot_print_efi_version(efi->Hdr.Revision);
1844	dboot_printf("EFI system vendor: ");
1845	data = (uint16_t *)(uintptr_t)efi->FirmwareVendor;
1846	for (i = 0; data[i] != 0; i++)
1847		dboot_printf("%c", (char)data[i]);
1848	dboot_printf("\nEFI firmware revision: ");
1849	dboot_print_efi_version(efi->FirmwareRevision);
1850	dboot_printf("EFI system table number of entries: %" PRIu64 "\n",
1851	    efi->NumberOfTableEntries);
1852	conf = (EFI_CONFIGURATION_TABLE64 *)(uintptr_t)
1853	    efi->ConfigurationTable;
1854	for (i = 0; i < (int)efi->NumberOfTableEntries; i++) {
1855		dboot_printf("%d: 0x%x 0x%x 0x%x 0x%x 0x%x", i,
1856		    conf[i].VendorGuid.time_low,
1857		    conf[i].VendorGuid.time_mid,
1858		    conf[i].VendorGuid.time_hi_and_version,
1859		    conf[i].VendorGuid.clock_seq_hi_and_reserved,
1860		    conf[i].VendorGuid.clock_seq_low);
1861		dboot_printf(" 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
1862		    conf[i].VendorGuid.node_addr[0],
1863		    conf[i].VendorGuid.node_addr[1],
1864		    conf[i].VendorGuid.node_addr[2],
1865		    conf[i].VendorGuid.node_addr[3],
1866		    conf[i].VendorGuid.node_addr[4],
1867		    conf[i].VendorGuid.node_addr[5]);
1868	}
1869}
1870#endif /* !__xpv */
1871
/*
 * Simple memory allocator, allocates aligned physical memory.
 * Note that startup_kernel() only allocates memory, never frees.
 * Memory usage just grows in an upward direction.
 *
 * size is rounded up to a multiple of MMU_PAGESIZE and next_avail_addr is
 * rounded up to align before the search.  The memory is taken at
 * next_avail_addr when that range lies completely inside one memlists[]
 * entry; otherwise it is taken at the aligned start of the lowest entry
 * above next_avail_addr that is large enough.  The returned memory is
 * zero-filled and next_avail_addr is advanced past it.
 */
static void *
do_mem_alloc(uint32_t size, uint32_t align)
{
	uint_t i;
	uint64_t best;
	uint64_t start;
	uint64_t end;

	/*
	 * make sure size is a multiple of pagesize
	 */
	size = RNDUP(size, MMU_PAGESIZE);
	next_avail_addr = RNDUP(next_avail_addr, align);

	/*
	 * XXPV fixme joe
	 *
	 * a really large bootarchive that causes you to run out of memory
	 * may cause this to blow up
	 *
	 * NOTE(review): best starts out as the negated size, which acts as
	 * an upper bound for the "start < best" test below.  If no memlists[]
	 * entry qualifies, that initial value is used as the allocation
	 * address without any error being raised here.
	 */
	/* LINTED E_UNEXPECTED_UINT_PROMOTION */
	best = (uint64_t)-size;
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
#if defined(__xpv)
		start += mfn_base;
#endif
		end = start + memlists[i].size;

		/*
		 * did we find the desired address?
		 */
		if (start <= next_avail_addr && next_avail_addr + size <= end) {
			best = next_avail_addr;
			goto done;
		}

		/*
		 * If not, is this address the best so far?  A candidate must
		 * start above next_avail_addr, be lower than the best one
		 * found so far and still hold size bytes once its start has
		 * been aligned.
		 */
		if (start > next_avail_addr && start < best &&
		    RNDUP(start, align) + size <= end)
			best = RNDUP(start, align);
	}

	/*
	 * We didn't find exactly the address we wanted, due to going off the
	 * end of a memory region. Return the best found memory address.
	 */
done:
	next_avail_addr = best + size;
#if defined(__xpv)
	if (next_avail_addr > scratch_end)
		dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
		    "0x%lx", (ulong_t)next_avail_addr,
		    (ulong_t)scratch_end);
#endif
	/* Callers always get zero-filled memory. */
	(void) memset((void *)(uintptr_t)best, 0, size);
	return ((void *)(uintptr_t)best);
}
1937
1938void *
1939mem_alloc(uint32_t size)
1940{
1941	return (do_mem_alloc(size, MMU_PAGESIZE));
1942}
1943
1944
/*
 * Build page tables to map all of memory used so far as well as the kernel.
 *
 * In order, this routine: obtains the top level page table, maps the kernel
 * at target_kernel_text, sets up the one page page-table window for the
 * kernel, creates 1:1 mappings for the low 1M and, when not running under
 * the hypervisor, creates 1:1 mappings for the installed memory below
 * next_avail_addr and for the framebuffer.
 */
static void
build_page_tables(void)
{
	uint32_t psize;
	uint32_t level;
	uint32_t off;
	uint64_t start;
#if !defined(__xpv)
	uint32_t i;
	uint64_t end;
#endif	/* !__xpv */

	/*
	 * If we're on metal, we need to create the top level pagetable.
	 * Under the hypervisor the one named by xen_info->pt_base is used.
	 */
#if defined(__xpv)
	top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base;
#else /* __xpv */
	top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
#endif /* __xpv */
	DBG((uintptr_t)top_page_table);

	/*
	 * Determine if we'll use large mappings for kernel, then map it.
	 * Large pages are mapped at level 1, regular pages at level 0.
	 */
	if (largepage_support) {
		psize = lpagesize;
		level = 1;
	} else {
		psize = MMU_PAGESIZE;
		level = 0;
	}

	DBG_MSG("Mapping kernel\n");
	DBG(ktext_phys);
	DBG(target_kernel_text);
	DBG(ksize);
	DBG(psize);
	/* Map ksize bytes of kernel, one psize sized page at a time. */
	for (off = 0; off < ksize; off += psize)
		map_pa_at_va(ktext_phys + off, target_kernel_text + off, level);

	/*
	 * The kernel will need a 1 page window to work with page tables.
	 * Both the window and the location of the PTE that maps it are
	 * handed over through bi.
	 */
	bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE);
	DBG(bi->bi_pt_window);
	bi->bi_pte_to_pt_window =
	    (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0);
	DBG(bi->bi_pte_to_pt_window);

#if defined(__xpv)
	if (!DOMAIN_IS_INITDOMAIN(xen_info)) {
		/* If this is a domU we're done. */
		DBG_MSG("\nPage tables constructed\n");
		return;
	}
#endif /* __xpv */

	/*
	 * We need 1:1 mappings for the lower 1M of memory to access
	 * BIOS tables used by a couple of drivers during boot.
	 *
	 * The following code works because our simple memory allocator
	 * only grows usage in an upwards direction.
	 *
	 * Note that by this point in boot some mappings for low memory
	 * may already exist because we've already accessed devices in low
	 * memory.  (Specifically the video frame buffer and keyboard
	 * status ports.)  If we're booting on raw hardware then GRUB
	 * created these mappings for us.  If we're booting under a
	 * hypervisor then we went ahead and remapped these devices into
	 * memory allocated within dboot itself.
	 */
	if (map_debug)
		dboot_printf("1:1 map pa=0..1Meg\n");
	for (start = 0; start < 1024 * 1024; start += MMU_PAGESIZE) {
#if defined(__xpv)
		map_ma_at_va(start, start, 0);
#else /* __xpv */
		map_pa_at_va(start, start, 0);
#endif /* __xpv */
	}

#if !defined(__xpv)

	/*
	 * 1:1 map the installed memory, one level 0 page at a time, up to
	 * but not including next_avail_addr.  The walk stops at the first
	 * memlists[] entry that reaches next_avail_addr.
	 */
	for (i = 0; i < memlists_used; ++i) {
		start = memlists[i].addr;
		end = start + memlists[i].size;

		if (map_debug)
			dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		while (start < end && start < next_avail_addr) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		if (start >= next_avail_addr)
			break;
	}

	/*
	 * Map framebuffer memory as PT_NOCACHE as this is memory from a
	 * device and therefore must not be cached.
	 */
	if (fb != NULL && fb->framebuffer != 0) {
		multiboot_tag_framebuffer_t *fb_tagp;
		fb_tagp = (multiboot_tag_framebuffer_t *)(uintptr_t)
		    fb->framebuffer;

		/* The mapped range covers height * pitch bytes. */
		start = fb_tagp->framebuffer_common.framebuffer_addr;
		end = start + fb_tagp->framebuffer_common.framebuffer_height *
		    fb_tagp->framebuffer_common.framebuffer_pitch;

		if (map_debug)
			dboot_printf("FB 1:1 map pa=%" PRIx64 "..%" PRIx64 "\n",
			    start, end);
		/*
		 * The extra attribute bits are set in pte_bits only for the
		 * duration of the framebuffer mapping and are cleared again
		 * below.
		 */
		pte_bits |= PT_NOCACHE;
		if (PAT_support != 0)
			pte_bits |= PT_PAT_4K;

		while (start < end) {
			map_pa_at_va(start, start, 0);
			start += MMU_PAGESIZE;
		}
		pte_bits &= ~PT_NOCACHE;
		if (PAT_support != 0)
			pte_bits &= ~PT_PAT_4K;
	}
#endif /* !__xpv */

	DBG_MSG("\nPage tables constructed\n");
}
2080
2081#define	NO_MULTIBOOT	\
2082"multiboot is no longer used to boot the Solaris Operating System.\n\
2083The grub entry should be changed to:\n\
2084kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
2085module$ /platform/i86pc/$ISADIR/boot_archive\n\
2086See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
2087
2088static void
2089dboot_init_xboot_consinfo(void)
2090{
2091	bi = &boot_info;
2092
2093#if !defined(__xpv)
2094	fb = &framebuffer;
2095	bi->bi_framebuffer = (native_ptr_t)(uintptr_t)fb;
2096
2097	switch (multiboot_version) {
2098	case 1:
2099		dboot_multiboot1_xboot_consinfo();
2100		break;
2101	case 2:
2102		dboot_multiboot2_xboot_consinfo();
2103		break;
2104	default:
2105		dboot_panic("Unknown multiboot version: %d\n",
2106		    multiboot_version);
2107		break;
2108	}
2109	dboot_find_console_modules();
2110#endif
2111}
2112
2113/*
2114 * Set up basic data from the boot loader.
2115 * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
2116 * 32-bit dboot code setup used to set up and start 64-bit kernel.
2117 * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
2118 * start 64-bit illumos kernel.
2119 */
2120static void
2121dboot_loader_init(void)
2122{
2123#if !defined(__xpv)
2124	mb_info = NULL;
2125	mb2_info = NULL;
2126
2127	switch (mb_magic) {
2128	case MB_BOOTLOADER_MAGIC:
2129		multiboot_version = 1;
2130		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
2131#if defined(_BOOT_TARGET_amd64)
2132		load_addr = mb_header.load_addr;
2133#endif
2134		break;
2135
2136	case MULTIBOOT2_BOOTLOADER_MAGIC:
2137		multiboot_version = 2;
2138		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
2139		mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
2140#if defined(_BOOT_TARGET_amd64)
2141		load_addr = mb2_load_addr;
2142#endif
2143		break;
2144
2145	default:
2146		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
2147		break;
2148	}
2149#endif	/* !defined(__xpv) */
2150}
2151
2152/* Extract the kernel command line from [multi]boot information. */
2153static char *
2154dboot_loader_cmdline(void)
2155{
2156	char *line = NULL;
2157
2158#if defined(__xpv)
2159	line = (char *)xen_info->cmd_line;
2160#else /* __xpv */
2161
2162	switch (multiboot_version) {
2163	case 1:
2164		if (mb_info->flags & MB_INFO_CMDLINE)
2165			line = (char *)mb_info->cmdline;
2166		break;
2167
2168	case 2:
2169		line = dboot_multiboot2_cmdline(mb2_info);
2170		break;
2171
2172	default:
2173		dboot_panic("Unknown multiboot version: %d\n",
2174		    multiboot_version);
2175		break;
2176	}
2177
2178#endif /* __xpv */
2179
2180	/*
2181	 * Make sure we have valid pointer so the string operations
2182	 * will not crash us.
2183	 */
2184	if (line == NULL)
2185		line = "";
2186
2187	return (line);
2188}
2189
2190static char *
2191dboot_loader_name(void)
2192{
2193#if defined(__xpv)
2194	return (NULL);
2195#else /* __xpv */
2196	multiboot_tag_string_t *tag;
2197
2198	switch (multiboot_version) {
2199	case 1:
2200		return ((char *)(uintptr_t)mb_info->boot_loader_name);
2201
2202	case 2:
2203		tag = dboot_multiboot2_find_tag(mb2_info,
2204		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
2205		return (tag->mb_string);
2206	default:
2207		dboot_panic("Unknown multiboot version: %d\n",
2208		    multiboot_version);
2209		break;
2210	}
2211
2212	return (NULL);
2213#endif /* __xpv */
2214}
2215
2216/*
2217 * startup_kernel has a pretty simple job. It builds pagetables which reflect
2218 * 1:1 mappings for all memory in use. It then also adds mappings for
2219 * the kernel nucleus at virtual address of target_kernel_text using large page
2220 * mappings. The page table pages are also accessible at 1:1 mapped
2221 * virtual addresses.
2222 */
2223/*ARGSUSED*/
2224void
2225startup_kernel(void)
2226{
2227	char *cmdline;
2228	char *bootloader;
2229#if defined(__xpv)
2230	physdev_set_iopl_t set_iopl;
2231#endif /* __xpv */
2232
2233	if (dboot_debug == 1)
2234		bcons_init(NULL);	/* Set very early console to ttya. */
2235	dboot_loader_init();
2236	/*
2237	 * At this point we are executing in a 32 bit real mode.
2238	 */
2239
2240	bootloader = dboot_loader_name();
2241	cmdline = dboot_loader_cmdline();
2242
2243#if defined(__xpv)
2244	/*
2245	 * For dom0, before we initialize the console subsystem we'll
2246	 * need to enable io operations, so set I/O priveldge level to 1.
2247	 */
2248	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
2249		set_iopl.iopl = 1;
2250		(void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
2251	}
2252#endif /* __xpv */
2253
2254	dboot_init_xboot_consinfo();
2255	bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
2256	bcons_init(bi);		/* Now we can set the real console. */
2257
2258	prom_debug = (find_boot_prop("prom_debug") != NULL);
2259	map_debug = (find_boot_prop("map_debug") != NULL);
2260
2261#if !defined(__xpv)
2262	dboot_multiboot_get_fwtables();
2263#endif
2264	DBG_MSG("\n\nillumos prekernel set: ");
2265	DBG_MSG(cmdline);
2266	DBG_MSG("\n");
2267
2268	if (bootloader != NULL && prom_debug) {
2269		dboot_printf("Kernel loaded by: %s\n", bootloader);
2270#if !defined(__xpv)
2271		dboot_printf("Using multiboot %d boot protocol.\n",
2272		    multiboot_version);
2273#endif
2274	}
2275
2276	if (strstr(cmdline, "multiboot") != NULL) {
2277		dboot_panic(NO_MULTIBOOT);
2278	}
2279
2280	DBG((uintptr_t)bi);
2281#if !defined(__xpv)
2282	DBG((uintptr_t)mb_info);
2283	DBG((uintptr_t)mb2_info);
2284	if (mb2_info != NULL)
2285		DBG(mb2_info->mbi_total_size);
2286	DBG(bi->bi_acpi_rsdp);
2287	DBG(bi->bi_acpi_rsdp_copy);
2288	DBG(bi->bi_smbios);
2289	DBG(bi->bi_uefi_arch);
2290	DBG(bi->bi_uefi_systab);
2291
2292	if (bi->bi_uefi_systab && prom_debug) {
2293		if (bi->bi_uefi_arch == XBI_UEFI_ARCH_64) {
2294			print_efi64((EFI_SYSTEM_TABLE64 *)(uintptr_t)
2295			    bi->bi_uefi_systab);
2296		} else {
2297			print_efi32((EFI_SYSTEM_TABLE32 *)(uintptr_t)
2298			    bi->bi_uefi_systab);
2299		}
2300	}
2301#endif
2302
2303	/*
2304	 * Need correct target_kernel_text value
2305	 */
2306	target_kernel_text = KERNEL_TEXT;
2307	DBG(target_kernel_text);
2308
2309#if defined(__xpv)
2310
2311	/*
2312	 * XXPV	Derive this stuff from CPUID / what the hypervisor has enabled
2313	 */
2314
2315#if defined(_BOOT_TARGET_amd64)
2316	/*
2317	 * 64-bit hypervisor.
2318	 */
2319	amd64_support = 1;
2320	pae_support = 1;
2321
2322#else	/* _BOOT_TARGET_amd64 */
2323
2324	/*
2325	 * See if we are running on a PAE Hypervisor
2326	 */
2327	{
2328		xen_capabilities_info_t caps;
2329
2330		if (HYPERVISOR_xen_version(XENVER_capabilities, &caps) != 0)
2331			dboot_panic("HYPERVISOR_xen_version(caps) failed");
2332		caps[sizeof (caps) - 1] = 0;
2333		if (prom_debug)
2334			dboot_printf("xen capabilities %s\n", caps);
2335		if (strstr(caps, "x86_32p") != NULL)
2336			pae_support = 1;
2337	}
2338
2339#endif	/* _BOOT_TARGET_amd64 */
2340	{
2341		xen_platform_parameters_t p;
2342
2343		if (HYPERVISOR_xen_version(XENVER_platform_parameters, &p) != 0)
2344			dboot_panic("HYPERVISOR_xen_version(parms) failed");
2345		DBG(p.virt_start);
2346		mfn_to_pfn_mapping = (pfn_t *)(xen_virt_start = p.virt_start);
2347	}
2348
2349	/*
2350	 * The hypervisor loads stuff starting at 1Gig
2351	 */
2352	mfn_base = ONE_GIG;
2353	DBG(mfn_base);
2354
2355	/*
2356	 * enable writable page table mode for the hypervisor
2357	 */
2358	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2359	    VMASST_TYPE_writable_pagetables) < 0)
2360		dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2361
2362	/*
2363	 * check for NX support
2364	 */
2365	if (pae_support) {
2366		uint32_t eax = 0x80000000;
2367		uint32_t edx = get_cpuid_edx(&eax);
2368
2369		if (eax >= 0x80000001) {
2370			eax = 0x80000001;
2371			edx = get_cpuid_edx(&eax);
2372			if (edx & CPUID_AMD_EDX_NX)
2373				NX_support = 1;
2374		}
2375	}
2376
2377	/*
2378	 * check for PAT support
2379	 */
2380	{
2381		uint32_t eax = 1;
2382		uint32_t edx = get_cpuid_edx(&eax);
2383
2384		if (edx & CPUID_INTC_EDX_PAT)
2385			PAT_support = 1;
2386	}
2387#if !defined(_BOOT_TARGET_amd64)
2388
2389	/*
2390	 * The 32-bit hypervisor uses segmentation to protect itself from
2391	 * guests. This means when a guest attempts to install a flat 4GB
2392	 * code or data descriptor the 32-bit hypervisor will protect itself
2393	 * by silently shrinking the segment such that if the guest attempts
2394	 * any access where the hypervisor lives a #gp fault is generated.
2395	 * The problem is that some applications expect a full 4GB flat
2396	 * segment for their current thread pointer and will use negative
2397	 * offset segment wrap around to access data. TLS support in linux
2398	 * brand is one example of this.
2399	 *
2400	 * The 32-bit hypervisor can catch the #gp fault in these cases
2401	 * and emulate the access without passing the #gp fault to the guest
2402	 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2403	 * Seems like this should have been the default.
2404	 * Either way, we want the hypervisor -- and not Solaris -- to deal
2405	 * to deal with emulating these accesses.
2406	 */
2407	if (HYPERVISOR_vm_assist(VMASST_CMD_enable,
2408	    VMASST_TYPE_4gb_segments) < 0)
2409		dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2410#endif	/* !_BOOT_TARGET_amd64 */
2411
2412#else	/* __xpv */
2413
2414	/*
2415	 * use cpuid to enable MMU features
2416	 */
2417	if (have_cpuid()) {
2418		uint32_t eax, edx;
2419
2420		eax = 1;
2421		edx = get_cpuid_edx(&eax);
2422		if (edx & CPUID_INTC_EDX_PSE)
2423			largepage_support = 1;
2424		if (edx & CPUID_INTC_EDX_PGE)
2425			pge_support = 1;
2426		if (edx & CPUID_INTC_EDX_PAE)
2427			pae_support = 1;
2428		if (edx & CPUID_INTC_EDX_PAT)
2429			PAT_support = 1;
2430
2431		eax = 0x80000000;
2432		edx = get_cpuid_edx(&eax);
2433		if (eax >= 0x80000001) {
2434			eax = 0x80000001;
2435			edx = get_cpuid_edx(&eax);
2436			if (edx & CPUID_AMD_EDX_LM)
2437				amd64_support = 1;
2438			if (edx & CPUID_AMD_EDX_NX)
2439				NX_support = 1;
2440		}
2441	} else {
2442		dboot_printf("cpuid not supported\n");
2443	}
2444#endif /* __xpv */
2445
2446
2447#if defined(_BOOT_TARGET_amd64)
2448	if (amd64_support == 0)
2449		dboot_panic("long mode not supported, rebooting");
2450	else if (pae_support == 0)
2451		dboot_panic("long mode, but no PAE; rebooting");
2452#else
2453	/*
2454	 * Allow the command line to over-ride use of PAE for 32 bit.
2455	 */
2456	if (strstr(cmdline, "disablePAE=true") != NULL) {
2457		pae_support = 0;
2458		NX_support = 0;
2459		amd64_support = 0;
2460	}
2461#endif
2462
2463	/*
2464	 * initialize the simple memory allocator
2465	 */
2466	init_mem_alloc();
2467
2468#if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2469	/*
2470	 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2471	 */
2472	if (max_mem < FOUR_GIG && NX_support == 0)
2473		pae_support = 0;
2474#endif
2475
2476	/*
2477	 * configure mmu information
2478	 */
2479	if (pae_support) {
2480		shift_amt = shift_amt_pae;
2481		ptes_per_table = 512;
2482		pte_size = 8;
2483		lpagesize = TWO_MEG;
2484#if defined(_BOOT_TARGET_amd64)
2485		top_level = 3;
2486#else
2487		top_level = 2;
2488#endif
2489	} else {
2490		pae_support = 0;
2491		NX_support = 0;
2492		shift_amt = shift_amt_nopae;
2493		ptes_per_table = 1024;
2494		pte_size = 4;
2495		lpagesize = FOUR_MEG;
2496		top_level = 1;
2497	}
2498
2499	DBG(PAT_support);
2500	DBG(pge_support);
2501	DBG(NX_support);
2502	DBG(largepage_support);
2503	DBG(amd64_support);
2504	DBG(top_level);
2505	DBG(pte_size);
2506	DBG(ptes_per_table);
2507	DBG(lpagesize);
2508
2509#if defined(__xpv)
2510	ktext_phys = ONE_GIG;		/* from UNIX Mapfile */
2511#else
2512	ktext_phys = FOUR_MEG;		/* from UNIX Mapfile */
2513#endif
2514
2515#if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2516	/*
2517	 * For grub, copy kernel bits from the ELF64 file to final place.
2518	 */
2519	DBG_MSG("\nAllocating nucleus pages.\n");
2520	ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
2521
2522	if (ktext_phys == 0)
2523		dboot_panic("failed to allocate aligned kernel memory");
2524	DBG(load_addr);
2525	if (dboot_elfload64(load_addr) != 0)
2526		dboot_panic("failed to parse kernel ELF image, rebooting");
2527#endif
2528
2529	DBG(ktext_phys);
2530
2531	/*
2532	 * Allocate page tables.
2533	 */
2534	build_page_tables();
2535
2536	/*
2537	 * return to assembly code to switch to running kernel
2538	 */
2539	entry_addr_low = (uint32_t)target_kernel_text;
2540	DBG(entry_addr_low);
2541	bi->bi_use_largepage = largepage_support;
2542	bi->bi_use_pae = pae_support;
2543	bi->bi_use_pge = pge_support;
2544	bi->bi_use_nx = NX_support;
2545
2546#if defined(__xpv)
2547
2548	bi->bi_next_paddr = next_avail_addr - mfn_base;
2549	DBG(bi->bi_next_paddr);
2550	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2551	DBG(bi->bi_next_vaddr);
2552
2553	/*
2554	 * unmap unused pages in start area to make them available for DMA
2555	 */
2556	while (next_avail_addr < scratch_end) {
2557		(void) HYPERVISOR_update_va_mapping(next_avail_addr,
2558		    0, UVMF_INVLPG | UVMF_LOCAL);
2559		next_avail_addr += MMU_PAGESIZE;
2560	}
2561
2562	bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info;
2563	DBG((uintptr_t)HYPERVISOR_shared_info);
2564	bi->bi_shared_info = (native_ptr_t)HYPERVISOR_shared_info;
2565	bi->bi_top_page_table = (uintptr_t)top_page_table - mfn_base;
2566
2567#else /* __xpv */
2568
2569	bi->bi_next_paddr = next_avail_addr;
2570	DBG(bi->bi_next_paddr);
2571	bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr;
2572	DBG(bi->bi_next_vaddr);
2573	bi->bi_mb_version = multiboot_version;
2574
2575	switch (multiboot_version) {
2576	case 1:
2577		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb_info;
2578		break;
2579	case 2:
2580		bi->bi_mb_info = (native_ptr_t)(uintptr_t)mb2_info;
2581		break;
2582	default:
2583		dboot_panic("Unknown multiboot version: %d\n",
2584		    multiboot_version);
2585		break;
2586	}
2587	bi->bi_top_page_table = (uintptr_t)top_page_table;
2588
2589#endif /* __xpv */
2590
2591	bi->bi_kseg_size = FOUR_MEG;
2592	DBG(bi->bi_kseg_size);
2593
2594#ifndef __xpv
2595	if (map_debug)
2596		dump_tables();
2597#endif
2598
2599	DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");
2600
2601#ifndef __xpv
2602	/* Update boot info with FB data */
2603	fb->cursor.origin.x = fb_info.cursor.origin.x;
2604	fb->cursor.origin.y = fb_info.cursor.origin.y;
2605	fb->cursor.pos.x = fb_info.cursor.pos.x;
2606	fb->cursor.pos.y = fb_info.cursor.pos.y;
2607	fb->cursor.visible = fb_info.cursor.visible;
2608#endif
2609}
2610