1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2012 DEY Storage Systems, Inc.  All rights reserved.
25 * Copyright 2017 Nexenta Systems, Inc.
26 * Copyright (c) 2018 Joyent, Inc.
27 * Copyright (c) 2015 by Delphix. All rights reserved.
28 */
29/*
30 * Copyright (c) 2010, Intel Corporation.
31 * All rights reserved.
32 */
33
34#include <sys/types.h>
35#include <sys/t_lock.h>
36#include <sys/param.h>
37#include <sys/sysmacros.h>
38#include <sys/signal.h>
39#include <sys/systm.h>
40#include <sys/user.h>
41#include <sys/mman.h>
42#include <sys/vm.h>
43#include <sys/conf.h>
44#include <sys/avintr.h>
45#include <sys/autoconf.h>
46#include <sys/disp.h>
47#include <sys/class.h>
48#include <sys/bitmap.h>
49
50#include <sys/privregs.h>
51
52#include <sys/proc.h>
53#include <sys/buf.h>
54#include <sys/kmem.h>
55#include <sys/mem.h>
56#include <sys/kstat.h>
57
58#include <sys/reboot.h>
59
60#include <sys/cred.h>
61#include <sys/vnode.h>
62#include <sys/file.h>
63
64#include <sys/procfs.h>
65
66#include <sys/vfs.h>
67#include <sys/cmn_err.h>
68#include <sys/utsname.h>
69#include <sys/debug.h>
70#include <sys/kdi.h>
71
72#include <sys/dumphdr.h>
73#include <sys/bootconf.h>
74#include <sys/memlist_plat.h>
75#include <sys/varargs.h>
76#include <sys/promif.h>
77#include <sys/modctl.h>
78
79#include <sys/sunddi.h>
80#include <sys/sunndi.h>
81#include <sys/ndi_impldefs.h>
82#include <sys/ddidmareq.h>
83#include <sys/psw.h>
84#include <sys/regset.h>
85#include <sys/clock.h>
86#include <sys/pte.h>
87#include <sys/tss.h>
88#include <sys/stack.h>
89#include <sys/trap.h>
90#include <sys/fp.h>
91#include <vm/kboot_mmu.h>
92#include <vm/anon.h>
93#include <vm/as.h>
94#include <vm/page.h>
95#include <vm/seg.h>
96#include <vm/seg_dev.h>
97#include <vm/seg_kmem.h>
98#include <vm/seg_kpm.h>
99#include <vm/seg_map.h>
100#include <vm/seg_vn.h>
101#include <vm/seg_kp.h>
102#include <sys/memnode.h>
103#include <vm/vm_dep.h>
104#include <sys/thread.h>
105#include <sys/sysconf.h>
106#include <sys/vm_machparam.h>
107#include <sys/archsystm.h>
108#include <sys/machsystm.h>
109#include <vm/hat.h>
110#include <vm/hat_i86.h>
111#include <sys/pmem.h>
112#include <sys/smp_impldefs.h>
113#include <sys/x86_archext.h>
114#include <sys/cpuvar.h>
115#include <sys/segments.h>
116#include <sys/clconf.h>
117#include <sys/kobj.h>
118#include <sys/kobj_lex.h>
119#include <sys/cpc_impl.h>
120#include <sys/cpu_module.h>
121#include <sys/smbios.h>
122#include <sys/debug_info.h>
123#include <sys/bootinfo.h>
124#include <sys/ddi_periodic.h>
125#include <sys/systeminfo.h>
126#include <sys/multiboot.h>
127#include <sys/ramdisk.h>
128
129#ifdef	__xpv
130
131#include <sys/hypervisor.h>
132#include <sys/xen_mmu.h>
133#include <sys/evtchn_impl.h>
134#include <sys/gnttab.h>
135#include <sys/xpv_panic.h>
136#include <xen/sys/xenbus_comms.h>
137#include <xen/public/physdev.h>
138
139extern void xen_late_startup(void);
140
141struct xen_evt_data cpu0_evt_data;
142
143#else	/* __xpv */
144#include <sys/memlist_impl.h>
145
146extern void mem_config_init(void);
147#endif /* __xpv */
148
149extern void progressbar_init(void);
150extern void brand_init(void);
151extern void pcf_init(void);
152extern void pg_init(void);
153extern void ssp_init(void);
154
155extern int size_pse_array(pgcnt_t, int);
156
157#if defined(_SOFT_HOSTID)
158
159#include <sys/rtc.h>
160
161static int32_t set_soft_hostid(void);
162static char hostid_file[] = "/etc/hostid";
163
164#endif
165
166void *gfx_devinfo_list;
167
168#if defined(__amd64) && !defined(__xpv)
169extern void immu_startup(void);
170#endif
171
172/*
173 * XXX make declaration below "static" when drivers no longer use this
174 * interface.
175 */
176extern caddr_t p0_va;	/* Virtual address for accessing physical page 0 */
177
178/*
179 * segkp
180 */
181extern int segkp_fromheap;
182
183static void kvm_init(void);
184static void startup_init(void);
185static void startup_memlist(void);
186static void startup_kmem(void);
187static void startup_modules(void);
188static void startup_vm(void);
189static void startup_end(void);
190static void layout_kernel_va(void);
191
192/*
193 * Declare these as initialized data so we can patch them.
194 */
195#ifdef __i386
196
197/*
198 * Due to virtual address space limitations running in 32 bit mode, restrict
199 * the amount of physical memory configured to a max of PHYSMEM pages (16g).
200 *
201 * If the physical max memory size of 64g were allowed to be configured, the
 * size of user virtual address space would be less than 1g. A limited user
203 * address space greatly reduces the range of applications that can run.
204 *
205 * If more physical memory than PHYSMEM is required, users should preferably
206 * run in 64 bit mode which has far looser virtual address space limitations.
207 *
208 * If 64 bit mode is not available (as in IA32) and/or more physical memory
209 * than PHYSMEM is required in 32 bit mode, physmem can be set to the desired
210 * value or to 0 (to configure all available memory) via eeprom(1M). kernelbase
211 * should also be carefully tuned to balance out the need of the user
212 * application while minimizing the risk of kernel heap exhaustion due to
213 * kernelbase being set too high.
214 */
215#define	PHYSMEM	0x400000
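
/* (0x400000 4KB pages is exactly the 16g mentioned above.) */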
216
217#else /* __amd64 */
218
219/*
220 * For now we can handle memory with physical addresses up to about
221 * 64 Terabytes. This keeps the kernel above the VA hole, leaving roughly
222 * half the VA space for seg_kpm. When systems get bigger than 64TB this
223 * code will need revisiting. There is an implicit assumption that there
224 * are no *huge* holes in the physical address space too.
225 */
226#define	TERABYTE		(1ul << 40)
227#define	PHYSMEM_MAX64		mmu_btop(64 * TERABYTE)
228#define	PHYSMEM			PHYSMEM_MAX64
229#define	AMD64_VA_HOLE_END	0xFFFF800000000000ul
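
/*
 * For reference, assuming the usual 4KB MMU pages, mmu_btop(64 * TERABYTE)
 * is 2^34 pages, i.e. this default cap allows roughly 17 billion pages to
 * be accounted for.
 */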
230
231#endif /* __amd64 */
232
233pgcnt_t physmem = PHYSMEM;
234pgcnt_t obp_pages;	/* Memory used by PROM for its text and data */
235
236char *kobj_file_buf;
237int kobj_file_bufsize;	/* set in /etc/system */
238
239/* Global variables for MP support. Used in mp_startup */
240caddr_t	rm_platter_va = 0;
241uint32_t rm_platter_pa;
242
243int	auto_lpg_disable = 1;
244
245/*
246 * Some CPUs have holes in the middle of the 64-bit virtual address range.
247 */
248uintptr_t hole_start, hole_end;
249
250/*
251 * kpm mapping window
252 */
253caddr_t kpm_vbase;
254size_t  kpm_size;
255static int kpm_desired;
256#ifdef __amd64
257static uintptr_t segkpm_base = (uintptr_t)SEGKPM_BASE;
258#endif
259
260/*
261 * Configuration parameters set at boot time.
262 */
263
264caddr_t econtig;		/* end of first block of contiguous kernel */
265
266struct bootops		*bootops = 0;	/* passed in from boot */
267struct bootops		**bootopsp;
268struct boot_syscalls	*sysp;		/* passed in from boot */
269
270char bootblock_fstype[16];
271
272char kern_bootargs[OBP_MAXPATHLEN];
273char kern_bootfile[OBP_MAXPATHLEN];
274
275/*
276 * ZFS zio segment.  This allows us to exclude large portions of ZFS data that
277 * gets cached in kmem caches on the heap.  If this is set to zero, we allocate
278 * zio buffers from their own segment, otherwise they are allocated from the
279 * heap.  The optimization of allocating zio buffers from their own segment is
280 * only valid on 64-bit kernels.
281 */
282#if defined(__amd64)
283int segzio_fromheap = 0;
284#else
285int segzio_fromheap = 1;
286#endif
287
288/*
289 * Give folks an escape hatch for disabling SMAP via kmdb. Doesn't work
290 * post-boot.
291 */
292int disable_smap = 0;
293
294/*
 * New memory fragments can be created in startup() due to BOP_ALLOCs. How
 * many depends on the number of BOP_ALLOC calls made and the sizes requested,
 * on the memory size, and on whether boot.bin memory needs to be freed.
298 */
299#define	POSS_NEW_FRAGMENTS	12
300
301/*
302 * VM data structures
303 */
304long page_hashsz;		/* Size of page hash table (power of two) */
305unsigned int page_hashsz_shift;	/* log2(page_hashsz) */
306struct page *pp_base;		/* Base of initial system page struct array */
307struct page **page_hash;	/* Page hash table */
308pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
309size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
310int pse_shift;			/* log2(pse_table_size) */
311struct seg ktextseg;		/* Segment used for kernel executable image */
312struct seg kvalloc;		/* Segment used for "valloc" mapping */
313struct seg kpseg;		/* Segment used for pageable kernel virt mem */
314struct seg kmapseg;		/* Segment used for generic kernel mappings */
315struct seg kdebugseg;		/* Segment used for the kernel debugger */
316
317struct seg *segkmap = &kmapseg;	/* Kernel generic mapping segment */
318static struct seg *segmap = &kmapseg;	/* easier to use name for in here */
319
320struct seg *segkp = &kpseg;	/* Pageable kernel virtual memory segment */
321
322struct seg kvseg_core;		/* Segment used for the core heap */
323struct seg kpmseg;		/* Segment used for physical mapping */
324struct seg *segkpm = &kpmseg;	/* 64bit kernel physical mapping segment */
325
326caddr_t segkp_base;		/* Base address of segkp */
327caddr_t segzio_base;		/* Base address of segzio */
328pgcnt_t segkpsize = btop(SEGKPDEFSIZE);	/* size of segkp segment in pages */
329caddr_t segkvmm_base;
330pgcnt_t segkvmmsize;
331pgcnt_t segziosize;
332
333/*
334 * A static DR page_t VA map is reserved that can map the page structures
335 * for a domain's entire RA space. The pages that back this space are
336 * dynamically allocated and need not be physically contiguous.  The DR
337 * map size is derived from KPM size.
338 * This mechanism isn't used by x86 yet, so just stubs here.
339 */
340int ppvm_enable = 0;		/* Static virtual map for page structs */
341page_t *ppvm_base = NULL;	/* Base of page struct map */
342pgcnt_t ppvm_size = 0;		/* Size of page struct map */
343
344/*
345 * VA range available to the debugger
346 */
347const caddr_t kdi_segdebugbase = (const caddr_t)SEGDEBUGBASE;
348const size_t kdi_segdebugsize = SEGDEBUGSIZE;
349
350struct memseg *memseg_base;
351struct vnode unused_pages_vp;
352
353#define	FOURGB	0x100000000LL
354
355struct memlist *memlist;
356
357caddr_t s_text;		/* start of kernel text segment */
358caddr_t e_text;		/* end of kernel text segment */
359caddr_t s_data;		/* start of kernel data segment */
360caddr_t e_data;		/* end of kernel data segment */
361caddr_t modtext;	/* start of loadable module text reserved */
362caddr_t e_modtext;	/* end of loadable module text reserved */
363caddr_t moddata;	/* start of loadable module data reserved */
364caddr_t e_moddata;	/* end of loadable module data reserved */
365
366struct memlist *phys_install;	/* Total installed physical memory */
367struct memlist *phys_avail;	/* Total available physical memory */
368struct memlist *bios_rsvd;	/* Bios reserved memory */
369
370/*
371 * kphysm_init returns the number of pages that were processed
372 */
373static pgcnt_t kphysm_init(page_t *, pgcnt_t);
374
375#define	IO_PROP_SIZE	64	/* device property size */
376
377/*
378 * a couple useful roundup macros
379 */
380#define	ROUND_UP_PAGE(x)	\
381	((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)MMU_PAGESIZE))
382#define	ROUND_UP_LPAGE(x)	\
383	((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[1]))
384#define	ROUND_UP_4MEG(x)	\
385	((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)FOUR_MEG))
386#define	ROUND_UP_TOPLEVEL(x)	\
387	((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[mmu.max_level]))
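
/*
 * For example (assuming the usual 4KB MMU_PAGESIZE), ROUND_UP_PAGE(0x1234)
 * yields 0x2000 and ROUND_UP_PAGE(0x2000) stays at 0x2000; the other
 * variants behave the same way at their respective (larger) page sizes.
 */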
388
389/*
390 *	32-bit Kernel's Virtual memory layout.
391 *		+-----------------------+
392 *		|			|
393 * 0xFFC00000  -|-----------------------|- ARGSBASE
394 *		|	debugger	|
395 * 0xFF800000  -|-----------------------|- SEGDEBUGBASE
396 *		|      Kernel Data	|
397 * 0xFEC00000  -|-----------------------|
398 *              |      Kernel Text	|
399 * 0xFE800000  -|-----------------------|- KERNEL_TEXT (0xFB400000 on Xen)
400 *		|---       GDT       ---|- GDT page (GDT_VA)
401 *		|---    debug info   ---|- debug info (DEBUG_INFO_VA)
402 *		|			|
403 *		|   page_t structures	|
404 *		|   memsegs, memlists,	|
405 *		|   page hash, etc.	|
406 * ---	       -|-----------------------|- ekernelheap, valloc_base (floating)
407 *		|			|  (segkp is just an arena in the heap)
408 *		|			|
409 *		|	kvseg		|
410 *		|			|
411 *		|			|
412 * ---         -|-----------------------|- kernelheap (floating)
413 *		|        Segkmap	|
414 * 0xC3002000  -|-----------------------|- segmap_start (floating)
415 *		|	Red Zone	|
416 * 0xC3000000  -|-----------------------|- kernelbase / userlimit (floating)
417 *		|			|			||
418 *		|     Shared objects	|			\/
419 *		|			|
420 *		:			:
421 *		|	user data	|
422 *		|-----------------------|
423 *		|	user text	|
424 * 0x08048000  -|-----------------------|
425 *		|	user stack	|
426 *		:			:
427 *		|	invalid		|
428 * 0x00000000	+-----------------------+
429 *
430 *
431 *		64-bit Kernel's Virtual memory layout. (assuming 64 bit app)
432 *			+-----------------------+
433 *			|			|
434 * 0xFFFFFFFF.FFC00000  |-----------------------|- ARGSBASE
435 *			|	debugger (?)	|
436 * 0xFFFFFFFF.FF800000  |-----------------------|- SEGDEBUGBASE
437 *			|      unused		|
438 *			+-----------------------+
439 *			|      Kernel Data	|
440 * 0xFFFFFFFF.FBC00000  |-----------------------|
441 *			|      Kernel Text	|
442 * 0xFFFFFFFF.FB800000  |-----------------------|- KERNEL_TEXT
443 *			|---    debug info   ---|- debug info (DEBUG_INFO_VA)
444 *			|---       GDT       ---|- GDT page (GDT_VA)
445 *			|---       IDT       ---|- IDT page (IDT_VA)
446 *			|---       LDT       ---|- LDT pages (LDT_VA)
447 *			|			|
448 *			|      Core heap	| (used for loadable modules)
449 * 0xFFFFFFFF.C0000000  |-----------------------|- core_base / ekernelheap
450 *			|	 Kernel		|
451 *			|	  heap		|
452 *			|			|
453 *			|			|
454 * 0xFFFFFXXX.XXX00000  |-----------------------|- kernelheap (floating)
455 *			|	 segmap		|
456 * 0xFFFFFXXX.XXX00000  |-----------------------|- segmap_start (floating)
457 *			|    device mappings	|
458 * 0xFFFFFXXX.XXX00000  |-----------------------|- toxic_addr (floating)
459 *			|	 segzio		|
460 * 0xFFFFFXXX.XXX00000  |-----------------------|- segzio_base (floating)
461 *			|        segkvmm	|
462 *			|			|
463 *			|			|
464 *			|			|
465 * 0xFFFFFXXX.XXX00000  |-----------------------|- segkvmm_base (floating)
466 *			|	 segkp		|
467 * 			|-----------------------|- segkp_base (floating)
468 *			|   page_t structures	|  valloc_base + valloc_sz
469 *			|   memsegs, memlists,	|
470 *			|   page hash, etc.	|
471 * 0xFFFFFE00.00000000  |-----------------------|- valloc_base (lower if >256GB)
472 *			|	 segkpm		|
473 *			|			|
474 * 0xFFFFFD00.00000000  |-----------------------|- SEGKPM_BASE (lower if >256GB)
475 *			|	Red Zone	|
476 * 0xFFFFFC80.00000000  |-----------------------|- KERNELBASE (lower if >256GB)
477 * 0xFFFFFC7F.FFE00000  |-----------------------|- USERLIMIT (lower if >256GB)
478 *			|     User stack	|- User space memory
479 *			|			|
480 *			| shared objects, etc	|	(grows downwards)
481 *			:			:
482 *			|			|
483 * 0xFFFF8000.00000000  |-----------------------|
484 *			|			|
485 *			| VA Hole / unused	|
486 *			|			|
487 * 0x00008000.00000000  |-----------------------|
488 *			|			|
489 *			|			|
490 *			:			:
491 *			|	user heap	|	(grows upwards)
492 *			|			|
493 *			|	user data	|
494 *			|-----------------------|
495 *			|	user text	|
496 * 0x00000000.04000000  |-----------------------|
497 *			|	invalid		|
498 * 0x00000000.00000000	+-----------------------+
499 *
500 * A 32 bit app on the 64 bit kernel sees the same layout as on the 32 bit
501 * kernel, except that userlimit is raised to 0xfe000000
502 *
503 * Floating values:
504 *
505 * valloc_base: start of the kernel's memory management/tracking data
506 * structures.  This region contains page_t structures for
507 * physical memory, memsegs, memlists, and the page hash.
508 *
509 * core_base: start of the kernel's "core" heap area on 64-bit systems.
510 * This area is intended to be used for global data as well as for module
511 * text/data that does not fit into the nucleus pages.  The core heap is
512 * restricted to a 2GB range, allowing every address within it to be
513 * accessed using rip-relative addressing
514 *
515 * ekernelheap: end of kernelheap and start of segmap.
516 *
517 * kernelheap: start of kernel heap.  On 32-bit systems, this starts right
518 * above a red zone that separates the user's address space from the
519 * kernel's.  On 64-bit systems, it sits above segkp and segkpm.
520 *
521 * segmap_start: start of segmap. The length of segmap can be modified
522 * through eeprom. The default length is 16MB on 32-bit systems and 64MB
523 * on 64-bit systems.
524 *
525 * kernelbase: On a 32-bit kernel the default value of 0xd4000000 will be
526 * decreased by 2X the size required for page_t.  This allows the kernel
527 * heap to grow in size with physical memory.  With sizeof(page_t) == 80
528 * bytes, the following shows the values of kernelbase and kernel heap
529 * sizes for different memory configurations (assuming default segmap and
530 * segkp sizes).
531 *
532 *	mem	size for	kernelbase	kernel heap
533 *	size	page_t's			size
534 *	----	---------	----------	-----------
535 *	1gb	0x01400000	0xd1800000	684MB
536 *	2gb	0x02800000	0xcf000000	704MB
537 *	4gb	0x05000000	0xca000000	744MB
538 *	6gb	0x07800000	0xc5000000	784MB
539 *	8gb	0x0a000000	0xc0000000	824MB
540 *	16gb	0x14000000	0xac000000	984MB
541 *	32gb	0x28000000	0x84000000	1304MB
542 *	64gb	0x50000000	0x34000000	1944MB (*)
543 *
 * kernelbase is less than the ABI minimum of 0xc0000000 for memory
 * configurations above 8gb.
 *
 * (*) support for memory configurations above 32gb will require manual tuning
 * of kernelbase to balance the needs of user applications.
549 */
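
/*
 * To illustrate one row of the table above: 8gb of memory is 0x200000
 * 4KB pages, which at the 80 bytes per page_t assumed above needs
 * 0x0a000000 bytes; the default kernelbase of 0xd4000000 is then lowered
 * by 2X that amount, giving 0xd4000000 - 0x14000000 = 0xc0000000.
 */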
550
551/* real-time-clock initialization parameters */
552extern time_t process_rtc_config_file(void);
553
554uintptr_t	kernelbase;
555uintptr_t	postbootkernelbase;	/* not set till boot loader is gone */
556uintptr_t	eprom_kernelbase;
557size_t		segmapsize;
558uintptr_t	segmap_start;
559int		segmapfreelists;
560pgcnt_t		npages;
561pgcnt_t		orig_npages;
562size_t		core_size;		/* size of "core" heap */
563uintptr_t	core_base;		/* base address of "core" heap */
564
565/*
566 * List of bootstrap pages. We mark these as allocated in startup.
567 * release_bootstrap() will free them when we're completely done with
568 * the bootstrap.
569 */
570static page_t *bootpages;
571
572/*
 * boot time pages that have a vnode from the ramdisk will keep that vnode forever.
574 */
575static page_t *rd_pages;
576
577/*
578 * Lower 64K
579 */
580static page_t *lower_pages = NULL;
581static int lower_pages_count = 0;
582
583struct system_hardware system_hardware;
584
585/*
586 * Enable some debugging messages concerning memory usage...
587 */
588static void
589print_memlist(char *title, struct memlist *mp)
590{
591	prom_printf("MEMLIST: %s:\n", title);
592	while (mp != NULL)  {
593		prom_printf("\tAddress 0x%" PRIx64 ", size 0x%" PRIx64 "\n",
594		    mp->ml_address, mp->ml_size);
595		mp = mp->ml_next;
596	}
597}
598
599/*
600 * XX64 need a comment here.. are these just default values, surely
601 * we read the "cpuid" type information to figure this out.
602 */
603int	l2cache_sz = 0x80000;
604int	l2cache_linesz = 0x40;
605int	l2cache_assoc = 1;
606
607static size_t	textrepl_min_gb = 10;
608
609/*
 * On 64 bit we use a predefined VA range for mapping devices in the kernel.
 * On 32 bit the mappings are intermixed in the heap, so we use a bit map.
612 */
613#ifdef __amd64
614
615vmem_t		*device_arena;
616uintptr_t	toxic_addr = (uintptr_t)NULL;
617size_t		toxic_size = 1024 * 1024 * 1024; /* Sparc uses 1 gig too */
618
619#else	/* __i386 */
620
621ulong_t		*toxic_bit_map;	/* one bit for each 4k of VA in heap_arena */
622size_t		toxic_bit_map_len = 0;	/* in bits */
623
624#endif	/* __i386 */
625
626/*
627 * Simple boot time debug facilities
628 */
629static char *prm_dbg_str[] = {
630	"%s:%d: '%s' is 0x%x\n",
631	"%s:%d: '%s' is 0x%llx\n"
632};
633
634int prom_debug;
635
636#define	PRM_DEBUG(q)	if (prom_debug)		\
637	prom_printf(prm_dbg_str[sizeof (q) >> 3], "startup.c", __LINE__, #q, q);
638#define	PRM_POINT(q)	if (prom_debug)		\
639	prom_printf("%s:%d: %s\n", "startup.c", __LINE__, q);
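
/*
 * Note that sizeof (q) >> 3 evaluates to 0 for 32-bit quantities and to 1
 * for 64-bit ones, which is how PRM_DEBUG() picks the matching format
 * string from prm_dbg_str[] above.
 */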
640
641/*
 * This structure is used to keep track of the initial allocations
643 * done in startup_memlist(). The value of NUM_ALLOCATIONS needs to
644 * be >= the number of ADD_TO_ALLOCATIONS() executed in the code.
645 */
646#define	NUM_ALLOCATIONS 8
647int num_allocations = 0;
648struct {
649	void **al_ptr;
650	size_t al_size;
651} allocations[NUM_ALLOCATIONS];
652size_t valloc_sz = 0;
653uintptr_t valloc_base;
654
655#define	ADD_TO_ALLOCATIONS(ptr, size) {					\
656		size = ROUND_UP_PAGE(size);				\
657		if (num_allocations == NUM_ALLOCATIONS)			\
658			panic("too many ADD_TO_ALLOCATIONS()");		\
659		allocations[num_allocations].al_ptr = (void**)&ptr;	\
660		allocations[num_allocations].al_size = size;		\
661		valloc_sz += size;					\
662		++num_allocations;					\
663	}
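
/*
 * The usual pattern in startup_memlist() below is a sequence such as:
 *
 *	memseg_sz = sizeof (struct memseg) * (memblocks + POSS_NEW_FRAGMENTS);
 *	ADD_TO_ALLOCATIONS(memseg_base, memseg_sz);
 *
 * which only records the target pointer and the (page-rounded) size; the
 * single BOP_ALLOC() in perform_allocations() later carves the valloc_sz
 * bytes at valloc_base into the individual regions.
 */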
664
665/*
666 * Allocate all the initial memory needed by the page allocator.
667 */
668static void
669perform_allocations(void)
670{
671	caddr_t mem;
672	int i;
673	int valloc_align;
674
675	PRM_DEBUG(valloc_base);
676	PRM_DEBUG(valloc_sz);
677	valloc_align = mmu.level_size[mmu.max_page_level > 0];
678	mem = BOP_ALLOC(bootops, (caddr_t)valloc_base, valloc_sz, valloc_align);
679	if (mem != (caddr_t)valloc_base)
680		panic("BOP_ALLOC() failed");
681	bzero(mem, valloc_sz);
682	for (i = 0; i < num_allocations; ++i) {
683		*allocations[i].al_ptr = (void *)mem;
684		mem += allocations[i].al_size;
685	}
686}
687
688/*
689 * Set up and enable SMAP now before we start other CPUs, but after the kernel's
690 * VM has been set up so we can use hot_patch_kernel_text().
691 *
692 * We can only patch 1, 2, or 4 bytes, but not three bytes. So instead, we
693 * replace the four byte word at the patch point. See uts/intel/ia32/ml/copy.s
694 * for more information on what's going on here.
695 */
696static void
697startup_smap(void)
698{
699	int i;
700	uint32_t inst;
701	uint8_t *instp;
702	char sym[128];
703	struct modctl *modp;
704
705	extern int _smap_enable_patch_count;
706	extern int _smap_disable_patch_count;
707
708	if (disable_smap != 0)
709		remove_x86_feature(x86_featureset, X86FSET_SMAP);
710
711	if (is_x86_feature(x86_featureset, X86FSET_SMAP) == B_FALSE)
712		return;
713
714	for (i = 0; i < _smap_enable_patch_count; i++) {
715		int sizep;
716
717		VERIFY3U(i, <, _smap_enable_patch_count);
718		VERIFY(snprintf(sym, sizeof (sym), "_smap_enable_patch_%d", i) <
719		    sizeof (sym));
720		instp = (uint8_t *)(void *)kobj_getelfsym(sym, NULL, &sizep);
721		VERIFY(instp != 0);
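		/*
		 * Keep the original fourth byte and overwrite the first three
		 * bytes of the patch point with the CLAC opcode; since x86 is
		 * little-endian, the 4-byte store below leaves that trailing
		 * byte untouched.
		 */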
722		inst = (instp[3] << 24) | (SMAP_CLAC_INSTR & 0x00ffffff);
723		hot_patch_kernel_text((caddr_t)instp, inst, 4);
724	}
725
726	for (i = 0; i < _smap_disable_patch_count; i++) {
727		int sizep;
728
729		VERIFY(snprintf(sym, sizeof (sym), "_smap_disable_patch_%d",
730		    i) < sizeof (sym));
731		instp = (uint8_t *)(void *)kobj_getelfsym(sym, NULL, &sizep);
732		VERIFY(instp != 0);
733		inst = (instp[3] << 24) | (SMAP_STAC_INSTR & 0x00ffffff);
734		hot_patch_kernel_text((caddr_t)instp, inst, 4);
735	}
736
737	/*
738	 * Hotinline calls to smap_enable and smap_disable within
739	 * unix module. Hotinlines in other modules are done on
740	 * mod_load().
741	 */
742	modp = mod_hold_by_name("unix");
743	do_hotinlines(modp->mod_mp);
744	mod_release_mod(modp);
745
746	setcr4(getcr4() | CR4_SMAP);
747	smap_enable();
748}
749
750/*
751 * Our world looks like this at startup time.
752 *
753 * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data
754 * at 0xfec00000.  On a 64-bit OS, kernel text and data are loaded at
755 * 0xffffffff.fe800000 and 0xffffffff.fec00000 respectively.  Those
756 * addresses are fixed in the binary at link time.
757 *
758 * On the text page:
759 * unix/genunix/krtld/module text loads.
760 *
761 * On the data page:
762 * unix/genunix/krtld/module data loads.
763 *
764 * Machine-dependent startup code
765 */
766void
767startup(void)
768{
769#if !defined(__xpv)
770	extern void startup_pci_bios(void);
771#endif
772	extern cpuset_t cpu_ready_set;
773
774	/*
	 * Make sure that nobody tries to use segkpm until we have
776	 * initialized it properly.
777	 */
778#if defined(__amd64)
779	kpm_desired = 1;
780#endif
781	kpm_enable = 0;
782	CPUSET_ONLY(cpu_ready_set, 0);	/* cpu 0 is boot cpu */
783
784#if defined(__xpv)	/* XXPV fix me! */
785	{
786		extern int segvn_use_regions;
787		segvn_use_regions = 0;
788	}
789#endif
790	ssp_init();
791	progressbar_init();
792	startup_init();
793#if defined(__xpv)
794	startup_xen_version();
795#endif
796	startup_memlist();
797	startup_kmem();
798	startup_vm();
799#if !defined(__xpv)
800	/*
801	 * Note we need to do this even on fast reboot in order to access
802	 * the irq routing table (used for pci labels).
803	 */
804	startup_pci_bios();
805	startup_smap();
806#endif
807#if defined(__xpv)
808	startup_xen_mca();
809#endif
810	startup_modules();
811
812	startup_end();
813}
814
815static void
816startup_init()
817{
818	PRM_POINT("startup_init() starting...");
819
820	/*
821	 * Complete the extraction of cpuid data
822	 */
823	cpuid_pass2(CPU);
824
825	(void) check_boot_version(BOP_GETVERSION(bootops));
826
827	/*
828	 * Check for prom_debug in boot environment
829	 */
830	if (BOP_GETPROPLEN(bootops, "prom_debug") >= 0) {
831		++prom_debug;
		PRM_POINT("prom_debug found in boot environment");
833	}
834
835	/*
836	 * Collect node, cpu and memory configuration information.
837	 */
838	get_system_configuration();
839
840	/*
841	 * Halt if this is an unsupported processor.
842	 */
843	if (x86_type == X86_TYPE_486 || x86_type == X86_TYPE_CYRIX_486) {
844		printf("\n486 processor (\"%s\") detected.\n",
845		    CPU->cpu_brandstr);
846		halt("This processor is not supported by this release "
847		    "of Solaris.");
848	}
849
850	PRM_POINT("startup_init() done");
851}
852
853/*
 * Callback for copy_memlist_filter() to filter nucleus and kadb/kmdb pages
 * (i.e. everything mapped above KERNEL_TEXT) from phys_avail. Note it
856 * also filters out physical page zero.  There is some reliance on the
857 * boot loader allocating only a few contiguous physical memory chunks.
858 */
859static void
860avail_filter(uint64_t *addr, uint64_t *size)
861{
862	uintptr_t va;
863	uintptr_t next_va;
864	pfn_t pfn;
865	uint64_t pfn_addr;
866	uint64_t pfn_eaddr;
867	uint_t prot;
868	size_t len;
869	uint_t change;
870
871	if (prom_debug)
872		prom_printf("\tFilter: in: a=%" PRIx64 ", s=%" PRIx64 "\n",
873		    *addr, *size);
874
875	/*
	 * page zero is required for BIOS; never make it available
877	 */
878	if (*addr == 0) {
879		*addr += MMU_PAGESIZE;
880		*size -= MMU_PAGESIZE;
881	}
882
883	/*
884	 * First we trim from the front of the range. Since kbm_probe()
885	 * walks ranges in virtual order, but addr/size are physical, we need
	 * to rescan the list until no changes are seen.  This deals with the
887	 * where page "p" is mapped at v, page "p + PAGESIZE" is mapped at w
888	 * but w < v.
889	 */
890	do {
891		change = 0;
892		for (va = KERNEL_TEXT;
893		    *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
894		    va = next_va) {
895
896			next_va = va + len;
897			pfn_addr = pfn_to_pa(pfn);
898			pfn_eaddr = pfn_addr + len;
899
900			if (pfn_addr <= *addr && pfn_eaddr > *addr) {
901				change = 1;
902				while (*size > 0 && len > 0) {
903					*addr += MMU_PAGESIZE;
904					*size -= MMU_PAGESIZE;
905					len -= MMU_PAGESIZE;
906				}
907			}
908		}
909		if (change && prom_debug)
910			prom_printf("\t\ttrim: a=%" PRIx64 ", s=%" PRIx64 "\n",
911			    *addr, *size);
912	} while (change);
913
914	/*
915	 * Trim pages from the end of the range.
916	 */
917	for (va = KERNEL_TEXT;
918	    *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
919	    va = next_va) {
920
921		next_va = va + len;
922		pfn_addr = pfn_to_pa(pfn);
923
924		if (pfn_addr >= *addr && pfn_addr < *addr + *size)
925			*size = pfn_addr - *addr;
926	}
927
928	if (prom_debug)
929		prom_printf("\tFilter out: a=%" PRIx64 ", s=%" PRIx64 "\n",
930		    *addr, *size);
931}
932
933static void
934kpm_init()
935{
936	struct segkpm_crargs b;
937
938	/*
939	 * These variables were all designed for sfmmu in which segkpm is
940	 * mapped using a single pagesize - either 8KB or 4MB.  On x86, we
941	 * might use 2+ page sizes on a single machine, so none of these
942	 * variables have a single correct value.  They are set up as if we
943	 * always use a 4KB pagesize, which should do no harm.  In the long
944	 * run, we should get rid of KPM's assumption that only a single
945	 * pagesize is used.
946	 */
947	kpm_pgshft = MMU_PAGESHIFT;
948	kpm_pgsz =  MMU_PAGESIZE;
949	kpm_pgoff = MMU_PAGEOFFSET;
950	kpmp2pshft = 0;
951	kpmpnpgs = 1;
952	ASSERT(((uintptr_t)kpm_vbase & (kpm_pgsz - 1)) == 0);
953
954	PRM_POINT("about to create segkpm");
955	rw_enter(&kas.a_lock, RW_WRITER);
956
957	if (seg_attach(&kas, kpm_vbase, kpm_size, segkpm) < 0)
958		panic("cannot attach segkpm");
959
960	b.prot = PROT_READ | PROT_WRITE;
961	b.nvcolors = 1;
962
963	if (segkpm_create(segkpm, (caddr_t)&b) != 0)
964		panic("segkpm_create segkpm");
965
966	rw_exit(&kas.a_lock);
967
968	kpm_enable = 1;
969
970	/*
971	 * As the KPM was disabled while setting up the system, go back and fix
972	 * CPU zero's access to its user page table. This is a bit gross, but
973	 * we have a chicken and egg problem otherwise.
974	 */
975	ASSERT(CPU->cpu_hat_info->hci_user_l3ptes == NULL);
976	CPU->cpu_hat_info->hci_user_l3ptes =
977	    (x86pte_t *)hat_kpm_mapin_pfn(CPU->cpu_hat_info->hci_user_l3pfn);
978}
979
980/*
981 * The debug info page provides enough information to allow external
982 * inspectors (e.g. when running under a hypervisor) to bootstrap
983 * themselves into allowing full-blown kernel debugging.
984 */
985static void
986init_debug_info(void)
987{
988	caddr_t mem;
989	debug_info_t *di;
990
991#ifndef __lint
992	ASSERT(sizeof (debug_info_t) < MMU_PAGESIZE);
993#endif
994
995	mem = BOP_ALLOC(bootops, (caddr_t)DEBUG_INFO_VA, MMU_PAGESIZE,
996	    MMU_PAGESIZE);
997
998	if (mem != (caddr_t)DEBUG_INFO_VA)
999		panic("BOP_ALLOC() failed");
1000	bzero(mem, MMU_PAGESIZE);
1001
1002	di = (debug_info_t *)mem;
1003
1004	di->di_magic = DEBUG_INFO_MAGIC;
1005	di->di_version = DEBUG_INFO_VERSION;
1006	di->di_modules = (uintptr_t)&modules;
1007	di->di_s_text = (uintptr_t)s_text;
1008	di->di_e_text = (uintptr_t)e_text;
1009	di->di_s_data = (uintptr_t)s_data;
1010	di->di_e_data = (uintptr_t)e_data;
1011	di->di_hat_htable_off = offsetof(hat_t, hat_htable);
1012	di->di_ht_pfn_off = offsetof(htable_t, ht_pfn);
1013}
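
/*
 * (The page just initialized sits at the fixed DEBUG_INFO_VA shown in the
 * layout diagrams above; the di_magic and di_version fields let an external
 * inspector recognize and version-check it.)
 */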
1014
1015/*
1016 * Build the memlists and other kernel essential memory system data structures.
1017 * This is everything at valloc_base.
1018 */
1019static void
1020startup_memlist(void)
1021{
1022	size_t memlist_sz;
1023	size_t memseg_sz;
1024	size_t pagehash_sz;
1025	size_t pp_sz;
1026	uintptr_t va;
1027	size_t len;
1028	uint_t prot;
1029	pfn_t pfn;
1030	int memblocks;
1031	pfn_t rsvd_high_pfn;
1032	pgcnt_t rsvd_pgcnt;
1033	size_t rsvdmemlist_sz;
1034	int rsvdmemblocks;
1035	caddr_t pagecolor_mem;
1036	size_t pagecolor_memsz;
1037	caddr_t page_ctrs_mem;
1038	size_t page_ctrs_size;
1039	size_t pse_table_alloc_size;
1040	struct memlist *current;
1041	extern void startup_build_mem_nodes(struct memlist *);
1042
1043	/* XX64 fix these - they should be in include files */
1044	extern size_t page_coloring_init(uint_t, int, int);
1045	extern void page_coloring_setup(caddr_t);
1046
1047	PRM_POINT("startup_memlist() starting...");
1048
1049	/*
1050	 * Use leftover large page nucleus text/data space for loadable modules.
1051	 * Use at most MODTEXT/MODDATA.
1052	 */
1053	len = kbm_nucleus_size;
1054	ASSERT(len > MMU_PAGESIZE);
1055
1056	moddata = (caddr_t)ROUND_UP_PAGE(e_data);
1057	e_moddata = (caddr_t)P2ROUNDUP((uintptr_t)e_data, (uintptr_t)len);
1058	if (e_moddata - moddata > MODDATA)
1059		e_moddata = moddata + MODDATA;
1060
1061	modtext = (caddr_t)ROUND_UP_PAGE(e_text);
1062	e_modtext = (caddr_t)P2ROUNDUP((uintptr_t)e_text, (uintptr_t)len);
1063	if (e_modtext - modtext > MODTEXT)
1064		e_modtext = modtext + MODTEXT;
1065
1066	econtig = e_moddata;
1067
1068	PRM_DEBUG(modtext);
1069	PRM_DEBUG(e_modtext);
1070	PRM_DEBUG(moddata);
1071	PRM_DEBUG(e_moddata);
1072	PRM_DEBUG(econtig);
1073
1074	/*
1075	 * Examine the boot loader physical memory map to find out:
1076	 * - total memory in system - physinstalled
1077	 * - the max physical address - physmax
1078	 * - the number of discontiguous segments of memory.
1079	 */
1080	if (prom_debug)
1081		print_memlist("boot physinstalled",
1082		    bootops->boot_mem->physinstalled);
1083	installed_top_size_ex(bootops->boot_mem->physinstalled, &physmax,
1084	    &physinstalled, &memblocks);
1085	PRM_DEBUG(physmax);
1086	PRM_DEBUG(physinstalled);
1087	PRM_DEBUG(memblocks);
1088
1089	/*
1090	 * We no longer support any form of memory DR.
1091	 */
1092	plat_dr_physmax = 0;
1093
1094	/*
1095	 * Examine the bios reserved memory to find out:
1096	 * - the number of discontiguous segments of memory.
1097	 */
1098	if (prom_debug)
1099		print_memlist("boot reserved mem",
1100		    bootops->boot_mem->rsvdmem);
1101	installed_top_size_ex(bootops->boot_mem->rsvdmem, &rsvd_high_pfn,
1102	    &rsvd_pgcnt, &rsvdmemblocks);
1103	PRM_DEBUG(rsvd_high_pfn);
1104	PRM_DEBUG(rsvd_pgcnt);
1105	PRM_DEBUG(rsvdmemblocks);
1106
1107	/*
1108	 * Initialize hat's mmu parameters.
1109	 * Check for enforce-prot-exec in boot environment. It's used to
1110	 * enable/disable support for the page table entry NX bit.
1111	 * The default is to enforce PROT_EXEC on processors that support NX.
1112	 * Boot seems to round up the "len", but 8 seems to be big enough.
1113	 */
1114	mmu_init();
1115
1116#ifdef	__i386
1117	/*
1118	 * physmax is lowered if there is more memory than can be
1119	 * physically addressed in 32 bit (PAE/non-PAE) modes.
1120	 */
1121	if (mmu.pae_hat) {
1122		if (PFN_ABOVE64G(physmax)) {
1123			physinstalled -= (physmax - (PFN_64G - 1));
1124			physmax = PFN_64G - 1;
1125		}
1126	} else {
1127		if (PFN_ABOVE4G(physmax)) {
1128			physinstalled -= (physmax - (PFN_4G - 1));
1129			physmax = PFN_4G - 1;
1130		}
1131	}
1132#endif
1133
1134	startup_build_mem_nodes(bootops->boot_mem->physinstalled);
1135
1136	if (BOP_GETPROPLEN(bootops, "enforce-prot-exec") >= 0) {
1137		int len = BOP_GETPROPLEN(bootops, "enforce-prot-exec");
1138		char value[8];
1139
1140		if (len < 8)
1141			(void) BOP_GETPROP(bootops, "enforce-prot-exec", value);
1142		else
1143			(void) strcpy(value, "");
1144		if (strcmp(value, "off") == 0)
1145			mmu.pt_nx = 0;
1146	}
1147	PRM_DEBUG(mmu.pt_nx);
1148
1149	/*
1150	 * We will need page_t's for every page in the system, except for
	 * memory mapped at or above the start of the kernel text segment.
1152	 *
1153	 * pages above e_modtext are attributed to kernel debugger (obp_pages)
1154	 */
1155	npages = physinstalled - 1; /* avail_filter() skips page 0, so "- 1" */
1156	obp_pages = 0;
1157	va = KERNEL_TEXT;
1158	while (kbm_probe(&va, &len, &pfn, &prot) != 0) {
1159		npages -= len >> MMU_PAGESHIFT;
1160		if (va >= (uintptr_t)e_moddata)
1161			obp_pages += len >> MMU_PAGESHIFT;
1162		va += len;
1163	}
1164	PRM_DEBUG(npages);
1165	PRM_DEBUG(obp_pages);
1166
1167	/*
1168	 * If physmem is patched to be non-zero, use it instead of the computed
1169	 * value unless it is larger than the actual amount of memory on hand.
1170	 */
1171	if (physmem == 0 || physmem > npages) {
1172		physmem = npages;
1173	} else if (physmem < npages) {
1174		orig_npages = npages;
1175		npages = physmem;
1176	}
1177	PRM_DEBUG(physmem);
1178
1179	/*
	 * We now compute the sizes of all the initial allocations for
	 * structures the kernel needs in order to do kmem_alloc(). These
1182	 * include:
1183	 *	memsegs
1184	 *	memlists
1185	 *	page hash table
1186	 *	page_t's
1187	 *	page coloring data structs
1188	 */
1189	memseg_sz = sizeof (struct memseg) * (memblocks + POSS_NEW_FRAGMENTS);
1190	ADD_TO_ALLOCATIONS(memseg_base, memseg_sz);
1191	PRM_DEBUG(memseg_sz);
1192
1193	/*
1194	 * Reserve space for memlists. There's no real good way to know exactly
1195	 * how much room we'll need, but this should be a good upper bound.
1196	 */
1197	memlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1198	    (memblocks + POSS_NEW_FRAGMENTS));
1199	ADD_TO_ALLOCATIONS(memlist, memlist_sz);
1200	PRM_DEBUG(memlist_sz);
1201
1202	/*
1203	 * Reserve space for bios reserved memlists.
1204	 */
1205	rsvdmemlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
1206	    (rsvdmemblocks + POSS_NEW_FRAGMENTS));
1207	ADD_TO_ALLOCATIONS(bios_rsvd, rsvdmemlist_sz);
1208	PRM_DEBUG(rsvdmemlist_sz);
1209
1210	/* LINTED */
1211	ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), sizeof (struct page)));
1212	/*
1213	 * The page structure hash table size is a power of 2
1214	 * such that the average hash chain length is PAGE_HASHAVELEN.
1215	 */
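	/*
	 * (The 1 << highbit() step below rounds the npages / PAGE_HASHAVELEN
	 * target up to a power of two.)
	 */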
1216	page_hashsz = npages / PAGE_HASHAVELEN;
1217	page_hashsz_shift = highbit(page_hashsz);
1218	page_hashsz = 1 << page_hashsz_shift;
1219	pagehash_sz = sizeof (struct page *) * page_hashsz;
1220	ADD_TO_ALLOCATIONS(page_hash, pagehash_sz);
1221	PRM_DEBUG(pagehash_sz);
1222
1223	/*
1224	 * Set aside room for the page structures themselves.
1225	 */
1226	PRM_DEBUG(npages);
1227	pp_sz = sizeof (struct page) * npages;
1228	ADD_TO_ALLOCATIONS(pp_base, pp_sz);
1229	PRM_DEBUG(pp_sz);
1230
1231	/*
1232	 * determine l2 cache info and memory size for page coloring
1233	 */
1234	(void) getl2cacheinfo(CPU,
1235	    &l2cache_sz, &l2cache_linesz, &l2cache_assoc);
1236	pagecolor_memsz =
1237	    page_coloring_init(l2cache_sz, l2cache_linesz, l2cache_assoc);
1238	ADD_TO_ALLOCATIONS(pagecolor_mem, pagecolor_memsz);
1239	PRM_DEBUG(pagecolor_memsz);
1240
1241	page_ctrs_size = page_ctrs_sz();
1242	ADD_TO_ALLOCATIONS(page_ctrs_mem, page_ctrs_size);
1243	PRM_DEBUG(page_ctrs_size);
1244
1245	/*
1246	 * Allocate the array that protects pp->p_selock.
1247	 */
1248	pse_shift = size_pse_array(physmem, max_ncpus);
1249	pse_table_size = 1 << pse_shift;
1250	pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t);
1251	ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size);
1252
1253	valloc_sz = ROUND_UP_LPAGE(valloc_sz);
1254	valloc_base = VALLOC_BASE;
1255
1256	/*
	 * The significant memory-sized regions are roughly sized as follows in
1258	 * the default layout with max physmem:
1259	 *  segkpm: 1x physmem allocated (but 1Tb room, below VALLOC_BASE)
1260	 *  segzio: 1.5x physmem
1261	 *  segkvmm: 4x physmem
1262	 *  heap: whatever's left up to COREHEAP_BASE, at least 1.5x physmem
1263	 *
1264	 * The idea is that we leave enough room to avoid fragmentation issues,
1265	 * so we would like the VA arenas to have some extra.
1266	 *
1267	 * Ignoring the loose change of segkp, valloc, and such, this means that
1268	 * as COREHEAP_BASE-VALLOC_BASE=2Tb, we can accommodate a physmem up to
1269	 * about (2Tb / 7.0), rounded down to 256Gb in the check below.
1270	 *
1271	 * Note that KPM lives below VALLOC_BASE, but we want to include it in
1272	 * adjustments, hence the 8 below.
1273	 *
1274	 * Beyond 256Gb, we push segkpm_base (and hence kernelbase and
1275	 * _userlimit) down to accommodate the VA requirements above.
1276	 */
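	/*
	 * (As a concrete example of the adjustment below: with 1TB of
	 * physical memory, adjustment is 8 * (1TB - 0.25TB) = 6TB, so
	 * segkpm_base, and with it kernelbase and _userlimit, moves roughly
	 * 6TB down from its default.)
	 */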
1277	if (physmax + 1 > mmu_btop(TERABYTE / 4)) {
1278		uint64_t physmem_bytes = mmu_ptob(physmax + 1);
1279		uint64_t adjustment = 8 * (physmem_bytes - (TERABYTE / 4));
1280
1281		PRM_DEBUG(adjustment);
1282
1283		/*
1284		 * segkpm_base is always aligned on a L3 PTE boundary.
1285		 */
1286		segkpm_base -= P2ROUNDUP(adjustment, KERNEL_REDZONE_SIZE);
1287
1288		/*
1289		 * But make sure we leave some space for user apps above hole.
1290		 */
1291		segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
1292
1293		ASSERT(segkpm_base <= SEGKPM_BASE);
1294
1295		valloc_base = segkpm_base + P2ROUNDUP(physmem_bytes, ONE_GIG);
1296		if (valloc_base < segkpm_base)
1297			panic("not enough kernel VA to support memory size");
1298	}
1299
1300	PRM_DEBUG(segkpm_base);
1301	PRM_DEBUG(valloc_base);
1302
1303	/*
1304	 * do all the initial allocations
1305	 */
1306	perform_allocations();
1307
1308	/*
1309	 * Build phys_install and phys_avail in kernel memspace.
1310	 * - phys_install should be all memory in the system.
1311	 * - phys_avail is phys_install minus any memory mapped before this
1312	 *    point above KERNEL_TEXT.
1313	 */
1314	current = phys_install = memlist;
1315	copy_memlist_filter(bootops->boot_mem->physinstalled, &current, NULL);
1316	if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1317		panic("physinstalled was too big!");
1318	if (prom_debug)
1319		print_memlist("phys_install", phys_install);
1320
1321	phys_avail = current;
1322	PRM_POINT("Building phys_avail:\n");
1323	copy_memlist_filter(bootops->boot_mem->physinstalled, &current,
1324	    avail_filter);
1325	if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
1326		panic("physavail was too big!");
1327	if (prom_debug)
1328		print_memlist("phys_avail", phys_avail);
1329#ifndef	__xpv
1330	/*
1331	 * Free unused memlist items, which may be used by memory DR driver
1332	 * at runtime.
1333	 */
1334	if ((caddr_t)current < (caddr_t)memlist + memlist_sz) {
1335		memlist_free_block((caddr_t)current,
1336		    (caddr_t)memlist + memlist_sz - (caddr_t)current);
1337	}
1338#endif
1339
1340	/*
1341	 * Build bios reserved memspace
1342	 */
1343	current = bios_rsvd;
1344	copy_memlist_filter(bootops->boot_mem->rsvdmem, &current, NULL);
1345	if ((caddr_t)current > (caddr_t)bios_rsvd + rsvdmemlist_sz)
1346		panic("bios_rsvd was too big!");
1347	if (prom_debug)
1348		print_memlist("bios_rsvd", bios_rsvd);
1349#ifndef	__xpv
1350	/*
1351	 * Free unused memlist items, which may be used by memory DR driver
1352	 * at runtime.
1353	 */
1354	if ((caddr_t)current < (caddr_t)bios_rsvd + rsvdmemlist_sz) {
1355		memlist_free_block((caddr_t)current,
1356		    (caddr_t)bios_rsvd + rsvdmemlist_sz - (caddr_t)current);
1357	}
1358#endif
1359
1360	/*
1361	 * setup page coloring
1362	 */
1363	page_coloring_setup(pagecolor_mem);
1364	page_lock_init();	/* currently a no-op */
1365
1366	/*
1367	 * free page list counters
1368	 */
1369	(void) page_ctrs_alloc(page_ctrs_mem);
1370
1371	/*
1372	 * Size the pcf array based on the number of cpus in the box at
1373	 * boot time.
1374	 */
1375
1376	pcf_init();
1377
1378	/*
1379	 * Initialize the page structures from the memory lists.
1380	 */
1381	availrmem_initial = availrmem = freemem = 0;
1382	PRM_POINT("Calling kphysm_init()...");
1383	npages = kphysm_init(pp_base, npages);
1384	PRM_POINT("kphysm_init() done");
1385	PRM_DEBUG(npages);
1386
1387	init_debug_info();
1388
1389	/*
1390	 * Now that page_t's have been initialized, remove all the
1391	 * initial allocation pages from the kernel free page lists.
1392	 */
1393	boot_mapin((caddr_t)valloc_base, valloc_sz);
1394	boot_mapin((caddr_t)MISC_VA_BASE, MISC_VA_SIZE);
1395	PRM_POINT("startup_memlist() done");
1396
1397	PRM_DEBUG(valloc_sz);
1398
1399#if defined(__amd64)
1400	if ((availrmem >> (30 - MMU_PAGESHIFT)) >=
1401	    textrepl_min_gb && l2cache_sz <= 2 << 20) {
1402		extern size_t textrepl_size_thresh;
1403		textrepl_size_thresh = (16 << 20) - 1;
1404	}
1405#endif
1406}
1407
1408/*
1409 * Layout the kernel's part of address space and initialize kmem allocator.
1410 */
1411static void
1412startup_kmem(void)
1413{
1414	extern void page_set_colorequiv_arr(void);
1415#if !defined(__xpv)
1416	extern uint64_t kpti_kbase;
1417#endif
1418
1419	PRM_POINT("startup_kmem() starting...");
1420
1421#if defined(__amd64)
1422	if (eprom_kernelbase && eprom_kernelbase != KERNELBASE)
1423		cmn_err(CE_NOTE, "!kernelbase cannot be changed on 64-bit "
1424		    "systems.");
1425	kernelbase = segkpm_base - KERNEL_REDZONE_SIZE;
1426	core_base = (uintptr_t)COREHEAP_BASE;
1427	core_size = (size_t)MISC_VA_BASE - COREHEAP_BASE;
1428#else	/* __i386 */
1429	/*
1430	 * We configure kernelbase based on:
1431	 *
1432	 * 1. user specified kernelbase via eeprom command. Value cannot exceed
1433	 *    KERNELBASE_MAX. we large page align eprom_kernelbase
1434	 *
1435	 * 2. Default to KERNELBASE and adjust to 2X less the size for page_t.
1436	 *    On large memory systems we must lower kernelbase to allow
1437	 *    enough room for page_t's for all of memory.
1438	 *
1439	 * The value set here, might be changed a little later.
1440	 */
1441	if (eprom_kernelbase) {
1442		kernelbase = eprom_kernelbase & mmu.level_mask[1];
1443		if (kernelbase > KERNELBASE_MAX)
1444			kernelbase = KERNELBASE_MAX;
1445	} else {
1446		kernelbase = (uintptr_t)KERNELBASE;
1447		kernelbase -= ROUND_UP_4MEG(2 * valloc_sz);
1448	}
1449	ASSERT((kernelbase & mmu.level_offset[1]) == 0);
1450	core_base = valloc_base;
1451	core_size = 0;
1452#endif	/* __i386 */
1453
1454	PRM_DEBUG(core_base);
1455	PRM_DEBUG(core_size);
1456	PRM_DEBUG(kernelbase);
1457
1458#if defined(__i386)
1459	segkp_fromheap = 1;
1460#endif	/* __i386 */
1461
1462	ekernelheap = (char *)core_base;
1463	PRM_DEBUG(ekernelheap);
1464
1465	/*
1466	 * Now that we know the real value of kernelbase,
1467	 * update variables that were initialized with a value of
1468	 * KERNELBASE (in common/conf/param.c).
1469	 *
1470	 * XXX	The problem with this sort of hackery is that the
1471	 *	compiler just may feel like putting the const declarations
1472	 *	(in param.c) into the .text section.  Perhaps they should
1473	 *	just be declared as variables there?
1474	 */
1475
1476	*(uintptr_t *)&_kernelbase = kernelbase;
1477	*(uintptr_t *)&_userlimit = kernelbase;
1478#if defined(__amd64)
1479	*(uintptr_t *)&_userlimit -= KERNELBASE - USERLIMIT;
1480#if !defined(__xpv)
1481	kpti_kbase = kernelbase;
1482#endif
1483#else
1484	*(uintptr_t *)&_userlimit32 = _userlimit;
1485#endif
1486	PRM_DEBUG(_kernelbase);
1487	PRM_DEBUG(_userlimit);
1488	PRM_DEBUG(_userlimit32);
1489
1490	/* We have to re-do this now that we've modified _userlimit. */
1491	mmu_calc_user_slots();
1492
1493	layout_kernel_va();
1494
1495#if defined(__i386)
1496	/*
1497	 * If segmap is too large we can push the bottom of the kernel heap
1498	 * higher than the base.  Or worse, it could exceed the top of the
1499	 * VA space entirely, causing it to wrap around.
1500	 */
1501	if (kernelheap >= ekernelheap || (uintptr_t)kernelheap < kernelbase)
1502		panic("too little address space available for kernelheap,"
1503		    " use eeprom for lower kernelbase or smaller segmapsize");
1504#endif	/* __i386 */
1505
1506	/*
1507	 * Initialize the kernel heap. Note 3rd argument must be > 1st.
1508	 */
1509	kernelheap_init(kernelheap, ekernelheap,
1510	    kernelheap + MMU_PAGESIZE,
1511	    (void *)core_base, (void *)(core_base + core_size));
1512
1513#if defined(__xpv)
1514	/*
1515	 * Link pending events struct into cpu struct
1516	 */
1517	CPU->cpu_m.mcpu_evt_pend = &cpu0_evt_data;
1518#endif
1519	/*
1520	 * Initialize kernel memory allocator.
1521	 */
1522	kmem_init();
1523
1524	/*
1525	 * Factor in colorequiv to check additional 'equivalent' bins
1526	 */
1527	page_set_colorequiv_arr();
1528
1529	/*
1530	 * print this out early so that we know what's going on
1531	 */
1532	print_x86_featureset(x86_featureset);
1533
1534	/*
1535	 * Initialize bp_mapin().
1536	 */
1537	bp_init(MMU_PAGESIZE, HAT_STORECACHING_OK);
1538
1539	/*
1540	 * orig_npages is non-zero if physmem has been configured for less
1541	 * than the available memory.
1542	 */
1543	if (orig_npages) {
1544		cmn_err(CE_WARN, "!%slimiting physmem to 0x%lx of 0x%lx pages",
1545		    (npages == PHYSMEM ? "Due to virtual address space " : ""),
1546		    npages, orig_npages);
1547	}
1548#if defined(__i386)
1549	if (eprom_kernelbase && (eprom_kernelbase != kernelbase))
1550		cmn_err(CE_WARN, "kernelbase value, User specified 0x%lx, "
1551		    "System using 0x%lx",
1552		    (uintptr_t)eprom_kernelbase, (uintptr_t)kernelbase);
1553#endif
1554
1555#ifdef	KERNELBASE_ABI_MIN
1556	if (kernelbase < (uintptr_t)KERNELBASE_ABI_MIN) {
1557		cmn_err(CE_NOTE, "!kernelbase set to 0x%lx, system is not "
1558		    "i386 ABI compliant.", (uintptr_t)kernelbase);
1559	}
1560#endif
1561
1562#ifndef __xpv
1563	if (plat_dr_support_memory()) {
1564		mem_config_init();
1565	}
1566#else	/* __xpv */
1567	/*
1568	 * Some of the xen start information has to be relocated up
1569	 * into the kernel's permanent address space.
1570	 */
1571	PRM_POINT("calling xen_relocate_start_info()");
1572	xen_relocate_start_info();
1573	PRM_POINT("xen_relocate_start_info() done");
1574
1575	/*
1576	 * (Update the vcpu pointer in our cpu structure to point into
1577	 * the relocated shared info.)
1578	 */
1579	CPU->cpu_m.mcpu_vcpu_info =
1580	    &HYPERVISOR_shared_info->vcpu_info[CPU->cpu_id];
1581#endif	/* __xpv */
1582
1583	PRM_POINT("startup_kmem() done");
1584}
1585
1586#ifndef __xpv
1587/*
1588 * If we have detected that we are running in an HVM environment, we need
1589 * to prepend the PV driver directory to the module search path.
1590 */
1591#define	HVM_MOD_DIR "/platform/i86hvm/kernel"
1592static void
1593update_default_path()
1594{
1595	char *current, *newpath;
1596	int newlen;
1597
1598	/*
1599	 * We are about to resync with krtld.  krtld will reset its
1600	 * internal module search path iff Solaris has set default_path.
1601	 * We want to be sure we're prepending this new directory to the
1602	 * right search path.
1603	 */
1604	current = (default_path == NULL) ? kobj_module_path : default_path;
1605
1606	newlen = strlen(HVM_MOD_DIR) + strlen(current) + 2;
1607	newpath = kmem_alloc(newlen, KM_SLEEP);
1608	(void) strcpy(newpath, HVM_MOD_DIR);
1609	(void) strcat(newpath, " ");
1610	(void) strcat(newpath, current);
1611
1612	default_path = newpath;
1613}
1614#endif
1615
1616static void
1617startup_modules(void)
1618{
1619	int cnt;
1620	extern void prom_setup(void);
1621	int32_t v, h;
1622	char d[11];
1623	char *cp;
1624	cmi_hdl_t hdl;
1625
1626	PRM_POINT("startup_modules() starting...");
1627
1628#ifndef __xpv
1629	/*
	 * Initialize the ten-microsecond timer so that drivers will
	 * not get shortchanged in their init phase. This was
	 * not getting called until clkinit, which on fast CPUs
	 * caused the drv_usecwait to be way too short.
1634	 */
1635	microfind();
1636
1637	if ((get_hwenv() & HW_XEN_HVM) != 0)
1638		update_default_path();
1639#endif
1640
1641	/*
1642	 * Read the GMT lag from /etc/rtc_config.
1643	 */
1644	sgmtl(process_rtc_config_file());
1645
1646	/*
1647	 * Calculate default settings of system parameters based upon
	 * maxusers, yet allow them to be overridden via the /etc/system file.
1649	 */
1650	param_calc(0);
1651
1652	mod_setup();
1653
1654	/*
1655	 * Initialize system parameters.
1656	 */
1657	param_init();
1658
1659	/*
1660	 * Initialize the default brands
1661	 */
1662	brand_init();
1663
1664	/*
1665	 * maxmem is the amount of physical memory we're playing with.
1666	 */
1667	maxmem = physmem;
1668
1669	/*
1670	 * Initialize segment management stuff.
1671	 */
1672	seg_init();
1673
1674	if (modload("fs", "specfs") == -1)
1675		halt("Can't load specfs");
1676
1677	if (modload("fs", "devfs") == -1)
1678		halt("Can't load devfs");
1679
1680	if (modload("fs", "dev") == -1)
1681		halt("Can't load dev");
1682
1683	if (modload("fs", "procfs") == -1)
1684		halt("Can't load procfs");
1685
1686	(void) modloadonly("sys", "lbl_edition");
1687
1688	dispinit();
1689
1690	/* Read cluster configuration data. */
1691	clconf_init();
1692
1693#if defined(__xpv)
1694	(void) ec_init();
1695	gnttab_init();
1696	(void) xs_early_init();
1697#endif /* __xpv */
1698
1699	/*
1700	 * Create a kernel device tree. First, create rootnex and
1701	 * then invoke bus specific code to probe devices.
1702	 */
1703	setup_ddi();
1704
1705#ifdef __xpv
1706	if (DOMAIN_IS_INITDOMAIN(xen_info))
1707#endif
1708	{
1709		id_t smid;
1710		smbios_system_t smsys;
1711		smbios_info_t sminfo;
1712		char *mfg;
1713		/*
1714		 * Load the System Management BIOS into the global ksmbios
1715		 * handle, if an SMBIOS is present on this system.
1716		 * Also set "si-hw-provider" property, if not already set.
1717		 */
1718		ksmbios = smbios_open(NULL, SMB_VERSION, ksmbios_flags, NULL);
1719		if (ksmbios != NULL &&
1720		    ((smid = smbios_info_system(ksmbios, &smsys)) != SMB_ERR) &&
1721		    (smbios_info_common(ksmbios, smid, &sminfo)) != SMB_ERR) {
1722			mfg = (char *)sminfo.smbi_manufacturer;
1723			if (BOP_GETPROPLEN(bootops, "si-hw-provider") < 0) {
1724				extern char hw_provider[];
1725				int i;
1726				for (i = 0; i < SYS_NMLN; i++) {
1727					if (isprint(mfg[i]))
1728						hw_provider[i] = mfg[i];
1729					else {
1730						hw_provider[i] = '\0';
1731						break;
1732					}
1733				}
1734				hw_provider[SYS_NMLN - 1] = '\0';
1735			}
1736		}
1737	}
1738
1739
1740	/*
1741	 * Originally clconf_init() apparently needed the hostid.  But
1742	 * this no longer appears to be true - it uses its own nodeid.
1743	 * By placing the hostid logic here, we are able to make use of
1744	 * the SMBIOS UUID.
1745	 */
1746	if ((h = set_soft_hostid()) == HW_INVALID_HOSTID) {
1747		cmn_err(CE_WARN, "Unable to set hostid");
1748	} else {
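		/*
		 * Render the hostid as a decimal string in hw_serial: the
		 * first loop peels off digits least-significant first, the
		 * second writes them back out most-significant first.
		 */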
1749		for (v = h, cnt = 0; cnt < 10; cnt++) {
1750			d[cnt] = (char)(v % 10);
1751			v /= 10;
1752			if (v == 0)
1753				break;
1754		}
1755		for (cp = hw_serial; cnt >= 0; cnt--)
1756			*cp++ = d[cnt] + '0';
1757		*cp = 0;
1758	}
1759
1760	/*
1761	 * Set up the CPU module subsystem for the boot cpu in the native
1762	 * case, and all physical cpu resource in the xpv dom0 case.
1763	 * Modifies the device tree, so this must be done after
1764	 * setup_ddi().
1765	 */
1766#ifdef __xpv
1767	/*
1768	 * If paravirtualized and on dom0 then we initialize all physical
1769	 * cpu handles now;  if paravirtualized on a domU then do not
1770	 * initialize.
1771	 */
1772	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
1773		xen_mc_lcpu_cookie_t cpi;
1774
1775		for (cpi = xen_physcpu_next(NULL); cpi != NULL;
1776		    cpi = xen_physcpu_next(cpi)) {
1777			if ((hdl = cmi_init(CMI_HDL_SOLARIS_xVM_MCA,
1778			    xen_physcpu_chipid(cpi), xen_physcpu_coreid(cpi),
1779			    xen_physcpu_strandid(cpi))) != NULL &&
1780			    is_x86_feature(x86_featureset, X86FSET_MCA))
1781				cmi_mca_init(hdl);
1782		}
1783	}
1784#else
1785	/*
1786	 * Initialize a handle for the boot cpu - others will initialize
1787	 * as they startup.
1788	 */
1789	if ((hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU),
1790	    cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL) {
1791		if (is_x86_feature(x86_featureset, X86FSET_MCA))
1792			cmi_mca_init(hdl);
1793		CPU->cpu_m.mcpu_cmi_hdl = hdl;
1794	}
1795#endif	/* __xpv */
1796
1797	/*
1798	 * Fake a prom tree such that /dev/openprom continues to work
1799	 */
1800	PRM_POINT("startup_modules: calling prom_setup...");
1801	prom_setup();
1802	PRM_POINT("startup_modules: done");
1803
1804	/*
1805	 * Load all platform specific modules
1806	 */
1807	PRM_POINT("startup_modules: calling psm_modload...");
1808	psm_modload();
1809
1810	PRM_POINT("startup_modules() done");
1811}
1812
1813/*
1814 * claim a "setaside" boot page for use in the kernel
1815 */
1816page_t *
1817boot_claim_page(pfn_t pfn)
1818{
1819	page_t *pp;
1820
1821	pp = page_numtopp_nolock(pfn);
1822	ASSERT(pp != NULL);
1823
1824	if (PP_ISBOOTPAGES(pp)) {
1825		if (pp->p_next != NULL)
1826			pp->p_next->p_prev = pp->p_prev;
1827		if (pp->p_prev == NULL)
1828			bootpages = pp->p_next;
1829		else
1830			pp->p_prev->p_next = pp->p_next;
1831	} else {
1832		/*
1833		 * htable_attach() expects a base pagesize page
1834		 */
1835		if (pp->p_szc != 0)
1836			page_boot_demote(pp);
1837		pp = page_numtopp(pfn, SE_EXCL);
1838	}
1839	return (pp);
1840}
1841
1842/*
1843 * Walk through the pagetables looking for pages mapped in by boot.  If the
1844 * setaside flag is set the pages are expected to be returned to the
1845 * kernel later in boot, so we add them to the bootpages list.
1846 */
1847static void
1848protect_boot_range(uintptr_t low, uintptr_t high, int setaside)
1849{
1850	uintptr_t va = low;
1851	size_t len;
1852	uint_t prot;
1853	pfn_t pfn;
1854	page_t *pp;
1855	pgcnt_t boot_protect_cnt = 0;
1856
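	/*
	 * Walk each range that boot has mapped: kbm_probe() returns the
	 * virtual address, length, starting pfn and protection of the
	 * next boot mapping at or above va.
	 */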
1857	while (kbm_probe(&va, &len, &pfn, &prot) != 0 && va < high) {
1858		if (va + len >= high)
1859			panic("0x%lx byte mapping at 0x%p exceeds boot's "
1860			    "legal range.", len, (void *)va);
1861
1862		while (len > 0) {
1863			pp = page_numtopp_alloc(pfn);
1864			if (pp != NULL) {
1865				if (setaside == 0)
1866					panic("Unexpected mapping by boot.  "
1867					    "addr=%p pfn=%lx\n",
1868					    (void *)va, pfn);
1869
1870				pp->p_next = bootpages;
1871				pp->p_prev = NULL;
1872				PP_SETBOOTPAGES(pp);
1873				if (bootpages != NULL) {
1874					bootpages->p_prev = pp;
1875				}
1876				bootpages = pp;
1877				++boot_protect_cnt;
1878			}
1879
1880			++pfn;
1881			len -= MMU_PAGESIZE;
1882			va += MMU_PAGESIZE;
1883		}
1884	}
1885	PRM_DEBUG(boot_protect_cnt);
1886}
1887
1888/*
1889 * Establish the final size of the kernel's heap, size of segmap, segkp, etc.
1890 */
1891static void
1892layout_kernel_va(void)
1893{
1894	const size_t physmem_size = mmu_ptob(physmem);
1895	size_t size;
1896
1897	PRM_POINT("layout_kernel_va() starting...");
1898
1899	kpm_vbase = (caddr_t)segkpm_base;
1900	kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1));
1901	if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base)
1902		panic("not enough room for kpm!");
1903	PRM_DEBUG(kpm_size);
1904	PRM_DEBUG(kpm_vbase);
1905
1906	segkp_base = (caddr_t)valloc_base + valloc_sz;
1907	if (!segkp_fromheap) {
1908		size = mmu_ptob(segkpsize);
1909
1910		/*
1911		 * determine size of segkp
1912		 */
1913		if (size < SEGKPMINSIZE || size > SEGKPMAXSIZE) {
1914			size = SEGKPDEFSIZE;
1915			cmn_err(CE_WARN, "!Illegal value for segkpsize. "
1916			    "segkpsize has been reset to %ld pages",
1917			    mmu_btop(size));
1918		}
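		/*
		 * Cap segkp at the larger of SEGKPMINSIZE and the size of
		 * physical memory.
		 */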
1919		size = MIN(size, MAX(SEGKPMINSIZE, physmem_size));
1920
1921		segkpsize = mmu_btop(ROUND_UP_LPAGE(size));
1922	}
1923	PRM_DEBUG(segkp_base);
1924	PRM_DEBUG(segkpsize);
1925
1926	/*
1927	 * segkvmm: backing for vmm guest memory. Like segzio, we have a
1928	 * separate segment for two reasons: it makes it easy to skip our pages
1929	 * on kernel crash dumps, and it helps avoid fragmentation.  With this
1930	 * segment, we're expecting significantly-sized allocations only; we'll
1931	 * default to 4x the size of physmem.
1932	 */
1933	segkvmm_base = segkp_base + mmu_ptob(segkpsize);
1934	size = segkvmmsize != 0 ? mmu_ptob(segkvmmsize) : (physmem_size * 4);
1935
1936	size = MAX(size, SEGVMMMINSIZE);
1937	segkvmmsize = mmu_btop(ROUND_UP_LPAGE(size));
1938
1939	PRM_DEBUG(segkvmmsize);
1940	PRM_DEBUG(segkvmm_base);
1941
1942	/*
1943	 * segzio is used for ZFS cached data.  For segzio, we use 1.5x physmem.
1944	 */
1945	segzio_base = segkvmm_base + mmu_ptob(segkvmmsize);
1946	if (segzio_fromheap) {
1947		segziosize = 0;
1948	} else {
1949		size = (segziosize != 0) ? mmu_ptob(segziosize) :
1950		    (physmem_size * 3) / 2;
1951
1952		size = MAX(size, SEGZIOMINSIZE);
1953		segziosize = mmu_btop(ROUND_UP_LPAGE(size));
1954	}
1955	PRM_DEBUG(segziosize);
1956	PRM_DEBUG(segzio_base);
1957
1958	/*
1959	 * Put the range of VA for device mappings next, kmdb knows to not
1960	 * grep in this range of addresses.
1961	 */
1962	toxic_addr =
1963	    ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize));
1964	PRM_DEBUG(toxic_addr);
1965	segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size);
1966
1967	/*
1968	 * Users can change segmapsize through eeprom. If the variable
1969	 * is tuned through eeprom, there is no upper bound on the
1970	 * size of segmap.
1971	 */
1972	segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT);
1973
1974	PRM_DEBUG(segmap_start);
1975	PRM_DEBUG(segmapsize);
1976	kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize);
1977	PRM_DEBUG(kernelheap);
1978	PRM_POINT("layout_kernel_va() done...");
1979}
1980
1981/*
1982 * Finish initializing the VM system, now that we are no longer
1983 * relying on the boot time memory allocators.
1984 */
1985static void
1986startup_vm(void)
1987{
1988	struct segmap_crargs a;
1989
1990	extern int use_brk_lpg, use_stk_lpg;
1991
1992	PRM_POINT("startup_vm() starting...");
1993
1994	/*
1995	 * Initialize the hat layer.
1996	 */
1997	hat_init();
1998
1999	/*
2000	 * Do final allocations of HAT data structures that need to
2001	 * be allocated before quiescing the boot loader.
2002	 */
2003	PRM_POINT("Calling hat_kern_alloc()...");
2004	hat_kern_alloc((caddr_t)segmap_start, segmapsize, ekernelheap);
2005	PRM_POINT("hat_kern_alloc() done");
2006
2007#ifndef __xpv
2008	/*
2009	 * Setup Page Attribute Table
2010	 */
2011	pat_sync();
2012#endif
2013
2014	/*
2015	 * The next two loops are done in distinct steps in order
2016	 * to be sure that any page that is doubly mapped (both above
2017	 * KERNEL_TEXT and below kernelbase) is dealt with correctly.
2018	 * Note this may never happen, but it might someday.
2019	 */
2020	bootpages = NULL;
2021	PRM_POINT("Protecting boot pages");
2022
2023	/*
2024	 * Protect any pages mapped above KERNEL_TEXT that somehow have
2025	 * page_t's. This can only happen if something weird allocated
2026	 * in this range (like kadb/kmdb).
2027	 */
2028	protect_boot_range(KERNEL_TEXT, (uintptr_t)-1, 0);
2029
2030	/*
2031	 * Before we can take over memory allocation/mapping from the boot
2032	 * loader we must remove from our free page lists any boot allocated
2033	 * pages that stay mapped until release_bootstrap().
2034	 */
2035	protect_boot_range(0, kernelbase, 1);
2036
2037
2038	/*
2039	 * Switch to running on regular HAT (not boot_mmu)
2040	 */
2041	PRM_POINT("Calling hat_kern_setup()...");
2042	hat_kern_setup();
2043
2044	/*
2045	 * It is no longer safe to call BOP_ALLOC(), so make sure we don't.
2046	 */
2047	bop_no_more_mem();
2048
2049	PRM_POINT("hat_kern_setup() done");
2050
2051	hat_cpu_online(CPU);
2052
2053	/*
2054	 * Initialize VM system
2055	 */
2056	PRM_POINT("Calling kvm_init()...");
2057	kvm_init();
2058	PRM_POINT("kvm_init() done");
2059
2060	/*
2061	 * Tell kmdb that the VM system is now working
2062	 */
2063	if (boothowto & RB_DEBUG)
2064		kdi_dvec_vmready();
2065
2066#if defined(__xpv)
2067	/*
2068	 * Populate the I/O pool on domain 0
2069	 */
2070	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
2071		extern long populate_io_pool(void);
2072		long init_io_pool_cnt;
2073
2074		PRM_POINT("Populating reserve I/O page pool");
2075		init_io_pool_cnt = populate_io_pool();
2076		PRM_DEBUG(init_io_pool_cnt);
2077	}
2078#endif
2079	/*
2080	 * Mangle the brand string etc.
2081	 */
2082	cpuid_pass3(CPU);
2083
2084#if defined(__amd64)
2085
2086	/*
2087	 * Create the device arena for toxic (to dtrace/kmdb) mappings.
2088	 */
2089	device_arena = vmem_create("device", (void *)toxic_addr,
2090	    toxic_size, MMU_PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);
2091
2092#else	/* __i386 */
2093
2094	/*
2095	 * allocate the bit map that tracks toxic pages
2096	 */
2097	toxic_bit_map_len = btop((ulong_t)(valloc_base - kernelbase));
2098	PRM_DEBUG(toxic_bit_map_len);
2099	toxic_bit_map =
2100	    kmem_zalloc(BT_SIZEOFMAP(toxic_bit_map_len), KM_NOSLEEP);
2101	ASSERT(toxic_bit_map != NULL);
2102	PRM_DEBUG(toxic_bit_map);
2103
2104#endif	/* __i386 */
2105
2106
2107	/*
2108	 * Now that we've got more VA, as well as the ability to allocate from
2109	 * it, tell the debugger.
2110	 */
2111	if (boothowto & RB_DEBUG)
2112		kdi_dvec_memavail();
2113
2114#if !defined(__xpv)
2115	/*
2116	 * Map page pfn=0 for drivers, such as kd, that need to pick up
2117	 * parameters left there by controllers/BIOS.
2118	 */
	PRM_POINT("setting up p0_va");
2120	p0_va = i86devmap(0, 1, PROT_READ);
2121	PRM_DEBUG(p0_va);
2122#endif
2123
2124	cmn_err(CE_CONT, "?mem = %luK (0x%lx)\n",
2125	    physinstalled << (MMU_PAGESHIFT - 10), ptob(physinstalled));
2126
2127	/*
2128	 * disable automatic large pages for small memory systems or
2129	 * when the disable flag is set.
2130	 *
2131	 * Do not yet consider page sizes larger than 2m/4m.
2132	 */
2133	if (!auto_lpg_disable && mmu.max_page_level > 0) {
2134		max_uheap_lpsize = LEVEL_SIZE(1);
2135		max_ustack_lpsize = LEVEL_SIZE(1);
2136		max_privmap_lpsize = LEVEL_SIZE(1);
2137		max_uidata_lpsize = LEVEL_SIZE(1);
2138		max_utext_lpsize = LEVEL_SIZE(1);
2139		max_shm_lpsize = LEVEL_SIZE(1);
2140	}
2141	if (physmem < privm_lpg_min_physmem || mmu.max_page_level == 0 ||
2142	    auto_lpg_disable) {
2143		use_brk_lpg = 0;
2144		use_stk_lpg = 0;
2145	}
2146	mcntl0_lpsize = LEVEL_SIZE(mmu.umax_page_level);
2147
2148	PRM_POINT("Calling hat_init_finish()...");
2149	hat_init_finish();
2150	PRM_POINT("hat_init_finish() done");
2151
2152	/*
2153	 * Initialize the segkp segment type.
2154	 */
2155	rw_enter(&kas.a_lock, RW_WRITER);
2156	PRM_POINT("Attaching segkp");
2157	if (segkp_fromheap) {
2158		segkp->s_as = &kas;
2159	} else if (seg_attach(&kas, (caddr_t)segkp_base, mmu_ptob(segkpsize),
2160	    segkp) < 0) {
2161		panic("startup: cannot attach segkp");
2162		/*NOTREACHED*/
2163	}
2164	PRM_POINT("Doing segkp_create()");
2165	if (segkp_create(segkp) != 0) {
2166		panic("startup: segkp_create failed");
2167		/*NOTREACHED*/
2168	}
2169	PRM_DEBUG(segkp);
2170	rw_exit(&kas.a_lock);
2171
2172	/*
2173	 * kpm segment
2174	 */
2175	segmap_kpm = 0;
2176	if (kpm_desired)
2177		kpm_init();
2178
2179	/*
2180	 * Now create segmap segment.
2181	 */
2182	rw_enter(&kas.a_lock, RW_WRITER);
2183	if (seg_attach(&kas, (caddr_t)segmap_start, segmapsize, segmap) < 0) {
2184		panic("cannot attach segmap");
2185		/*NOTREACHED*/
2186	}
2187	PRM_DEBUG(segmap);
2188
2189	a.prot = PROT_READ | PROT_WRITE;
2190	a.shmsize = 0;
2191	a.nfreelist = segmapfreelists;
2192
2193	if (segmap_create(segmap, (caddr_t)&a) != 0)
2194		panic("segmap_create segmap");
2195	rw_exit(&kas.a_lock);
2196
2197	setup_vaddr_for_ppcopy(CPU);
2198
2199	segdev_init();
2200#if defined(__xpv)
2201	if (DOMAIN_IS_INITDOMAIN(xen_info))
2202#endif
2203		pmem_init();
2204
2205	PRM_POINT("startup_vm() done");
2206}
2207
2208/*
2209 * Load a tod module for the non-standard tod part found on this system.
2210 */
2211static void
2212load_tod_module(char *todmod)
2213{
2214	if (modload("tod", todmod) == -1)
2215		halt("Can't load TOD module");
2216}
2217
2218static void
2219startup_end(void)
2220{
2221	int i;
2222	extern void setx86isalist(void);
2223	extern void cpu_event_init(void);
2224
2225	PRM_POINT("startup_end() starting...");
2226
2227	/*
2228	 * Perform tasks that get done after most of the VM
2229	 * initialization has been done but before the clock
2230	 * and other devices get started.
2231	 */
2232	kern_setup1();
2233
2234	/*
2235	 * Perform CPC initialization for this CPU.
2236	 */
2237	kcpc_hw_init(CPU);
2238
2239	/*
2240	 * Initialize cpu event framework.
2241	 */
2242	cpu_event_init();
2243
2244#if defined(OPTERON_WORKAROUND_6323525)
2245	if (opteron_workaround_6323525)
2246		patch_workaround_6323525();
2247#endif
2248	/*
2249	 * If needed, load TOD module now so that ddi_get_time(9F) etc. work
	 * (for now, "needed" means tod_module_name is set in /etc/system).
2251	 */
2252	if (tod_module_name != NULL) {
2253		PRM_POINT("load_tod_module()");
2254		load_tod_module(tod_module_name);
2255	}
2256
2257#if defined(__xpv)
2258	/*
2259	 * Forceload interposing TOD module for the hypervisor.
2260	 */
2261	PRM_POINT("load_tod_module()");
2262	load_tod_module("xpvtod");
2263#endif
2264
2265	/*
2266	 * Configure the system.
2267	 */
2268	PRM_POINT("Calling configure()...");
2269	configure();		/* set up devices */
2270	PRM_POINT("configure() done");
2271
2272	/*
2273	 * We can now setup for XSAVE because fpu_probe is done in configure().
2274	 */
2275	if (fp_save_mech == FP_XSAVE) {
2276		xsave_setup_msr(CPU);
2277	}
2278
2279	/*
2280	 * Set the isa_list string to the defined instruction sets we
2281	 * support.
2282	 */
2283	setx86isalist();
2284	cpu_intr_alloc(CPU, NINTR_THREADS);
2285	psm_install();
2286
2287	/*
2288	 * We're done with bootops.  We don't unmap the bootstrap yet because
2289	 * we're still using bootsvcs.
2290	 */
2291	PRM_POINT("NULLing out bootops");
2292	*bootopsp = (struct bootops *)NULL;
2293	bootops = (struct bootops *)NULL;
2294
2295#if defined(__xpv)
2296	ec_init_debug_irq();
2297	xs_domu_init();
2298#endif
2299
2300#if !defined(__xpv)
2301	/*
2302	 * Intel IOMMU has been setup/initialized in ddi_impl.c
2303	 * Start it up now.
2304	 */
2305	immu_startup();
2306
2307	/*
2308	 * Now that we're no longer going to drop into real mode for a BIOS call
2309	 * via bootops, we can enable PCID (which requires CR0.PG).
2310	 */
2311	enable_pcid();
2312#endif
2313
2314	PRM_POINT("Enabling interrupts");
2315	(*picinitf)();
2316	sti();
2317#if defined(__xpv)
2318	ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0);
2319	xen_late_startup();
2320#endif
2321
2322	(void) add_avsoftintr((void *)&softlevel1_hdl, 1, softlevel1,
2323	    "softlevel1", NULL, NULL); /* XXX to be moved later */
2324
2325	/*
2326	 * Register software interrupt handlers for ddi_periodic_add(9F).
2327	 * Software interrupts up to the level 10 are supported.
2328	 */
2329	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
2330		(void) add_avsoftintr((void *)&softlevel_hdl[i-1], i,
2331		    (avfunc)(uintptr_t)ddi_periodic_softintr, "ddi_periodic",
2332		    (caddr_t)(uintptr_t)i, NULL);
2333	}
2334
2335#if !defined(__xpv)
2336	if (modload("drv", "amd_iommu") < 0) {
2337		PRM_POINT("No AMD IOMMU present\n");
2338	} else if (ddi_hold_installed_driver(ddi_name_to_major(
2339	    "amd_iommu")) == NULL) {
2340		prom_printf("ERROR: failed to attach AMD IOMMU\n");
2341	}
2342#endif
2343	post_startup_cpu_fixups();
2344
2345	PRM_POINT("startup_end() done");
2346}
2347
2348/*
2349 * Don't remove the following 2 variables.  They are necessary
2350 * for reading the hostid from the legacy file (/kernel/misc/sysinit).
2351 */
2352char *_hs1107 = hw_serial;
2353ulong_t  _bdhs34;
2354
2355void
2356post_startup(void)
2357{
2358	extern void cpupm_init(cpu_t *);
2359	extern void cpu_event_init_cpu(cpu_t *);
2360
2361	/*
2362	 * Set the system wide, processor-specific flags to be passed
2363	 * to userland via the aux vector for performance hints and
2364	 * instruction set extensions.
2365	 */
2366	bind_hwcap();
2367
2368#ifdef __xpv
2369	if (DOMAIN_IS_INITDOMAIN(xen_info))
2370#endif
2371	{
2372#if defined(__xpv)
2373		xpv_panic_init();
2374#else
2375		/*
2376		 * Startup the memory scrubber.
2377		 * XXPV	This should be running somewhere ..
2378		 */
2379		if ((get_hwenv() & HW_VIRTUAL) == 0)
2380			memscrub_init();
2381#endif
2382	}
2383
2384	/*
2385	 * Complete CPU module initialization
2386	 */
2387	cmi_post_startup();
2388
2389	/*
2390	 * Perform forceloading tasks for /etc/system.
2391	 */
2392	(void) mod_sysctl(SYS_FORCELOAD, NULL);
2393
2394	/*
	 * ON4.0: Force /proc module in until clock interrupt handling is fixed.
	 * ON4.0: This must be fixed or restated in /etc/system.
2397	 */
2398	(void) modload("fs", "procfs");
2399
2400	(void) i_ddi_attach_hw_nodes("pit_beep");
2401
2402#if defined(__i386)
2403	/*
2404	 * Check for required functional Floating Point hardware,
2405	 * unless FP hardware explicitly disabled.
2406	 */
2407	if (fpu_exists && (fpu_pentium_fdivbug || fp_kind == FP_NO))
2408		halt("No working FP hardware found");
2409#endif
2410
2411	maxmem = freemem;
2412
2413	cpu_event_init_cpu(CPU);
2414	cpupm_init(CPU);
2415	(void) mach_cpu_create_device_node(CPU, NULL);
2416
2417	pg_init();
2418}
2419
2420static int
2421pp_in_range(page_t *pp, uint64_t low_addr, uint64_t high_addr)
2422{
2423	return ((pp->p_pagenum >= btop(low_addr)) &&
2424	    (pp->p_pagenum < btopr(high_addr)));
2425}
2426
2427static int
2428pp_in_module(page_t *pp, const rd_existing_t *modranges)
2429{
2430	uint_t i;
2431
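	/* The modranges list is terminated by an entry with phys == 0. */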
2432	for (i = 0; modranges[i].phys != 0; i++) {
2433		if (pp_in_range(pp, modranges[i].phys,
2434		    modranges[i].phys + modranges[i].size))
2435			return (1);
2436	}
2437
2438	return (0);
2439}
2440
2441void
2442release_bootstrap(void)
2443{
2444	int root_is_ramdisk;
2445	page_t *pp;
2446	extern void kobj_boot_unmountroot(void);
2447	extern dev_t rootdev;
2448	uint_t i;
2449	char propname[32];
2450	rd_existing_t *modranges;
2451#if !defined(__xpv)
2452	pfn_t	pfn;
2453#endif
2454
2455	/*
2456	 * Save the bootfs module ranges so that we can reserve them below
2457	 * for the real bootfs.
2458	 */
2459	modranges = kmem_alloc(sizeof (rd_existing_t) * MAX_BOOT_MODULES,
2460	    KM_SLEEP);
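	/*
	 * Boot exports each module as a module-addr-N/module-size-N
	 * property pair; the list built here is terminated by an entry
	 * whose phys address is 0.
	 */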
2461	for (i = 0; ; i++) {
2462		uint64_t start, size;
2463
2464		modranges[i].phys = 0;
2465
2466		(void) snprintf(propname, sizeof (propname),
2467		    "module-addr-%u", i);
2468		if (do_bsys_getproplen(NULL, propname) <= 0)
2469			break;
2470		(void) do_bsys_getprop(NULL, propname, &start);
2471
2472		(void) snprintf(propname, sizeof (propname),
2473		    "module-size-%u", i);
2474		if (do_bsys_getproplen(NULL, propname) <= 0)
2475			break;
2476		(void) do_bsys_getprop(NULL, propname, &size);
2477
2478		modranges[i].phys = start;
2479		modranges[i].size = size;
2480	}
2481
2482	/* unmount boot ramdisk and release kmem usage */
2483	kobj_boot_unmountroot();
2484
2485	/*
2486	 * We're finished using the boot loader so free its pages.
2487	 */
2488	PRM_POINT("Unmapping lower boot pages");
2489
2490	clear_boot_mappings(0, _userlimit);
2491
2492	postbootkernelbase = kernelbase;
2493
2494	/*
2495	 * If root isn't on ramdisk, destroy the hardcoded
2496	 * ramdisk node now and release the memory. Else,
2497	 * ramdisk memory is kept in rd_pages.
2498	 */
2499	root_is_ramdisk = (getmajor(rootdev) == ddi_name_to_major("ramdisk"));
2500	if (!root_is_ramdisk) {
2501		dev_info_t *dip = ddi_find_devinfo("ramdisk", -1, 0);
2502		ASSERT(dip && ddi_get_parent(dip) == ddi_root_node());
2503		ndi_rele_devi(dip);	/* held from ddi_find_devinfo */
2504		(void) ddi_remove_child(dip, 0);
2505	}
2506
2507	PRM_POINT("Releasing boot pages");
2508	while (bootpages) {
2509		extern uint64_t ramdisk_start, ramdisk_end;
2510		pp = bootpages;
2511		bootpages = pp->p_next;
2512
2513
2514		/* Keep pages for the lower 64K */
2515		if (pp_in_range(pp, 0, 0x40000)) {
2516			pp->p_next = lower_pages;
2517			lower_pages = pp;
2518			lower_pages_count++;
2519			continue;
2520		}
2521
		if ((root_is_ramdisk && pp_in_range(pp, ramdisk_start,
		    ramdisk_end)) || pp_in_module(pp, modranges)) {
2524			pp->p_next = rd_pages;
2525			rd_pages = pp;
2526			continue;
2527		}
2528		pp->p_next = (struct page *)0;
2529		pp->p_prev = (struct page *)0;
2530		PP_CLRBOOTPAGES(pp);
2531		page_free(pp, 1);
2532	}
2533	PRM_POINT("Boot pages released");
2534
	kmem_free(modranges, sizeof (rd_existing_t) * MAX_BOOT_MODULES);
2536
2537#if !defined(__xpv)
/* XXPV -- note the following bunch of code needs to be revisited in Xen 3.0 */
2539	/*
2540	 * Find 1 page below 1 MB so that other processors can boot up or
2541	 * so that any processor can resume.
2542	 * Make sure it has a kernel VA as well as a 1:1 mapping.
2543	 * We should have just free'd one up.
2544	 */
2545
2546	/*
2547	 * 0x10 pages is 64K.  Leave the bottom 64K alone
2548	 * for BIOS.
2549	 */
2550	for (pfn = 0x10; pfn < btop(1*1024*1024); pfn++) {
2551		if (page_numtopp_alloc(pfn) == NULL)
2552			continue;
2553		rm_platter_va = i86devmap(pfn, 1,
2554		    PROT_READ | PROT_WRITE | PROT_EXEC);
2555		rm_platter_pa = ptob(pfn);
2556		break;
2557	}
2558	if (pfn == btop(1*1024*1024) && use_mp)
2559		panic("No page below 1M available for starting "
2560		    "other processors or for resuming from system-suspend");
2561#endif	/* !__xpv */
2562}
2563
2564/*
2565 * Initialize the platform-specific parts of a page_t.
2566 */
2567void
2568add_physmem_cb(page_t *pp, pfn_t pnum)
2569{
2570	pp->p_pagenum = pnum;
2571	pp->p_mapping = NULL;
2572	pp->p_embed = 0;
2573	pp->p_share = 0;
2574	pp->p_mlentry = 0;
2575}
2576
2577/*
2578 * kphysm_init() initializes physical memory.
2579 */
2580static pgcnt_t
2581kphysm_init(page_t *pp, pgcnt_t npages)
2582{
2583	struct memlist	*pmem;
2584	struct memseg	*cur_memseg;
2585	pfn_t		base_pfn;
2586	pfn_t		end_pfn;
2587	pgcnt_t		num;
2588	pgcnt_t		pages_done = 0;
2589	uint64_t	addr;
2590	uint64_t	size;
2591	extern pfn_t	ddiphysmin;
2592	extern int	mnode_xwa;
2593	int		ms = 0, me = 0;
2594
2595	ASSERT(page_hash != NULL && page_hashsz != 0);
2596
2597	cur_memseg = memseg_base;
2598	for (pmem = phys_avail; pmem && npages; pmem = pmem->ml_next) {
2599		/*
		 * In a 32-bit kernel we can't use higher memory if we're
		 * not booting in PAE mode.  This check takes care of that.
2602		 */
2603		addr = pmem->ml_address;
2604		size = pmem->ml_size;
2605		if (btop(addr) > physmax)
2606			continue;
2607
2608		/*
2609		 * align addr and size - they may not be at page boundaries
2610		 */
2611		if ((addr & MMU_PAGEOFFSET) != 0) {
2612			addr += MMU_PAGEOFFSET;
2613			addr &= ~(uint64_t)MMU_PAGEOFFSET;
2614			size -= addr - pmem->ml_address;
2615		}
2616
2617		/* only process pages below or equal to physmax */
2618		if ((btop(addr + size) - 1) > physmax)
2619			size = ptob(physmax - btop(addr) + 1);
2620
2621		num = btop(size);
2622		if (num == 0)
2623			continue;
2624
2625		if (num > npages)
2626			num = npages;
2627
2628		npages -= num;
2629		pages_done += num;
2630		base_pfn = btop(addr);
2631
2632		if (prom_debug)
2633			prom_printf("MEMSEG addr=0x%" PRIx64
2634			    " pgs=0x%lx pfn 0x%lx-0x%lx\n",
2635			    addr, num, base_pfn, base_pfn + num);
2636
2637		/*
2638		 * Ignore pages below ddiphysmin to simplify ddi memory
2639		 * allocation with non-zero addr_lo requests.
2640		 */
2641		if (base_pfn < ddiphysmin) {
2642			if (base_pfn + num <= ddiphysmin)
2643				continue;
2644			pp += (ddiphysmin - base_pfn);
2645			num -= (ddiphysmin - base_pfn);
2646			base_pfn = ddiphysmin;
2647		}
2648
2649		/*
2650		 * mnode_xwa is greater than 1 when large pages regions can
2651		 * cross memory node boundaries. To prevent the formation
2652		 * of these large pages, configure the memsegs based on the
2653		 * memory node ranges which had been made non-contiguous.
2654		 */
2655		end_pfn = base_pfn + num - 1;
2656		if (mnode_xwa > 1) {
2657			ms = PFN_2_MEM_NODE(base_pfn);
2658			me = PFN_2_MEM_NODE(end_pfn);
2659
2660			if (ms != me) {
2661				/*
2662				 * current range spans more than 1 memory node.
2663				 * Set num to only the pfn range in the start
2664				 * memory node.
2665				 */
2666				num = mem_node_config[ms].physmax - base_pfn
2667				    + 1;
2668				ASSERT(end_pfn > mem_node_config[ms].physmax);
2669			}
2670		}
2671
2672		for (;;) {
2673			/*
2674			 * Build the memsegs entry
2675			 */
2676			cur_memseg->pages = pp;
2677			cur_memseg->epages = pp + num;
2678			cur_memseg->pages_base = base_pfn;
2679			cur_memseg->pages_end = base_pfn + num;
2680
2681			/*
2682			 * Insert into memseg list in decreasing pfn range
2683			 * order. Low memory is typically more fragmented such
2684			 * that this ordering keeps the larger ranges at the
2685			 * front of the list for code that searches memseg.
2686			 * This ASSERTS that the memsegs coming in from boot
2687			 * are in increasing physical address order and not
2688			 * contiguous.
2689			 */
2690			if (memsegs != NULL) {
2691				ASSERT(cur_memseg->pages_base >=
2692				    memsegs->pages_end);
2693				cur_memseg->next = memsegs;
2694			}
2695			memsegs = cur_memseg;
2696
2697			/*
2698			 * add_physmem() initializes the PSM part of the page
2699			 * struct by calling the PSM back with add_physmem_cb().
2700			 * In addition it coalesces pages into larger pages as
2701			 * it initializes them.
2702			 */
2703			add_physmem(pp, num, base_pfn);
2704			cur_memseg++;
2705			availrmem_initial += num;
2706			availrmem += num;
2707
2708			pp += num;
2709			if (ms >= me)
2710				break;
2711
2712			/* process next memory node range */
2713			ms++;
2714			base_pfn = mem_node_config[ms].physbase;
2715
2716			if (mnode_xwa > 1) {
2717				num = MIN(mem_node_config[ms].physmax,
2718				    end_pfn) - base_pfn + 1;
2719			} else {
2720				num = mem_node_config[ms].physmax -
2721				    base_pfn + 1;
2722			}
2723		}
2724	}
2725
2726	PRM_DEBUG(availrmem_initial);
2727	PRM_DEBUG(availrmem);
2728	PRM_DEBUG(freemem);
2729	build_pfn_hash();
2730	return (pages_done);
2731}
2732
2733/*
2734 * Kernel VM initialization.
2735 */
2736static void
2737kvm_init(void)
2738{
2739	ASSERT((((uintptr_t)s_text) & MMU_PAGEOFFSET) == 0);
2740
2741	/*
2742	 * Put the kernel segments in kernel address space.
2743	 */
2744	rw_enter(&kas.a_lock, RW_WRITER);
2745	as_avlinit(&kas);
2746
2747	(void) seg_attach(&kas, s_text, e_moddata - s_text, &ktextseg);
2748	(void) segkmem_create(&ktextseg);
2749
2750	(void) seg_attach(&kas, (caddr_t)valloc_base, valloc_sz, &kvalloc);
2751	(void) segkmem_create(&kvalloc);
2752
2753	(void) seg_attach(&kas, kernelheap,
2754	    ekernelheap - kernelheap, &kvseg);
2755	(void) segkmem_create(&kvseg);
2756
2757	if (core_size > 0) {
2758		PRM_POINT("attaching kvseg_core");
2759		(void) seg_attach(&kas, (caddr_t)core_base, core_size,
2760		    &kvseg_core);
2761		(void) segkmem_create(&kvseg_core);
2762	}
2763
2764	PRM_POINT("attaching segkvmm");
2765	(void) seg_attach(&kas, segkvmm_base, mmu_ptob(segkvmmsize), &kvmmseg);
2766	(void) segkmem_create(&kvmmseg);
2767	segkmem_kvmm_init(segkvmm_base, mmu_ptob(segkvmmsize));
2768
2769	if (segziosize > 0) {
2770		PRM_POINT("attaching segzio");
2771		(void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
2772		    &kzioseg);
2773		(void) segkmem_create(&kzioseg);
2774
2775		/* create zio area covering new segment */
2776		segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
2777	}
2778
2779	(void) seg_attach(&kas, kdi_segdebugbase, kdi_segdebugsize, &kdebugseg);
2780	(void) segkmem_create(&kdebugseg);
2781
2782	rw_exit(&kas.a_lock);
2783
2784	/*
2785	 * Ensure that the red zone at kernelbase is never accessible.
2786	 */
2787	PRM_POINT("protecting redzone");
2788	(void) as_setprot(&kas, (caddr_t)kernelbase, KERNEL_REDZONE_SIZE, 0);
2789
2790	/*
2791	 * Make the text writable so that it can be hot patched by DTrace.
2792	 */
2793	(void) as_setprot(&kas, s_text, e_modtext - s_text,
2794	    PROT_READ | PROT_WRITE | PROT_EXEC);
2795
2796	/*
2797	 * Make data writable until end.
2798	 */
2799	(void) as_setprot(&kas, s_data, e_moddata - s_data,
2800	    PROT_READ | PROT_WRITE | PROT_EXEC);
2801}
2802
2803#ifndef __xpv
2804/*
2805 * Solaris adds an entry for Write Combining caching to the PAT
2806 */
2807static uint64_t pat_attr_reg = PAT_DEFAULT_ATTRIBUTE;
2808
2809void
2810pat_sync(void)
2811{
2812	ulong_t	cr0, cr0_orig, cr4;
2813
2814	if (!is_x86_feature(x86_featureset, X86FSET_PAT))
2815		return;
2816	cr0_orig = cr0 = getcr0();
2817	cr4 = getcr4();
2818
2819	/* disable caching and flush all caches and TLBs */
2820	cr0 |= CR0_CD;
2821	cr0 &= ~CR0_NW;
2822	setcr0(cr0);
2823	invalidate_cache();
2824	if (cr4 & CR4_PGE) {
2825		setcr4(cr4 & ~(ulong_t)CR4_PGE);
2826		setcr4(cr4);
2827	} else {
2828		reload_cr3();
2829	}
2830
2831	/* add our entry to the PAT */
2832	wrmsr(REG_PAT, pat_attr_reg);
2833
2834	/* flush TLBs and cache again, then reenable cr0 caching */
2835	if (cr4 & CR4_PGE) {
2836		setcr4(cr4 & ~(ulong_t)CR4_PGE);
2837		setcr4(cr4);
2838	} else {
2839		reload_cr3();
2840	}
2841	invalidate_cache();
2842	setcr0(cr0_orig);
2843}
2844
2845#endif /* !__xpv */
2846
2847#if defined(_SOFT_HOSTID)
2848/*
2849 * On platforms that do not have a hardware serial number, attempt
2850 * to set one based on the contents of /etc/hostid.  If this file does
2851 * not exist, assume that we are to generate a new hostid and set
2852 * it in the kernel, for subsequent saving by a userland process
2853 * once the system is up and the root filesystem is mounted r/w.
2854 *
2855 * In order to gracefully support upgrade on OpenSolaris, if
2856 * /etc/hostid does not exist, we will attempt to get a serial number
2857 * using the legacy method (/kernel/misc/sysinit).
2858 *
2859 * If that isn't present, we attempt to use an SMBIOS UUID, which is
2860 * a hardware serial number.  Note that we don't automatically trust
2861 * all SMBIOS UUIDs (some older platforms are defective and ship duplicate
 * UUIDs in violation of the standard), so we check against a blacklist.
2863 *
2864 * In an attempt to make the hostid less prone to abuse
2865 * (for license circumvention, etc), we store it in /etc/hostid
2866 * in rot47 format.
2867 */
2868extern volatile unsigned long tenmicrodata;
2869static int atoi(char *);
2870
2871/*
2872 * Set this to non-zero in /etc/system if you think your SMBIOS returns a
2873 * UUID that is not unique. (Also report it so that the smbios_uuid_blacklist
2874 * array can be updated.)
2875 */
2876int smbios_broken_uuid = 0;
2877
2878/*
 * List of known bad UUIDs.  The full 16-byte UUIDs are stored here and
 * compared against the system's SMBIOS UUID.  If your UUID is listed here,
 * you need to contact your hardware OEM for a fix for your BIOS.
2882 */
2883static unsigned char
2884smbios_uuid_blacklist[][16] = {
2885
2886	{	/* Reported bad UUID (Google search) */
2887		0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00, 0x05,
2888		0x00, 0x06, 0x00, 0x07, 0x00, 0x08, 0x00, 0x09,
2889	},
2890	{	/* Known bad DELL UUID */
2891		0x4C, 0x4C, 0x45, 0x44, 0x00, 0x00, 0x20, 0x10,
2892		0x80, 0x20, 0x80, 0xC0, 0x4F, 0x20, 0x20, 0x20,
2893	},
2894	{	/* Uninitialized flash */
2895		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
2896		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
2897	},
2898	{	/* All zeros */
2899		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2900		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
2901	},
2902};
2903
2904static int32_t
2905uuid_to_hostid(const uint8_t *uuid)
2906{
2907	/*
	 * Although the UUIDs are 128 bits, they may not distribute entropy
	 * evenly.  We would like to use SHA or MD5, but those are located
	 * in loadable modules and not available this early in boot.  As we
	 * don't need the values to be cryptographically strong, we just
	 * generate a 32-bit value by xor'ing the various sequences together,
	 * which ensures that the entire UUID contributes to the hostid.
2914	 */
2915	uint32_t	id = 0;
2916
2917	/* first check against the blacklist */
2918	for (int i = 0; i < (sizeof (smbios_uuid_blacklist) / 16); i++) {
		if (bcmp(smbios_uuid_blacklist[i], uuid, 16) == 0) {
2920			cmn_err(CE_CONT, "?Broken SMBIOS UUID. "
2921			    "Contact BIOS manufacturer for repair.\n");
2922			return ((int32_t)HW_INVALID_HOSTID);
2923		}
2924	}
2925
2926	for (int i = 0; i < 16; i++)
2927		id ^= ((uuid[i]) << (8 * (i % sizeof (id))));
2928
2929	/* Make sure return value is positive */
2930	return (id & 0x7fffffff);
2931}
2932
2933static int32_t
2934set_soft_hostid(void)
2935{
2936	struct _buf *file;
2937	char tokbuf[MAXNAMELEN];
2938	token_t token;
2939	int done = 0;
2940	u_longlong_t tmp;
2941	int i;
2942	int32_t hostid = (int32_t)HW_INVALID_HOSTID;
2943	unsigned char *c;
2944	hrtime_t tsc;
2945	smbios_system_t smsys;
2946
2947	/*
2948	 * If /etc/hostid file not found, we'd like to get a pseudo
	 * random number to use as the hostid.  A nice way to do this
2950	 * is to read the real time clock.  To remain xen-compatible,
2951	 * we can't poke the real hardware, so we use tsc_read() to
2952	 * read the real time clock.  However, there is an ominous
2953	 * warning in tsc_read that says it can return zero, so we
2954	 * deal with that possibility by falling back to using the
2955	 * (hopefully random enough) value in tenmicrodata.
2956	 */
2957
2958	if ((file = kobj_open_file(hostid_file)) == (struct _buf *)-1) {
2959		/*
2960		 * hostid file not found - try to load sysinit module
2961		 * and see if it has a nonzero hostid value...use that
2962		 * instead of generating a new hostid here if so.
2963		 */
2964		if ((i = modload("misc", "sysinit")) != -1) {
2965			if (strlen(hw_serial) > 0)
2966				hostid = (int32_t)atoi(hw_serial);
2967			(void) modunload(i);
2968		}
2969
2970		/*
2971		 * We try to use the SMBIOS UUID. But not if it is blacklisted
2972		 * in /etc/system.
2973		 */
2974		if ((hostid == HW_INVALID_HOSTID) &&
2975		    (smbios_broken_uuid == 0) &&
2976		    (ksmbios != NULL) &&
2977		    (smbios_info_system(ksmbios, &smsys) != SMB_ERR) &&
2978		    (smsys.smbs_uuidlen >= 16)) {
2979			hostid = uuid_to_hostid(smsys.smbs_uuid);
2980		}
2981
2982		/*
2983		 * Generate a "random" hostid using the clock.  These
2984		 * hostids will change on each boot if the value is not
2985		 * saved to a persistent /etc/hostid file.
2986		 */
2987		if (hostid == HW_INVALID_HOSTID) {
2988			tsc = tsc_read();
2989			if (tsc == 0)	/* tsc_read can return zero sometimes */
2990				hostid = (int32_t)tenmicrodata & 0x0CFFFFF;
2991			else
2992				hostid = (int32_t)tsc & 0x0CFFFFF;
2993		}
2994	} else {
2995		/* hostid file found */
2996		while (!done) {
2997			token = kobj_lex(file, tokbuf, sizeof (tokbuf));
2998
2999			switch (token) {
3000			case POUND:
3001				/*
3002				 * skip comments
3003				 */
3004				kobj_find_eol(file);
3005				break;
3006			case STRING:
3007				/*
3008				 * un-rot47 - obviously this
3009				 * nonsense is ascii-specific
3010				 */
3011				for (c = (unsigned char *)tokbuf;
3012				    *c != '\0'; c++) {
3013					*c += 47;
3014					if (*c > '~')
3015						*c -= 94;
3016					else if (*c < '!')
3017						*c += 94;
3018				}
3019				/*
3020				 * now we should have a real number
3021				 */
3022
3023				if (kobj_getvalue(tokbuf, &tmp) != 0)
3024					kobj_file_err(CE_WARN, file,
3025					    "Bad value %s for hostid",
3026					    tokbuf);
3027				else
3028					hostid = (int32_t)tmp;
3029
3030				break;
3031			case EOF:
3032				done = 1;
3033				/* FALLTHROUGH */
3034			case NEWLINE:
3035				kobj_newline(file);
3036				break;
3037			default:
3038				break;
3039
3040			}
3041		}
3042		if (hostid == HW_INVALID_HOSTID) /* didn't find a hostid */
3043			kobj_file_err(CE_WARN, file,
3044			    "hostid missing or corrupt");
3045
3046		kobj_close_file(file);
3047	}
3048	/*
3049	 * hostid is now the value read from /etc/hostid, or the
3050	 * new hostid we generated in this routine or HW_INVALID_HOSTID if not
3051	 * set.
3052	 */
3053	return (hostid);
3054}
3055
3056static int
3057atoi(char *p)
3058{
3059	int i = 0;
3060
3061	while (*p != '\0')
3062		i = 10 * i + (*p++ - '0');
3063
3064	return (i);
3065}
3066
3067#endif /* _SOFT_HOSTID */
3068
3069void
3070get_system_configuration(void)
3071{
3072	char	prop[32];
3073	u_longlong_t nodes_ll, cpus_pernode_ll, lvalue;
3074
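	/*
	 * Pull optional tunables from boot properties, falling back to
	 * defaults when a property is missing or cannot be parsed.
	 */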
3075	if (BOP_GETPROPLEN(bootops, "nodes") > sizeof (prop) ||
3076	    BOP_GETPROP(bootops, "nodes", prop) < 0 ||
3077	    kobj_getvalue(prop, &nodes_ll) == -1 ||
3078	    nodes_ll > MAXNODES ||
3079	    BOP_GETPROPLEN(bootops, "cpus_pernode") > sizeof (prop) ||
3080	    BOP_GETPROP(bootops, "cpus_pernode", prop) < 0 ||
3081	    kobj_getvalue(prop, &cpus_pernode_ll) == -1) {
3082		system_hardware.hd_nodes = 1;
3083		system_hardware.hd_cpus_per_node = 0;
3084	} else {
3085		system_hardware.hd_nodes = (int)nodes_ll;
3086		system_hardware.hd_cpus_per_node = (int)cpus_pernode_ll;
3087	}
3088
3089	if (BOP_GETPROPLEN(bootops, "kernelbase") > sizeof (prop) ||
3090	    BOP_GETPROP(bootops, "kernelbase", prop) < 0 ||
3091	    kobj_getvalue(prop, &lvalue) == -1)
3092		eprom_kernelbase = 0;
3093	else
3094		eprom_kernelbase = (uintptr_t)lvalue;
3095
3096	if (BOP_GETPROPLEN(bootops, "segmapsize") > sizeof (prop) ||
3097	    BOP_GETPROP(bootops, "segmapsize", prop) < 0 ||
3098	    kobj_getvalue(prop, &lvalue) == -1)
3099		segmapsize = SEGMAPDEFAULT;
3100	else
3101		segmapsize = (uintptr_t)lvalue;
3102
3103	if (BOP_GETPROPLEN(bootops, "segmapfreelists") > sizeof (prop) ||
3104	    BOP_GETPROP(bootops, "segmapfreelists", prop) < 0 ||
3105	    kobj_getvalue(prop, &lvalue) == -1)
3106		segmapfreelists = 0;	/* use segmap driver default */
3107	else
3108		segmapfreelists = (int)lvalue;
3109
3110	/* physmem used to be here, but moved much earlier to fakebop.c */
3111}
3112
3113/*
3114 * Add to a memory list.
3115 * start = start of new memory segment
3116 * len = length of new memory segment in bytes
3117 * new = pointer to a new struct memlist
3118 * memlistp = memory list to which to add segment.
3119 */
3120void
3121memlist_add(
3122	uint64_t start,
3123	uint64_t len,
3124	struct memlist *new,
3125	struct memlist **memlistp)
3126{
3127	struct memlist *cur;
3128	uint64_t end = start + len;
3129
3130	new->ml_address = start;
3131	new->ml_size = len;
3132
3133	cur = *memlistp;
3134
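	/*
	 * Insert the new entry so the list stays sorted by ascending
	 * address; the ASSERT below checks that the new range does not
	 * overlap an existing entry.
	 */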
3135	while (cur) {
3136		if (cur->ml_address >= end) {
3137			new->ml_next = cur;
3138			*memlistp = new;
3139			new->ml_prev = cur->ml_prev;
3140			cur->ml_prev = new;
3141			return;
3142		}
3143		ASSERT(cur->ml_address + cur->ml_size <= start);
3144		if (cur->ml_next == NULL) {
3145			cur->ml_next = new;
3146			new->ml_prev = cur;
3147			new->ml_next = NULL;
3148			return;
3149		}
3150		memlistp = &cur->ml_next;
3151		cur = cur->ml_next;
3152	}
3153}
3154
3155void
3156kobj_vmem_init(vmem_t **text_arena, vmem_t **data_arena)
3157{
3158	size_t tsize = e_modtext - modtext;
3159	size_t dsize = e_moddata - moddata;
3160
3161	*text_arena = vmem_create("module_text", tsize ? modtext : NULL, tsize,
3162	    1, segkmem_alloc, segkmem_free, heaptext_arena, 0, VM_SLEEP);
3163	*data_arena = vmem_create("module_data", dsize ? moddata : NULL, dsize,
3164	    1, segkmem_alloc, segkmem_free, heap32_arena, 0, VM_SLEEP);
3165}
3166
3167caddr_t
3168kobj_text_alloc(vmem_t *arena, size_t size)
3169{
3170	return (vmem_alloc(arena, size, VM_SLEEP | VM_BESTFIT));
3171}
3172
3173/*ARGSUSED*/
3174caddr_t
3175kobj_texthole_alloc(caddr_t addr, size_t size)
3176{
3177	panic("unexpected call to kobj_texthole_alloc()");
3178	/*NOTREACHED*/
3179	return (0);
3180}
3181
3182/*ARGSUSED*/
3183void
3184kobj_texthole_free(caddr_t addr, size_t size)
3185{
3186	panic("unexpected call to kobj_texthole_free()");
3187}
3188
3189/*
3190 * This is called just after configure() in startup().
3191 *
3192 * The ISALIST concept is a bit hopeless on Intel, because
3193 * there's no guarantee of an ever-more-capable processor
3194 * given that various parts of the instruction set may appear
3195 * and disappear between different implementations.
3196 *
3197 * While it would be possible to correct it and even enhance
3198 * it somewhat, the explicit hardware capability bitmask allows
3199 * more flexibility.
3200 *
3201 * So, we just leave this alone.
3202 */
3203void
3204setx86isalist(void)
3205{
3206	char *tp;
3207	size_t len;
3208	extern char *isa_list;
3209
3210#define	TBUFSIZE	1024
3211
3212	tp = kmem_alloc(TBUFSIZE, KM_SLEEP);
3213	*tp = '\0';
3214
3215#if defined(__amd64)
3216	(void) strcpy(tp, "amd64 ");
3217#endif
3218
3219	switch (x86_vendor) {
3220	case X86_VENDOR_Intel:
3221	case X86_VENDOR_AMD:
3222	case X86_VENDOR_TM:
3223		if (is_x86_feature(x86_featureset, X86FSET_CMOV)) {
3224			/*
3225			 * Pentium Pro or later
3226			 */
3227			(void) strcat(tp, "pentium_pro");
3228			(void) strcat(tp,
3229			    is_x86_feature(x86_featureset, X86FSET_MMX) ?
3230			    "+mmx pentium_pro " : " ");
3231		}
3232		/*FALLTHROUGH*/
3233	case X86_VENDOR_Cyrix:
3234		/*
3235		 * The Cyrix 6x86 does not have any Pentium features
3236		 * accessible while not at privilege level 0.
3237		 */
3238		if (is_x86_feature(x86_featureset, X86FSET_CPUID)) {
3239			(void) strcat(tp, "pentium");
3240			(void) strcat(tp,
3241			    is_x86_feature(x86_featureset, X86FSET_MMX) ?
3242			    "+mmx pentium " : " ");
3243		}
3244		break;
3245	default:
3246		break;
3247	}
3248	(void) strcat(tp, "i486 i386 i86");
	len = strlen(tp) + 1;   /* account for NUL at end of string */
3250	isa_list = strcpy(kmem_alloc(len, KM_SLEEP), tp);
3251	kmem_free(tp, TBUFSIZE);
3252
3253#undef TBUFSIZE
3254}
3255
3256
3257#ifdef __amd64
3258
3259void *
3260device_arena_alloc(size_t size, int vm_flag)
3261{
3262	return (vmem_alloc(device_arena, size, vm_flag));
3263}
3264
3265void
3266device_arena_free(void *vaddr, size_t size)
3267{
3268	vmem_free(device_arena, vaddr, size);
3269}
3270
3271#else /* __i386 */
3272
3273void *
3274device_arena_alloc(size_t size, int vm_flag)
3275{
3276	caddr_t	vaddr;
3277	uintptr_t v;
3278	size_t	start;
3279	size_t	end;
3280
3281	vaddr = vmem_alloc(heap_arena, size, vm_flag);
3282	if (vaddr == NULL)
3283		return (NULL);
3284
3285	v = (uintptr_t)vaddr;
3286	ASSERT(v >= kernelbase);
3287	ASSERT(v + size <= valloc_base);
3288
3289	start = btop(v - kernelbase);
3290	end = btop(v + size - 1 - kernelbase);
3291	ASSERT(start < toxic_bit_map_len);
3292	ASSERT(end < toxic_bit_map_len);
3293
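	/* Mark every page of the allocation in the toxic bit map. */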
3294	while (start <= end) {
3295		BT_ATOMIC_SET(toxic_bit_map, start);
3296		++start;
3297	}
3298	return (vaddr);
3299}
3300
3301void
3302device_arena_free(void *vaddr, size_t size)
3303{
3304	uintptr_t v = (uintptr_t)vaddr;
3305	size_t	start;
3306	size_t	end;
3307
3308	ASSERT(v >= kernelbase);
3309	ASSERT(v + size <= valloc_base);
3310
3311	start = btop(v - kernelbase);
3312	end = btop(v + size - 1 - kernelbase);
3313	ASSERT(start < toxic_bit_map_len);
3314	ASSERT(end < toxic_bit_map_len);
3315
3316	while (start <= end) {
3317		ASSERT(BT_TEST(toxic_bit_map, start) != 0);
3318		BT_ATOMIC_CLEAR(toxic_bit_map, start);
3319		++start;
3320	}
3321	vmem_free(heap_arena, vaddr, size);
3322}
3323
3324/*
 * Returns the first address in the range that is in the device arena, or
 * NULL if there is none.  If len is not NULL, the length of the toxic
 * range is returned through it.
3327 */
3328void *
3329device_arena_contains(void *vaddr, size_t size, size_t *len)
3330{
3331	uintptr_t v = (uintptr_t)vaddr;
3332	uintptr_t eaddr = v + size;
3333	size_t start;
3334	size_t end;
3335
3336	/*
3337	 * if called very early by kmdb, just return NULL
3338	 */
3339	if (toxic_bit_map == NULL)
3340		return (NULL);
3341
3342	/*
3343	 * First check if we're completely outside the bitmap range.
3344	 */
3345	if (v >= valloc_base || eaddr < kernelbase)
3346		return (NULL);
3347
3348	/*
3349	 * Trim ends of search to look at only what the bitmap covers.
3350	 */
3351	if (v < kernelbase)
3352		v = kernelbase;
3353	start = btop(v - kernelbase);
3354	end = btop(eaddr - kernelbase);
3355	if (end >= toxic_bit_map_len)
3356		end = toxic_bit_map_len;
3357
3358	if (bt_range(toxic_bit_map, &start, &end, end) == 0)
3359		return (NULL);
3360
3361	v = kernelbase + ptob(start);
3362	if (len != NULL)
3363		*len = ptob(end - start);
3364	return ((void *)v);
3365}
3366
3367#endif	/* __i386 */
3368