1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/types.h>
27#include <vm/hat.h>
28#include <vm/hat_sfmmu.h>
29#include <vm/page.h>
30#include <sys/pte.h>
31#include <sys/systm.h>
32#include <sys/mman.h>
33#include <sys/sysmacros.h>
34#include <sys/machparam.h>
35#include <sys/vtrace.h>
36#include <sys/kmem.h>
37#include <sys/mmu.h>
38#include <sys/cmn_err.h>
39#include <sys/cpu.h>
40#include <sys/cpuvar.h>
41#include <sys/debug.h>
42#include <sys/lgrp.h>
43#include <sys/archsystm.h>
44#include <sys/machsystm.h>
45#include <sys/vmsystm.h>
46#include <sys/bitmap.h>
47#include <vm/as.h>
48#include <vm/seg.h>
49#include <vm/seg_kmem.h>
50#include <vm/seg_kp.h>
51#include <vm/seg_kpm.h>
52#include <vm/rm.h>
53#include <vm/vm_dep.h>
54#include <sys/t_lock.h>
55#include <sys/vm_machparam.h>
56#include <sys/promif.h>
57#include <sys/prom_isa.h>
58#include <sys/prom_plat.h>
59#include <sys/prom_debug.h>
60#include <sys/privregs.h>
61#include <sys/bootconf.h>
62#include <sys/memlist.h>
63#include <sys/memlist_plat.h>
64#include <sys/cpu_module.h>
65#include <sys/reboot.h>
66#include <sys/kdi.h>
67
/*
 * Static routines (file-local; all three are defined later in this file)
 */
static void	sfmmu_map_prom_mappings(struct translation *, size_t);
static struct translation *read_prom_mappings(size_t *);
static void	sfmmu_reloc_trap_handler(void *, void *, size_t);

/*
 * External routines
 */
extern void sfmmu_remap_kernel(void);
extern void sfmmu_patch_utsb(void);

/*
 * Global Data:
 */
extern caddr_t	textva, datava;
extern tte_t	ktext_tte, kdata_tte;	/* ttes for kernel text and data */
/*
 * When enable_bigktsb is zero the 8K kernel TSB is carved out of nucleus
 * data by ndata_alloc_tsbs(); otherwise sfmmu_ktsb_alloc() obtains it from
 * the prom.  calc_tsb_sizes() may clear it.
 */
extern int	enable_bigktsb;
/*
 * When kmem64_smchunks is nonzero the kmem64 special cases in va_to_pfn()
 * and sfmmu_map_prom_mappings() are bypassed.
 */
extern int	kmem64_smchunks;

uint64_t memsegspa = (uintptr_t)MSEG_NULLPTR_PA; /* memsegs physical linkage */
uint64_t memseg_phash[N_MEM_SLOTS];	/* use physical memseg addresses */

/* Set to 1 by hat_kern_setup() once the prom mappings have been loaded. */
int	sfmmu_kern_mapped = 0;

/*
 * DMMU primary context register for the kernel context. Machine specific code
 * inserts correct page size codes when necessary
 */
uint64_t kcontextreg = KCONTEXT;

#ifdef DEBUG
/*
 * Set by ndata_extra_base() when the last free nucleus chunk does not
 * end at the address the caller expected.
 */
static int ndata_middle_hole_detected = 0;
#endif

/* Extern Global Data */

/* Asserted to still be 0 while sfmmu_map_prom_mappings() runs. */
extern int page_relocate_ready;

/*
 * Controls the logic which enables the use of the
 * QUAD_LDD_PHYS ASI for TSB accesses.
 */
extern int	ktsb_phys;
113
114/*
115 * Global Routines called from within:
116 *	usr/src/uts/sun4u
117 *	usr/src/uts/sfmmu
118 *	usr/src/uts/sun
119 */
120
121pfn_t
122va_to_pfn(void *vaddr)
123{
124	u_longlong_t physaddr;
125	int mode, valid;
126
127	if (tba_taken_over)
128		return (hat_getpfnum(kas.a_hat, (caddr_t)vaddr));
129
130#if !defined(C_OBP)
131	if (!kmem64_smchunks &&
132	    (caddr_t)vaddr >= kmem64_base && (caddr_t)vaddr < kmem64_end) {
133		if (kmem64_pabase == (uint64_t)-1)
134			prom_panic("va_to_pfn: kmem64_pabase not init");
135		physaddr = kmem64_pabase + ((caddr_t)vaddr - kmem64_base);
136		return ((pfn_t)physaddr >> MMU_PAGESHIFT);
137	}
138#endif	/* !C_OBP */
139
140	if ((prom_translate_virt(vaddr, &valid, &physaddr, &mode) != -1) &&
141	    (valid == -1)) {
142		return ((pfn_t)(physaddr >> MMU_PAGESHIFT));
143	}
144	return (PFN_INVALID);
145}
146
147uint64_t
148va_to_pa(void *vaddr)
149{
150	pfn_t pfn;
151
152	if ((pfn = va_to_pfn(vaddr)) == PFN_INVALID)
153		return ((uint64_t)-1);
154	return (((uint64_t)pfn << MMU_PAGESHIFT) |
155	    ((uint64_t)vaddr & MMU_PAGEOFFSET));
156}
157
/*
 * Build the kernel's own hat state from the mappings the prom has set up,
 * in preparation for taking the MMU over from the prom.  The trap table
 * itself is taken over elsewhere (see step 5 below).  On return
 * sfmmu_kern_mapped is 1.
 */
void
hat_kern_setup(void)
{
	struct translation *trans_root;
	size_t ntrans_root;
	extern void startup_fixup_physavail(void);

	/*
	 * These are the steps we take to take over the mmu from the prom.
	 *
	 * (1)	Read the prom's mappings through the translation property.
	 * (2)	Remap the kernel text and kernel data with 2 locked 4MB ttes.
	 *	Create the hmeblks for these 2 ttes at this time.
	 * (3)	Create hat structures for all other prom mappings.  Since the
	 *	kernel text and data hme_blks have already been created we
	 *	skip the equivalent prom's mappings.
	 * (4)	Initialize the tsb and its corresponding hardware regs.
	 * (5)	Take over the trap table (currently in startup).
	 * (6)	Up to this point it is possible the prom required some of its
	 *	locked tte's.  Now that we own the trap table we remove them.
	 */

	/* Record the physical addresses of both kernel TSBs. */
	ktsb_pbase = va_to_pa(ktsb_base);
	ktsb4m_pbase = va_to_pa(ktsb4m_base);
	PRM_DEBUG(ktsb_pbase);
	PRM_DEBUG(ktsb4m_pbase);

	sfmmu_patch_ktsb();
	sfmmu_patch_utsb();
	sfmmu_patch_mmu_asi(ktsb_phys);

	sfmmu_init_tsbs();

	if (kpm_enable) {
		sfmmu_kpm_patch_tlbm();
		if (kpm_smallpages == 0) {
			sfmmu_kpm_patch_tsbm();
		}
	}

	/* NOTE(review): the shctx patch is applied only when shctx is off. */
	if (!shctx_on) {
		sfmmu_patch_shctx();
	}

	/*
	 * The 8K-indexed kernel TSB space is used to hold
	 * translations below...
	 * (read_prom_mappings() reads the property into ktsb_base.)
	 */
	trans_root = read_prom_mappings(&ntrans_root);
	sfmmu_remap_kernel();
	startup_fixup_physavail();
	mmu_init_kernel_pgsz(kas.a_hat);
	sfmmu_map_prom_mappings(trans_root, ntrans_root);

	/*
	 * We invalidate 8K kernel TSB because we used it in
	 * sfmmu_map_prom_mappings()
	 */
	sfmmu_inv_tsb(ktsb_base, ktsb_sz);
	sfmmu_inv_tsb(ktsb4m_base, ktsb4m_sz);

	sfmmu_init_ktsbinfo();


	sfmmu_kern_mapped = 1;

	/*
	 * hments have been created for mapped pages, and thus we're ready
	 * for kmdb to start using its own trap table.  It walks the hments
	 * to resolve TLB misses, and can't be used until they're ready.
	 */
	if (boothowto & RB_DEBUG)
		kdi_dvec_vmready();
}
232
/*
 * Macro used below to convert the prom's 32-bit high and low fields into
 * a value appropriate for the 64-bit kernel.
 */

#define	COMBINE(hi, lo) (((uint64_t)(uint32_t)(hi) << 32) | (uint32_t)(lo))

/*
 * Track large pages used.
 * Provides observability for this feature on non-debug kernels.
 * Indexed by page size code; only the entry for the kmem64 page size
 * is written by sfmmu_map_prom_mappings().
 */
ulong_t map_prom_lpcount[MMU_PAGE_SIZES];

/*
 * This function traverses the prom mapping list and creates equivalent
 * mappings in the sfmmu mapping hash.
 *
 *	trans_root	array of prom translation entries
 *	ntrans_root	number of entries in trans_root
 *
 * Every prom translation at or above KERNELBASE is loaded into the kernel
 * hat one 8K page at a time as a locked mapping.  Inconsistencies between
 * the prom's view and the kernel's (address also on the virt-avail list,
 * uncached memory page, i/o page without side-effect, conflicting existing
 * mapping, page on the free list) are fatal and reported via prom_panic().
 * Afterwards, if kmem64 is in use and not in small chunks, the kmem64
 * range is loaded separately using its own page size.
 */
static void
sfmmu_map_prom_mappings(struct translation *trans_root, size_t ntrans_root)
{
	struct translation *promt;
	tte_t	tte, oldtte, *ttep;
	pfn_t	pfn, oldpfn, basepfn;
	caddr_t vaddr;
	size_t	size, offset;
	unsigned long i;
	uint_t	attr;
	page_t *pp;
	extern struct memlist *virt_avail;
	char buf[256];

	ttep = &tte;
	for (i = 0, promt = trans_root; i < ntrans_root; i++, promt++) {
		ASSERT(promt->tte_hi != 0);
		ASSERT32(promt->virt_hi == 0 && promt->size_hi == 0);

		vaddr = (caddr_t)COMBINE(promt->virt_hi, promt->virt_lo);

		/*
		 * hack until we get rid of map-for-unix
		 */
		if (vaddr < (caddr_t)KERNELBASE)
			continue;

		/* Start from the prom's own tte for this translation. */
		ttep->tte_inthi = promt->tte_hi;
		ttep->tte_intlo = promt->tte_lo;
		attr = PROC_DATA | HAT_NOSYNC;
#if defined(TTE_IS_GLOBAL)
		if (TTE_IS_GLOBAL(ttep)) {
			/*
			 * The prom better not use global translations
			 * because a user process might use the same
			 * virtual addresses
			 */
			prom_panic("sfmmu_map_prom_mappings: global"
			    " translation");
			/*
			 * NOTE(review): presumably not reached if
			 * prom_panic() does not return -- confirm.
			 */
			TTE_SET_LOFLAGS(ttep, TTE_GLB_INT, 0);
		}
#endif
		if (TTE_IS_LOCKED(ttep)) {
			/* clear the lock bits */
			TTE_CLR_LOCKED(ttep);
		}
		/* Carry the prom tte's cacheability/side-effect/IE bits over. */
		attr |= (TTE_IS_VCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEVTTE;
		attr |= (TTE_IS_PCACHEABLE(ttep)) ? 0 : SFMMU_UNCACHEPTTE;
		attr |= (TTE_IS_SIDEFFECT(ttep)) ? SFMMU_SIDEFFECT : 0;
		attr |= (TTE_IS_IE(ttep)) ? HAT_STRUCTURE_LE : 0;

		size = COMBINE(promt->size_hi, promt->size_lo);
		offset = 0;
		basepfn = TTE_TO_PFN((caddr_t)COMBINE(promt->virt_hi,
		    promt->virt_lo), ttep);
		/* Walk the translation; size counts down, offset counts up. */
		while (size) {
			vaddr = (caddr_t)(COMBINE(promt->virt_hi,
			    promt->virt_lo) + offset);

			/*
			 * make sure address is not in virt-avail list
			 */
			if (address_in_memlist(virt_avail, (uint64_t)vaddr,
			    size)) {
				prom_panic("sfmmu_map_prom_mappings:"
				    " inconsistent translation/avail lists");
			}

			pfn = basepfn + mmu_btop(offset);
			if (pf_is_memory(pfn)) {
				if (attr & SFMMU_UNCACHEPTTE) {
					prom_panic("sfmmu_map_prom_mappings:"
					    " uncached prom memory page");
				}
			} else {
				if (!(attr & SFMMU_SIDEFFECT)) {
					prom_panic("sfmmu_map_prom_mappings:"
					    " prom i/o page without"
					    " side-effect");
				}
			}

			/*
			 * skip kmem64 area
			 *
			 * Without C_OBP a prom mapping inside kmem64 is
			 * fatal.  With C_OBP the mapping must match
			 * kmem64_pabase and the rest of the kmem64 range is
			 * stepped over; it is loaded after the main loop.
			 */
			if (!kmem64_smchunks &&
			    vaddr >= kmem64_base &&
			    vaddr < kmem64_aligned_end) {
#if !defined(C_OBP)
				prom_panic("sfmmu_map_prom_mappings:"
				    " unexpected kmem64 prom mapping");
#else	/* !C_OBP */
				size_t mapsz;

				if (ptob(pfn) !=
				    kmem64_pabase + (vaddr - kmem64_base)) {
					prom_panic("sfmmu_map_prom_mappings:"
					    " unexpected kmem64 prom mapping");
				}

				mapsz = kmem64_aligned_end - vaddr;
				if (mapsz >= size) {
					break;
				}
				size -= mapsz;
				offset += mapsz;
				continue;
#endif	/* !C_OBP */
			}

			oldpfn = sfmmu_vatopfn(vaddr, KHATID, &oldtte);
			ASSERT(oldpfn != PFN_SUSPENDED);
			ASSERT(page_relocate_ready == 0);

			if (oldpfn != PFN_INVALID) {
				/*
				 * mapping already exists.
				 * Verify they are equal
				 */
				if (pfn != oldpfn) {
					(void) snprintf(buf, sizeof (buf),
					"sfmmu_map_prom_mappings: mapping"
					" conflict (va = 0x%p, pfn = 0x%p,"
					" oldpfn = 0x%p)", (void *)vaddr,
					    (void *)pfn, (void *)oldpfn);
					prom_panic(buf);
				}
				size -= MMU_PAGESIZE;
				offset += MMU_PAGESIZE;
				continue;
			}

			/* pp may be NULL; it is passed through to the load. */
			pp = page_numtopp_nolock(pfn);
			if ((pp != NULL) && PP_ISFREE((page_t *)pp)) {
				(void) snprintf(buf, sizeof (buf),
				"sfmmu_map_prom_mappings: prom-mapped"
				" page (va = 0x%p, pfn = 0x%p) on free list",
				    (void *)vaddr, (void *)pfn);
				prom_panic(buf);
			}

			/* Rebuild the tte as an 8K mapping and load it locked. */
			sfmmu_memtte(ttep, pfn, attr, TTE8K);
			sfmmu_tteload(kas.a_hat, ttep, vaddr, pp,
			    HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD);
			size -= MMU_PAGESIZE;
			offset += MMU_PAGESIZE;
		}
	}

	/*
	 * We claimed kmem64 from prom, so now we need to load tte.
	 */
	if (!kmem64_smchunks && kmem64_base != NULL) {
		pgcnt_t pages;
		size_t psize;
		int pszc;

		pszc = kmem64_szc;
#ifdef sun4u
		/* On sun4u, anything larger than 8K uses segkmem_lpszc. */
		if (pszc > TTE8K) {
			pszc = segkmem_lpszc;
		}
#endif	/* sun4u */
		psize = TTEBYTES(pszc);
		pages = btop(psize);
		basepfn = kmem64_pabase >> MMU_PAGESHIFT;
		vaddr = kmem64_base;
		/* One locked tte of size pszc per psize bytes of kmem64. */
		while (vaddr < kmem64_end) {
			sfmmu_memtte(ttep, basepfn,
			    PROC_DATA | HAT_NOSYNC, pszc);
			sfmmu_tteload(kas.a_hat, ttep, vaddr, NULL,
			    HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD);
			vaddr += psize;
			basepfn += pages;
		}
		/* Number of pszc-sized pages covering kmem64, rounded up. */
		map_prom_lpcount[pszc] =
		    ((caddr_t)P2ROUNDUP((uintptr_t)kmem64_end, psize) -
		    kmem64_base) >> TTE_PAGE_SHIFT(pszc);
	}
}

#undef COMBINE	/* local to previous routine */
432
433/*
434 * This routine reads in the "translations" property in to a buffer and
435 * returns a pointer to this buffer and the number of translations.
436 */
437static struct translation *
438read_prom_mappings(size_t *ntransrootp)
439{
440	char *prop = "translations";
441	size_t translen;
442	pnode_t node;
443	struct translation *transroot;
444
445	/*
446	 * the "translations" property is associated with the mmu node
447	 */
448	node = (pnode_t)prom_getphandle(prom_mmu_ihandle());
449
450	/*
451	 * We use the TSB space to read in the prom mappings.  This space
452	 * is currently not being used because we haven't taken over the
453	 * trap table yet.  It should be big enough to hold the mappings.
454	 */
455	if ((translen = prom_getproplen(node, prop)) == -1)
456		cmn_err(CE_PANIC, "no translations property");
457	*ntransrootp = translen / sizeof (*transroot);
458	translen = roundup(translen, MMU_PAGESIZE);
459	PRM_DEBUG(translen);
460	if (translen > TSB_BYTES(ktsb_szcode))
461		cmn_err(CE_PANIC, "not enough space for translations");
462
463	transroot = (struct translation *)ktsb_base;
464	ASSERT(transroot);
465	if (prom_getprop(node, prop, (caddr_t)transroot) == -1) {
466		cmn_err(CE_PANIC, "translations getprop failed");
467	}
468	return (transroot);
469}
470
471/*
472 * Init routine of the nucleus data memory allocator.
473 *
474 * The nucleus data memory allocator is organized in ecache_alignsize'd
475 * memory chunks. Memory allocated by ndata_alloc() will never be freed.
476 *
477 * The ndata argument is used as header of the ndata freelist.
478 * Other freelist nodes are placed in the nucleus memory itself
479 * at the beginning of a free memory chunk. Therefore a freelist
480 * node (struct memlist) must fit into the smallest allocatable
481 * memory chunk (ecache_alignsize bytes).
482 *
483 * The memory interval [base, end] passed to ndata_alloc_init() must be
484 * bzero'd to allow the allocator to return bzero'd memory easily.
485 */
486void
487ndata_alloc_init(struct memlist *ndata, uintptr_t base, uintptr_t end)
488{
489	ASSERT(sizeof (struct memlist) <= ecache_alignsize);
490
491	base = roundup(base, ecache_alignsize);
492	end = end - end % ecache_alignsize;
493
494	ASSERT(base < end);
495
496	ndata->ml_address = base;
497	ndata->ml_size = end - base;
498	ndata->ml_next = NULL;
499	ndata->ml_prev = NULL;
500}
501
502/*
503 * Deliver the size of the largest free memory chunk.
504 */
505size_t
506ndata_maxsize(struct memlist *ndata)
507{
508	size_t chunksize = ndata->ml_size;
509
510	while ((ndata = ndata->ml_next) != NULL) {
511		if (chunksize < ndata->ml_size)
512			chunksize = ndata->ml_size;
513	}
514
515	return (chunksize);
516}
517
518
519/*
520 * Allocate the last properly aligned memory chunk.
521 * This function is called when no more large nucleus memory chunks
522 * will be allocated.  The remaining free nucleus memory at the end
523 * of the nucleus can be added to the phys_avail list.
524 */
525void *
526ndata_extra_base(struct memlist *ndata, size_t alignment, caddr_t endaddr)
527{
528	uintptr_t base;
529	size_t wasteage = 0;
530#ifdef	DEBUG
531	static int called = 0;
532
533	if (called++ > 0)
534		cmn_err(CE_PANIC, "ndata_extra_base() called more than once");
535#endif /* DEBUG */
536
537	/*
538	 * The alignment needs to be a multiple of ecache_alignsize.
539	 */
540	ASSERT((alignment % ecache_alignsize) ==  0);
541
542	while (ndata->ml_next != NULL) {
543		wasteage += ndata->ml_size;
544		ndata = ndata->ml_next;
545	}
546
547	base = roundup(ndata->ml_address, alignment);
548
549	if (base >= ndata->ml_address + ndata->ml_size)
550		return (NULL);
551
552	if ((caddr_t)(ndata->ml_address + ndata->ml_size) != endaddr) {
553#ifdef DEBUG
554		ndata_middle_hole_detected = 1;	/* see if we hit this again */
555#endif
556		return (NULL);
557	}
558
559	if (base == ndata->ml_address) {
560		if (ndata->ml_prev != NULL)
561			ndata->ml_prev->ml_next = NULL;
562		else
563			ndata->ml_size = 0;
564
565		bzero((void *)base, sizeof (struct memlist));
566
567	} else {
568		ndata->ml_size = base - ndata->ml_address;
569		wasteage += ndata->ml_size;
570	}
571	PRM_DEBUG(wasteage);
572
573	return ((void *)base);
574}
575
576/*
577 * Select the best matching buffer, avoid memory fragmentation.
578 */
579static struct memlist *
580ndata_select_chunk(struct memlist *ndata, size_t wanted, size_t alignment)
581{
582	struct memlist *fnd_below = NULL;
583	struct memlist *fnd_above = NULL;
584	struct memlist *fnd_unused = NULL;
585	struct memlist *frlist;
586	uintptr_t base;
587	uintptr_t end;
588	size_t below;
589	size_t above;
590	size_t unused;
591	size_t best_below = ULONG_MAX;
592	size_t best_above = ULONG_MAX;
593	size_t best_unused = ULONG_MAX;
594
595	ASSERT(ndata != NULL);
596
597	/*
598	 * Look for the best matching buffer, avoid memory fragmentation.
599	 * The following strategy is used, try to find
600	 *   1. an exact fitting buffer
601	 *   2. avoid wasting any space below the buffer, take first
602	 *	fitting buffer
603	 *   3. avoid wasting any space above the buffer, take first
604	 *	fitting buffer
605	 *   4. avoid wasting space, take first fitting buffer
606	 *   5. take the last buffer in chain
607	 */
608	for (frlist = ndata; frlist != NULL; frlist = frlist->ml_next) {
609		base = roundup(frlist->ml_address, alignment);
610		end = roundup(base + wanted, ecache_alignsize);
611
612		if (end > frlist->ml_address + frlist->ml_size)
613			continue;
614
615		below = (base - frlist->ml_address) / ecache_alignsize;
616		above = (frlist->ml_address + frlist->ml_size - end) /
617		    ecache_alignsize;
618		unused = below + above;
619
620		if (unused == 0)
621			return (frlist);
622
623		if (frlist->ml_next == NULL)
624			break;
625
626		if (below < best_below) {
627			best_below = below;
628			fnd_below = frlist;
629		}
630
631		if (above < best_above) {
632			best_above = above;
633			fnd_above = frlist;
634		}
635
636		if (unused < best_unused) {
637			best_unused = unused;
638			fnd_unused = frlist;
639		}
640	}
641
642	if (best_below == 0)
643		return (fnd_below);
644	if (best_above == 0)
645		return (fnd_above);
646	if (best_unused < ULONG_MAX)
647		return (fnd_unused);
648
649	return (frlist);
650}
651
/*
 * Nucleus data memory allocator.
 * The granularity of the allocator is ecache_alignsize.
 * See also comment for ndata_alloc_init().
 *
 *	ndata		head of the nucleus freelist
 *	wanted		number of bytes requested
 *	alignment	required alignment of the returned address
 *
 * Returns the allocated address, or NULL if ndata_select_chunk() finds no
 * chunk that can hold the request.  The allocation is carved out of the
 * selected chunk; any space left below and/or above it stays on the
 * freelist.  Freelist nodes other than the head live at the start of the
 * free chunk they describe, so a node that ends up inside the returned
 * buffer is zeroed before returning.
 */
void *
ndata_alloc(struct memlist *ndata, size_t wanted, size_t alignment)
{
	struct memlist *found;
	struct memlist *fnd_above;
	uintptr_t base;
	uintptr_t end;
	size_t below;
	size_t above;

	/*
	 * Look for the best matching buffer, avoid memory fragmentation.
	 */
	if ((found = ndata_select_chunk(ndata, wanted, alignment)) == NULL)
		return (NULL);

	/*
	 * Allocate the nucleus data buffer.
	 */
	base = roundup(found->ml_address, alignment);
	end = roundup(base + wanted, ecache_alignsize);
	ASSERT(end <= found->ml_address + found->ml_size);

	/* Bytes of the chunk left free on either side of [base, end). */
	below = base - found->ml_address;
	above = found->ml_address + found->ml_size - end;
	ASSERT(above == 0 || (above % ecache_alignsize) == 0);

	if (below >= ecache_alignsize) {
		/*
		 * There is free memory below the allocated memory chunk.
		 * The existing node keeps describing that lower part.
		 */
		found->ml_size = below - below % ecache_alignsize;

		if (above) {
			/* Create a new node at 'end' for the upper remainder. */
			fnd_above = (struct memlist *)end;
			fnd_above->ml_address = end;
			fnd_above->ml_size = above;

			if ((fnd_above->ml_next = found->ml_next) != NULL)
				found->ml_next->ml_prev = fnd_above;
			fnd_above->ml_prev = found;
			found->ml_next = fnd_above;
		}

		return ((void *)base);
	}

	if (found->ml_prev == NULL) {
		/*
		 * The first chunk (ndata) is selected.
		 * The head node is not stored in nucleus memory, so it is
		 * updated in place rather than unlinked.
		 */
		ASSERT(found == ndata);
		if (above) {
			found->ml_address = end;
			found->ml_size = above;
		} else if (found->ml_next != NULL) {
			/*
			 * Head chunk fully consumed: copy the next node into
			 * the head, unlink it, and zero its old storage.
			 */
			found->ml_address = found->ml_next->ml_address;
			found->ml_size = found->ml_next->ml_size;
			if ((found->ml_next = found->ml_next->ml_next) != NULL)
				found->ml_next->ml_prev = found;

			bzero((void *)found->ml_address,
			    sizeof (struct memlist));
		} else {
			/* Nothing left at all: leave an empty head. */
			found->ml_address = end;
			found->ml_size = 0;
		}

		return ((void *)base);
	}

	/*
	 * Not the first chunk.
	 */
	if (above) {
		/* Replace 'found' in the list with a node for the remainder. */
		fnd_above = (struct memlist *)end;
		fnd_above->ml_address = end;
		fnd_above->ml_size = above;

		if ((fnd_above->ml_next = found->ml_next) != NULL)
			fnd_above->ml_next->ml_prev = fnd_above;
		fnd_above->ml_prev = found->ml_prev;
		found->ml_prev->ml_next = fnd_above;

	} else {
		/* Chunk fully consumed: unlink it. */
		if ((found->ml_prev->ml_next = found->ml_next) != NULL)
			found->ml_next->ml_prev = found->ml_prev;
	}

	/* Zero the old node storage, which is handed to the caller. */
	bzero((void *)found->ml_address, sizeof (struct memlist));

	return ((void *)base);
}
750
751/*
752 * Size the kernel TSBs based upon the amount of physical
753 * memory in the system.
754 */
755static void
756calc_tsb_sizes(pgcnt_t npages)
757{
758	PRM_DEBUG(npages);
759
760	if (npages <= TSB_FREEMEM_MIN) {
761		ktsb_szcode = TSB_128K_SZCODE;
762		enable_bigktsb = 0;
763	} else if (npages <= TSB_FREEMEM_LARGE / 2) {
764		ktsb_szcode = TSB_256K_SZCODE;
765		enable_bigktsb = 0;
766	} else if (npages <= TSB_FREEMEM_LARGE) {
767		ktsb_szcode = TSB_512K_SZCODE;
768		enable_bigktsb = 0;
769	} else if (npages <= TSB_FREEMEM_LARGE * 2 ||
770	    enable_bigktsb == 0) {
771		ktsb_szcode = TSB_1M_SZCODE;
772		enable_bigktsb = 0;
773	} else {
774		ktsb_szcode = highbit(npages - 1);
775		ktsb_szcode -= TSB_START_SIZE;
776		ktsb_szcode = MAX(ktsb_szcode, MIN_BIGKTSB_SZCODE);
777		ktsb_szcode = MIN(ktsb_szcode, MAX_BIGKTSB_SZCODE);
778	}
779
780	/*
781	 * We choose the TSB to hold kernel 4M mappings to have twice
782	 * the reach as the primary kernel TSB since this TSB will
783	 * potentially (currently) be shared by both mappings to all of
784	 * physical memory plus user TSBs. If this TSB has to be in nucleus
785	 * (only for Spitfire and Cheetah) limit its size to 64K.
786	 */
787	ktsb4m_szcode = highbit((2 * npages) / TTEPAGES(TTE4M) - 1);
788	ktsb4m_szcode -= TSB_START_SIZE;
789	ktsb4m_szcode = MAX(ktsb4m_szcode, TSB_MIN_SZCODE);
790	ktsb4m_szcode = MIN(ktsb4m_szcode, TSB_SOFTSZ_MASK);
791	if ((enable_bigktsb == 0 || ktsb_phys == 0) && ktsb4m_szcode >
792	    TSB_64K_SZCODE) {
793		ktsb4m_szcode = TSB_64K_SZCODE;
794		max_bootlp_tteszc = TTE8K;
795	}
796
797	ktsb_sz = TSB_BYTES(ktsb_szcode);	/* kernel 8K tsb size */
798	ktsb4m_sz = TSB_BYTES(ktsb4m_szcode);	/* kernel 4M tsb size */
799}
800
801/*
802 * Allocate kernel TSBs from nucleus data memory.
803 * The function return 0 on success and -1 on failure.
804 */
805int
806ndata_alloc_tsbs(struct memlist *ndata, pgcnt_t npages)
807{
808	/*
809	 * Set ktsb_phys to 1 if the processor supports ASI_QUAD_LDD_PHYS.
810	 */
811	(void) sfmmu_setup_4lp();
812
813	/*
814	 * Size the kernel TSBs based upon the amount of physical
815	 * memory in the system.
816	 */
817	calc_tsb_sizes(npages);
818
819	/*
820	 * Allocate the 8K kernel TSB if it belongs inside the nucleus.
821	 */
822	if (enable_bigktsb == 0) {
823		if ((ktsb_base = ndata_alloc(ndata, ktsb_sz, ktsb_sz)) == NULL)
824			return (-1);
825		ASSERT(!((uintptr_t)ktsb_base & (ktsb_sz - 1)));
826
827		PRM_DEBUG(ktsb_base);
828		PRM_DEBUG(ktsb_sz);
829		PRM_DEBUG(ktsb_szcode);
830	}
831
832	/*
833	 * Next, allocate 4M kernel TSB from the nucleus since it's small.
834	 */
835	if (ktsb4m_szcode <= TSB_64K_SZCODE) {
836
837		ktsb4m_base = ndata_alloc(ndata, ktsb4m_sz, ktsb4m_sz);
838		if (ktsb4m_base == NULL)
839			return (-1);
840		ASSERT(!((uintptr_t)ktsb4m_base & (ktsb4m_sz - 1)));
841
842		PRM_DEBUG(ktsb4m_base);
843		PRM_DEBUG(ktsb4m_sz);
844		PRM_DEBUG(ktsb4m_szcode);
845	}
846
847	return (0);
848}
849
850size_t
851calc_hmehash_sz(pgcnt_t npages)
852{
853	ulong_t hme_buckets;
854
855	/*
856	 * The number of buckets in the hme hash tables
857	 * is a power of 2 such that the average hash chain length is
858	 * HMENT_HASHAVELEN.  The number of buckets for the user hash is
859	 * a function of physical memory and a predefined overmapping factor.
860	 * The number of buckets for the kernel hash is a function of
861	 * physical memory only.
862	 */
863	hme_buckets = (npages * HMEHASH_FACTOR) /
864	    (HMENT_HASHAVELEN * (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT));
865
866	uhmehash_num = (int)MIN(hme_buckets, MAX_UHME_BUCKETS);
867
868	if (uhmehash_num > USER_BUCKETS_THRESHOLD) {
869		/*
870		 * if uhmehash_num is not power of 2 round it down to the
871		 *  next power of 2.
872		 */
873		uint_t align = 1 << (highbit(uhmehash_num - 1) - 1);
874		uhmehash_num = P2ALIGN(uhmehash_num, align);
875	} else
876		uhmehash_num = 1 << highbit(uhmehash_num - 1);
877
878	hme_buckets = npages / (HMEBLK_SPAN(TTE8K) >> MMU_PAGESHIFT);
879	khmehash_num = (int)MIN(hme_buckets, MAX_KHME_BUCKETS);
880	khmehash_num = 1 << highbit(khmehash_num - 1);
881	khmehash_num = MAX(khmehash_num, MIN_KHME_BUCKETS);
882
883	return ((uhmehash_num + khmehash_num) * sizeof (struct hmehash_bucket));
884}
885
886caddr_t
887alloc_hmehash(caddr_t alloc_base)
888{
889	size_t khmehash_sz, uhmehash_sz;
890
891	khme_hash = (struct hmehash_bucket *)alloc_base;
892	khmehash_sz = khmehash_num * sizeof (struct hmehash_bucket);
893	alloc_base += khmehash_sz;
894
895	uhme_hash = (struct hmehash_bucket *)alloc_base;
896	uhmehash_sz = uhmehash_num * sizeof (struct hmehash_bucket);
897	alloc_base += uhmehash_sz;
898
899	PRM_DEBUG(khme_hash);
900	PRM_DEBUG(uhme_hash);
901
902	return (alloc_base);
903}
904
905/*
906 * Allocate hat structs from the nucleus data memory.
907 */
908int
909ndata_alloc_hat(struct memlist *ndata)
910{
911	size_t	cb_alloc_sz;
912
913	cb_alloc_sz = sfmmu_max_cb_id * sizeof (struct sfmmu_callback);
914	PRM_DEBUG(cb_alloc_sz);
915	sfmmu_cb_table = ndata_alloc(ndata, cb_alloc_sz, ecache_alignsize);
916	if (sfmmu_cb_table == NULL)
917		return (-1);
918	PRM_DEBUG(sfmmu_cb_table);
919
920	return (0);
921}
922
923int
924ndata_alloc_kpm(struct memlist *ndata, pgcnt_t kpm_npages)
925{
926	size_t	kpmp_alloc_sz;
927
928	/*
929	 * For the kpm_page mutex array we allocate one mutex every 16
930	 * kpm pages (64MB). In smallpage mode we allocate one mutex
931	 * every 8K pages. The minimum is set to 64 entries and the
932	 * maximum to 8K entries.
933	 */
934	if (kpm_smallpages == 0) {
935		kpmp_shift = highbit(sizeof (kpm_page_t)) - 1;
936		kpmp_table_sz = 1 << highbit(kpm_npages / 16);
937		kpmp_table_sz = (kpmp_table_sz < 64) ? 64 :
938		    ((kpmp_table_sz > 8192) ? 8192 : kpmp_table_sz);
939		kpmp_alloc_sz = kpmp_table_sz * sizeof (kpm_hlk_t);
940
941		kpmp_table = ndata_alloc(ndata, kpmp_alloc_sz,
942		    ecache_alignsize);
943		if (kpmp_table == NULL)
944			return (-1);
945
946		PRM_DEBUG(kpmp_table);
947		PRM_DEBUG(kpmp_table_sz);
948
949		kpmp_stable_sz = 0;
950		kpmp_stable = NULL;
951	} else {
952		ASSERT(kpm_pgsz == PAGESIZE);
953		kpmp_shift = highbit(sizeof (kpm_shlk_t)) + 1;
954		kpmp_stable_sz = 1 << highbit(kpm_npages / 8192);
955		kpmp_stable_sz = (kpmp_stable_sz < 64) ? 64 :
956		    ((kpmp_stable_sz > 8192) ? 8192 : kpmp_stable_sz);
957		kpmp_alloc_sz = kpmp_stable_sz * sizeof (kpm_shlk_t);
958
959		kpmp_stable = ndata_alloc(ndata, kpmp_alloc_sz,
960		    ecache_alignsize);
961		if (kpmp_stable == NULL)
962			return (-1);
963
964		PRM_DEBUG(kpmp_stable);
965		PRM_DEBUG(kpmp_stable_sz);
966
967		kpmp_table_sz = 0;
968		kpmp_table = NULL;
969	}
970	PRM_DEBUG(kpmp_shift);
971
972	return (0);
973}
974
975/*
976 * This function bop allocs kernel TSBs.
977 */
978caddr_t
979sfmmu_ktsb_alloc(caddr_t tsbbase)
980{
981	caddr_t vaddr;
982
983	if (enable_bigktsb) {
984		ktsb_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb_sz);
985		vaddr = prom_alloc(ktsb_base, ktsb_sz, ktsb_sz);
986		if (vaddr != ktsb_base)
987			cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc"
988			    " 8K bigktsb");
989		ktsb_base = vaddr;
990		tsbbase = ktsb_base + ktsb_sz;
991		PRM_DEBUG(ktsb_base);
992		PRM_DEBUG(tsbbase);
993	}
994
995	if (ktsb4m_szcode > TSB_64K_SZCODE) {
996		ASSERT(ktsb_phys && enable_bigktsb);
997		ktsb4m_base = (caddr_t)roundup((uintptr_t)tsbbase, ktsb4m_sz);
998		vaddr = (caddr_t)BOP_ALLOC(bootops, ktsb4m_base, ktsb4m_sz,
999		    ktsb4m_sz);
1000		if (vaddr != ktsb4m_base)
1001			cmn_err(CE_PANIC, "sfmmu_ktsb_alloc: can't alloc"
1002			    " 4M bigktsb");
1003		ktsb4m_base = vaddr;
1004		tsbbase = ktsb4m_base + ktsb4m_sz;
1005		PRM_DEBUG(ktsb4m_base);
1006		PRM_DEBUG(tsbbase);
1007	}
1008	return (tsbbase);
1009}
1010
1011/*
1012 * Moves code assembled outside of the trap table into the trap
1013 * table taking care to relocate relative branches to code outside
1014 * of the trap handler.
1015 */
1016static void
1017sfmmu_reloc_trap_handler(void *tablep, void *start, size_t count)
1018{
1019	size_t i;
1020	uint32_t *src;
1021	uint32_t *dst;
1022	uint32_t inst;
1023	int op, op2;
1024	int32_t offset;
1025	int disp;
1026
1027	src = start;
1028	dst = tablep;
1029	offset = src - dst;
1030	for (src = start, i = 0; i < count; i++, src++, dst++) {
1031		inst = *dst = *src;
1032		op = (inst >> 30) & 0x2;
1033		if (op == 1) {
1034			/* call */
1035			disp = ((int32_t)inst << 2) >> 2; /* sign-extend */
1036			if (disp + i >= 0 && disp + i < count)
1037				continue;
1038			disp += offset;
1039			inst = 0x40000000u | (disp & 0x3fffffffu);
1040			*dst = inst;
1041		} else if (op == 0) {
1042			/* branch or sethi */
1043			op2 = (inst >> 22) & 0x7;
1044
1045			switch (op2) {
1046			case 0x3: /* BPr */
1047				disp = (((inst >> 20) & 0x3) << 14) |
1048				    (inst & 0x3fff);
1049				disp = (disp << 16) >> 16; /* sign-extend */
1050				if (disp + i >= 0 && disp + i < count)
1051					continue;
1052				disp += offset;
1053				if (((disp << 16) >> 16) != disp)
1054					cmn_err(CE_PANIC, "bad reloc");
1055				inst &= ~0x303fff;
1056				inst |= (disp & 0x3fff);
1057				inst |= (disp & 0xc000) << 6;
1058				break;
1059
1060			case 0x2: /* Bicc */
1061				disp = ((int32_t)inst << 10) >> 10;
1062				if (disp + i >= 0 && disp + i < count)
1063					continue;
1064				disp += offset;
1065				if (((disp << 10) >> 10) != disp)
1066					cmn_err(CE_PANIC, "bad reloc");
1067				inst &= ~0x3fffff;
1068				inst |= (disp & 0x3fffff);
1069				break;
1070
1071			case 0x1: /* Bpcc */
1072				disp = ((int32_t)inst << 13) >> 13;
1073				if (disp + i >= 0 && disp + i < count)
1074					continue;
1075				disp += offset;
1076				if (((disp << 13) >> 13) != disp)
1077					cmn_err(CE_PANIC, "bad reloc");
1078				inst &= ~0x7ffff;
1079				inst |= (disp & 0x7ffffu);
1080				break;
1081			}
1082			*dst = inst;
1083		}
1084	}
1085	flush_instr_mem(tablep, count * sizeof (uint32_t));
1086}
1087
1088/*
1089 * Routine to allocate a large page to use in the TSB caches.
1090 */
1091/*ARGSUSED*/
1092static page_t *
1093sfmmu_tsb_page_create(void *addr, size_t size, int vmflag, void *arg)
1094{
1095	int pgflags;
1096
1097	pgflags = PG_EXCL;
1098	if ((vmflag & VM_NOSLEEP) == 0)
1099		pgflags |= PG_WAIT;
1100	if (vmflag & VM_PANIC)
1101		pgflags |= PG_PANIC;
1102	if (vmflag & VM_PUSHPAGE)
1103		pgflags |= PG_PUSHPAGE;
1104
1105	return (page_create_va_large(&kvp, (u_offset_t)(uintptr_t)addr, size,
1106	    pgflags, &kvseg, addr, arg));
1107}
1108
1109/*
1110 * Allocate a large page to back the virtual address range
1111 * [addr, addr + size).  If addr is NULL, allocate the virtual address
1112 * space as well.
1113 */
1114static void *
1115sfmmu_tsb_xalloc(vmem_t *vmp, void *inaddr, size_t size, int vmflag,
1116    uint_t attr, page_t *(*page_create_func)(void *, size_t, int, void *),
1117    void *pcarg)
1118{
1119	page_t *ppl;
1120	page_t *rootpp;
1121	caddr_t addr = inaddr;
1122	pgcnt_t npages = btopr(size);
1123	page_t **ppa;
1124	int i = 0;
1125
1126	/*
1127	 * Assuming that only TSBs will call this with size > PAGESIZE
1128	 * There is no reason why this couldn't be expanded to 8k pages as
1129	 * well, or other page sizes in the future .... but for now, we
1130	 * only support fixed sized page requests.
1131	 */
1132	if ((inaddr == NULL) && ((addr = vmem_xalloc(vmp, size, size, 0, 0,
1133	    NULL, NULL, vmflag)) == NULL))
1134		return (NULL);
1135
1136	if (page_resv(npages, vmflag & VM_KMFLAGS) == 0) {
1137		if (inaddr == NULL)
1138			vmem_xfree(vmp, addr, size);
1139		return (NULL);
1140	}
1141
1142	ppl = page_create_func(addr, size, vmflag, pcarg);
1143	if (ppl == NULL) {
1144		if (inaddr == NULL)
1145			vmem_xfree(vmp, addr, size);
1146		page_unresv(npages);
1147		return (NULL);
1148	}
1149
1150	rootpp = ppl;
1151	ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1152	while (ppl != NULL) {
1153		page_t *pp = ppl;
1154		ppa[i++] = pp;
1155		page_sub(&ppl, pp);
1156		ASSERT(page_iolock_assert(pp));
1157		page_io_unlock(pp);
1158	}
1159
1160	/*
1161	 * Load the locked entry.  It's OK to preload the entry into
1162	 * the TSB since we now support large mappings in the kernel TSB.
1163	 */
1164	hat_memload_array(kas.a_hat, (caddr_t)rootpp->p_offset, size,
1165	    ppa, (PROT_ALL & ~PROT_USER) | HAT_NOSYNC | attr, HAT_LOAD_LOCK);
1166
1167	for (--i; i >= 0; --i) {
1168		(void) page_pp_lock(ppa[i], 0, 1);
1169		page_unlock(ppa[i]);
1170	}
1171
1172	kmem_free(ppa, npages * sizeof (page_t *));
1173	return (addr);
1174}
1175
1176/* Called to import new spans into the TSB vmem arenas */
1177void *
1178sfmmu_tsb_segkmem_alloc(vmem_t *vmp, size_t size, int vmflag)
1179{
1180	lgrp_id_t lgrpid = LGRP_NONE;
1181
1182	if (tsb_lgrp_affinity) {
1183		/*
1184		 * Search for the vmp->lgrpid mapping by brute force;
1185		 * some day vmp will have an lgrp, until then we have
1186		 * to do this the hard way.
1187		 */
1188		for (lgrpid = 0; lgrpid < NLGRPS_MAX &&
1189		    vmp != kmem_tsb_default_arena[lgrpid]; lgrpid++)
1190			;
1191		if (lgrpid == NLGRPS_MAX)
1192			lgrpid = LGRP_NONE;
1193	}
1194
1195	return (sfmmu_tsb_xalloc(vmp, NULL, size, vmflag, 0,
1196	    sfmmu_tsb_page_create, lgrpid != LGRP_NONE? &lgrpid : NULL));
1197}
1198
1199/* Called to free spans from the TSB vmem arenas */
1200void
1201sfmmu_tsb_segkmem_free(vmem_t *vmp, void *inaddr, size_t size)
1202{
1203	page_t *pp;
1204	caddr_t addr = inaddr;
1205	caddr_t eaddr;
1206	pgcnt_t npages = btopr(size);
1207	pgcnt_t pgs_left = npages;
1208	page_t *rootpp = NULL;
1209
1210	hat_unload(kas.a_hat, addr, size, HAT_UNLOAD_UNLOCK);
1211
1212	for (eaddr = addr + size; addr < eaddr; addr += PAGESIZE) {
1213		pp = page_lookup(&kvp, (u_offset_t)(uintptr_t)addr, SE_EXCL);
1214		if (pp == NULL)
1215			panic("sfmmu_tsb_segkmem_free: page not found");
1216
1217		ASSERT(PAGE_EXCL(pp));
1218		page_pp_unlock(pp, 0, 1);
1219
1220		if (rootpp == NULL)
1221			rootpp = pp;
1222		if (--pgs_left == 0) {
1223			/*
1224			 * similar logic to segspt_free_pages, but we know we
1225			 * have one large page.
1226			 */
1227			page_destroy_pages(rootpp);
1228		}
1229	}
1230	page_unresv(npages);
1231
1232	if (vmp != NULL)
1233		vmem_xfree(vmp, inaddr, size);
1234}
1235