1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/types.h>
28#include <sys/sysmacros.h>
29#include <sys/machsystm.h>
30#include <sys/machparam.h>
31#include <sys/cmn_err.h>
32#include <sys/stat.h>
33#include <sys/mach_descrip.h>
34#include <sys/memnode.h>
35#include <sys/mdesc.h>
36#include <sys/mpo.h>
37#include <vm/page.h>
38#include <vm/vm_dep.h>
39#include <vm/hat_sfmmu.h>
40#include <sys/promif.h>
41
42/*
43 * MPO and the sun4v memory representation
44 * ---------------------------------------
45 *
46 * Latency groups are defined in the sun4v architecture by memory-latency-group
47 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
48 * tie together cpu nodes and mblock nodes, and contain mask and match
49 * properties that identify the portion of an mblock that belongs to the
50 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
51 * but an mblock defines Real Addresses (RA).  To translate, the mblock
52 * includes the property address-congruence-offset, hereafter referred to as
53 * ra_to_pa.  A real address ra is a member of an lgroup if
54 *
55 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
56 *
57 * The MD is traversed, and information on all mblocks is kept in the array
58 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
59 * to, is kept in the array mpo_cpu[].
60 *
61 * This implementation makes (and verifies) the simplifying assumption that
62 * the mask bits are the same for all defined lgroups, and that all 1 bits in
63 * the mask are contiguous.  Thus the number of lgroups is bounded by the
64 * number of possible mask values, and the lgrp_handle_t is defined as the
65 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
66 * masks and values are also referred to as "home bits" in the code.
67 *
68 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71 * home bits.  This yields the mem_node.
72 *
73 * Interfaces
74 * ----------
75 *
76 * This file exports the following entry points:
77 *
78 * plat_lgrp_init()
79 * plat_build_mem_nodes()
80 * plat_lgrp_cpu_to_hand()
81 * plat_lgrp_latency()
82 * plat_pfn_to_mem_node()
83 *	These implement the usual platform lgroup interfaces.
84 *
85 * plat_rapfn_to_papfn()
86 *	Recover the PA page coloring bits from an RA.
87 *
88 * plat_mem_node_iterator_init()
89 *	Initialize an iterator to efficiently step through pages in a mem_node.
90 *
91 * plat_mem_node_intersect_range()
92 *	Find the intersection with a mem_node.
93 *
94 * plat_slice_add()
95 * plat_slice_del()
96 *	Platform hooks to add/delete a pfn range.
97 *
98 * Internal Organization
99 * ---------------------
100 *
101 * A number of routines are used by both the boot and DR code to (re)build
102 * the appropriate MPO structures.
103 *
104 * mblock_alloc()
105 *	Allocate memory for mblocks and stripes as
106 *	appropriate for boot or memory DR.
107 *
108 * mblock_free()
109 *	Free memory allocated by mblock_alloc.
110 *
111 * mblock_update()
112 *	Build mblocks based on mblock nodes read from the MD.
113 *
114 * mblock_update_add()
115 *	Rebuild mblocks after a memory DR add operation.
116 *
117 * mblock_update_del()
118 *	Rebuild mblocks after a memory DR delete operation.
119 *
120 * mblock_install()
121 *	Install mblocks as the new configuration.
122 *
123 * mstripe_update()
124 *	Build stripes based on mblocks.
125 *
126 * mnode_update()
127 *	Call memnode layer to add/del a pfn range, based on stripes.
128 *
129 * The platform interfaces allocate all memory required for the
130 * particular update first, block access to the MPO structures
131 * while they are updated, and free old structures after the update.
132 */
133
/*
 * Tunables and status.  sun4v_mpo_enable is the global MPO switch tested
 * (and cleared on failure) by lgrp_traverse(), sun4v_mpo_debug gates the
 * output of MPO_DEBUG(), and sun4v_mpo_status holds the most recent message
 * recorded by MPO_STATUS().
 */
134int	sun4v_mpo_enable = 1;
135int	sun4v_mpo_debug = 0;
136char	sun4v_mpo_status[256] = "";
137
138/* Save CPU info from the MD and associate CPUs with lgroups */
139static	struct cpu_md mpo_cpu[NCPU];
140
141/* Save lgroup info from the MD */
142#define	MAX_MD_LGROUPS 32
143static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
144static	int	n_lgrpnodes = 0;	/* entries in use in mpo_lgroup[] */
145static	int	n_locality_groups = 0;	/* distinct home values found */
146static	int	max_locality_groups = 0; /* values the home mask can encode */
/* NOTE(review): szc_mask0 is not used in the visible part of this file */
147static	int	szc_mask0 = 0;
148
149/* Save mblocks from the MD */
150#define	SMALL_MBLOCKS_COUNT	8
151static	struct	mblock_md *mpo_mblock;
/* presumably static backing store for small configurations -- TODO confirm */
152static	struct	mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
153static	int	n_mblocks = 0;
154
155/* Save mem_node stripes calculated from mblocks and lgroups. */
156static mem_stripe_t *mem_stripes;
157static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
158static	int	n_mem_stripes = 0;
159static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
160static	int	stripe_shift;	/* stride/stripes expressed as a shift */
161static	pfn_t	mnode_pages;	/* mem_node stripe width */
162
163/* Save home mask and shift used to calculate lgrp_handle_t values */
164static	uint64_t home_mask = 0;
165static	pfn_t	home_mask_pfn = 0;
166static	int	home_mask_shift = 0;
167static	uint_t	home_mask_pfn_shift = 0;
168
169/* Save lowest and highest latencies found across all lgroups */
170static	int	lower_latency = 0;
171static	int	higher_latency = 0;
172
173static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
174static	int	mpo_genid;		/* config gen; updated by mem DR */
175static	mpo_config_t mpo_config;	/* current mblocks and stripes */
176
/* Kind of update requested from the mblock/mnode (re)build helpers */
177typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;
178
/* Forward declarations of file-local (static) helpers used before definition */
179static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
180static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
181static	int	fix_interleave(void);
182
/* mblock, stripe and mem_node (re)build helpers; see the block comment above */
183static int  mblock_alloc(mpo_config_t *, update_t, int nmblocks);
184static void mblock_install(mpo_config_t *);
185static void mblock_free(mpo_config_t *);
186static void mblock_update(mpo_config_t *, md_t, mde_cookie_t *mblocknodes);
187static void mblock_update_add(mpo_config_t *);
188static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
189static void mstripe_update(mpo_config_t *);
190static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);
191
/*
 * Debug support.
 *
 * VALIDATE_SLICE(base, end) asserts that the pfn range [base, end] starts on
 * a TTE256M boundary and that its length is a multiple of the TTE256M size.
 * MPO_DEBUG(fmt, ...) forwards to printf() only while sun4v_mpo_debug is
 * non-zero.  Both expand to nothing unless this is a DEBUG, non-lint build.
 */
#if defined(DEBUG) && !defined(lint)
#define	VALIDATE_SLICE(base, end) {					\
	ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M)));		\
	ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M)));	\
}
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	VALIDATE_SLICE(base, end)
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/*
 * Record status message, viewable from mdb.
 *
 * The message is formatted once into sun4v_mpo_status.  It is then handed to
 * MPO_DEBUG as data under an explicit "%s" format, so a '%' that ends up in
 * the recorded text is never interpreted as a conversion specification.
 */
#define	MPO_STATUS(args...) {						      \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);   \
	MPO_DEBUG("%s", sun4v_mpo_status);				      \
}
209
210/*
211 * The MPO locks are to protect the MPO metadata while that
212 * information is updated as a result of a memory DR operation.
213 * The read lock must be acquired to read the metadata and the
214 * write locks must be acquired to update it.
215 */
/*
 * The read side is implemented as disabling/enabling kernel preemption;
 * the write side (mpo_wr_lock/mpo_wr_unlock below) calls pause_cpus() and
 * start_cpus() while holding cpu_lock.
 */
216#define	mpo_rd_lock	kpreempt_disable
217#define	mpo_rd_unlock	kpreempt_enable
218
219static void
220mpo_wr_lock()
221{
222	mutex_enter(&cpu_lock);
223	pause_cpus(NULL, NULL);
224	mutex_exit(&cpu_lock);
225}
226
227static void
228mpo_wr_unlock()
229{
230	mutex_enter(&cpu_lock);
231	start_cpus();
232	mutex_exit(&cpu_lock);
233}
234
235/*
236 * Routine to read a uint64_t from a given md
237 */
238static	int64_t
239get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
240{
241	int err = md_get_prop_val(md, node, propname, val);
242	return (err);
243}
244
245static int
246mblock_cmp(const void *a, const void *b)
247{
248	struct mblock_md *m1 = (struct mblock_md *)a;
249	struct mblock_md *m2 = (struct mblock_md *)b;
250
251	if (m1->base < m2->base)
252		return (-1);
253	else if (m1->base == m2->base)
254		return (0);
255	else
256		return (1);
257}
258
259static void
260mblock_sort(struct mblock_md *mblocks, int n)
261{
262	extern void qsort(void *, size_t, size_t,
263	    int (*)(const void *, const void *));
264
265	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
266}
267
268static void
269mpo_update_tunables(void)
270{
271	int i, ncpu_min;
272
273	/*
274	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
275	 * this process is currently running on before considering
276	 *  expanding threads to another lgroup.
277	 *
278	 * lgrp_expand_proc_diff determines how much less the remote lgroup
279	 *  must be loaded before expanding to it.
280	 *
281	 * On sun4v CMT processors, threads share a core pipeline, and
282	 * at less than 100% utilization, best throughput is obtained by
283	 * spreading threads across more cores, even if some are in a
284	 * different lgroup.  Spread threads to a new lgroup if the
285	 * current group is more than 50% loaded.  Because of virtualization,
286	 * lgroups may have different numbers of CPUs, but the tunables
287	 * apply to all lgroups, so find the smallest lgroup and compute
288	 * 50% loading.
289	 */
290
291	ncpu_min = NCPU;
292	for (i = 0; i < n_lgrpnodes; i++) {
293		int ncpu = mpo_lgroup[i].ncpu;
294		if (ncpu != 0 && ncpu < ncpu_min)
295			ncpu_min = ncpu;
296	}
297	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
298
299	/* new home may only be half as loaded as the existing home to use it */
300	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
301
302	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
303}
304
305static mde_cookie_t
306cpuid_to_cpunode(md_t *md, int cpuid)
307{
308	mde_cookie_t    rootnode, foundnode, *cpunodes;
309	uint64_t	cpuid_prop;
310	int	n_cpunodes, i;
311
312	if (md == NULL)
313		return (MDE_INVAL_ELEM_COOKIE);
314
315	rootnode = md_root_node(md);
316	if (rootnode == MDE_INVAL_ELEM_COOKIE)
317		return (MDE_INVAL_ELEM_COOKIE);
318
319	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
320	    "fwd", &cpunodes);
321	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
322		goto cpuid_fail;
323
324	for (i = 0; i < n_cpunodes; i++) {
325		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
326		    &cpuid_prop))
327			break;
328		if (cpuid_prop == (uint64_t)cpuid) {
329			foundnode = cpunodes[i];
330			md_free_scan_dag(md, &cpunodes);
331			return (foundnode);
332		}
333	}
334cpuid_fail:
335	if (n_cpunodes > 0)
336		md_free_scan_dag(md, &cpunodes);
337	return (MDE_INVAL_ELEM_COOKIE);
338}
339
340static int
341mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
342{
343	mde_cookie_t *nodes;
344	uint64_t latency, lowest_latency;
345	uint64_t address_match, lowest_address_match;
346	int n_lgroups, j, result = 0;
347
348	/* Find lgroup nodes reachable from this cpu */
349	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
350	    "fwd", &nodes);
351
352	lowest_latency = ~(0UL);
353
354	/* Find the lgroup node with the smallest latency */
355	for (j = 0; j < n_lgroups; j++) {
356		result = get_int(md, nodes[j], PROP_LG_LATENCY,
357		    &latency);
358		result |= get_int(md, nodes[j], PROP_LG_MATCH,
359		    &address_match);
360		if (result != 0) {
361			j = -1;
362			goto to_lgrp_done;
363		}
364		if (latency < lowest_latency) {
365			lowest_latency = latency;
366			lowest_address_match = address_match;
367		}
368	}
369	for (j = 0; j < n_lgrpnodes; j++) {
370		if ((mpo_lgroup[j].latency == lowest_latency) &&
371		    (mpo_lgroup[j].addr_match == lowest_address_match))
372			break;
373	}
374	if (j == n_lgrpnodes)
375		j = -1;
376
377to_lgrp_done:
378	if (n_lgroups > 0)
379		md_free_scan_dag(md, &nodes);
380	return (j);
381}
382
383/* Called when DR'ing in a CPU */
384void
385mpo_cpu_add(md_t *md, int cpuid)
386{
387	mde_cookie_t cpunode;
388
389	int i;
390
391	if (n_lgrpnodes <= 0)
392		return;
393
394	if (md == NULL)
395		goto add_fail;
396
397	cpunode = cpuid_to_cpunode(md, cpuid);
398	if (cpunode == MDE_INVAL_ELEM_COOKIE)
399		goto add_fail;
400
401	i = mpo_cpu_to_lgroup(md, cpunode);
402	if (i == -1)
403		goto add_fail;
404
405	mpo_cpu[cpuid].lgrp_index = i;
406	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
407	mpo_lgroup[i].ncpu++;
408	mpo_update_tunables();
409	return;
410add_fail:
411	panic("mpo_cpu_add: Cannot read MD");
412}
413
414/* Called when DR'ing out a CPU */
415void
416mpo_cpu_remove(int cpuid)
417{
418	int i;
419
420	if (n_lgrpnodes <= 0)
421		return;
422
423	i = mpo_cpu[cpuid].lgrp_index;
424	mpo_lgroup[i].ncpu--;
425	mpo_cpu[cpuid].home = 0;
426	mpo_cpu[cpuid].lgrp_index = -1;
427	mpo_update_tunables();
428}
429
430static mde_cookie_t
431md_get_root(md_t *md)
432{
433	mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
434	int n_nodes;
435
436	n_nodes = md_node_count(md);
437
438	if (n_nodes <= 0) {
439		MPO_STATUS("md_get_root: No nodes in node count\n");
440		return (root);
441	}
442
443	root = md_root_node(md);
444
445	if (root == MDE_INVAL_ELEM_COOKIE) {
446		MPO_STATUS("md_get_root: Root node is missing\n");
447		return (root);
448	}
449
450	MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
451	MPO_DEBUG("md_get_root: md: %p\n", md);
452	MPO_DEBUG("md_get_root: root: %lx\n", root);
453done:
454	return (root);
455}
456
457static int
458lgrp_update(md_t *md, mde_cookie_t root)
459{
460	int i, j, result;
461	int ret_val = 0;
462	int sub_page_fix;
463	mde_cookie_t *nodes, *lgrpnodes;
464
465	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
466	    "fwd", &lgrpnodes);
467
468	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
469		MPO_STATUS("lgrp_update: No Lgroups\n");
470		ret_val = -1;
471		goto fail;
472	}
473
474	MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);
475
476	for (i = 0; i < n_lgrpnodes; i++) {
477		mpo_lgroup[i].node = lgrpnodes[i];
478		mpo_lgroup[i].id = i;
479		mpo_lgroup[i].ncpu = 0;
480		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
481		    &mpo_lgroup[i].addr_mask);
482		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
483		    &mpo_lgroup[i].addr_match);
484
485		/*
486		 * If either the mask or match properties are missing, set to 0
487		 */
488		if (result < 0) {
489			mpo_lgroup[i].addr_mask = 0;
490			mpo_lgroup[i].addr_match = 0;
491		}
492
493		/* Set latency to 0 if property not present */
494
495		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
496		    &mpo_lgroup[i].latency);
497		if (result < 0)
498			mpo_lgroup[i].latency = 0;
499	}
500
501	/*
502	 * Sub-page level interleave is not yet supported.  Check for it,
503	 * and remove sub-page interleaved lgroups from mpo_lgroup and
504	 * n_lgrpnodes.  If no lgroups are left, return.
505	 */
506
507	sub_page_fix = fix_interleave();
508	if (n_lgrpnodes == 0) {
509		ret_val = -1;
510		goto fail;
511	}
512
513	/* Ensure that all of the addr_mask values are the same */
514
515	for (i = 0; i < n_lgrpnodes; i++) {
516		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
517			MPO_STATUS("lgrp_update: "
518			    "addr_mask values are not the same\n");
519			ret_val = -1;
520			goto fail;
521		}
522	}
523
524	/*
525	 * Ensure that all lgrp nodes see all the mblocks. However, if
526	 * sub-page interleave is being fixed, they do not, so skip
527	 * the check.
528	 */
529
530	if (sub_page_fix == 0) {
531		for (i = 0; i < n_lgrpnodes; i++) {
532			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
533			    PROP_LG_MBLOCK, "fwd", &nodes);
534			md_free_scan_dag(md, &nodes);
535			if (j != n_mblocks) {
536				MPO_STATUS("lgrp_update: "
537				    "sub-page interleave is being fixed\n");
538				ret_val = -1;
539				goto fail;
540			}
541		}
542	}
543fail:
544	if (n_lgrpnodes > 0) {
545		md_free_scan_dag(md, &lgrpnodes);
546		for (i = 0; i < n_lgrpnodes; i++)
547			mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
548	}
549
550	return (ret_val);
551}
552
553/*
554 *
555 * Traverse the MD to determine:
556 *
557 *  Number of CPU nodes, lgrp_nodes, and mblocks
558 *  Then for each lgrp_node, obtain the appropriate data.
559 *  For each CPU, determine its home locality and store it.
560 *  For each mblock, retrieve its data and store it.
561 */
562static	int
563lgrp_traverse(md_t *md)
564{
565	mde_cookie_t root, *cpunodes, *mblocknodes;
566	int o;
567	uint64_t i, k, stripe, stride;
568	uint64_t mem_lg_homeset = 0;
569	int ret_val = 0;
570	int result = 0;
571	int n_cpunodes = 0;
572	mpo_config_t new_config;
573
574	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
575		ret_val = -1;
576		goto fail;
577	}
578
579	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
580	    &mblocknodes);
581	if (n_mblocks <= 0) {
582		MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
583		    "Descriptor\n");
584		ret_val = -1;
585		goto fail;
586	}
587
588	/*
589	 * Build the Memory Nodes.  Do this before any possibility of
590	 * bailing from this routine so we obtain ra_to_pa (needed for page
591	 * coloring) even when there are no lgroups defined.
592	 */
593	if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
594		ret_val = -1;
595		goto fail;
596	}
597
598	mblock_update(&new_config, md, mblocknodes);
599	mblock_install(&new_config);
600
601	/* Page coloring hook is required so we can iterate through mnodes */
602	if (&page_next_pfn_for_color_cpu == NULL) {
603		MPO_STATUS("lgrp_traverse: No page coloring support\n");
604		ret_val = -1;
605		goto fail;
606	}
607
608	/* Global enable for mpo */
609	if (sun4v_mpo_enable == 0) {
610		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
611		ret_val = -1;
612		goto fail;
613	}
614
615	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
616
617	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
618		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
619		    "in MD\n");
620		ret_val = -1;
621		goto fail;
622	}
623
624	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
625
626	if ((ret_val = lgrp_update(md, root)) == -1)
627		goto fail;
628
629	/*
630	 * Use the address mask from the first lgroup node
631	 * to establish our home_mask.
632	 */
633	home_mask = mpo_lgroup[0].addr_mask;
634	home_mask_pfn = btop(home_mask);
635	home_mask_shift = lowbit(home_mask) - 1;
636	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
637	mnode_pages = btop(1ULL << home_mask_shift);
638
639	/*
640	 * How many values are possible in home mask?  Assume the mask
641	 * bits are contiguous.
642	 */
643	max_locality_groups =
644	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
645
646	stripe_shift = highbit(max_locality_groups) - 1;
647	stripe = ptob(mnode_pages);
648	stride = max_locality_groups * stripe;
649	mnode_stride = btop(stride);
650
651	/* Now verify the home mask bits are contiguous */
652
653	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
654		MPO_STATUS("lgrp_traverse: "
655		    "home mask bits are not contiguous\n");
656		ret_val = -1;
657		goto fail;
658	}
659
660	/* Record all of the home bits */
661
662	for (i = 0; i < n_lgrpnodes; i++) {
663		HOMESET_ADD(mem_lg_homeset,
664		    mpo_lgroup[i].addr_match >> home_mask_shift);
665	}
666
667	/* Count the number different "home"  mem_lg's we've discovered */
668
669	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
670
671	/* If we have only 1 locality group then we can exit */
672	if (n_locality_groups == 1) {
673		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
674		ret_val = -1;
675		goto fail;
676	}
677
678	/*
679	 * Set the latencies.  A CPU's lgroup is defined by the lowest
680	 * latency found.  All other memory is considered remote, and the
681	 * remote latency is represented by the highest latency found.
682	 * Thus hierarchical lgroups, if any, are approximated by a
683	 * two level scheme.
684	 *
685	 * The Solaris MPO framework by convention wants to see latencies
686	 * in units of nano-sec/10. In the MD, the units are defined to be
687	 * pico-seconds.
688	 */
689
690	lower_latency = mpo_lgroup[0].latency;
691	higher_latency = mpo_lgroup[0].latency;
692
693	for (i = 1; i < n_lgrpnodes; i++) {
694		if (mpo_lgroup[i].latency < lower_latency) {
695			lower_latency = mpo_lgroup[i].latency;
696		}
697		if (mpo_lgroup[i].latency > higher_latency) {
698			higher_latency = mpo_lgroup[i].latency;
699		}
700	}
701	lower_latency /= 10000;
702	higher_latency /= 10000;
703
704	/* Clear our CPU data */
705
706	for (i = 0; i < NCPU; i++) {
707		mpo_cpu[i].home = 0;
708		mpo_cpu[i].lgrp_index = -1;
709	}
710
711	/* Build the CPU nodes */
712	for (i = 0; i < n_cpunodes; i++) {
713
714		/* Read in the lgroup nodes */
715		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
716		if (result < 0) {
717			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
718			ret_val = -1;
719			goto fail;
720		}
721
722		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
723		if (o == -1) {
724			ret_val = -1;
725			goto fail;
726		}
727		mpo_cpu[k].lgrp_index = o;
728		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
729		mpo_lgroup[o].ncpu++;
730	}
731	/* Validate that no large pages cross mnode boundaries. */
732	if (valid_pages(md, cpunodes[0]) == 0) {
733		ret_val = -1;
734		goto fail;
735	}
736
737fail:
738	if (n_cpunodes > 0)
739		md_free_scan_dag(md, &cpunodes);
740	if (n_mblocks > 0)
741		md_free_scan_dag(md, &mblocknodes);
742	else
743		panic("lgrp_traverse: No memory blocks found");
744
745	if (ret_val == 0) {
746		MPO_STATUS("MPO feature is enabled.\n");
747	} else
748		sun4v_mpo_enable = 0;	/* set this for DR */
749
750	return (ret_val);
751}
752
753/*
754 *  Determine the number of unique mem_lg's present in our system
755 */
756static	int
757unique_home_mem_lg_count(uint64_t mem_lg_homeset)
758{
759	int homeid;
760	int count = 0;
761
762	/*
763	 * Scan the "home" bits of the mem_lgs, count
764	 * the number that are unique.
765	 */
766
767	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
768		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
769			count++;
770		}
771	}
772
773	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
774	    mem_lg_homeset);
775	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
776
777	/* Default must be at least one */
778	if (count == 0)
779		count = 1;
780
781	return (count);
782}
783
784/*
785 * Platform specific lgroup initialization
786 */
787void
788plat_lgrp_init(void)
789{
790	md_t *md;
791	int rc;
792
793	/* Get the Machine Descriptor handle */
794
795	md = md_get_handle();
796
797	/* If not, we cannot continue */
798
799	if (md == NULL) {
800		panic("cannot access machine descriptor\n");
801	} else {
802		rc = lgrp_traverse(md);
803		(void) md_fini_handle(md);
804	}
805
806	/*
807	 * If we can't process the MD for lgroups then at least let the
808	 * system try to boot.  Assume we have one lgroup so that
809	 * when plat_build_mem_nodes is called, it will attempt to init
810	 * an mnode based on the supplied memory segment.
811	 */
812
813	if (rc == -1) {
814		home_mask_pfn = 0;
815		max_locality_groups = 1;
816		n_locality_groups = 1;
817		return;
818	}
819
820	mem_node_pfn_shift = 0;
821	mem_node_physalign = 0;
822
823	/* Use lgroup-aware TSB allocations */
824	tsb_lgrp_affinity = 1;
825
826	/* Require that a home lgroup have some memory to be chosen */
827	lgrp_mem_free_thresh = 1;
828
829	/* Standard home-on-next-touch policy */
830	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
831
832	/* Disable option to choose root lgroup if all leaf lgroups are busy */
833	lgrp_load_thresh = UINT32_MAX;
834
835	mpo_update_tunables();
836}
837
838/*
839 *  Helper routine for debugging calls to mem_node_add_slice()
840 */
841static	void
842mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
843{
844#if defined(DEBUG) && !defined(lint)
845	static int slice_count = 0;
846
847	slice_count++;
848	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
849	    slice_count, basepfn, endpfn);
850#endif
851	mem_node_add_slice(basepfn, endpfn);
852}
853
854static	void
855mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
856{
857#if defined(DEBUG) && !defined(lint)
858	static int slice_count = 0;
859
860	slice_count++;
861	MPO_DEBUG("mem_del_slice(%d): basepfn: %lx  endpfn: %lx\n",
862	    slice_count, basepfn, endpfn);
863#endif
864	mem_node_del_slice(basepfn, endpfn);
865}
866
867/*
868 *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
869 */
870static	void
871mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
872{
873	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
874	    "mnode index: %d\n", plathand, mnode);
875	plat_assign_lgrphand_to_mem_node(plathand, mnode);
876}
877
878/*
879 * plat_build_mem_nodes()
880 *
881 * Define the mem_nodes based on the modified boot memory list,
882 * or based on info read from the MD in plat_lgrp_init().
883 *
884 * When the home mask lies in the middle of the address bits (as it does on
885 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
886 * it is striped across an mblock in a repeating pattern of contiguous memory
887 * followed by a gap.  The stripe width is the size of the contiguous piece.
888 * The stride is the distance from the start of one contiguous piece to the
889 * start of the next.  The gap is thus stride - stripe_width.
890 *
891 * The stripe of an mnode that falls within an mblock is described by the type
892 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
893 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
894 * this array is predetermined.  The mem_stripe_t that describes mnode m
895 * within mpo_mblock[i] is stored at
896 *	 mem_stripes[ m + i * max_locality_groups ]
897 *
898 * max_locality_groups is the total number of possible locality groups,
899 * as defined by the size of the home mask, even if the memory assigned
900 * to the domain is small and does not cover all the lgroups.  Thus some
901 * mem_stripe_t's may be empty.
902 *
903 * The members of mem_stripe_t are:
904 *	physbase: First valid page in mem_node in the corresponding mblock
905 *	physmax: Last valid page in mem_node in mblock
906 *	offset:  The full stripe width starts at physbase - offset.
907 *	    Thus if offset is non-zero, this mem_node starts in the middle
908 *	    of a stripe width, and the second full stripe starts at
909 *	    physbase - offset + stride.  (even though physmax may fall in the
910 *	    middle of a stripe width, we do not save the ending fragment size
911 *	    in this data structure.)
912 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
913 *
914 *	The stripe width is kept in the global mnode_pages.
915 *	The stride is kept in the global mnode_stride.
916 *	All the above use pfn's as the unit.
917 *
918 * As an example, the memory layout for a domain with 2 mblocks and 4
919 * mem_nodes 0,1,2,3 could look like this:
920 *
921 *	123012301230 ...	012301230123 ...
922 *	  mblock 0		  mblock 1
923 */
924
925/*ARGSUSED*/
926void
927plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
928{
929	int elem;
930	uint64_t base, len;
931
932	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
933	max_mem_nodes = max_locality_groups;
934
935	mstripe_update(&mpo_config);
936
937	/* Check for non-MPO sun4v platforms */
938	if (n_locality_groups <= 1) {
939		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
940		for (elem = 0; elem < nelems; list++, elem++) {
941			base = list->addr;
942			len = list->size;
943
944			mpo_mem_node_add_slice(btop(base),
945			    btop(base + len - 1));
946		}
947		mem_node_pfn_shift = 0;
948		mem_node_physalign = 0;
949	} else
950		mnode_update(&mpo_config, 0, 0, U_ADD_ALL);
951
952	/*
953	 * Indicate to vm_pagelist that the hpm_counters array
954	 * should be shared because the ranges overlap.
955	 */
956	if (max_mem_nodes > 1) {
957		interleaved_mnodes = 1;
958	}
959}
960
961/*
962 * Return the locality group value for the supplied processor
963 */
964lgrp_handle_t
965plat_lgrp_cpu_to_hand(processorid_t id)
966{
967	lgrp_handle_t lgrphand;
968
969	mpo_rd_lock();
970	if (n_locality_groups > 1) {
971		lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
972	} else {
973		lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
974	}
975	mpo_rd_unlock();
976
977	return (lgrphand);
978}
979
980int
981plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
982{
983	/*
984	 * Return min remote latency when there are more than two lgroups
985	 * (root and child) and getting latency between two different lgroups
986	 * or root is involved.
987	 */
988	if (lgrp_optimizations() && (from != to ||
989	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
990		return ((int)higher_latency);
991	} else {
992		return ((int)lower_latency);
993	}
994}
995
996int
997plat_pfn_to_mem_node(pfn_t pfn)
998{
999	int i, mnode;
1000	pfn_t ra_to_pa_pfn;
1001	struct mblock_md *mb;
1002
1003	if (n_locality_groups <= 1)
1004		return (0);
1005
1006	/*
1007	 * The mnode is defined to be 1:1 with the lgroup handle, which
1008	 * is taken from from the home bits.  Find the mblock in which
1009	 * the pfn falls to get the ra_to_pa adjustment, and extract
1010	 * the home bits.
1011	 */
1012	mpo_rd_lock();
1013	mb = &mpo_mblock[0];
1014	for (i = 0; i < n_mblocks; i++) {
1015		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1016			ra_to_pa_pfn = btop(mb->ra_to_pa);
1017			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1018			    home_mask_pfn_shift);
1019			ASSERT(mnode < max_mem_nodes);
1020			mpo_rd_unlock();
1021			return (mnode);
1022		}
1023		mb++;
1024	}
1025
1026	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1027	return (pfn);
1028}
1029
1030/*
1031 * plat_rapfn_to_papfn
1032 *
1033 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034 * and home mask bits are correct.  The upper bits do not necessarily
1035 * match the actual PA, however.
1036 */
1037pfn_t
1038plat_rapfn_to_papfn(pfn_t pfn)
1039{
1040	int i;
1041	pfn_t ra_to_pa_pfn;
1042	struct mblock_md *mb;
1043
1044	ASSERT(n_mblocks > 0);
1045	if (n_mblocks == 1)
1046		return (pfn + base_ra_to_pa_pfn);
1047
1048	/*
1049	 * Find the mblock in which the pfn falls
1050	 * in order to get the ra_to_pa adjustment.
1051	 */
1052	mpo_rd_lock();
1053	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1054		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1055			ra_to_pa_pfn = btop(mb->ra_to_pa);
1056			mpo_rd_unlock();
1057			return (pfn + ra_to_pa_pfn);
1058		}
1059	}
1060
1061	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1062	return (pfn);
1063}
1064
1065/*
1066 * plat_mem_node_iterator_init()
1067 *      Initialize cookie "it" to iterate over pfn's in an mnode.  There is
1068 *      no additional iterator function.  The caller uses the info from
1069 *      the iterator structure directly.
1070 *
1071 *      pfn: starting pfn.
1072 *      mnode: desired mnode.
1073 *	szc: desired page size.
1074 *      init:
1075 *          if 1, start a new traversal, initialize "it", find first
1076 *              mblock containing pfn, and return its starting pfn
1077 *              within the mnode.
1078 *          if 0, continue the previous traversal using passed-in data
1079 *              from "it", advance to the next mblock, and return its
1080 *              starting pfn within the mnode.
1081 *      it: returns readonly data to the caller; see below.
1082 *
1083 *	The input pfn must be aligned for the page size szc.
1084 *
1085 *      Returns: starting pfn for the iteration for the mnode/mblock,
1086 *	    which is aligned according to the page size,
1087 *          or returns (pfn_t)(-1) if the input pfn lies past the last
1088 *          valid pfn of the mnode.
1089 *      Returns misc values in the "it" struct that allows the caller
1090 *          to advance the pfn within an mblock using address arithmetic;
1091 *          see definition of mem_node_iterator_t in vm_dep.h.
1092 *          When the caller calculates a pfn that is greater than the
1093 *          returned value it->mi_mblock_end, the caller should again
1094 *          call plat_mem_node_iterator_init, passing init=0.
1095 *
1096 *          The last mblock in continuation case may be invalid because
1097 *          of memory DR.  To detect this situation mi_genid is checked
1098 *          against mpo_genid which is incremented after a memory DR
1099 *          operation.  See also plat_slice_add()/plat_slice_del().
1100 */
1101pfn_t
1102plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
1103    mem_node_iterator_t *it, int init)
1104{
1105	int i;
1106	pgcnt_t szcpgcnt = PNUM_SIZE(szc);
1107	struct mblock_md *mblock;
1108	pfn_t base, end;
1109	mem_stripe_t *ms;
1110	uint64_t szcpagesize;
1111
1112	ASSERT(it != NULL);
1113	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1114	ASSERT(n_mblocks > 0);
1115	ASSERT(P2PHASE(pfn, szcpgcnt) == 0);
1116
1117	mpo_rd_lock();
1118
1119	if (init || (it->mi_genid != mpo_genid)) {
1120		it->mi_genid = mpo_genid;
1121		it->mi_last_mblock = 0;
1122		it->mi_init = 1;
1123	}
1124
1125	/* Check if mpo is not enabled and we only have one mblock */
1126	if (n_locality_groups == 1 && n_mblocks == 1) {
1127		if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
1128			pfn = (pfn_t)-1;
1129			goto done;
1130		}
1131		it->mi_mnode = mnode;
1132		it->mi_ra_to_pa = base_ra_to_pa_pfn;
1133		it->mi_mnode_pfn_mask = 0;
1134		it->mi_mnode_pfn_shift = 0;
1135		it->mi_mnode_mask = 0;
1136		it->mi_mblock_base = mem_node_config[mnode].physbase;
1137		it->mi_mblock_end = mem_node_config[mnode].physmax;
1138		if (pfn < it->mi_mblock_base)
1139			pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
1140		if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
1141			pfn = (pfn_t)-1;
1142		goto done;
1143	}
1144
1145	/* init=1 means begin iterator, init=0 means continue */
1146	if (init == 1) {
1147		i = 0;
1148	} else {
1149		ASSERT(it->mi_last_mblock < n_mblocks);
1150		i = it->mi_last_mblock;
1151		ASSERT(pfn >
1152		    mem_stripes[i * max_locality_groups + mnode].physmax);
1153		if (++i == n_mblocks) {
1154			pfn = (pfn_t)-1;
1155			goto done;
1156		}
1157	}
1158
1159	/*
1160	 * Find mblock that contains pfn for mnode's stripe, or first such an
1161	 * mblock after pfn, else pfn is out of bound and we'll return -1.
1162	 * mblocks and stripes are sorted in ascending address order.
1163	 */
1164	szcpagesize = szcpgcnt << PAGESHIFT;
1165	for (; i < n_mblocks; i++) {
1166		if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
1167			continue;
1168		ms = &mem_stripes[i * max_locality_groups + mnode];
1169		if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
1170		    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
1171		    ms->physmax)
1172			break;
1173	}
1174	if (i == n_mblocks) {
1175		it->mi_last_mblock = i - 1;
1176		pfn = (pfn_t)-1;
1177		goto done;
1178	}
1179
1180	it->mi_last_mblock = i;
1181
1182	mblock = &mpo_mblock[i];
1183	base = ms->physbase;
1184	end = ms->physmax;
1185
1186	it->mi_mnode = mnode;
1187	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1188	it->mi_mblock_base = base;
1189	it->mi_mblock_end = end;
1190	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
1191	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1192	it->mi_mnode_mask = max_locality_groups - 1;
1193	if (pfn < base) {
1194		pfn = P2ROUNDUP(base, szcpgcnt);
1195		ASSERT(pfn + szcpgcnt - 1 <= end);
1196	}
1197	ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
1198done:
1199	mpo_rd_unlock();
1200	return (pfn);
1201}
1202
1203/*
1204 * plat_mem_node_intersect_range()
1205 *
1206 * Find the intersection between a memnode and a range of pfn's.
1207 */
1208void
1209plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1210    int mnode, pgcnt_t *npages_out)
1211{
1212	pfn_t offset, len, hole, base, end, test_end, frag;
1213	pfn_t nearest;
1214	mem_stripe_t *ms;
1215	int i, npages;
1216
1217	*npages_out = 0;
1218
1219	if (!mem_node_config[mnode].exists || test_len == 0)
1220		return;
1221
1222	base = mem_node_config[mnode].physbase;
1223	end = mem_node_config[mnode].physmax;
1224
1225	test_end = test_base + test_len - 1;
1226	if (end < test_base || base > test_end)
1227		return;
1228
1229	if (n_locality_groups == 1) {
1230		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1231		return;
1232	}
1233
1234	hole = mnode_stride - mnode_pages;
1235	npages = 0;
1236
1237	/*
1238	 * Iterate over all the stripes for this mnode (one per mblock),
1239	 * find the intersection with each, and accumulate the intersections.
1240	 *
1241	 * Determing the intersection with a stripe is tricky.  If base or end
1242	 * fall outside the mem_node bounds, round them to physbase/physmax of
1243	 * mem_node.  If base or end fall in a gap, round them to start of
1244	 * nearest stripe.  If they fall within a stripe, keep base or end,
1245	 * but calculate the fragment size that should be excluded from the
1246	 * stripe.  Calculate how many strides fall in the adjusted range,
1247	 * multiply by stripe width, and add the start and end fragments.
1248	 */
1249
1250	mpo_rd_lock();
1251	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1252		ms = &mem_stripes[i];
1253		if (ms->exists &&
1254		    test_base <= (end = ms->physmax) &&
1255		    test_end >= (base = ms->physbase)) {
1256
1257			offset = ms->offset;
1258
1259			if (test_base > base) {
1260				/* Round test_base to next multiple of stride */
1261				len = P2ROUNDUP(test_base - (base - offset),
1262				    mnode_stride);
1263				nearest = base - offset + len;
1264				/*
1265				 * Compute distance from test_base to the
1266				 * stride boundary to see if test_base falls
1267				 * in the stripe or in the hole.
1268				 */
1269				if (nearest - test_base > hole) {
1270					/*
1271					 * test_base lies in stripe,
1272					 * and offset should be excluded.
1273					 */
1274					offset = test_base -
1275					    (nearest - mnode_stride);
1276					base = test_base;
1277				} else {
1278					/* round up to next stripe start */
1279					offset = 0;
1280					base = nearest;
1281					if (base > end)
1282						continue;
1283				}
1284
1285			}
1286
1287			if (test_end < end)
1288				end = test_end;
1289			end++;		/* adjust to an exclusive bound */
1290
1291			/* Round end to next multiple of stride */
1292			len = P2ROUNDUP(end - (base - offset), mnode_stride);
1293			nearest = (base - offset) + len;
1294			if (nearest - end <= hole) {
1295				/* end falls in hole, use entire last stripe */
1296				frag = 0;
1297			} else {
1298				/* end falls in stripe, compute fragment */
1299				frag = nearest - hole - end;
1300			}
1301
1302			len = (len >> stripe_shift) - offset - frag;
1303			npages += len;
1304		}
1305	}
1306
1307	*npages_out = npages;
1308	mpo_rd_unlock();
1309}
1310
1311/*
1312 * valid_pages()
1313 *
1314 * Return 1 if pages are valid and do not cross mnode boundaries
1315 * (which would break page free list assumptions), and 0 otherwise.
1316 */
1317
1318#define	MNODE(pa)	\
1319	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1320
1321static int
1322valid_pages(md_t *md, mde_cookie_t cpu0)
1323{
1324	int i, max_szc;
1325	uint64_t last_page_base, szc_mask;
1326	uint64_t max_page_len, max_coalesce_len;
1327	struct mblock_md *mb = mpo_mblock;
1328
1329	/*
1330	 * Find the smaller of the largest page possible and supported.
1331	 * mmu_exported_pagesize_mask is not yet initialized, so read
1332	 * it from the MD.  Apply minimal fixups in case of broken MDs
1333	 * to get a sane mask.
1334	 */
1335
1336	if (cpu0 == 0)
1337		szc_mask = szc_mask0;
1338	else {
1339		if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1340			szc_mask = 0;
1341		/* largest in sun4v default support */
1342		szc_mask |=  (1 << TTE4M);
1343		szc_mask0 = szc_mask;
1344	}
1345	max_szc = highbit(szc_mask) - 1;
1346	if (max_szc > TTE256M)
1347		max_szc = TTE256M;
1348	max_page_len = TTEBYTES(max_szc);
1349
1350	/*
1351	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1352	 * if mmu-page-size-list does not contain it, so 256M pages must fall
1353	 * within one mnode to use MPO.
1354	 */
1355	max_coalesce_len = TTEBYTES(TTE256M);
1356	ASSERT(max_coalesce_len >= max_page_len);
1357
1358	if (ptob(mnode_pages) < max_coalesce_len) {
1359		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1360		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1361		return (0);
1362	}
1363
1364	for (i = 0; i < n_mblocks; i++) {
1365		uint64_t base = mb->base;
1366		uint64_t end = mb->base + mb->size - 1;
1367		uint64_t ra_to_pa = mb->ra_to_pa;
1368
1369		/*
1370		 * If mblock is smaller than the max page size, then
1371		 * RA = PA mod MAXPAGE is not guaranteed, but it must
1372		 * not span mnodes.
1373		 */
1374		if (mb->size < max_page_len) {
1375			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1376				MPO_STATUS("Small mblock spans mnodes; "
1377				    "MPO disabled: base = %lx, end = %lx, "
1378				    "ra2pa = %lx\n", base, end, ra_to_pa);
1379				return (0);
1380			}
1381		} else {
1382			/* Verify RA = PA mod MAXPAGE, using coalesce size */
1383			uint64_t pa_base = base + ra_to_pa;
1384			if ((base & (max_coalesce_len - 1)) !=
1385			    (pa_base & (max_coalesce_len - 1))) {
1386				MPO_STATUS("bad page alignment; MPO disabled: "
1387				    "ra = %lx, pa = %lx, pagelen = %lx\n",
1388				    base, pa_base, max_coalesce_len);
1389				return (0);
1390			}
1391		}
1392
1393		/*
1394		 * Find start of last large page in mblock in RA space.
1395		 * If page extends into the next mblock, verify the
1396		 * mnode does not change.
1397		 */
1398		last_page_base = P2ALIGN(end, max_coalesce_len);
1399		if (i + 1 < n_mblocks &&
1400		    last_page_base + max_coalesce_len > mb[1].base &&
1401		    MNODE(last_page_base + ra_to_pa) !=
1402		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
1403			MPO_STATUS("Large page spans mblocks; MPO disabled: "
1404			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1405			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1406			    mb[1].ra_to_pa, max_coalesce_len);
1407			return (0);
1408		}
1409
1410		mb++;
1411	}
1412	return (1);
1413}
1414
1415
1416/*
1417 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418 * if any, and remove them.  This yields a config where the "coarse
1419 * grained" lgroups cover all of memory, even though part of that memory
1420 * is fine grain interleaved and does not deliver a purely local memory
1421 * latency.
1422 *
1423 * This function reads and modifies the globals:
1424 *	mpo_lgroup[], n_lgrpnodes
1425 *
1426 * Returns 1 if lgroup nodes were removed, 0 otherwise.
1427 */
1428
1429static int
1430fix_interleave(void)
1431{
1432	int i, j;
1433	uint64_t mask = 0;
1434
1435	j = 0;
1436	for (i = 0; i < n_lgrpnodes; i++) {
1437		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1438			/* remove this lgroup */
1439			mask = mpo_lgroup[i].addr_mask;
1440		} else {
1441			mpo_lgroup[j++] = mpo_lgroup[i];
1442		}
1443	}
1444	n_lgrpnodes = j;
1445
1446	if (mask != 0)
1447		MPO_STATUS("sub-page interleave %lx found; "
1448		    "removing lgroup.\n", mask);
1449
1450	return (mask != 0);
1451}
1452
1453/*
1454 * mblock_alloc
1455 *
1456 * Allocate memory for mblock an stripe arrays from either static or
1457 * dynamic space depending on utype, and return the result in mc.
1458 * Returns 0 on success and -1 on error.
1459 */
1460
1461static int
1462mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
1463{
1464	mblock_md_t *mb = NULL;
1465	mem_stripe_t *ms = NULL;
1466	int nstripes = MAX_MEM_NODES * nmblocks;
1467	size_t mblocksz = nmblocks * sizeof (struct mblock_md);
1468	size_t mstripesz = nstripes * sizeof (mem_stripe_t);
1469	size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
1470
1471	/*
1472	 * Allocate space for mblocks and mstripes.
1473	 *
1474	 * For DR allocations, just use kmem_alloc(), and set
1475	 * mc_alloc_sz to indicate it was used.
1476	 *
1477	 * For boot allocation:
1478	 * If we have a small number of mblocks we will use the space
1479	 * that we preallocated. Otherwise, we will dynamically
1480	 * allocate the space from the prom and map it to the
1481	 * reserved VA at MPOBUF_BASE.
1482	 */
1483
1484	if (utype == U_ADD || utype == U_DEL) {
1485		mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
1486		ms = (mem_stripe_t *)(mb + nmblocks);
1487		mc->mc_alloc_sz = allocsz;
1488	} else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
1489		mb = &small_mpo_mblocks[0];
1490		ms = &small_mem_stripes[0];
1491		mc->mc_alloc_sz = 0;
1492	} else {
1493		/* Ensure that we dont request more space than reserved */
1494		if (allocsz > MPOBUF_SIZE) {
1495			MPO_STATUS("mblock_alloc: Insufficient space "
1496			    "for mblock structures \n");
1497			return (-1);
1498		}
1499		mb = (struct mblock_md *)
1500		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
1501		if (mb != (struct mblock_md *)MPOBUF_BASE) {
1502			MPO_STATUS("mblock_alloc: Cannot allocate space "
1503			    "for mblocks \n");
1504			return (-1);
1505		}
1506		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
1507		mpo_heap32_bufsz = MPOBUF_SIZE;
1508		ms = (mem_stripe_t *)(mb + nmblocks);
1509		mc->mc_alloc_sz = 0;
1510	}
1511	mc->mc_mblocks = mb;
1512	mc->mc_stripes = ms;
1513	mc->mc_nmblocks = nmblocks;
1514	mc->mc_nstripes = nstripes;
1515	MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
1516	return (0);
1517}
1518
1519/*
1520 * mblock_free
1521 *
1522 * Free memory in mc that was allocated by mblock_alloc.
1523 */
1524
1525static void
1526mblock_free(mpo_config_t *mc)
1527{
1528	if (mc->mc_alloc_sz > 0) {
1529		ASSERT(mc->mc_mblocks != mpo_mblock);
1530		kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
1531	}
1532	bzero(mc, sizeof (*mc));
1533}
1534
1535/*
1536 * mblock_install
1537 *
1538 * Install mblock config passed in mc as the global configuration.
1539 * May only be called at boot or while holding mpo_wr_lock.
1540 */
1541
1542static void
1543mblock_install(mpo_config_t *mc)
1544{
1545	mpo_mblock = mc->mc_mblocks;
1546	n_mblocks = mc->mc_nmblocks;
1547	mem_stripes = mc->mc_stripes;
1548	n_mem_stripes = mc->mc_nstripes;
1549	base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
1550	mpo_config = *mc;
1551}
1552
1553/*
1554 * mblock_update
1555 *
1556 * Traverse mblocknodes, read the mblock properties from the MD, and
1557 * save the mblocks in mc.
1558 */
1559
1560static void
1561mblock_update(mpo_config_t *mc, md_t md, mde_cookie_t *mblocknodes)
1562{
1563	uint64_t i, j;
1564	int result = 0;
1565	mblock_md_t *mblock = mc->mc_mblocks;
1566
1567	for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {
1568
1569		/* Without a base or size value we will fail */
1570		result = get_int(md, mblocknodes[j], PROP_LG_BASE,
1571		    &mblock[i].base);
1572		if (result < 0) {
1573			MPO_STATUS("mblock_update: "
1574			    "PROP_LG_BASE is missing\n");
1575			mc->mc_nmblocks = 0;
1576			return;
1577		}
1578
1579		result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
1580		    &mblock[i].size);
1581		if (result < 0) {
1582			MPO_STATUS("mblock_update: "
1583			    "PROP_LG_SIZE is missing\n");
1584			mc->mc_nmblocks = 0;
1585			return;
1586		}
1587
1588		result = get_int(md, mblocknodes[j],
1589		    PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);
1590
1591		/* If we don't have an ra_pa_offset, just set it to 0 */
1592		if (result < 0)
1593			mblock[i].ra_to_pa = 0;
1594
1595		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
1596		    "ra_to_pa = %lx\n", i,
1597		    mblock[i].base,
1598		    mblock[i].size,
1599		    mblock[i].ra_to_pa);
1600
1601		/* check for unsupportable values of base and size */
1602		if (mblock[i].base > mblock[i].base + mblock[i].size) {
1603			MPO_STATUS("mblock_update: "
1604			    "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
1605			    "base = %lx, size = %lx\n",
1606			    mblock[i].base, mblock[i].size);
1607			mc->mc_nmblocks = 0;
1608			return;
1609		}
1610
1611		/* eliminate size==0 blocks */
1612		if (mblock[i].size != 0) {
1613			uint64_t base = mblock[i].base;
1614			uint64_t end = base + mblock[i].size;
1615			ASSERT(end > base);
1616			mblock[i].base_pfn = btop(base);
1617			mblock[i].end_pfn = btop(end - 1);
1618			i++;
1619		}
1620	}
1621
1622	if (i == 0) {
1623		MPO_STATUS("mblock_update: "
1624		    "No non-empty mblock nodes were found "
1625		    "in the Machine Descriptor\n");
1626		mc->mc_nmblocks = 0;
1627		return;
1628	}
1629	ASSERT(i <= mc->mc_nmblocks);
1630	mc->mc_nmblocks = i;
1631
1632	/* Must sort mblocks by address for mem_node_iterator_init() */
1633	mblock_sort(mblock, mc->mc_nmblocks);
1634}
1635
1636/*
1637 * mblock_update_add
1638 *
1639 * Update mblock config after a memory DR add.  The added range is not
1640 * needed, as we read *all* mblock nodes from the MD.  Save the mblocks
1641 * in mc.
1642 */
1643
1644static void
1645mblock_update_add(mpo_config_t *mc)
1646{
1647	md_t *md;
1648	mde_cookie_t root, *mblocknodes;
1649	int nmblocks = 0;
1650
1651	if ((md = md_get_handle()) == NULL) {
1652		MPO_STATUS("Cannot access Machine Descriptor\n");
1653		goto error;
1654	}
1655
1656	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
1657		goto error;
1658
1659	nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
1660	    &mblocknodes);
1661	if (nmblocks <= 0) {
1662		MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
1663		goto error;
1664	}
1665
1666	if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
1667		goto error;
1668
1669	mblock_update(mc, md, mblocknodes);
1670	md_free_scan_dag(md, &mblocknodes);
1671	(void) md_fini_handle(md);
1672	return;
1673error:
1674	panic("mblock_update_add: cannot process mblocks from MD.\n");
1675}
1676
1677/*
1678 * mblock_update_del
1679 *
1680 * Update mblocks after a memory DR deletion of the range (ubase, uend).
1681 * Allocate a new mblock config, copy old config to the new, modify the new
1682 * mblocks to reflect the deletion.   The new mblocks are returned in
1683 * mc_new and are not yet installed as the active config.
1684 */
1685
1686static void
1687mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
1688    pfn_t uend)
1689{
1690	int i, j;
1691	pfn_t base, end;
1692	mblock_md_t *mblock;
1693	int nmblocks = mc_old->mc_nmblocks;
1694
1695	MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);
1696
1697	/*
1698	 * Allocate mblocks in mc_new and copy the old to the new.
1699	 * Allocate one extra in case the deletion splits an mblock.
1700	 */
1701	if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
1702		return;
1703	mblock = mc_new->mc_mblocks;
1704	bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));
1705
1706	/*
1707	 * Find the mblock containing the deleted range and adjust it in
1708	 * the new config.
1709	 */
1710	for (i = 0; i < nmblocks; i++) {
1711
1712		base = btop(mblock[i].base);
1713		end = base + btop(mblock[i].size) - 1;
1714
1715		/*
1716		 * Adjust the mblock based on the subset that was deleted.
1717		 *
1718		 * If the entire mblk was deleted, compact the table.
1719		 *
1720		 * If the middle of the mblk was deleted, extend
1721		 * the table.  Space for the new slot was already
1722		 * allocated.
1723		 *
1724		 * The memory to be deleted is a mblock or a subset of
1725		 * and does not span multiple mblocks.
1726		 */
1727		if (base == ubase && end == uend) {
1728			for (j = i; j < nmblocks - 1; j++)
1729				mblock[j] = mblock[j + 1];
1730			nmblocks--;
1731			bzero(&mblock[nmblocks], sizeof (*mblock));
1732			break;
1733		} else if (base < ubase && end > uend) {
1734			for (j = nmblocks - 1; j >= i; j--)
1735				mblock[j + 1] = mblock[j];
1736			mblock[i].size = ptob(ubase - base);
1737			mblock[i].end_pfn = ubase - 1;
1738			mblock[i + 1].base = ptob(uend + 1);
1739			mblock[i + 1].size = ptob(end - uend);
1740			mblock[i + 1].base_pfn = uend + 1;
1741			nmblocks++;
1742			break;
1743		} else if (base == ubase) {
1744			MPO_DEBUG("mblock_update_del: shrink>"
1745			    " i=%d base=0x%lx end=0x%lx", i, base, end);
1746			mblock[i].base = ptob(uend + 1);
1747			mblock[i].size -= ptob(uend - ubase + 1);
1748			base = uend + 1;
1749			mblock[i].base_pfn = base;
1750			mblock[i].end_pfn = end;
1751			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1752			break;
1753		} else if (end == uend) {
1754			MPO_DEBUG("mblock_update_del: shrink<"
1755			    " i=%d base=0x%lx end=0x%lx", i, base, end);
1756			mblock[i].size -= ptob(uend - ubase + 1);
1757			end = ubase - 1;
1758			mblock[i].base_pfn = base;
1759			mblock[i].end_pfn = end;
1760			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
1761			break;
1762		}
1763	}
1764	mc_new->mc_nmblocks = nmblocks;
1765	ASSERT(end > base);
1766}
1767
1768/*
1769 * mstripe_update
1770 *
1771 * Read mblocks from mc and update mstripes in mc
1772 */
1773
1774static void
1775mstripe_update(mpo_config_t *mc)
1776{
1777	lgrp_handle_t lgrphand, lgrp_start;
1778	int i, mnode;
1779	uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
1780	uint64_t stripe, frag, remove;
1781	mem_stripe_t *ms;
1782	mblock_md_t *mblock = mc->mc_mblocks;
1783	int nmblocks = mc->mc_nmblocks;
1784	int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);
1785
1786	/* Check for non-MPO sun4v platforms or memory DR removal */
1787	if (n_locality_groups <= 1) {
1788		ASSERT(n_locality_groups == 1);
1789		ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);
1790
1791		if (nmblocks == 1) {
1792			mc->mc_nstripes = 0;
1793		} else {
1794			mc->mc_nstripes = nmblocks;
1795			bzero(mc->mc_stripes, mstripesz);
1796			for (i = 0; i < nmblocks; i++) {
1797				mc->mc_stripes[i].exists = 1;
1798				mc->mc_stripes[i].physbase = mblock[i].base_pfn;
1799				mc->mc_stripes[i].physmax = mblock[i].end_pfn;
1800			}
1801		}
1802		return;
1803	}
1804
1805	bzero(mc->mc_stripes, mstripesz);
1806	mc->mc_nstripes = max_locality_groups * nmblocks;
1807	stripe = ptob(mnode_pages);
1808	stride = max_locality_groups * stripe;
1809
1810	for (i = 0; i < nmblocks; i++) {
1811		base = mblock[i].base;
1812		end = base + mblock[i].size;
1813		ra_to_pa = mblock[i].ra_to_pa;
1814
1815		/* Find the offset from the prev stripe boundary in PA space. */
1816		offset = (base + ra_to_pa) & (stripe - 1);
1817
1818		/* Set the next stripe boundary. */
1819		stripe_end = base - offset + stripe;
1820
1821		lgrp_start = (((base + ra_to_pa) & home_mask) >>
1822		    home_mask_shift);
1823		lgrphand = lgrp_start;
1824
1825		/*
1826		 * Loop over all lgroups covered by the mblock, creating a
1827		 * stripe for each.  Stop when lgrp_start is visited again.
1828		 */
1829		do {
1830			/* mblock may not span all lgroups */
1831			if (base >= end)
1832				break;
1833
1834			mnode = lgrphand;
1835			ASSERT(mnode < max_mem_nodes);
1836
1837			/*
1838			 * Calculate the size of the fragment that does not
1839			 * belong to the mnode in the last partial stride.
1840			 */
1841			frag = (end - (base - offset)) & (stride - 1);
1842			if (frag == 0) {
1843				/* remove the gap */
1844				remove = stride - stripe;
1845			} else if (frag < stripe) {
1846				/* fragment fits in stripe; keep it all */
1847				remove = 0;
1848			} else {
1849				/* fragment is large; trim after whole stripe */
1850				remove = frag - stripe;
1851			}
1852
1853			ms = &mc->mc_stripes[i * max_locality_groups + mnode];
1854			ms->physbase = btop(base);
1855			ms->physmax = btop(end - 1 - remove);
1856			ms->offset = btop(offset);
1857			ms->exists = 1;
1858
1859			base = stripe_end;
1860			stripe_end += stripe;
1861			offset = 0;
1862			lgrphand = (((base + ra_to_pa) & home_mask) >>
1863			    home_mask_shift);
1864		} while (lgrphand != lgrp_start);
1865	}
1866}
1867
/*
 * INTERSECT(a, b, c, d)
 *
 * Clip the inclusive range [c, d] to its overlap with the inclusive range
 * [a, b], storing the result back into c and d (which must be lvalues).
 * If the two ranges do not overlap, c and d are left unchanged and the
 * macro executes "continue", so it may only be used inside a loop body.
 *
 * NOTE: the expansion is a bare if/else; wrapping it in do { } while (0)
 * would make the "continue" apply to that wrapper instead of the caller's
 * loop.  Arguments are evaluated more than once.
 */
#define	INTERSECT(a, b, c, d)				\
	if (((a) >= (c) && (a) <= (d)) ||		\
	    ((c) >= (a) && (c) <= (b))) {		\
		(c) = MAX((a), (c));			\
		(d) = MIN((b), (d));			\
	} else {					\
		ASSERT((a) >= (d) || (b) <= (c));	\
		continue;				\
	}						\
1877
1878/*
1879 * mnode_update
1880 *
1881 * Read stripes from mc and update mnode extents.  The mnode extents are
1882 * part of the live configuration, so this can only be done at boot time
1883 * or while holding the mpo_wr_lock.
1884 */
1885
1886static void
1887mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
1888{
1889	int i, j, mnode, found;
1890	pfn_t base, end;
1891	mem_stripe_t *ms;
1892
1893	MPO_DEBUG("mnode_udpate: basepfn: %lx  endpfn: %lx\n", ubase, uend);
1894
1895	if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
1896		if (utype == U_ADD)
1897			mpo_mem_node_add_slice(ubase, uend);
1898		else if (utype == U_DEL)
1899			mpo_mem_node_del_slice(ubase, uend);
1900		else
1901			panic("mnode update: %d: invalid\n", utype);
1902		return;
1903	}
1904
1905	found = 0;
1906	for (i = 0; i < mc->mc_nmblocks; i++) {
1907		for (mnode = 0; mnode < max_locality_groups; mnode++) {
1908
1909			j = i * max_locality_groups + mnode;
1910			ms = &mc->mc_stripes[j];
1911			if (!ms->exists)
1912				continue;
1913
1914			base = ms->physbase;
1915			end = ms->physmax;
1916
1917			/*
1918			 * Look for the mstripes intersecting this slice.
1919			 *
1920			 * The mstripe and slice pairs may not be equal
1921			 * if a subset of a mblock is added/deleted.
1922			 */
1923			switch (utype) {
1924			case U_ADD:
1925				INTERSECT(ubase, uend, base, end);
1926				/*FALLTHROUGH*/
1927			case U_ADD_ALL:
1928				if (n_locality_groups > 1)
1929					mpo_plat_assign_lgrphand_to_mem_node(
1930					    mnode, mnode);
1931				mpo_mem_node_add_slice(base, end);
1932				break;
1933			case U_DEL:
1934				INTERSECT(ubase, uend, base, end);
1935				mpo_mem_node_del_slice(base, end);
1936				break;
1937			default:
1938				panic("mnode_update: %d: invalid\n", utype);
1939				break;
1940			}
1941
1942			found++;
1943		}
1944	}
1945
1946	if (!found)
1947		panic("mnode_update: mstripe not found");
1948
1949#ifdef	DEBUG
1950	if (utype == U_ADD_ALL || utype == U_DEL)
1951		return;
1952	found = 0;
1953	for (i = 0; i < max_mem_nodes; i++) {
1954		if (!mem_node_config[i].exists)
1955			continue;
1956		if (ubase >= mem_node_config[i].physbase &&
1957		    ubase <= mem_node_config[i].physmax)
1958			found |= 1;
1959		if (uend >= mem_node_config[i].physbase &&
1960		    uend <= mem_node_config[i].physmax)
1961			found |= 2;
1962	}
1963	ASSERT(found == 3);
1964	{
1965		pfn_t minpfn, maxpfn;
1966
1967		mem_node_max_range(&minpfn, &maxpfn);
1968		ASSERT(minpfn <= ubase);
1969		ASSERT(maxpfn >= uend);
1970	}
1971#endif
1972}
1973
1974/*
1975 * Plat_slice_add()/plat_slice_del() are the platform hooks
1976 * for adding/deleting a pfn range to/from the system.
1977 *
 * Plat_slice_add() is used for both boot/DR cases.
1979 *
1980 * - Zeus has already added the mblocks to the MD, so read the updated
1981 *   MD and allocate all data structures required to manage the new memory
1982 *   configuration.
1983 *
1984 * - Recompute the stripes which are derived from the mblocks.
1985 *
1986 * - Update (expand) the mnode extents and install the modified mblocks as
1987 *   the new mpo config.  This must be done while holding the mpo_wr_lock
1988 *   to guarantee that no other threads access the mpo meta-data.
1989 *
1990 * - Unlock MPO data structures; the new config is live.  Free the old config.
1991 *
1992 * Plat_slice_del() is used for DR only.
1993 *
1994 * - Zeus has not yet modified the MD to reflect the deletion, so copy
1995 *   the old mpo mblocks and delete the range from the copy.
1996 *
1997 * - Recompute the stripes which are derived from the mblocks.
1998 *
1999 * - Update (shrink) the mnode extents and install the modified mblocks as
2000 *   the new mpo config.  This must be done while holding the mpo_wr_lock
2001 *   to guarantee that no other threads access the mpo meta-data.
2002 *
2003 * - Unlock MPO data structures; the new config is live.  Free the old config.
2004 */
2005
2006void
2007plat_slice_add(pfn_t base, pfn_t end)
2008{
2009	mpo_config_t old_config = mpo_config;
2010	mpo_config_t new_config;
2011
2012	VALIDATE_SLICE(base, end);
2013	mblock_update_add(&new_config);
2014	mstripe_update(&new_config);
2015	mpo_wr_lock();
2016	mblock_install(&new_config);
2017	/* Use new config to add all ranges for mnode_update */
2018	mnode_update(&new_config, base, end, U_ADD);
2019	mpo_genid++;
2020	mpo_wr_unlock();
2021	mblock_free(&old_config);
2022}
2023
2024void
2025plat_slice_del(pfn_t base, pfn_t end)
2026{
2027	mpo_config_t old_config = mpo_config;
2028	mpo_config_t new_config;
2029
2030	VALIDATE_SLICE(base, end);
2031	mblock_update_del(&new_config, &old_config, base, end);
2032	mstripe_update(&new_config);
2033	mpo_wr_lock();
2034	/* Use old config to find deleted range for mnode_update */
2035	mnode_update(&old_config, base, end, U_DEL);
2036	mblock_install(&new_config);
2037	mpo_genid++;
2038	mpo_wr_unlock();
2039	mblock_free(&old_config);
2040}
2041