xref: /illumos-gate/usr/src/uts/sun4v/os/mpo.c (revision 12551037)
1ce8eb11aSdp /*
2ce8eb11aSdp  * CDDL HEADER START
3ce8eb11aSdp  *
4ce8eb11aSdp  * The contents of this file are subject to the terms of the
5ce8eb11aSdp  * Common Development and Distribution License (the "License").
6ce8eb11aSdp  * You may not use this file except in compliance with the License.
7ce8eb11aSdp  *
8ce8eb11aSdp  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9ce8eb11aSdp  * or http://www.opensolaris.org/os/licensing.
10ce8eb11aSdp  * See the License for the specific language governing permissions
11ce8eb11aSdp  * and limitations under the License.
12ce8eb11aSdp  *
13ce8eb11aSdp  * When distributing Covered Code, include this CDDL HEADER in each
14ce8eb11aSdp  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15ce8eb11aSdp  * If applicable, add the following below this CDDL HEADER, with the
16ce8eb11aSdp  * fields enclosed by brackets "[]" replaced with your own identifying
17ce8eb11aSdp  * information: Portions Copyright [yyyy] [name of copyright owner]
18ce8eb11aSdp  *
19ce8eb11aSdp  * CDDL HEADER END
20ce8eb11aSdp  */
21ce8eb11aSdp 
22ce8eb11aSdp /*
23183ef8a1SHaik Aftandilian  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24ce8eb11aSdp  * Use is subject to license terms.
25ce8eb11aSdp  */
26ce8eb11aSdp 
27ce8eb11aSdp #include <sys/types.h>
28ce8eb11aSdp #include <sys/sysmacros.h>
29ce8eb11aSdp #include <sys/machsystm.h>
30ce8eb11aSdp #include <sys/machparam.h>
31ce8eb11aSdp #include <sys/cmn_err.h>
32ce8eb11aSdp #include <sys/stat.h>
33ce8eb11aSdp #include <sys/mach_descrip.h>
34ce8eb11aSdp #include <sys/memnode.h>
35ce8eb11aSdp #include <sys/mdesc.h>
36ce8eb11aSdp #include <sys/mpo.h>
379853d9e8SJason Beloro #include <vm/page.h>
38ce8eb11aSdp #include <vm/vm_dep.h>
39e853d8c3Sjc #include <vm/hat_sfmmu.h>
40bb57d1f5Sjc #include <sys/promif.h>
41ce8eb11aSdp 
42ce8eb11aSdp /*
43ce8eb11aSdp  * MPO and the sun4v memory representation
44ce8eb11aSdp  * ---------------------------------------
45ce8eb11aSdp  *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
47ce8eb11aSdp  * nodes in the Machine Description, as specified in FWARC/2007/260.  These
48ce8eb11aSdp  * tie together cpu nodes and mblock nodes, and contain mask and match
49ce8eb11aSdp  * properties that identify the portion of an mblock that belongs to the
50ce8eb11aSdp  * lgroup.  Mask and match are defined in the Physical Address (PA) space,
51ce8eb11aSdp  * but an mblock defines Real Addresses (RA).  To translate, the mblock
52ce8eb11aSdp  * includes the property address-congruence-offset, hereafter referred to as
53ce8eb11aSdp  * ra_to_pa.  A real address ra is a member of an lgroup if
54ce8eb11aSdp  *
55ce8eb11aSdp  *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
56ce8eb11aSdp  *
57ce8eb11aSdp  * The MD is traversed, and information on all mblocks is kept in the array
58ce8eb11aSdp  * mpo_mblock[].  Information on all CPUs, including which lgroup they map
59ce8eb11aSdp  * to, is kept in the array mpo_cpu[].
60ce8eb11aSdp  *
61ce8eb11aSdp  * This implementation makes (and verifies) the simplifying assumption that
62ce8eb11aSdp  * the mask bits are the same for all defined lgroups, and that all 1 bits in
63ce8eb11aSdp  * the mask are contiguous.  Thus the number of lgroups is bounded by the
64ce8eb11aSdp  * number of possible mask values, and the lgrp_handle_t is defined as the
65ce8eb11aSdp  * mask value, shifted right to eliminate the 0 bit positions in mask.  The
66ce8eb11aSdp  * masks and values are also referred to as "home bits" in the code.
67ce8eb11aSdp  *
68ce8eb11aSdp  * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
69ce8eb11aSdp  * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
70ce8eb11aSdp  * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
71ce8eb11aSdp  * home bits.  This yields the mem_node.
72ce8eb11aSdp  *
73ce8eb11aSdp  * Interfaces
74ce8eb11aSdp  * ----------
75ce8eb11aSdp  *
76ce8eb11aSdp  * This file exports the following entry points:
77ce8eb11aSdp  *
78ce8eb11aSdp  * plat_lgrp_init()
79ce8eb11aSdp  * plat_build_mem_nodes()
80ce8eb11aSdp  * plat_lgrp_cpu_to_hand()
81ce8eb11aSdp  * plat_lgrp_latency()
82ce8eb11aSdp  * plat_pfn_to_mem_node()
83ce8eb11aSdp  *	These implement the usual platform lgroup interfaces.
84ce8eb11aSdp  *
85ce8eb11aSdp  * plat_rapfn_to_papfn()
86ce8eb11aSdp  *	Recover the PA page coloring bits from an RA.
87ce8eb11aSdp  *
88ce8eb11aSdp  * plat_mem_node_iterator_init()
89ce8eb11aSdp  *	Initialize an iterator to efficiently step through pages in a mem_node.
90ce8eb11aSdp  *
91ce8eb11aSdp  * plat_mem_node_intersect_range()
92ce8eb11aSdp  *	Find the intersection with a mem_node.
939853d9e8SJason Beloro  *
949853d9e8SJason Beloro  * plat_slice_add()
959853d9e8SJason Beloro  * plat_slice_del()
969853d9e8SJason Beloro  *	Platform hooks to add/delete a pfn range.
979853d9e8SJason Beloro  *
989853d9e8SJason Beloro  * Internal Organization
999853d9e8SJason Beloro  * ---------------------
1009853d9e8SJason Beloro  *
 * A number of routines are shared by the boot and DR code paths, which
 * (re)build the appropriate MPO structures.
1039853d9e8SJason Beloro  *
1049853d9e8SJason Beloro  * mblock_alloc()
1059853d9e8SJason Beloro  *	Allocate memory for mblocks and stripes as
1069853d9e8SJason Beloro  *	appropriate for boot or memory DR.
1079853d9e8SJason Beloro  *
1089853d9e8SJason Beloro  * mblock_free()
1099853d9e8SJason Beloro  *	Free memory allocated by mblock_alloc.
1109853d9e8SJason Beloro  *
1119853d9e8SJason Beloro  * mblock_update()
1129853d9e8SJason Beloro  *	Build mblocks based on mblock nodes read from the MD.
1139853d9e8SJason Beloro  *
1149853d9e8SJason Beloro  * mblock_update_add()
1159853d9e8SJason Beloro  *	Rebuild mblocks after a memory DR add operation.
1169853d9e8SJason Beloro  *
1179853d9e8SJason Beloro  * mblock_update_del()
1189853d9e8SJason Beloro  *	Rebuild mblocks after a memory DR delete operation.
1199853d9e8SJason Beloro  *
1209853d9e8SJason Beloro  * mblock_install()
1219853d9e8SJason Beloro  *	Install mblocks as the new configuration.
1229853d9e8SJason Beloro  *
1239853d9e8SJason Beloro  * mstripe_update()
1249853d9e8SJason Beloro  *	Build stripes based on mblocks.
1259853d9e8SJason Beloro  *
1269853d9e8SJason Beloro  * mnode_update()
1279853d9e8SJason Beloro  *	Call memnode layer to add/del a pfn range, based on stripes.
1289853d9e8SJason Beloro  *
1299853d9e8SJason Beloro  * The platform interfaces allocate all memory required for the
 * particular update first, block access to the MPO structures
1319853d9e8SJason Beloro  * while they are updated, and free old structures after the update.
132ce8eb11aSdp  */
133ce8eb11aSdp 
134ce8eb11aSdp int	sun4v_mpo_enable = 1;
135ce8eb11aSdp int	sun4v_mpo_debug = 0;
136ce8eb11aSdp char	sun4v_mpo_status[256] = "";
137ce8eb11aSdp 
138ce8eb11aSdp /* Save CPU info from the MD and associate CPUs with lgroups */
139ce8eb11aSdp static	struct cpu_md mpo_cpu[NCPU];
140ce8eb11aSdp 
141ce8eb11aSdp /* Save lgroup info from the MD */
142ce8eb11aSdp #define	MAX_MD_LGROUPS 32
143ce8eb11aSdp static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
144ce8eb11aSdp static	int	n_lgrpnodes = 0;
145ce8eb11aSdp static	int	n_locality_groups = 0;
146ce8eb11aSdp static	int	max_locality_groups = 0;
1479853d9e8SJason Beloro static	int	szc_mask0 = 0;
148ce8eb11aSdp 
149ce8eb11aSdp /* Save mblocks from the MD */
150bb57d1f5Sjc #define	SMALL_MBLOCKS_COUNT	8
151*12551037SToomas Soome static	struct	mblock_md *mpo_mblock;
152*12551037SToomas Soome static	struct	mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
153ce8eb11aSdp static	int	n_mblocks = 0;
154ce8eb11aSdp 
155ce8eb11aSdp /* Save mem_node stripes calculate from mblocks and lgroups. */
156bb57d1f5Sjc static mem_stripe_t *mem_stripes;
157bb57d1f5Sjc static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
158ce8eb11aSdp static	int	n_mem_stripes = 0;
159ce8eb11aSdp static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
160ce8eb11aSdp static	int	stripe_shift;	/* stride/stripes expressed as a shift */
161ce8eb11aSdp static	pfn_t	mnode_pages;	/* mem_node stripe width */
162ce8eb11aSdp 
163ce8eb11aSdp /* Save home mask and shift used to calculate lgrp_handle_t values */
164ce8eb11aSdp static	uint64_t home_mask = 0;
165ce8eb11aSdp static	pfn_t	home_mask_pfn = 0;
166ce8eb11aSdp static	int	home_mask_shift = 0;
167ce8eb11aSdp static	uint_t	home_mask_pfn_shift = 0;
168ce8eb11aSdp 
169ce8eb11aSdp /* Save lowest and highest latencies found across all lgroups */
170ce8eb11aSdp static	int	lower_latency = 0;
171ce8eb11aSdp static	int	higher_latency = 0;
172ce8eb11aSdp 
173ce8eb11aSdp static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
1749853d9e8SJason Beloro static	int	mpo_genid;		/* config gen; updated by mem DR */
1759853d9e8SJason Beloro static	mpo_config_t mpo_config;	/* current mblocks and stripes */
1769853d9e8SJason Beloro 
1779853d9e8SJason Beloro typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t;
178ce8eb11aSdp 
179ce8eb11aSdp static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
180ce8eb11aSdp static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
181ce8eb11aSdp static	int	fix_interleave(void);
182ce8eb11aSdp 
1839853d9e8SJason Beloro static int  mblock_alloc(mpo_config_t *, update_t, int nmblocks);
1849853d9e8SJason Beloro static void mblock_install(mpo_config_t *);
1859853d9e8SJason Beloro static void mblock_free(mpo_config_t *);
1869853d9e8SJason Beloro static void mblock_update(mpo_config_t *, md_t, mde_cookie_t *mblocknodes);
1879853d9e8SJason Beloro static void mblock_update_add(mpo_config_t *);
1889853d9e8SJason Beloro static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t);
1899853d9e8SJason Beloro static void mstripe_update(mpo_config_t *);
1909853d9e8SJason Beloro static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t);
1919853d9e8SJason Beloro 
192ce8eb11aSdp /* Debug support */
193ce8eb11aSdp #if defined(DEBUG) && !defined(lint)
194*12551037SToomas Soome #define	VALIDATE_SLICE(base, end) {					\
1959853d9e8SJason Beloro 	ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M)));		\
1969853d9e8SJason Beloro 	ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M)));	\
1979853d9e8SJason Beloro }
198ce8eb11aSdp #define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
199ce8eb11aSdp #else
2009853d9e8SJason Beloro #define	VALIDATE_SLICE(base, end)
201ce8eb11aSdp #define	MPO_DEBUG(...)
202ce8eb11aSdp #endif	/* DEBUG */
203ce8eb11aSdp 
204ce8eb11aSdp /* Record status message, viewable from mdb */
205ce8eb11aSdp #define	MPO_STATUS(args...) {						      \
206ce8eb11aSdp 	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);   \
207ce8eb11aSdp 	MPO_DEBUG(sun4v_mpo_status);					      \
208ce8eb11aSdp }
209ce8eb11aSdp 
2109853d9e8SJason Beloro /*
2119853d9e8SJason Beloro  * The MPO locks are to protect the MPO metadata while that
2129853d9e8SJason Beloro  * information is updated as a result of a memory DR operation.
2139853d9e8SJason Beloro  * The read lock must be acquired to read the metadata and the
2149853d9e8SJason Beloro  * write locks must be acquired to update it.
2159853d9e8SJason Beloro  */
2169853d9e8SJason Beloro #define	mpo_rd_lock	kpreempt_disable
2179853d9e8SJason Beloro #define	mpo_rd_unlock	kpreempt_enable
2189853d9e8SJason Beloro 
2199853d9e8SJason Beloro static void
mpo_wr_lock()2209853d9e8SJason Beloro mpo_wr_lock()
2219853d9e8SJason Beloro {
2229853d9e8SJason Beloro 	mutex_enter(&cpu_lock);
2230ed5c46eSJosef 'Jeff' Sipek 	pause_cpus(NULL, NULL);
2249853d9e8SJason Beloro 	mutex_exit(&cpu_lock);
2259853d9e8SJason Beloro }
2269853d9e8SJason Beloro 
2279853d9e8SJason Beloro static void
mpo_wr_unlock()2289853d9e8SJason Beloro mpo_wr_unlock()
2299853d9e8SJason Beloro {
2309853d9e8SJason Beloro 	mutex_enter(&cpu_lock);
2319853d9e8SJason Beloro 	start_cpus();
2329853d9e8SJason Beloro 	mutex_exit(&cpu_lock);
2339853d9e8SJason Beloro }
2349853d9e8SJason Beloro 
235ce8eb11aSdp /*
236ce8eb11aSdp  * Routine to read a uint64_t from a given md
237ce8eb11aSdp  */
238ce8eb11aSdp static	int64_t
get_int(md_t md,mde_cookie_t node,char * propname,uint64_t * val)239ce8eb11aSdp get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
240ce8eb11aSdp {
241ce8eb11aSdp 	int err = md_get_prop_val(md, node, propname, val);
242ce8eb11aSdp 	return (err);
243ce8eb11aSdp }
244ce8eb11aSdp 
245ce8eb11aSdp static int
mblock_cmp(const void * a,const void * b)246ce8eb11aSdp mblock_cmp(const void *a, const void *b)
247ce8eb11aSdp {
248ce8eb11aSdp 	struct mblock_md *m1 = (struct mblock_md *)a;
249ce8eb11aSdp 	struct mblock_md *m2 = (struct mblock_md *)b;
250ce8eb11aSdp 
251ce8eb11aSdp 	if (m1->base < m2->base)
252ce8eb11aSdp 		return (-1);
253ce8eb11aSdp 	else if (m1->base == m2->base)
254ce8eb11aSdp 		return (0);
255ce8eb11aSdp 	else
256ce8eb11aSdp 		return (1);
257ce8eb11aSdp }
258ce8eb11aSdp 
259ce8eb11aSdp static void
mblock_sort(struct mblock_md * mblocks,int n)260ce8eb11aSdp mblock_sort(struct mblock_md *mblocks, int n)
261ce8eb11aSdp {
262ce8eb11aSdp 	extern void qsort(void *, size_t, size_t,
263ce8eb11aSdp 	    int (*)(const void *, const void *));
264ce8eb11aSdp 
265ce8eb11aSdp 	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
266ce8eb11aSdp }
267ce8eb11aSdp 
268924db11bSjc static void
mpo_update_tunables(void)269924db11bSjc mpo_update_tunables(void)
270924db11bSjc {
271924db11bSjc 	int i, ncpu_min;
272924db11bSjc 
273924db11bSjc 	/*
274924db11bSjc 	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
275924db11bSjc 	 * this process is currently running on before considering
276924db11bSjc 	 *  expanding threads to another lgroup.
277924db11bSjc 	 *
278924db11bSjc 	 * lgrp_expand_proc_diff determines how much less the remote lgroup
279924db11bSjc 	 *  must be loaded before expanding to it.
280924db11bSjc 	 *
281924db11bSjc 	 * On sun4v CMT processors, threads share a core pipeline, and
282924db11bSjc 	 * at less than 100% utilization, best throughput is obtained by
283924db11bSjc 	 * spreading threads across more cores, even if some are in a
284924db11bSjc 	 * different lgroup.  Spread threads to a new lgroup if the
285924db11bSjc 	 * current group is more than 50% loaded.  Because of virtualization,
286924db11bSjc 	 * lgroups may have different numbers of CPUs, but the tunables
287924db11bSjc 	 * apply to all lgroups, so find the smallest lgroup and compute
288924db11bSjc 	 * 50% loading.
289924db11bSjc 	 */
290924db11bSjc 
291924db11bSjc 	ncpu_min = NCPU;
292924db11bSjc 	for (i = 0; i < n_lgrpnodes; i++) {
293924db11bSjc 		int ncpu = mpo_lgroup[i].ncpu;
294924db11bSjc 		if (ncpu != 0 && ncpu < ncpu_min)
295924db11bSjc 			ncpu_min = ncpu;
296924db11bSjc 	}
297924db11bSjc 	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
298924db11bSjc 
299924db11bSjc 	/* new home may only be half as loaded as the existing home to use it */
300924db11bSjc 	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
301924db11bSjc 
302924db11bSjc 	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
303924db11bSjc }
304924db11bSjc 
305924db11bSjc static mde_cookie_t
cpuid_to_cpunode(md_t * md,int cpuid)306924db11bSjc cpuid_to_cpunode(md_t *md, int cpuid)
307924db11bSjc {
308924db11bSjc 	mde_cookie_t    rootnode, foundnode, *cpunodes;
309924db11bSjc 	uint64_t	cpuid_prop;
310*12551037SToomas Soome 	int	n_cpunodes, i;
311924db11bSjc 
312924db11bSjc 	if (md == NULL)
313924db11bSjc 		return (MDE_INVAL_ELEM_COOKIE);
314924db11bSjc 
315924db11bSjc 	rootnode = md_root_node(md);
316924db11bSjc 	if (rootnode == MDE_INVAL_ELEM_COOKIE)
317924db11bSjc 		return (MDE_INVAL_ELEM_COOKIE);
318924db11bSjc 
319924db11bSjc 	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
320924db11bSjc 	    "fwd", &cpunodes);
321924db11bSjc 	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
322924db11bSjc 		goto cpuid_fail;
323924db11bSjc 
324924db11bSjc 	for (i = 0; i < n_cpunodes; i++) {
325924db11bSjc 		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
326924db11bSjc 		    &cpuid_prop))
327924db11bSjc 			break;
328924db11bSjc 		if (cpuid_prop == (uint64_t)cpuid) {
329924db11bSjc 			foundnode = cpunodes[i];
330924db11bSjc 			md_free_scan_dag(md, &cpunodes);
331924db11bSjc 			return (foundnode);
332924db11bSjc 		}
333924db11bSjc 	}
334924db11bSjc cpuid_fail:
335924db11bSjc 	if (n_cpunodes > 0)
336924db11bSjc 		md_free_scan_dag(md, &cpunodes);
337924db11bSjc 	return (MDE_INVAL_ELEM_COOKIE);
338924db11bSjc }
339924db11bSjc 
340924db11bSjc static int
mpo_cpu_to_lgroup(md_t * md,mde_cookie_t cpunode)341924db11bSjc mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
342924db11bSjc {
343924db11bSjc 	mde_cookie_t *nodes;
344924db11bSjc 	uint64_t latency, lowest_latency;
345924db11bSjc 	uint64_t address_match, lowest_address_match;
346924db11bSjc 	int n_lgroups, j, result = 0;
347924db11bSjc 
348924db11bSjc 	/* Find lgroup nodes reachable from this cpu */
349924db11bSjc 	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
350924db11bSjc 	    "fwd", &nodes);
351924db11bSjc 
352924db11bSjc 	lowest_latency = ~(0UL);
353924db11bSjc 
354924db11bSjc 	/* Find the lgroup node with the smallest latency */
355924db11bSjc 	for (j = 0; j < n_lgroups; j++) {
356924db11bSjc 		result = get_int(md, nodes[j], PROP_LG_LATENCY,
357924db11bSjc 		    &latency);
358924db11bSjc 		result |= get_int(md, nodes[j], PROP_LG_MATCH,
359924db11bSjc 		    &address_match);
360924db11bSjc 		if (result != 0) {
361924db11bSjc 			j = -1;
362924db11bSjc 			goto to_lgrp_done;
363924db11bSjc 		}
364924db11bSjc 		if (latency < lowest_latency) {
365924db11bSjc 			lowest_latency = latency;
366924db11bSjc 			lowest_address_match = address_match;
367924db11bSjc 		}
368924db11bSjc 	}
369924db11bSjc 	for (j = 0; j < n_lgrpnodes; j++) {
370924db11bSjc 		if ((mpo_lgroup[j].latency == lowest_latency) &&
371924db11bSjc 		    (mpo_lgroup[j].addr_match == lowest_address_match))
372924db11bSjc 			break;
373924db11bSjc 	}
374924db11bSjc 	if (j == n_lgrpnodes)
375924db11bSjc 		j = -1;
376924db11bSjc 
377924db11bSjc to_lgrp_done:
378924db11bSjc 	if (n_lgroups > 0)
379924db11bSjc 		md_free_scan_dag(md, &nodes);
380924db11bSjc 	return (j);
381924db11bSjc }
382924db11bSjc 
383924db11bSjc /* Called when DR'ing in a CPU */
384924db11bSjc void
mpo_cpu_add(md_t * md,int cpuid)385183ef8a1SHaik Aftandilian mpo_cpu_add(md_t *md, int cpuid)
386924db11bSjc {
387924db11bSjc 	mde_cookie_t cpunode;
388924db11bSjc 
389924db11bSjc 	int i;
390924db11bSjc 
391924db11bSjc 	if (n_lgrpnodes <= 0)
392924db11bSjc 		return;
393924db11bSjc 
394924db11bSjc 	if (md == NULL)
395924db11bSjc 		goto add_fail;
396924db11bSjc 
397924db11bSjc 	cpunode = cpuid_to_cpunode(md, cpuid);
398924db11bSjc 	if (cpunode == MDE_INVAL_ELEM_COOKIE)
399924db11bSjc 		goto add_fail;
400924db11bSjc 
401924db11bSjc 	i = mpo_cpu_to_lgroup(md, cpunode);
402924db11bSjc 	if (i == -1)
403924db11bSjc 		goto add_fail;
404924db11bSjc 
405924db11bSjc 	mpo_cpu[cpuid].lgrp_index = i;
406924db11bSjc 	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
407924db11bSjc 	mpo_lgroup[i].ncpu++;
408924db11bSjc 	mpo_update_tunables();
409924db11bSjc 	return;
410924db11bSjc add_fail:
411924db11bSjc 	panic("mpo_cpu_add: Cannot read MD");
412924db11bSjc }
413924db11bSjc 
414924db11bSjc /* Called when DR'ing out a CPU */
415924db11bSjc void
mpo_cpu_remove(int cpuid)416924db11bSjc mpo_cpu_remove(int cpuid)
417924db11bSjc {
418924db11bSjc 	int i;
419924db11bSjc 
420924db11bSjc 	if (n_lgrpnodes <= 0)
421924db11bSjc 		return;
422924db11bSjc 
423924db11bSjc 	i = mpo_cpu[cpuid].lgrp_index;
424924db11bSjc 	mpo_lgroup[i].ncpu--;
425924db11bSjc 	mpo_cpu[cpuid].home = 0;
426924db11bSjc 	mpo_cpu[cpuid].lgrp_index = -1;
427924db11bSjc 	mpo_update_tunables();
428924db11bSjc }
429924db11bSjc 
4309853d9e8SJason Beloro static mde_cookie_t
md_get_root(md_t * md)4319853d9e8SJason Beloro md_get_root(md_t *md)
432ce8eb11aSdp {
4339853d9e8SJason Beloro 	mde_cookie_t root = MDE_INVAL_ELEM_COOKIE;
4349853d9e8SJason Beloro 	int n_nodes;
435ce8eb11aSdp 
436ce8eb11aSdp 	n_nodes = md_node_count(md);
437ce8eb11aSdp 
438ce8eb11aSdp 	if (n_nodes <= 0) {
4399853d9e8SJason Beloro 		MPO_STATUS("md_get_root: No nodes in node count\n");
4409853d9e8SJason Beloro 		return (root);
441ce8eb11aSdp 	}
442ce8eb11aSdp 
443ce8eb11aSdp 	root = md_root_node(md);
444ce8eb11aSdp 
445ce8eb11aSdp 	if (root == MDE_INVAL_ELEM_COOKIE) {
4469853d9e8SJason Beloro 		MPO_STATUS("md_get_root: Root node is missing\n");
4479853d9e8SJason Beloro 		return (root);
448ce8eb11aSdp 	}
449ce8eb11aSdp 
4509853d9e8SJason Beloro 	MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes);
4519853d9e8SJason Beloro 	MPO_DEBUG("md_get_root: md: %p\n", md);
4529853d9e8SJason Beloro 	MPO_DEBUG("md_get_root: root: %lx\n", root);
4539853d9e8SJason Beloro done:
4549853d9e8SJason Beloro 	return (root);
4559853d9e8SJason Beloro }
456ce8eb11aSdp 
/*
 * Read the memory-latency-group (lgroup) nodes reachable from 'root'
 * in the MD and populate the file-scope mpo_lgroup[] array and
 * n_lgrpnodes.  Also verifies the simplifying assumptions the rest of
 * this file relies on: every lgroup has the same addr_mask, sub-page
 * interleave has been factored out (fix_interleave()), and every
 * lgroup node sees all n_mblocks mblocks.
 *
 * Returns 0 on success, -1 on failure.  Side effects: mutates
 * n_lgrpnodes and mpo_lgroup[]; on both success and failure the
 * lgrpnodes scan array is freed and each mpo_lgroup[].node cookie is
 * invalidated (the "fail:" label is reached on the success path too).
 */
static int
lgrp_update(md_t *md, mde_cookie_t root)
{
	int i, j, result;
	int ret_val = 0;
	int sub_page_fix;
	mde_cookie_t *nodes, *lgrpnodes;

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	/* >= (not >) MAX_MD_LGROUPS: one slot is kept in reserve */
	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_update: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes);

	/* Capture each lgroup's node cookie, mask/match and latency */
	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_update: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks. However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				/*
				 * NOTE(review): this branch means an lgroup
				 * does NOT see all mblocks; the status text
				 * below reads as if the opposite were true —
				 * confirm intended wording upstream.
				 */
				MPO_STATUS("lgrp_update: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}
fail:
	/* Reached on success too: free the scan array, drop stale cookies */
	if (n_lgrpnodes > 0) {
		md_free_scan_dag(md, &lgrpnodes);
		for (i = 0; i < n_lgrpnodes; i++)
			mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
	}

	return (ret_val);
}
5529853d9e8SJason Beloro 
/*
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 *
 * Returns 0 and sets up all MPO state (mblocks, stripes, home mask,
 * per-CPU homes, latencies) on success; returns -1 and clears
 * sun4v_mpo_enable on any failure.  Note that mblocks/stripes are
 * installed early so ra_to_pa is available for page coloring even when
 * lgroup setup subsequently fails.
 */
static	int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *mblocknodes;
	int o;
	uint64_t i, k, stripe, stride;
	uint64_t mem_lg_homeset = 0;	/* bitset of observed home values */
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	mpo_config_t new_config;

	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
		/*
		 * NOTE(review): jumping to "fail" before the mblock scan
		 * relies on the global n_mblocks being 0 here, otherwise
		 * an uninitialized mblocknodes would be freed — confirm
		 * this path is only reachable with n_mblocks == 0.
		 */
		ret_val = -1;
		goto fail;
	}

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
	    &mblocknodes);
	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
		    "Descriptor\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */
	if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
		ret_val = -1;
		goto fail;
	}

	mblock_update(&new_config, md, mblocknodes);
	mblock_install(&new_config);

	/*
	 * Page coloring hook is required so we can iterate through mnodes.
	 * (Address-of-function null test: presumably a weak-symbol check
	 * for optional platform support — confirm against the linker setup.)
	 */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);

	/* Populate mpo_lgroup[] and validate the lgroup assumptions */
	if ((ret_val = lgrp_update(md, root)) == -1)
		goto fail;

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Derive the mem_node stripe geometry from the home mask */
	stripe_shift = highbit(max_locality_groups) - 1;
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;
	mnode_stride = btop(stride);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number different "home"  mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10. In the MD, the units are defined to be
	 * pico-seconds.
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	/* Convert pico-seconds to the framework's nano-sec/10 units */
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].lgrp_index = -1;
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */
		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		/* Home each CPU on its lowest-latency lgroup */
		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
		if (o == -1) {
			ret_val = -1;
			goto fail;
		}
		mpo_cpu[k].lgrp_index = o;
		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
		mpo_lgroup[o].ncpu++;
	}
	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* Reached on success too: release both scan arrays */
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0) {
		MPO_STATUS("MPO feature is enabled.\n");
	} else
		sun4v_mpo_enable = 0;	/* set this for DR */

	return (ret_val);
}
752ce8eb11aSdp 
753ce8eb11aSdp /*
754ce8eb11aSdp  *  Determine the number of unique mem_lg's present in our system
755ce8eb11aSdp  */
756ce8eb11aSdp static	int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)757ce8eb11aSdp unique_home_mem_lg_count(uint64_t mem_lg_homeset)
758ce8eb11aSdp {
759ce8eb11aSdp 	int homeid;
760ce8eb11aSdp 	int count = 0;
761ce8eb11aSdp 
762ce8eb11aSdp 	/*
763ce8eb11aSdp 	 * Scan the "home" bits of the mem_lgs, count
764ce8eb11aSdp 	 * the number that are unique.
765ce8eb11aSdp 	 */
766ce8eb11aSdp 
767ce8eb11aSdp 	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
768ce8eb11aSdp 		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
769ce8eb11aSdp 			count++;
770ce8eb11aSdp 		}
771ce8eb11aSdp 	}
772ce8eb11aSdp 
773ce8eb11aSdp 	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
774ce8eb11aSdp 	    mem_lg_homeset);
775ce8eb11aSdp 	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
776ce8eb11aSdp 
777ce8eb11aSdp 	/* Default must be at least one */
778ce8eb11aSdp 	if (count == 0)
779ce8eb11aSdp 		count = 1;
780ce8eb11aSdp 
781ce8eb11aSdp 	return (count);
782ce8eb11aSdp }
783ce8eb11aSdp 
784ce8eb11aSdp /*
785ce8eb11aSdp  * Platform specific lgroup initialization
786ce8eb11aSdp  */
787ce8eb11aSdp void
plat_lgrp_init(void)788ce8eb11aSdp plat_lgrp_init(void)
789ce8eb11aSdp {
790ce8eb11aSdp 	md_t *md;
791924db11bSjc 	int rc;
792ce8eb11aSdp 
793ce8eb11aSdp 	/* Get the Machine Descriptor handle */
794ce8eb11aSdp 
795ce8eb11aSdp 	md = md_get_handle();
796ce8eb11aSdp 
797ce8eb11aSdp 	/* If not, we cannot continue */
798ce8eb11aSdp 
799ce8eb11aSdp 	if (md == NULL) {
800ce8eb11aSdp 		panic("cannot access machine descriptor\n");
801ce8eb11aSdp 	} else {
802ce8eb11aSdp 		rc = lgrp_traverse(md);
803ce8eb11aSdp 		(void) md_fini_handle(md);
804ce8eb11aSdp 	}
805ce8eb11aSdp 
806ce8eb11aSdp 	/*
807ce8eb11aSdp 	 * If we can't process the MD for lgroups then at least let the
808ce8eb11aSdp 	 * system try to boot.  Assume we have one lgroup so that
809ce8eb11aSdp 	 * when plat_build_mem_nodes is called, it will attempt to init
810ce8eb11aSdp 	 * an mnode based on the supplied memory segment.
811ce8eb11aSdp 	 */
812ce8eb11aSdp 
813ce8eb11aSdp 	if (rc == -1) {
814ce8eb11aSdp 		home_mask_pfn = 0;
815ce8eb11aSdp 		max_locality_groups = 1;
816ce8eb11aSdp 		n_locality_groups = 1;
817ce8eb11aSdp 		return;
818ce8eb11aSdp 	}
819ce8eb11aSdp 
820ce8eb11aSdp 	mem_node_pfn_shift = 0;
821ce8eb11aSdp 	mem_node_physalign = 0;
822ce8eb11aSdp 
823ce8eb11aSdp 	/* Use lgroup-aware TSB allocations */
824ce8eb11aSdp 	tsb_lgrp_affinity = 1;
825ce8eb11aSdp 
826ce8eb11aSdp 	/* Require that a home lgroup have some memory to be chosen */
827ce8eb11aSdp 	lgrp_mem_free_thresh = 1;
828ce8eb11aSdp 
829ce8eb11aSdp 	/* Standard home-on-next-touch policy */
830ce8eb11aSdp 	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
831ce8eb11aSdp 
832ce8eb11aSdp 	/* Disable option to choose root lgroup if all leaf lgroups are busy */
833ce8eb11aSdp 	lgrp_load_thresh = UINT32_MAX;
834924db11bSjc 
835924db11bSjc 	mpo_update_tunables();
836ce8eb11aSdp }
837ce8eb11aSdp 
838ce8eb11aSdp /*
839ce8eb11aSdp  *  Helper routine for debugging calls to mem_node_add_slice()
840ce8eb11aSdp  */
841ce8eb11aSdp static	void
mpo_mem_node_add_slice(pfn_t basepfn,pfn_t endpfn)842ce8eb11aSdp mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
843ce8eb11aSdp {
844ce8eb11aSdp #if defined(DEBUG) && !defined(lint)
845ce8eb11aSdp 	static int slice_count = 0;
846ce8eb11aSdp 
847ce8eb11aSdp 	slice_count++;
848ce8eb11aSdp 	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
849ce8eb11aSdp 	    slice_count, basepfn, endpfn);
850ce8eb11aSdp #endif
851ce8eb11aSdp 	mem_node_add_slice(basepfn, endpfn);
852ce8eb11aSdp }
853ce8eb11aSdp 
8549853d9e8SJason Beloro static	void
mpo_mem_node_del_slice(pfn_t basepfn,pfn_t endpfn)8559853d9e8SJason Beloro mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn)
8569853d9e8SJason Beloro {
8579853d9e8SJason Beloro #if defined(DEBUG) && !defined(lint)
8589853d9e8SJason Beloro 	static int slice_count = 0;
8599853d9e8SJason Beloro 
8609853d9e8SJason Beloro 	slice_count++;
8619853d9e8SJason Beloro 	MPO_DEBUG("mem_del_slice(%d): basepfn: %lx  endpfn: %lx\n",
8629853d9e8SJason Beloro 	    slice_count, basepfn, endpfn);
8639853d9e8SJason Beloro #endif
8649853d9e8SJason Beloro 	mem_node_del_slice(basepfn, endpfn);
8659853d9e8SJason Beloro }
8669853d9e8SJason Beloro 
867ce8eb11aSdp /*
868ce8eb11aSdp  *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
869ce8eb11aSdp  */
870ce8eb11aSdp static	void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand,int mnode)871ce8eb11aSdp mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
872ce8eb11aSdp {
8739853d9e8SJason Beloro 	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
874ce8eb11aSdp 	    "mnode index: %d\n", plathand, mnode);
875ce8eb11aSdp 	plat_assign_lgrphand_to_mem_node(plathand, mnode);
876ce8eb11aSdp }
877ce8eb11aSdp 
878ce8eb11aSdp /*
879ce8eb11aSdp  * plat_build_mem_nodes()
880ce8eb11aSdp  *
881ce8eb11aSdp  * Define the mem_nodes based on the modified boot memory list,
882ce8eb11aSdp  * or based on info read from the MD in plat_lgrp_init().
883ce8eb11aSdp  *
884ce8eb11aSdp  * When the home mask lies in the middle of the address bits (as it does on
885ce8eb11aSdp  * Victoria Falls), then the memory in one mem_node is no longer contiguous;
886ce8eb11aSdp  * it is striped across an mblock in a repeating pattern of contiguous memory
887ce8eb11aSdp  * followed by a gap.  The stripe width is the size of the contiguous piece.
888ce8eb11aSdp  * The stride is the distance from the start of one contiguous piece to the
889ce8eb11aSdp  * start of the next.  The gap is thus stride - stripe_width.
890ce8eb11aSdp  *
891ce8eb11aSdp  * The stripe of an mnode that falls within an mblock is described by the type
892ce8eb11aSdp  * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
893ce8eb11aSdp  * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
894ce8eb11aSdp  * this array is predetermined.  The mem_stripe_t that describes mnode m
895ce8eb11aSdp  * within mpo_mblock[i] is stored at
896ce8eb11aSdp  *	 mem_stripes[ m + i * max_locality_groups ]
897ce8eb11aSdp  *
898ce8eb11aSdp  * max_locality_groups is the total number of possible locality groups,
899ce8eb11aSdp  * as defined by the size of the home mask, even if the memory assigned
900ce8eb11aSdp  * to the domain is small and does not cover all the lgroups.  Thus some
901ce8eb11aSdp  * mem_stripe_t's may be empty.
902ce8eb11aSdp  *
903ce8eb11aSdp  * The members of mem_stripe_t are:
904ce8eb11aSdp  *	physbase: First valid page in mem_node in the corresponding mblock
905ce8eb11aSdp  *	physmax: Last valid page in mem_node in mblock
906ce8eb11aSdp  *	offset:  The full stripe width starts at physbase - offset.
907ce8eb11aSdp  *	    Thus if offset is non-zero, this mem_node starts in the middle
908ce8eb11aSdp  *	    of a stripe width, and the second full stripe starts at
909ce8eb11aSdp  *	    physbase - offset + stride.  (even though physmax may fall in the
910ce8eb11aSdp  *	    middle of a stripe width, we do not save the ending fragment size
911ce8eb11aSdp  *	    in this data structure.)
912ce8eb11aSdp  *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
913ce8eb11aSdp  *
914ce8eb11aSdp  *	The stripe width is kept in the global mnode_pages.
915ce8eb11aSdp  *	The stride is kept in the global mnode_stride.
916ce8eb11aSdp  *	All the above use pfn's as the unit.
917ce8eb11aSdp  *
918ce8eb11aSdp  * As an example, the memory layout for a domain with 2 mblocks and 4
919ce8eb11aSdp  * mem_nodes 0,1,2,3 could look like this:
920ce8eb11aSdp  *
921ce8eb11aSdp  *	123012301230 ...	012301230123 ...
922ce8eb11aSdp  *	  mblock 0		  mblock 1
923ce8eb11aSdp  */
924ce8eb11aSdp 
9259853d9e8SJason Beloro /*ARGSUSED*/
926ce8eb11aSdp void
plat_build_mem_nodes(prom_memlist_t * list,size_t nelems)927986fd29aSsetje plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
928ce8eb11aSdp {
9299853d9e8SJason Beloro 	int elem;
9309853d9e8SJason Beloro 	uint64_t base, len;
931ce8eb11aSdp 
932e853d8c3Sjc 	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
933e853d8c3Sjc 	max_mem_nodes = max_locality_groups;
934ce8eb11aSdp 
9359853d9e8SJason Beloro 	mstripe_update(&mpo_config);
9369853d9e8SJason Beloro 
937e853d8c3Sjc 	/* Check for non-MPO sun4v platforms */
938ce8eb11aSdp 	if (n_locality_groups <= 1) {
939e853d8c3Sjc 		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
940986fd29aSsetje 		for (elem = 0; elem < nelems; list++, elem++) {
941986fd29aSsetje 			base = list->addr;
942986fd29aSsetje 			len = list->size;
943ce8eb11aSdp 
944ce8eb11aSdp 			mpo_mem_node_add_slice(btop(base),
945ce8eb11aSdp 			    btop(base + len - 1));
946ce8eb11aSdp 		}
947ce8eb11aSdp 		mem_node_pfn_shift = 0;
948ce8eb11aSdp 		mem_node_physalign = 0;
9499853d9e8SJason Beloro 	} else
9509853d9e8SJason Beloro 		mnode_update(&mpo_config, 0, 0, U_ADD_ALL);
951b779d3e0Sdp 
9529853d9e8SJason Beloro 	/*
9539853d9e8SJason Beloro 	 * Indicate to vm_pagelist that the hpm_counters array
9549853d9e8SJason Beloro 	 * should be shared because the ranges overlap.
9559853d9e8SJason Beloro 	 */
9569853d9e8SJason Beloro 	if (max_mem_nodes > 1) {
9579853d9e8SJason Beloro 		interleaved_mnodes = 1;
958ce8eb11aSdp 	}
9599853d9e8SJason Beloro }
960ce8eb11aSdp 
961ce8eb11aSdp /*
962ce8eb11aSdp  * Return the locality group value for the supplied processor
963ce8eb11aSdp  */
964ce8eb11aSdp lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)965ce8eb11aSdp plat_lgrp_cpu_to_hand(processorid_t id)
966ce8eb11aSdp {
9679853d9e8SJason Beloro 	lgrp_handle_t lgrphand;
9689853d9e8SJason Beloro 
9699853d9e8SJason Beloro 	mpo_rd_lock();
970ce8eb11aSdp 	if (n_locality_groups > 1) {
9719853d9e8SJason Beloro 		lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
972ce8eb11aSdp 	} else {
9739853d9e8SJason Beloro 		lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
974ce8eb11aSdp 	}
9759853d9e8SJason Beloro 	mpo_rd_unlock();
9769853d9e8SJason Beloro 
9779853d9e8SJason Beloro 	return (lgrphand);
978ce8eb11aSdp }
979ce8eb11aSdp 
980ce8eb11aSdp int
plat_lgrp_latency(lgrp_handle_t from,lgrp_handle_t to)981ce8eb11aSdp plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
982ce8eb11aSdp {
983ce8eb11aSdp 	/*
984ce8eb11aSdp 	 * Return min remote latency when there are more than two lgroups
985ce8eb11aSdp 	 * (root and child) and getting latency between two different lgroups
986ce8eb11aSdp 	 * or root is involved.
987ce8eb11aSdp 	 */
988ce8eb11aSdp 	if (lgrp_optimizations() && (from != to ||
989ce8eb11aSdp 	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
990ce8eb11aSdp 		return ((int)higher_latency);
991ce8eb11aSdp 	} else {
992ce8eb11aSdp 		return ((int)lower_latency);
993ce8eb11aSdp 	}
994ce8eb11aSdp }
995ce8eb11aSdp 
996ce8eb11aSdp int
plat_pfn_to_mem_node(pfn_t pfn)997ce8eb11aSdp plat_pfn_to_mem_node(pfn_t pfn)
998ce8eb11aSdp {
999ce8eb11aSdp 	int i, mnode;
1000ce8eb11aSdp 	pfn_t ra_to_pa_pfn;
1001ce8eb11aSdp 	struct mblock_md *mb;
1002ce8eb11aSdp 
1003ce8eb11aSdp 	if (n_locality_groups <= 1)
1004ce8eb11aSdp 		return (0);
1005ce8eb11aSdp 
1006ce8eb11aSdp 	/*
1007ce8eb11aSdp 	 * The mnode is defined to be 1:1 with the lgroup handle, which
1008ce8eb11aSdp 	 * is taken from from the home bits.  Find the mblock in which
1009ce8eb11aSdp 	 * the pfn falls to get the ra_to_pa adjustment, and extract
1010ce8eb11aSdp 	 * the home bits.
1011ce8eb11aSdp 	 */
10129853d9e8SJason Beloro 	mpo_rd_lock();
1013ce8eb11aSdp 	mb = &mpo_mblock[0];
1014ce8eb11aSdp 	for (i = 0; i < n_mblocks; i++) {
1015ce8eb11aSdp 		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
1016ce8eb11aSdp 			ra_to_pa_pfn = btop(mb->ra_to_pa);
1017ce8eb11aSdp 			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
1018ce8eb11aSdp 			    home_mask_pfn_shift);
1019ce8eb11aSdp 			ASSERT(mnode < max_mem_nodes);
10209853d9e8SJason Beloro 			mpo_rd_unlock();
1021ce8eb11aSdp 			return (mnode);
1022ce8eb11aSdp 		}
1023ce8eb11aSdp 		mb++;
1024ce8eb11aSdp 	}
1025ce8eb11aSdp 
1026ce8eb11aSdp 	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
1027ce8eb11aSdp 	return (pfn);
1028ce8eb11aSdp }
1029ce8eb11aSdp 
1030ce8eb11aSdp /*
1031ce8eb11aSdp  * plat_rapfn_to_papfn
1032ce8eb11aSdp  *
1033ce8eb11aSdp  * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
1034ce8eb11aSdp  * and home mask bits are correct.  The upper bits do not necessarily
1035ce8eb11aSdp  * match the actual PA, however.
1036ce8eb11aSdp  */
1037ce8eb11aSdp pfn_t
plat_rapfn_to_papfn(pfn_t pfn)1038ce8eb11aSdp plat_rapfn_to_papfn(pfn_t pfn)
1039ce8eb11aSdp {
1040ce8eb11aSdp 	int i;
1041ce8eb11aSdp 	pfn_t ra_to_pa_pfn;
1042ce8eb11aSdp 	struct mblock_md *mb;
1043ce8eb11aSdp 
1044ce8eb11aSdp 	ASSERT(n_mblocks > 0);
1045ce8eb11aSdp 	if (n_mblocks == 1)
1046ce8eb11aSdp 		return (pfn + base_ra_to_pa_pfn);
1047ce8eb11aSdp 
1048ce8eb11aSdp 	/*
1049ce8eb11aSdp 	 * Find the mblock in which the pfn falls
1050ce8eb11aSdp 	 * in order to get the ra_to_pa adjustment.
1051ce8eb11aSdp 	 */
10529853d9e8SJason Beloro 	mpo_rd_lock();
1053ce8eb11aSdp 	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
1054ce8eb11aSdp 		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
1055ce8eb11aSdp 			ra_to_pa_pfn = btop(mb->ra_to_pa);
10569853d9e8SJason Beloro 			mpo_rd_unlock();
1057ce8eb11aSdp 			return (pfn + ra_to_pa_pfn);
1058ce8eb11aSdp 		}
1059ce8eb11aSdp 	}
1060ce8eb11aSdp 
1061ce8eb11aSdp 	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
1062ce8eb11aSdp 	return (pfn);
1063ce8eb11aSdp }
1064ce8eb11aSdp 
1065ce8eb11aSdp /*
1066ce8eb11aSdp  * plat_mem_node_iterator_init()
1067b779d3e0Sdp  *      Initialize cookie "it" to iterate over pfn's in an mnode.  There is
1068b779d3e0Sdp  *      no additional iterator function.  The caller uses the info from
1069b779d3e0Sdp  *      the iterator structure directly.
1070b779d3e0Sdp  *
1071b779d3e0Sdp  *      pfn: starting pfn.
1072b779d3e0Sdp  *      mnode: desired mnode.
1073b779d3e0Sdp  *	szc: desired page size.
1074b779d3e0Sdp  *      init:
1075b779d3e0Sdp  *          if 1, start a new traversal, initialize "it", find first
1076b779d3e0Sdp  *              mblock containing pfn, and return its starting pfn
1077b779d3e0Sdp  *              within the mnode.
1078b779d3e0Sdp  *          if 0, continue the previous traversal using passed-in data
1079b779d3e0Sdp  *              from "it", advance to the next mblock, and return its
1080b779d3e0Sdp  *              starting pfn within the mnode.
1081b779d3e0Sdp  *      it: returns readonly data to the caller; see below.
1082ce8eb11aSdp  *
1083b779d3e0Sdp  *	The input pfn must be aligned for the page size szc.
1084ce8eb11aSdp  *
1085b779d3e0Sdp  *      Returns: starting pfn for the iteration for the mnode/mblock,
1086b779d3e0Sdp  *	    which is aligned according to the page size,
1087b779d3e0Sdp  *          or returns (pfn_t)(-1) if the input pfn lies past the last
1088b779d3e0Sdp  *          valid pfn of the mnode.
1089b779d3e0Sdp  *      Returns misc values in the "it" struct that allows the caller
1090b779d3e0Sdp  *          to advance the pfn within an mblock using address arithmetic;
1091b779d3e0Sdp  *          see definition of mem_node_iterator_t in vm_dep.h.
1092b779d3e0Sdp  *          When the caller calculates a pfn that is greater than the
1093b779d3e0Sdp  *          returned value it->mi_mblock_end, the caller should again
1094b779d3e0Sdp  *          call plat_mem_node_iterator_init, passing init=0.
10959853d9e8SJason Beloro  *
10969853d9e8SJason Beloro  *          The last mblock in continuation case may be invalid because
10979853d9e8SJason Beloro  *          of memory DR.  To detect this situation mi_genid is checked
10989853d9e8SJason Beloro  *          against mpo_genid which is incremented after a memory DR
10999853d9e8SJason Beloro  *          operation.  See also plat_slice_add()/plat_slice_del().
1100ce8eb11aSdp  */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
    mem_node_iterator_t *it, int init)
{
	int i;
	pgcnt_t szcpgcnt = PNUM_SIZE(szc);	/* pages per szc-sized page */
	struct mblock_md *mblock;
	pfn_t base, end;
	mem_stripe_t *ms;
	uint64_t szcpagesize;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);
	/* The input pfn must be aligned for the requested page size. */
	ASSERT(P2PHASE(pfn, szcpgcnt) == 0);

	mpo_rd_lock();

	/*
	 * Restart the traversal if explicitly requested, or if a memory
	 * DR operation has changed mpo_genid and so invalidated the
	 * cached mblock index in "it".
	 */
	if (init || (it->mi_genid != mpo_genid)) {
		it->mi_genid = mpo_genid;
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		/*
		 * If the RA-to-PA adjustment is not itself szc-aligned,
		 * no returned pfn could keep the PA aligned; fail.
		 */
		if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
			pfn = (pfn_t)-1;
			goto done;
		}
		/* Whole mnode is one contiguous range; no striping. */
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
		/* Fail if no fully-contained szc page remains. */
		if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		goto done;
	}

	/* init=1 means begin iterator, init=0 means continue */
	if (init == 1) {
		i = 0;
	} else {
		ASSERT(it->mi_last_mblock < n_mblocks);
		i = it->mi_last_mblock;
		/* Caller must have exhausted the previous mblock's stripe. */
		ASSERT(pfn >
		    mem_stripes[i * max_locality_groups + mnode].physmax);
		if (++i == n_mblocks) {
			pfn = (pfn_t)-1;
			goto done;
		}
	}

	/*
	 * Find mblock that contains pfn for mnode's stripe, or first such an
	 * mblock after pfn, else pfn is out of bound and we'll return -1.
	 * mblocks and stripes are sorted in ascending address order.
	 */
	szcpagesize = szcpgcnt << PAGESHIFT;
	for (; i < n_mblocks; i++) {
		/* Skip mblocks whose RA-to-PA delta breaks szc alignment. */
		if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
			continue;
		ms = &mem_stripes[i * max_locality_groups + mnode];
		/* Stripe must exist and hold at least one aligned szc page. */
		if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
		    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
		    ms->physmax)
			break;
	}
	if (i == n_mblocks) {
		/* pfn lies past the last valid stripe for this mnode. */
		it->mi_last_mblock = i - 1;
		pfn = (pfn_t)-1;
		goto done;
	}

	it->mi_last_mblock = i;

	/* Publish the chosen stripe's bounds and masks to the caller. */
	mblock = &mpo_mblock[i];
	base = ms->physbase;
	end = ms->physmax;

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base) {
		pfn = P2ROUNDUP(base, szcpgcnt);
		ASSERT(pfn + szcpgcnt - 1 <= end);
	}
	ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
done:
	mpo_rd_unlock();
	return (pfn);
}
1202ce8eb11aSdp 
1203ce8eb11aSdp /*
1204ce8eb11aSdp  * plat_mem_node_intersect_range()
1205ce8eb11aSdp  *
1206ce8eb11aSdp  * Find the intersection between a memnode and a range of pfn's.
1207ce8eb11aSdp  */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	/* Nothing to count if the mnode is absent or the range is empty. */
	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	/* No overlap between the test range and this mnode's span. */
	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	/* Single locality group: the mnode is one contiguous range. */
	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;	/* gap between stripes */
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */

	mpo_rd_lock();
	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		/* NB: base/end are rebound to this stripe's limits here. */
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			/* Whole strides in range, minus start/end fragments. */
			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
	mpo_rd_unlock();
}
1310ce8eb11aSdp 
1311ce8eb11aSdp /*
1312ce8eb11aSdp  * valid_pages()
1313ce8eb11aSdp  *
1314ce8eb11aSdp  * Return 1 if pages are valid and do not cross mnode boundaries
1315ce8eb11aSdp  * (which would break page free list assumptions), and 0 otherwise.
1316ce8eb11aSdp  */
1317ce8eb11aSdp 
1318ce8eb11aSdp #define	MNODE(pa)	\
1319ce8eb11aSdp 	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1320ce8eb11aSdp 
static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	/*
	 * cpu0 == 0 means no cpu MD node was supplied; reuse the mask
	 * cached in szc_mask0 by an earlier call.
	 */
	if (cpu0 == 0)
		szc_mask = szc_mask0;
	else {
		if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
			szc_mask = 0;
		/* largest in sun4v default support */
		szc_mask |=  (1 << TTE4M);
		szc_mask0 = szc_mask;
	}
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	/* The mnode stripe must be able to hold the largest coalesced page. */
	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	/* Check every mblock for mnode-crossing large pages. */
	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	/* All mblocks passed: large pages cannot span mnodes. */
	return (1);
}
1414ce8eb11aSdp 
1415ce8eb11aSdp 
1416ce8eb11aSdp /*
1417ce8eb11aSdp  * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1418ce8eb11aSdp  * if any, and remove them.  This yields a config where the "coarse
1419ce8eb11aSdp  * grained" lgroups cover all of memory, even though part of that memory
1420ce8eb11aSdp  * is fine grain interleaved and does not deliver a purely local memory
1421ce8eb11aSdp  * latency.
1422ce8eb11aSdp  *
1423ce8eb11aSdp  * This function reads and modifies the globals:
1424ce8eb11aSdp  *	mpo_lgroup[], n_lgrpnodes
1425ce8eb11aSdp  *
1426ce8eb11aSdp  * Returns 1 if lgroup nodes were removed, 0 otherwise.
1427ce8eb11aSdp  */
1428ce8eb11aSdp 
1429ce8eb11aSdp static int
fix_interleave(void)1430ce8eb11aSdp fix_interleave(void)
1431ce8eb11aSdp {
1432ce8eb11aSdp 	int i, j;
1433ce8eb11aSdp 	uint64_t mask = 0;
1434ce8eb11aSdp 
1435ce8eb11aSdp 	j = 0;
1436ce8eb11aSdp 	for (i = 0; i < n_lgrpnodes; i++) {
1437ce8eb11aSdp 		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1438ce8eb11aSdp 			/* remove this lgroup */
1439ce8eb11aSdp 			mask = mpo_lgroup[i].addr_mask;
1440ce8eb11aSdp 		} else {
1441ce8eb11aSdp 			mpo_lgroup[j++] = mpo_lgroup[i];
1442ce8eb11aSdp 		}
1443ce8eb11aSdp 	}
1444ce8eb11aSdp 	n_lgrpnodes = j;
1445ce8eb11aSdp 
1446ce8eb11aSdp 	if (mask != 0)
1447ce8eb11aSdp 		MPO_STATUS("sub-page interleave %lx found; "
1448ce8eb11aSdp 		    "removing lgroup.\n", mask);
1449ce8eb11aSdp 
1450ce8eb11aSdp 	return (mask != 0);
1451ce8eb11aSdp }
14529853d9e8SJason Beloro 
14539853d9e8SJason Beloro /*
14549853d9e8SJason Beloro  * mblock_alloc
14559853d9e8SJason Beloro  *
14569853d9e8SJason Beloro  * Allocate memory for mblock an stripe arrays from either static or
14579853d9e8SJason Beloro  * dynamic space depending on utype, and return the result in mc.
14589853d9e8SJason Beloro  * Returns 0 on success and -1 on error.
14599853d9e8SJason Beloro  */
14609853d9e8SJason Beloro 
14619853d9e8SJason Beloro static int
mblock_alloc(mpo_config_t * mc,update_t utype,int nmblocks)14629853d9e8SJason Beloro mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks)
14639853d9e8SJason Beloro {
14649853d9e8SJason Beloro 	mblock_md_t *mb = NULL;
14659853d9e8SJason Beloro 	mem_stripe_t *ms = NULL;
14669853d9e8SJason Beloro 	int nstripes = MAX_MEM_NODES * nmblocks;
14679853d9e8SJason Beloro 	size_t mblocksz = nmblocks * sizeof (struct mblock_md);
14689853d9e8SJason Beloro 	size_t mstripesz = nstripes * sizeof (mem_stripe_t);
14699853d9e8SJason Beloro 	size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
14709853d9e8SJason Beloro 
14719853d9e8SJason Beloro 	/*
14729853d9e8SJason Beloro 	 * Allocate space for mblocks and mstripes.
14739853d9e8SJason Beloro 	 *
14749853d9e8SJason Beloro 	 * For DR allocations, just use kmem_alloc(), and set
14759853d9e8SJason Beloro 	 * mc_alloc_sz to indicate it was used.
14769853d9e8SJason Beloro 	 *
14779853d9e8SJason Beloro 	 * For boot allocation:
14789853d9e8SJason Beloro 	 * If we have a small number of mblocks we will use the space
14799853d9e8SJason Beloro 	 * that we preallocated. Otherwise, we will dynamically
14809853d9e8SJason Beloro 	 * allocate the space from the prom and map it to the
14819853d9e8SJason Beloro 	 * reserved VA at MPOBUF_BASE.
14829853d9e8SJason Beloro 	 */
14839853d9e8SJason Beloro 
14849853d9e8SJason Beloro 	if (utype == U_ADD || utype == U_DEL) {
14859853d9e8SJason Beloro 		mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP);
14869853d9e8SJason Beloro 		ms = (mem_stripe_t *)(mb + nmblocks);
14879853d9e8SJason Beloro 		mc->mc_alloc_sz = allocsz;
14889853d9e8SJason Beloro 	} else if (nmblocks <= SMALL_MBLOCKS_COUNT) {
14899853d9e8SJason Beloro 		mb = &small_mpo_mblocks[0];
14909853d9e8SJason Beloro 		ms = &small_mem_stripes[0];
14919853d9e8SJason Beloro 		mc->mc_alloc_sz = 0;
14929853d9e8SJason Beloro 	} else {
14939853d9e8SJason Beloro 		/* Ensure that we dont request more space than reserved */
14949853d9e8SJason Beloro 		if (allocsz > MPOBUF_SIZE) {
14959853d9e8SJason Beloro 			MPO_STATUS("mblock_alloc: Insufficient space "
14969853d9e8SJason Beloro 			    "for mblock structures \n");
14979853d9e8SJason Beloro 			return (-1);
14989853d9e8SJason Beloro 		}
14999853d9e8SJason Beloro 		mb = (struct mblock_md *)
15009853d9e8SJason Beloro 		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
15019853d9e8SJason Beloro 		if (mb != (struct mblock_md *)MPOBUF_BASE) {
15029853d9e8SJason Beloro 			MPO_STATUS("mblock_alloc: Cannot allocate space "
15039853d9e8SJason Beloro 			    "for mblocks \n");
15049853d9e8SJason Beloro 			return (-1);
15059853d9e8SJason Beloro 		}
15069853d9e8SJason Beloro 		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
15079853d9e8SJason Beloro 		mpo_heap32_bufsz = MPOBUF_SIZE;
15089853d9e8SJason Beloro 		ms = (mem_stripe_t *)(mb + nmblocks);
15099853d9e8SJason Beloro 		mc->mc_alloc_sz = 0;
15109853d9e8SJason Beloro 	}
15119853d9e8SJason Beloro 	mc->mc_mblocks = mb;
15129853d9e8SJason Beloro 	mc->mc_stripes = ms;
15139853d9e8SJason Beloro 	mc->mc_nmblocks = nmblocks;
15149853d9e8SJason Beloro 	mc->mc_nstripes = nstripes;
15159853d9e8SJason Beloro 	MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks);
15169853d9e8SJason Beloro 	return (0);
15179853d9e8SJason Beloro }
15189853d9e8SJason Beloro 
15199853d9e8SJason Beloro /*
15209853d9e8SJason Beloro  * mblock_free
15219853d9e8SJason Beloro  *
15229853d9e8SJason Beloro  * Free memory in mc that was allocated by mblock_alloc.
15239853d9e8SJason Beloro  */
15249853d9e8SJason Beloro 
15259853d9e8SJason Beloro static void
mblock_free(mpo_config_t * mc)15269853d9e8SJason Beloro mblock_free(mpo_config_t *mc)
15279853d9e8SJason Beloro {
15289853d9e8SJason Beloro 	if (mc->mc_alloc_sz > 0) {
15299853d9e8SJason Beloro 		ASSERT(mc->mc_mblocks != mpo_mblock);
15309853d9e8SJason Beloro 		kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz);
15319853d9e8SJason Beloro 	}
15329853d9e8SJason Beloro 	bzero(mc, sizeof (*mc));
15339853d9e8SJason Beloro }
15349853d9e8SJason Beloro 
15359853d9e8SJason Beloro /*
15369853d9e8SJason Beloro  * mblock_install
15379853d9e8SJason Beloro  *
15389853d9e8SJason Beloro  * Install mblock config passed in mc as the global configuration.
15399853d9e8SJason Beloro  * May only be called at boot or while holding mpo_wr_lock.
15409853d9e8SJason Beloro  */
15419853d9e8SJason Beloro 
15429853d9e8SJason Beloro static void
mblock_install(mpo_config_t * mc)15439853d9e8SJason Beloro mblock_install(mpo_config_t *mc)
15449853d9e8SJason Beloro {
15459853d9e8SJason Beloro 	mpo_mblock = mc->mc_mblocks;
15469853d9e8SJason Beloro 	n_mblocks = mc->mc_nmblocks;
15479853d9e8SJason Beloro 	mem_stripes = mc->mc_stripes;
15489853d9e8SJason Beloro 	n_mem_stripes = mc->mc_nstripes;
15499853d9e8SJason Beloro 	base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa);
15509853d9e8SJason Beloro 	mpo_config = *mc;
15519853d9e8SJason Beloro }
15529853d9e8SJason Beloro 
15539853d9e8SJason Beloro /*
15549853d9e8SJason Beloro  * mblock_update
15559853d9e8SJason Beloro  *
15569853d9e8SJason Beloro  * Traverse mblocknodes, read the mblock properties from the MD, and
15579853d9e8SJason Beloro  * save the mblocks in mc.
15589853d9e8SJason Beloro  */
15599853d9e8SJason Beloro 
static void
mblock_update(mpo_config_t *mc, md_t md, mde_cookie_t *mblocknodes)
{
	/* j indexes MD nodes; i indexes accepted entries in mblock[] */
	uint64_t i, j;
	int result = 0;
	mblock_md_t *mblock = mc->mc_mblocks;

	for (i = 0, j = 0; j < mc->mc_nmblocks; j++) {

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[j], PROP_LG_BASE,
		    &mblock[i].base);
		if (result < 0) {
			/* malformed MD: reject the whole config */
			MPO_STATUS("mblock_update: "
			    "PROP_LG_BASE is missing\n");
			mc->mc_nmblocks = 0;
			return;
		}

		result = get_int(md, mblocknodes[j], PROP_LG_SIZE,
		    &mblock[i].size);
		if (result < 0) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_SIZE is missing\n");
			mc->mc_nmblocks = 0;
			return;
		}

		result = get_int(md, mblocknodes[j],
		    PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mblock[i].base,
		    mblock[i].size,
		    mblock[i].ra_to_pa);

		/*
		 * check for unsupportable values of base and size:
		 * base > base + size detects unsigned wraparound.
		 */
		if (mblock[i].base > mblock[i].base + mblock[i].size) {
			MPO_STATUS("mblock_update: "
			    "PROP_LG_BASE+PROP_LG_SIZE is invalid: "
			    "base = %lx, size = %lx\n",
			    mblock[i].base, mblock[i].size);
			mc->mc_nmblocks = 0;
			return;
		}

		/* eliminate size==0 blocks: only non-empty ones advance i */
		if (mblock[i].size != 0) {
			uint64_t base = mblock[i].base;
			uint64_t end = base + mblock[i].size;
			ASSERT(end > base);
			mblock[i].base_pfn = btop(base);
			/* end_pfn is inclusive, hence end - 1 */
			mblock[i].end_pfn = btop(end - 1);
			i++;
		}
	}

	if (i == 0) {
		MPO_STATUS("mblock_update: "
		    "No non-empty mblock nodes were found "
		    "in the Machine Descriptor\n");
		mc->mc_nmblocks = 0;
		return;
	}
	/* shrink the count to the number of entries actually kept */
	ASSERT(i <= mc->mc_nmblocks);
	mc->mc_nmblocks = i;

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mblock, mc->mc_nmblocks);
}
16359853d9e8SJason Beloro 
16369853d9e8SJason Beloro /*
16379853d9e8SJason Beloro  * mblock_update_add
16389853d9e8SJason Beloro  *
16399853d9e8SJason Beloro  * Update mblock config after a memory DR add.  The added range is not
16409853d9e8SJason Beloro  * needed, as we read *all* mblock nodes from the MD.  Save the mblocks
16419853d9e8SJason Beloro  * in mc.
16429853d9e8SJason Beloro  */
16439853d9e8SJason Beloro 
16449853d9e8SJason Beloro static void
mblock_update_add(mpo_config_t * mc)16459853d9e8SJason Beloro mblock_update_add(mpo_config_t *mc)
16469853d9e8SJason Beloro {
16479853d9e8SJason Beloro 	md_t *md;
16489853d9e8SJason Beloro 	mde_cookie_t root, *mblocknodes;
16499853d9e8SJason Beloro 	int nmblocks = 0;
16509853d9e8SJason Beloro 
16519853d9e8SJason Beloro 	if ((md = md_get_handle()) == NULL) {
16529853d9e8SJason Beloro 		MPO_STATUS("Cannot access Machine Descriptor\n");
16539853d9e8SJason Beloro 		goto error;
16549853d9e8SJason Beloro 	}
16559853d9e8SJason Beloro 
16569853d9e8SJason Beloro 	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE)
16579853d9e8SJason Beloro 		goto error;
16589853d9e8SJason Beloro 
16599853d9e8SJason Beloro 	nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
16609853d9e8SJason Beloro 	    &mblocknodes);
16619853d9e8SJason Beloro 	if (nmblocks <= 0) {
16629853d9e8SJason Beloro 		MPO_STATUS("No mblock nodes detected in Machine Descriptor\n");
16639853d9e8SJason Beloro 		goto error;
16649853d9e8SJason Beloro 	}
16659853d9e8SJason Beloro 
16669853d9e8SJason Beloro 	if (mblock_alloc(mc, U_ADD, nmblocks) < 0)
16679853d9e8SJason Beloro 		goto error;
16689853d9e8SJason Beloro 
16699853d9e8SJason Beloro 	mblock_update(mc, md, mblocknodes);
16709853d9e8SJason Beloro 	md_free_scan_dag(md, &mblocknodes);
16719853d9e8SJason Beloro 	(void) md_fini_handle(md);
16729853d9e8SJason Beloro 	return;
16739853d9e8SJason Beloro error:
16749853d9e8SJason Beloro 	panic("mblock_update_add: cannot process mblocks from MD.\n");
16759853d9e8SJason Beloro }
16769853d9e8SJason Beloro 
16779853d9e8SJason Beloro /*
16789853d9e8SJason Beloro  * mblock_update_del
16799853d9e8SJason Beloro  *
16809853d9e8SJason Beloro  * Update mblocks after a memory DR deletion of the range (ubase, uend).
16819853d9e8SJason Beloro  * Allocate a new mblock config, copy old config to the new, modify the new
16829853d9e8SJason Beloro  * mblocks to reflect the deletion.   The new mblocks are returned in
16839853d9e8SJason Beloro  * mc_new and are not yet installed as the active config.
16849853d9e8SJason Beloro  */
16859853d9e8SJason Beloro 
static void
mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase,
    pfn_t uend)
{
	int i, j;
	pfn_t base, end;
	mblock_md_t *mblock;
	int nmblocks = mc_old->mc_nmblocks;

	MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend);

	/*
	 * Allocate mblocks in mc_new and copy the old to the new.
	 * Allocate one extra in case the deletion splits an mblock.
	 */
	if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0)
		return;
	mblock = mc_new->mc_mblocks;
	bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t));

	/*
	 * Find the mblock containing the deleted range and adjust it in
	 * the new config.
	 */
	for (i = 0; i < nmblocks; i++) {

		/* inclusive pfn range covered by this mblock */
		base = btop(mblock[i].base);
		end = base + btop(mblock[i].size) - 1;

		/*
		 * Adjust the mblock based on the subset that was deleted.
		 *
		 * If the entire mblk was deleted, compact the table.
		 *
		 * If the middle of the mblk was deleted, extend
		 * the table.  Space for the new slot was already
		 * allocated.
		 *
		 * The memory to be deleted is a mblock or a subset of
		 * and does not span multiple mblocks.
		 */
		if (base == ubase && end == uend) {
			/* whole mblock deleted: shift the rest down */
			for (j = i; j < nmblocks - 1; j++)
				mblock[j] = mblock[j + 1];
			nmblocks--;
			bzero(&mblock[nmblocks], sizeof (*mblock));
			break;
		} else if (base < ubase && end > uend) {
			/* middle deleted: split into two mblocks */
			for (j = nmblocks - 1; j >= i; j--)
				mblock[j + 1] = mblock[j];
			mblock[i].size = ptob(ubase - base);
			mblock[i].end_pfn = ubase - 1;
			/*
			 * mblock[i+1] is a copy of the original entry, so
			 * its end_pfn already equals the original end.
			 */
			mblock[i + 1].base = ptob(uend + 1);
			mblock[i + 1].size = ptob(end - uend);
			mblock[i + 1].base_pfn = uend + 1;
			nmblocks++;
			break;
		} else if (base == ubase) {
			/* deletion at the front: advance the base */
			MPO_DEBUG("mblock_update_del: shrink>"
			    " i=%d base=0x%lx end=0x%lx", i, base, end);
			mblock[i].base = ptob(uend + 1);
			mblock[i].size -= ptob(uend - ubase + 1);
			base = uend + 1;
			mblock[i].base_pfn = base;
			mblock[i].end_pfn = end;
			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
			break;
		} else if (end == uend) {
			/* deletion at the tail: pull in the end */
			MPO_DEBUG("mblock_update_del: shrink<"
			    " i=%d base=0x%lx end=0x%lx", i, base, end);
			mblock[i].size -= ptob(uend - ubase + 1);
			end = ubase - 1;
			mblock[i].base_pfn = base;
			mblock[i].end_pfn = end;
			MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end);
			break;
		}
	}
	mc_new->mc_nmblocks = nmblocks;
	/*
	 * NOTE(review): base/end here hold whatever the loop last computed
	 * (the deleted block's own range in the full-delete case, or the
	 * final mblock's range if no branch matched) — confirm this assert
	 * expresses the intended invariant.
	 */
	ASSERT(end > base);
}
17679853d9e8SJason Beloro 
17689853d9e8SJason Beloro /*
17699853d9e8SJason Beloro  * mstripe_update
17709853d9e8SJason Beloro  *
17719853d9e8SJason Beloro  * Read mblocks from mc and update mstripes in mc
17729853d9e8SJason Beloro  */
17739853d9e8SJason Beloro 
static void
mstripe_update(mpo_config_t *mc)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode;
	uint64_t offset, stripe_end, base, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;
	mblock_md_t *mblock = mc->mc_mblocks;
	int nmblocks = mc->mc_nmblocks;
	int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t);

	/* Check for non-MPO sun4v platforms or memory DR removal */
	if (n_locality_groups <= 1) {
		ASSERT(n_locality_groups == 1);
		ASSERT(max_locality_groups == 1 && max_mem_nodes == 1);

		/*
		 * With a single locality group there is no interleaving:
		 * either no stripes at all, or one degenerate stripe per
		 * mblock covering the whole block.
		 */
		if (nmblocks == 1) {
			mc->mc_nstripes = 0;
		} else {
			mc->mc_nstripes = nmblocks;
			bzero(mc->mc_stripes, mstripesz);
			for (i = 0; i < nmblocks; i++) {
				mc->mc_stripes[i].exists = 1;
				mc->mc_stripes[i].physbase = mblock[i].base_pfn;
				mc->mc_stripes[i].physmax = mblock[i].end_pfn;
			}
		}
		return;
	}

	/* One stripe per (mblock, locality group) pair. */
	bzero(mc->mc_stripes, mstripesz);
	mc->mc_nstripes = max_locality_groups * nmblocks;
	stripe = ptob(mnode_pages);		/* bytes owned per mnode */
	stride = max_locality_groups * stripe;	/* bytes per full cycle */

	for (i = 0; i < nmblocks; i++) {
		base = mblock[i].base;
		end = base + mblock[i].size;
		ra_to_pa = mblock[i].ra_to_pa;

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		/* home lgroup of the first byte, from the PA home bits */
		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mc->mc_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/* advance to the next stripe; later ones are aligned */
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}
}
18679853d9e8SJason Beloro 
/*
 * INTERSECT(a, b, c, d)
 *
 * If the inclusive ranges [a, b] and [c, d] overlap, narrow (c, d) in
 * place to their intersection.  If they do not overlap, execute
 * "continue" — so this macro may only be used inside a loop body.
 */
#define	INTERSECT(a, b, c, d)				\
	if (((a) >= (c) && (a) <= (d)) ||		\
	    ((c) >= (a) && (c) <= (b))) {		\
		(c) = MAX((a), (c));			\
		(d) = MIN((b), (d));			\
	} else {					\
		ASSERT((a) >= (d) || (b) <= (c));	\
		continue;				\
	}						\
18779853d9e8SJason Beloro 
18789853d9e8SJason Beloro /*
18799853d9e8SJason Beloro  * mnode_update
18809853d9e8SJason Beloro  *
18819853d9e8SJason Beloro  * Read stripes from mc and update mnode extents.  The mnode extents are
18829853d9e8SJason Beloro  * part of the live configuration, so this can only be done at boot time
18839853d9e8SJason Beloro  * or while holding the mpo_wr_lock.
18849853d9e8SJason Beloro  */
18859853d9e8SJason Beloro 
18869853d9e8SJason Beloro static void
mnode_update(mpo_config_t * mc,pfn_t ubase,pfn_t uend,update_t utype)18879853d9e8SJason Beloro mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype)
18889853d9e8SJason Beloro {
18899853d9e8SJason Beloro 	int i, j, mnode, found;
18909853d9e8SJason Beloro 	pfn_t base, end;
18919853d9e8SJason Beloro 	mem_stripe_t *ms;
18929853d9e8SJason Beloro 
18939853d9e8SJason Beloro 	MPO_DEBUG("mnode_udpate: basepfn: %lx  endpfn: %lx\n", ubase, uend);
18949853d9e8SJason Beloro 
18959853d9e8SJason Beloro 	if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) {
18969853d9e8SJason Beloro 		if (utype == U_ADD)
18979853d9e8SJason Beloro 			mpo_mem_node_add_slice(ubase, uend);
18989853d9e8SJason Beloro 		else if (utype == U_DEL)
18999853d9e8SJason Beloro 			mpo_mem_node_del_slice(ubase, uend);
19009853d9e8SJason Beloro 		else
19019853d9e8SJason Beloro 			panic("mnode update: %d: invalid\n", utype);
19029853d9e8SJason Beloro 		return;
19039853d9e8SJason Beloro 	}
19049853d9e8SJason Beloro 
19059853d9e8SJason Beloro 	found = 0;
19069853d9e8SJason Beloro 	for (i = 0; i < mc->mc_nmblocks; i++) {
19079853d9e8SJason Beloro 		for (mnode = 0; mnode < max_locality_groups; mnode++) {
19089853d9e8SJason Beloro 
19099853d9e8SJason Beloro 			j = i * max_locality_groups + mnode;
19109853d9e8SJason Beloro 			ms = &mc->mc_stripes[j];
19119853d9e8SJason Beloro 			if (!ms->exists)
19129853d9e8SJason Beloro 				continue;
19139853d9e8SJason Beloro 
19149853d9e8SJason Beloro 			base = ms->physbase;
19159853d9e8SJason Beloro 			end = ms->physmax;
19169853d9e8SJason Beloro 
19179853d9e8SJason Beloro 			/*
19189853d9e8SJason Beloro 			 * Look for the mstripes intersecting this slice.
19199853d9e8SJason Beloro 			 *
19209853d9e8SJason Beloro 			 * The mstripe and slice pairs may not be equal
19219853d9e8SJason Beloro 			 * if a subset of a mblock is added/deleted.
19229853d9e8SJason Beloro 			 */
19239853d9e8SJason Beloro 			switch (utype) {
19249853d9e8SJason Beloro 			case U_ADD:
19259853d9e8SJason Beloro 				INTERSECT(ubase, uend, base, end);
19269853d9e8SJason Beloro 				/*FALLTHROUGH*/
19279853d9e8SJason Beloro 			case U_ADD_ALL:
19289853d9e8SJason Beloro 				if (n_locality_groups > 1)
19299853d9e8SJason Beloro 					mpo_plat_assign_lgrphand_to_mem_node(
19309853d9e8SJason Beloro 					    mnode, mnode);
19319853d9e8SJason Beloro 				mpo_mem_node_add_slice(base, end);
19329853d9e8SJason Beloro 				break;
19339853d9e8SJason Beloro 			case U_DEL:
19349853d9e8SJason Beloro 				INTERSECT(ubase, uend, base, end);
19359853d9e8SJason Beloro 				mpo_mem_node_del_slice(base, end);
19369853d9e8SJason Beloro 				break;
19379853d9e8SJason Beloro 			default:
19389853d9e8SJason Beloro 				panic("mnode_update: %d: invalid\n", utype);
19399853d9e8SJason Beloro 				break;
19409853d9e8SJason Beloro 			}
19419853d9e8SJason Beloro 
19429853d9e8SJason Beloro 			found++;
19439853d9e8SJason Beloro 		}
19449853d9e8SJason Beloro 	}
19459853d9e8SJason Beloro 
19469853d9e8SJason Beloro 	if (!found)
19479853d9e8SJason Beloro 		panic("mnode_update: mstripe not found");
19489853d9e8SJason Beloro 
19499853d9e8SJason Beloro #ifdef	DEBUG
19509853d9e8SJason Beloro 	if (utype == U_ADD_ALL || utype == U_DEL)
19519853d9e8SJason Beloro 		return;
19529853d9e8SJason Beloro 	found = 0;
19539853d9e8SJason Beloro 	for (i = 0; i < max_mem_nodes; i++) {
19549853d9e8SJason Beloro 		if (!mem_node_config[i].exists)
19559853d9e8SJason Beloro 			continue;
19569853d9e8SJason Beloro 		if (ubase >= mem_node_config[i].physbase &&
19579853d9e8SJason Beloro 		    ubase <= mem_node_config[i].physmax)
19589853d9e8SJason Beloro 			found |= 1;
19599853d9e8SJason Beloro 		if (uend >= mem_node_config[i].physbase &&
19609853d9e8SJason Beloro 		    uend <= mem_node_config[i].physmax)
19619853d9e8SJason Beloro 			found |= 2;
19629853d9e8SJason Beloro 	}
19639853d9e8SJason Beloro 	ASSERT(found == 3);
19649853d9e8SJason Beloro 	{
19659853d9e8SJason Beloro 		pfn_t minpfn, maxpfn;
19669853d9e8SJason Beloro 
19679853d9e8SJason Beloro 		mem_node_max_range(&minpfn, &maxpfn);
19689853d9e8SJason Beloro 		ASSERT(minpfn <= ubase);
19699853d9e8SJason Beloro 		ASSERT(maxpfn >= uend);
19709853d9e8SJason Beloro 	}
19719853d9e8SJason Beloro #endif
19729853d9e8SJason Beloro }
19739853d9e8SJason Beloro 
19749853d9e8SJason Beloro /*
19759853d9e8SJason Beloro  * Plat_slice_add()/plat_slice_del() are the platform hooks
19769853d9e8SJason Beloro  * for adding/deleting a pfn range to/from the system.
19779853d9e8SJason Beloro  *
19789853d9e8SJason Beloro  * Platform_slice_add() is used for both boot/DR cases.
19799853d9e8SJason Beloro  *
19809853d9e8SJason Beloro  * - Zeus has already added the mblocks to the MD, so read the updated
19819853d9e8SJason Beloro  *   MD and allocate all data structures required to manage the new memory
19829853d9e8SJason Beloro  *   configuration.
19839853d9e8SJason Beloro  *
19849853d9e8SJason Beloro  * - Recompute the stripes which are derived from the mblocks.
19859853d9e8SJason Beloro  *
19869853d9e8SJason Beloro  * - Update (expand) the mnode extents and install the modified mblocks as
19879853d9e8SJason Beloro  *   the new mpo config.  This must be done while holding the mpo_wr_lock
19889853d9e8SJason Beloro  *   to guarantee that no other threads access the mpo meta-data.
19899853d9e8SJason Beloro  *
19909853d9e8SJason Beloro  * - Unlock MPO data structures; the new config is live.  Free the old config.
19919853d9e8SJason Beloro  *
19929853d9e8SJason Beloro  * Plat_slice_del() is used for DR only.
19939853d9e8SJason Beloro  *
19949853d9e8SJason Beloro  * - Zeus has not yet modified the MD to reflect the deletion, so copy
19959853d9e8SJason Beloro  *   the old mpo mblocks and delete the range from the copy.
19969853d9e8SJason Beloro  *
19979853d9e8SJason Beloro  * - Recompute the stripes which are derived from the mblocks.
19989853d9e8SJason Beloro  *
19999853d9e8SJason Beloro  * - Update (shrink) the mnode extents and install the modified mblocks as
20009853d9e8SJason Beloro  *   the new mpo config.  This must be done while holding the mpo_wr_lock
20019853d9e8SJason Beloro  *   to guarantee that no other threads access the mpo meta-data.
20029853d9e8SJason Beloro  *
20039853d9e8SJason Beloro  * - Unlock MPO data structures; the new config is live.  Free the old config.
20049853d9e8SJason Beloro  */
20059853d9e8SJason Beloro 
20069853d9e8SJason Beloro void
plat_slice_add(pfn_t base,pfn_t end)20079853d9e8SJason Beloro plat_slice_add(pfn_t base, pfn_t end)
20089853d9e8SJason Beloro {
20099853d9e8SJason Beloro 	mpo_config_t old_config = mpo_config;
20109853d9e8SJason Beloro 	mpo_config_t new_config;
20119853d9e8SJason Beloro 
20129853d9e8SJason Beloro 	VALIDATE_SLICE(base, end);
20139853d9e8SJason Beloro 	mblock_update_add(&new_config);
20149853d9e8SJason Beloro 	mstripe_update(&new_config);
20159853d9e8SJason Beloro 	mpo_wr_lock();
20169853d9e8SJason Beloro 	mblock_install(&new_config);
20179853d9e8SJason Beloro 	/* Use new config to add all ranges for mnode_update */
20189853d9e8SJason Beloro 	mnode_update(&new_config, base, end, U_ADD);
20199853d9e8SJason Beloro 	mpo_genid++;
20209853d9e8SJason Beloro 	mpo_wr_unlock();
20219853d9e8SJason Beloro 	mblock_free(&old_config);
20229853d9e8SJason Beloro }
20239853d9e8SJason Beloro 
20249853d9e8SJason Beloro void
plat_slice_del(pfn_t base,pfn_t end)20259853d9e8SJason Beloro plat_slice_del(pfn_t base, pfn_t end)
20269853d9e8SJason Beloro {
20279853d9e8SJason Beloro 	mpo_config_t old_config = mpo_config;
20289853d9e8SJason Beloro 	mpo_config_t new_config;
20299853d9e8SJason Beloro 
20309853d9e8SJason Beloro 	VALIDATE_SLICE(base, end);
20319853d9e8SJason Beloro 	mblock_update_del(&new_config, &old_config, base, end);
20329853d9e8SJason Beloro 	mstripe_update(&new_config);
20339853d9e8SJason Beloro 	mpo_wr_lock();
20349853d9e8SJason Beloro 	/* Use old config to find deleted range for mnode_update */
20359853d9e8SJason Beloro 	mnode_update(&old_config, base, end, U_DEL);
20369853d9e8SJason Beloro 	mblock_install(&new_config);
20379853d9e8SJason Beloro 	mpo_genid++;
20389853d9e8SJason Beloro 	mpo_wr_unlock();
20399853d9e8SJason Beloro 	mblock_free(&old_config);
20409853d9e8SJason Beloro }
2041