xref: /illumos-gate/usr/src/uts/sun4u/opl/os/opl.c (revision 6a634c9d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/cpuvar.h>
26 #include <sys/systm.h>
27 #include <sys/sysmacros.h>
28 #include <sys/promif.h>
29 #include <sys/platform_module.h>
30 #include <sys/cmn_err.h>
31 #include <sys/errno.h>
32 #include <sys/machsystm.h>
33 #include <sys/bootconf.h>
34 #include <sys/nvpair.h>
35 #include <sys/kobj.h>
36 #include <sys/mem_cage.h>
37 #include <sys/opl.h>
38 #include <sys/scfd/scfostoescf.h>
39 #include <sys/cpu_sgnblk_defs.h>
40 #include <sys/utsname.h>
41 #include <sys/ddi.h>
42 #include <sys/sunndi.h>
43 #include <sys/lgrp.h>
44 #include <sys/memnode.h>
45 #include <sys/sysmacros.h>
46 #include <sys/time.h>
47 #include <sys/cpu.h>
48 #include <sys/dumphdr.h>
49 #include <vm/vm_dep.h>
50 
/*
 * Hooks for translating memory faults to FRU names/serials.
 * NOTE(review): presumably registered by the OPL memory-controller
 * driver at attach time — the registration site is not visible here.
 * Each remains NULL (and the plat_get_mem_* wrappers return ENOTSUP)
 * until filled in.
 */
int (*opl_get_mem_unum)(int, uint64_t, char *, int, int *);
int (*opl_get_mem_sid)(char *unum, char *buf, int buflen, int *lenp);
int (*opl_get_mem_offset)(uint64_t paddr, uint64_t *offp);
int (*opl_get_mem_addr)(char *unum, char *sid,
    uint64_t offset, uint64_t *paddr);

/* Memory for fcode claims.  16k times # maximum possible IO units */
#define	EFCODE_SIZE	(OPL_MAX_BOARDS * OPL_MAX_IO_UNITS_PER_BOARD * 0x4000)
int efcode_size = EFCODE_SIZE;

#define	OPL_MC_MEMBOARD_SHIFT 38	/* Boards on 256GB boundary */

/* Set the maximum number of boards for DR */
int opl_boards = OPL_MAX_BOARDS;

void sgn_update_all_cpus(ushort_t, uchar_t, uchar_t);

extern int tsb_lgrp_affinity;

/* Worst-case number of spare TSBs needed for dynamic reconfiguration */
int opl_tsb_spares = (OPL_MAX_BOARDS) * (OPL_MAX_PCICH_UNITS_PER_BOARD) *
	(OPL_MAX_TSBS_PER_PCICH);

/* Tunable minimum kernel cage size (pages); see set_platform_cage_params() */
pgcnt_t opl_startup_cage_size = 0;

/*
 * The length of the delay in seconds in communication with XSCF after
 * which the warning message will be logged.
 */
uint_t	xscf_connect_delay = 60 * 15;

/*
 * Table of the OPL models this kernel knows about: name (as found in
 * the PROM "model" property), maximum board count, model id, and which
 * TS dispatch table the model uses.
 */
static opl_model_info_t opl_models[] = {
	{ "FF1", OPL_MAX_BOARDS_FF1, FF1, STD_DISPATCH_TABLE },
	{ "FF2", OPL_MAX_BOARDS_FF2, FF2, STD_DISPATCH_TABLE },
	{ "DC1", OPL_MAX_BOARDS_DC1, DC1, STD_DISPATCH_TABLE },
	{ "DC2", OPL_MAX_BOARDS_DC2, DC2, EXT_DISPATCH_TABLE },
	{ "DC3", OPL_MAX_BOARDS_DC3, DC3, EXT_DISPATCH_TABLE },
	{ "IKKAKU", OPL_MAX_BOARDS_IKKAKU, IKKAKU, STD_DISPATCH_TABLE },
};
static	int	opl_num_models = sizeof (opl_models)/sizeof (opl_model_info_t);

/*
 * opl_cur_model: entry in opl_models[] for the running system, or NULL
 * until set_model_info() has matched (or failed to match) the model.
 */
static	opl_model_info_t *opl_cur_model = NULL;

static struct memlist *opl_memlist_per_board(struct memlist *ml);
static void post_xscf_msg(char *, int);
static void pass2xscf_thread();

/*
 * Note FF/DC out-of-order instruction engine takes only a
 * single cycle to execute each spin loop
 * for comparison, Panther takes 6 cycles for same loop
 * OPL_BOFF_SPIN = base spin loop, roughly one memory reference time
 * OPL_BOFF_TM = approx nsec for OPL sleep instruction (1600 for OPL-C)
 * OPL_BOFF_SLEEP = approx number of SPIN iterations to equal one sleep
 * OPL_BOFF_MAX_SCALE - scaling factor for max backoff based on active cpus
 * Listed values tuned for 2.15GHz to 2.64GHz systems
 * Value may change for future systems
 */
#define	OPL_BOFF_SPIN 7
#define	OPL_BOFF_SLEEP 4
#define	OPL_BOFF_TM 1600
#define	OPL_BOFF_MAX_SCALE 8

/* Defaults applied in startup_platform() unless already tuned */
#define	OPL_CLOCK_TICK_THRESHOLD	128
#define	OPL_CLOCK_TICK_NCPUS		64

extern int	clock_tick_threshold;
extern int	clock_tick_ncpus;
121 
122 int
123 set_platform_max_ncpus(void)
124 {
125 	return (OPL_MAX_CPU_PER_BOARD * OPL_MAX_BOARDS);
126 }
127 
128 int
129 set_platform_tsb_spares(void)
130 {
131 	return (MIN(opl_tsb_spares, MAX_UPA));
132 }
133 
134 static void
135 set_model_info()
136 {
137 	extern int ts_dispatch_extended;
138 	char	name[MAXSYSNAME];
139 	int	i;
140 
141 	/*
142 	 * Get model name from the root node.
143 	 *
144 	 * We are using the prom device tree since, at this point,
145 	 * the Solaris device tree is not yet setup.
146 	 */
147 	(void) prom_getprop(prom_rootnode(), "model", (caddr_t)name);
148 
149 	for (i = 0; i < opl_num_models; i++) {
150 		if (strncmp(name, opl_models[i].model_name, MAXSYSNAME) == 0) {
151 			opl_cur_model = &opl_models[i];
152 			break;
153 		}
154 	}
155 
156 	/*
157 	 * If model not matched, it's an unknown model.
158 	 * Just return.  It will default to standard dispatch tables.
159 	 */
160 	if (i == opl_num_models)
161 		return;
162 
163 	if ((opl_cur_model->model_cmds & EXT_DISPATCH_TABLE) &&
164 	    (ts_dispatch_extended == -1)) {
165 		/*
166 		 * Based on a platform model, select a dispatch table.
167 		 * Only DC2 and DC3 systems uses the alternate/extended
168 		 * TS dispatch table.
169 		 * IKKAKU, FF1, FF2 and DC1 systems use standard dispatch
170 		 * tables.
171 		 */
172 		ts_dispatch_extended = 1;
173 	}
174 
175 }
176 
177 static void
178 set_max_mmu_ctxdoms()
179 {
180 	extern uint_t	max_mmu_ctxdoms;
181 	int		max_boards;
182 
183 	/*
184 	 * From the model, get the maximum number of boards
185 	 * supported and set the value accordingly. If the model
186 	 * could not be determined or recognized, we assume the max value.
187 	 */
188 	if (opl_cur_model == NULL)
189 		max_boards = OPL_MAX_BOARDS;
190 	else
191 		max_boards = opl_cur_model->model_max_boards;
192 
193 	/*
194 	 * On OPL, cores and MMUs are one-to-one.
195 	 */
196 	max_mmu_ctxdoms = OPL_MAX_CORE_UNITS_PER_BOARD * max_boards;
197 }
198 
199 #pragma weak mmu_init_large_pages
200 
201 void
202 set_platform_defaults(void)
203 {
204 	extern char *tod_module_name;
205 	extern void cpu_sgn_update(ushort_t, uchar_t, uchar_t, int);
206 	extern void mmu_init_large_pages(size_t);
207 
208 	/* Set the CPU signature function pointer */
209 	cpu_sgn_func = cpu_sgn_update;
210 
211 	/* Set appropriate tod module for OPL platform */
212 	ASSERT(tod_module_name == NULL);
213 	tod_module_name = "todopl";
214 
215 	if ((mmu_page_sizes == max_mmu_page_sizes) &&
216 	    (mmu_ism_pagesize != DEFAULT_ISM_PAGESIZE)) {
217 		if (&mmu_init_large_pages)
218 			mmu_init_large_pages(mmu_ism_pagesize);
219 	}
220 
221 	tsb_lgrp_affinity = 1;
222 
223 	set_max_mmu_ctxdoms();
224 
225 	/* set OPL threshold for compressed dumps */
226 	dump_plat_mincpu_default = DUMP_PLAT_SUN4U_OPL_MINCPU;
227 }
228 
/*
 * Convert a logical board number to a physical one.
 * Returns the physical board id, or -1 if it cannot be determined.
 */

#define	LSBPROP		"board#"
#define	PSBPROP		"physical-board#"

int
opl_get_physical_board(int id)
{
	dev_info_t	*root_dip, *dip = NULL;
	char		*dname = NULL;
	int		circ;

	pnode_t		pnode;
	char		pname[MAXSYSNAME] = {0};

	int		lsb_id;	/* Logical System Board ID */
	int		psb_id;	/* Physical System Board ID */


	/*
	 * This function is called on early stage of bootup when the
	 * kernel device tree is not initialized yet, and also
	 * later on when the device tree is up. We want to try
	 * the fast track first.
	 */
	root_dip = ddi_root_node();
	if (root_dip) {
		/* Get from devinfo node */
		/*
		 * Walk the root's children looking for the pseudo-mc
		 * node whose "board#" matches id; the tree is held via
		 * ndi_devi_enter() for the duration of the walk.
		 */
		ndi_devi_enter(root_dip, &circ);
		for (dip = ddi_get_child(root_dip); dip;
		    dip = ddi_get_next_sibling(dip)) {

			dname = ddi_node_name(dip);
			if (strncmp(dname, "pseudo-mc", 9) != 0)
				continue;

			/* Nodes without a board# property are skipped */
			if ((lsb_id = (int)ddi_getprop(DDI_DEV_T_ANY, dip,
			    DDI_PROP_DONTPASS, LSBPROP, -1)) == -1)
				continue;

			if (id == lsb_id) {
				/*
				 * Found the board; both paths must drop
				 * the ndi_devi_enter() hold.
				 */
				if ((psb_id = (int)ddi_getprop(DDI_DEV_T_ANY,
				    dip, DDI_PROP_DONTPASS, PSBPROP, -1))
				    == -1) {
					ndi_devi_exit(root_dip, circ);
					return (-1);
				} else {
					ndi_devi_exit(root_dip, circ);
					return (psb_id);
				}
			}
		}
		ndi_devi_exit(root_dip, circ);
	}

	/*
	 * We do not have the kernel device tree, or we did not
	 * find the node for some reason (let's say the kernel
	 * device tree was modified), let's try the OBP tree.
	 */
	pnode = prom_rootnode();
	for (pnode = prom_childnode(pnode); pnode;
	    pnode = prom_nextnode(pnode)) {

		/* Same search as above, against the PROM tree */
		if ((prom_getprop(pnode, "name", (caddr_t)pname) == -1) ||
		    (strncmp(pname, "pseudo-mc", 9) != 0))
			continue;

		if (prom_getprop(pnode, LSBPROP, (caddr_t)&lsb_id) == -1)
			continue;

		if (id == lsb_id) {
			if (prom_getprop(pnode, PSBPROP,
			    (caddr_t)&psb_id) == -1) {
				return (-1);
			} else {
				return (psb_id);
			}
		}
	}

	return (-1);
}
314 
315 /*
316  * For OPL it's possible that memory from two or more successive boards
317  * will be contiguous across the boards, and therefore represented as a
318  * single chunk.
319  * This function splits such chunks down the board boundaries.
320  */
321 static struct memlist *
322 opl_memlist_per_board(struct memlist *ml)
323 {
324 	uint64_t ssize, low, high, boundary;
325 	struct memlist *head, *tail, *new;
326 
327 	ssize = (1ull << OPL_MC_MEMBOARD_SHIFT);
328 
329 	head = tail = NULL;
330 
331 	for (; ml; ml = ml->ml_next) {
332 		low  = (uint64_t)ml->ml_address;
333 		high = low+(uint64_t)(ml->ml_size);
334 		while (low < high) {
335 			boundary = roundup(low+1, ssize);
336 			boundary = MIN(high, boundary);
337 			new = kmem_zalloc(sizeof (struct memlist), KM_SLEEP);
338 			new->ml_address = low;
339 			new->ml_size = boundary - low;
340 			if (head == NULL)
341 				head = new;
342 			if (tail) {
343 				tail->ml_next = new;
344 				new->ml_prev = tail;
345 			}
346 			tail = new;
347 			low = boundary;
348 		}
349 	}
350 	return (head);
351 }
352 
353 void
354 set_platform_cage_params(void)
355 {
356 	extern pgcnt_t total_pages;
357 	extern struct memlist *phys_avail;
358 	struct memlist *ml, *tml;
359 
360 	if (kernel_cage_enable) {
361 		pgcnt_t preferred_cage_size;
362 
363 		preferred_cage_size = MAX(opl_startup_cage_size,
364 		    total_pages / 256);
365 
366 		ml = opl_memlist_per_board(phys_avail);
367 
368 		/*
369 		 * Note: we are assuming that post has load the
370 		 * whole show in to the high end of memory. Having
371 		 * taken this leap, we copy the whole of phys_avail
372 		 * the glist and arrange for the cage to grow
373 		 * downward (descending pfns).
374 		 */
375 		kcage_range_init(ml, KCAGE_DOWN, preferred_cage_size);
376 
377 		/* free the memlist */
378 		do {
379 			tml = ml->ml_next;
380 			kmem_free(ml, sizeof (struct memlist));
381 			ml = tml;
382 		} while (ml != NULL);
383 	}
384 
385 	if (kcage_on)
386 		cmn_err(CE_NOTE, "!DR Kernel Cage is ENABLED");
387 	else
388 		cmn_err(CE_NOTE, "!DR Kernel Cage is DISABLED");
389 }
390 
391 /*ARGSUSED*/
392 int
393 plat_cpu_poweron(struct cpu *cp)
394 {
395 	int (*opl_cpu_poweron)(struct cpu *) = NULL;
396 
397 	opl_cpu_poweron =
398 	    (int (*)(struct cpu *))kobj_getsymvalue("drmach_cpu_poweron", 0);
399 
400 	if (opl_cpu_poweron == NULL)
401 		return (ENOTSUP);
402 	else
403 		return ((opl_cpu_poweron)(cp));
404 
405 }
406 
407 /*ARGSUSED*/
408 int
409 plat_cpu_poweroff(struct cpu *cp)
410 {
411 	int (*opl_cpu_poweroff)(struct cpu *) = NULL;
412 
413 	opl_cpu_poweroff =
414 	    (int (*)(struct cpu *))kobj_getsymvalue("drmach_cpu_poweroff", 0);
415 
416 	if (opl_cpu_poweroff == NULL)
417 		return (ENOTSUP);
418 	else
419 		return ((opl_cpu_poweroff)(cp));
420 
421 }
422 
423 int
424 plat_max_boards(void)
425 {
426 	/*
427 	 * If the model cannot be determined, default to the max value.
428 	 * Otherwise, Ikkaku model only supports 1 system board.
429 	 */
430 	if ((opl_cur_model != NULL) && (opl_cur_model->model_type == IKKAKU))
431 		return (OPL_MAX_BOARDS_IKKAKU);
432 	else
433 		return (OPL_MAX_BOARDS);
434 }
435 
/* Maximum CPUs (strands) per system board */
int
plat_max_cpu_units_per_board(void)
{
	return (OPL_MAX_CPU_PER_BOARD);
}

/* Maximum memory units per system board */
int
plat_max_mem_units_per_board(void)
{
	return (OPL_MAX_MEM_UNITS_PER_BOARD);
}

/* Maximum I/O units per system board */
int
plat_max_io_units_per_board(void)
{
	return (OPL_MAX_IO_UNITS_PER_BOARD);
}

/* Maximum CMP (chip multiprocessor) units per system board */
int
plat_max_cmp_units_per_board(void)
{
	return (OPL_MAX_CMP_UNITS_PER_BOARD);
}

/* Maximum cores per system board */
int
plat_max_core_units_per_board(void)
{
	return (OPL_MAX_CORE_UNITS_PER_BOARD);
}
465 
/*
 * Map a pfn to its memory node.  Memory nodes sit on 256GB slice
 * boundaries, so a simple shift suffices; mem_node_pfn_shift is set
 * up in plat_build_mem_nodes().
 */
int
plat_pfn_to_mem_node(pfn_t pfn)
{
	return (pfn >> mem_node_pfn_shift);
}
471 
/* ARGSUSED */
/*
 * Build the memory-node configuration from the boot memory list,
 * registering one slice per 256GB board-aligned chunk.  Also
 * establishes the pfn-to-memnode shift used by plat_pfn_to_mem_node().
 */
void
plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
{
	size_t	elem;
	pfn_t	basepfn;
	pgcnt_t	npgs;
	uint64_t	boundary, ssize;
	uint64_t	low, high;

	/*
	 * OPL mem slices are always aligned on a 256GB boundary.
	 */
	mem_node_pfn_shift = OPL_MC_MEMBOARD_SHIFT - MMU_PAGESHIFT;
	mem_node_physalign = 0;

	/*
	 * Boot install lists are arranged <addr, len>, <addr, len>, ...
	 */
	ssize = (1ull << OPL_MC_MEMBOARD_SHIFT);
	for (elem = 0; elem < nelems; list++, elem++) {
		low  = list->addr;
		high = low + list->size;
		/*
		 * Split each range at board boundaries, adding one
		 * slice per piece (same walk as opl_memlist_per_board).
		 */
		while (low < high) {
			boundary = roundup(low+1, ssize);
			boundary = MIN(high, boundary);
			basepfn = btop(low);
			npgs = btop(boundary - low);
			mem_node_add_slice(basepfn, basepfn + npgs - 1);
			low = boundary;
		}
	}
}
505 
/*
 * Find the CPU associated with a slice at boot-time.
 *
 * Reads the board number and memory range from the given mc PROM node
 * and binds that board's lgroup handle to the memory node covering the
 * range's base address.  Panics if either property is missing, since
 * the memory topology cannot be established without them.
 */
void
plat_fill_mc(pnode_t nodeid)
{
	int board;
	int memnode;
	struct {
		uint64_t	addr;
		uint64_t	size;
	} mem_range;

	if (prom_getprop(nodeid, "board#", (caddr_t)&board) < 0) {
		panic("Can not find board# property in mc node %x", nodeid);
	}
	if (prom_getprop(nodeid, "sb-mem-ranges", (caddr_t)&mem_range) < 0) {
		panic("Can not find sb-mem-ranges property in mc node %x",
		    nodeid);
	}
	/* The slice number is the 256GB-aligned index of the base address */
	memnode = mem_range.addr >> OPL_MC_MEMBOARD_SHIFT;
	plat_assign_lgrphand_to_mem_node(board, memnode);
}
529 
530 /*
531  * Return the platform handle for the lgroup containing the given CPU
532  *
533  * For OPL, lgroup platform handle == board #.
534  */
535 
536 extern int mpo_disabled;
537 extern lgrp_handle_t lgrp_default_handle;
538 
539 lgrp_handle_t
540 plat_lgrp_cpu_to_hand(processorid_t id)
541 {
542 	lgrp_handle_t plathand;
543 
544 	/*
545 	 * Return the real platform handle for the CPU until
546 	 * such time as we know that MPO should be disabled.
547 	 * At that point, we set the "mpo_disabled" flag to true,
548 	 * and from that point on, return the default handle.
549 	 *
550 	 * By the time we know that MPO should be disabled, the
551 	 * first CPU will have already been added to a leaf
552 	 * lgroup, but that's ok. The common lgroup code will
553 	 * double check that the boot CPU is in the correct place,
554 	 * and in the case where mpo should be disabled, will move
555 	 * it to the root if necessary.
556 	 */
557 	if (mpo_disabled) {
558 		/* If MPO is disabled, return the default (UMA) handle */
559 		plathand = lgrp_default_handle;
560 	} else
561 		plathand = (lgrp_handle_t)LSB_ID(id);
562 	return (plathand);
563 }
564 
/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	extern uint32_t lgrp_expand_proc_thresh;
	extern uint32_t lgrp_expand_proc_diff;
	const uint_t m = LGRP_LOADAVG_THREAD_MAX;

	/*
	 * Set tuneables for the OPL architecture
	 *
	 * lgrp_expand_proc_thresh is the threshold load on the set of
	 * lgroups a process is currently using on before considering
	 * adding another lgroup to the set.  For Oly-C and Jupiter
	 * systems, there are four sockets per lgroup. Setting
	 * lgrp_expand_proc_thresh to add lgroups when the load reaches
	 * four threads will spread the load when it exceeds one thread
	 * per socket, optimizing memory bandwidth and L2 cache space.
	 *
	 * lgrp_expand_proc_diff determines how much less another lgroup
	 * must be loaded before shifting the start location of a thread
	 * to it.
	 *
	 * lgrp_loadavg_tolerance is the threshold where two lgroups are
	 * considered to have different loads.  It is set to be less than
	 * 1% so that even a small residual load will be considered different
	 * from no residual load.
	 *
	 * We note loadavg values are not precise.
	 * Every 1/10 of a second loadavg values are reduced by 5%.
	 * This adjustment can come in the middle of the lgroup selection
	 * process, and for larger parallel apps with many threads can
	 * frequently occur between the start of the second thread
	 * placement and the finish of the last thread placement.
	 * We also must be careful to not use too small of a threshold
	 * since the cumulative decay for 1 second idle time is 40%.
	 * That is, the residual load from completed threads will still
	 * be 60% one second after the proc goes idle or 8% after 5 seconds.
	 *
	 * To allow for lag time in loadavg calculations
	 * remote thresh = 3.75 * LGRP_LOADAVG_THREAD_MAX
	 * local thresh  = 0.75 * LGRP_LOADAVG_THREAD_MAX
	 * tolerance	 = 0.0078 * LGRP_LOADAVG_THREAD_MAX
	 *
	 * The load placement algorithms consider LGRP_LOADAVG_THREAD_MAX
	 * as the equivalent of a load of 1. To make the code more compact,
	 * we set m = LGRP_LOADAVG_THREAD_MAX.
	 */
	/* 3.75m, 0.75m and m/128 respectively, per the table above */
	lgrp_expand_proc_thresh = (m * 3) + (m >> 1) + (m >> 2);
	lgrp_expand_proc_diff = (m >> 1) + (m >> 2);
	lgrp_loadavg_tolerance = (m >> 7);
}
619 
/*
 * Platform notification of lgroup (re)configuration changes
 */
/*ARGSUSED*/
void
plat_lgrp_config(lgrp_config_flag_t evt, uintptr_t arg)
{
	update_membounds_t *umb;
	lgrp_config_mem_rename_t lmr;
	int sbd, tbd;
	lgrp_handle_t hand, shand, thand;
	int mnode, snode, tnode;
	pfn_t start, end;

	/* Nothing to maintain once MPO has been turned off */
	if (mpo_disabled)
		return;

	switch (evt) {

	case LGRP_CONFIG_MEM_ADD:
		/*
		 * Establish the lgroup handle to memnode translation.
		 */
		umb = (update_membounds_t *)arg;

		hand = umb->u_board;
		mnode = plat_pfn_to_mem_node(umb->u_base >> MMU_PAGESHIFT);
		plat_assign_lgrphand_to_mem_node(hand, mnode);

		break;

	case LGRP_CONFIG_MEM_DEL:
		/*
		 * Special handling for possible memory holes.
		 */
		umb = (update_membounds_t *)arg;
		hand = umb->u_board;
		if ((mnode = plat_lgrphand_to_mem_node(hand)) != -1) {
			if (mem_node_config[mnode].exists) {
				start = mem_node_config[mnode].physbase;
				end = mem_node_config[mnode].physmax;
				mem_node_del_slice(start, end);
			}
		}

		break;

	case LGRP_CONFIG_MEM_RENAME:
		/*
		 * During a DR copy-rename operation, all of the memory
		 * on one board is moved to another board -- but the
		 * addresses/pfns and memnodes don't change. This means
		 * the memory has changed locations without changing identity.
		 *
		 * Source is where we are copying from and target is where we
		 * are copying to.  After source memnode is copied to target
		 * memnode, the physical addresses of the target memnode are
		 * renamed to match what the source memnode had.  Then target
		 * memnode can be removed and source memnode can take its
		 * place.
		 *
		 * To do this, swap the lgroup handle to memnode mappings for
		 * the boards, so target lgroup will have source memnode and
		 * source lgroup will have empty target memnode which is where
		 * its memory will go (if any is added to it later).
		 *
		 * Then source memnode needs to be removed from its lgroup
		 * and added to the target lgroup where the memory was living
		 * but under a different name/memnode.  The memory was in the
		 * target memnode and now lives in the source memnode with
		 * different physical addresses even though it is the same
		 * memory.
		 */
		/* arg encodes source board in bits 0-15, target in 16-31 */
		sbd = arg & 0xffff;
		tbd = (arg & 0xffff0000) >> 16;
		shand = sbd;
		thand = tbd;
		snode = plat_lgrphand_to_mem_node(shand);
		tnode = plat_lgrphand_to_mem_node(thand);

		/*
		 * Special handling for possible memory holes.
		 */
		if (tnode != -1 && mem_node_config[tnode].exists) {
			start = mem_node_config[tnode].physbase;
			end = mem_node_config[tnode].physmax;
			mem_node_del_slice(start, end);
		}

		/* Swap the handle-to-memnode mappings of the two boards */
		plat_assign_lgrphand_to_mem_node(thand, snode);
		plat_assign_lgrphand_to_mem_node(shand, tnode);

		lmr.lmem_rename_from = shand;
		lmr.lmem_rename_to = thand;

		/*
		 * Remove source memnode of copy rename from its lgroup
		 * and add it to its new target lgroup
		 */
		lgrp_config(LGRP_CONFIG_MEM_RENAME, (uintptr_t)snode,
		    (uintptr_t)&lmr);

		break;

	default:
		break;
	}
}
728 
729 /*
730  * Return latency between "from" and "to" lgroups
731  *
732  * This latency number can only be used for relative comparison
733  * between lgroups on the running system, cannot be used across platforms,
734  * and may not reflect the actual latency.  It is platform and implementation
735  * specific, so platform gets to decide its value.  It would be nice if the
736  * number was at least proportional to make comparisons more meaningful though.
737  * NOTE: The numbers below are supposed to be load latencies for uncached
738  * memory divided by 10.
739  *
740  */
741 int
742 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
743 {
744 	/*
745 	 * Return min remote latency when there are more than two lgroups
746 	 * (root and child) and getting latency between two different lgroups
747 	 * or root is involved
748 	 */
749 	if (lgrp_optimizations() && (from != to ||
750 	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE))
751 		return (42);
752 	else
753 		return (35);
754 }
755 
756 /*
757  * Return platform handle for root lgroup
758  */
759 lgrp_handle_t
760 plat_lgrp_root_hand(void)
761 {
762 	if (mpo_disabled)
763 		return (lgrp_default_handle);
764 
765 	return (LGRP_DEFAULT_HANDLE);
766 }
767 
/*ARGSUSED*/
void
plat_freelist_process(int mnode)
{
	/* No per-memnode freelist post-processing is needed on OPL */
}
773 
void
load_platform_drivers(void)
{
	/* Attach the DR pseudo node so dynamic reconfiguration works */
	(void) i_ddi_attach_pseudo_node("dr");
}
779 
/*
 * No platform drivers on this platform
 */
char *platform_module_list[] = {
	(char *)0		/* list is NULL-terminated and empty */
};
786 
/*ARGSUSED*/
void
plat_tod_fault(enum tod_fault_type tod_bad)
{
	/* Time-of-day faults are not acted upon on OPL */
}
792 
793 /*ARGSUSED*/
794 void
795 cpu_sgn_update(ushort_t sgn, uchar_t state, uchar_t sub_state, int cpuid)
796 {
797 	static void (*scf_panic_callback)(int);
798 	static void (*scf_shutdown_callback)(int);
799 
800 	/*
801 	 * This is for notifing system panic/shutdown to SCF.
802 	 * In case of shutdown and panic, SCF call back
803 	 * function should be called.
804 	 *  <SCF call back functions>
805 	 *   scf_panic_callb()   : panicsys()->panic_quiesce_hw()
806 	 *   scf_shutdown_callb(): halt() or power_down() or reboot_machine()
807 	 * cpuid should be -1 and state should be SIGST_EXIT.
808 	 */
809 	if (state == SIGST_EXIT && cpuid == -1) {
810 
811 		/*
812 		 * find the symbol for the SCF panic callback routine in driver
813 		 */
814 		if (scf_panic_callback == NULL)
815 			scf_panic_callback = (void (*)(int))
816 			    modgetsymvalue("scf_panic_callb", 0);
817 		if (scf_shutdown_callback == NULL)
818 			scf_shutdown_callback = (void (*)(int))
819 			    modgetsymvalue("scf_shutdown_callb", 0);
820 
821 		switch (sub_state) {
822 		case SIGSUBST_PANIC:
823 			if (scf_panic_callback == NULL) {
824 				cmn_err(CE_NOTE, "!cpu_sgn_update: "
825 				    "scf_panic_callb not found\n");
826 				return;
827 			}
828 			scf_panic_callback(SIGSUBST_PANIC);
829 			break;
830 
831 		case SIGSUBST_HALT:
832 			if (scf_shutdown_callback == NULL) {
833 				cmn_err(CE_NOTE, "!cpu_sgn_update: "
834 				    "scf_shutdown_callb not found\n");
835 				return;
836 			}
837 			scf_shutdown_callback(SIGSUBST_HALT);
838 			break;
839 
840 		case SIGSUBST_ENVIRON:
841 			if (scf_shutdown_callback == NULL) {
842 				cmn_err(CE_NOTE, "!cpu_sgn_update: "
843 				    "scf_shutdown_callb not found\n");
844 				return;
845 			}
846 			scf_shutdown_callback(SIGSUBST_ENVIRON);
847 			break;
848 
849 		case SIGSUBST_REBOOT:
850 			if (scf_shutdown_callback == NULL) {
851 				cmn_err(CE_NOTE, "!cpu_sgn_update: "
852 				    "scf_shutdown_callb not found\n");
853 				return;
854 			}
855 			scf_shutdown_callback(SIGSUBST_REBOOT);
856 			break;
857 		}
858 	}
859 }
860 
861 /*ARGSUSED*/
862 int
863 plat_get_mem_unum(int synd_code, uint64_t flt_addr, int flt_bus_id,
864 	int flt_in_memory, ushort_t flt_status,
865 	char *buf, int buflen, int *lenp)
866 {
867 	/*
868 	 * check if it's a Memory error.
869 	 */
870 	if (flt_in_memory) {
871 		if (opl_get_mem_unum != NULL) {
872 			return (opl_get_mem_unum(synd_code, flt_addr, buf,
873 			    buflen, lenp));
874 		} else {
875 			return (ENOTSUP);
876 		}
877 	} else {
878 		return (ENOTSUP);
879 	}
880 }
881 
/*ARGSUSED*/
/*
 * Build the unum (FRU name) for a CPU into buf.  The naming scheme is
 * model-specific: FF models use motherboard CPU modules, DC models use
 * per-board CMUs, and IKKAKU names only the motherboard.  Returns 0 on
 * success, ENXIO if the physical board is unknown, ENODEV if the model
 * cannot be determined, or ENOSPC if buf is too small.
 */
int
plat_get_cpu_unum(int cpuid, char *buf, int buflen, int *lenp)
{
	int	ret = 0;
	int	sb;
	int	plen;

	sb = opl_get_physical_board(LSB_ID(cpuid));
	if (sb == -1) {
		return (ENXIO);
	}

	/*
	 * opl_cur_model is assigned here
	 */
	if (opl_cur_model == NULL) {
		set_model_info();

		/*
		 * if not matched, return
		 */
		if (opl_cur_model == NULL)
			return (ENODEV);
	}

	/* The table is ordered so that each entry's index == its model id */
	ASSERT((opl_cur_model - opl_models) == (opl_cur_model->model_type));

	switch (opl_cur_model->model_type) {
	case FF1:
		plen = snprintf(buf, buflen, "/%s/CPUM%d", "MBU_A",
		    CHIP_ID(cpuid) / 2);
		break;

	case FF2:
		plen = snprintf(buf, buflen, "/%s/CPUM%d", "MBU_B",
		    (CHIP_ID(cpuid) / 2) + (sb * 2));
		break;

	case DC1:
	case DC2:
	case DC3:
		plen = snprintf(buf, buflen, "/%s%02d/CPUM%d", "CMU", sb,
		    CHIP_ID(cpuid));
		break;

	case IKKAKU:
		plen = snprintf(buf, buflen, "/%s", "MBU_A");
		break;

	default:
		/* This should never happen */
		return (ENODEV);
	}

	if (plen >= buflen) {
		ret = ENOSPC;
	} else {
		if (lenp)
			*lenp = strlen(buf);
	}
	return (ret);
}
945 
void
plat_nodename_set(void)
{
	/*
	 * Push the new utsname (node name) to the XSCF service
	 * processor; delivery is asynchronous via post_xscf_msg().
	 */
	post_xscf_msg((char *)&utsname, sizeof (struct utsname));
}
951 
952 caddr_t	efcode_vaddr = NULL;
953 
954 /*
955  * Preallocate enough memory for fcode claims.
956  */
957 
958 caddr_t
959 efcode_alloc(caddr_t alloc_base)
960 {
961 	caddr_t efcode_alloc_base = (caddr_t)roundup((uintptr_t)alloc_base,
962 	    MMU_PAGESIZE);
963 	caddr_t vaddr;
964 
965 	/*
966 	 * allocate the physical memory for the Oberon fcode.
967 	 */
968 	if ((vaddr = (caddr_t)BOP_ALLOC(bootops, efcode_alloc_base,
969 	    efcode_size, MMU_PAGESIZE)) == NULL)
970 		cmn_err(CE_PANIC, "Cannot allocate Efcode Memory");
971 
972 	efcode_vaddr = vaddr;
973 
974 	return (efcode_alloc_base + efcode_size);
975 }
976 
977 caddr_t
978 plat_startup_memlist(caddr_t alloc_base)
979 {
980 	caddr_t tmp_alloc_base;
981 
982 	tmp_alloc_base = efcode_alloc(alloc_base);
983 	tmp_alloc_base =
984 	    (caddr_t)roundup((uintptr_t)tmp_alloc_base, ecache_alignsize);
985 	return (tmp_alloc_base);
986 }
987 
988 /* need to forward declare these */
989 static void plat_lock_delay(uint_t);
990 
991 void
992 startup_platform(void)
993 {
994 	if (clock_tick_threshold == 0)
995 		clock_tick_threshold = OPL_CLOCK_TICK_THRESHOLD;
996 	if (clock_tick_ncpus == 0)
997 		clock_tick_ncpus = OPL_CLOCK_TICK_NCPUS;
998 	mutex_lock_delay = plat_lock_delay;
999 	mutex_cap_factor = OPL_BOFF_MAX_SCALE;
1000 }
1001 
1002 static uint_t
1003 get_mmu_id(processorid_t cpuid)
1004 {
1005 	int pb = opl_get_physical_board(LSB_ID(cpuid));
1006 
1007 	if (pb == -1) {
1008 		cmn_err(CE_PANIC,
1009 		    "opl_get_physical_board failed (cpu %d LSB %u)",
1010 		    cpuid, LSB_ID(cpuid));
1011 	}
1012 	return (pb * OPL_MAX_COREID_PER_BOARD) + (CHIP_ID(cpuid) *
1013 	    OPL_MAX_COREID_PER_CMP) + CORE_ID(cpuid);
1014 }
1015 
1016 void
1017 plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *info)
1018 {
1019 	int	impl;
1020 
1021 	impl = cpunodes[cpuid].implementation;
1022 	if (IS_OLYMPUS_C(impl) || IS_JUPITER(impl)) {
1023 		info->mmu_idx = get_mmu_id(cpuid);
1024 		info->mmu_nctxs = 8192;
1025 	} else {
1026 		cmn_err(CE_PANIC, "Unknown processor %d", impl);
1027 	}
1028 }
1029 
1030 int
1031 plat_get_mem_sid(char *unum, char *buf, int buflen, int *lenp)
1032 {
1033 	if (opl_get_mem_sid == NULL) {
1034 		return (ENOTSUP);
1035 	}
1036 	return (opl_get_mem_sid(unum, buf, buflen, lenp));
1037 }
1038 
1039 int
1040 plat_get_mem_offset(uint64_t paddr, uint64_t *offp)
1041 {
1042 	if (opl_get_mem_offset == NULL) {
1043 		return (ENOTSUP);
1044 	}
1045 	return (opl_get_mem_offset(paddr, offp));
1046 }
1047 
1048 int
1049 plat_get_mem_addr(char *unum, char *sid, uint64_t offset, uint64_t *addrp)
1050 {
1051 	if (opl_get_mem_addr == NULL) {
1052 		return (ENOTSUP);
1053 	}
1054 	return (opl_get_mem_addr(unum, sid, offset, addrp));
1055 }
1056 
/*
 * Adaptive mutex backoff delay, installed as mutex_lock_delay in
 * startup_platform().  "backoff" is in units of OPL_BOFF_SPIN spin
 * iterations; the routine mixes sleep instructions (cpu_smt_pause)
 * and spin loops (mutex_delay_default) to approximate that delay.
 */
void
plat_lock_delay(uint_t backoff)
{
	int i;
	uint_t cnt, remcnt;
	int ctr;
	hrtime_t delay_start, rem_delay;
	/*
	 * Platform specific lock delay code for OPL
	 *
	 * Using staged linear increases in the delay.
	 * The sleep instruction is the preferred method of delay,
	 * but is too large of granularity for the initial backoff.
	 */

	if (backoff < 100) {
		/*
		 * If desired backoff is long enough,
		 * use sleep for most of it
		 */
		/* One sleep per OPL_BOFF_SLEEP units of backoff */
		for (cnt = backoff;
		    cnt >= OPL_BOFF_SLEEP;
		    cnt -= OPL_BOFF_SLEEP) {
			cpu_smt_pause();
		}
		/*
		 * spin for small remainder of backoff
		 */
		/* cnt < OPL_BOFF_SLEEP units remain here */
		for (ctr = cnt * OPL_BOFF_SPIN; ctr; ctr--) {
			mutex_delay_default();
		}
	} else {
		/* backoff is large.  Fill it by sleeping */
		delay_start = gethrtime_waitfree();
		cnt = backoff / OPL_BOFF_SLEEP;
		/*
		 * use sleep instructions for delay
		 */
		for (i = 0; i < cnt; i++) {
			cpu_smt_pause();
		}

		/*
		 * Note: if the other strand executes a sleep instruction,
		 * then the sleep ends immediately with a minimum time of
		 * 42 clocks.  We check gethrtime to insure we have
		 * waited long enough.  And we include both a short
		 * spin loop and a sleep for repeated delay times.
		 */

		/* Top up until at least cnt * OPL_BOFF_TM nsec have passed */
		rem_delay = gethrtime_waitfree() - delay_start;
		while (rem_delay < cnt * OPL_BOFF_TM) {
			remcnt = cnt - (rem_delay / OPL_BOFF_TM);
			for (i = 0; i < remcnt; i++) {
				cpu_smt_pause();
				for (ctr = OPL_BOFF_SPIN; ctr; ctr--) {
					mutex_delay_default();
				}
			}
			rem_delay = gethrtime_waitfree() - delay_start;
		}
	}
}
1120 
1121 /*
1122  * The following code implements asynchronous call to XSCF to setup the
1123  * domain node name.
1124  */
1125 
/* Free a message together with its trailing payload (see NM_LEN()). */
#define	FREE_MSG(m)		kmem_free((m), NM_LEN((m)->len))

/*
 * The following three macros define all the operations on the request
 * list we are using here, and hide the details of the list
 * implementation from the code.
 *
 * They are multi-statement macros, so each body is wrapped in the
 * canonical do { ... } while (0) so that an invocation followed by a
 * semicolon forms exactly one statement and is safe inside unbraced
 * if/else bodies.
 */

/* Prepend message (m) at the head of the request list. */
#define	PUSH(m) \
	do { \
		(m)->next = ctl_msg.head; \
		(m)->prev = NULL; \
		if ((m)->next != NULL) \
			(m)->next->prev = (m); \
		ctl_msg.head = (m); \
	} while (0)

/* Unlink message (m) from wherever it sits in the request list. */
#define	REMOVE(m) \
	do { \
		if ((m)->prev != NULL) \
			(m)->prev->next = (m)->next; \
		else \
			ctl_msg.head = (m)->next; \
		if ((m)->next != NULL) \
			(m)->next->prev = (m)->prev; \
	} while (0)

/* Free every message after (head), leaving (head) as the only entry. */
#define	FREE_THE_TAIL(head) \
	do { \
		nm_msg_t *n_msg, *m; \
		m = (head)->next; \
		(head)->next = NULL; \
		while (m != NULL) { \
			n_msg = m->next; \
			FREE_MSG(m); \
			m = n_msg; \
		} \
	} while (0)

/* Hand a payload to the SCF driver's putinfo entry point. */
#define	SCF_PUTINFO(f, s, p) \
	f(KEY_ESCF, 0x01, 0, s, p)

/* Evaluates to nonzero on success; the putinfo status lands in r. */
#define	PASS2XSCF(m, r)	((r = SCF_PUTINFO(ctl_msg.scf_service_function, \
					    (m)->len, (m)->data)) == 0)
1169 
1170 /*
1171  * The value of the following macro loosely depends on the
1172  * value of the "device busy" timeout used in the SCF driver.
1173  * (See pass2xscf_thread()).
1174  */
1175 #define	SCF_DEVBUSY_DELAY	10
1176 
1177 /*
1178  * The default number of attempts to contact the scf driver
1179  * if we cannot fetch any information about the timeout value
1180  * it uses.
1181  */
1182 
1183 #define	REPEATS		4
1184 
1185 typedef struct nm_msg {
1186 	struct nm_msg *next;
1187 	struct nm_msg *prev;
1188 	int len;
1189 	char data[1];
1190 } nm_msg_t;
1191 
1192 #define	NM_LEN(len)		(sizeof (nm_msg_t) + (len) - 1)
1193 
1194 static struct ctlmsg {
1195 	nm_msg_t	*head;
1196 	nm_msg_t	*now_serving;
1197 	kmutex_t	nm_lock;
1198 	kthread_t	*nmt;
1199 	int		cnt;
1200 	int (*scf_service_function)(uint32_t, uint8_t,
1201 				    uint32_t, uint32_t, void *);
1202 } ctl_msg;
1203 
1204 static void
1205 post_xscf_msg(char *dp, int len)
1206 {
1207 	nm_msg_t *msg;
1208 
1209 	msg = (nm_msg_t *)kmem_zalloc(NM_LEN(len), KM_SLEEP);
1210 
1211 	bcopy(dp, msg->data, len);
1212 	msg->len = len;
1213 
1214 	mutex_enter(&ctl_msg.nm_lock);
1215 	if (ctl_msg.nmt == NULL) {
1216 		ctl_msg.nmt =  thread_create(NULL, 0, pass2xscf_thread,
1217 		    NULL, 0, &p0, TS_RUN, minclsyspri);
1218 	}
1219 
1220 	PUSH(msg);
1221 	ctl_msg.cnt++;
1222 	mutex_exit(&ctl_msg.nm_lock);
1223 }
1224 
1225 static void
1226 pass2xscf_thread()
1227 {
1228 	nm_msg_t *msg;
1229 	int ret;
1230 	uint_t i, msg_sent, xscf_driver_delay;
1231 	static uint_t repeat_cnt;
1232 	uint_t *scf_wait_cnt;
1233 
1234 	mutex_enter(&ctl_msg.nm_lock);
1235 
1236 	/*
1237 	 * Find the address of the SCF put routine if it's not done yet.
1238 	 */
1239 	if (ctl_msg.scf_service_function == NULL) {
1240 		if ((ctl_msg.scf_service_function =
1241 		    (int (*)(uint32_t, uint8_t, uint32_t, uint32_t, void *))
1242 		    modgetsymvalue("scf_service_putinfo", 0)) == NULL) {
1243 			cmn_err(CE_NOTE, "pass2xscf_thread: "
1244 			    "scf_service_putinfo not found\n");
1245 			ctl_msg.nmt = NULL;
1246 			mutex_exit(&ctl_msg.nm_lock);
1247 			return;
1248 		}
1249 	}
1250 
1251 	/*
1252 	 * Calculate the number of attempts to connect XSCF based on the
1253 	 * scf driver delay (which is
1254 	 * SCF_DEVBUSY_DELAY*scf_online_wait_rcnt seconds) and the value
1255 	 * of xscf_connect_delay (the total number of seconds to wait
1256 	 * till xscf get ready.)
1257 	 */
1258 	if (repeat_cnt == 0) {
1259 		if ((scf_wait_cnt =
1260 		    (uint_t *)
1261 		    modgetsymvalue("scf_online_wait_rcnt", 0)) == NULL) {
1262 			repeat_cnt = REPEATS;
1263 		} else {
1264 
1265 			xscf_driver_delay = *scf_wait_cnt *
1266 			    SCF_DEVBUSY_DELAY;
1267 			repeat_cnt = (xscf_connect_delay/xscf_driver_delay) + 1;
1268 		}
1269 	}
1270 
1271 	while (ctl_msg.cnt != 0) {
1272 
1273 		/*
1274 		 * Take the very last request from the queue,
1275 		 */
1276 		ctl_msg.now_serving = ctl_msg.head;
1277 		ASSERT(ctl_msg.now_serving != NULL);
1278 
1279 		/*
1280 		 * and discard all the others if any.
1281 		 */
1282 		FREE_THE_TAIL(ctl_msg.now_serving);
1283 		ctl_msg.cnt = 1;
1284 		mutex_exit(&ctl_msg.nm_lock);
1285 
1286 		/*
1287 		 * Pass the name to XSCF. Note please, we do not hold the
1288 		 * mutex while we are doing this.
1289 		 */
1290 		msg_sent = 0;
1291 		for (i = 0; i < repeat_cnt; i++) {
1292 			if (PASS2XSCF(ctl_msg.now_serving, ret)) {
1293 				msg_sent = 1;
1294 				break;
1295 			} else {
1296 				if (ret != EBUSY) {
1297 					cmn_err(CE_NOTE, "pass2xscf_thread:"
1298 					    " unexpected return code"
1299 					    " from scf_service_putinfo():"
1300 					    " %d\n", ret);
1301 				}
1302 			}
1303 		}
1304 
1305 		if (msg_sent) {
1306 
1307 			/*
1308 			 * Remove the request from the list
1309 			 */
1310 			mutex_enter(&ctl_msg.nm_lock);
1311 			msg = ctl_msg.now_serving;
1312 			ctl_msg.now_serving = NULL;
1313 			REMOVE(msg);
1314 			ctl_msg.cnt--;
1315 			mutex_exit(&ctl_msg.nm_lock);
1316 			FREE_MSG(msg);
1317 		} else {
1318 
1319 			/*
1320 			 * If while we have tried to communicate with
1321 			 * XSCF there were any other requests we are
1322 			 * going to drop this one and take the latest
1323 			 * one.  Otherwise we will try to pass this one
1324 			 * again.
1325 			 */
1326 			cmn_err(CE_NOTE,
1327 			    "pass2xscf_thread: "
1328 			    "scf_service_putinfo "
1329 			    "not responding\n");
1330 		}
1331 		mutex_enter(&ctl_msg.nm_lock);
1332 	}
1333 
1334 	/*
1335 	 * The request queue is empty, exit.
1336 	 */
1337 	ctl_msg.nmt = NULL;
1338 	mutex_exit(&ctl_msg.nm_lock);
1339 }
1340