xref: /illumos-gate/usr/src/uts/common/io/pciex/pcie_pwr.c (revision b3d69c05)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2019 Joyent, Inc.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/ddi.h>
29 #include <sys/kmem.h>
30 #include <sys/sysmacros.h>
31 #include <sys/sunddi.h>
32 #include <sys/sunpm.h>
33 #include <sys/epm.h>
34 #include <sys/sunndi.h>
35 #include <sys/ddi_impldefs.h>
36 #include <sys/ddi_implfuncs.h>
37 #include <sys/pcie.h>
38 #include <sys/pcie_impl.h>
39 #include <sys/promif.h>		/* prom_printf */
40 #include <sys/pcie_pwr.h>
41 
42 /*
43  * This file implements the power management functionality for
44  * pci express switch and pci express-to-pci/pci-x bridge. All the
45  * code in this file is generic and is not specific to a particular chip.
46  * The algorithm, which decides when to go to a lower power is explained
47  * below:
48  *
 *	1. Initially when no children are attached, the driver is idle from
 *	PM framework point of view (PM idle/PM busy).
51  *
52  *	2. Driver is PM busy if either a reference count called pwr_hold is
53  *	greater than zero or driver is already at the lowest possible power
54  *	level. The lowest possible power level for the driver is equal to the
55  *	highest power level among its children. The PM busy condition is
56  *	indicated by PCIE_PM_BUSY bit. At any point, only one pm_busy_component
57  *	call is made for a nexus driver instance.
58  *
59  *	3. Driver is PM idle if the pwr_hold is zero and the lowest
60  *	possible power level is less than the driver's current power level.
61  *	At any point, only one pm_idle_component call is made for a nexus
62  *	driver instance.
63  *
 *	4. For any events like child attach, it increments pwr_hold and marks
 *	itself busy, if it is not already done so. This temporary hold is
 *	removed when the event is complete.
67  *
68  *	5. Any child's power change requires the parent (this driver) to be
69  *	full power. So it raises its power and increments pwr_hold. It also
70  *	marks itself temporarily busy, if it is not already done. This hold
71  *	is removed when the child power change is complete.
72  *
73  *	6. After each child power change, it evaluates what is the lowest
74  *	possible power level. If the lowest possible power level is less than
75  *	the current power level and pwr_hold is zero, then it marks itself
76  *	idle. The lowest power level is equal or greater than the highest level
77  *	among the children. It keeps track of children's power level by
78  *	using counters.
79  *
80  *	7. Any code e.g., which is accessing the driver's own registers should
81  *	place a temporary hold using pcie_pm_hold.
82  */
83 
/*
 * Forward declarations for the helpers that are private to this file.
 */
static int pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new);
static void pwr_update_counters(int *countersp, int olevel, int nlevel);
static int pwr_level_allowed(pcie_pwr_t *pwr_p);
static void pcie_add_comps(dev_info_t *dip, dev_info_t *cdip,
    pcie_pwr_t *pwr_p);
static void pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip,
    pcie_pwr_t *pwr_p);
static void pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p);
static boolean_t pcie_is_pcie(dev_info_t *dip);
#ifdef DEBUG
static char *pcie_decode_pwr_op(pm_bus_power_op_t op);
#else
/*
 * On non-DEBUG builds the name expands to nothing, so an expression such as
 * pcie_decode_pwr_op(op) is left as the parenthesized argument (op).
 */
#define	pcie_decode_pwr_op
#endif
98 
99 /*
100  * power entry point.
101  *
102  * This function decides whether the PM request is honorable.
103  * If yes, it then does what's necessary for switch or
104  *    bridge to change its power.
105  */
106 /* ARGSUSED */
107 int
108 pcie_power(dev_info_t *dip, int component, int level)
109 {
110 	pcie_pwr_t *pwr_p = PCIE_NEXUS_PMINFO(dip);
111 	int *counters = pwr_p->pwr_counters;
112 	int pmcaps = pwr_p->pwr_pmcaps;
113 	int ret = DDI_FAILURE;
114 
115 #if defined(__i386) || defined(__amd64)
116 	if (dip)
117 		return (DDI_SUCCESS);
118 #endif /* defined(__i386) || defined(__amd64) */
119 
120 	ASSERT(level != PM_LEVEL_UNKNOWN);
121 	/* PM should not asking for a level, which is unsupported */
122 	ASSERT(level == PM_LEVEL_D0 || level == PM_LEVEL_D3 ||
123 	    (level == PM_LEVEL_D1 && (pmcaps & PCIE_SUPPORTS_D1)) ||
124 	    (level == PM_LEVEL_D2 && (pmcaps & PCIE_SUPPORTS_D2)));
125 
126 	mutex_enter(&pwr_p->pwr_lock);
127 	PCIE_DBG("%s(%d): pcie_power: change from %d to %d\n",
128 	    ddi_driver_name(dip), ddi_get_instance(dip), pwr_p->pwr_func_lvl,
129 	    level);
130 	if (pwr_p->pwr_func_lvl == level) {
131 		PCIE_DBG("%s(%d): pcie_power: already at %d\n",
132 		    ddi_driver_name(dip), ddi_get_instance(dip), level);
133 		ret = DDI_SUCCESS;
134 		goto pcie_pwr_done;
135 	}
136 
137 	if (level < pwr_p->pwr_func_lvl) {
138 		/*
139 		 * Going to lower power. Reject this if we are either busy
140 		 * or there is a hold.
141 		 */
142 		if (pwr_p->pwr_flags & PCIE_PM_BUSY) {
143 			PCIE_DBG("%s(%d): pcie_power: rejecting change to %d "
144 			    "as busy\n", ddi_driver_name(dip),
145 			    ddi_get_instance(dip), level);
146 			goto pcie_pwr_done;
147 		}
148 
149 		/*
150 		 * Now we know that we are neither busy nor there is a hold.
151 		 * At this point none of the children should be at full power.
152 		 * Reject the request if level reqested is lower than the level
153 		 * possible.
154 		 */
155 		ASSERT(!counters[PCIE_D0_INDEX] &&
156 		    !counters[PCIE_UNKNOWN_INDEX]);
157 		if (level < pwr_level_allowed(pwr_p)) {
158 			PCIE_DBG("%s(%d): pcie_power: rejecting level %d as"
159 			    " %d is the lowest possible\n",
160 			    ddi_driver_name(dip), ddi_get_instance(dip), level,
161 			    pwr_level_allowed(pwr_p));
162 			goto pcie_pwr_done;
163 		}
164 	}
165 
166 	if (pcie_pwr_change(dip, pwr_p, level) != DDI_SUCCESS) {
167 		PCIE_DBG("%s(%d): pcie_power: attempt to change to %d "
168 		    " failed \n", ddi_driver_name(dip), ddi_get_instance(dip),
169 		    level);
170 		goto pcie_pwr_done;
171 	}
172 	pwr_p->pwr_func_lvl = level;
173 	PCIE_DBG("%s(%d): pcie_power: level changed to %d \n",
174 	    ddi_driver_name(dip), ddi_get_instance(dip), level);
175 	ret = DDI_SUCCESS;
176 
177 pcie_pwr_done:
178 	mutex_exit(&pwr_p->pwr_lock);
179 	return (ret);
180 }
181 
182 /*
183  * Called by pcie_power() only. Caller holds the pwr_lock.
184  *
185  * dip - dev_info pointer
186  * pwr_p - pm info for the node.
187  * new     - new level
188  */
189 static int
190 pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new)
191 {
192 	uint16_t pmcsr;
193 
194 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
195 	ASSERT(new != pwr_p->pwr_func_lvl);
196 	pmcsr = pci_config_get16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset);
197 	pmcsr &= ~PCI_PMCSR_STATE_MASK;
198 	switch (new) {
199 	case PM_LEVEL_D0:
200 		pmcsr |= PCI_PMCSR_D0;
201 		break;
202 
203 	case PM_LEVEL_D1:
204 		pmcsr |= PCI_PMCSR_D1;
205 		break;
206 
207 	case PM_LEVEL_D2:
208 		pmcsr |= PCI_PMCSR_D2;
209 		break;
210 
211 	case PM_LEVEL_D3:
212 		pmcsr |= PCI_PMCSR_D3HOT;
213 		break;
214 
215 	default:
216 		ASSERT(0);
217 		break;
218 	}
219 	/* Save config space, if going to D3 */
220 	if (new == PM_LEVEL_D3) {
221 		PCIE_DBG("%s(%d): pwr_change: saving config space regs\n",
222 		    ddi_driver_name(dip), ddi_get_instance(dip));
223 		if (pci_save_config_regs(dip) != DDI_SUCCESS) {
224 			PCIE_DBG("%s(%d): pcie_pwr_change: failed to save "
225 			    "config space regs\n", ddi_driver_name(dip),
226 			    ddi_get_instance(dip));
227 			return (DDI_FAILURE);
228 		}
229 	}
230 
231 	pci_config_put16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset, pmcsr);
232 
233 	/*
234 	 * TBD: Taken from pci_pci driver. Is this required?
235 	 * No bus transactions should occur without waiting for
236 	 * settle time specified in PCI PM spec rev 2.1 sec 5.6.1
237 	 * To make things simple, just use the max time specified for
238 	 * all state transitions.
239 	 */
240 	delay(drv_usectohz(PCI_CLK_SETTLE_TIME));
241 
242 	/*
243 	 * Restore config space if coming out of D3
244 	 */
245 	if (pwr_p->pwr_func_lvl == PM_LEVEL_D3) {
246 		PCIE_DBG("%s(%d): pcie_pwr_change: restoring config space\n",
247 		    ddi_driver_name(dip), ddi_get_instance(dip));
248 		if (pci_restore_config_regs(dip) != DDI_SUCCESS) {
249 			PCIE_DBG("%s(%d): pcie_pwr_change: failed to restore "
250 			    "config space regs\n", ddi_driver_name(dip),
251 			    ddi_get_instance(dip));
252 			return (DDI_FAILURE);
253 		}
254 	}
255 	return (DDI_SUCCESS);
256 }
257 
258 /*
259  * bus_ctlops.bus_power function.
260  *
261  * This function handles PRE_ POST_ change notifications, sent by
262  * PM framework related to child's power level change. It marks itself
263  * idle or busy based on the children's power level.
264  */
265 int
266 pcie_bus_power(dev_info_t *dip, void *impl_arg, pm_bus_power_op_t op,
267     void *arg, void *result)
268 {
269 	pcie_pwr_t *pwr_p = PCIE_NEXUS_PMINFO(dip);
270 	int *counters = pwr_p->pwr_counters; /* nexus counters */
271 	int *child_counters; /* per child dip counters */
272 	pm_bp_child_pwrchg_t *bpc;
273 	pm_bp_has_changed_t *bphc;
274 	dev_info_t *cdip;
275 	int new_level;
276 	int old_level;
277 	int rv = DDI_SUCCESS;
278 	int level_allowed, comp;
279 
280 #if defined(__i386) || defined(__amd64)
281 	if (dip)
282 		return (DDI_SUCCESS);
283 #endif /* defined(__i386) || defined(__amd64) */
284 
285 	switch (op) {
286 	case BUS_POWER_PRE_NOTIFICATION:
287 	case BUS_POWER_POST_NOTIFICATION:
288 		bpc = (pm_bp_child_pwrchg_t *)arg;
289 		cdip = bpc->bpc_dip;
290 		new_level = bpc->bpc_nlevel;
291 		old_level = bpc->bpc_olevel;
292 		comp = bpc->bpc_comp;
293 		break;
294 
295 	case BUS_POWER_HAS_CHANGED:
296 		bphc = (pm_bp_has_changed_t *)arg;
297 		cdip = bphc->bphc_dip;
298 		new_level = bphc->bphc_nlevel;
299 		old_level = bphc->bphc_olevel;
300 		comp = bphc->bphc_comp;
301 		break;
302 
303 	default:
304 		break;
305 
306 	}
307 
308 	ASSERT(pwr_p);
309 	mutex_enter(&pwr_p->pwr_lock);
310 	switch (op) {
311 	case BUS_POWER_PRE_NOTIFICATION:
312 		PCIE_DBG("%s(%d): pcie_bus_power: %s@%d op %s %d->%d\n",
313 		    ddi_driver_name(dip), ddi_get_instance(dip),
314 		    ddi_driver_name(cdip), ddi_get_instance(cdip),
315 		    pcie_decode_pwr_op(op), old_level, new_level);
316 		/*
317 		 * If the nexus doesn't want the child to go into
318 		 * non-D0 state, mark the child busy. This way PM
319 		 * framework will never try to lower the child's power.
320 		 * In case of pm_lower_power, marking busy won't help.
321 		 * So we need to specifically reject the attempt to
322 		 * go to non-D0 state.
323 		 */
324 		if (pwr_p->pwr_flags & PCIE_NO_CHILD_PM) {
325 			if (!PCIE_IS_COMPS_COUNTED(cdip)) {
326 				PCIE_DBG("%s(%d): pcie_bus_power: marking "
327 				    "child busy to disable pm \n",
328 				    ddi_driver_name(dip),
329 				    ddi_get_instance(dip));
330 				(void) pm_busy_component(cdip, 0);
331 			}
332 			if (new_level < PM_LEVEL_D0 && !comp) {
333 				PCIE_DBG("%s(%d): pcie_bus_power: rejecting "
334 				    "child's attempt to go to %d\n",
335 				    ddi_driver_name(dip), ddi_get_instance(dip),
336 				    new_level);
337 				rv = DDI_FAILURE;
338 			}
339 		}
340 		mutex_exit(&pwr_p->pwr_lock);
341 		if (rv == DDI_SUCCESS)
342 			rv = pcie_pm_hold(dip);
343 		return (rv);
344 
345 	case BUS_POWER_HAS_CHANGED:
346 	case BUS_POWER_POST_NOTIFICATION:
347 		PCIE_DBG("%s(%d): pcie_bus_power: %s@%d op %s %d->%d\n",
348 		    ddi_driver_name(dip), ddi_get_instance(dip),
349 		    ddi_driver_name(cdip), ddi_get_instance(cdip),
350 		    pcie_decode_pwr_op(op), old_level, new_level);
351 		/*
352 		 * Child device power changed
353 		 * If pm components of this child aren't accounted for
354 		 * then add the components to the counters. This can't
355 		 * be done in POST_ATTACH ctlop as pm info isn't created
356 		 * by then. Also because a driver can make a pm call during
357 		 * the attach.
358 		 */
359 		if (!PCIE_IS_COMPS_COUNTED(cdip)) {
360 			(void) pcie_pm_add_child(dip, cdip);
361 			if ((pwr_p->pwr_flags & PCIE_NO_CHILD_PM) &&
362 			    (op == BUS_POWER_HAS_CHANGED)) {
363 				PCIE_DBG("%s(%d): pcie_bus_power: marking "
364 				    "child busy to disable pm \n",
365 				    ddi_driver_name(dip),
366 				    ddi_get_instance(dip));
367 				(void) pm_busy_component(cdip, 0);
368 				/*
369 				 * If the driver has already changed to lower
370 				 * power(pm_power_has_changed) on its own,
371 				 * there is nothing we can do other than
372 				 * logging the warning message on the console.
373 				 */
374 				if (new_level < PM_LEVEL_D0)
375 					cmn_err(CE_WARN, "!Downstream device "
376 					    "%s@%d went to non-D0 state: "
377 					    "possible loss of link\n",
378 					    ddi_driver_name(cdip),
379 					    ddi_get_instance(cdip));
380 			}
381 		}
382 
383 
384 		/*
385 		 * If it is POST and device PM is supported, release the
386 		 * hold done in PRE.
387 		 */
388 		if (op == BUS_POWER_POST_NOTIFICATION &&
389 		    PCIE_SUPPORTS_DEVICE_PM(dip)) {
390 			pcie_pm_subrelease(dip, pwr_p);
391 		}
392 
393 		if (*((int *)result) == DDI_FAILURE) {
394 			PCIE_DBG("%s(%d): pcie_bus_power: change for %s%d "
395 			    "failed\n", ddi_driver_name(dip),
396 			    ddi_get_instance(dip), ddi_driver_name(cdip),
397 			    ddi_get_instance(cdip));
398 			break;
399 		}
400 		/* Modify counters appropriately */
401 		pwr_update_counters(counters, old_level, new_level);
402 
403 		child_counters = PCIE_CHILD_COUNTERS(cdip);
404 		pwr_update_counters(child_counters, old_level, new_level);
405 
406 		/* If no device PM, return */
407 		if (!PCIE_SUPPORTS_DEVICE_PM(dip))
408 			break;
409 
410 		level_allowed = pwr_level_allowed(pwr_p);
411 		/*
412 		 * Check conditions for marking busy
413 		 * Check the flag to set this busy only once for multiple
414 		 * busy conditions. Mark busy if our current lowest possible
415 		 * is equal or greater to the current level.
416 		 */
417 		if (level_allowed >= pwr_p->pwr_func_lvl &&
418 		    !(pwr_p->pwr_flags & PCIE_PM_BUSY)) {
419 			PCIE_DBG("%s(%d): pcie_bus_power: marking busy\n",
420 			    ddi_driver_name(dip), ddi_get_instance(dip));
421 			(void) pm_busy_component(dip, 0);
422 			pwr_p->pwr_flags |= PCIE_PM_BUSY;
423 			break;
424 		}
425 		/*
426 		 * Check conditions for marking idle.
427 		 * If our lowest possible level is less than our current
428 		 * level mark idle. Mark idle only if it is not already done.
429 		 */
430 		if ((level_allowed < pwr_p->pwr_func_lvl) &&
431 		    (pwr_p->pwr_hold == 0) &&
432 		    (pwr_p->pwr_flags & PCIE_PM_BUSY)) {
433 			/*
434 			 * For pci express, we should check here whether
435 			 * the link is in L1 state or not.
436 			 */
437 			PCIE_DBG("%s(%d): pcie_bus_power: marking idle\n",
438 			    ddi_driver_name(dip), ddi_get_instance(dip));
439 			(void) pm_idle_component(dip, 0);
440 			pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
441 			break;
442 		}
443 		break;
444 
445 	default:
446 		mutex_exit(&pwr_p->pwr_lock);
447 		return (pm_busop_bus_power(dip, impl_arg, op, arg, result));
448 	}
449 	mutex_exit(&pwr_p->pwr_lock);
450 	return (rv);
451 }
452 
453 /*
454  * Decrement the count of children at olevel by one and increment
455  * count of children at nlevel by one.
456  */
457 static void
458 pwr_update_counters(int *countersp, int olevel, int nlevel)
459 {
460 	uint32_t	index;
461 
462 	ASSERT(olevel >= PM_LEVEL_UNKNOWN && olevel <= PM_LEVEL_D0);
463 	ASSERT(nlevel >= PM_LEVEL_UNKNOWN && nlevel <= PM_LEVEL_D0);
464 
465 	index = (olevel == PM_LEVEL_UNKNOWN ? PCIE_UNKNOWN_INDEX : olevel);
466 	countersp[index]--;
467 	index = (nlevel == PM_LEVEL_UNKNOWN ? PCIE_UNKNOWN_INDEX : nlevel);
468 	countersp[index]++;
469 }
470 
471 /*
472  * Returns the lowest possible power level allowed for nexus
473  * based on children's power level. Lowest possible level is
474  * equal to the highest level among the children. It also checks
475  * for the supported level
476  * UNKNOWN = D0 > D1 > D2 > D3
477  */
478 static int
479 pwr_level_allowed(pcie_pwr_t *pwr_p)
480 {
481 	int *counters = pwr_p->pwr_counters;
482 	int i, j;
483 
484 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
485 	/*
486 	 * Search from UNKNOWN to D2. unknown is same as D0.
487 	 * find the highest level among the children. If that
488 	 * level is supported, return that level. If not,
489 	 * find the next higher supported level and return that
490 	 * level. For example, if the D1 is the highest among
491 	 * children and if D1 isn't supported return D0 as the
492 	 * lowest possible level. We don't need to look at D3
493 	 * as that is the default lowest level and it is always
494 	 * supported.
495 	 */
496 	for (i = PCIE_UNKNOWN_INDEX; i > 0; i--) {
497 		if (counters[i]) {
498 			if (i == PCIE_UNKNOWN_INDEX)
499 				return (PM_LEVEL_D0);
500 			/*
501 			 * i is the highest level among children. If this is
502 			 * supported, return i.
503 			 */
504 			if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, i))
505 				return (i);
506 			/* find the next higher supported level */
507 			for (j = i + 1; j <= PCIE_D0_INDEX; j++) {
508 				if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, j))
509 					return (j);
510 			}
511 		}
512 	}
513 
514 	return (PM_LEVEL_D3);
515 }
516 
517 /*
518  * Update the counters with number pm components of the child
519  * all components are assumed to be at UNKNOWN level.
520  */
521 static void
522 pcie_add_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
523 {
524 	int comps = PM_NUMCMPTS(cdip);
525 	pcie_pm_t *pcie_pm_p;
526 	pcie_pwr_child_t *cpwr_p;
527 
528 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
529 	if (!comps)
530 		return;
531 
532 	PCIE_DBG("%s(%d): pcie_add_comps: unknown level counter incremented "
533 	    "from %d by %d because of %s@%d\n",
534 	    ddi_driver_name(dip), ddi_get_instance(dip),
535 	    (pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX], comps,
536 	    ddi_driver_name(cdip), ddi_get_instance(cdip));
537 	(pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX] += comps;
538 	/*
539 	 * Allocate counters per child. This is a part of pcie
540 	 * pm info. If there is no pcie pm info, allocate it here.
541 	 * pcie pm info might already be there for pci express nexus
542 	 * driver e.g. pcieb. For all leaf nodes, it is allocated here.
543 	 */
544 	if ((pcie_pm_p = PCIE_PMINFO(cdip)) == NULL) {
545 		pcie_pm_p = (pcie_pm_t *)kmem_zalloc(
546 		    sizeof (pcie_pm_t), KM_SLEEP);
547 		PCIE_SET_PMINFO(cdip, pcie_pm_p);
548 	}
549 	cpwr_p = (pcie_pwr_child_t *)kmem_zalloc(sizeof (pcie_pwr_child_t),
550 	    KM_SLEEP);
551 	pcie_pm_p->pcie_par_pminfo = cpwr_p;
552 	(cpwr_p->pwr_child_counters)[PCIE_UNKNOWN_INDEX] += comps;
553 }
554 
555 /*
556  * Remove the pm components of a child from our counters.
557  */
558 static void
559 pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
560 {
561 	int i;
562 	int *child_counters;
563 
564 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
565 	if (!(PCIE_PMINFO(cdip)) || !PCIE_PAR_PMINFO(cdip)) {
566 		if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
567 			/*
568 			 * Driver never made a PM call and we didn't create
569 			 * any counters for this device. This also means that
570 			 * hold made at the PRE_ATTACH time, still remains.
571 			 * Remove the hold now. The correct thing to do is to
572 			 * stay at full power when a child is at full power
573 			 * whether a driver is there or not. This will be
574 			 * implemented in the future.
575 			 */
576 			pcie_pm_subrelease(dip, pwr_p);
577 		}
578 		return;
579 	}
580 	PCIE_DBG("%s(%d): pcie_remove_comps:counters decremented because of "
581 	    "%s@%d\n", ddi_driver_name(dip), ddi_get_instance(dip),
582 	    ddi_driver_name(cdip), ddi_get_instance(cdip));
583 	child_counters = PCIE_CHILD_COUNTERS(cdip);
584 	/*
585 	 * Adjust the nexus counters. No need to adjust per child dip
586 	 * counters as we are freeing the per child dip info.
587 	 */
588 	for (i = 0; i < PCIE_MAX_PWR_LEVELS; i++) {
589 		ASSERT((pwr_p->pwr_counters)[i] >= child_counters[i]);
590 		(pwr_p->pwr_counters)[i] -= child_counters[i];
591 	}
592 	/* remove both parent pm info and pcie pminfo itself */
593 	kmem_free(PCIE_PAR_PMINFO(cdip), sizeof (pcie_pwr_child_t));
594 	kmem_free(PCIE_PMINFO(cdip), sizeof (pcie_pm_t));
595 	PCIE_RESET_PMINFO(cdip);
596 }
597 
598 /*
599  * Power management related initialization common to px and pcieb
600  */
601 int
602 pwr_common_setup(dev_info_t *dip)
603 {
604 	pcie_pm_t		*pcie_pm_p;
605 	pcie_pwr_t		*pwr_p;
606 	int			pminfo_created = 0;
607 
608 	/* Create pminfo, if it doesn't exist already */
609 	if ((pcie_pm_p = PCIE_PMINFO(dip)) == NULL) {
610 		pcie_pm_p = (pcie_pm_t *)kmem_zalloc(
611 		    sizeof (pcie_pm_t), KM_SLEEP);
612 		PCIE_SET_PMINFO(dip, pcie_pm_p);
613 		pminfo_created = 1;
614 	}
615 	pwr_p = (pcie_pwr_t *)kmem_zalloc(sizeof (pcie_pwr_t), KM_SLEEP);
616 	mutex_init(&pwr_p->pwr_lock, NULL, MUTEX_DRIVER, NULL);
617 	/* Initialize the power level and default level support */
618 	pwr_p->pwr_func_lvl = PM_LEVEL_UNKNOWN;
619 	pwr_p->pwr_pmcaps = PCIE_DEFAULT_LEVEL_SUPPORTED;
620 
621 	if (pcie_plat_pwr_setup(dip) != DDI_SUCCESS)
622 		goto pwr_common_err;
623 
624 	pcie_pm_p->pcie_pwr_p = pwr_p;
625 	return (DDI_SUCCESS);
626 
627 pwr_common_err:
628 	mutex_destroy(&pwr_p->pwr_lock);
629 	kmem_free(pwr_p, sizeof (pcie_pwr_t));
630 	if (pminfo_created) {
631 		PCIE_RESET_PMINFO(dip);
632 		kmem_free(pcie_pm_p, sizeof (pcie_pm_t));
633 	}
634 	return (DDI_FAILURE);
635 
636 }
637 
638 /*
639  * Undo whatever is done in pwr_common_setup. Called by px_detach or pxb_detach
640  */
641 void
642 pwr_common_teardown(dev_info_t *dip)
643 {
644 	pcie_pm_t *pcie_pm_p = PCIE_PMINFO(dip);
645 	pcie_pwr_t *pwr_p;
646 
647 	if (!pcie_pm_p || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
648 		return;
649 
650 	pcie_plat_pwr_teardown(dip);
651 	mutex_destroy(&pwr_p->pwr_lock);
652 	pcie_pm_p->pcie_pwr_p = NULL;
653 	kmem_free(pwr_p, sizeof (pcie_pwr_t));
654 	/*
655 	 * If the parent didn't store have any pm info about
656 	 * this node, that means parent doesn't need pminfo when it handles
657 	 * POST_DETACH for this node. For example, if dip is the dip of
658 	 * root complex, then there is no parent pm info.
659 	 */
660 	if (!PCIE_PAR_PMINFO(dip)) {
661 		kmem_free(pcie_pm_p, sizeof (pcie_pm_t));
662 		PCIE_RESET_PMINFO(dip);
663 	}
664 }
665 
666 /*
667  * Raises the power and marks itself busy.
668  */
669 int
670 pcie_pm_hold(dev_info_t *dip)
671 {
672 	pcie_pwr_t *pwr_p;
673 
674 	/* If no PM info or no device PM, return */
675 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)) ||
676 	    !(PCIE_SUPPORTS_DEVICE_PM(dip)))
677 		return (DDI_SUCCESS);
678 
679 	/*
680 	 * If we are not at full power, then powerup.
681 	 * Need to be at full power so that link can be
682 	 * at L0. Similarly for PCI/PCI-X bus, it should be
683 	 * at full power.
684 	 */
685 	mutex_enter(&pwr_p->pwr_lock);
686 	ASSERT(pwr_p->pwr_hold >= 0);
687 	PCIE_DBG("%s(%d): pm_hold: incrementing hold \n",
688 	    ddi_driver_name(dip), ddi_get_instance(dip));
689 	pwr_p->pwr_hold++;
690 	/* Mark itself busy, if it is not done already */
691 	if (!(pwr_p->pwr_flags & PCIE_PM_BUSY)) {
692 		PCIE_DBG("%s(%d): pm_hold: marking busy\n",
693 		    ddi_driver_name(dip), ddi_get_instance(dip));
694 		pwr_p->pwr_flags |= PCIE_PM_BUSY;
695 		(void) pm_busy_component(dip, 0);
696 	}
697 	if (pwr_p->pwr_func_lvl == PM_LEVEL_D0) {
698 		mutex_exit(&pwr_p->pwr_lock);
699 		return (DDI_SUCCESS);
700 	}
701 	mutex_exit(&pwr_p->pwr_lock);
702 	if (pm_raise_power(dip, 0, PM_LEVEL_D0) != DDI_SUCCESS) {
703 		PCIE_DBG("%s(%d): pm_hold: attempt to raise power "
704 		    "from %d to %d failed\n", ddi_driver_name(dip),
705 		    ddi_get_instance(dip), pwr_p->pwr_func_lvl,
706 		    PM_LEVEL_D0);
707 		pcie_pm_release(dip);
708 		return (DDI_FAILURE);
709 	}
710 	return (DDI_SUCCESS);
711 }
712 
713 /*
714  * Reverse the things done in pcie_pm_hold
715  */
716 void
717 pcie_pm_release(dev_info_t *dip)
718 {
719 	pcie_pwr_t *pwr_p;
720 
721 	/* If no PM info or no device PM, return */
722 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)) ||
723 	    !(PCIE_SUPPORTS_DEVICE_PM(dip)))
724 		return;
725 
726 	mutex_enter(&pwr_p->pwr_lock);
727 	pcie_pm_subrelease(dip, pwr_p);
728 	mutex_exit(&pwr_p->pwr_lock);
729 }
730 
731 static void
732 pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p)
733 {
734 	int level;
735 
736 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
737 	ASSERT(pwr_p->pwr_hold > 0);
738 	PCIE_DBG("%s(%d): pm_subrelease: decrementing hold \n",
739 	    ddi_driver_name(dip), ddi_get_instance(dip));
740 	pwr_p->pwr_hold--;
741 	ASSERT(pwr_p->pwr_hold >= 0);
742 	ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);
743 	level = pwr_level_allowed(pwr_p);
744 	if (pwr_p->pwr_hold == 0 && level < pwr_p->pwr_func_lvl) {
745 		PCIE_DBG("%s(%d): pm_subrelease: marking idle \n",
746 		    ddi_driver_name(dip), ddi_get_instance(dip));
747 		(void) pm_idle_component(dip, 0);
748 		pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
749 	}
750 }
751 
752 /*
753  * Called when the child makes the first power management call.
754  * sets up the counters. All the components of the child device are
755  * assumed to be at unknown level. It also releases the power hold
756  *	pwr_p - parent's pwr_t
757  *	cdip   - child's dip
758  */
759 int
760 pcie_pm_add_child(dev_info_t *dip, dev_info_t *cdip)
761 {
762 	pcie_pwr_t *pwr_p;
763 
764 	/* If no PM info, return */
765 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
766 		return (DDI_SUCCESS);
767 
768 	ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
769 	ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
770 	pcie_add_comps(dip, cdip, pwr_p);
771 
772 	/* If no device power management then return */
773 	if (!PCIE_SUPPORTS_DEVICE_PM(dip))
774 		return (DDI_SUCCESS);
775 
776 	/*
777 	 * We have informed PM that we are busy at PRE_ATTACH time for
778 	 * this child. Release the hold and but don't clear the busy bit.
779 	 * If a device never changes power, hold will not be released
780 	 * and we stay at full power.
781 	 */
782 	ASSERT(pwr_p->pwr_hold > 0);
783 	PCIE_DBG("%s(%d): pm_add_child: decrementing hold \n",
784 	    ddi_driver_name(dip), ddi_get_instance(dip));
785 	pwr_p->pwr_hold--;
786 	/*
787 	 * We must have made sure that busy bit
788 	 * is set when we put the hold
789 	 */
790 	ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);
791 	return (DDI_SUCCESS);
792 }
793 
794 /*
795  * Adjust the counters when a child detaches
796  * Marks itself idle if the idle conditions are met.
797  * Called at POST_DETACH time
798  */
799 int
800 pcie_pm_remove_child(dev_info_t *dip, dev_info_t *cdip)
801 {
802 	int *counters;
803 	int total;
804 	pcie_pwr_t *pwr_p;
805 
806 	/* If no PM info, return */
807 	if (!PCIE_PMINFO(dip) || !(pwr_p = PCIE_NEXUS_PMINFO(dip)))
808 		return (DDI_SUCCESS);
809 
810 	counters = pwr_p->pwr_counters;
811 	mutex_enter(&pwr_p->pwr_lock);
812 	pcie_remove_comps(dip, cdip, pwr_p);
813 	/* If no device power management then return */
814 	if (!PCIE_SUPPORTS_DEVICE_PM(dip)) {
815 		mutex_exit(&pwr_p->pwr_lock);
816 		return (DDI_SUCCESS);
817 	}
818 	total = (counters[PCIE_D0_INDEX] + counters[PCIE_UNKNOWN_INDEX] +
819 	    counters[PCIE_D1_INDEX] + counters[PCIE_D2_INDEX] +
820 	    counters[PCIE_D3_INDEX]);
821 	/*
822 	 * Mark idle if either there are no children or our lowest
823 	 * possible level is less than the current level. Mark idle
824 	 * only if it is not already done.
825 	 */
826 	if ((pwr_p->pwr_hold == 0) &&
827 	    (!total || (pwr_level_allowed(pwr_p) < pwr_p->pwr_func_lvl))) {
828 		if (pwr_p->pwr_flags & PCIE_PM_BUSY) {
829 			PCIE_DBG("%s(%d): pcie_bus_power: marking idle\n",
830 			    ddi_driver_name(dip), ddi_get_instance(dip));
831 			(void) pm_idle_component(dip, 0);
832 			pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
833 		}
834 	}
835 	mutex_exit(&pwr_p->pwr_lock);
836 	return (DDI_SUCCESS);
837 }
838 
839 boolean_t
840 pcie_is_pcie(dev_info_t *dip)
841 {
842 	pcie_bus_t *bus_p = PCIE_DIP2BUS(dip);
843 	ASSERT(bus_p);
844 	return (bus_p->bus_pcie_off != 0);
845 }
846 
847 /*
848  * Called by px_attach or pcieb_attach:: DDI_RESUME
849  */
850 int
851 pcie_pwr_resume(dev_info_t *dip)
852 {
853 	dev_info_t *cdip;
854 	pcie_pwr_t *pwr_p = NULL;
855 
856 #if defined(__i386) || defined(__amd64)
857 	if (dip)
858 		return (DDI_SUCCESS);
859 #endif /* defined(__i386) || defined(__amd64) */
860 
861 	if (PCIE_PMINFO(dip))
862 		pwr_p = PCIE_NEXUS_PMINFO(dip);
863 
864 	if (pwr_p) {
865 		/* Inform the PM framework that dip is at full power */
866 		if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
867 			ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
868 			(void) pm_raise_power(dip, 0,
869 			    pwr_p->pwr_func_lvl);
870 		}
871 	}
872 
873 	/*
874 	 * Code taken from pci driver.
875 	 * Restore config registers for children that did not save
876 	 * their own registers.  Children pwr states are UNKNOWN after
877 	 * a resume since it is possible for the PM framework to call
878 	 * resume without an actual power cycle. (ie if suspend fails).
879 	 */
880 	for (cdip = ddi_get_child(dip); cdip != NULL;
881 	    cdip = ddi_get_next_sibling(cdip)) {
882 		boolean_t	is_pcie;
883 
884 		/*
885 		 * Not interested in children who are not already
886 		 * init'ed.  They will be set up by init_child().
887 		 */
888 		if (i_ddi_node_state(cdip) < DS_INITIALIZED) {
889 			PCIE_DBG("%s(%d): "
890 			    "DDI_RESUME: skipping %s%d not in CF1\n",
891 			    ddi_driver_name(dip), ddi_get_instance(dip),
892 			    ddi_driver_name(cdip), ddi_get_instance(cdip));
893 			continue;
894 		}
895 
896 		/*
897 		 * Only restore config registers if saved by nexus.
898 		 */
899 		if (ddi_prop_exists(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
900 		    "nexus-saved-config-regs") != 1)
901 			continue;
902 
903 		PCIE_DBG("%s(%d): "
904 		    "DDI_RESUME: nexus restoring %s%d config regs\n",
905 		    ddi_driver_name(dip), ddi_get_instance(dip),
906 		    ddi_driver_name(cdip), ddi_get_instance(cdip));
907 
908 		/* clear errors left by OBP scrubbing */
909 		pcie_clear_errors(cdip);
910 
911 		/* PCIe workaround: disable errors during 4K config resore */
912 		is_pcie = pcie_is_pcie(cdip);
913 		if (is_pcie)
914 			pcie_disable_errors(cdip);
915 		(void) pci_restore_config_regs(cdip);
916 		if (is_pcie) {
917 			pcie_enable_errors(cdip);
918 			(void) pcie_enable_ce(cdip);
919 		}
920 
921 		if (ndi_prop_remove(DDI_DEV_T_NONE, cdip,
922 		    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
923 			PCIE_DBG("%s(%d): %s%d can't remove prop %s",
924 			    ddi_driver_name(dip), ddi_get_instance(dip),
925 			    ddi_driver_name(cdip), ddi_get_instance(cdip),
926 			    "nexus-saved-config-regs");
927 		}
928 	}
929 	return (DDI_SUCCESS);
930 }
931 
932 /*
933  * Called by pcie_detach or pcieb_detach:: DDI_SUSPEND
934  */
935 int
936 pcie_pwr_suspend(dev_info_t *dip)
937 {
938 	dev_info_t *cdip;
939 	int i, *counters; /* per nexus counters */
940 	int *child_counters = NULL; /* per child dip counters */
941 	pcie_pwr_t *pwr_p = NULL;
942 
943 #if defined(__i386) || defined(__amd64)
944 	if (dip)
945 		return (DDI_SUCCESS);
946 #endif /* defined(__i386) || defined(__amd64) */
947 
948 	if (PCIE_PMINFO(dip))
949 		pwr_p = PCIE_NEXUS_PMINFO(dip);
950 
951 	/*
952 	 * Mark all children to be unknown and bring our power level
953 	 * to full, if required. This is to avoid any panics while
954 	 * accessing the child's config space.
955 	 */
956 	if (pwr_p) {
957 		mutex_enter(&pwr_p->pwr_lock);
958 		if (PCIE_SUPPORTS_DEVICE_PM(dip) &&
959 		    pwr_p->pwr_func_lvl != PM_LEVEL_D0) {
960 			mutex_exit(&pwr_p->pwr_lock);
961 			if (pm_raise_power(dip, 0, PM_LEVEL_D0) !=
962 			    DDI_SUCCESS) {
963 				PCIE_DBG("%s(%d): pwr_suspend: attempt "
964 				    "to raise power from %d to %d "
965 				    "failed\n", ddi_driver_name(dip),
966 				    ddi_get_instance(dip), pwr_p->pwr_func_lvl,
967 				    PM_LEVEL_D0);
968 				return (DDI_FAILURE);
969 			}
970 			mutex_enter(&pwr_p->pwr_lock);
971 		}
972 		counters = pwr_p->pwr_counters;
973 		/*
974 		 * Update the nexus counters. At the resume time all
975 		 * components are considered to be at unknown level. Use the
976 		 * fact that counters for unknown level are at the end.
977 		 */
978 		for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
979 			counters[PCIE_UNKNOWN_INDEX] += counters[i];
980 			counters[i] = 0;
981 		}
982 		mutex_exit(&pwr_p->pwr_lock);
983 	}
984 
985 	/*
986 	 * Code taken from pci driver.
987 	 * Save the state of the configuration headers of child
988 	 * nodes.
989 	 */
990 	for (cdip = ddi_get_child(dip); cdip != NULL;
991 	    cdip = ddi_get_next_sibling(cdip)) {
992 		boolean_t	is_pcie;
993 
994 		/*
995 		 * Not interested in children who are not already
996 		 * init'ed.  They will be set up in init_child().
997 		 */
998 		if (i_ddi_node_state(cdip) < DS_INITIALIZED) {
999 			PCIE_DBG("%s(%d): DDI_SUSPEND: skipping "
1000 			    "%s%d not in CF1\n", ddi_driver_name(dip),
1001 			    ddi_get_instance(dip), ddi_driver_name(cdip),
1002 			    ddi_get_instance(cdip));
1003 			continue;
1004 		}
1005 		/*
1006 		 * Update per child dip counters, if any. Counters
1007 		 * will not exist if the child is not power manageable
1008 		 * or if its power entry is never invoked.
1009 		 */
1010 		if (PCIE_PMINFO(cdip) && PCIE_PAR_PMINFO(cdip))
1011 			child_counters = PCIE_CHILD_COUNTERS(cdip);
1012 		if (child_counters && pwr_p) {
1013 			mutex_enter(&pwr_p->pwr_lock);
1014 			for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
1015 				child_counters[PCIE_UNKNOWN_INDEX] +=
1016 				    child_counters[i];
1017 				child_counters[i] = 0;
1018 			}
1019 			mutex_exit(&pwr_p->pwr_lock);
1020 		}
1021 
1022 		/*
1023 		 * Only save config registers if not already saved by child.
1024 		 */
1025 		if (ddi_prop_exists(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
1026 		    SAVED_CONFIG_REGS) == 1) {
1027 			continue;
1028 		}
1029 
1030 		/*
1031 		 * The nexus needs to save config registers.  Create a property
1032 		 * so it knows to restore on resume.
1033 		 */
1034 		if (ndi_prop_create_boolean(DDI_DEV_T_NONE, cdip,
1035 		    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
1036 			PCIE_DBG("%s(%d): %s%d can't update prop %s",
1037 			    ddi_driver_name(dip), ddi_get_instance(dip),
1038 			    ddi_driver_name(cdip), ddi_get_instance(cdip),
1039 			    "nexus-saved-config-regs");
1040 		}
1041 		PCIE_DBG("%s(%d): DDI_SUSPEND: saving config space for"
1042 		    " %s%d\n", ddi_driver_name(dip), ddi_get_instance(dip),
1043 		    ddi_driver_name(cdip), ddi_get_instance(cdip));
1044 
1045 		/* PCIe workaround: disable errors during 4K config save */
1046 		is_pcie = pcie_is_pcie(cdip);
1047 		if (is_pcie)
1048 			pcie_disable_errors(cdip);
1049 		(void) pci_save_config_regs(cdip);
1050 		if (is_pcie) {
1051 			pcie_enable_errors(cdip);
1052 			(void) pcie_enable_ce(cdip);
1053 		}
1054 	}
1055 	return (DDI_SUCCESS);
1056 }
1057 
1058 #ifdef DEBUG
1059 /*
1060  * Description of bus_power_op.
1061  */
1062 typedef struct pcie_buspwr_desc {
1063 	pm_bus_power_op_t pwr_op;
1064 	char *pwr_desc;
1065 } pcie_buspwr_desc_t;
1066 
1067 static pcie_buspwr_desc_t pcie_buspwr_desc[] = {
1068 	{BUS_POWER_CHILD_PWRCHG, "CHILD_PWRCHG"},
1069 	{BUS_POWER_NEXUS_PWRUP, "NEXUS_PWRUP"},
1070 	{BUS_POWER_PRE_NOTIFICATION, "PRE_NOTIFICATION"},
1071 	{BUS_POWER_POST_NOTIFICATION, "POST_NOTIFICATION"},
1072 	{BUS_POWER_HAS_CHANGED, "HAS_CHANGED"},
1073 	{BUS_POWER_NOINVOL, "NOINVOL"},
1074 	{-1, NULL}
1075 };
1076 
1077 /*
1078  * Returns description of the bus_power_op.
1079  */
1080 static char *
1081 pcie_decode_pwr_op(pm_bus_power_op_t op)
1082 {
1083 	pcie_buspwr_desc_t *descp = pcie_buspwr_desc;
1084 
1085 	for (; descp->pwr_desc; descp++) {
1086 		if (op == descp->pwr_op)
1087 			return (descp->pwr_desc);
1088 	}
1089 	return ("UNKNOWN OP");
1090 }
1091 #endif
1092