/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/disp.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/mutex.h>
#include <sys/project.h>
#include <sys/sdt.h>
#include <sys/zone.h>
#include <sys/cpucaps_impl.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. A zone CPU cap limits the
 * CPU usage of all projects running inside the zone. If the zone CPU cap is
 * set below the project CPU cap, the latter will have no effect.
 *
 * When the CPU usage of projects and/or zones reaches the specified caps,
 * threads in them do not get scheduled and instead are placed on wait queues
 * associated with a cap. Such threads will start running again only when CPU
 * usage drops below the cap level. Each zone and each project has its own
 * wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped
 * zone is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is
 * considered over-capped. Every user thread caught running in an over-capped
 * project or zone is marked by setting the TS_PROJWAITQ flag in the thread's
 * t_schedflag field and is requested to surrender its CPU. This causes the
 * scheduling class specific CL_PREEMPT() callback to be invoked. The callback
 * function places threads marked as TS_PROJWAITQ on a wait queue and calls
 * swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving
 * CPU. New time means time since the thread was last accounted for. On-CPU
 * times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project. Zone
 * CPU usage is the sum of usages for all projects within the zone. Zone CPU
 * usage is calculated on every clock tick by walking the list of projects and
 * adding their usage together.
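 *
 * For illustration, assuming the common clock rate of 100 ticks per second,
 * cap_tick_cost = TICK_TO_NSEC(1) = 10,000,000 ns. A thread that stayed on
 * CPU for a full tick charges 10,000,000 ns to its project, and a thread
 * that accumulated more than one tick of on-CPU time since it was last
 * accounted for still charges only one tick's worth:
 *
 *	usage_delta = new_usage - csc->csc_cputime;
 *	if (usage_delta > cap_tick_cost)
 *		usage_delta = cap_tick_cost;
 *
 * (see caps_charge_adjust() below).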
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine which is called once per
 * clock tick. It walks the list of project caps and decays their usages by
 * one per cent. If CPU usage drops below cap levels, threads on the wait
 * queue are made runnable again, one thread per clock tick.
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 * cpucaps_project_add(kproject_t *)
 *
 *	Notifies the framework of a new project. It should be put on the
 *	capped_projects list if its zone has a cap.
 *
 * cpucaps_project_remove(kproject_t *)
 *
 *	Removes the association between the specified project and its cap.
 *	Called right before the project is destroyed.
 *
 * cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 *	Sets the cap of the specified project to the specified value. Setting
 *	the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 *	Sets the cap of the specified zone to the specified value. Setting
 *	the value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_remove(zone_t *)
 *
 *	Removes the association between the zone and its cap.
 *
 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 *	Charges the specified thread's project the amount of on-CPU time that
 *	it used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 *	Otherwise returns True if the thread should be penalized because its
 *	project or zone is exceeding its cap, and also sets the TS_PROJWAITQ
 *	or TS_ZONEWAITQ bits in t_schedflag in this case.
 *
 * cpucaps_enforce(kthread_t *)
 *
 *	Enforces CPU caps for the specified thread. Places LWPs running in
 *	LWP_USER state on project or zone wait queues, as requested by the
 *	TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 *	thread was placed on a wait queue or False otherwise.
 *
 * cpucaps_sc_init(caps_sc_t *)
 *
 *	Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All the individual cap structures and their lists are protected by a global
 * caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at high
 * PIL and by the clock thread performing usage decay. We should do as little
 * work as possible while holding the lock since it may be very hot. All
 * threads in the project contend for the same cache line doing cap usage
 * updates.
 */
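/*
 * Worked example of the cap arithmetic (assuming hz = 100 as above): a cap of
 * 50 (per cent of one CPU) is stored as cap_value = 50 * cap_tick_cost ns. A
 * workload that keeps one CPU fully busy gains about cap_tick_cost of usage
 * per tick, while the decay removes one per cent of the accumulated usage per
 * tick, so the usage settles where the two balance:
 *
 *	usage / CAP_DECAY_FACTOR == cap_tick_cost
 *	usage == 100 * cap_tick_cost
 *
 * This is why cap_usage / cap_tick_cost can be read directly as a percentage
 * of one CPU (see cap_get() below).
 */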
/*
 * caps_lock protects the list of capped projects and zones, changes in the
 * cap state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone cap flag, but we don't keep
 * any cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick, which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick:
 * decay one per cent of the value per tick.
 */
#define	CAP_DECAY_FACTOR 100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))

static void caps_update();

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 *   - Initialize lists of capped zones and capped projects
 *   - Set cpucaps_clock_callout to NULL
 */
void
cpucaps_init()
{
	/*
	 * Initialize global variables.
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize a cpucap structure.
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free a cpucap structure.
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active.
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate a cap: insert it into the active list and unblock its wait queue.
 * Should be called with caps_lock held. The cap_value field is set to the
 * value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * The cap cannot already be enabled.
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);

	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);

	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}
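/*
 * For orientation, the life cycle of a cpucap_t as implemented by the
 * routines above and below (a summary sketch, not a new code path):
 *
 *	cap = cap_alloc();		caps_lock not held, KM_SLEEP is safe
 *	mutex_enter(&caps_lock);
 *	cap_enable(list, cap, value);	cap goes on the capped list
 *	...				usage is charged and decayed per tick
 *	cap_disable(list, cap);		off the list, waiters made runnable
 *	cap_free(cap);			once the project/zone drops the cap
 *	mutex_exit(&caps_lock);
 */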
/*
 * Deactivate a cap:
 *   - Block its wait queue. This prevents any new threads from being
 *     enqueued there and moves all enqueued threads to the run queue.
 *   - Remove the cap from list l.
 *   - Disable CPU caps globally if there are no capped projects or zones.
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * The cap should be currently active.
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable a cap for project kpj.
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable a project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable a cap for a zone.
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable a zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
{
	static uint64_t cpucap_walk_gen;
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap, cpucap_walk_gen);
	}

	atomic_inc_64(&cpucap_walk_gen);
}
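/*
 * A note on the generation counter, derived from cap_project_usage_walker()
 * below: cap_walk() passes the same cpucap_walk_gen value to every callback
 * during one walk and increments it afterwards. A capped zone can therefore
 * reset its aggregated usage exactly once per tick, when the first of its
 * projects is visited:
 *
 *	if (zcap->cap_gen != gen) {
 *		zcap->cap_usage = 0;	first project of the zone this tick
 *		zcap->cap_gen = gen;
 *	}
 *	zcap->cap_usage += cap_usage;	later projects just accumulate
 */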
/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If
 * a new thread is placed on the waitq right after the check, it will be
 * picked up during the next invocation of cap_poke_waitq().
 */
/* ARGSUSED */
static void
cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on the capped_projects list.
 *   - Decay cap usage by CAP_DECAY_FACTOR.
 *   - Add this project's usage to its zone usage.
 *   - Kick one thread off the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap, int64_t gen)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Update the above/below statistics and make one waiting thread
	 * runnable if the project is below its cap.
	 */
	cap_poke_waitq(cap, 0);

	/*
	 * Add the project's CPU usage to its zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock
		 * tick yet, then do it now. The cap_gen field is used to
		 * check whether this is the first of the zone's projects we
		 * see during this tick or a subsequent one.
		 */
		if (zcap->cap_gen != gen) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_gen = gen;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}
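/*
 * How caps_update() gets run: cpucaps_clock_callout is a hook invoked from
 * the clock code once per tick when it is non-NULL; cap_enable() installs
 * caps_update() there when the first cap appears and cap_disable() clears it
 * with the last cap. The caller's side presumably looks roughly like this
 * (a sketch; the actual code lives in the clock code, not in this file):
 *
 *	if (cpucaps_clock_callout != NULL)
 *		(*cpucaps_clock_callout)();
 */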
/*
 * The function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock
	 * held - and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set the zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are
	 * off or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign a newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);

			/*
			 * Disable caps for all projects belonging to this
			 * zone unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable caps for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value.
		 */
		cap->cap_value = value;
	}
	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * The project is going away so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);

	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);

	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}

	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);

	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}

	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}

	mutex_exit(&caps_lock);
}
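/*
 * From user-land, these setters are reached through the resource control
 * framework rather than called directly. As a usage illustration (the rctl
 * plumbing lives outside this file and "myzone" is a made-up zone name),
 * capping a running zone at half of one CPU:
 *
 *	# prctl -n zone.cpu-cap -v 50 -t privileged -r -i zone myzone
 *
 * which should eventually arrive here as cpucaps_zone_set(zone, 50); clearing
 * the resource control corresponds to the NOCAP case.
 */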
/*
 * A new project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap
	 * structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set the project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a project cap when caps are not
	 * enabled or when the project does not have a cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing.
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * The user requested to drop the cap on the project. If it is
		 * part of a capped zone, keep the cap and set the value to
		 * MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}

	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ?
	    (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}
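/*
 * Worked example for cap_get(), again assuming hz = 100 so that
 * cap_tick_cost = 10,000,000 ns: a project whose decayed cap_usage has
 * settled at 1,500,000,000 ns reports 1,500,000,000 / 10,000,000 = 150,
 * i.e. one and a half CPUs, in the same per-cent-of-a-CPU units in which
 * caps are set.
 */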
/*
 * Charge the project of thread t the time thread t spent on CPU since it was
 * previously adjusted.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between
 * the thread and its project is protected by the thread lock.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t	*kpj = ttoproj(t);
	hrtime_t	new_usage;
	hrtime_t	usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(kpj->kpj_cpucap != NULL);

	/* Get on-CPU time since the birth of the thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. It is updated
		 * outside the lock to reduce the time spent while holding
		 * the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}

/*
 * Charge the thread's project and return True if the project or zone should
 * be penalized because the project or zone is exceeding its cap. Also set
 * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between
 * the thread and its project is protected by the thread lock. It will still
 * set TS_PROJWAITQ/TS_ZONEWAITQ in this case, but cpucaps_enforce() will not
 * place anything on the blocked wait queue.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t	*kpj = ttoproj(t);
	klwp_t		*lwp = t->t_lwp;
	zone_t		*zone;
	cpucap_t	*project_cap;
	boolean_t	rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested charging the project usage, so skip the
	 * enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}
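/*
 * For context, a scheduling class is expected to drive the charge/enforce
 * pair from its preemption path roughly as follows (a sketch modeled on the
 * TS class; tspp->ts_caps is that class's caps_sc_t and is not defined in
 * this file):
 *
 *	if (CPUCAPS_ON()) {
 *		(void) cpucaps_charge(t, &tspp->ts_caps,
 *		    CPUCAPS_CHARGE_ENFORCE);
 *		if (CPUCAPS_ENFORCE(t))
 *			return;		thread parked on a cap wait queue
 *	}
 */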
/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that it
 * does not hold any kernel locks, so enqueue ourselves on the waitq, if
 * needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t	tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}
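/*
 * The values computed above are visible from user-land through kstat(1M).
 * As an observability illustration (the exact module and instance naming
 * comes from rctl_kstat_create_project()/rctl_kstat_create_zone() and may
 * differ):
 *
 *	$ kstat -m caps
 *
 * which lists the per-project and per-zone "cpucaps" kstats with the value,
 * usage, maxusage, nwait, above_sec and below_sec statistics filled in by
 * cap_kstat_update().
 */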