/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * RESOURCE POOLS * * The resource pools facility brings together process-bindable resource into * a common abstraction called a pool. Processor sets and other entities can * be configured, grouped, and labelled such that workload components can be * associated with a subset of a system's total resources. * * When disabled, the pools facility is "invisible". All processes belong * to the same pool (pool_default), and processor sets can be managed through * the old pset() system call. When enabled, processor sets can only be * managed via the pools facility. New pools can be created and associated * with processor sets. Processes can be bound to pools which have non-empty * resource sets. * * Locking: pool_lock() protects global pools state and must be called * before modifying the configuration, or when taking a snapshot of the * configuration. If pool_lock_intr() is used, the operation may be * interrupted by a signal or a request. * * To prevent processes from being rebound between pools while they are * the middle of an operation which affects resource set bindings, such * operations must be surrounded by calls to pool_barrier_enter() and * pool_barrier_exit(). This mechanism guarantees that such processes will * be stopped either at the beginning or at the end of the barrier so that * the rebind operation can atomically bind the process and its threads * to new resource sets, and then let process run again. * * Lock ordering with respect to other locks is as follows: * * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock * * Most static and global variables defined in this file are protected * by calling pool_lock(). * * The operation that binds tasks and projects to pools is atomic. That is, * either all processes in a given task or a project will be bound to a * new pool, or (in case of an error) they will be all left bound to the * old pool. Processes in a given task or a given project can only be bound to * different pools if they were rebound individually one by one as single * processes. Threads or LWPs of the same process do not have pool bindings, * and are bound to the same resource sets associated with the resource pool * of that process. * * The following picture shows one possible pool configuration with three * pools and three processor sets. Note that processor set "foo" is not * associated with any pools and therefore cannot have any processes * bound to it. Two pools (default and foo) are associated with the * same processor set (default). Also, note that processes in Task 2 * are bound to different pools. * * * Processor Sets * +---------+ * +--------------+========================>| default | * a| | +---------+ * s| | || * s| | +---------+ * o| | | foo | * c| | +---------+ * i| | || * a| | +---------+ * t| | +------>| bar | * e| | | +---------+ * d| | | * | | | * +---------+ +---------+ +---------+ * Pools | default |======| foo |======| bar | * +---------+ +---------+ +---------+ * @ @ @ @ @ @ * b| | | | | | * o| | | | | | * u| +-----+ | +-------+ | +---+ * n| | | | | | * ....d|........|......|......|.........|.......|.... * : | :: | | | :: | | : * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : * Processes : | p | :: | p | | p | | p | :: | p |...| p | : * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : * :........::......................::...............: * Task 1 Task 2 Task N * | | | * | | | * | +-----------+ | +-----------+ * +--| Project 1 |--+ | Project N | * +-----------+ +-----------+ * * This is just an illustration of relationships between processes, tasks, * projects, pools, and processor sets. New types of resource sets will be * added in the future. */ pool_t *pool_default; /* default pool which always exists */ int pool_count; /* number of pools created on this system */ int pool_state; /* pools state -- enabled/disabled */ void *pool_buf; /* pre-commit snapshot of the pools state */ size_t pool_bufsz; /* size of pool_buf */ static hrtime_t pool_pool_mod; /* last modification time for pools */ static hrtime_t pool_sys_mod; /* last modification time for system */ static nvlist_t *pool_sys_prop; /* system properties */ static id_space_t *pool_ids; /* pool ID space */ static list_t pool_list; /* doubly-linked list of pools */ static kmutex_t pool_mutex; /* protects pool_busy_* */ static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */ static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */ static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */ static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */ static int pool_barrier_count; /* synch. with pool_barrier_* */ static list_t pool_event_cb_list; /* pool event callbacks */ static boolean_t pool_event_cb_init = B_FALSE; static kmutex_t pool_event_cb_lock; static taskq_t *pool_event_cb_taskq = NULL; void pool_event_dispatch(pool_event_t, poolid_t); /* * Boot-time pool initialization. */ void pool_init(void) { pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID); /* * Initialize default pool. */ pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP); pool_default->pool_id = POOL_DEFAULT; list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link)); list_insert_head(&pool_list, pool_default); /* * Initialize plugins for resource sets. */ pool_pset_init(); pool_count = 1; p0.p_pool = pool_default; global_zone->zone_pool = pool_default; pool_default->pool_ref = 1; } /* * Synchronization routines. * * pool_lock is only called from syscall-level routines (processor_bind(), * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long * periods of time, including across sleeping operations, so we allow its * acquisition to be interruptible. * * The current thread that owns the "lock" is stored in the variable * pool_busy_thread, both to let pool_lock_held() work and to aid debugging. */ void pool_lock(void) { mutex_enter(&pool_mutex); ASSERT(!pool_lock_held()); while (pool_busy_thread != NULL) cv_wait(&pool_busy_cv, &pool_mutex); pool_busy_thread = curthread; mutex_exit(&pool_mutex); } int pool_lock_intr(void) { mutex_enter(&pool_mutex); ASSERT(!pool_lock_held()); while (pool_busy_thread != NULL) { if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) { cv_signal(&pool_busy_cv); mutex_exit(&pool_mutex); return (1); } } pool_busy_thread = curthread; mutex_exit(&pool_mutex); return (0); } int pool_lock_held(void) { return (pool_busy_thread == curthread); } void pool_unlock(void) { mutex_enter(&pool_mutex); ASSERT(pool_lock_held()); pool_busy_thread = NULL; cv_signal(&pool_busy_cv); mutex_exit(&pool_mutex); } /* * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize * with pool_do_bind(). * * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all * operations which modify pool or pset associations. They can be called * while the process is multi-threaded. In the common case, when current * process is not being rebound (PBWAIT flag is not set), these functions * will be just incrementing and decrementing reference counts. */ void pool_barrier_enter(void) { proc_t *p = curproc; ASSERT(MUTEX_HELD(&p->p_lock)); while (p->p_poolflag & PBWAIT) cv_wait(&p->p_poolcv, &p->p_lock); p->p_poolcnt++; } void pool_barrier_exit(void) { proc_t *p = curproc; ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(p->p_poolcnt > 0); p->p_poolcnt--; if (p->p_poolflag & PBWAIT) { mutex_enter(&pool_barrier_lock); ASSERT(pool_barrier_count > 0); pool_barrier_count--; if (pool_barrier_count == 0) cv_signal(&pool_barrier_cv); mutex_exit(&pool_barrier_lock); while (p->p_poolflag & PBWAIT) cv_wait(&p->p_poolcv, &p->p_lock); } } /* * Enable pools facility. */ static int pool_enable(void) { int ret; ASSERT(pool_lock_held()); ASSERT(pool_count == 1); ret = pool_pset_enable(); if (ret != 0) return (ret); (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP); (void) nvlist_add_string(pool_sys_prop, "system.name", "default"); (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", "wt-load"); (void) nvlist_alloc(&pool_default->pool_props, NV_UNIQUE_NAME, KM_SLEEP); (void) nvlist_add_string(pool_default->pool_props, "pool.name", "pool_default"); (void) nvlist_add_string(pool_default->pool_props, "pool.comment", ""); (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1); (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1); (void) nvlist_add_int64(pool_default->pool_props, "pool.importance", 1); (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id", pool_default->pool_id); pool_sys_mod = pool_pool_mod = gethrtime(); return (ret); } /* * Disable pools facility. */ static int pool_disable(void) { int ret; ASSERT(pool_lock_held()); if (pool_count > 1) /* must destroy all pools first */ return (EBUSY); ret = pool_pset_disable(); if (ret != 0) return (ret); if (pool_sys_prop != NULL) { nvlist_free(pool_sys_prop); pool_sys_prop = NULL; } if (pool_default->pool_props != NULL) { nvlist_free(pool_default->pool_props); pool_default->pool_props = NULL; } return (0); } pool_t * pool_lookup_pool_by_name(char *name) { pool_t *pool = pool_default; char *p; ASSERT(pool_lock_held()); for (pool = list_head(&pool_list); pool; pool = list_next(&pool_list, pool)) { if (nvlist_lookup_string(pool->pool_props, "pool.name", &p) == 0 && strcmp(name, p) == 0) return (pool); } return (NULL); } pool_t * pool_lookup_pool_by_id(poolid_t poolid) { pool_t *pool = pool_default; ASSERT(pool_lock_held()); for (pool = list_head(&pool_list); pool; pool = list_next(&pool_list, pool)) { if (pool->pool_id == poolid) return (pool); } return (NULL); } pool_t * pool_lookup_pool_by_pset(int id) { pool_t *pool = pool_default; psetid_t psetid = (psetid_t)id; ASSERT(pool_lock_held()); for (pool = list_head(&pool_list); pool != NULL; pool = list_next(&pool_list, pool)) { if (pool->pool_pset->pset_id == psetid) return (pool); } return (NULL); } /* * Create new pool, associate it with default resource sets, and give * it a temporary name. */ static int pool_pool_create(poolid_t *poolid) { pool_t *pool; char pool_name[40]; ASSERT(pool_lock_held()); pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP); pool->pool_id = *poolid = id_alloc(pool_ids); pool->pool_pset = pool_pset_default; pool_pset_default->pset_npools++; list_insert_tail(&pool_list, pool); (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP); (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id); (void) nvlist_add_byte(pool->pool_props, "pool.default", 0); pool_pool_mod = gethrtime(); (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld", pool_pool_mod); (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name); pool_count++; return (0); } struct destroy_zone_arg { pool_t *old; pool_t *new; }; /* * Update pool pointers for zones that are currently bound to pool "old" * to be bound to pool "new". */ static int pool_destroy_zone_cb(zone_t *zone, void *arg) { struct destroy_zone_arg *dza = arg; ASSERT(pool_lock_held()); ASSERT(MUTEX_HELD(&cpu_lock)); if (zone_pool_get(zone) == dza->old) zone_pool_set(zone, dza->new); return (0); } /* * Destroy specified pool, and rebind all processes in it * to the default pool. */ static int pool_pool_destroy(poolid_t poolid) { pool_t *pool; int ret; ASSERT(pool_lock_held()); if (poolid == POOL_DEFAULT) return (EINVAL); if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) return (ESRCH); ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL); if (ret == 0) { struct destroy_zone_arg dzarg; dzarg.old = pool; dzarg.new = pool_default; mutex_enter(&cpu_lock); ret = zone_walk(pool_destroy_zone_cb, &dzarg); mutex_exit(&cpu_lock); ASSERT(ret == 0); ASSERT(pool->pool_ref == 0); (void) nvlist_free(pool->pool_props); id_free(pool_ids, pool->pool_id); pool->pool_pset->pset_npools--; list_remove(&pool_list, pool); pool_count--; pool_pool_mod = gethrtime(); kmem_free(pool, sizeof (pool_t)); } return (ret); } /* * Create new pool or resource set. */ int pool_create(int class, int subclass, id_t *id) { int ret; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (class) { case PEC_POOL: ret = pool_pool_create((poolid_t *)id); break; case PEC_RES_COMP: switch (subclass) { case PREC_PSET: ret = pool_pset_create((psetid_t *)id); break; default: ret = EINVAL; } break; case PEC_RES_AGG: ret = ENOTSUP; break; default: ret = EINVAL; } return (ret); } /* * Destroy an existing pool or resource set. */ int pool_destroy(int class, int subclass, id_t id) { int ret; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (class) { case PEC_POOL: ret = pool_pool_destroy((poolid_t)id); break; case PEC_RES_COMP: switch (subclass) { case PREC_PSET: ret = pool_pset_destroy((psetid_t)id); break; default: ret = EINVAL; } break; case PEC_RES_AGG: ret = ENOTSUP; break; default: ret = EINVAL; } return (ret); } /* * Enable or disable pools. */ int pool_status(int status) { int ret = 0; ASSERT(pool_lock_held()); if (pool_state == status) return (0); switch (status) { case POOL_ENABLED: ret = pool_enable(); if (ret != 0) return (ret); pool_state = POOL_ENABLED; pool_event_dispatch(POOL_E_ENABLE, 0); break; case POOL_DISABLED: ret = pool_disable(); if (ret != 0) return (ret); pool_state = POOL_DISABLED; pool_event_dispatch(POOL_E_DISABLE, 0); break; default: ret = EINVAL; } return (ret); } /* * Associate pool with resource set. */ int pool_assoc(poolid_t poolid, int idtype, id_t id) { int ret; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (idtype) { case PREC_PSET: ret = pool_pset_assoc(poolid, (psetid_t)id); if (ret == 0) pool_event_dispatch(POOL_E_CHANGE, poolid); break; default: ret = EINVAL; } if (ret == 0) pool_pool_mod = gethrtime(); return (ret); } /* * Disassociate resource set from pool. */ int pool_dissoc(poolid_t poolid, int idtype) { int ret; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (idtype) { case PREC_PSET: ret = pool_pset_assoc(poolid, PS_NONE); if (ret == 0) pool_event_dispatch(POOL_E_CHANGE, poolid); break; default: ret = EINVAL; } if (ret == 0) pool_pool_mod = gethrtime(); return (ret); } /* * Transfer specified quantity of resources between resource sets. */ /*ARGSUSED*/ int pool_transfer(int type, id_t src, id_t dst, uint64_t qty) { int ret = EINVAL; return (ret); } static poolid_t pool_lookup_id_by_pset(int id) { pool_t *pool = pool_default; psetid_t psetid = (psetid_t)id; ASSERT(pool_lock_held()); for (pool = list_head(&pool_list); pool != NULL; pool = list_next(&pool_list, pool)) { if (pool->pool_pset->pset_id == psetid) return (pool->pool_id); } return (POOL_INVALID); } /* * Transfer resources specified by their IDs between resource sets. */ int pool_xtransfer(int type, id_t src_pset, id_t dst_pset, uint_t size, id_t *ids) { int ret; poolid_t src_pool, dst_pool; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (type) { case PREC_PSET: ret = pool_pset_xtransfer((psetid_t)src_pset, (psetid_t)dst_pset, size, ids); if (ret == 0) { if ((src_pool = pool_lookup_id_by_pset(src_pset)) != POOL_INVALID) pool_event_dispatch(POOL_E_CHANGE, src_pool); if ((dst_pool = pool_lookup_id_by_pset(dst_pset)) != POOL_INVALID) pool_event_dispatch(POOL_E_CHANGE, dst_pool); } break; default: ret = EINVAL; } return (ret); } /* * Bind processes to pools. */ int pool_bind(poolid_t poolid, idtype_t idtype, id_t id) { pool_t *pool; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) return (ESRCH); switch (idtype) { case P_PID: case P_TASKID: case P_PROJID: case P_ZONEID: break; default: return (EINVAL); } return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); } /* * Query pool binding of the specifed process. */ int pool_query_binding(idtype_t idtype, id_t id, id_t *poolid) { proc_t *p; if (idtype != P_PID) return (ENOTSUP); if (id == P_MYID) id = curproc->p_pid; ASSERT(pool_lock_held()); mutex_enter(&pidlock); if ((p = prfind((pid_t)id)) == NULL) { mutex_exit(&pidlock); return (ESRCH); } mutex_enter(&p->p_lock); /* * In local zones, lie about pool bindings of processes from * the global zone. */ if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) { pool_t *pool; pool = zone_pool_get(curproc->p_zone); *poolid = pool->pool_id; } else { *poolid = p->p_pool->pool_id; } mutex_exit(&p->p_lock); mutex_exit(&pidlock); return (0); } static ea_object_t * pool_system_pack(void) { ea_object_t *eo_system; size_t bufsz = 0; char *buf = NULL; ASSERT(pool_lock_held()); eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM); (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t), EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64); if (INGLOBALZONE(curproc)) (void) ea_attach_item(eo_system, &pool_pool_mod, sizeof (hrtime_t), EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); else (void) ea_attach_item(eo_system, &curproc->p_zone->zone_pool_mod, sizeof (hrtime_t), EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t), EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64); (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t), EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64); (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0); (void) ea_attach_item(eo_system, buf, bufsz, EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW); kmem_free(buf, bufsz); return (eo_system); } /* * Pack information about pools and attach it to specified exacct group. */ static int pool_pool_pack(ea_object_t *eo_system) { ea_object_t *eo_pool; pool_t *pool; size_t bufsz; char *buf; pool_t *myzonepool; ASSERT(pool_lock_held()); myzonepool = zone_pool_get(curproc->p_zone); for (pool = list_head(&pool_list); pool; pool = list_next(&pool_list, pool)) { if (!INGLOBALZONE(curproc) && myzonepool != pool) continue; bufsz = 0; buf = NULL; eo_pool = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_POOL); (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t), EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32); (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id, sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32); (void) nvlist_pack(pool->pool_props, &buf, &bufsz, NV_ENCODE_NATIVE, 0); (void) ea_attach_item(eo_pool, buf, bufsz, EXC_LOCAL | EXD_POOL_PROP | EXT_RAW); kmem_free(buf, bufsz); (void) ea_attach_to_group(eo_system, eo_pool); } return (0); } /* * Pack the whole pool configuration in the specified buffer. */ int pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize) { ea_object_t *eo_system; size_t ksize; int ret = 0; ASSERT(pool_lock_held()); eo_system = pool_system_pack(); /* 1. pack system */ (void) pool_pool_pack(eo_system); /* 2. pack all pools */ (void) pool_pset_pack(eo_system); /* 3. pack all psets */ ksize = ea_pack_object(eo_system, NULL, 0); if (kbuf == NULL || kbufsz == 0) *asize = ksize; else if (ksize > kbufsz) ret = ENOMEM; else *asize = ea_pack_object(eo_system, kbuf, kbufsz); ea_free_object(eo_system, EUP_ALLOC); return (ret); } /* * Start/end the commit transaction. If commit transaction is currently * in progress, then all POOL_QUERY ioctls will return pools configuration * at the beginning of transaction. */ int pool_commit(int state) { ea_object_t *eo_system; int ret = 0; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (state) { case 1: /* * Beginning commit transation. */ if (pool_buf != NULL) /* transaction in progress */ return (EBUSY); eo_system = pool_system_pack(); /* 1. pack system */ (void) pool_pool_pack(eo_system); /* 2. pack all pools */ (void) pool_pset_pack(eo_system); /* 3. pack all psets */ pool_bufsz = ea_pack_object(eo_system, NULL, 0); pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP); pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz); ea_free_object(eo_system, EUP_ALLOC); break; case 0: /* * Finishing commit transaction. */ if (pool_buf != NULL) { kmem_free(pool_buf, pool_bufsz); pool_buf = NULL; pool_bufsz = 0; } break; default: ret = EINVAL; } return (ret); } /* * Check is the specified property is special */ static pool_property_t * pool_property_find(char *name, pool_property_t *list) { pool_property_t *prop; for (prop = list; prop->pp_name != NULL; prop++) if (strcmp(prop->pp_name, name) == 0) return (prop); return (NULL); } static pool_property_t pool_prop_sys[] = { { "system.name", DATA_TYPE_STRING, PP_RDWR }, { "system.comment", DATA_TYPE_STRING, PP_RDWR }, { "system.version", DATA_TYPE_UINT64, PP_READ }, { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR }, { "system.allocate-method", DATA_TYPE_STRING, PP_RDWR | PP_OPTIONAL }, { "system.poold.log-level", DATA_TYPE_STRING, PP_RDWR | PP_OPTIONAL }, { "system.poold.log-location", DATA_TYPE_STRING, PP_RDWR | PP_OPTIONAL }, { "system.poold.monitor-interval", DATA_TYPE_UINT64, PP_RDWR | PP_OPTIONAL }, { "system.poold.history-file", DATA_TYPE_STRING, PP_RDWR | PP_OPTIONAL }, { "system.poold.objectives", DATA_TYPE_STRING, PP_RDWR | PP_OPTIONAL }, { NULL, 0, 0 } }; static pool_property_t pool_prop_pool[] = { { "pool.sys_id", DATA_TYPE_UINT64, PP_READ }, { "pool.name", DATA_TYPE_STRING, PP_RDWR }, { "pool.default", DATA_TYPE_BYTE, PP_READ }, { "pool.active", DATA_TYPE_BYTE, PP_RDWR }, { "pool.importance", DATA_TYPE_INT64, PP_RDWR }, { "pool.comment", DATA_TYPE_STRING, PP_RDWR }, { "pool.scheduler", DATA_TYPE_STRING, PP_RDWR | PP_OPTIONAL }, { NULL, 0, 0 } }; /* * Common routine to put new property on the specified list */ int pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props) { pool_property_t *prop; if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) { /* * No read-only properties or properties with bad types */ if (!(prop->pp_perm & PP_WRITE) || prop->pp_type != nvpair_type(pair)) return (EINVAL); } return (nvlist_add_nvpair(nvlist, pair)); } /* * Common routine to remove property from the given list */ int pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props) { pool_property_t *prop; if ((prop = pool_property_find(name, props)) != NULL) { if (!(prop->pp_perm & PP_OPTIONAL)) return (EINVAL); } return (nvlist_remove_all(nvlist, name)); } static int pool_system_propput(nvpair_t *pair) { int ret; ASSERT(pool_lock_held()); ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys); if (ret == 0) pool_sys_mod = gethrtime(); return (ret); } static int pool_system_proprm(char *name) { int ret; ASSERT(pool_lock_held()); ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys); if (ret == 0) pool_sys_mod = gethrtime(); return (ret); } static int pool_pool_propput(poolid_t poolid, nvpair_t *pair) { pool_t *pool; int ret; ASSERT(pool_lock_held()); if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) return (ESRCH); ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool); if (ret == 0) pool_pool_mod = gethrtime(); return (ret); } static int pool_pool_proprm(poolid_t poolid, char *name) { int ret; pool_t *pool; ASSERT(pool_lock_held()); if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) return (ESRCH); ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool); if (ret == 0) pool_pool_mod = gethrtime(); return (ret); } int pool_propput(int class, int subclass, id_t id, nvpair_t *pair) { int ret; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (class) { case PEC_SYSTEM: ret = pool_system_propput(pair); break; case PEC_POOL: ret = pool_pool_propput((poolid_t)id, pair); break; case PEC_RES_COMP: switch (subclass) { case PREC_PSET: ret = pool_pset_propput((psetid_t)id, pair); break; default: ret = EINVAL; } break; case PEC_RES_AGG: ret = ENOTSUP; break; case PEC_COMP: switch (subclass) { case PCEC_CPU: ret = pool_cpu_propput((processorid_t)id, pair); break; default: ret = EINVAL; } break; default: ret = EINVAL; } return (ret); } int pool_proprm(int class, int subclass, id_t id, char *name) { int ret; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); switch (class) { case PEC_SYSTEM: ret = pool_system_proprm(name); break; case PEC_POOL: ret = pool_pool_proprm((poolid_t)id, name); break; case PEC_RES_COMP: switch (subclass) { case PREC_PSET: ret = pool_pset_proprm((psetid_t)id, name); break; default: ret = EINVAL; } break; case PEC_RES_AGG: ret = ENOTSUP; break; case PEC_COMP: switch (subclass) { case PCEC_CPU: ret = pool_cpu_proprm((processorid_t)id, name); break; default: ret = EINVAL; } break; default: ret = EINVAL; } return (ret); } int pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp) { int ret; nvlist_t *nvl; ASSERT(pool_lock_held()); if (pool_state == POOL_DISABLED) return (ENOTACTIVE); (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); switch (class) { case PEC_SYSTEM: case PEC_POOL: ret = EINVAL; break; case PEC_RES_COMP: switch (subclass) { case PREC_PSET: ret = pool_pset_propget((psetid_t)id, name, nvl); break; default: ret = EINVAL; } break; case PEC_RES_AGG: ret = ENOTSUP; break; case PEC_COMP: switch (subclass) { case PCEC_CPU: ret = pool_cpu_propget((processorid_t)id, name, nvl); break; default: ret = EINVAL; } break; default: ret = EINVAL; } if (ret == 0) *nvlp = nvl; else nvlist_free(nvl); return (ret); } /* * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs * in case of failure in pool_do_bind(). */ static void pool_bind_wake(proc_t *p) { ASSERT(pool_lock_held()); mutex_enter(&p->p_lock); ASSERT(p->p_poolflag & PBWAIT); if (p->p_poolcnt > 0) { mutex_enter(&pool_barrier_lock); pool_barrier_count -= p->p_poolcnt; mutex_exit(&pool_barrier_lock); } p->p_poolflag &= ~PBWAIT; cv_signal(&p->p_poolcv); mutex_exit(&p->p_lock); } static void pool_bind_wakeall(proc_t **procs) { proc_t *p, **pp; ASSERT(pool_lock_held()); for (pp = procs; (p = *pp) != NULL; pp++) pool_bind_wake(p); } /* * Return the scheduling class for this pool, or * POOL_CLASS_UNSET if not set * POOL_CLASS_INVAL if set to an invalid class ID. */ id_t pool_get_class(pool_t *pool) { char *name; id_t cid; ASSERT(pool_lock_held()); if (nvlist_lookup_string(pool->pool_props, "pool.scheduler", &name) == 0) { if (getcidbyname(name, &cid) == 0) return (cid); else return (POOL_CLASS_INVAL); } return (POOL_CLASS_UNSET); } /* * Move process to the new scheduling class. */ static void pool_change_class(proc_t *p, id_t cid) { kthread_t *t; void *cldata; id_t oldcid; void **bufs; void **buf; int nlwp; int ret; int i; /* * Do not move kernel processes (such as zsched). */ if (p->p_flag & SSYS) return; /* * This process is in the pool barrier, so it can't possibly be * adding new threads and we can use p_lwpcnt + p_zombcnt + 1 * (for possible agent LWP which doesn't use pool barrier) as * our upper bound. */ nlwp = p->p_lwpcnt + p->p_zombcnt + 1; /* * Pre-allocate scheduling class specific buffers before * grabbing p_lock. */ bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP); for (i = 0, buf = bufs; i < nlwp; i++, buf++) { ret = CL_ALLOC(buf, cid, KM_SLEEP); ASSERT(ret == 0); } /* * Move threads one by one to the new scheduling class. * This never fails because we have all the right * privileges here. */ mutex_enter(&p->p_lock); ASSERT(p->p_poolflag & PBWAIT); buf = bufs; t = p->p_tlist; ASSERT(t != NULL); do { if (t->t_cid != cid) { oldcid = t->t_cid; cldata = t->t_cldata; ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf); ASSERT(ret == 0); CL_EXITCLASS(oldcid, cldata); schedctl_set_cidpri(t); *buf++ = NULL; } } while ((t = t->t_forw) != p->p_tlist); mutex_exit(&p->p_lock); /* * Free unused scheduling class specific buffers. */ for (i = 0, buf = bufs; i < nlwp; i++, buf++) { if (*buf != NULL) { CL_FREE(cid, *buf); *buf = NULL; } } kmem_free(bufs, nlwp * sizeof (void *)); } void pool_get_name(pool_t *pool, char **name) { ASSERT(pool_lock_held()); (void) nvlist_lookup_string(pool->pool_props, "pool.name", name); ASSERT(strlen(*name) != 0); } /* * The meat of the bind operation. The steps in pool_do_bind are: * * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all * such processes to an array. For any interesting process that has * threads inside the pool barrier set, increment a counter by the * count of such threads. Once PBWAIT is set on a process, that process * will not disappear. * * 2) Wait for the counter from step 2 to drop to zero. Any process which * calls pool_barrier_exit() and notices that PBWAIT has been set on it * will decrement that counter before going to sleep, and the process * calling pool_barrier_exit() which does the final decrement will wake us. * * 3) For each interesting process, perform a calculation on it to see if * the bind will actually succeed. This uses the following three * resource-set-specific functions: * * - int set_bind_start(procs, pool) * * Determine whether the given array of processes can be bound to the * resource set associated with the given pool. If it can, take and hold * any locks necessary to ensure that the operation will succeed, and * make any necessary reservations in the target resource set. If it * can't, return failure with no reservations made and no new locks held. * * - void set_bind_abort(procs, pool) * * set_bind_start() has completed successfully, but another resource set's * set_bind_start() has failed, and we haven't begun the bind yet. Undo * any reservations made and drop any locks acquired by our * set_bind_start(). * * - void set_bind_finish(void) * * The bind has completed successfully. The processes have been released, * and the reservation acquired in set_bind_start() has been depleted as * the processes have finished their bindings. Drop any locks acquired by * set_bind_start(). * * 4) If we've decided that we can proceed with the bind, iterate through * the list of interesting processes, grab the necessary locks (which * may differ per resource set), perform the bind, and ASSERT that it * succeeds. Once a process has been rebound, it can be awakened. * * The operations from step 4 must be kept in sync with anything which might * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and * are thus located in the same source files as the associated bind operations. */ int pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) { extern uint_t nproc; klwp_t *lwp = ttolwp(curthread); proc_t **pp, **procs; proc_t *prstart; int procs_count = 0; kproject_t *kpj = NULL; procset_t set; zone_t *zone = NULL; int procs_size; int rv = 0; proc_t *p; id_t cid = -1; ASSERT(pool_lock_held()); if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL) return (EINVAL); if (idtype == P_ZONEID) { zone = zone_find_by_id(id); if (zone == NULL) return (ESRCH); if (zone_status_get(zone) > ZONE_IS_RUNNING) { zone_rele(zone); return (EBUSY); } } if (idtype == P_PROJID) { kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); if (kpj == NULL) return (ESRCH); mutex_enter(&kpj->kpj_poolbind); } if (idtype == P_PID) { /* * Fast-path for a single process case. */ procs_size = 2; /* procs is NULL-terminated */ procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); mutex_enter(&pidlock); } else { /* * We will need enough slots for proc_t pointers for as many as * twice the number of currently running processes (assuming * that each one could be in fork() creating a new child). */ for (;;) { procs_size = nproc * 2; procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); mutex_enter(&pidlock); if (nproc * 2 <= procs_size) break; /* * If nproc has changed, try again. */ mutex_exit(&pidlock); kmem_free(procs, procs_size * sizeof (proc_t *)); } } if (id == P_MYID) id = getmyid(idtype); setprocset(&set, POP_AND, idtype, id, P_ALL, 0); /* * Do a first scan, and select target processes. */ if (idtype == P_PID) prstart = prfind(id); else prstart = practive; for (p = prstart, pp = procs; p != NULL; p = p->p_next) { mutex_enter(&p->p_lock); /* * Skip processes that don't match our (id, idtype) set or * on the way of becoming zombies. Skip kernel processes * from the global zone. */ if (procinset(p, &set) == 0 || p->p_poolflag & PEXITED || ((p->p_flag & SSYS) && INGLOBALZONE(p))) { mutex_exit(&p->p_lock); continue; } if (!INGLOBALZONE(p)) { switch (idtype) { case P_PID: case P_TASKID: default: /* * Can't bind processes or tasks in local zones * to pools. Also catch all remaining types of * idtype_t that should already have been * filtered out. */ mutex_exit(&p->p_lock); mutex_exit(&pidlock); pool_bind_wakeall(procs); rv = EINVAL; goto out; case P_PROJID: /* * Only projects in the global * zone can be rebound. */ mutex_exit(&p->p_lock); continue; case P_POOLID: /* * When rebinding pools, processes can be * in different zones. */ break; } } p->p_poolflag |= PBWAIT; /* * If some threads in this process are inside the pool * barrier, add them to pool_barrier_count, as we have * to wait for all of them to exit the barrier. */ if (p->p_poolcnt > 0) { mutex_enter(&pool_barrier_lock); pool_barrier_count += p->p_poolcnt; mutex_exit(&pool_barrier_lock); } ASSERT(pp < &procs[procs_size]); *pp++ = p; procs_count++; mutex_exit(&p->p_lock); /* * We just found our process, so if we're only rebinding a * single process then get out of this loop. */ if (idtype == P_PID) break; } *pp = NULL; /* cap off the end of the array */ mutex_exit(&pidlock); /* * Wait for relevant processes to stop before they try to enter the * barrier or at the exit from the barrier. Make sure that we do * not get stopped here while we're holding pool_lock. If we were * requested to stop, or got a signal then return EAGAIN to let the * library know that it needs to retry. */ mutex_enter(&pool_barrier_lock); lwp->lwp_nostop++; while (pool_barrier_count > 0) { (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock); if (pool_barrier_count > 0) { /* * We either got a signal or were requested to * stop by /proc. Bail out with EAGAIN. If we were * requested to stop, we'll stop in post_syscall() * on our way back to userland. */ mutex_exit(&pool_barrier_lock); pool_bind_wakeall(procs); lwp->lwp_nostop--; rv = EAGAIN; goto out; } } lwp->lwp_nostop--; mutex_exit(&pool_barrier_lock); if (idtype == P_PID) { if ((p = *procs) == NULL) goto skip; mutex_enter(&p->p_lock); /* Drop the process if it is exiting */ if (p->p_poolflag & PEXITED) { mutex_exit(&p->p_lock); pool_bind_wake(p); procs_count--; } else mutex_exit(&p->p_lock); goto skip; } /* * Do another run, and drop processes that were inside the barrier * in exit(), but when they have dropped to pool_barrier_exit * they have become of no interest to us. Pick up child processes that * were created by fork() but didn't exist during our first scan. * Their parents are now stopped at pool_barrier_exit in cfork(). */ mutex_enter(&pidlock); for (pp = procs; (p = *pp) != NULL; pp++) { mutex_enter(&p->p_lock); if (p->p_poolflag & PEXITED) { ASSERT(p->p_lwpcnt == 0); mutex_exit(&p->p_lock); pool_bind_wake(p); /* flip w/last non-NULL slot */ *pp = procs[procs_count - 1]; procs[procs_count - 1] = NULL; procs_count--; pp--; /* try this slot again */ continue; } else mutex_exit(&p->p_lock); /* * Look at the child and check if it should be rebound also. * We're holding pidlock, so it is safe to reference p_child. */ if ((p = p->p_child) == NULL) continue; mutex_enter(&p->p_lock); /* * Skip system processes and make sure that the child is in * the same task/project/pool/zone as the parent. */ if ((!INGLOBALZONE(p) && idtype != P_ZONEID && idtype != P_POOLID) || p->p_flag & SSYS) { mutex_exit(&p->p_lock); continue; } /* * If the child process has been already created by fork(), has * not exited, and has not been added to the list already, * then add it now. We will hit this process again (since we * stick it at the end of the procs list) but it will ignored * because it will have the PBWAIT flag set. */ if (procinset(p, &set) && !(p->p_poolflag & PEXITED) && !(p->p_poolflag & PBWAIT)) { ASSERT(p->p_child == NULL); /* no child of a child */ procs[procs_count] = p; procs[procs_count + 1] = NULL; procs_count++; p->p_poolflag |= PBWAIT; } mutex_exit(&p->p_lock); } mutex_exit(&pidlock); skip: /* * If there's no processes to rebind then return ESRCH, unless * we're associating a pool with new resource set, destroying it, * or binding a zone to a pool. */ if (procs_count == 0) { if (idtype == P_POOLID || idtype == P_ZONEID) rv = 0; else rv = ESRCH; goto out; } #ifdef DEBUG /* * All processes in the array should have PBWAIT set, and none * should be in the critical section. Thus, although p_poolflag * and p_poolcnt are protected by p_lock, their ASSERTions below * should be stable without it. procinset(), however, ASSERTs that * the p_lock is held upon entry. */ for (pp = procs; (p = *pp) != NULL; pp++) { int in_set; mutex_enter(&p->p_lock); in_set = procinset(p, &set); mutex_exit(&p->p_lock); ASSERT(in_set); ASSERT(p->p_poolflag & PBWAIT); ASSERT(p->p_poolcnt == 0); } #endif /* * Do the check if processor set rebinding is going to succeed or not. */ if ((flags & POOL_BIND_PSET) && (rv = pset_bind_start(procs, pool)) != 0) { pool_bind_wakeall(procs); goto out; } /* * At this point, all bind operations should succeed. */ for (pp = procs; (p = *pp) != NULL; pp++) { if (flags & POOL_BIND_PSET) { psetid_t psetid = pool->pool_pset->pset_id; void *zonebuf; void *projbuf; /* * Pre-allocate one buffer for FSS (per-project * buffer for a new pset) in case if this is the * first thread from its current project getting * bound to this processor set. */ projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ); zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE); mutex_enter(&pidlock); mutex_enter(&p->p_lock); pool_pset_bind(p, psetid, projbuf, zonebuf); mutex_exit(&p->p_lock); mutex_exit(&pidlock); /* * Free buffers pre-allocated above if it * wasn't actually used. */ fss_freebuf(projbuf, FSS_ALLOC_PROJ); fss_freebuf(zonebuf, FSS_ALLOC_ZONE); } /* * Now let's change the scheduling class of this * process if our target pool has it defined. */ if (cid != POOL_CLASS_UNSET) pool_change_class(p, cid); /* * It is safe to reference p_pool here without holding * p_lock because it cannot change underneath of us. * We're holding pool_lock here, so nobody else can be * moving this process between pools. If process "p" * would be exiting, we're guaranteed that it would be blocked * at pool_barrier_enter() in exit(). Otherwise, it would've * been skipped by one of our scans of the practive list * as a process with PEXITED flag set. */ if (p->p_pool != pool) { ASSERT(p->p_pool->pool_ref > 0); atomic_dec_32(&p->p_pool->pool_ref); p->p_pool = pool; atomic_inc_32(&p->p_pool->pool_ref); } /* * Okay, we've tortured this guy enough. * Let this poor process go now. */ pool_bind_wake(p); } if (flags & POOL_BIND_PSET) pset_bind_finish(); out: switch (idtype) { case P_PROJID: ASSERT(kpj != NULL); mutex_exit(&kpj->kpj_poolbind); project_rele(kpj); break; case P_ZONEID: if (rv == 0) { mutex_enter(&cpu_lock); zone_pool_set(zone, pool); mutex_exit(&cpu_lock); } zone->zone_pool_mod = gethrtime(); zone_rele(zone); break; default: break; } kmem_free(procs, procs_size * sizeof (proc_t *)); ASSERT(pool_barrier_count == 0); return (rv); } void pool_event_cb_register(pool_event_cb_t *cb) { ASSERT(!pool_lock_held() || panicstr); ASSERT(cb->pec_func != NULL); mutex_enter(&pool_event_cb_lock); if (!pool_event_cb_init) { list_create(&pool_event_cb_list, sizeof (pool_event_cb_t), offsetof(pool_event_cb_t, pec_list)); pool_event_cb_init = B_TRUE; } list_insert_tail(&pool_event_cb_list, cb); mutex_exit(&pool_event_cb_lock); } void pool_event_cb_unregister(pool_event_cb_t *cb) { ASSERT(!pool_lock_held() || panicstr); mutex_enter(&pool_event_cb_lock); list_remove(&pool_event_cb_list, cb); mutex_exit(&pool_event_cb_lock); } typedef struct { pool_event_t tqd_what; poolid_t tqd_id; } pool_tqd_t; void pool_event_notify(void *arg) { pool_tqd_t *tqd = (pool_tqd_t *)arg; pool_event_cb_t *cb; ASSERT(!pool_lock_held() || panicstr); mutex_enter(&pool_event_cb_lock); for (cb = list_head(&pool_event_cb_list); cb != NULL; cb = list_next(&pool_event_cb_list, cb)) { cb->pec_func(tqd->tqd_what, tqd->tqd_id, cb->pec_arg); } mutex_exit(&pool_event_cb_lock); kmem_free(tqd, sizeof (*tqd)); } void pool_event_dispatch(pool_event_t what, poolid_t id) { pool_tqd_t *tqd = NULL; ASSERT(pool_lock_held()); if (pool_event_cb_taskq == NULL) { pool_event_cb_taskq = taskq_create("pool_event_cb_taskq", 1, -1, 1, 1, TASKQ_PREPOPULATE); } tqd = kmem_alloc(sizeof (*tqd), KM_SLEEP); tqd->tqd_what = what; tqd->tqd_id = id; (void) taskq_dispatch(pool_event_cb_taskq, pool_event_notify, tqd, KM_SLEEP); }