/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/turnstile.h>
#include <sys/rwlock.h>
#include <sys/rwlock_impl.h>
#include <sys/atomic.h>
#include <sys/lockstat.h>

/*
 * Big Theory Statement for readers/writer locking primitives.
 *
 * An rwlock provides exclusive access to a single thread ("writer") or
 * concurrent access to multiple threads ("readers").  See rwlock(9F)
 * for a full description of the interfaces and programming model.
 * The rest of this comment describes the implementation.
 *
 * An rwlock is a single word with the following structure:
 *
 *	---------------------------------------------------------------------
 *	| OWNER (writer) or HOLD COUNT (readers)   | WRLOCK | WRWANT | WAIT |
 *	---------------------------------------------------------------------
 *			63 / 31 .. 3			2	 1	  0
 *
 * The waiters bit (0) indicates whether any threads are blocked waiting
 * for the lock.  The write-wanted bit (1) indicates whether any threads
 * are blocked waiting for write access.  The write-locked bit (2) indicates
 * whether the lock is held by a writer, which determines whether the upper
 * bits (3..31 in ILP32, 3..63 in LP64) should be interpreted as the owner
 * (thread pointer) or the hold count (number of readers).
 *
 * In the absence of any contention, a writer gets the lock by setting
 * this word to (curthread | RW_WRITE_LOCKED); a reader gets the lock
 * by incrementing the hold count (i.e. adding 8, aka RW_READ_LOCK).
 *
 * A writer will fail to acquire the lock if any other thread owns it.
 * A reader will fail if the lock is either owned (in the RW_READER and
 * RW_READER_STARVEWRITER cases) or wanted by a writer (in the RW_READER
 * case).  rw_tryenter() returns 0 in these cases; rw_enter() blocks until
 * the lock becomes available.
 *
 * When a thread blocks it acquires the rwlock's hashed turnstile lock and
 * attempts to set RW_HAS_WAITERS (and RW_WRITE_WANTED in the writer case)
 * atomically *only if the lock still appears busy*.  A thread must never
 * accidentally block for an available lock since there would be no owner
 * to awaken it.  casip() provides the required atomicity.  Once casip()
 * succeeds, the decision to block becomes final and irreversible.  The
 * thread will not become runnable again until it has been granted ownership
 * of the lock via direct handoff from a former owner as described below.
 *
 * In the absence of any waiters, rw_exit() just clears the lock (if it
 * is write-locked) or decrements the hold count (if it is read-locked).
 * Note that even if waiters are present, decrementing the hold count
 * to a non-zero value requires no special action since the lock is still
 * held by at least one other thread.
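 *
 * As an illustration only -- the uncontended fast paths live in the
 * optimized assembly versions of rw_enter() and rw_exit(), not in this
 * file -- an uncontended RW_READER acquisition amounts to the following
 * sketch, which mirrors the reader path of rw_tryenter() below:
 *
 *	uintptr_t old = lp->rw_wwwh;
 *	if (!(old & RW_WRITE_CLAIMED) &&
 *	    casip(&lp->rw_wwwh, old, old + RW_READ_LOCK) == old)
 *		return;					(lock acquired)
 *	rw_enter_sleep(lp, RW_READER);			(slow path)
 *
 * and an uncontended RW_WRITER acquisition to:
 *
 *	if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) == 0)
 *		return;					(lock acquired)
 *	rw_enter_sleep(lp, RW_WRITER);			(slow path)
 *
 * (Memory ordering is elided in the sketch; the real paths issue
 * membar_enter() on acquisition and membar_exit() before release.)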
 *
 * On the "final exit" (transition to unheld state) of a lock with waiters,
 * rw_exit_wakeup() grabs the turnstile lock and transfers ownership directly
 * to the next writer or set of readers.  There are several advantages to this
 * approach: (1) it closes all windows for priority inversion (when a new
 * writer has grabbed the lock but has not yet inherited from blocked readers);
 * (2) it prevents starvation of equal-priority threads by granting the lock
 * in FIFO order; (3) it eliminates the need for a write-wanted count -- a
 * single bit suffices because the lock remains held until all waiting
 * writers are gone; (4) when we awaken N readers we can perform a single
 * "atomic_add(&x, N)" to set the total hold count rather than having all N
 * threads fight for the cache to perform an "atomic_add(&x, 1)" upon wakeup.
 *
 * The most interesting policy decision in rw_exit_wakeup() is which thread
 * to wake.  Starvation is always possible with priority-based scheduling,
 * but any sane wakeup policy should at least satisfy these requirements:
 *
 * (1)	The highest-priority thread in the system should not starve.
 * (2)	The highest-priority writer should not starve.
 * (3)	No writer should starve due to lower-priority threads.
 * (4)	No reader should starve due to lower-priority writers.
 * (5)	If all threads have equal priority, none of them should starve.
 *
 * We used to employ a writers-always-win policy, which doesn't even
 * satisfy (1): a steady stream of low-priority writers can starve out
 * a real-time reader!  This is clearly a broken policy -- it violates
 * (1), (4), and (5) -- but it's how rwlocks always used to behave.
 *
 * A round-robin policy (exiting readers grant the lock to blocked writers
 * and vice versa) satisfies all but (3): a single high-priority writer
 * and many low-priority readers can starve out medium-priority writers.
 *
 * A strict priority policy (grant the lock to the highest priority blocked
 * thread) satisfies everything but (2): a steady stream of high-priority
 * readers can permanently starve the highest-priority writer.
 *
 * The reason we care about (2) is that it's important to process writers
 * reasonably quickly -- even if they're low priority -- because their very
 * presence causes all readers to take the slow (blocking) path through this
 * code.  There is also a general sense that writers deserve some degree of
 * deference because they're updating the data upon which all readers act.
 * Presumably this data should not be allowed to become arbitrarily stale
 * due to writer starvation.  Finally, it seems reasonable to level the
 * playing field a bit to compensate for the fact that it's so much harder
 * for a writer to get in when there are already many readers present.
 *
 * A hybrid of round-robin and strict priority can be made to satisfy
 * all five criteria.  In this "writer priority policy" exiting readers
 * always grant the lock to waiting writers, but exiting writers only
 * grant the lock to readers of the same or higher priority than the
 * highest-priority blocked writer.  Thus requirement (2) is satisfied,
 * necessarily, by a willful act of priority inversion: an exiting reader
 * will grant the lock to a blocked writer even if there are blocked
 * readers of higher priority.  The situation is mitigated by the fact
 * that writers always inherit priority from blocked readers, and the
 * writer will awaken those readers as soon as it exits the lock.
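 *
 * Concretely, an exiting writer's choice mirrors rw_readers_to_wake()
 * below.  A sketch, with the turnstile bookkeeping elided:
 *
 *	wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1;
 *	for (t = first_blocked_reader; t != NULL; t = t->t_link) {
 *		if (DISP_PRIO(t) < wpri)
 *			break;
 *		nreaders++;
 *	}
 *
 * If nreaders is non-zero, those readers are all awakened together;
 * otherwise the lock is handed directly to the next blocked writer.
 * An exiting reader, by contrast, always grants the lock to a blocked
 * writer if one exists.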
 *
 * Finally, note that this hybrid scheme -- and indeed, any scheme that
 * satisfies requirement (2) -- has an important consequence:  if a lock is
 * held as reader and a writer subsequently becomes blocked, any further
 * readers must be blocked to avoid writer starvation.  This implementation
 * detail has ramifications for the semantics of rwlocks, as it prohibits
 * recursively acquiring an rwlock as reader: any writer that wishes to
 * acquire the lock after the first but before the second acquisition as
 * reader will block the second acquisition -- resulting in deadlock.  This
 * itself is not necessarily prohibitive, as it is often straightforward to
 * prevent a single thread from recursively acquiring an rwlock as reader.
 * However, a more subtle situation arises when both a traditional mutex and
 * a reader lock are acquired by two different threads in opposite order.
 * (That is, one thread first acquires the mutex and then the rwlock as
 * reader; the other acquires the rwlock as reader and then the mutex.)  As
 * with the single threaded case, this is fine absent a blocked writer: the
 * thread that acquires the mutex before acquiring the rwlock as reader will
 * be able to successfully acquire the rwlock -- even as/if the other thread
 * has the rwlock as reader and is blocked on the held mutex.  However, if
 * an unrelated writer (that is, a third thread) becomes blocked on the
 * rwlock after the first thread acquires the rwlock as reader but before
 * it's able to acquire the mutex, the second thread -- with the mutex held
 * -- will not be able to acquire the rwlock as reader due to the waiting
 * writer, deadlocking the three threads.  Unlike the single-threaded
 * (recursive) rwlock acquisition case, this case can be quite a bit
 * thornier to fix, especially as there is nothing inherently wrong in the
 * locking strategy: the deadlock is really induced by requirement (2), not
 * the consumers of the rwlock.  To permit such consumers, we allow rwlock
 * acquirers to explicitly opt out of requirement (2) by specifying
 * RW_READER_STARVEWRITER when acquiring the rwlock.  This (obviously) means
 * that infinite readers can starve writers, but it also allows for
 * multiple readers in the presence of other synchronization primitives
 * without regard for lock-ordering.  And while certainly odd (and perhaps
 * unwise), RW_READER_STARVEWRITER can be safely used alongside RW_READER on
 * the same lock -- RW_READER_STARVEWRITER describes only the act of lock
 * acquisition with respect to waiting writers, not the lock itself.
 *
 * rw_downgrade() follows the same wakeup policy as an exiting writer.
 *
 * rw_tryupgrade() has the same failure mode as rw_tryenter() for a
 * write lock.  Both honor the WRITE_WANTED bit by specification.
 *
 * The following rules apply to manipulation of rwlock internal state:
 *
 * (1)	The rwlock is only modified via the atomic primitives casip()
 *	and atomic_add_ip().
 *
 * (2)	The waiters bit and write-wanted bit are only modified under
 *	turnstile_lookup().  This ensures that the turnstile is consistent
 *	with the rwlock.
 *
 * (3)	Waiters receive the lock by direct handoff from the previous
 *	owner.  Therefore, waiters *always* wake up holding the lock.
 */

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of a given type.
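 * For rwlocks these are rw_owner() (which reports the owning thread, and
 * is meaningful only when the lock is write-held), turnstile_stay_asleep()
 * (a waiter must not be set running prematurely, since per rule (3) above
 * it always receives the lock by direct handoff), and turnstile_change_pri()
 * (which handles priority changes of blocked threads).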
 */
static sobj_ops_t rw_sobj_ops = {
	SOBJ_RWLOCK, rw_owner, turnstile_stay_asleep, turnstile_change_pri
};

/*
 * If the system panics on an rwlock, save the address of the offending
 * rwlock in panic_rwlock_addr, and save the contents in panic_rwlock.
 */
static rwlock_impl_t panic_rwlock;
static rwlock_impl_t *panic_rwlock_addr;

static void
rw_panic(char *msg, rwlock_impl_t *lp)
{
	if (panicstr)
		return;

	if (atomic_cas_ptr(&panic_rwlock_addr, NULL, lp) == NULL)
		panic_rwlock = *lp;

	panic("%s, lp=%p wwwh=%lx thread=%p",
	    msg, (void *)lp, panic_rwlock.rw_wwwh, (void *)curthread);
}

/* ARGSUSED */
void
rw_init(krwlock_t *rwlp, char *name, krw_type_t type, void *arg)
{
	((rwlock_impl_t *)rwlp)->rw_wwwh = 0;
}

void
rw_destroy(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	if (lp->rw_wwwh != 0) {
		if ((lp->rw_wwwh & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK)
			rw_panic("rw_destroy: lock already destroyed", lp);
		else
			rw_panic("rw_destroy: lock still active", lp);
	}

	lp->rw_wwwh = RW_DOUBLE_LOCK;
}

/*
 * Verify that an rwlock is held correctly.
 */
static int
rw_locked(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old = lp->rw_wwwh;

	if (rw == RW_READER || rw == RW_READER_STARVEWRITER)
		return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));

	if (rw == RW_WRITER)
		return ((old & RW_OWNER) == (uintptr_t)curthread);

	return (0);
}

uint_t (*rw_lock_backoff)(uint_t) = NULL;
void (*rw_lock_delay)(uint_t) = NULL;

/*
 * Full-service implementation of rw_enter() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 */
void
rw_enter_sleep(rwlock_impl_t *lp, krw_t rw)
{
	uintptr_t old, new, lock_value, lock_busy, lock_wait;
	hrtime_t sleep_time;
	turnstile_t *ts;
	uint_t backoff = 0;
	int loop_count = 0;

	if (rw == RW_READER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_CLAIMED;
		lock_wait = RW_HAS_WAITERS;
	} else if (rw == RW_READER_STARVEWRITER) {
		lock_value = RW_READ_LOCK;
		lock_busy = RW_WRITE_LOCKED;
		lock_wait = RW_HAS_WAITERS;
	} else {
		lock_value = RW_WRITE_LOCK(curthread);
		lock_busy = (uintptr_t)RW_LOCKED;
		lock_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
	}

	for (;;) {
		if (((old = lp->rw_wwwh) & lock_busy) == 0) {
			if (casip(&lp->rw_wwwh, old, old + lock_value) != old) {
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		if (panicstr)
			return;

		if ((old & RW_DOUBLE_LOCK) == RW_DOUBLE_LOCK) {
			rw_panic("rw_enter: bad rwlock", lp);
			return;
		}

		if ((old & RW_OWNER) == (uintptr_t)curthread) {
			rw_panic("recursive rw_enter", lp);
			return;
		}

		ts = turnstile_lookup(lp);

		do {
			if (((old = lp->rw_wwwh) & lock_busy) == 0)
				break;
			new = old | lock_wait;
		} while (old != new && casip(&lp->rw_wwwh, old, new) != old);

		if ((old & lock_busy) == 0) {
			/*
			 * The lock appears free now; try the dance again
			 */
			turnstile_exit(lp);
			continue;
		}

		/*
		 * We really are going to block, so bump the stats.
		 */
		ASSERT(lp->rw_wwwh & lock_wait);
		ASSERT(lp->rw_wwwh & RW_LOCKED);

		sleep_time = -gethrtime();
		if (rw != RW_WRITER) {
			CPU_STATS_ADDQ(CPU, sys, rw_rdfails, 1);
			(void) turnstile_block(ts, TS_READER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		} else {
			CPU_STATS_ADDQ(CPU, sys, rw_wrfails, 1);
			(void) turnstile_block(ts, TS_WRITER_Q, lp,
			    &rw_sobj_ops, NULL, NULL);
		}
		sleep_time += gethrtime();

		LOCKSTAT_RECORD4(LS_RW_ENTER_BLOCK, lp, sleep_time, rw,
		    (old & RW_WRITE_LOCKED) ? 1 : 0,
		    old >> RW_HOLD_COUNT_SHIFT);

		/*
		 * We wake up holding the lock via direct handoff from the
		 * previous owner.
		 */
		break;
	}

	ASSERT(rw_locked(lp, rw));

	membar_enter();

	LOCKSTAT_RECORD(LS_RW_ENTER_ACQUIRE, lp, rw);
}

/*
 * Return the number of readers to wake, or zero if we should wake a writer.
 * Called only by exiting/downgrading writers (readers don't wake readers).
 */
static int
rw_readers_to_wake(turnstile_t *ts)
{
	kthread_t *next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;
	kthread_t *next_reader = ts->ts_sleepq[TS_READER_Q].sq_first;
	pri_t wpri = (next_writer != NULL) ? DISP_PRIO(next_writer) : -1;
	int count = 0;

	while (next_reader != NULL) {
		if (DISP_PRIO(next_reader) < wpri)
			break;
		next_reader = next_reader->t_link;
		count++;
	}
	return (count);
}

/*
 * Full-service implementation of rw_exit() to handle all the hard cases.
 * Called from the assembly version if anything complicated is going on.
 * There is no semantic difference between calling rw_exit() and calling
 * rw_exit_wakeup() directly.
 */
void
rw_exit_wakeup(rwlock_impl_t *lp)
{
	turnstile_t *ts;
	uintptr_t old, new, lock_value;
	kthread_t *next_writer;
	int nreaders;
	uint_t backoff = 0;
	int loop_count = 0;

	membar_exit();

	old = lp->rw_wwwh;
	if (old & RW_WRITE_LOCKED) {
		if ((old & RW_OWNER) != (uintptr_t)curthread) {
			rw_panic("rw_exit: not owner", lp);
			lp->rw_wwwh = 0;
			return;
		}
		lock_value = RW_WRITE_LOCK(curthread);
	} else {
		if ((old & RW_LOCKED) == 0) {
			rw_panic("rw_exit: lock not held", lp);
			return;
		}
		lock_value = RW_READ_LOCK;
	}

	for (;;) {
		/*
		 * If this is *not* the final exit of a lock with waiters,
		 * just drop the lock -- there's nothing tricky going on.
		 */
		old = lp->rw_wwwh;
		new = old - lock_value;
		if ((new & (RW_LOCKED | RW_HAS_WAITERS)) != RW_HAS_WAITERS) {
			if (casip(&lp->rw_wwwh, old, new) != old) {
				if (rw_lock_delay != NULL) {
					backoff = rw_lock_backoff(backoff);
					rw_lock_delay(backoff);
					if (++loop_count == ncpus_online) {
						backoff = 0;
						loop_count = 0;
					}
				}
				continue;
			}
			break;
		}

		/*
		 * This appears to be the final exit of a lock with waiters.
		 * If we do not have the lock as writer (that is, if this is
		 * the last exit of a reader with waiting writers), we will
		 * grab the lock as writer to prevent additional readers.
		 * (This is required because a reader that is acquiring the
		 * lock via RW_READER_STARVEWRITER will not observe the
		 * RW_WRITE_WANTED bit -- and we could therefore be racing
		 * with such readers here.)
		 */
		if (!(old & RW_WRITE_LOCKED)) {
			new = RW_WRITE_LOCK(curthread) |
			    RW_HAS_WAITERS | RW_WRITE_WANTED;

			if (casip(&lp->rw_wwwh, old, new) != old)
				continue;
		}

		/*
		 * Perform the final exit of a lock that has waiters.
		 */
		ts = turnstile_lookup(lp);

		next_writer = ts->ts_sleepq[TS_WRITER_Q].sq_first;

		if ((old & RW_WRITE_LOCKED) &&
		    (nreaders = rw_readers_to_wake(ts)) > 0) {
			/*
			 * Don't drop the lock -- just set the hold count
			 * such that we grant the lock to all readers at once.
			 */
			new = nreaders * RW_READ_LOCK;
			if (ts->ts_waiters > nreaders)
				new |= RW_HAS_WAITERS;
			if (next_writer)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
		} else {
			/*
			 * Don't drop the lock -- just transfer ownership
			 * directly to next_writer.  Note that there must
			 * be at least one waiting writer, because we get
			 * here only if (A) the lock is read-locked or
			 * (B) there are no waiting readers.  In case (A),
			 * since the lock is read-locked there would be no
			 * reason for other readers to have blocked unless
			 * the RW_WRITE_WANTED bit was set.  In case (B),
			 * since there are waiters but no waiting readers,
			 * they must all be waiting writers.
			 */
			ASSERT(lp->rw_wwwh & RW_WRITE_WANTED);
			new = RW_WRITE_LOCK(next_writer);
			if (ts->ts_waiters > 1)
				new |= RW_HAS_WAITERS;
			if (next_writer->t_link)
				new |= RW_WRITE_WANTED;
			lp->rw_wwwh = new;
			membar_enter();
			turnstile_wakeup(ts, TS_WRITER_Q, 1, next_writer);
		}
		break;
	}

	if (lock_value == RW_READ_LOCK) {
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_READER);
	} else {
		LOCKSTAT_RECORD(LS_RW_EXIT_RELEASE, lp, RW_WRITER);
	}
}

int
rw_tryenter(krwlock_t *rwlp, krw_t rw)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old;

	if (rw != RW_WRITER) {
		uint_t backoff = 0;
		int loop_count = 0;
		for (;;) {
			if ((old = lp->rw_wwwh) & (rw == RW_READER ?
			    RW_WRITE_CLAIMED : RW_WRITE_LOCKED)) {
				return (0);
			}
			if (casip(&lp->rw_wwwh, old,
			    old + RW_READ_LOCK) == old)
				break;
			if (rw_lock_delay != NULL) {
				backoff = rw_lock_backoff(backoff);
				rw_lock_delay(backoff);
				if (++loop_count == ncpus_online) {
					backoff = 0;
					loop_count = 0;
				}
			}
		}
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	} else {
		if (casip(&lp->rw_wwwh, 0, RW_WRITE_LOCK(curthread)) != 0)
			return (0);
		LOCKSTAT_RECORD(LS_RW_TRYENTER_ACQUIRE, lp, rw);
	}
	ASSERT(rw_locked(lp, rw));

	membar_enter();

	return (1);
}

void
rw_downgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;

	membar_exit();

	if ((lp->rw_wwwh & RW_OWNER) != (uintptr_t)curthread) {
		rw_panic("rw_downgrade: not owner", lp);
		return;
	}

	if (atomic_add_ip_nv(&lp->rw_wwwh,
	    RW_READ_LOCK - RW_WRITE_LOCK(curthread)) & RW_HAS_WAITERS) {
		turnstile_t *ts = turnstile_lookup(lp);
		int nreaders = rw_readers_to_wake(ts);

		if (nreaders > 0) {
			uintptr_t delta = nreaders * RW_READ_LOCK;

			if (ts->ts_waiters == nreaders)
				delta -= RW_HAS_WAITERS;
			atomic_add_ip(&lp->rw_wwwh, delta);
		}

		turnstile_wakeup(ts, TS_READER_Q, nreaders, NULL);
	}
	ASSERT(rw_locked(lp, RW_READER));
	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, lp);
}

int
rw_tryupgrade(krwlock_t *rwlp)
{
	rwlock_impl_t *lp = (rwlock_impl_t *)rwlp;
	uintptr_t old, new;

	ASSERT(rw_locked(lp, RW_READER));

	do {
		if (((old = lp->rw_wwwh) & ~RW_HAS_WAITERS) != RW_READ_LOCK)
			return (0);
		new = old + RW_WRITE_LOCK(curthread) - RW_READ_LOCK;
	} while (casip(&lp->rw_wwwh, old, new) != old);

	membar_enter();

	LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, lp);

	ASSERT(rw_locked(lp, RW_WRITER));
	return (1);
}

int
rw_read_held(krwlock_t *rwlp)
{
	uintptr_t tmp;

	return (_RW_READ_HELD(rwlp, tmp));
}

int
rw_write_held(krwlock_t *rwlp)
{
	return (_RW_WRITE_HELD(rwlp));
}

int
rw_lock_held(krwlock_t *rwlp)
{
	return (_RW_LOCK_HELD(rwlp));
}

/*
 * Like rw_read_held(), but ASSERTs that the lock is currently held
 */
int
rw_read_locked(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	ASSERT(old & RW_LOCKED);

	return ((old & RW_LOCKED) && !(old & RW_WRITE_LOCKED));
}

/*
 * Returns non-zero if the lock is either held or desired by a writer
 */
int
rw_iswriter(krwlock_t *rwlp)
{
	return (_RW_ISWRITER(rwlp));
}

kthread_t *
rw_owner(krwlock_t *rwlp)
{
	uintptr_t old = ((rwlock_impl_t *)rwlp)->rw_wwwh;

	return ((old & RW_WRITE_LOCKED) ?
	    (kthread_t *)(old & RW_OWNER) : NULL);
}
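
/*
 * Usage sketch (see rwlock(9F) for the authoritative contract).  The lock
 * name and the data it protects are hypothetical, for illustration only:
 *
 *	krwlock_t foo_lock;
 *
 *	rw_init(&foo_lock, NULL, RW_DEFAULT, NULL);
 *
 *	rw_enter(&foo_lock, RW_READER);
 *		... read foo's shared data ...
 *	rw_exit(&foo_lock);
 *
 *	rw_enter(&foo_lock, RW_WRITER);
 *		... modify foo's shared data ...
 *	rw_exit(&foo_lock);
 *
 *	rw_destroy(&foo_lock);
 */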