xref: /illumos-gate/usr/src/uts/common/os/flock.c (revision 3df2e8b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved */
29 
30 /*
31  * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
32  * Copyright 2015 Joyent, Inc.
33  */
34 
35 #include <sys/flock_impl.h>
36 #include <sys/vfs.h>
37 #include <sys/t_lock.h>		/* for <sys/callb.h> */
38 #include <sys/callb.h>
39 #include <sys/clconf.h>
40 #include <sys/cladm.h>
41 #include <sys/nbmlock.h>
42 #include <sys/cred.h>
43 #include <sys/policy.h>
44 
45 /*
46  * The following four variables are for statistics purposes and they are
47  * not protected by locks. They may not be accurate but will at least be
48  * close to the actual value.
49  */
50 
51 int	flk_lock_allocs;
52 int	flk_lock_frees;
53 int 	edge_allocs;
54 int	edge_frees;
55 int 	flk_proc_vertex_allocs;
56 int 	flk_proc_edge_allocs;
57 int	flk_proc_vertex_frees;
58 int	flk_proc_edge_frees;
59 
60 static kmutex_t flock_lock;
61 
62 #ifdef DEBUG
63 int check_debug = 0;
64 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
65 					check_active_locks(gp);
66 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
67 						check_sleeping_locks(gp);
68 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
69 		if (check_debug)	\
70 			check_owner_locks(gp, pid, sysid, vp);
71 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
72 	{ \
73 		if (check_lock_transition(old_state, new_state)) { \
74 			cmn_err(CE_PANIC, "Illegal lock transition \
75 			    from %d to %d", old_state, new_state); \
76 		} \
77 	}
78 #else
79 
80 #define	CHECK_ACTIVE_LOCKS(gp)
81 #define	CHECK_SLEEPING_LOCKS(gp)
82 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
83 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
84 
85 #endif /* DEBUG */
86 
87 struct kmem_cache	*flk_edge_cache;
88 
89 graph_t		*lock_graph[HASH_SIZE];
90 proc_graph_t	pgraph;
91 
92 /*
93  * Clustering.
94  *
95  * NLM REGISTRY TYPE IMPLEMENTATION
96  *
97  * Assumptions:
98  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
99  *	integers; maximum node id is returned by clconf_maximum_nodeid().
100  *  2.  We use this node id to identify the node an NLM server runs on.
101  */
102 
103 /*
104  * NLM registry object keeps track of NLM servers via their
105  * nlmids (which are the node ids of the node in the cluster they run on)
106  * that have requested locks at this LLM with which this registry is
107  * associated.
108  *
109  * Representation of abstraction:
110  *    rep = record[	states: array[nlm_state],
111  *			lock: mutex]
112  *
113  *    Representation invariants:
114  *	1. index i of rep.states is between 0 and n - 1 where n is number
115  *	   of elements in the array, which happen to be the maximum number
116  *	   of nodes in the cluster configuration + 1.
117  *	2. map nlmid to index i of rep.states
118  *		0   -> 0
119  *		1   -> 1
120  *		2   -> 2
121  *		n-1 -> clconf_maximum_nodeid()+1
122  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
123  *	    from forgetting to subtract 1 from the index.
124  *	4.  The reason we keep the 0th index is the following.  A legitimate
125  *	    cluster configuration includes making a UFS file system NFS
126  *	    exportable.  The code is structured so that if you're in a cluster
127  *	    you do one thing; otherwise, you do something else.  The problem
128  *	    is what to do if you think you're in a cluster with PXFS loaded,
129  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
130  *	    encode the node id of the node where NLM server runs; these bytes
131  *	    are zero for UFS.  Since the nodeid is used to index into the
132  *	    registry, we can record the NLM server state information at index
133  *	    0 using the same mechanism used for PXFS file locks!
134  */
135 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
136 static kmutex_t nlm_reg_lock;			/* lock to protect arrary */
137 static uint_t nlm_status_size;			/* size of state array */
138 
139 /*
140  * Although we need a global lock dependency graph (and associated data
141  * structures), we also need a per-zone notion of whether the lock manager is
142  * running, and so whether to allow lock manager requests or not.
143  *
144  * Thus, on a per-zone basis we maintain a ``global'' variable
145  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
146  * manager is determined to be changing state (starting or stopping).
147  *
148  * Each graph/zone pair also has a copy of this variable, which is protected by
149  * the graph's mutex.
150  *
151  * The per-graph copies are used to synchronize lock requests with shutdown
152  * requests.  The global copy is used to initialize the per-graph field when a
153  * new graph is created.
154  */
155 struct flock_globals {
156 	flk_lockmgr_status_t flk_lockmgr_status;
157 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
158 };
159 
160 zone_key_t flock_zone_key;
161 
162 static void create_flock(lock_descriptor_t *, flock64_t *);
163 static lock_descriptor_t	*flk_get_lock(void);
164 static void	flk_free_lock(lock_descriptor_t	*lock);
165 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
166 static int flk_process_request(lock_descriptor_t *);
167 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
168 static edge_t *flk_get_edge(void);
169 static int flk_wait_execute_request(lock_descriptor_t *);
170 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
171 static void flk_insert_active_lock(lock_descriptor_t *);
172 static void flk_delete_active_lock(lock_descriptor_t *, int);
173 static void flk_insert_sleeping_lock(lock_descriptor_t *);
174 static void flk_graph_uncolor(graph_t *);
175 static void flk_wakeup(lock_descriptor_t *, int);
176 static void flk_free_edge(edge_t *);
177 static void flk_recompute_dependencies(lock_descriptor_t *,
178 			lock_descriptor_t **,  int, int);
179 static int flk_find_barriers(lock_descriptor_t *);
180 static void flk_update_barriers(lock_descriptor_t *);
181 static int flk_color_reachables(lock_descriptor_t *);
182 static int flk_canceled(lock_descriptor_t *);
183 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
184 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
185 static void wait_for_lock(lock_descriptor_t *);
186 static void unlock_lockmgr_granted(struct flock_globals *);
187 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
188 
189 /* Clustering hooks */
190 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
191 static void cl_flk_wakeup_sleeping_nlm_locks(int);
192 static void cl_flk_unlock_nlm_granted(int);
193 
194 #ifdef DEBUG
195 static int check_lock_transition(int, int);
196 static void check_sleeping_locks(graph_t *);
197 static void check_active_locks(graph_t *);
198 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
199 static void path(lock_descriptor_t *, lock_descriptor_t *);
200 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
201 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
202 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
203 #endif
204 
205 /*	proc_graph function definitions */
206 static int flk_check_deadlock(lock_descriptor_t *);
207 static void flk_proc_graph_uncolor(void);
208 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
209 static proc_edge_t *flk_get_proc_edge(void);
210 static void flk_proc_release(proc_vertex_t *);
211 static void flk_free_proc_edge(proc_edge_t *);
212 static void flk_update_proc_graph(edge_t *, int);
213 
214 /* Non-blocking mandatory locking */
215 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
216 			u_offset_t);
217 
218 static struct flock_globals *
flk_get_globals(void)219 flk_get_globals(void)
220 {
221 	/*
222 	 * The KLM module had better be loaded if we're attempting to handle
223 	 * lockmgr requests.
224 	 */
225 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
226 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
227 }
228 
229 static flk_lockmgr_status_t
flk_get_lockmgr_status(void)230 flk_get_lockmgr_status(void)
231 {
232 	struct flock_globals *fg;
233 
234 	ASSERT(MUTEX_HELD(&flock_lock));
235 
236 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
237 		/*
238 		 * KLM module not loaded; lock manager definitely not running.
239 		 */
240 		return (FLK_LOCKMGR_DOWN);
241 	}
242 	fg = flk_get_globals();
243 	return (fg->flk_lockmgr_status);
244 }
245 
246 /*
247  * This implements Open File Description (not descriptor) style record locking.
248  * These locks can also be thought of as pid-less since they are not tied to a
249  * specific process, thus they're preserved across fork.
250  *
251  * Called directly from fcntl.
252  *
253  * See reclock() for the implementation of the traditional POSIX style record
254  * locking scheme (pid-ful). This function is derived from reclock() but
255  * simplified and modified to work for OFD style locking.
256  *
257  * The two primary advantages of OFD style of locking are:
258  * 1) It is per-file description, so closing a file descriptor that refers to a
259  *    different file description for the same file will not drop the lock (i.e.
260  *    two open's of the same file get different descriptions but a dup or fork
261  *    will refer to the same description).
262  * 2) Locks are preserved across fork(2).
263  *
264  * Because these locks are per-description a lock ptr lives at the f_filocks
265  * member of the file_t and the lock_descriptor includes a file_t pointer
266  * to enable unique lock identification and management.
267  *
268  * Since these locks are pid-less we cannot do deadlock detection with the
269  * current process-oriented implementation. This is consistent with OFD locking
270  * behavior on other operating systems such as Linux. Since we don't do
271  * deadlock detection we never interact with the process graph that is
272  * maintained for deadlock detection on the traditional POSIX-style locks.
273  *
274  * Future Work:
275  *
276  * The current implementation does not support record locks. That is,
277  * currently the single lock must cover the entire file. This is validated in
278  * fcntl. To support record locks the f_filock pointer in the file_t needs to
279  * be changed to a list of pointers to the locks. That list needs to be
280  * managed independently of the lock list on the vnode itself and it needs to
281  * be maintained as record locks are created, split, coalesced and deleted.
282  *
283  * The current implementation does not support remote file systems (e.g.
284  * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
285  * interact with the NLM is not clear since the NLM protocol/implementation
286  * appears to be oriented around locks associated with a process. A further
287  * problem is that a design is needed for what nlm_send_siglost() should do and
288  * where it will send SIGLOST. More recent versions of Linux apparently try to
289  * emulate OFD locks on NFS by converting them to traditional POSIX style locks
290  * that work with the NLM. It is not clear that this provides the correct
291  * semantics in all cases.
292  */
293 int
ofdlock(file_t * fp,int fcmd,flock64_t * lckdat,int flag,u_offset_t offset)294 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
295 {
296 	int cmd = 0;
297 	vnode_t *vp;
298 	lock_descriptor_t	stack_lock_request;
299 	lock_descriptor_t	*lock_request;
300 	int error = 0;
301 	graph_t	*gp;
302 	int serialize = 0;
303 
304 	if (fcmd != F_OFD_GETLK)
305 		cmd = SETFLCK;
306 
307 	if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
308 		cmd |= SLPFLCK;
309 
310 	/* see block comment */
311 	VERIFY(lckdat->l_whence == 0);
312 	VERIFY(lckdat->l_start == 0);
313 	VERIFY(lckdat->l_len == 0);
314 
315 	vp = fp->f_vnode;
316 
317 	/*
318 	 * For reclock fs_frlock() would normally have set these in a few
319 	 * places but for us it's cleaner to centralize it here. Note that
320 	 * IGN_PID is -1. We use 0 for our pid-less locks.
321 	 */
322 	lckdat->l_pid = 0;
323 	lckdat->l_sysid = 0;
324 
325 	/*
326 	 * Check access permissions
327 	 */
328 	if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
329 	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
330 	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
331 		return (EBADF);
332 
333 	/*
334 	 * for query and unlock we use the stack_lock_request
335 	 */
336 	if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
337 		lock_request = &stack_lock_request;
338 		(void) bzero((caddr_t)lock_request,
339 		    sizeof (lock_descriptor_t));
340 
341 		/*
342 		 * following is added to make the assertions in
343 		 * flk_execute_request() pass
344 		 */
345 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
346 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
347 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
348 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
349 		lock_request->l_status = FLK_INITIAL_STATE;
350 	} else {
351 		lock_request = flk_get_lock();
352 		fp->f_filock = (struct filock *)lock_request;
353 	}
354 	lock_request->l_state = 0;
355 	lock_request->l_vnode = vp;
356 	lock_request->l_zoneid = getzoneid();
357 	lock_request->l_ofd = fp;
358 
359 	/*
360 	 * Convert the request range into the canonical start and end
361 	 * values then check the validity of the lock range.
362 	 */
363 	error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
364 	    &lock_request->l_end, offset);
365 	if (error)
366 		goto done;
367 
368 	error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
369 	    MAXEND);
370 	if (error)
371 		goto done;
372 
373 	ASSERT(lock_request->l_end >= lock_request->l_start);
374 
375 	lock_request->l_type = lckdat->l_type;
376 	if (cmd & SLPFLCK)
377 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
378 
379 	if (!(cmd & SETFLCK)) {
380 		if (lock_request->l_type == F_RDLCK ||
381 		    lock_request->l_type == F_WRLCK)
382 			lock_request->l_state |= QUERY_LOCK;
383 	}
384 	lock_request->l_flock = (*lckdat);
385 
386 	/*
387 	 * We are ready for processing the request
388 	 */
389 
390 	if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
391 	    nbl_need_check(vp)) {
392 		nbl_start_crit(vp, RW_WRITER);
393 		serialize = 1;
394 	}
395 
396 	/* Get the lock graph for a particular vnode */
397 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
398 
399 	mutex_enter(&gp->gp_mutex);
400 
401 	lock_request->l_state |= REFERENCED_LOCK;
402 	lock_request->l_graph = gp;
403 
404 	switch (lock_request->l_type) {
405 	case F_RDLCK:
406 	case F_WRLCK:
407 		if (IS_QUERY_LOCK(lock_request)) {
408 			flk_get_first_blocking_lock(lock_request);
409 			if (lock_request->l_ofd != NULL)
410 				lock_request->l_flock.l_pid = -1;
411 			(*lckdat) = lock_request->l_flock;
412 		} else {
413 			/* process the request now */
414 			error = flk_process_request(lock_request);
415 		}
416 		break;
417 
418 	case F_UNLCK:
419 		/* unlock request will not block so execute it immediately */
420 		error = flk_execute_request(lock_request);
421 		break;
422 
423 	default:
424 		error = EINVAL;
425 		break;
426 	}
427 
428 	if (lock_request == &stack_lock_request) {
429 		flk_set_state(lock_request, FLK_DEAD_STATE);
430 	} else {
431 		lock_request->l_state &= ~REFERENCED_LOCK;
432 		if ((error != 0) || IS_DELETED(lock_request)) {
433 			flk_set_state(lock_request, FLK_DEAD_STATE);
434 			flk_free_lock(lock_request);
435 		}
436 	}
437 
438 	mutex_exit(&gp->gp_mutex);
439 	if (serialize)
440 		nbl_end_crit(vp);
441 
442 	return (error);
443 
444 done:
445 	flk_set_state(lock_request, FLK_DEAD_STATE);
446 	if (lock_request != &stack_lock_request)
447 		flk_free_lock(lock_request);
448 	return (error);
449 }
450 
451 /*
452  * Remove any lock on the vnode belonging to the given file_t.
453  * Called from closef on last close, file_t is locked.
454  *
455  * This is modeled on the cleanlocks() function but only removes the single
456  * lock associated with fp.
457  */
458 void
ofdcleanlock(file_t * fp)459 ofdcleanlock(file_t *fp)
460 {
461 	lock_descriptor_t *fplock, *lock, *nlock;
462 	vnode_t *vp;
463 	graph_t	*gp;
464 
465 	ASSERT(MUTEX_HELD(&fp->f_tlock));
466 
467 	if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
468 		return;
469 
470 	fp->f_filock = NULL;
471 	vp = fp->f_vnode;
472 
473 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
474 
475 	if (gp == NULL)
476 		return;
477 	mutex_enter(&gp->gp_mutex);
478 
479 	CHECK_SLEEPING_LOCKS(gp);
480 	CHECK_ACTIVE_LOCKS(gp);
481 
482 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
483 
484 	if (lock) {
485 		do {
486 			nlock = lock->l_next;
487 			if (fplock == lock) {
488 				CANCEL_WAKEUP(lock);
489 				break;
490 			}
491 			lock = nlock;
492 		} while (lock->l_vnode == vp);
493 	}
494 
495 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
496 
497 	if (lock) {
498 		do {
499 			nlock = lock->l_next;
500 			if (fplock == lock) {
501 				flk_delete_active_lock(lock, 0);
502 				flk_wakeup(lock, 1);
503 				flk_free_lock(lock);
504 				break;
505 			}
506 			lock = nlock;
507 		} while (lock->l_vnode == vp);
508 	}
509 
510 	CHECK_SLEEPING_LOCKS(gp);
511 	CHECK_ACTIVE_LOCKS(gp);
512 	mutex_exit(&gp->gp_mutex);
513 }
514 
515 /*
516  * Routine called from fs_frlock in fs/fs_subr.c
517  *
518  * This implements traditional POSIX style record locking. The two primary
519  * drawbacks to this style of locking are:
520  * 1) It is per-process, so any close of a file descriptor that refers to the
521  *    file will drop the lock (e.g. lock /etc/passwd, call a library function
522  *    which opens /etc/passwd to read the file, when the library closes it's
523  *    file descriptor the application loses its lock and does not know).
524  * 2) Locks are not preserved across fork(2).
525  *
526  * Because these locks are only associated with a PID, they are per-process.
527  * This is why any close will drop the lock and is also why, once the process
528  * forks, the lock is no longer related to the new process. These locks can
529  * be considered as PID-ful.
530  *
531  * See ofdlock() for the implementation of a similar but improved locking
532  * scheme.
533  */
534 int
reclock(vnode_t * vp,flock64_t * lckdat,int cmd,int flag,u_offset_t offset,flk_callback_t * flk_cbp)535 reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
536     flk_callback_t *flk_cbp)
537 {
538 	lock_descriptor_t	stack_lock_request;
539 	lock_descriptor_t	*lock_request;
540 	int error = 0;
541 	graph_t	*gp;
542 	int			nlmid;
543 
544 	/*
545 	 * Check access permissions
546 	 */
547 	if ((cmd & SETFLCK) &&
548 	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
549 	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
550 			return (EBADF);
551 
552 	/*
553 	 * for query and unlock we use the stack_lock_request
554 	 */
555 
556 	if ((lckdat->l_type == F_UNLCK) ||
557 	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
558 		lock_request = &stack_lock_request;
559 		(void) bzero((caddr_t)lock_request,
560 		    sizeof (lock_descriptor_t));
561 
562 		/*
563 		 * following is added to make the assertions in
564 		 * flk_execute_request() to pass through
565 		 */
566 
567 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
568 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
569 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
570 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
571 		lock_request->l_status = FLK_INITIAL_STATE;
572 	} else {
573 		lock_request = flk_get_lock();
574 	}
575 	lock_request->l_state = 0;
576 	lock_request->l_vnode = vp;
577 	lock_request->l_zoneid = getzoneid();
578 
579 	/*
580 	 * Convert the request range into the canonical start and end
581 	 * values.  The NLM protocol supports locking over the entire
582 	 * 32-bit range, so there's no range checking for remote requests,
583 	 * but we still need to verify that local requests obey the rules.
584 	 */
585 	/* Clustering */
586 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
587 		ASSERT(lckdat->l_whence == 0);
588 		lock_request->l_start = lckdat->l_start;
589 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
590 		    lckdat->l_start + (lckdat->l_len - 1);
591 	} else {
592 		/* check the validity of the lock range */
593 		error = flk_convert_lock_data(vp, lckdat,
594 		    &lock_request->l_start, &lock_request->l_end,
595 		    offset);
596 		if (error) {
597 			goto done;
598 		}
599 		error = flk_check_lock_data(lock_request->l_start,
600 		    lock_request->l_end, MAXEND);
601 		if (error) {
602 			goto done;
603 		}
604 	}
605 
606 	ASSERT(lock_request->l_end >= lock_request->l_start);
607 
608 	lock_request->l_type = lckdat->l_type;
609 	if (cmd & INOFLCK)
610 		lock_request->l_state |= IO_LOCK;
611 	if (cmd & SLPFLCK)
612 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
613 	if (cmd & RCMDLCK)
614 		lock_request->l_state |= LOCKMGR_LOCK;
615 	if (cmd & NBMLCK)
616 		lock_request->l_state |= NBMAND_LOCK;
617 	/*
618 	 * Clustering: set flag for PXFS locks
619 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
620 	 * also be of type 'RCMDLCK'.
621 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
622 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
623 	 */
624 
625 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
626 		lock_request->l_state |= PXFS_LOCK;
627 	}
628 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
629 		if (lock_request->l_type == F_RDLCK ||
630 		    lock_request->l_type == F_WRLCK)
631 			lock_request->l_state |= QUERY_LOCK;
632 	}
633 	lock_request->l_flock = (*lckdat);
634 	lock_request->l_callbacks = flk_cbp;
635 
636 	/*
637 	 * We are ready for processing the request
638 	 */
639 	if (IS_LOCKMGR(lock_request)) {
640 		/*
641 		 * If the lock request is an NLM server request ....
642 		 */
643 		if (nlm_status_size == 0) { /* not booted as cluster */
644 			mutex_enter(&flock_lock);
645 			/*
646 			 * Bail out if this is a lock manager request and the
647 			 * lock manager is not supposed to be running.
648 			 */
649 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
650 				mutex_exit(&flock_lock);
651 				error = ENOLCK;
652 				goto done;
653 			}
654 			mutex_exit(&flock_lock);
655 		} else {			/* booted as a cluster */
656 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
657 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
658 
659 			mutex_enter(&nlm_reg_lock);
660 			/*
661 			 * If the NLM registry does not know about this
662 			 * NLM server making the request, add its nlmid
663 			 * to the registry.
664 			 */
665 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
666 			    nlmid)) {
667 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
668 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
669 			    nlmid)) {
670 				/*
671 				 * If the NLM server is already known (has made
672 				 * previous lock requests) and its state is
673 				 * not NLM_UP (means that NLM server is
674 				 * shutting down), then bail out with an
675 				 * error to deny the lock request.
676 				 */
677 				mutex_exit(&nlm_reg_lock);
678 				error = ENOLCK;
679 				goto done;
680 			}
681 			mutex_exit(&nlm_reg_lock);
682 		}
683 	}
684 
685 	/* Now get the lock graph for a particular vnode */
686 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
687 
688 	/*
689 	 * We drop rwlock here otherwise this might end up causing a
690 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
691 	 */
692 
693 	if (IS_IO_LOCK(lock_request)) {
694 		VOP_RWUNLOCK(vp,
695 		    (lock_request->l_type == F_RDLCK) ?
696 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
697 	}
698 	mutex_enter(&gp->gp_mutex);
699 
700 	lock_request->l_state |= REFERENCED_LOCK;
701 	lock_request->l_graph = gp;
702 
703 	switch (lock_request->l_type) {
704 	case F_RDLCK:
705 	case F_WRLCK:
706 		if (IS_QUERY_LOCK(lock_request)) {
707 			flk_get_first_blocking_lock(lock_request);
708 			if (lock_request->l_ofd != NULL)
709 				lock_request->l_flock.l_pid = -1;
710 			(*lckdat) = lock_request->l_flock;
711 			break;
712 		}
713 
714 		/* process the request now */
715 
716 		error = flk_process_request(lock_request);
717 		break;
718 
719 	case F_UNLCK:
720 		/* unlock request will not block so execute it immediately */
721 
722 		if (IS_LOCKMGR(lock_request) &&
723 		    flk_canceled(lock_request)) {
724 			error = 0;
725 		} else {
726 			error = flk_execute_request(lock_request);
727 		}
728 		break;
729 
730 	case F_UNLKSYS:
731 		/*
732 		 * Recovery mechanism to release lock manager locks when
733 		 * NFS client crashes and restart. NFS server will clear
734 		 * old locks and grant new locks.
735 		 */
736 
737 		if (lock_request->l_flock.l_sysid == 0) {
738 			mutex_exit(&gp->gp_mutex);
739 			return (EINVAL);
740 		}
741 		if (secpolicy_nfs(CRED()) != 0) {
742 			mutex_exit(&gp->gp_mutex);
743 			return (EPERM);
744 		}
745 		flk_delete_locks_by_sysid(lock_request);
746 		lock_request->l_state &= ~REFERENCED_LOCK;
747 		flk_set_state(lock_request, FLK_DEAD_STATE);
748 		flk_free_lock(lock_request);
749 		mutex_exit(&gp->gp_mutex);
750 		return (0);
751 
752 	default:
753 		error = EINVAL;
754 		break;
755 	}
756 
757 	/* Clustering: For blocked PXFS locks, return */
758 	if (error == PXFS_LOCK_BLOCKED) {
759 		lock_request->l_state &= ~REFERENCED_LOCK;
760 		mutex_exit(&gp->gp_mutex);
761 		return (error);
762 	}
763 
764 	/*
765 	 * Now that we have seen the status of locks in the system for
766 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
767 	 */
768 
769 	if (IS_IO_LOCK(lock_request)) {
770 		(void) VOP_RWLOCK(vp,
771 		    (lock_request->l_type == F_RDLCK) ?
772 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
773 		if (!error) {
774 			lckdat->l_type = F_UNLCK;
775 
776 			/*
777 			 * This wake up is needed otherwise
778 			 * if IO_LOCK has slept the dependents on this
779 			 * will not be woken up at all. (bugid # 1185482).
780 			 */
781 
782 			flk_wakeup(lock_request, 1);
783 			flk_set_state(lock_request, FLK_DEAD_STATE);
784 			flk_free_lock(lock_request);
785 		}
786 		/*
787 		 * else if error had occurred either flk_process_request()
788 		 * has returned EDEADLK in which case there will be no
789 		 * dependents for this lock or EINTR from flk_wait_execute_
790 		 * request() in which case flk_cancel_sleeping_lock()
791 		 * would have been done. same is true with EBADF.
792 		 */
793 	}
794 
795 	if (lock_request == &stack_lock_request) {
796 		flk_set_state(lock_request, FLK_DEAD_STATE);
797 	} else {
798 		lock_request->l_state &= ~REFERENCED_LOCK;
799 		if ((error != 0) || IS_DELETED(lock_request)) {
800 			flk_set_state(lock_request, FLK_DEAD_STATE);
801 			flk_free_lock(lock_request);
802 		}
803 	}
804 
805 	mutex_exit(&gp->gp_mutex);
806 	return (error);
807 
808 done:
809 	flk_set_state(lock_request, FLK_DEAD_STATE);
810 	if (lock_request != &stack_lock_request)
811 		flk_free_lock(lock_request);
812 	return (error);
813 }
814 
815 /*
816  * Invoke the callbacks in the given list.  If before sleeping, invoke in
817  * list order.  If after sleeping, invoke in reverse order.
818  *
819  * CPR (suspend/resume) support: if one of the callbacks returns a
820  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
821  * while it is sleeping.  There should be at most one callb_cpr_t for the
822  * thread.
823  * XXX This is unnecessarily complicated.  The CPR information should just
824  * get passed in directly through VOP_FRLOCK and reclock, rather than
825  * sneaking it in via a callback.
826  */
827 
828 callb_cpr_t *
flk_invoke_callbacks(flk_callback_t * cblist,flk_cb_when_t when)829 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
830 {
831 	callb_cpr_t *cpr_callbackp = NULL;
832 	callb_cpr_t *one_result;
833 	flk_callback_t *cb;
834 
835 	if (cblist == NULL)
836 		return (NULL);
837 
838 	if (when == FLK_BEFORE_SLEEP) {
839 		cb = cblist;
840 		do {
841 			one_result = (*cb->cb_callback)(when, cb->cb_data);
842 			if (one_result != NULL) {
843 				ASSERT(cpr_callbackp == NULL);
844 				cpr_callbackp = one_result;
845 			}
846 			cb = cb->cb_next;
847 		} while (cb != cblist);
848 	} else {
849 		cb = cblist->cb_prev;
850 		do {
851 			one_result = (*cb->cb_callback)(when, cb->cb_data);
852 			if (one_result != NULL) {
853 				cpr_callbackp = one_result;
854 			}
855 			cb = cb->cb_prev;
856 		} while (cb != cblist->cb_prev);
857 	}
858 
859 	return (cpr_callbackp);
860 }
861 
862 /*
863  * Initialize a flk_callback_t to hold the given callback.
864  */
865 
866 void
flk_init_callback(flk_callback_t * flk_cb,callb_cpr_t * (* cb_fcn)(flk_cb_when_t,void *),void * cbdata)867 flk_init_callback(flk_callback_t *flk_cb,
868     callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
869 {
870 	flk_cb->cb_next = flk_cb;
871 	flk_cb->cb_prev = flk_cb;
872 	flk_cb->cb_callback = cb_fcn;
873 	flk_cb->cb_data = cbdata;
874 }
875 
876 /*
877  * Initialize an flk_callback_t and then link it into the head of an
878  * existing list (which may be NULL).
879  */
880 
881 void
flk_add_callback(flk_callback_t * newcb,callb_cpr_t * (* cb_fcn)(flk_cb_when_t,void *),void * cbdata,flk_callback_t * cblist)882 flk_add_callback(flk_callback_t *newcb,
883     callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
884     void *cbdata, flk_callback_t *cblist)
885 {
886 	flk_init_callback(newcb, cb_fcn, cbdata);
887 
888 	if (cblist == NULL)
889 		return;
890 
891 	newcb->cb_prev = cblist->cb_prev;
892 	newcb->cb_next = cblist;
893 	cblist->cb_prev->cb_next = newcb;
894 	cblist->cb_prev = newcb;
895 }
896 
897 /*
898  * Remove the callback from a list.
899  */
900 
901 void
flk_del_callback(flk_callback_t * flk_cb)902 flk_del_callback(flk_callback_t *flk_cb)
903 {
904 	flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
905 	flk_cb->cb_prev->cb_next = flk_cb->cb_next;
906 
907 	flk_cb->cb_prev = flk_cb;
908 	flk_cb->cb_next = flk_cb;
909 }
910 
911 /*
912  * Initialize the flk_edge_cache data structure and create the
913  * nlm_reg_status array.
914  */
915 
916 void
flk_init(void)917 flk_init(void)
918 {
919 	uint_t	i;
920 
921 	flk_edge_cache = kmem_cache_create("flk_edges",
922 	    sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
923 	if (flk_edge_cache == NULL) {
924 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
925 	}
926 	/*
927 	 * Create the NLM registry object.
928 	 */
929 
930 	if (cluster_bootflags & CLUSTER_BOOTED) {
931 		/*
932 		 * This routine tells you the maximum node id that will be used
933 		 * in the cluster.  This number will be the size of the nlm
934 		 * registry status array.  We add 1 because we will be using
935 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
936 		 * to 64, for a total of 65 entries.
937 		 */
938 		nlm_status_size = clconf_maximum_nodeid() + 1;
939 	} else {
940 		nlm_status_size = 0;
941 	}
942 
943 	if (nlm_status_size != 0) {	/* booted as a cluster */
944 		nlm_reg_status = (flk_nlm_status_t *)
945 		    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
946 		    KM_SLEEP);
947 
948 		/* initialize all NLM states in array to NLM_UNKNOWN */
949 		for (i = 0; i < nlm_status_size; i++) {
950 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
951 		}
952 	}
953 }
954 
955 /*
956  * Zone constructor/destructor callbacks to be executed when a zone is
957  * created/destroyed.
958  */
959 /* ARGSUSED */
960 void *
flk_zone_init(zoneid_t zoneid)961 flk_zone_init(zoneid_t zoneid)
962 {
963 	struct flock_globals *fg;
964 	uint_t i;
965 
966 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
967 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
968 	for (i = 0; i < HASH_SIZE; i++)
969 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
970 	return (fg);
971 }
972 
973 /* ARGSUSED */
974 void
flk_zone_fini(zoneid_t zoneid,void * data)975 flk_zone_fini(zoneid_t zoneid, void *data)
976 {
977 	struct flock_globals *fg = data;
978 
979 	kmem_free(fg, sizeof (*fg));
980 }
981 
982 /*
983  * Get a lock_descriptor structure with initialization of edge lists.
984  */
985 
986 static lock_descriptor_t *
flk_get_lock(void)987 flk_get_lock(void)
988 {
989 	lock_descriptor_t	*l;
990 
991 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
992 
993 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
994 	l->l_edge.edge_in_next = &l->l_edge;
995 	l->l_edge.edge_in_prev = &l->l_edge;
996 	l->l_edge.edge_adj_next = &l->l_edge;
997 	l->l_edge.edge_adj_prev = &l->l_edge;
998 	l->pvertex = -1;
999 	l->l_status = FLK_INITIAL_STATE;
1000 	flk_lock_allocs++;
1001 	return (l);
1002 }
1003 
1004 /*
1005  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
1006  * when some thread has a reference to it as in reclock().
1007  */
1008 
1009 void
flk_free_lock(lock_descriptor_t * lock)1010 flk_free_lock(lock_descriptor_t	*lock)
1011 {
1012 	file_t *fp;
1013 
1014 	ASSERT(IS_DEAD(lock));
1015 
1016 	if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1017 		fp->f_filock = NULL;
1018 
1019 	if (IS_REFERENCED(lock)) {
1020 		lock->l_state |= DELETED_LOCK;
1021 		return;
1022 	}
1023 	flk_lock_frees++;
1024 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
1025 }
1026 
1027 void
flk_set_state(lock_descriptor_t * lock,int new_state)1028 flk_set_state(lock_descriptor_t *lock, int new_state)
1029 {
1030 	/*
1031 	 * Locks in the sleeping list may be woken up in a number of ways,
1032 	 * and more than once.  If a sleeping lock is signaled awake more
1033 	 * than once, then it may or may not change state depending on its
1034 	 * current state.
1035 	 * Also note that NLM locks that are sleeping could be moved to an
1036 	 * interrupted state more than once if the unlock request is
1037 	 * retransmitted by the NLM client - the second time around, this is
1038 	 * just a nop.
1039 	 * The ordering of being signaled awake is:
1040 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1041 	 * The checks below implement this ordering.
1042 	 */
1043 	if (IS_INTERRUPTED(lock)) {
1044 		if ((new_state == FLK_CANCELLED_STATE) ||
1045 		    (new_state == FLK_GRANTED_STATE) ||
1046 		    (new_state == FLK_INTERRUPTED_STATE)) {
1047 			return;
1048 		}
1049 	}
1050 	if (IS_CANCELLED(lock)) {
1051 		if ((new_state == FLK_GRANTED_STATE) ||
1052 		    (new_state == FLK_CANCELLED_STATE)) {
1053 			return;
1054 		}
1055 	}
1056 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1057 	if (IS_PXFS(lock)) {
1058 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1059 	}
1060 	lock->l_status = new_state;
1061 }
1062 
1063 /*
1064  * Routine that checks whether there are any blocking locks in the system.
1065  *
1066  * The policy followed is if a write lock is sleeping we don't allow read
1067  * locks before this write lock even though there may not be any active
1068  * locks corresponding to the read locks' region.
1069  *
1070  * flk_add_edge() function adds an edge between l1 and l2 iff there
1071  * is no path between l1 and l2. This is done to have a "minimum
1072  * storage representation" of the dependency graph.
1073  *
1074  * Another property of the graph is since only the new request throws
1075  * edges to the existing locks in the graph, the graph is always topologically
1076  * ordered.
1077  */
1078 
1079 static int
flk_process_request(lock_descriptor_t * request)1080 flk_process_request(lock_descriptor_t *request)
1081 {
1082 	graph_t	*gp = request->l_graph;
1083 	lock_descriptor_t *lock;
1084 	int request_blocked_by_active = 0;
1085 	int request_blocked_by_granted = 0;
1086 	int request_blocked_by_sleeping = 0;
1087 	vnode_t	*vp = request->l_vnode;
1088 	int	error = 0;
1089 	int request_will_wait = 0;
1090 	int found_covering_lock = 0;
1091 	lock_descriptor_t *covered_by = NULL;
1092 
1093 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1094 	request_will_wait = IS_WILLING_TO_SLEEP(request);
1095 
1096 	/*
1097 	 * check active locks
1098 	 */
1099 
1100 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1101 
1102 
1103 	if (lock) {
1104 		do {
1105 			if (BLOCKS(lock, request)) {
1106 				if (!request_will_wait)
1107 					return (EAGAIN);
1108 				request_blocked_by_active = 1;
1109 				break;
1110 			}
1111 			/*
1112 			 * Grant lock if it is for the same owner holding active
1113 			 * lock that covers the request.
1114 			 */
1115 
1116 			if (SAME_OWNER(lock, request) &&
1117 			    COVERS(lock, request) &&
1118 			    (request->l_type == F_RDLCK))
1119 				return (flk_execute_request(request));
1120 			lock = lock->l_next;
1121 		} while (lock->l_vnode == vp);
1122 	}
1123 
1124 	if (!request_blocked_by_active) {
1125 		lock_descriptor_t *lk[1];
1126 		lock_descriptor_t *first_glock = NULL;
1127 		/*
1128 		 * Shall we grant this?! NO!!
1129 		 * What about those locks that were just granted and still
1130 		 * in sleep queue. Those threads are woken up and so locks
1131 		 * are almost active.
1132 		 */
1133 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1134 		if (lock) {
1135 			do {
1136 				if (BLOCKS(lock, request)) {
1137 					if (IS_GRANTED(lock)) {
1138 						request_blocked_by_granted = 1;
1139 					} else {
1140 						request_blocked_by_sleeping = 1;
1141 					}
1142 				}
1143 
1144 				lock = lock->l_next;
1145 			} while ((lock->l_vnode == vp));
1146 			first_glock = lock->l_prev;
1147 			ASSERT(first_glock->l_vnode == vp);
1148 		}
1149 
1150 		if (request_blocked_by_granted)
1151 			goto block;
1152 
1153 		if (!request_blocked_by_sleeping) {
1154 			/*
1155 			 * If the request isn't going to be blocked by a
1156 			 * sleeping request, we know that it isn't going to
1157 			 * be blocked; we can just execute the request --
1158 			 * without performing costly deadlock detection.
1159 			 */
1160 			ASSERT(!request_blocked_by_active);
1161 			return (flk_execute_request(request));
1162 		} else if (request->l_type == F_RDLCK) {
1163 			/*
1164 			 * If we have a sleeping writer in the requested
1165 			 * lock's range, block.
1166 			 */
1167 			goto block;
1168 		}
1169 
1170 		lk[0] = request;
1171 		request->l_state |= RECOMPUTE_LOCK;
1172 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1173 		if (lock) {
1174 			do {
1175 				flk_recompute_dependencies(lock, lk, 1, 0);
1176 				lock = lock->l_next;
1177 			} while (lock->l_vnode == vp);
1178 		}
1179 		lock = first_glock;
1180 		if (lock) {
1181 			do {
1182 				if (IS_GRANTED(lock)) {
1183 				flk_recompute_dependencies(lock, lk, 1, 0);
1184 				}
1185 				lock = lock->l_prev;
1186 			} while ((lock->l_vnode == vp));
1187 		}
1188 		request->l_state &= ~RECOMPUTE_LOCK;
1189 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1190 			return (EDEADLK);
1191 		return (flk_execute_request(request));
1192 	}
1193 
1194 block:
1195 	if (request_will_wait)
1196 		flk_graph_uncolor(gp);
1197 
1198 	/* check sleeping locks */
1199 
1200 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1201 
1202 	/*
1203 	 * If we find a sleeping write lock that is a superset of the
1204 	 * region wanted by request we can be assured that by adding an
1205 	 * edge to this write lock we have paths to all locks in the
1206 	 * graph that blocks the request except in one case and that is why
1207 	 * another check for SAME_OWNER in the loop below. The exception
1208 	 * case is when this process that owns the sleeping write lock 'l1'
1209 	 * has other locks l2, l3, l4 that are in the system and arrived
1210 	 * before l1. l1 does not have path to these locks as they are from
1211 	 * same process. We break when we find a second covering sleeping
1212 	 * lock l5 owned by a process different from that owning l1, because
1213 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
1214 	 * it has l1 would have produced a deadlock already.
1215 	 */
1216 
1217 	if (lock) {
1218 		do {
1219 			if (BLOCKS(lock, request)) {
1220 				if (!request_will_wait)
1221 					return (EAGAIN);
1222 				if (COVERS(lock, request) &&
1223 				    lock->l_type == F_WRLCK) {
1224 					if (found_covering_lock &&
1225 					    !SAME_OWNER(lock, covered_by)) {
1226 						found_covering_lock++;
1227 						break;
1228 					}
1229 					found_covering_lock = 1;
1230 					covered_by = lock;
1231 				}
1232 				if (found_covering_lock &&
1233 				    !SAME_OWNER(lock, covered_by)) {
1234 					lock = lock->l_next;
1235 					continue;
1236 				}
1237 				if ((error = flk_add_edge(request, lock,
1238 				    !found_covering_lock, 0)))
1239 					return (error);
1240 			}
1241 			lock = lock->l_next;
1242 		} while (lock->l_vnode == vp);
1243 	}
1244 
1245 /*
1246  * found_covering_lock == 2 iff at this point 'request' has paths
1247  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
1248  * point 'request' has paths to all locks that blocks 'request' whose owners
1249  * are not same as the one that covers 'request' (covered_by above) and
1250  * we can have locks whose owner is same as covered_by in the active list.
1251  */
1252 
1253 	if (request_blocked_by_active && found_covering_lock != 2) {
1254 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1255 		ASSERT(lock != NULL);
1256 		do {
1257 			if (BLOCKS(lock, request)) {
1258 				if (found_covering_lock &&
1259 				    !SAME_OWNER(lock, covered_by)) {
1260 					lock = lock->l_next;
1261 					continue;
1262 				}
1263 				if ((error = flk_add_edge(request, lock,
1264 				    CHECK_CYCLE, 0)))
1265 					return (error);
1266 			}
1267 			lock = lock->l_next;
1268 		} while (lock->l_vnode == vp);
1269 	}
1270 
1271 	if (NOT_BLOCKED(request)) {
1272 		/*
1273 		 * request not dependent on any other locks
1274 		 * so execute this request
1275 		 */
1276 		return (flk_execute_request(request));
1277 	} else {
1278 		/*
1279 		 * check for deadlock
1280 		 */
1281 		if (flk_check_deadlock(request))
1282 			return (EDEADLK);
1283 		/*
1284 		 * this thread has to sleep
1285 		 */
1286 		return (flk_wait_execute_request(request));
1287 	}
1288 }
1289 
1290 /*
1291  * The actual execution of the request in the simple case is only to
1292  * insert the 'request' in the list of active locks if it is not an
1293  * UNLOCK.
1294  * We have to consider the existing active locks' relation to
1295  * this 'request' if they are owned by same process. flk_relation() does
1296  * this job and sees to that the dependency graph information is maintained
1297  * properly.
1298  */
1299 
1300 int
flk_execute_request(lock_descriptor_t * request)1301 flk_execute_request(lock_descriptor_t *request)
1302 {
1303 	graph_t	*gp = request->l_graph;
1304 	vnode_t	*vp = request->l_vnode;
1305 	lock_descriptor_t	*lock, *lock1;
1306 	int done_searching = 0;
1307 
1308 	CHECK_SLEEPING_LOCKS(gp);
1309 	CHECK_ACTIVE_LOCKS(gp);
1310 
1311 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1312 
1313 	flk_set_state(request, FLK_START_STATE);
1314 
1315 	ASSERT(NOT_BLOCKED(request));
1316 
1317 	/* IO_LOCK requests are only to check status */
1318 
1319 	if (IS_IO_LOCK(request))
1320 		return (0);
1321 
1322 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1323 
1324 	if (lock == NULL && request->l_type == F_UNLCK)
1325 		return (0);
1326 	if (lock == NULL) {
1327 		flk_insert_active_lock(request);
1328 		return (0);
1329 	}
1330 
1331 	do {
1332 		lock1 = lock->l_next;
1333 		if (SAME_OWNER(request, lock)) {
1334 			done_searching = flk_relation(lock, request);
1335 		}
1336 		lock = lock1;
1337 	} while (lock->l_vnode == vp && !done_searching);
1338 
1339 	/*
1340 	 * insert in active queue
1341 	 */
1342 
1343 	if (request->l_type != F_UNLCK)
1344 		flk_insert_active_lock(request);
1345 
1346 	return (0);
1347 }
1348 
1349 /*
1350  * 'request' is blocked by some one therefore we put it into sleep queue.
1351  */
1352 static int
flk_wait_execute_request(lock_descriptor_t * request)1353 flk_wait_execute_request(lock_descriptor_t *request)
1354 {
1355 	graph_t	*gp = request->l_graph;
1356 	callb_cpr_t 	*cprp;		/* CPR info from callback */
1357 	struct flock_globals *fg;
1358 	int index;
1359 
1360 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1361 	ASSERT(IS_WILLING_TO_SLEEP(request));
1362 
1363 	flk_insert_sleeping_lock(request);
1364 
1365 	index = 0;	/* quiesce compiler warning. */
1366 	fg = NULL;
1367 	if (IS_LOCKMGR(request)) {
1368 		index = HASH_INDEX(request->l_vnode);
1369 		fg = flk_get_globals();
1370 
1371 		if (nlm_status_size == 0) {	/* not booted as a cluster */
1372 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1373 				flk_cancel_sleeping_lock(request, 1);
1374 				return (ENOLCK);
1375 			}
1376 		} else {			/* booted as a cluster */
1377 			/*
1378 			 * If the request is an NLM server lock request,
1379 			 * and the NLM state of the lock request is not
1380 			 * NLM_UP (because the NLM server is shutting
1381 			 * down), then cancel the sleeping lock and
1382 			 * return error ENOLCK that will encourage the
1383 			 * client to retransmit.
1384 			 */
1385 			if (!IS_NLM_UP(request)) {
1386 				flk_cancel_sleeping_lock(request, 1);
1387 				return (ENOLCK);
1388 			}
1389 		}
1390 	}
1391 
1392 	/* Clustering: For blocking PXFS locks, return */
1393 	if (IS_PXFS(request)) {
1394 		/*
1395 		 * PXFS locks sleep on the client side.
1396 		 * The callback argument is used to wake up the sleeper
1397 		 * when the lock is granted.
1398 		 * We return -1 (rather than an errno value) to indicate
1399 		 * the client side should sleep
1400 		 */
1401 		return (PXFS_LOCK_BLOCKED);
1402 	}
1403 
1404 	if (request->l_callbacks != NULL) {
1405 		/*
1406 		 * To make sure the shutdown code works correctly, either
1407 		 * the callback must happen after putting the lock on the
1408 		 * sleep list, or we must check the shutdown status after
1409 		 * returning from the callback (and before sleeping).  At
1410 		 * least for now, we'll use the first option.  If a
1411 		 * shutdown or signal or whatever happened while the graph
1412 		 * mutex was dropped, that will be detected by
1413 		 * wait_for_lock().
1414 		 */
1415 		mutex_exit(&gp->gp_mutex);
1416 
1417 		cprp = flk_invoke_callbacks(request->l_callbacks,
1418 		    FLK_BEFORE_SLEEP);
1419 
1420 		mutex_enter(&gp->gp_mutex);
1421 
1422 		if (cprp == NULL) {
1423 			wait_for_lock(request);
1424 		} else {
1425 			mutex_enter(cprp->cc_lockp);
1426 			CALLB_CPR_SAFE_BEGIN(cprp);
1427 			mutex_exit(cprp->cc_lockp);
1428 			wait_for_lock(request);
1429 			mutex_enter(cprp->cc_lockp);
1430 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1431 			mutex_exit(cprp->cc_lockp);
1432 		}
1433 
1434 		mutex_exit(&gp->gp_mutex);
1435 		(void) flk_invoke_callbacks(request->l_callbacks,
1436 		    FLK_AFTER_SLEEP);
1437 		mutex_enter(&gp->gp_mutex);
1438 	} else {
1439 		wait_for_lock(request);
1440 	}
1441 
1442 	if (IS_LOCKMGR(request)) {
1443 		/*
1444 		 * If the lock manager is shutting down, return an
1445 		 * error that will encourage the client to retransmit.
1446 		 */
1447 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1448 		    !IS_GRANTED(request)) {
1449 			flk_cancel_sleeping_lock(request, 1);
1450 			return (ENOLCK);
1451 		}
1452 	}
1453 
1454 	if (IS_INTERRUPTED(request)) {
1455 		/* we got a signal, or act like we did */
1456 		flk_cancel_sleeping_lock(request, 1);
1457 		return (EINTR);
1458 	}
1459 
1460 	/* Cancelled if some other thread has closed the file */
1461 
1462 	if (IS_CANCELLED(request)) {
1463 		flk_cancel_sleeping_lock(request, 1);
1464 		return (EBADF);
1465 	}
1466 
1467 	request->l_state &= ~GRANTED_LOCK;
1468 	REMOVE_SLEEP_QUEUE(request);
1469 	return (flk_execute_request(request));
1470 }
1471 
1472 /*
1473  * This routine adds an edge between from and to because from depends
1474  * to. If asked to check for deadlock it checks whether there are any
1475  * reachable locks from "from_lock" that is owned by the same process
1476  * as "from_lock".
1477  * NOTE: It is the caller's responsibility to make sure that the color
1478  * of the graph is consistent between the calls to flk_add_edge as done
1479  * in flk_process_request. This routine does not color and check for
1480  * deadlock explicitly.
1481  */
1482 
1483 static int
flk_add_edge(lock_descriptor_t * from_lock,lock_descriptor_t * to_lock,int check_cycle,int update_graph)1484 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1485     int check_cycle, int update_graph)
1486 {
1487 	edge_t	*edge;
1488 	edge_t	*ep;
1489 	lock_descriptor_t	*vertex;
1490 	lock_descriptor_t *vertex_stack;
1491 
1492 	STACK_INIT(vertex_stack);
1493 
1494 	/*
1495 	 * if to vertex already has mark_color just return
1496 	 * don't add an edge as it is reachable from from vertex
1497 	 * before itself.
1498 	 */
1499 
1500 	if (COLORED(to_lock))
1501 		return (0);
1502 
1503 	edge = flk_get_edge();
1504 
1505 	/*
1506 	 * set the from and to vertex
1507 	 */
1508 
1509 	edge->from_vertex = from_lock;
1510 	edge->to_vertex = to_lock;
1511 
1512 	/*
1513 	 * put in adjacency list of from vertex
1514 	 */
1515 
1516 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1517 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1518 	edge->edge_adj_prev = &from_lock->l_edge;
1519 	from_lock->l_edge.edge_adj_next = edge;
1520 
1521 	/*
1522 	 * put in list of to vertex
1523 	 */
1524 
1525 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1526 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1527 	to_lock->l_edge.edge_in_next = edge;
1528 	edge->edge_in_prev = &to_lock->l_edge;
1529 
1530 
1531 	if (update_graph) {
1532 		flk_update_proc_graph(edge, 0);
1533 		return (0);
1534 	}
1535 	if (!check_cycle) {
1536 		return (0);
1537 	}
1538 
1539 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1540 
1541 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1542 
1543 		STACK_POP(vertex_stack, l_stack);
1544 
1545 		for (ep = FIRST_ADJ(vertex);
1546 		    ep != HEAD(vertex);
1547 		    ep = NEXT_ADJ(ep)) {
1548 			if (COLORED(ep->to_vertex))
1549 				continue;
1550 			COLOR(ep->to_vertex);
1551 			if (SAME_OWNER(ep->to_vertex, from_lock))
1552 				goto dead_lock;
1553 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1554 		}
1555 	}
1556 	return (0);
1557 
1558 dead_lock:
1559 
1560 	/*
1561 	 * remove all edges
1562 	 */
1563 
1564 	ep = FIRST_ADJ(from_lock);
1565 
1566 	while (ep != HEAD(from_lock)) {
1567 		IN_LIST_REMOVE(ep);
1568 		from_lock->l_sedge = NEXT_ADJ(ep);
1569 		ADJ_LIST_REMOVE(ep);
1570 		flk_free_edge(ep);
1571 		ep = from_lock->l_sedge;
1572 	}
1573 	return (EDEADLK);
1574 }
1575 
1576 /*
1577  * Get an edge structure for representing the dependency between two locks.
1578  */
1579 
1580 static edge_t *
flk_get_edge()1581 flk_get_edge()
1582 {
1583 	edge_t	*ep;
1584 
1585 	ASSERT(flk_edge_cache != NULL);
1586 
1587 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1588 	edge_allocs++;
1589 	return (ep);
1590 }
1591 
1592 /*
1593  * Free the edge structure.
1594  */
1595 
1596 static void
flk_free_edge(edge_t * ep)1597 flk_free_edge(edge_t *ep)
1598 {
1599 	edge_frees++;
1600 	kmem_cache_free(flk_edge_cache, (void *)ep);
1601 }
1602 
1603 /*
1604  * Check the relationship of request with lock and perform the
1605  * recomputation of dependencies, break lock if required, and return
1606  * 1 if request cannot have any more relationship with the next
1607  * active locks.
1608  * The 'lock' and 'request' are compared and in case of overlap we
1609  * delete the 'lock' and form new locks to represent the non-overlapped
1610  * portion of original 'lock'. This function has side effects such as
1611  * 'lock' will be freed, new locks will be added to the active list.
1612  */
1613 
1614 static int
flk_relation(lock_descriptor_t * lock,lock_descriptor_t * request)1615 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1616 {
1617 	int lock_effect;
1618 	lock_descriptor_t *lock1, *lock2;
1619 	lock_descriptor_t *topology[3];
1620 	int nvertex = 0;
1621 	int i;
1622 	edge_t	*ep;
1623 	graph_t	*gp = (lock->l_graph);
1624 
1625 
1626 	CHECK_SLEEPING_LOCKS(gp);
1627 	CHECK_ACTIVE_LOCKS(gp);
1628 
1629 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1630 
1631 	topology[0] = topology[1] = topology[2] = NULL;
1632 
1633 	if (request->l_type == F_UNLCK)
1634 		lock_effect = FLK_UNLOCK;
1635 	else if (request->l_type == F_RDLCK &&
1636 	    lock->l_type == F_WRLCK)
1637 		lock_effect = FLK_DOWNGRADE;
1638 	else if (request->l_type == F_WRLCK &&
1639 	    lock->l_type == F_RDLCK)
1640 		lock_effect = FLK_UPGRADE;
1641 	else
1642 		lock_effect = FLK_STAY_SAME;
1643 
1644 	if (lock->l_end < request->l_start) {
1645 		if (lock->l_end == request->l_start - 1 &&
1646 		    lock_effect == FLK_STAY_SAME) {
1647 			topology[0] = request;
1648 			request->l_start = lock->l_start;
1649 			nvertex = 1;
1650 			goto recompute;
1651 		} else {
1652 			return (0);
1653 		}
1654 	}
1655 
1656 	if (lock->l_start > request->l_end) {
1657 		if (request->l_end == lock->l_start - 1 &&
1658 		    lock_effect == FLK_STAY_SAME) {
1659 			topology[0] = request;
1660 			request->l_end = lock->l_end;
1661 			nvertex = 1;
1662 			goto recompute;
1663 		} else {
1664 			return (1);
1665 		}
1666 	}
1667 
1668 	if (request->l_end < lock->l_end) {
1669 		if (request->l_start > lock->l_start) {
1670 			if (lock_effect == FLK_STAY_SAME) {
1671 				request->l_start = lock->l_start;
1672 				request->l_end = lock->l_end;
1673 				topology[0] = request;
1674 				nvertex = 1;
1675 			} else {
1676 				lock1 = flk_get_lock();
1677 				lock2 = flk_get_lock();
1678 				COPY(lock1, lock);
1679 				COPY(lock2, lock);
1680 				lock1->l_start = lock->l_start;
1681 				lock1->l_end = request->l_start - 1;
1682 				lock2->l_start = request->l_end + 1;
1683 				lock2->l_end = lock->l_end;
1684 				topology[0] = lock1;
1685 				topology[1] = lock2;
1686 				topology[2] = request;
1687 				nvertex = 3;
1688 			}
1689 		} else if (request->l_start < lock->l_start) {
1690 			if (lock_effect == FLK_STAY_SAME) {
1691 				request->l_end = lock->l_end;
1692 				topology[0] = request;
1693 				nvertex = 1;
1694 			} else {
1695 				lock1 = flk_get_lock();
1696 				COPY(lock1, lock);
1697 				lock1->l_start = request->l_end + 1;
1698 				topology[0] = lock1;
1699 				topology[1] = request;
1700 				nvertex = 2;
1701 			}
1702 		} else  {
1703 			if (lock_effect == FLK_STAY_SAME) {
1704 				request->l_start = lock->l_start;
1705 				request->l_end = lock->l_end;
1706 				topology[0] = request;
1707 				nvertex = 1;
1708 			} else {
1709 				lock1 = flk_get_lock();
1710 				COPY(lock1, lock);
1711 				lock1->l_start = request->l_end + 1;
1712 				topology[0] = lock1;
1713 				topology[1] = request;
1714 				nvertex = 2;
1715 			}
1716 		}
1717 	} else if (request->l_end > lock->l_end) {
1718 		if (request->l_start > lock->l_start)  {
1719 			if (lock_effect == FLK_STAY_SAME) {
1720 				request->l_start = lock->l_start;
1721 				topology[0] = request;
1722 				nvertex = 1;
1723 			} else {
1724 				lock1 = flk_get_lock();
1725 				COPY(lock1, lock);
1726 				lock1->l_end = request->l_start - 1;
1727 				topology[0] = lock1;
1728 				topology[1] = request;
1729 				nvertex = 2;
1730 			}
1731 		} else if (request->l_start < lock->l_start)  {
1732 			topology[0] = request;
1733 			nvertex = 1;
1734 		} else {
1735 			topology[0] = request;
1736 			nvertex = 1;
1737 		}
1738 	} else {
1739 		if (request->l_start > lock->l_start) {
1740 			if (lock_effect == FLK_STAY_SAME) {
1741 				request->l_start = lock->l_start;
1742 				topology[0] = request;
1743 				nvertex = 1;
1744 			} else {
1745 				lock1 = flk_get_lock();
1746 				COPY(lock1, lock);
1747 				lock1->l_end = request->l_start - 1;
1748 				topology[0] = lock1;
1749 				topology[1] = request;
1750 				nvertex = 2;
1751 			}
1752 		} else if (request->l_start < lock->l_start) {
1753 			topology[0] = request;
1754 			nvertex = 1;
1755 		} else {
1756 			if (lock_effect !=  FLK_UNLOCK) {
1757 				topology[0] = request;
1758 				nvertex = 1;
1759 			} else {
1760 				flk_delete_active_lock(lock, 0);
1761 				flk_wakeup(lock, 1);
1762 				flk_free_lock(lock);
1763 				CHECK_SLEEPING_LOCKS(gp);
1764 				CHECK_ACTIVE_LOCKS(gp);
1765 				return (1);
1766 			}
1767 		}
1768 	}
1769 
1770 recompute:
1771 
1772 	/*
1773 	 * For unlock we don't send the 'request' to for recomputing
1774 	 * dependencies because no lock will add an edge to this.
1775 	 */
1776 
1777 	if (lock_effect == FLK_UNLOCK) {
1778 		topology[nvertex-1] = NULL;
1779 		nvertex--;
1780 	}
1781 	for (i = 0; i < nvertex; i++) {
1782 		topology[i]->l_state |= RECOMPUTE_LOCK;
1783 		topology[i]->l_color = NO_COLOR;
1784 	}
1785 
1786 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1787 
1788 	/*
1789 	 * we remove the adjacent edges for all vertices' to this vertex
1790 	 * 'lock'.
1791 	 */
1792 
1793 	ep = FIRST_IN(lock);
1794 	while (ep != HEAD(lock)) {
1795 		ADJ_LIST_REMOVE(ep);
1796 		ep = NEXT_IN(ep);
1797 	}
1798 
1799 	flk_delete_active_lock(lock, 0);
1800 
1801 	/* We are ready for recomputing the dependencies now */
1802 
1803 	flk_recompute_dependencies(lock, topology, nvertex, 1);
1804 
1805 	for (i = 0; i < nvertex; i++) {
1806 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1807 		topology[i]->l_color = NO_COLOR;
1808 	}
1809 
1810 
1811 	if (lock_effect == FLK_UNLOCK) {
1812 		nvertex++;
1813 	}
1814 	for (i = 0; i < nvertex - 1; i++) {
1815 		flk_insert_active_lock(topology[i]);
1816 	}
1817 
1818 
1819 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1820 		flk_wakeup(lock, 0);
1821 	} else {
1822 		ep = FIRST_IN(lock);
1823 		while (ep != HEAD(lock)) {
1824 			lock->l_sedge = NEXT_IN(ep);
1825 			IN_LIST_REMOVE(ep);
1826 			flk_update_proc_graph(ep, 1);
1827 			flk_free_edge(ep);
1828 			ep = lock->l_sedge;
1829 		}
1830 	}
1831 	flk_free_lock(lock);
1832 
1833 	CHECK_SLEEPING_LOCKS(gp);
1834 	CHECK_ACTIVE_LOCKS(gp);
1835 	return (0);
1836 }
1837 
1838 /*
1839  * Insert a lock into the active queue.
1840  */
1841 
1842 static void
flk_insert_active_lock(lock_descriptor_t * new_lock)1843 flk_insert_active_lock(lock_descriptor_t *new_lock)
1844 {
1845 	graph_t	*gp = new_lock->l_graph;
1846 	vnode_t	*vp = new_lock->l_vnode;
1847 	lock_descriptor_t *first_lock, *lock;
1848 
1849 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1850 
1851 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1852 	first_lock = lock;
1853 
1854 	if (first_lock != NULL) {
1855 		for (; (lock->l_vnode == vp &&
1856 		    lock->l_start < new_lock->l_start); lock = lock->l_next)
1857 			;
1858 	} else {
1859 		lock = ACTIVE_HEAD(gp);
1860 	}
1861 
1862 	lock->l_prev->l_next = new_lock;
1863 	new_lock->l_next = lock;
1864 	new_lock->l_prev = lock->l_prev;
1865 	lock->l_prev = new_lock;
1866 
1867 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1868 		vp->v_filocks = (struct filock *)new_lock;
1869 	}
1870 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1871 	new_lock->l_state |= ACTIVE_LOCK;
1872 
1873 	CHECK_ACTIVE_LOCKS(gp);
1874 	CHECK_SLEEPING_LOCKS(gp);
1875 }
1876 
1877 /*
1878  * Delete the active lock : Performs two functions depending on the
1879  * value of second parameter. One is to remove from the active lists
1880  * only and other is to both remove and free the lock.
1881  */
1882 
1883 static void
flk_delete_active_lock(lock_descriptor_t * lock,int free_lock)1884 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1885 {
1886 	vnode_t *vp = lock->l_vnode;
1887 	graph_t	*gp = lock->l_graph;
1888 
1889 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1890 	if (free_lock)
1891 		ASSERT(NO_DEPENDENTS(lock));
1892 	ASSERT(NOT_BLOCKED(lock));
1893 	ASSERT(IS_ACTIVE(lock));
1894 
1895 	ASSERT((vp->v_filocks != NULL));
1896 
1897 	if (vp->v_filocks == (struct filock *)lock) {
1898 		vp->v_filocks = (struct filock *)
1899 		    ((lock->l_next->l_vnode == vp) ? lock->l_next :
1900 		    NULL);
1901 	}
1902 	lock->l_next->l_prev = lock->l_prev;
1903 	lock->l_prev->l_next = lock->l_next;
1904 	lock->l_next = lock->l_prev = NULL;
1905 	flk_set_state(lock, FLK_DEAD_STATE);
1906 	lock->l_state &= ~ACTIVE_LOCK;
1907 
1908 	if (free_lock)
1909 		flk_free_lock(lock);
1910 	CHECK_ACTIVE_LOCKS(gp);
1911 	CHECK_SLEEPING_LOCKS(gp);
1912 }
1913 
1914 /*
1915  * Insert into the sleep queue.
1916  */
1917 
1918 static void
flk_insert_sleeping_lock(lock_descriptor_t * request)1919 flk_insert_sleeping_lock(lock_descriptor_t *request)
1920 {
1921 	graph_t *gp = request->l_graph;
1922 	vnode_t	*vp = request->l_vnode;
1923 	lock_descriptor_t	*lock;
1924 
1925 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1926 	ASSERT(IS_INITIAL(request));
1927 
1928 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1929 	    lock->l_vnode < vp); lock = lock->l_next)
1930 		;
1931 
1932 	lock->l_prev->l_next = request;
1933 	request->l_prev = lock->l_prev;
1934 	lock->l_prev = request;
1935 	request->l_next = lock;
1936 	flk_set_state(request, FLK_SLEEPING_STATE);
1937 	request->l_state |= SLEEPING_LOCK;
1938 }
1939 
1940 /*
1941  * Cancelling a sleeping lock implies removing a vertex from the
1942  * dependency graph and therefore we should recompute the dependencies
1943  * of all vertices that have a path  to this vertex, w.r.t. all
1944  * vertices reachable from this vertex.
1945  */
1946 
1947 void
flk_cancel_sleeping_lock(lock_descriptor_t * request,int remove_from_queue)1948 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1949 {
1950 	graph_t	*gp = request->l_graph;
1951 	vnode_t *vp = request->l_vnode;
1952 	lock_descriptor_t **topology = NULL;
1953 	edge_t	*ep;
1954 	lock_descriptor_t *vertex, *lock;
1955 	int nvertex = 0;
1956 	int i;
1957 	lock_descriptor_t *vertex_stack;
1958 
1959 	STACK_INIT(vertex_stack);
1960 
1961 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1962 	/*
1963 	 * count number of vertex pointers that has to be allocated
1964 	 * All vertices that are reachable from request.
1965 	 */
1966 
1967 	STACK_PUSH(vertex_stack, request, l_stack);
1968 
1969 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1970 		STACK_POP(vertex_stack, l_stack);
1971 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1972 		    ep = NEXT_ADJ(ep)) {
1973 			if (IS_RECOMPUTE(ep->to_vertex))
1974 				continue;
1975 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1976 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1977 			nvertex++;
1978 		}
1979 	}
1980 
1981 	/*
1982 	 * allocate memory for holding the vertex pointers
1983 	 */
1984 
1985 	if (nvertex) {
1986 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1987 		    KM_SLEEP);
1988 	}
1989 
1990 	/*
1991 	 * one more pass to actually store the vertices in the
1992 	 * allocated array.
1993 	 * We first check sleeping locks and then active locks
1994 	 * so that topology array will be in a topological
1995 	 * order.
1996 	 */
1997 
1998 	nvertex = 0;
1999 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2000 
2001 	if (lock) {
2002 		do {
2003 			if (IS_RECOMPUTE(lock)) {
2004 				lock->l_index = nvertex;
2005 				topology[nvertex++] = lock;
2006 			}
2007 			lock->l_color = NO_COLOR;
2008 			lock = lock->l_next;
2009 		} while (lock->l_vnode == vp);
2010 	}
2011 
2012 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2013 
2014 	if (lock) {
2015 		do {
2016 			if (IS_RECOMPUTE(lock)) {
2017 				lock->l_index = nvertex;
2018 				topology[nvertex++] = lock;
2019 			}
2020 			lock->l_color = NO_COLOR;
2021 			lock = lock->l_next;
2022 		} while (lock->l_vnode == vp);
2023 	}
2024 
2025 	/*
2026 	 * remove in and out edges of request
2027 	 * They are freed after updating proc_graph below.
2028 	 */
2029 
2030 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
2031 		ADJ_LIST_REMOVE(ep);
2032 	}
2033 
2034 
2035 	if (remove_from_queue)
2036 		REMOVE_SLEEP_QUEUE(request);
2037 
2038 	/* we are ready to recompute */
2039 
2040 	flk_recompute_dependencies(request, topology, nvertex, 1);
2041 
2042 	ep = FIRST_ADJ(request);
2043 	while (ep != HEAD(request)) {
2044 		IN_LIST_REMOVE(ep);
2045 		request->l_sedge = NEXT_ADJ(ep);
2046 		ADJ_LIST_REMOVE(ep);
2047 		flk_update_proc_graph(ep, 1);
2048 		flk_free_edge(ep);
2049 		ep = request->l_sedge;
2050 	}
2051 
2052 
2053 	/*
2054 	 * unset the RECOMPUTE flag in those vertices
2055 	 */
2056 
2057 	for (i = 0; i < nvertex; i++) {
2058 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
2059 	}
2060 
2061 	/*
2062 	 * free the topology
2063 	 */
2064 	if (nvertex)
2065 		kmem_free((void *)topology,
2066 		    (nvertex * sizeof (lock_descriptor_t *)));
2067 	/*
2068 	 * Possibility of some locks unblocked now
2069 	 */
2070 
2071 	flk_wakeup(request, 0);
2072 
2073 	/*
2074 	 * we expect to have a correctly recomputed graph  now.
2075 	 */
2076 	flk_set_state(request, FLK_DEAD_STATE);
2077 	flk_free_lock(request);
2078 	CHECK_SLEEPING_LOCKS(gp);
2079 	CHECK_ACTIVE_LOCKS(gp);
2080 
2081 }
2082 
2083 /*
2084  * Uncoloring the graph is simply to increment the mark value of the graph
2085  * And only when wrap round takes place will we color all vertices in
2086  * the graph explicitly.
2087  */
2088 
2089 static void
flk_graph_uncolor(graph_t * gp)2090 flk_graph_uncolor(graph_t *gp)
2091 {
2092 	lock_descriptor_t *lock;
2093 
2094 	if (gp->mark == UINT_MAX) {
2095 		gp->mark = 1;
2096 		for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
2097 		    lock = lock->l_next)
2098 			lock->l_color  = 0;
2099 
2100 		for (lock = SLEEPING_HEAD(gp)->l_next;
2101 		    lock != SLEEPING_HEAD(gp); lock = lock->l_next)
2102 			lock->l_color  = 0;
2103 	} else {
2104 		gp->mark++;
2105 	}
2106 }
2107 
2108 /*
2109  * Wake up locks that are blocked on the given lock.
2110  */
2111 
2112 static void
flk_wakeup(lock_descriptor_t * lock,int adj_list_remove)2113 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2114 {
2115 	edge_t	*ep;
2116 	graph_t	*gp = lock->l_graph;
2117 	lock_descriptor_t	*lck;
2118 
2119 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2120 	if (NO_DEPENDENTS(lock))
2121 		return;
2122 	ep = FIRST_IN(lock);
2123 	do {
2124 		/*
2125 		 * delete the edge from the adjacency list
2126 		 * of from vertex. if no more adjacent edges
2127 		 * for this vertex wake this process.
2128 		 */
2129 		lck = ep->from_vertex;
2130 		if (adj_list_remove)
2131 			ADJ_LIST_REMOVE(ep);
2132 		flk_update_proc_graph(ep, 1);
2133 		if (NOT_BLOCKED(lck)) {
2134 			GRANT_WAKEUP(lck);
2135 		}
2136 		lock->l_sedge = NEXT_IN(ep);
2137 		IN_LIST_REMOVE(ep);
2138 		flk_free_edge(ep);
2139 		ep = lock->l_sedge;
2140 	} while (ep != HEAD(lock));
2141 	ASSERT(NO_DEPENDENTS(lock));
2142 }
2143 
2144 /*
2145  * The dependents of request, is checked for its dependency against the
2146  * locks in topology (called topology because the array is and should be in
2147  * topological order for this algorithm, if not in topological order the
2148  * inner loop below might add more edges than necessary. Topological ordering
2149  * of vertices satisfies the property that all edges will be from left to
2150  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
2151  * If lock l1 in the dependent set of request is dependent (blocked by)
2152  * on lock l2 in topology but does not have a path to it, we add an edge
2153  * in the inner loop below.
2154  *
2155  * We don't want to add an edge between l1 and l2 if there exists
2156  * already a path from l1 to l2, so care has to be taken for those vertices
2157  * that  have two paths to 'request'. These vertices are referred to here
2158  * as barrier locks.
2159  *
2160  * The barriers has to be found (those vertex that originally had two paths
2161  * to request) because otherwise we may end up adding edges unnecessarily
2162  * to vertices in topology, and thus barrier vertices can have an edge
2163  * to a vertex in topology as well a path to it.
2164  */
2165 
2166 static void
flk_recompute_dependencies(lock_descriptor_t * request,lock_descriptor_t ** topology,int nvertex,int update_graph)2167 flk_recompute_dependencies(lock_descriptor_t *request,
2168     lock_descriptor_t **topology, int nvertex, int update_graph)
2169 {
2170 	lock_descriptor_t *vertex, *lock;
2171 	graph_t	*gp = request->l_graph;
2172 	int i, count;
2173 	int barrier_found = 0;
2174 	edge_t	*ep;
2175 	lock_descriptor_t *vertex_stack;
2176 
2177 	STACK_INIT(vertex_stack);
2178 
2179 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2180 	if (nvertex == 0)
2181 		return;
2182 	flk_graph_uncolor(request->l_graph);
2183 	barrier_found = flk_find_barriers(request);
2184 	request->l_state |= RECOMPUTE_DONE;
2185 
2186 	STACK_PUSH(vertex_stack, request, l_stack);
2187 	request->l_sedge = FIRST_IN(request);
2188 
2189 
2190 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2191 		if (vertex->l_state & RECOMPUTE_DONE) {
2192 			count = 0;
2193 			goto next_in_edge;
2194 		}
2195 		if (IS_BARRIER(vertex)) {
2196 			/* decrement the barrier count */
2197 			if (vertex->l_index) {
2198 				vertex->l_index--;
2199 				/* this guy will be pushed again anyway ? */
2200 				STACK_POP(vertex_stack, l_stack);
2201 				if (vertex->l_index == 0)  {
2202 				/*
2203 				 * barrier is over we can recompute
2204 				 * dependencies for this lock in the
2205 				 * next stack pop
2206 				 */
2207 					vertex->l_state &= ~BARRIER_LOCK;
2208 				}
2209 				continue;
2210 			}
2211 		}
2212 		vertex->l_state |= RECOMPUTE_DONE;
2213 		flk_graph_uncolor(gp);
2214 		count = flk_color_reachables(vertex);
2215 		for (i = 0; i < nvertex; i++) {
2216 			lock = topology[i];
2217 			if (COLORED(lock))
2218 				continue;
2219 			if (BLOCKS(lock, vertex)) {
2220 				(void) flk_add_edge(vertex, lock,
2221 				    NO_CHECK_CYCLE, update_graph);
2222 				COLOR(lock);
2223 				count++;
2224 				count += flk_color_reachables(lock);
2225 			}
2226 
2227 		}
2228 
2229 next_in_edge:
2230 		if (count == nvertex ||
2231 		    vertex->l_sedge == HEAD(vertex)) {
2232 			/* prune the tree below this */
2233 			STACK_POP(vertex_stack, l_stack);
2234 			vertex->l_state &= ~RECOMPUTE_DONE;
2235 			/* update the barrier locks below this! */
2236 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2237 				flk_graph_uncolor(gp);
2238 				flk_update_barriers(vertex);
2239 			}
2240 			continue;
2241 		}
2242 
2243 		ep = vertex->l_sedge;
2244 		lock = ep->from_vertex;
2245 		STACK_PUSH(vertex_stack, lock, l_stack);
2246 		lock->l_sedge = FIRST_IN(lock);
2247 		vertex->l_sedge = NEXT_IN(ep);
2248 	}
2249 
2250 }
2251 
2252 /*
2253  * Color all reachable vertices from vertex that belongs to topology (here
2254  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
2255  *
2256  * Note: we need to use a different stack_link l_stack1 because this is
2257  * called from flk_recompute_dependencies() that already uses a stack with
2258  * l_stack as stack_link.
2259  */
2260 
2261 static int
flk_color_reachables(lock_descriptor_t * vertex)2262 flk_color_reachables(lock_descriptor_t *vertex)
2263 {
2264 	lock_descriptor_t *ver, *lock;
2265 	int count;
2266 	edge_t	*ep;
2267 	lock_descriptor_t *vertex_stack;
2268 
2269 	STACK_INIT(vertex_stack);
2270 
2271 	STACK_PUSH(vertex_stack, vertex, l_stack1);
2272 	count = 0;
2273 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2274 
2275 		STACK_POP(vertex_stack, l_stack1);
2276 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2277 		    ep = NEXT_ADJ(ep)) {
2278 			lock = ep->to_vertex;
2279 			if (COLORED(lock))
2280 				continue;
2281 			COLOR(lock);
2282 			if (IS_RECOMPUTE(lock))
2283 				count++;
2284 			STACK_PUSH(vertex_stack, lock, l_stack1);
2285 		}
2286 
2287 	}
2288 	return (count);
2289 }
2290 
2291 /*
2292  * Called from flk_recompute_dependencies() this routine decrements
2293  * the barrier count of barrier vertices that are reachable from lock.
2294  */
2295 
2296 static void
flk_update_barriers(lock_descriptor_t * lock)2297 flk_update_barriers(lock_descriptor_t *lock)
2298 {
2299 	lock_descriptor_t *vertex, *lck;
2300 	edge_t	*ep;
2301 	lock_descriptor_t *vertex_stack;
2302 
2303 	STACK_INIT(vertex_stack);
2304 
2305 	STACK_PUSH(vertex_stack, lock, l_stack1);
2306 
2307 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2308 		STACK_POP(vertex_stack, l_stack1);
2309 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2310 		    ep = NEXT_IN(ep)) {
2311 			lck = ep->from_vertex;
2312 			if (COLORED(lck)) {
2313 				if (IS_BARRIER(lck)) {
2314 					ASSERT(lck->l_index > 0);
2315 					lck->l_index--;
2316 					if (lck->l_index == 0)
2317 						lck->l_state &= ~BARRIER_LOCK;
2318 				}
2319 				continue;
2320 			}
2321 			COLOR(lck);
2322 			if (IS_BARRIER(lck)) {
2323 				ASSERT(lck->l_index > 0);
2324 				lck->l_index--;
2325 				if (lck->l_index == 0)
2326 					lck->l_state &= ~BARRIER_LOCK;
2327 			}
2328 			STACK_PUSH(vertex_stack, lck, l_stack1);
2329 		}
2330 	}
2331 }
2332 
2333 /*
2334  * Finds all vertices that are reachable from 'lock' more than once and
2335  * mark them as barrier vertices and increment their barrier count.
2336  * The barrier count is one minus the total number of paths from lock
2337  * to that vertex.
2338  */
2339 
2340 static int
flk_find_barriers(lock_descriptor_t * lock)2341 flk_find_barriers(lock_descriptor_t *lock)
2342 {
2343 	lock_descriptor_t *vertex, *lck;
2344 	int found = 0;
2345 	edge_t	*ep;
2346 	lock_descriptor_t *vertex_stack;
2347 
2348 	STACK_INIT(vertex_stack);
2349 
2350 	STACK_PUSH(vertex_stack, lock, l_stack1);
2351 
2352 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2353 		STACK_POP(vertex_stack, l_stack1);
2354 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2355 		    ep = NEXT_IN(ep)) {
2356 			lck = ep->from_vertex;
2357 			if (COLORED(lck)) {
2358 				/* this is a barrier */
2359 				lck->l_state |= BARRIER_LOCK;
2360 				/* index will have barrier count */
2361 				lck->l_index++;
2362 				if (!found)
2363 					found = 1;
2364 				continue;
2365 			}
2366 			COLOR(lck);
2367 			lck->l_index = 0;
2368 			STACK_PUSH(vertex_stack, lck, l_stack1);
2369 		}
2370 	}
2371 	return (found);
2372 }
2373 
2374 /*
2375  * Finds the first lock that is mainly responsible for blocking this
2376  * request.  If there is no such lock, request->l_flock.l_type is set to
2377  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2378  * of the blocking lock.
2379  *
2380  * Note: It is possible a request is blocked by a sleeping lock because
2381  * of the fairness policy used in flk_process_request() to construct the
2382  * dependencies. (see comments before flk_process_request()).
2383  */
2384 
2385 static void
flk_get_first_blocking_lock(lock_descriptor_t * request)2386 flk_get_first_blocking_lock(lock_descriptor_t *request)
2387 {
2388 	graph_t	*gp = request->l_graph;
2389 	vnode_t *vp = request->l_vnode;
2390 	lock_descriptor_t *lock, *blocker;
2391 
2392 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2393 	blocker = NULL;
2394 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2395 
2396 	if (lock) {
2397 		do {
2398 			if (BLOCKS(lock, request)) {
2399 				blocker = lock;
2400 				break;
2401 			}
2402 			lock = lock->l_next;
2403 		} while (lock->l_vnode == vp);
2404 	}
2405 
2406 	if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2407 		/*
2408 		 * No active lock is blocking this request, but if a read
2409 		 * lock is requested, it may also get blocked by a waiting
2410 		 * writer. So search all sleeping locks and see if there is
2411 		 * a writer waiting.
2412 		 */
2413 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2414 		if (lock) {
2415 			do {
2416 				if (BLOCKS(lock, request)) {
2417 					blocker = lock;
2418 					break;
2419 				}
2420 				lock = lock->l_next;
2421 			} while (lock->l_vnode == vp);
2422 		}
2423 	}
2424 
2425 	if (blocker) {
2426 		report_blocker(blocker, request);
2427 	} else
2428 		request->l_flock.l_type = F_UNLCK;
2429 }
2430 
2431 /*
2432  * Get the graph_t structure associated with a vnode.
2433  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2434  * not yet been initialized, then a new element is allocated and returned.
2435  */
2436 graph_t *
flk_get_lock_graph(vnode_t * vp,int initialize)2437 flk_get_lock_graph(vnode_t *vp, int initialize)
2438 {
2439 	graph_t *gp;
2440 	graph_t *gp_alloc = NULL;
2441 	int index = HASH_INDEX(vp);
2442 
2443 	if (initialize == FLK_USE_GRAPH) {
2444 		mutex_enter(&flock_lock);
2445 		gp = lock_graph[index];
2446 		mutex_exit(&flock_lock);
2447 		return (gp);
2448 	}
2449 
2450 	ASSERT(initialize == FLK_INIT_GRAPH);
2451 
2452 	if (lock_graph[index] == NULL) {
2453 
2454 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2455 
2456 		/* Initialize the graph */
2457 
2458 		gp_alloc->active_locks.l_next =
2459 		    gp_alloc->active_locks.l_prev =
2460 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2461 		gp_alloc->sleeping_locks.l_next =
2462 		    gp_alloc->sleeping_locks.l_prev =
2463 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2464 		gp_alloc->index = index;
2465 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2466 	}
2467 
2468 	mutex_enter(&flock_lock);
2469 
2470 	gp = lock_graph[index];
2471 
2472 	/* Recheck the value within flock_lock */
2473 	if (gp == NULL) {
2474 		struct flock_globals *fg;
2475 
2476 		/* We must have previously allocated the graph_t structure */
2477 		ASSERT(gp_alloc != NULL);
2478 		lock_graph[index] = gp = gp_alloc;
2479 		/*
2480 		 * The lockmgr status is only needed if KLM is loaded.
2481 		 */
2482 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2483 			fg = flk_get_globals();
2484 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2485 		}
2486 	}
2487 
2488 	mutex_exit(&flock_lock);
2489 
2490 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2491 		/* There was a race to allocate the graph_t and we lost */
2492 		mutex_destroy(&gp_alloc->gp_mutex);
2493 		kmem_free(gp_alloc, sizeof (graph_t));
2494 	}
2495 
2496 	return (gp);
2497 }
2498 
2499 /*
2500  * PSARC case 1997/292
2501  */
2502 int
cl_flk_has_remote_locks_for_nlmid(vnode_t * vp,int nlmid)2503 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2504 {
2505 	lock_descriptor_t *lock;
2506 	int result = 0;
2507 	graph_t *gp;
2508 	int			lock_nlmid;
2509 
2510 	/*
2511 	 * Check to see if node is booted as a cluster. If not, return.
2512 	 */
2513 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2514 		return (0);
2515 	}
2516 
2517 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2518 	if (gp == NULL) {
2519 		return (0);
2520 	}
2521 
2522 	mutex_enter(&gp->gp_mutex);
2523 
2524 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2525 
2526 	if (lock) {
2527 		while (lock->l_vnode == vp) {
2528 			/* get NLM id from sysid */
2529 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2530 
2531 			/*
2532 			 * If NLM server request _and_ nlmid of lock matches
2533 			 * nlmid of argument, then we've found a remote lock.
2534 			 */
2535 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2536 				result = 1;
2537 				goto done;
2538 			}
2539 			lock = lock->l_next;
2540 		}
2541 	}
2542 
2543 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2544 
2545 	if (lock) {
2546 		while (lock->l_vnode == vp) {
2547 			/* get NLM id from sysid */
2548 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2549 
2550 			/*
2551 			 * If NLM server request _and_ nlmid of lock matches
2552 			 * nlmid of argument, then we've found a remote lock.
2553 			 */
2554 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2555 				result = 1;
2556 				goto done;
2557 			}
2558 			lock = lock->l_next;
2559 		}
2560 	}
2561 
2562 done:
2563 	mutex_exit(&gp->gp_mutex);
2564 	return (result);
2565 }
2566 
2567 /*
2568  * Determine whether there are any locks for the given vnode with a remote
2569  * sysid.  Returns zero if not, non-zero if there are.
2570  *
2571  * Note that the return value from this function is potentially invalid
2572  * once it has been returned.  The caller is responsible for providing its
2573  * own synchronization mechanism to ensure that the return value is useful
2574  * (e.g., see nfs_lockcompletion()).
2575  */
2576 int
flk_has_remote_locks(vnode_t * vp)2577 flk_has_remote_locks(vnode_t *vp)
2578 {
2579 	lock_descriptor_t *lock;
2580 	int result = 0;
2581 	graph_t *gp;
2582 
2583 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2584 	if (gp == NULL) {
2585 		return (0);
2586 	}
2587 
2588 	mutex_enter(&gp->gp_mutex);
2589 
2590 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2591 
2592 	if (lock) {
2593 		while (lock->l_vnode == vp) {
2594 			if (IS_REMOTE(lock)) {
2595 				result = 1;
2596 				goto done;
2597 			}
2598 			lock = lock->l_next;
2599 		}
2600 	}
2601 
2602 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2603 
2604 	if (lock) {
2605 		while (lock->l_vnode == vp) {
2606 			if (IS_REMOTE(lock)) {
2607 				result = 1;
2608 				goto done;
2609 			}
2610 			lock = lock->l_next;
2611 		}
2612 	}
2613 
2614 done:
2615 	mutex_exit(&gp->gp_mutex);
2616 	return (result);
2617 }
2618 
2619 /*
2620  * Determine whether there are any locks for the given vnode with a remote
2621  * sysid matching given sysid.
2622  * Used by the new (open source) NFS Lock Manager (NLM)
2623  */
2624 int
flk_has_remote_locks_for_sysid(vnode_t * vp,int sysid)2625 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2626 {
2627 	lock_descriptor_t *lock;
2628 	int result = 0;
2629 	graph_t *gp;
2630 
2631 	if (sysid == 0)
2632 		return (0);
2633 
2634 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2635 	if (gp == NULL) {
2636 		return (0);
2637 	}
2638 
2639 	mutex_enter(&gp->gp_mutex);
2640 
2641 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2642 
2643 	if (lock) {
2644 		while (lock->l_vnode == vp) {
2645 			if (lock->l_flock.l_sysid == sysid) {
2646 				result = 1;
2647 				goto done;
2648 			}
2649 			lock = lock->l_next;
2650 		}
2651 	}
2652 
2653 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2654 
2655 	if (lock) {
2656 		while (lock->l_vnode == vp) {
2657 			if (lock->l_flock.l_sysid == sysid) {
2658 				result = 1;
2659 				goto done;
2660 			}
2661 			lock = lock->l_next;
2662 		}
2663 	}
2664 
2665 done:
2666 	mutex_exit(&gp->gp_mutex);
2667 	return (result);
2668 }
2669 
2670 /*
2671  * Determine if there are any locks owned by the given sysid.
2672  * Returns zero if not, non-zero if there are.  Note that this return code
2673  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2674  * avoids all the memory allocations of those routines.
2675  *
2676  * This routine has the same synchronization issues as
2677  * flk_has_remote_locks.
2678  */
2679 
2680 int
flk_sysid_has_locks(int sysid,int lck_type)2681 flk_sysid_has_locks(int sysid, int lck_type)
2682 {
2683 	int		has_locks = 0;
2684 	lock_descriptor_t	*lock;
2685 	graph_t 	*gp;
2686 	int		i;
2687 
2688 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2689 		mutex_enter(&flock_lock);
2690 		gp = lock_graph[i];
2691 		mutex_exit(&flock_lock);
2692 		if (gp == NULL) {
2693 			continue;
2694 		}
2695 
2696 		mutex_enter(&gp->gp_mutex);
2697 
2698 		if (lck_type & FLK_QUERY_ACTIVE) {
2699 			for (lock = ACTIVE_HEAD(gp)->l_next;
2700 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2701 			    lock = lock->l_next) {
2702 				if (lock->l_flock.l_sysid == sysid)
2703 					has_locks = 1;
2704 			}
2705 		}
2706 
2707 		if (lck_type & FLK_QUERY_SLEEPING) {
2708 			for (lock = SLEEPING_HEAD(gp)->l_next;
2709 			    lock != SLEEPING_HEAD(gp) && !has_locks;
2710 			    lock = lock->l_next) {
2711 				if (lock->l_flock.l_sysid == sysid)
2712 					has_locks = 1;
2713 			}
2714 		}
2715 		mutex_exit(&gp->gp_mutex);
2716 	}
2717 
2718 	return (has_locks);
2719 }
2720 
2721 
2722 /*
2723  * PSARC case 1997/292
2724  *
2725  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2726  *  quantity, the real sysid generated by the NLM server; the upper half
2727  *  identifies the node of the cluster where the NLM server ran.
2728  *  This routine is only called by an NLM server running in a cluster.
2729  * Effects: Remove all locks held on behalf of the client identified
2730  *  by "sysid."
2731  */
2732 void
cl_flk_remove_locks_by_sysid(int sysid)2733 cl_flk_remove_locks_by_sysid(int sysid)
2734 {
2735 	graph_t	*gp;
2736 	int i;
2737 	lock_descriptor_t *lock, *nlock;
2738 
2739 	/*
2740 	 * Check to see if node is booted as a cluster. If not, return.
2741 	 */
2742 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2743 		return;
2744 	}
2745 
2746 	ASSERT(sysid != 0);
2747 	for (i = 0; i < HASH_SIZE; i++) {
2748 		mutex_enter(&flock_lock);
2749 		gp = lock_graph[i];
2750 		mutex_exit(&flock_lock);
2751 
2752 		if (gp == NULL)
2753 			continue;
2754 
2755 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2756 
2757 		/* signal sleeping requests so that they bail out */
2758 		lock = SLEEPING_HEAD(gp)->l_next;
2759 		while (lock != SLEEPING_HEAD(gp)) {
2760 			nlock = lock->l_next;
2761 			if (lock->l_flock.l_sysid == sysid) {
2762 				INTERRUPT_WAKEUP(lock);
2763 			}
2764 			lock = nlock;
2765 		}
2766 
2767 		/* delete active locks */
2768 		lock = ACTIVE_HEAD(gp)->l_next;
2769 		while (lock != ACTIVE_HEAD(gp)) {
2770 			nlock = lock->l_next;
2771 			if (lock->l_flock.l_sysid == sysid) {
2772 				flk_delete_active_lock(lock, 0);
2773 				flk_wakeup(lock, 1);
2774 				flk_free_lock(lock);
2775 			}
2776 			lock = nlock;
2777 		}
2778 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2779 	}
2780 }
2781 
2782 /*
2783  * Delete all locks in the system that belongs to the sysid of the request.
2784  */
2785 
2786 static void
flk_delete_locks_by_sysid(lock_descriptor_t * request)2787 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2788 {
2789 	int	sysid  = request->l_flock.l_sysid;
2790 	lock_descriptor_t *lock, *nlock;
2791 	graph_t	*gp;
2792 	int i;
2793 
2794 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2795 	ASSERT(sysid != 0);
2796 
2797 	mutex_exit(&request->l_graph->gp_mutex);
2798 
2799 	for (i = 0; i < HASH_SIZE; i++) {
2800 		mutex_enter(&flock_lock);
2801 		gp = lock_graph[i];
2802 		mutex_exit(&flock_lock);
2803 
2804 		if (gp == NULL)
2805 			continue;
2806 
2807 		mutex_enter(&gp->gp_mutex);
2808 
2809 		/* signal sleeping requests so that they bail out */
2810 		lock = SLEEPING_HEAD(gp)->l_next;
2811 		while (lock != SLEEPING_HEAD(gp)) {
2812 			nlock = lock->l_next;
2813 			if (lock->l_flock.l_sysid == sysid) {
2814 				INTERRUPT_WAKEUP(lock);
2815 			}
2816 			lock = nlock;
2817 		}
2818 
2819 		/* delete active locks */
2820 		lock = ACTIVE_HEAD(gp)->l_next;
2821 		while (lock != ACTIVE_HEAD(gp)) {
2822 			nlock = lock->l_next;
2823 			if (lock->l_flock.l_sysid == sysid) {
2824 				flk_delete_active_lock(lock, 0);
2825 				flk_wakeup(lock, 1);
2826 				flk_free_lock(lock);
2827 			}
2828 			lock = nlock;
2829 		}
2830 		mutex_exit(&gp->gp_mutex);
2831 	}
2832 
2833 	mutex_enter(&request->l_graph->gp_mutex);
2834 }
2835 
2836 /*
2837  * Clustering: Deletes PXFS locks
2838  * Effects: Delete all locks on files in the given file system and with the
2839  *  given PXFS id.
2840  */
2841 void
cl_flk_delete_pxfs_locks(struct vfs * vfsp,int pxfsid)2842 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2843 {
2844 	lock_descriptor_t *lock, *nlock;
2845 	graph_t	*gp;
2846 	int i;
2847 
2848 	for (i = 0; i < HASH_SIZE; i++) {
2849 		mutex_enter(&flock_lock);
2850 		gp = lock_graph[i];
2851 		mutex_exit(&flock_lock);
2852 
2853 		if (gp == NULL)
2854 			continue;
2855 
2856 		mutex_enter(&gp->gp_mutex);
2857 
2858 		/* signal sleeping requests so that they bail out */
2859 		lock = SLEEPING_HEAD(gp)->l_next;
2860 		while (lock != SLEEPING_HEAD(gp)) {
2861 			nlock = lock->l_next;
2862 			if (lock->l_vnode->v_vfsp == vfsp) {
2863 				ASSERT(IS_PXFS(lock));
2864 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2865 				    pxfsid) {
2866 					flk_set_state(lock,
2867 					    FLK_CANCELLED_STATE);
2868 					flk_cancel_sleeping_lock(lock, 1);
2869 				}
2870 			}
2871 			lock = nlock;
2872 		}
2873 
2874 		/* delete active locks */
2875 		lock = ACTIVE_HEAD(gp)->l_next;
2876 		while (lock != ACTIVE_HEAD(gp)) {
2877 			nlock = lock->l_next;
2878 			if (lock->l_vnode->v_vfsp == vfsp) {
2879 				ASSERT(IS_PXFS(lock));
2880 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2881 				    pxfsid) {
2882 					flk_delete_active_lock(lock, 0);
2883 					flk_wakeup(lock, 1);
2884 					flk_free_lock(lock);
2885 				}
2886 			}
2887 			lock = nlock;
2888 		}
2889 		mutex_exit(&gp->gp_mutex);
2890 	}
2891 }
2892 
2893 /*
2894  * Search for a sleeping lock manager lock which matches exactly this lock
2895  * request; if one is found, fake a signal to cancel it.
2896  *
2897  * Return 1 if a matching lock was found, 0 otherwise.
2898  */
2899 
2900 static int
flk_canceled(lock_descriptor_t * request)2901 flk_canceled(lock_descriptor_t *request)
2902 {
2903 	lock_descriptor_t *lock, *nlock;
2904 	graph_t *gp = request->l_graph;
2905 	vnode_t *vp = request->l_vnode;
2906 
2907 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2908 	ASSERT(IS_LOCKMGR(request));
2909 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2910 
2911 	if (lock) {
2912 		while (lock->l_vnode == vp) {
2913 			nlock = lock->l_next;
2914 			if (SAME_OWNER(lock, request) &&
2915 			    lock->l_start == request->l_start &&
2916 			    lock->l_end == request->l_end) {
2917 				INTERRUPT_WAKEUP(lock);
2918 				return (1);
2919 			}
2920 			lock = nlock;
2921 		}
2922 	}
2923 	return (0);
2924 }
2925 
2926 /*
2927  * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
2928  * That is, since OFD locks are pid-less we'll never match on the incoming
2929  * pid. OFD locks are removed earlier in the close() path via closef() and
2930  * ofdcleanlock().
2931  */
2932 void
cleanlocks(vnode_t * vp,pid_t pid,int sysid)2933 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2934 {
2935 	graph_t	*gp;
2936 	lock_descriptor_t *lock, *nlock;
2937 	lock_descriptor_t *link_stack;
2938 
2939 	STACK_INIT(link_stack);
2940 
2941 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2942 
2943 	if (gp == NULL)
2944 		return;
2945 	mutex_enter(&gp->gp_mutex);
2946 
2947 	CHECK_SLEEPING_LOCKS(gp);
2948 	CHECK_ACTIVE_LOCKS(gp);
2949 
2950 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2951 
2952 	if (lock) {
2953 		do {
2954 			nlock = lock->l_next;
2955 			if ((lock->l_flock.l_pid == pid ||
2956 			    pid == IGN_PID) &&
2957 			    lock->l_flock.l_sysid == sysid) {
2958 				CANCEL_WAKEUP(lock);
2959 			}
2960 			lock = nlock;
2961 		} while (lock->l_vnode == vp);
2962 	}
2963 
2964 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2965 
2966 	if (lock) {
2967 		do {
2968 			nlock = lock->l_next;
2969 			if ((lock->l_flock.l_pid == pid ||
2970 			    pid == IGN_PID) &&
2971 			    lock->l_flock.l_sysid == sysid) {
2972 				flk_delete_active_lock(lock, 0);
2973 				STACK_PUSH(link_stack, lock, l_stack);
2974 			}
2975 			lock = nlock;
2976 		} while (lock->l_vnode == vp);
2977 	}
2978 
2979 	while ((lock = STACK_TOP(link_stack)) != NULL) {
2980 		STACK_POP(link_stack, l_stack);
2981 		flk_wakeup(lock, 1);
2982 		flk_free_lock(lock);
2983 	}
2984 
2985 	CHECK_SLEEPING_LOCKS(gp);
2986 	CHECK_ACTIVE_LOCKS(gp);
2987 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2988 	mutex_exit(&gp->gp_mutex);
2989 }
2990 
2991 
2992 /*
2993  * Called from 'fs' read and write routines for files that have mandatory
2994  * locking enabled.
2995  */
2996 
2997 int
chklock(struct vnode * vp,int iomode,u_offset_t offset,ssize_t len,int fmode,caller_context_t * ct)2998 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
2999     caller_context_t *ct)
3000 {
3001 	register int	i;
3002 	struct flock64 	bf;
3003 	int 		error = 0;
3004 
3005 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3006 	bf.l_whence = 0;
3007 	bf.l_start = offset;
3008 	bf.l_len = len;
3009 	if (ct == NULL) {
3010 		bf.l_pid = curproc->p_pid;
3011 		bf.l_sysid = 0;
3012 	} else {
3013 		bf.l_pid = ct->cc_pid;
3014 		bf.l_sysid = ct->cc_sysid;
3015 	}
3016 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3017 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3018 	    bf.l_type != F_UNLCK)
3019 		error = i ? i : EAGAIN;
3020 	return (error);
3021 }
3022 
3023 /*
3024  * convoff - converts the given data (start, whence) to the
3025  * given whence.
3026  */
3027 int
convoff(struct vnode * vp,struct flock64 * lckdat,int whence,offset_t offset)3028 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
3029 {
3030 	int 		error;
3031 	struct vattr 	vattr;
3032 
3033 	if ((lckdat->l_whence == 2) || (whence == 2)) {
3034 		vattr.va_mask = AT_SIZE;
3035 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3036 			return (error);
3037 	}
3038 
3039 	switch (lckdat->l_whence) {
3040 	case 1:
3041 		lckdat->l_start += offset;
3042 		break;
3043 	case 2:
3044 		lckdat->l_start += vattr.va_size;
3045 		/* FALLTHRU */
3046 	case 0:
3047 		break;
3048 	default:
3049 		return (EINVAL);
3050 	}
3051 
3052 	if (lckdat->l_start < 0)
3053 		return (EINVAL);
3054 
3055 	switch (whence) {
3056 	case 1:
3057 		lckdat->l_start -= offset;
3058 		break;
3059 	case 2:
3060 		lckdat->l_start -= vattr.va_size;
3061 		/* FALLTHRU */
3062 	case 0:
3063 		break;
3064 	default:
3065 		return (EINVAL);
3066 	}
3067 
3068 	lckdat->l_whence = (short)whence;
3069 	return (0);
3070 }
3071 
3072 
3073 /* 	proc_graph function definitions */
3074 
3075 /*
3076  * Function checks for deadlock due to the new 'lock'. If deadlock found
3077  * edges of this lock are freed and returned.
3078  */
3079 
3080 static int
flk_check_deadlock(lock_descriptor_t * lock)3081 flk_check_deadlock(lock_descriptor_t *lock)
3082 {
3083 	proc_vertex_t	*start_vertex, *pvertex;
3084 	proc_vertex_t *dvertex;
3085 	proc_edge_t *pep, *ppep;
3086 	edge_t	*ep, *nep;
3087 	proc_vertex_t *process_stack;
3088 
3089 	/*
3090 	 * OFD style locks are not associated with any process so there is
3091 	 * no proc graph for these. Thus we cannot, and do not, do deadlock
3092 	 * detection.
3093 	 */
3094 	if (lock->l_ofd != NULL)
3095 		return (0);
3096 
3097 	STACK_INIT(process_stack);
3098 
3099 	mutex_enter(&flock_lock);
3100 	start_vertex = flk_get_proc_vertex(lock);
3101 	ASSERT(start_vertex != NULL);
3102 
3103 	/* construct the edges from this process to other processes */
3104 
3105 	ep = FIRST_ADJ(lock);
3106 	while (ep != HEAD(lock)) {
3107 		proc_vertex_t *adj_proc;
3108 
3109 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
3110 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
3111 			if (pep->to_proc == adj_proc) {
3112 				ASSERT(pep->refcount);
3113 				pep->refcount++;
3114 				break;
3115 			}
3116 		}
3117 		if (pep == NULL) {
3118 			pep = flk_get_proc_edge();
3119 			pep->to_proc = adj_proc;
3120 			pep->refcount = 1;
3121 			adj_proc->incount++;
3122 			pep->next = start_vertex->edge;
3123 			start_vertex->edge = pep;
3124 		}
3125 		ep = NEXT_ADJ(ep);
3126 	}
3127 
3128 	ep = FIRST_IN(lock);
3129 
3130 	while (ep != HEAD(lock)) {
3131 		proc_vertex_t *in_proc;
3132 
3133 		in_proc = flk_get_proc_vertex(ep->from_vertex);
3134 
3135 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
3136 			if (pep->to_proc == start_vertex) {
3137 				ASSERT(pep->refcount);
3138 				pep->refcount++;
3139 				break;
3140 			}
3141 		}
3142 		if (pep == NULL) {
3143 			pep = flk_get_proc_edge();
3144 			pep->to_proc = start_vertex;
3145 			pep->refcount = 1;
3146 			start_vertex->incount++;
3147 			pep->next = in_proc->edge;
3148 			in_proc->edge = pep;
3149 		}
3150 		ep = NEXT_IN(ep);
3151 	}
3152 
3153 	if (start_vertex->incount == 0) {
3154 		mutex_exit(&flock_lock);
3155 		return (0);
3156 	}
3157 
3158 	flk_proc_graph_uncolor();
3159 
3160 	start_vertex->p_sedge = start_vertex->edge;
3161 
3162 	STACK_PUSH(process_stack, start_vertex, p_stack);
3163 
3164 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
3165 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
3166 			dvertex = pep->to_proc;
3167 			if (!PROC_ARRIVED(dvertex)) {
3168 				STACK_PUSH(process_stack, dvertex, p_stack);
3169 				dvertex->p_sedge = dvertex->edge;
3170 				PROC_ARRIVE(pvertex);
3171 				pvertex->p_sedge = pep->next;
3172 				break;
3173 			}
3174 			if (!PROC_DEPARTED(dvertex))
3175 				goto deadlock;
3176 		}
3177 		if (pep == NULL) {
3178 			PROC_DEPART(pvertex);
3179 			STACK_POP(process_stack, p_stack);
3180 		}
3181 	}
3182 	mutex_exit(&flock_lock);
3183 	return (0);
3184 
3185 deadlock:
3186 
3187 	/* we remove all lock edges and proc edges */
3188 
3189 	ep = FIRST_ADJ(lock);
3190 	while (ep != HEAD(lock)) {
3191 		proc_vertex_t *adj_proc;
3192 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
3193 		nep = NEXT_ADJ(ep);
3194 		IN_LIST_REMOVE(ep);
3195 		ADJ_LIST_REMOVE(ep);
3196 		flk_free_edge(ep);
3197 		ppep = start_vertex->edge;
3198 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
3199 		    pep = ppep->next) {
3200 			if (pep->to_proc == adj_proc) {
3201 				pep->refcount--;
3202 				if (pep->refcount == 0) {
3203 					if (pep == ppep) {
3204 						start_vertex->edge = pep->next;
3205 					} else {
3206 						ppep->next = pep->next;
3207 					}
3208 					adj_proc->incount--;
3209 					flk_proc_release(adj_proc);
3210 					flk_free_proc_edge(pep);
3211 				}
3212 				break;
3213 			}
3214 		}
3215 		ep = nep;
3216 	}
3217 	ep = FIRST_IN(lock);
3218 	while (ep != HEAD(lock)) {
3219 		proc_vertex_t *in_proc;
3220 		in_proc = flk_get_proc_vertex(ep->from_vertex);
3221 		nep = NEXT_IN(ep);
3222 		IN_LIST_REMOVE(ep);
3223 		ADJ_LIST_REMOVE(ep);
3224 		flk_free_edge(ep);
3225 		ppep = in_proc->edge;
3226 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
3227 		    pep = ppep->next) {
3228 			if (pep->to_proc == start_vertex) {
3229 				pep->refcount--;
3230 				if (pep->refcount == 0) {
3231 					if (pep == ppep) {
3232 						in_proc->edge = pep->next;
3233 					} else {
3234 						ppep->next = pep->next;
3235 					}
3236 					start_vertex->incount--;
3237 					flk_proc_release(in_proc);
3238 					flk_free_proc_edge(pep);
3239 				}
3240 				break;
3241 			}
3242 		}
3243 		ep = nep;
3244 	}
3245 	flk_proc_release(start_vertex);
3246 	mutex_exit(&flock_lock);
3247 	return (1);
3248 }
3249 
3250 /*
3251  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
3252  * from the list we return that, otherwise we allocate one. If necessary,
3253  * we grow the list of vertices also.
3254  */
3255 
3256 static proc_vertex_t *
flk_get_proc_vertex(lock_descriptor_t * lock)3257 flk_get_proc_vertex(lock_descriptor_t *lock)
3258 {
3259 	int i;
3260 	proc_vertex_t	*pv;
3261 	proc_vertex_t	**palloc;
3262 
3263 	ASSERT(MUTEX_HELD(&flock_lock));
3264 	if (lock->pvertex != -1) {
3265 		ASSERT(lock->pvertex >= 0);
3266 		pv = pgraph.proc[lock->pvertex];
3267 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3268 			return (pv);
3269 		}
3270 	}
3271 	for (i = 0; i < pgraph.gcount; i++) {
3272 		pv = pgraph.proc[i];
3273 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3274 			lock->pvertex = pv->index = i;
3275 			return (pv);
3276 		}
3277 	}
3278 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3279 	pv->pid = lock->l_flock.l_pid;
3280 	pv->sysid = lock->l_flock.l_sysid;
3281 	flk_proc_vertex_allocs++;
3282 	if (pgraph.free != 0) {
3283 		for (i = 0; i < pgraph.gcount; i++) {
3284 			if (pgraph.proc[i] == NULL) {
3285 				pgraph.proc[i] = pv;
3286 				lock->pvertex = pv->index = i;
3287 				pgraph.free--;
3288 				return (pv);
3289 			}
3290 		}
3291 	}
3292 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3293 	    sizeof (proc_vertex_t *), KM_SLEEP);
3294 
3295 	if (pgraph.proc) {
3296 		bcopy(pgraph.proc, palloc,
3297 		    pgraph.gcount * sizeof (proc_vertex_t *));
3298 
3299 		kmem_free(pgraph.proc,
3300 		    pgraph.gcount * sizeof (proc_vertex_t *));
3301 	}
3302 	pgraph.proc = palloc;
3303 	pgraph.free += (PROC_CHUNK - 1);
3304 	pv->index = lock->pvertex = pgraph.gcount;
3305 	pgraph.gcount += PROC_CHUNK;
3306 	pgraph.proc[pv->index] = pv;
3307 	return (pv);
3308 }
3309 
3310 /*
3311  * Allocate a proc edge.
3312  */
3313 
3314 static proc_edge_t *
flk_get_proc_edge()3315 flk_get_proc_edge()
3316 {
3317 	proc_edge_t *pep;
3318 
3319 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3320 	flk_proc_edge_allocs++;
3321 	return (pep);
3322 }
3323 
3324 /*
3325  * Free the proc edge. Called whenever its reference count goes to zero.
3326  */
3327 
3328 static void
flk_free_proc_edge(proc_edge_t * pep)3329 flk_free_proc_edge(proc_edge_t *pep)
3330 {
3331 	ASSERT(pep->refcount == 0);
3332 	kmem_free((void *)pep, sizeof (proc_edge_t));
3333 	flk_proc_edge_frees++;
3334 }
3335 
3336 /*
3337  * Color the graph explicitly done only when the mark value hits max value.
3338  */
3339 
3340 static void
flk_proc_graph_uncolor()3341 flk_proc_graph_uncolor()
3342 {
3343 	int i;
3344 
3345 	if (pgraph.mark == UINT_MAX) {
3346 		for (i = 0; i < pgraph.gcount; i++)
3347 			if (pgraph.proc[i] != NULL) {
3348 				pgraph.proc[i]->atime = 0;
3349 				pgraph.proc[i]->dtime = 0;
3350 			}
3351 		pgraph.mark = 1;
3352 	} else {
3353 		pgraph.mark++;
3354 	}
3355 }
3356 
3357 /*
3358  * Release the proc vertex iff both there are no in edges and out edges
3359  */
3360 
3361 static void
flk_proc_release(proc_vertex_t * proc)3362 flk_proc_release(proc_vertex_t *proc)
3363 {
3364 	ASSERT(MUTEX_HELD(&flock_lock));
3365 	if (proc->edge == NULL && proc->incount == 0) {
3366 		pgraph.proc[proc->index] = NULL;
3367 		pgraph.free++;
3368 		kmem_free(proc, sizeof (proc_vertex_t));
3369 		flk_proc_vertex_frees++;
3370 	}
3371 }
3372 
3373 /*
3374  * Updates process graph to reflect change in a lock_graph.
3375  * Note: We should call this function only after we have a correctly
3376  * recomputed lock graph. Otherwise we might miss a deadlock detection.
3377  * eg: in function flk_relation() we call this function after flk_recompute_
3378  * dependencies() otherwise if a process tries to lock a vnode hashed
3379  * into another graph it might sleep for ever.
3380  */
3381 
3382 static void
flk_update_proc_graph(edge_t * ep,int delete)3383 flk_update_proc_graph(edge_t *ep, int delete)
3384 {
3385 	proc_vertex_t *toproc, *fromproc;
3386 	proc_edge_t *pep, *prevpep;
3387 
3388 	mutex_enter(&flock_lock);
3389 
3390 	/*
3391 	 * OFD style locks are not associated with any process so there is
3392 	 * no proc graph for these.
3393 	 */
3394 	if (ep->from_vertex->l_ofd != NULL) {
3395 		mutex_exit(&flock_lock);
3396 		return;
3397 	}
3398 
3399 	toproc = flk_get_proc_vertex(ep->to_vertex);
3400 	fromproc = flk_get_proc_vertex(ep->from_vertex);
3401 
3402 	if (!delete)
3403 		goto add;
3404 	pep = prevpep = fromproc->edge;
3405 
3406 	ASSERT(pep != NULL);
3407 	while (pep != NULL) {
3408 		if (pep->to_proc == toproc) {
3409 			ASSERT(pep->refcount > 0);
3410 			pep->refcount--;
3411 			if (pep->refcount == 0) {
3412 				if (pep == prevpep) {
3413 					fromproc->edge = pep->next;
3414 				} else {
3415 					prevpep->next = pep->next;
3416 				}
3417 				toproc->incount--;
3418 				flk_proc_release(toproc);
3419 				flk_free_proc_edge(pep);
3420 			}
3421 			break;
3422 		}
3423 		prevpep = pep;
3424 		pep = pep->next;
3425 	}
3426 	flk_proc_release(fromproc);
3427 	mutex_exit(&flock_lock);
3428 	return;
3429 add:
3430 
3431 	pep = fromproc->edge;
3432 
3433 	while (pep != NULL) {
3434 		if (pep->to_proc == toproc) {
3435 			ASSERT(pep->refcount > 0);
3436 			pep->refcount++;
3437 			break;
3438 		}
3439 		pep = pep->next;
3440 	}
3441 	if (pep == NULL) {
3442 		pep = flk_get_proc_edge();
3443 		pep->to_proc = toproc;
3444 		pep->refcount = 1;
3445 		toproc->incount++;
3446 		pep->next = fromproc->edge;
3447 		fromproc->edge = pep;
3448 	}
3449 	mutex_exit(&flock_lock);
3450 }
3451 
3452 /*
3453  * Set the control status for lock manager requests.
3454  *
3455  */
3456 
3457 /*
3458  * PSARC case 1997/292
3459  *
3460  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3461  * Effects: Set the state of the NLM server identified by "nlmid"
3462  *   in the NLM registry to state "nlm_state."
3463  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3464  *   NLM server to this LLM.
3465  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3466  *   may be locks requests that have gotten started but not finished.  In
3467  *   particular, there may be blocking requests that are in the callback code
3468  *   before sleeping (so they're not holding the lock for the graph).  If
3469  *   such a thread reacquires the graph's lock (to go to sleep) after
3470  *   NLM state in the NLM registry  is set to a non-up value,
3471  *   it will notice the status and bail out.  If the request gets
3472  *   granted before the thread can check the NLM registry, let it
3473  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3474  *
3475  * Modifies: nlm_reg_obj (global)
3476  * Arguments:
3477  *    nlmid	(IN):    id uniquely identifying an NLM server
3478  *    nlm_state (IN):    NLM server state to change "nlmid" to
3479  */
3480 void
cl_flk_set_nlm_status(int nlmid,flk_nlm_status_t nlm_state)3481 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3482 {
3483 	/*
3484 	 * Check to see if node is booted as a cluster. If not, return.
3485 	 */
3486 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3487 		return;
3488 	}
3489 
3490 	/*
3491 	 * Check for development/debugging.  It is possible to boot a node
3492 	 * in non-cluster mode, and then run a special script, currently
3493 	 * available only to developers, to bring up the node as part of a
3494 	 * cluster.  The problem is that running such a script does not
3495 	 * result in the routine flk_init() being called and hence global array
3496 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3497 	 * but the LLM needs to do an additional check to see if the global
3498 	 * array has been created or not. If nlm_reg_status is NULL, then
3499 	 * return, else continue.
3500 	 */
3501 	if (nlm_reg_status == NULL) {
3502 		return;
3503 	}
3504 
3505 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3506 	mutex_enter(&nlm_reg_lock);
3507 
3508 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3509 		/*
3510 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3511 		 * add it to the registry in the nlm shutting down state.
3512 		 */
3513 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3514 		    FLK_NLM_SHUTTING_DOWN);
3515 	} else {
3516 		/*
3517 		 * Change the state of the NLM server identified by "nlmid"
3518 		 * in the NLM registry to the argument "nlm_state."
3519 		 */
3520 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3521 		    nlm_state);
3522 	}
3523 
3524 	/*
3525 	 *  The reason we must register the NLM server that is shutting down
3526 	 *  with an LLM that doesn't already know about it (never sent a lock
3527 	 *  request) is to handle correctly a race between shutdown and a new
3528 	 *  lock request.  Suppose that a shutdown request from the NLM server
3529 	 *  invokes this routine at the LLM, and a thread is spawned to
3530 	 *  service the request. Now suppose a new lock request is in
3531 	 *  progress and has already passed the first line of defense in
3532 	 *  reclock(), which denies new locks requests from NLM servers
3533 	 *  that are not in the NLM_UP state.  After the current routine
3534 	 *  is invoked for both phases of shutdown, the routine will return,
3535 	 *  having done nothing, and the lock request will proceed and
3536 	 *  probably be granted.  The problem is that the shutdown was ignored
3537 	 *  by the lock request because there was no record of that NLM server
3538 	 *  shutting down.   We will be in the peculiar position of thinking
3539 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3540 	 *  been discarded, but in fact there's still one lock held.
3541 	 *  The solution is to record the existence of NLM server and change
3542 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3543 	 *  progress may proceed because the next phase NLM_DOWN will catch
3544 	 *  this lock and discard it.
3545 	 */
3546 	mutex_exit(&nlm_reg_lock);
3547 
3548 	switch (nlm_state) {
3549 	case FLK_NLM_UP:
3550 		/*
3551 		 * Change the NLM state of all locks still held on behalf of
3552 		 * the NLM server identified by "nlmid" to NLM_UP.
3553 		 */
3554 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3555 		break;
3556 
3557 	case FLK_NLM_SHUTTING_DOWN:
3558 		/*
3559 		 * Wake up all sleeping locks for the NLM server identified
3560 		 * by "nlmid." Note that eventually all woken threads will
3561 		 * have their lock requests cancelled and descriptors
3562 		 * removed from the sleeping lock list.  Note that the NLM
3563 		 * server state associated with each lock descriptor is
3564 		 * changed to FLK_NLM_SHUTTING_DOWN.
3565 		 */
3566 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3567 		break;
3568 
3569 	case FLK_NLM_DOWN:
3570 		/*
3571 		 * Discard all active, granted locks for this NLM server
3572 		 * identified by "nlmid."
3573 		 */
3574 		cl_flk_unlock_nlm_granted(nlmid);
3575 		break;
3576 
3577 	default:
3578 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3579 	}
3580 }
3581 
3582 /*
3583  * Set the control status for lock manager requests.
3584  *
3585  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3586  * may be locks requests that have gotten started but not finished.  In
3587  * particular, there may be blocking requests that are in the callback code
3588  * before sleeping (so they're not holding the lock for the graph).  If
3589  * such a thread reacquires the graph's lock (to go to sleep) after
3590  * flk_lockmgr_status is set to a non-up value, it will notice the status
3591  * and bail out.  If the request gets granted before the thread can check
3592  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3593  * we are called with FLK_LOCKMGR_DOWN.
3594  */
3595 
3596 void
flk_set_lockmgr_status(flk_lockmgr_status_t status)3597 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3598 {
3599 	int i;
3600 	graph_t *gp;
3601 	struct flock_globals *fg;
3602 
3603 	fg = flk_get_globals();
3604 	ASSERT(fg != NULL);
3605 
3606 	mutex_enter(&flock_lock);
3607 	fg->flk_lockmgr_status = status;
3608 	mutex_exit(&flock_lock);
3609 
3610 	/*
3611 	 * If the lock manager is coming back up, all that's needed is to
3612 	 * propagate this information to the graphs.  If the lock manager
3613 	 * is going down, additional action is required, and each graph's
3614 	 * copy of the state is updated atomically with this other action.
3615 	 */
3616 	switch (status) {
3617 	case FLK_LOCKMGR_UP:
3618 		for (i = 0; i < HASH_SIZE; i++) {
3619 			mutex_enter(&flock_lock);
3620 			gp = lock_graph[i];
3621 			mutex_exit(&flock_lock);
3622 			if (gp == NULL)
3623 				continue;
3624 			mutex_enter(&gp->gp_mutex);
3625 			fg->lockmgr_status[i] = status;
3626 			mutex_exit(&gp->gp_mutex);
3627 		}
3628 		break;
3629 	case FLK_WAKEUP_SLEEPERS:
3630 		wakeup_sleeping_lockmgr_locks(fg);
3631 		break;
3632 	case FLK_LOCKMGR_DOWN:
3633 		unlock_lockmgr_granted(fg);
3634 		break;
3635 	default:
3636 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3637 		break;
3638 	}
3639 }
3640 
3641 /*
3642  * This routine returns all the locks that are active or sleeping and are
3643  * associated with a particular set of identifiers.  If lock_state != 0, then
3644  * only locks that match the lock_state are returned. If lock_state == 0, then
3645  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3646  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3647  * vnode pointer is ignored.
3648  *
3649  * A list containing the vnode pointer and an flock structure
3650  * describing the lock is returned.  Each element in the list is
3651  * dynamically allocated and must be freed by the caller.  The
3652  * last item in the list is denoted by a NULL value in the ll_next
3653  * field.
3654  *
3655  * The vnode pointers returned are held.  The caller is responsible
3656  * for releasing these.  Note that the returned list is only a snapshot of
3657  * the current lock information, and that it is a snapshot of a moving
3658  * target (only one graph is locked at a time).
3659  */
3660 
3661 locklist_t *
get_lock_list(int list_type,int lock_state,int sysid,boolean_t use_sysid,pid_t pid,const vnode_t * vp,zoneid_t zoneid)3662 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3663     pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3664 {
3665 	lock_descriptor_t	*lock;
3666 	lock_descriptor_t	*graph_head;
3667 	locklist_t		listhead;
3668 	locklist_t		*llheadp;
3669 	locklist_t		*llp;
3670 	locklist_t		*lltp;
3671 	graph_t			*gp;
3672 	int			i;
3673 	int			first_index; /* graph index */
3674 	int			num_indexes; /* graph index */
3675 
3676 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
3677 	    (list_type == FLK_SLEEPING_STATE));
3678 
3679 	/*
3680 	 * Get a pointer to something to use as a list head while building
3681 	 * the rest of the list.
3682 	 */
3683 	llheadp = &listhead;
3684 	lltp = llheadp;
3685 	llheadp->ll_next = (locklist_t *)NULL;
3686 
3687 	/* Figure out which graphs we want to look at. */
3688 	if (vp == NULL) {
3689 		first_index = 0;
3690 		num_indexes = HASH_SIZE;
3691 	} else {
3692 		first_index = HASH_INDEX(vp);
3693 		num_indexes = 1;
3694 	}
3695 
3696 	for (i = first_index; i < first_index + num_indexes; i++) {
3697 		mutex_enter(&flock_lock);
3698 		gp = lock_graph[i];
3699 		mutex_exit(&flock_lock);
3700 		if (gp == NULL) {
3701 			continue;
3702 		}
3703 
3704 		mutex_enter(&gp->gp_mutex);
3705 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
3706 		    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3707 		for (lock = graph_head->l_next;
3708 		    lock != graph_head;
3709 		    lock = lock->l_next) {
3710 			if (use_sysid && lock->l_flock.l_sysid != sysid)
3711 				continue;
3712 			if (pid != NOPID && lock->l_flock.l_pid != pid)
3713 				continue;
3714 			if (vp != NULL && lock->l_vnode != vp)
3715 				continue;
3716 			if (lock_state && !(lock_state & lock->l_state))
3717 				continue;
3718 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3719 				continue;
3720 			/*
3721 			 * A matching lock was found.  Allocate
3722 			 * space for a new locklist entry and fill
3723 			 * it in.
3724 			 */
3725 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3726 			lltp->ll_next = llp;
3727 			VN_HOLD(lock->l_vnode);
3728 			llp->ll_vp = lock->l_vnode;
3729 			create_flock(lock, &(llp->ll_flock));
3730 			llp->ll_next = (locklist_t *)NULL;
3731 			lltp = llp;
3732 		}
3733 		mutex_exit(&gp->gp_mutex);
3734 	}
3735 
3736 	llp = llheadp->ll_next;
3737 	return (llp);
3738 }
3739 
3740 /*
3741  * These two functions are simply interfaces to get_lock_list.  They return
3742  * a list of sleeping or active locks for the given sysid and pid.  See
3743  * get_lock_list for details.
3744  *
3745  * In either case we don't particularly care to specify the zone of interest;
3746  * the sysid-space is global across zones, so the sysid will map to exactly one
3747  * zone, and we'll return information for that zone.
3748  */
3749 
3750 locklist_t *
flk_get_sleeping_locks(int sysid,pid_t pid)3751 flk_get_sleeping_locks(int sysid, pid_t pid)
3752 {
3753 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3754 	    ALL_ZONES));
3755 }
3756 
3757 locklist_t *
flk_get_active_locks(int sysid,pid_t pid)3758 flk_get_active_locks(int sysid, pid_t pid)
3759 {
3760 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3761 	    ALL_ZONES));
3762 }
3763 
3764 /*
3765  * Another interface to get_lock_list.  This one returns all the active
3766  * locks for a given vnode.  Again, see get_lock_list for details.
3767  *
3768  * We don't need to specify which zone's locks we're interested in.  The matter
3769  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3770  * be used by multiple zones, so the list of locks will all be from the right
3771  * zone.
3772  */
3773 
3774 locklist_t *
flk_active_locks_for_vp(const vnode_t * vp)3775 flk_active_locks_for_vp(const vnode_t *vp)
3776 {
3777 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3778 	    ALL_ZONES));
3779 }
3780 
3781 /*
3782  * Another interface to get_lock_list.  This one returns all the active
3783  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3784  *
3785  * See the comment for flk_active_locks_for_vp() for why we don't care to
3786  * specify the particular zone of interest.
3787  */
3788 locklist_t *
flk_active_nbmand_locks_for_vp(const vnode_t * vp)3789 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3790 {
3791 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3792 	    NOPID, vp, ALL_ZONES));
3793 }
3794 
3795 /*
3796  * Another interface to get_lock_list.  This one returns all the active
3797  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3798  *
3799  * The zone doesn't need to be specified here; the locks held by a
3800  * particular process will either be local (ie, non-NFS) or from the zone
3801  * the process is executing in.  This is because other parts of the system
3802  * ensure that an NFS vnode can't be used in a zone other than that in
3803  * which it was opened.
3804  */
3805 locklist_t *
flk_active_nbmand_locks(pid_t pid)3806 flk_active_nbmand_locks(pid_t pid)
3807 {
3808 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3809 	    pid, NULL, ALL_ZONES));
3810 }
3811 
3812 /*
3813  * Free up all entries in the locklist.
3814  */
3815 void
flk_free_locklist(locklist_t * llp)3816 flk_free_locklist(locklist_t *llp)
3817 {
3818 	locklist_t *next_llp;
3819 
3820 	while (llp) {
3821 		next_llp = llp->ll_next;
3822 		VN_RELE(llp->ll_vp);
3823 		kmem_free(llp, sizeof (*llp));
3824 		llp = next_llp;
3825 	}
3826 }
3827 
3828 static void
cl_flk_change_nlm_state_all_locks(int nlmid,flk_nlm_status_t nlm_state)3829 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3830 {
3831 	/*
3832 	 * For each graph "lg" in the hash table lock_graph do
3833 	 * a.  Get the list of sleeping locks
3834 	 * b.  For each lock descriptor in the list do
3835 	 *	i.   If the requested lock is an NLM server request AND
3836 	 *		the nlmid is the same as the routine argument then
3837 	 *		change the lock descriptor's state field to
3838 	 *		"nlm_state."
3839 	 * c.  Get the list of active locks
3840 	 * d.  For each lock descriptor in the list do
3841 	 *	i.   If the requested lock is an NLM server request AND
3842 	 *		the nlmid is the same as the routine argument then
3843 	 *		change the lock descriptor's state field to
3844 	 *		"nlm_state."
3845 	 */
3846 
3847 	int			i;
3848 	graph_t			*gp;			/* lock graph */
3849 	lock_descriptor_t	*lock;			/* lock */
3850 	lock_descriptor_t	*nlock = NULL;		/* next lock */
3851 	int			lock_nlmid;
3852 
3853 	for (i = 0; i < HASH_SIZE; i++) {
3854 		mutex_enter(&flock_lock);
3855 		gp = lock_graph[i];
3856 		mutex_exit(&flock_lock);
3857 		if (gp == NULL) {
3858 			continue;
3859 		}
3860 
3861 		/* Get list of sleeping locks in current lock graph. */
3862 		mutex_enter(&gp->gp_mutex);
3863 		for (lock = SLEEPING_HEAD(gp)->l_next;
3864 		    lock != SLEEPING_HEAD(gp);
3865 		    lock = nlock) {
3866 			nlock = lock->l_next;
3867 			/* get NLM id */
3868 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3869 
3870 			/*
3871 			 * If NLM server request AND nlmid of lock matches
3872 			 * nlmid of argument, then set the NLM state of the
3873 			 * lock to "nlm_state."
3874 			 */
3875 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3876 				SET_NLM_STATE(lock, nlm_state);
3877 			}
3878 		}
3879 
3880 		/* Get list of active locks in current lock graph. */
3881 		for (lock = ACTIVE_HEAD(gp)->l_next;
3882 		    lock != ACTIVE_HEAD(gp);
3883 		    lock = nlock) {
3884 			nlock = lock->l_next;
3885 			/* get NLM id */
3886 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3887 
3888 			/*
3889 			 * If NLM server request AND nlmid of lock matches
3890 			 * nlmid of argument, then set the NLM state of the
3891 			 * lock to "nlm_state."
3892 			 */
3893 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3894 				ASSERT(IS_ACTIVE(lock));
3895 				SET_NLM_STATE(lock, nlm_state);
3896 			}
3897 		}
3898 		mutex_exit(&gp->gp_mutex);
3899 	}
3900 }
3901 
3902 /*
3903  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3904  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3905  *   identified by "nlmid." Poke those lock requests.
3906  */
3907 static void
cl_flk_wakeup_sleeping_nlm_locks(int nlmid)3908 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3909 {
3910 	lock_descriptor_t *lock;
3911 	lock_descriptor_t *nlock = NULL; /* next lock */
3912 	int i;
3913 	graph_t *gp;
3914 	int	lock_nlmid;
3915 
3916 	for (i = 0; i < HASH_SIZE; i++) {
3917 		mutex_enter(&flock_lock);
3918 		gp = lock_graph[i];
3919 		mutex_exit(&flock_lock);
3920 		if (gp == NULL) {
3921 			continue;
3922 		}
3923 
3924 		mutex_enter(&gp->gp_mutex);
3925 		for (lock = SLEEPING_HEAD(gp)->l_next;
3926 		    lock != SLEEPING_HEAD(gp);
3927 		    lock = nlock) {
3928 			nlock = lock->l_next;
3929 			/*
3930 			 * If NLM server request _and_ nlmid of lock matches
3931 			 * nlmid of argument, then set the NLM state of the
3932 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3933 			 * request.
3934 			 */
3935 			if (IS_LOCKMGR(lock)) {
3936 				/* get NLM id */
3937 				lock_nlmid =
3938 				    GETNLMID(lock->l_flock.l_sysid);
3939 				if (nlmid == lock_nlmid) {
3940 					SET_NLM_STATE(lock,
3941 					    FLK_NLM_SHUTTING_DOWN);
3942 					INTERRUPT_WAKEUP(lock);
3943 				}
3944 			}
3945 		}
3946 		mutex_exit(&gp->gp_mutex);
3947 	}
3948 }
3949 
3950 /*
3951  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3952  * Effects:  Find all active (granted) lock manager locks _only_ for the
3953  *   NLM server identified by "nlmid" and release them.
3954  */
3955 static void
cl_flk_unlock_nlm_granted(int nlmid)3956 cl_flk_unlock_nlm_granted(int nlmid)
3957 {
3958 	lock_descriptor_t *lock;
3959 	lock_descriptor_t *nlock = NULL; /* next lock */
3960 	int i;
3961 	graph_t *gp;
3962 	int	lock_nlmid;
3963 
3964 	for (i = 0; i < HASH_SIZE; i++) {
3965 		mutex_enter(&flock_lock);
3966 		gp = lock_graph[i];
3967 		mutex_exit(&flock_lock);
3968 		if (gp == NULL) {
3969 			continue;
3970 		}
3971 
3972 		mutex_enter(&gp->gp_mutex);
3973 		for (lock = ACTIVE_HEAD(gp)->l_next;
3974 		    lock != ACTIVE_HEAD(gp);
3975 		    lock = nlock) {
3976 			nlock = lock->l_next;
3977 			ASSERT(IS_ACTIVE(lock));
3978 
3979 			/*
3980 			 * If it's an  NLM server request _and_ nlmid of
3981 			 * the lock matches nlmid of argument, then
3982 			 * remove the active lock the list, wakup blocked
3983 			 * threads, and free the storage for the lock.
3984 			 * Note that there's no need to mark the NLM state
3985 			 * of this lock to NLM_DOWN because the lock will
3986 			 * be deleted anyway and its storage freed.
3987 			 */
3988 			if (IS_LOCKMGR(lock)) {
3989 				/* get NLM id */
3990 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3991 				if (nlmid == lock_nlmid) {
3992 					flk_delete_active_lock(lock, 0);
3993 					flk_wakeup(lock, 1);
3994 					flk_free_lock(lock);
3995 				}
3996 			}
3997 		}
3998 		mutex_exit(&gp->gp_mutex);
3999 	}
4000 }
4001 
4002 /*
4003  * Find all sleeping lock manager requests and poke them.
4004  */
4005 static void
wakeup_sleeping_lockmgr_locks(struct flock_globals * fg)4006 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
4007 {
4008 	lock_descriptor_t *lock;
4009 	lock_descriptor_t *nlock = NULL; /* next lock */
4010 	int i;
4011 	graph_t *gp;
4012 	zoneid_t zoneid = getzoneid();
4013 
4014 	for (i = 0; i < HASH_SIZE; i++) {
4015 		mutex_enter(&flock_lock);
4016 		gp = lock_graph[i];
4017 		mutex_exit(&flock_lock);
4018 		if (gp == NULL) {
4019 			continue;
4020 		}
4021 
4022 		mutex_enter(&gp->gp_mutex);
4023 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
4024 		for (lock = SLEEPING_HEAD(gp)->l_next;
4025 		    lock != SLEEPING_HEAD(gp);
4026 		    lock = nlock) {
4027 			nlock = lock->l_next;
4028 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4029 				INTERRUPT_WAKEUP(lock);
4030 			}
4031 		}
4032 		mutex_exit(&gp->gp_mutex);
4033 	}
4034 }
4035 
4036 
4037 /*
4038  * Find all active (granted) lock manager locks and release them.
4039  */
4040 static void
unlock_lockmgr_granted(struct flock_globals * fg)4041 unlock_lockmgr_granted(struct flock_globals *fg)
4042 {
4043 	lock_descriptor_t *lock;
4044 	lock_descriptor_t *nlock = NULL; /* next lock */
4045 	int i;
4046 	graph_t *gp;
4047 	zoneid_t zoneid = getzoneid();
4048 
4049 	for (i = 0; i < HASH_SIZE; i++) {
4050 		mutex_enter(&flock_lock);
4051 		gp = lock_graph[i];
4052 		mutex_exit(&flock_lock);
4053 		if (gp == NULL) {
4054 			continue;
4055 		}
4056 
4057 		mutex_enter(&gp->gp_mutex);
4058 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4059 		for (lock = ACTIVE_HEAD(gp)->l_next;
4060 		    lock != ACTIVE_HEAD(gp);
4061 		    lock = nlock) {
4062 			nlock = lock->l_next;
4063 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4064 				ASSERT(IS_ACTIVE(lock));
4065 				flk_delete_active_lock(lock, 0);
4066 				flk_wakeup(lock, 1);
4067 				flk_free_lock(lock);
4068 			}
4069 		}
4070 		mutex_exit(&gp->gp_mutex);
4071 	}
4072 }
4073 
4074 
4075 /*
4076  * Wait until a lock is granted, cancelled, or interrupted.
4077  */
4078 
4079 static void
wait_for_lock(lock_descriptor_t * request)4080 wait_for_lock(lock_descriptor_t *request)
4081 {
4082 	graph_t *gp = request->l_graph;
4083 
4084 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
4085 
4086 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4087 	    !(IS_INTERRUPTED(request))) {
4088 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
4089 			flk_set_state(request, FLK_INTERRUPTED_STATE);
4090 			request->l_state |= INTERRUPTED_LOCK;
4091 		}
4092 	}
4093 }
4094 
4095 /*
4096  * Create an flock structure from the existing lock information
4097  *
4098  * This routine is used to create flock structures for the lock manager
4099  * to use in a reclaim request.  Since the lock was originated on this
4100  * host, it must be conforming to UNIX semantics, so no checking is
4101  * done to make sure it falls within the lower half of the 32-bit range.
4102  */
4103 
4104 static void
create_flock(lock_descriptor_t * lp,flock64_t * flp)4105 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4106 {
4107 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4108 	ASSERT(lp->l_end >= lp->l_start);
4109 
4110 	flp->l_type = lp->l_type;
4111 	flp->l_whence = 0;
4112 	flp->l_start = lp->l_start;
4113 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
4114 	    (lp->l_end - lp->l_start + 1);
4115 	flp->l_sysid = lp->l_flock.l_sysid;
4116 	flp->l_pid = lp->l_flock.l_pid;
4117 }
4118 
4119 /*
4120  * Convert flock_t data describing a lock range into unsigned long starting
4121  * and ending points, which are put into lock_request.  Returns 0 or an
4122  * errno value.
4123  */
4124 
4125 int
flk_convert_lock_data(vnode_t * vp,flock64_t * flp,u_offset_t * start,u_offset_t * end,offset_t offset)4126 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4127     u_offset_t *start, u_offset_t *end, offset_t offset)
4128 {
4129 	struct vattr	vattr;
4130 	int	error;
4131 
4132 	/*
4133 	 * Determine the starting point of the request
4134 	 */
4135 	switch (flp->l_whence) {
4136 	case 0:		/* SEEK_SET */
4137 		*start = (u_offset_t)flp->l_start;
4138 		break;
4139 	case 1:		/* SEEK_CUR */
4140 		*start = (u_offset_t)(flp->l_start + offset);
4141 		break;
4142 	case 2:		/* SEEK_END */
4143 		vattr.va_mask = AT_SIZE;
4144 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
4145 			return (error);
4146 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
4147 		break;
4148 	default:
4149 		return (EINVAL);
4150 	}
4151 
4152 	/*
4153 	 * Determine the range covered by the request.
4154 	 */
4155 	if (flp->l_len == 0)
4156 		*end = MAX_U_OFFSET_T;
4157 	else if ((offset_t)flp->l_len > 0) {
4158 		*end = (u_offset_t)(*start + (flp->l_len - 1));
4159 	} else {
4160 		/*
4161 		 * Negative length; why do we even allow this ?
4162 		 * Because this allows easy specification of
4163 		 * the last n bytes of the file.
4164 		 */
4165 		*end = *start;
4166 		*start += (u_offset_t)flp->l_len;
4167 		(*start)++;
4168 	}
4169 	return (0);
4170 }
4171 
4172 /*
4173  * Check the validity of lock data.  This can used by the NFS
4174  * frlock routines to check data before contacting the server.  The
4175  * server must support semantics that aren't as restrictive as
4176  * the UNIX API, so the NFS client is required to check.
4177  * The maximum is now passed in by the caller.
4178  */
4179 
4180 int
flk_check_lock_data(u_offset_t start,u_offset_t end,offset_t max)4181 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4182 {
4183 	/*
4184 	 * The end (length) for local locking should never be greater
4185 	 * than MAXEND. However, the representation for
4186 	 * the entire file is MAX_U_OFFSET_T.
4187 	 */
4188 	if ((start > max) ||
4189 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
4190 		return (EINVAL);
4191 	}
4192 	if (start > end) {
4193 		return (EINVAL);
4194 	}
4195 	return (0);
4196 }
4197 
4198 /*
4199  * Fill in request->l_flock with information about the lock blocking the
4200  * request.  The complexity here is that lock manager requests are allowed
4201  * to see into the upper part of the 32-bit address range, whereas local
4202  * requests are only allowed to see signed values.
4203  *
4204  * What should be done when "blocker" is a lock manager lock that uses the
4205  * upper portion of the 32-bit range, but "request" is local?  Since the
4206  * request has already been determined to have been blocked by the blocker,
4207  * at least some portion of "blocker" must be in the range of the request,
4208  * or the request extends to the end of file.  For the first case, the
4209  * portion in the lower range is returned with the indication that it goes
4210  * "to EOF."  For the second case, the last byte of the lower range is
4211  * returned with the indication that it goes "to EOF."
4212  */
4213 
4214 static void
report_blocker(lock_descriptor_t * blocker,lock_descriptor_t * request)4215 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
4216 {
4217 	flock64_t *flrp;			/* l_flock portion of request */
4218 
4219 	ASSERT(blocker != NULL);
4220 
4221 	flrp = &request->l_flock;
4222 	flrp->l_whence = 0;
4223 	flrp->l_type = blocker->l_type;
4224 	flrp->l_pid = blocker->l_flock.l_pid;
4225 	flrp->l_sysid = blocker->l_flock.l_sysid;
4226 	request->l_ofd = blocker->l_ofd;
4227 
4228 	if (IS_LOCKMGR(request)) {
4229 		flrp->l_start = blocker->l_start;
4230 		if (blocker->l_end == MAX_U_OFFSET_T)
4231 			flrp->l_len = 0;
4232 		else
4233 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
4234 	} else {
4235 		if (blocker->l_start > MAXEND) {
4236 			flrp->l_start = MAXEND;
4237 			flrp->l_len = 0;
4238 		} else {
4239 			flrp->l_start = blocker->l_start;
4240 			if (blocker->l_end == MAX_U_OFFSET_T)
4241 				flrp->l_len = 0;
4242 			else
4243 				flrp->l_len = blocker->l_end -
4244 				    blocker->l_start + 1;
4245 		}
4246 	}
4247 }
4248 
4249 /*
4250  * PSARC case 1997/292
4251  */
4252 /*
4253  * This is the public routine exported by flock.h.
4254  */
4255 void
cl_flk_change_nlm_state_to_unknown(int nlmid)4256 cl_flk_change_nlm_state_to_unknown(int nlmid)
4257 {
4258 	/*
4259 	 * Check to see if node is booted as a cluster. If not, return.
4260 	 */
4261 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
4262 		return;
4263 	}
4264 
4265 	/*
4266 	 * See comment in cl_flk_set_nlm_status().
4267 	 */
4268 	if (nlm_reg_status == NULL) {
4269 		return;
4270 	}
4271 
4272 	/*
4273 	 * protect NLM registry state with a mutex.
4274 	 */
4275 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
4276 	mutex_enter(&nlm_reg_lock);
4277 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
4278 	mutex_exit(&nlm_reg_lock);
4279 }
4280 
4281 /*
4282  * Return non-zero if the given I/O request conflicts with an active NBMAND
4283  * lock.
4284  * If svmand is non-zero, it means look at all active locks, not just NBMAND
4285  * locks.
4286  */
4287 
4288 int
nbl_lock_conflict(vnode_t * vp,nbl_op_t op,u_offset_t offset,ssize_t length,int svmand,caller_context_t * ct)4289 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
4290     ssize_t length, int svmand, caller_context_t *ct)
4291 {
4292 	int conflict = 0;
4293 	graph_t			*gp;
4294 	lock_descriptor_t	*lock;
4295 	pid_t pid;
4296 	int sysid;
4297 
4298 	if (ct == NULL) {
4299 		pid = curproc->p_pid;
4300 		sysid = 0;
4301 	} else {
4302 		pid = ct->cc_pid;
4303 		sysid = ct->cc_sysid;
4304 	}
4305 
4306 	mutex_enter(&flock_lock);
4307 	gp = lock_graph[HASH_INDEX(vp)];
4308 	mutex_exit(&flock_lock);
4309 	if (gp == NULL)
4310 		return (0);
4311 
4312 	mutex_enter(&gp->gp_mutex);
4313 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4314 
4315 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4316 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4317 		    (lock->l_flock.l_sysid != sysid ||
4318 		    lock->l_flock.l_pid != pid) &&
4319 		    lock_blocks_io(op, offset, length,
4320 		    lock->l_type, lock->l_start, lock->l_end)) {
4321 			DTRACE_PROBE1(conflict_lock,
4322 			    lock_descriptor_t *, lock);
4323 			conflict = 1;
4324 			break;
4325 		}
4326 	}
4327 	mutex_exit(&gp->gp_mutex);
4328 
4329 	return (conflict);
4330 }
4331 
4332 /*
4333  * Return non-zero if the given I/O request conflicts with the given lock.
4334  */
4335 
4336 static int
lock_blocks_io(nbl_op_t op,u_offset_t offset,ssize_t length,int lock_type,u_offset_t lock_start,u_offset_t lock_end)4337 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4338     int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4339 {
4340 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4341 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4342 
4343 	if (op == NBL_READ && lock_type == F_RDLCK)
4344 		return (0);
4345 
4346 	if (offset <= lock_start && lock_start < offset + length)
4347 		return (1);
4348 	if (lock_start <= offset && offset <= lock_end)
4349 		return (1);
4350 
4351 	return (0);
4352 }
4353 
4354 #ifdef DEBUG
4355 static void
check_active_locks(graph_t * gp)4356 check_active_locks(graph_t *gp)
4357 {
4358 	lock_descriptor_t *lock, *lock1;
4359 	edge_t	*ep;
4360 
4361 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4362 	    lock = lock->l_next) {
4363 		ASSERT(IS_ACTIVE(lock));
4364 		ASSERT(NOT_BLOCKED(lock));
4365 		ASSERT(!IS_BARRIER(lock));
4366 
4367 		ep = FIRST_IN(lock);
4368 
4369 		while (ep != HEAD(lock)) {
4370 			ASSERT(IS_SLEEPING(ep->from_vertex));
4371 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
4372 			ep = NEXT_IN(ep);
4373 		}
4374 
4375 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4376 		    lock1 = lock1->l_next) {
4377 			if (lock1->l_vnode == lock->l_vnode) {
4378 			if (BLOCKS(lock1, lock)) {
4379 				cmn_err(CE_PANIC,
4380 				    "active lock %p blocks %p",
4381 				    (void *)lock1, (void *)lock);
4382 			} else if (BLOCKS(lock, lock1)) {
4383 				cmn_err(CE_PANIC,
4384 				    "active lock %p blocks %p",
4385 				    (void *)lock, (void *)lock1);
4386 			}
4387 			}
4388 		}
4389 	}
4390 }
4391 
4392 /*
4393  * Effect: This functions checks to see if the transition from 'old_state' to
4394  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4395  *	and 1 if it is not.
4396  *	For a map of valid transitions, see sys/flock_impl.h
4397  */
4398 static int
check_lock_transition(int old_state,int new_state)4399 check_lock_transition(int old_state, int new_state)
4400 {
4401 	switch (old_state) {
4402 	case FLK_INITIAL_STATE:
4403 		if ((new_state == FLK_START_STATE) ||
4404 		    (new_state == FLK_SLEEPING_STATE) ||
4405 		    (new_state == FLK_ACTIVE_STATE) ||
4406 		    (new_state == FLK_DEAD_STATE)) {
4407 			return (0);
4408 		} else {
4409 			return (1);
4410 		}
4411 	case FLK_START_STATE:
4412 		if ((new_state == FLK_ACTIVE_STATE) ||
4413 		    (new_state == FLK_DEAD_STATE)) {
4414 			return (0);
4415 		} else {
4416 			return (1);
4417 		}
4418 	case FLK_ACTIVE_STATE:
4419 		if (new_state == FLK_DEAD_STATE) {
4420 			return (0);
4421 		} else {
4422 			return (1);
4423 		}
4424 	case FLK_SLEEPING_STATE:
4425 		if ((new_state == FLK_GRANTED_STATE) ||
4426 		    (new_state == FLK_INTERRUPTED_STATE) ||
4427 		    (new_state == FLK_CANCELLED_STATE)) {
4428 			return (0);
4429 		} else {
4430 			return (1);
4431 		}
4432 	case FLK_GRANTED_STATE:
4433 		if ((new_state == FLK_START_STATE) ||
4434 		    (new_state == FLK_INTERRUPTED_STATE) ||
4435 		    (new_state == FLK_CANCELLED_STATE)) {
4436 			return (0);
4437 		} else {
4438 			return (1);
4439 		}
4440 	case FLK_CANCELLED_STATE:
4441 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4442 		    (new_state == FLK_DEAD_STATE)) {
4443 			return (0);
4444 		} else {
4445 			return (1);
4446 		}
4447 	case FLK_INTERRUPTED_STATE:
4448 		if (new_state == FLK_DEAD_STATE) {
4449 			return (0);
4450 		} else {
4451 			return (1);
4452 		}
4453 	case FLK_DEAD_STATE:
4454 		/* May be set more than once */
4455 		if (new_state == FLK_DEAD_STATE) {
4456 			return (0);
4457 		} else {
4458 			return (1);
4459 		}
4460 	default:
4461 		return (1);
4462 	}
4463 }
4464 
4465 static void
check_sleeping_locks(graph_t * gp)4466 check_sleeping_locks(graph_t *gp)
4467 {
4468 	lock_descriptor_t *lock1, *lock2;
4469 	edge_t *ep;
4470 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4471 	    lock1 = lock1->l_next) {
4472 		ASSERT(!IS_BARRIER(lock1));
4473 		for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4474 		    lock2 = lock2->l_next) {
4475 			if (lock1->l_vnode == lock2->l_vnode) {
4476 				if (BLOCKS(lock2, lock1)) {
4477 					ASSERT(!IS_GRANTED(lock1));
4478 					ASSERT(!NOT_BLOCKED(lock1));
4479 					path(lock1, lock2);
4480 				}
4481 			}
4482 		}
4483 
4484 		for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4485 		    lock2 = lock2->l_next) {
4486 			ASSERT(!IS_BARRIER(lock1));
4487 			if (lock1->l_vnode == lock2->l_vnode) {
4488 				if (BLOCKS(lock2, lock1)) {
4489 					ASSERT(!IS_GRANTED(lock1));
4490 					ASSERT(!NOT_BLOCKED(lock1));
4491 					path(lock1, lock2);
4492 				}
4493 			}
4494 		}
4495 		ep = FIRST_ADJ(lock1);
4496 		while (ep != HEAD(lock1)) {
4497 			ASSERT(BLOCKS(ep->to_vertex, lock1));
4498 			ep = NEXT_ADJ(ep);
4499 		}
4500 	}
4501 }
4502 
4503 static int
level_two_path(lock_descriptor_t * lock1,lock_descriptor_t * lock2,int no_path)4504 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4505 {
4506 	edge_t	*ep;
4507 	lock_descriptor_t	*vertex;
4508 	lock_descriptor_t *vertex_stack;
4509 
4510 	STACK_INIT(vertex_stack);
4511 
4512 	flk_graph_uncolor(lock1->l_graph);
4513 	ep = FIRST_ADJ(lock1);
4514 	ASSERT(ep != HEAD(lock1));
4515 	while (ep != HEAD(lock1)) {
4516 		if (no_path)
4517 			ASSERT(ep->to_vertex != lock2);
4518 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4519 		COLOR(ep->to_vertex);
4520 		ep = NEXT_ADJ(ep);
4521 	}
4522 
4523 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4524 		STACK_POP(vertex_stack, l_dstack);
4525 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4526 		    ep = NEXT_ADJ(ep)) {
4527 			if (COLORED(ep->to_vertex))
4528 				continue;
4529 			COLOR(ep->to_vertex);
4530 			if (ep->to_vertex == lock2)
4531 				return (1);
4532 
4533 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4534 		}
4535 	}
4536 	return (0);
4537 }
4538 
4539 static void
check_owner_locks(graph_t * gp,pid_t pid,int sysid,vnode_t * vp)4540 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4541 {
4542 	lock_descriptor_t *lock;
4543 
4544 	/* Ignore OFD style locks since they're not process-wide. */
4545 	if (pid == 0)
4546 		return;
4547 
4548 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4549 
4550 	if (lock) {
4551 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4552 			if (lock->l_flock.l_pid == pid &&
4553 			    lock->l_flock.l_sysid == sysid)
4554 				cmn_err(CE_PANIC,
4555 				    "owner pid %d's lock %p in active queue",
4556 				    pid, (void *)lock);
4557 			lock = lock->l_next;
4558 		}
4559 	}
4560 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4561 
4562 	if (lock) {
4563 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4564 			if (lock->l_flock.l_pid == pid &&
4565 			    lock->l_flock.l_sysid == sysid)
4566 				cmn_err(CE_PANIC,
4567 				    "owner pid %d's lock %p in sleep queue",
4568 				    pid, (void *)lock);
4569 			lock = lock->l_next;
4570 		}
4571 	}
4572 }
4573 
4574 static int
level_one_path(lock_descriptor_t * lock1,lock_descriptor_t * lock2)4575 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4576 {
4577 	edge_t *ep = FIRST_ADJ(lock1);
4578 
4579 	while (ep != HEAD(lock1)) {
4580 		if (ep->to_vertex == lock2)
4581 			return (1);
4582 		else
4583 			ep = NEXT_ADJ(ep);
4584 	}
4585 	return (0);
4586 }
4587 
4588 static int
no_path(lock_descriptor_t * lock1,lock_descriptor_t * lock2)4589 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4590 {
4591 	return (!level_two_path(lock1, lock2, 1));
4592 }
4593 
4594 static void
path(lock_descriptor_t * lock1,lock_descriptor_t * lock2)4595 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4596 {
4597 	if (level_one_path(lock1, lock2)) {
4598 		if (level_two_path(lock1, lock2, 0) != 0) {
4599 			cmn_err(CE_WARN,
4600 			    "one edge one path from lock1 %p lock2 %p",
4601 			    (void *)lock1, (void *)lock2);
4602 		}
4603 	} else if (no_path(lock1, lock2)) {
4604 		cmn_err(CE_PANIC,
4605 		    "No path from  lock1 %p to lock2 %p",
4606 		    (void *)lock1, (void *)lock2);
4607 	}
4608 }
4609 #endif /* DEBUG */
4610