1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2015 Joyent, Inc.
26 */
27
28#ifndef _SYS_FLOCK_IMPL_H
29#define	_SYS_FLOCK_IMPL_H
30
31#include <sys/types.h>
32#include <sys/fcntl.h>		/* flock definition */
33#include <sys/file.h>		/* FREAD etc */
34#include <sys/flock.h>		/* RCMD etc */
35#include <sys/kmem.h>
36#include <sys/user.h>
37#include <sys/thread.h>
38#include <sys/proc.h>
39#include <sys/cred.h>
40#include <sys/debug.h>
41#include <sys/cmn_err.h>
42#include <sys/errno.h>
43#include <sys/systm.h>
44#include <sys/vnode.h>
45#include <sys/share.h>		/* just to get GETSYSID def */
46
47#ifdef	__cplusplus
48extern "C" {
49#endif
50
/*
 * A dependency edge between two lock descriptors.  Each edge sits on two
 * doubly linked lists at the same time: an adjacency list and an
 * incoming-edge list.
 */
typedef struct edge {
	struct edge		*edge_adj_next;	/* next edge, adjacency list */
	struct edge		*edge_adj_prev;	/* prev edge, adjacency list */
	struct edge		*edge_in_next;	/* next edge, incoming list */
	struct edge		*edge_in_prev;	/* prev edge, incoming list */
	struct lock_descriptor	*from_vertex;	/* lock the edge emanates from */
	struct lock_descriptor	*to_vertex;	/* lock the edge points to */
} edge_t;
61
62struct lock_descriptor {
63	struct	lock_descriptor	*l_next;	/* next active/sleep lock */
64	struct	lock_descriptor	*l_prev;	/* previous active/sleep lock */
65	struct	edge		l_edge;		/* edge for adj and in lists */
66	struct	lock_descriptor	*l_stack;	/* for stack operations */
67	struct	lock_descriptor	*l_stack1;	/* for stack operations */
68	struct 	lock_descriptor *l_dstack;	/* stack for debug functions */
69	struct	edge		*l_sedge;	/* start edge for graph alg. */
70			int	l_index; 	/* used for barrier count */
71		struct	graph	*l_graph;	/* graph this belongs to */
72		vnode_t		*l_vnode;	/* vnode being locked */
73			int	l_type;		/* type of lock */
74			int	l_state;	/* state described below */
75		u_offset_t	l_start;	/* start offset */
76		u_offset_t	l_end;		/* end offset */
77		flock64_t	l_flock;	/* original flock request */
78			int	l_color;	/* color used for graph alg */
79		kcondvar_t	l_cv;		/* wait condition for lock */
80		int		pvertex;	/* index to proc vertex */
81			int	l_status;	/* status described below */
82		flk_nlm_status_t l_nlm_state;	/* state of NLM server */
83		flk_callback_t	*l_callbacks;	/* callbacks, or NULL */
84		zoneid_t	l_zoneid;	/* zone of request */
85		file_t		*l_ofd;		/* OFD-style reference */
86};
87
88typedef struct 	lock_descriptor	lock_descriptor_t;
89
90/*
91 * Each graph holds locking information for some number of vnodes.  The
92 * active and sleeping lists are circular, with a dummy head element.
93 */
94
95struct	graph {
96	kmutex_t	gp_mutex;	/* mutex for this graph */
97	struct	lock_descriptor	active_locks;
98	struct	lock_descriptor	sleeping_locks;
99	int index;	/* index of this graph into the hash table */
100	int mark;	/* used for coloring the graph */
101};
102
103typedef	struct	graph	graph_t;
104
105/*
106 * The possible states a lock can be in.  These states are stored in the
107 * 'l_status' member of the 'lock_descriptor_t' structure.  All locks start
108 * life in the INITIAL state, and end up in the DEAD state.  Possible state
109 * transitions are :
110 *
111 * INITIAL--> START    --> ACTIVE    --> DEAD
112 *
113 *                     --> DEAD
114 *
115 *        --> ACTIVE   --> DEAD          (new locks from flk_relation)
116 *
117 *        --> SLEEPING --> GRANTED   --> START     --> ACTIVE --> DEAD
118 *
119 *                                   --> INTR      --> DEAD
120 *
121 *                                   --> CANCELLED --> DEAD
122 *
123 *                                                 --> INTR   --> DEAD
124 *
125 *                     --> INTR      --> DEAD
126 *
127 *                     --> CANCELLED --> DEAD
128 *
129 *                                   --> INTR      --> DEAD
130 *
131 * Lock transitions are done in the following functions:
132 * --> INITIAL		flk_get_lock(), reclock()
133 * --> START		flk_execute_request()
134 * --> ACTIVE		flk_insert_active_lock()
135 * --> SLEEPING		flk_insert_sleeping_lock()
136 * --> GRANTED		GRANT_WAKEUP
137 * --> INTERRUPTED	INTERRUPT_WAKEUP
138 * --> CANCELLED	CANCEL_WAKEUP
139 * --> DEAD		reclock(), flk_delete_active_lock(), and
140 *                          flk_cancel_sleeping_lock()
141 */
142
/* Values stored in l_status; tested by the IS_INITIAL() ... IS_DEAD() macros */
#define	FLK_INITIAL_STATE	1	/* every request starts here */
#define	FLK_START_STATE		2	/* request has begun executing */
#define	FLK_ACTIVE_STATE	3	/* lock is in the active queue */
#define	FLK_SLEEPING_STATE	4	/* request is blocked */
#define	FLK_GRANTED_STATE	5	/* request has been granted */
#define	FLK_INTERRUPTED_STATE	6	/* request was interrupted */
#define	FLK_CANCELLED_STATE	7	/* request was cancelled */
#define	FLK_DEAD_STATE		8	/* request is done - will be deleted */
151
152/* flags defining state of locks */
153
154/*
155 * The LLM design has been modified so that lock states are now stored
156 * in the l_status field of lock_descriptor_t.  The l_state field is
157 * currently preserved for binary compatibility, but may be modified or
158 * removed in a minor release of Solaris.  Note that both of these
159 * fields (and the rest of the lock_descriptor_t structure) are private
160 * to the implementation of the lock manager and should not be used
161 * externally.
162 */
163
/* Bit flags kept in l_state; each flag occupies its own bit */
#define	ACTIVE_LOCK		0x0001	/* on the active queue */
#define	SLEEPING_LOCK		0x0002	/* on the sleep queue */
#define	IO_LOCK			0x0004	/* lock is an IO lock */
#define	REFERENCED_LOCK		0x0008	/* referenced some where */
#define	QUERY_LOCK		0x0010	/* request only queries a lock */
#define	WILLING_TO_SLEEP_LOCK	0x0020	/* may be put on the sleep queue */
#define	RECOMPUTE_LOCK		0x0040	/* used for recomputing dependencies */
#define	RECOMPUTE_DONE		0x0080	/* used for recomputing dependencies */
#define	BARRIER_LOCK		0x0100	/* used for recomputing dependencies */
#define	GRANTED_LOCK		0x0200	/* granted, still on the sleep queue */
#define	CANCELLED_LOCK		0x0400	/* cancelled, will be thrown out */
#define	DELETED_LOCK		0x0800	/* deleted - free at earliest */
#define	INTERRUPTED_LOCK	0x1000	/* pretend signal */
#define	LOCKMGR_LOCK		0x2000	/* remote lock (server-side) */
/* Clustering: flag for PXFS locks */
#define	PXFS_LOCK		0x4000	/* created by the PXFS file system */
#define	NBMAND_LOCK		0x8000	/* non-blocking mandatory locking */
181
/*
 * Lock graph hash parameters.  HASH_SHIFT is, despite its name, used as a
 * bit mask (HASH_SIZE - 1), so HASH_SIZE has to stay a power of two.
 */
#define	HASH_SIZE	32
#define	HASH_SHIFT	(HASH_SIZE - 1)
/*
 * Hash a vnode pointer: discard the low 7 address bits and mask the
 * result down to the table size.  The argument is parenthesized before
 * the cast so that pointer expressions such as "vp + 1" are evaluated as
 * pointers first and only then converted to an integer.
 */
#define	HASH_INDEX(vp)	(((uintptr_t)(vp) >> 7) & HASH_SHIFT)
185
/* extern definitions */

/*
 * Array of HASH_SIZE graph pointers.
 * NOTE(review): presumably indexed with HASH_INDEX() of the vnode being
 * locked -- confirm against the definition in flock.c.
 */
extern struct graph	*lock_graph[HASH_SIZE];
/* NOTE(review): presumably the kmem cache edge_t objects come from -- confirm */
extern struct kmem_cache *flk_edge_cache;

/*
 * Clustering: functions called by PXFS
 *
 * NOTE(review): the int argument of flk_set_state() is presumably one of
 * the FLK_*_STATE values and that of flk_get_lock_graph() one of
 * FLK_USE_GRAPH/FLK_INIT_GRAPH -- confirm against flock.c.
 */
int flk_execute_request(lock_descriptor_t *);
void flk_cancel_sleeping_lock(lock_descriptor_t *, int);
void flk_set_state(lock_descriptor_t *, int);
graph_t *flk_get_lock_graph(vnode_t *, int);
196
/* Named flag values that exist to make call sites in flock.c readable */

#define	FLK_USE_GRAPH	0	/* do not initialize the lock_graph */
#define	FLK_INIT_GRAPH	1	/* initialize the lock graph */
#define	NO_COLOR	0	/* vertex has not been colored */
#define	NO_CHECK_CYCLE	0	/* do not mark vertices in flk_add_edge */
#define	CHECK_CYCLE	1	/* mark vertices in flk_add_edge */
204
/*
 * Two locks have the same owner when the pid and sysid of their flock
 * requests and their OFD references all match.
 */
#define	SAME_OWNER(lock1, lock2)	\
	((lock1)->l_flock.l_pid == (lock2)->l_flock.l_pid &&	\
	(lock1)->l_flock.l_sysid == (lock2)->l_flock.l_sysid &&	\
	(lock1)->l_ofd == (lock2)->l_ofd)

/*
 * A vertex counts as colored when its l_color equals the current mark of
 * the graph it belongs to; COLOR() stamps the vertex with that mark.
 */
#define	COLORED(vertex)	((vertex)->l_color == (vertex)->l_graph->mark)
#define	COLOR(vertex)	((vertex)->l_color = (vertex)->l_graph->mark)
212
/*
 * Intrusive stack operations.
 *
 * "stack" is an lvalue holding the top-of-stack pointer and "stack_link"
 * names the member of the element type used to chain elements together.
 * Every macro expands to a single fully parenthesized expression, so it
 * can be used as a statement or safely embedded in a larger expression
 * (for example "STACK_POP(s, link) == NULL").  STACK_PUSH and STACK_POP
 * evaluate to the new top of the stack.  STACK_POP must not be applied to
 * an empty stack because it dereferences the current top.
 */

#define	STACK_INIT(stack)	((stack) = NULL)
#define	STACK_PUSH(stack, ptr, stack_link)	\
	((ptr)->stack_link = (stack), (stack) = (ptr))
#define	STACK_POP(stack, stack_link)	((stack) = (stack)->stack_link)
#define	STACK_TOP(stack)	(stack)
#define	STACK_EMPTY(stack)	((stack) == NULL)
223
224
/* Address of the dummy head element of a graph's active list */
#define	ACTIVE_HEAD(gp)	(&(gp)->active_locks)

/* Address of the dummy head element of a graph's sleeping list */
#define	SLEEPING_HEAD(gp)	(&(gp)->sleeping_locks)

/*
 * Set "lock" to whatever vp->v_filocks holds, cast to a lock descriptor
 * pointer.  "gp" is accepted for symmetry with the sleeping variant but
 * is not used.  "vp" is parenthesized so that any pointer expression can
 * be passed.
 */
#define	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp) \
{ \
	(lock) = (lock_descriptor_t *)(vp)->v_filocks;	\
}

/*
 * Walk the sleeping list of "gp" from the element after the dummy head
 * and leave "lock" pointing at the first entry whose l_vnode equals "vp",
 * or at NULL when the walk gets back to the head without a match.
 */
#define	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp) \
{ \
for ((lock) = SLEEPING_HEAD((gp))->l_next; ((lock) != SLEEPING_HEAD((gp)) && \
			(lock)->l_vnode != (vp)); (lock) = (lock)->l_next) \
			; \
(lock) = ((lock) == SLEEPING_HEAD((gp))) ? NULL : (lock); \
}
241
/*
 * True when the inclusive ranges [l_start, l_end] of the two locks share
 * at least one offset: the start of one lies inside the other.
 */
#define	OVERLAP(lock1, lock2)	\
	(((lock2)->l_start >= (lock1)->l_start &&	\
	(lock1)->l_end >= (lock2)->l_start) ||		\
	((lock1)->l_start >= (lock2)->l_start &&	\
	(lock2)->l_end >= (lock1)->l_start))
247
/* Predicates on l_status: true when the lock is in the named state */
#define	IS_INITIAL(lock)	((lock)->l_status == FLK_INITIAL_STATE)
#define	IS_ACTIVE(lock)		((lock)->l_status == FLK_ACTIVE_STATE)
#define	IS_SLEEPING(lock)	((lock)->l_status == FLK_SLEEPING_STATE)
#define	IS_GRANTED(lock)	((lock)->l_status == FLK_GRANTED_STATE)
#define	IS_INTERRUPTED(lock)	((lock)->l_status == FLK_INTERRUPTED_STATE)
#define	IS_CANCELLED(lock)	((lock)->l_status == FLK_CANCELLED_STATE)
#define	IS_DEAD(lock)		((lock)->l_status == FLK_DEAD_STATE)

/*
 * Predicates on the l_state flag bits.  Each one yields the masked bit
 * itself (zero or the flag value), not a normalized 0/1.
 */
#define	IS_QUERY_LOCK(lock)	((lock)->l_state & QUERY_LOCK)
#define	IS_RECOMPUTE(lock)	((lock)->l_state & RECOMPUTE_LOCK)
#define	IS_BARRIER(lock)	((lock)->l_state & BARRIER_LOCK)
#define	IS_DELETED(lock)	((lock)->l_state & DELETED_LOCK)
#define	IS_REFERENCED(lock)	((lock)->l_state & REFERENCED_LOCK)
#define	IS_IO_LOCK(lock)	((lock)->l_state & IO_LOCK)
#define	IS_WILLING_TO_SLEEP(lock)	\
	((lock)->l_state & WILLING_TO_SLEEP_LOCK)
#define	IS_LOCKMGR(lock)	((lock)->l_state & LOCKMGR_LOCK)
/* Clustering: Macro for PXFS locks */
#define	IS_PXFS(lock)		((lock)->l_state & PXFS_LOCK)

/* True when the NLM state recorded in the lock is FLK_NLM_UP */
#define	IS_NLM_UP(lock)		((lock)->l_nlm_state == FLK_NLM_UP)
268
/*
 * "local" requests never involve the NFS lock manager.
 *
 * "remote" requests come in two flavours.  On a server they originate
 * from a remote client and should be associated with a local vnode (UFS,
 * tmpfs, etc.); they carry the LOCKMGR_LOCK flag and are made using
 * kernel service threads.  On an NFS client they are made by regular
 * user processes, because the NFS lock manager uses local locking for
 * some of its bookkeeping.
 *
 * A request is local exactly when GETSYSID() of its sysid is zero.
 */
#define	IS_LOCAL(lock)	(GETSYSID((lock)->l_flock.l_sysid) == 0)
#define	IS_REMOTE(lock)	(!IS_LOCAL(lock))
280
/* Clustering: Return value for blocking PXFS locks */
/*
 * For PXFS locks, reclock() will return this error code for requests that
 * need to block.  The negative value is parenthesized so the macro always
 * expands to a single operand, whatever expression it is pasted into.
 */
#define	PXFS_LOCK_BLOCKED (-1)
287
/* Clustering: PXFS callback function */
/*
 * This function is a callback from the LLM into the PXFS server module.  It
 * is initialized as a weak stub, and is functional when the pxfs server module
 * is loaded.
 *
 * It receives the lock concerned together with its old and new state.
 * NOTE(review): the two state arguments are presumably FLK_*_STATE values
 * as stored in l_status -- confirm against the caller in flock.c.
 */
extern void cl_flk_state_transition_notify(lock_descriptor_t *lock,
    int old_state, int new_state);
296
/*
 * Two locks block each other when they have different owners, at least
 * one of them is a write lock, and their ranges overlap.
 */
#define	BLOCKS(lock1, lock2)	\
	(!SAME_OWNER((lock1), (lock2)) &&	\
	(((lock1)->l_type == F_WRLCK) || ((lock2)->l_type == F_WRLCK)) && \
	OVERLAP((lock1), (lock2)))
301
/* True when the range of lock1 fully contains the range of lock2 */
#define	COVERS(lock1, lock2)	\
	((lock2)->l_start >= (lock1)->l_start &&	\
	(lock2)->l_end <= (lock1)->l_end)
305
/*
 * Unlink an edge from the incoming-edge list it is on.  Only the two
 * neighbours are rewired; the edge's own links are left untouched.
 */
#define	IN_LIST_REMOVE(ep)	\
	{ \
		(ep)->edge_in_next->edge_in_prev = (ep)->edge_in_prev; \
		(ep)->edge_in_prev->edge_in_next = (ep)->edge_in_next; \
	}

/*
 * Unlink an edge from the adjacency list it is on.  Only the two
 * neighbours are rewired; the edge's own links are left untouched.
 */
#define	ADJ_LIST_REMOVE(ep)	\
	{ \
		(ep)->edge_adj_next->edge_adj_prev = (ep)->edge_adj_prev; \
		(ep)->edge_adj_prev->edge_adj_next = (ep)->edge_adj_next; \
	}
317
/*
 * True when the lock's adjacency list is empty (the list head points
 * back at itself) and the lock is not in the granted state.
 */
#define	NOT_BLOCKED(lock)	\
	(&(lock)->l_edge == (lock)->l_edge.edge_adj_next &&	\
	!IS_GRANTED(lock))
320
/*
 * Put the lock into the GRANTED state: flk_set_state() is called first,
 * then GRANTED_LOCK is set in l_state, and finally the lock's condition
 * variable is signalled unless the lock is a PXFS lock.
 */
#define	GRANT_WAKEUP(lock)	\
	{	\
		flk_set_state(lock, FLK_GRANTED_STATE); \
		(lock)->l_state |= GRANTED_LOCK; \
		/* \
		 * Clustering: PXFS locks do not sleep in the LLM, \
		 * so there is no need to signal them \
		 */ \
		if (!IS_PXFS(lock)) { \
			cv_signal(&(lock)->l_cv); \
		} \
	}

/*
 * Put the lock into the CANCELLED state: flk_set_state() is called first,
 * then CANCELLED_LOCK is set in l_state, and finally the lock's condition
 * variable is signalled unless the lock is a PXFS lock.
 */
#define	CANCEL_WAKEUP(lock)	\
	{ \
		flk_set_state(lock, FLK_CANCELLED_STATE); \
		(lock)->l_state |= CANCELLED_LOCK; \
		/* \
		 * Clustering: PXFS locks do not sleep in the LLM, \
		 * so there is no need to signal them \
		 */ \
		if (!IS_PXFS(lock)) { \
			cv_signal(&(lock)->l_cv); \
		} \
	}

/*
 * Put the lock into the INTERRUPTED state: flk_set_state() is called
 * first, then INTERRUPTED_LOCK is set in l_state, and finally the lock's
 * condition variable is signalled unless the lock is a PXFS lock.
 */
#define	INTERRUPT_WAKEUP(lock)	\
	{ \
		flk_set_state(lock, FLK_INTERRUPTED_STATE); \
		(lock)->l_state |= INTERRUPTED_LOCK; \
		/* \
		 * Clustering: PXFS locks do not sleep in the LLM, \
		 * so there is no need to signal them \
		 */ \
		if (!IS_PXFS(lock)) { \
			cv_signal(&(lock)->l_cv); \
		} \
	}
359
/*
 * Take a lock off the sleeping list.  The lock must be in the sleeping,
 * granted, interrupted or cancelled state (asserted).  SLEEPING_LOCK is
 * cleared from l_state, the neighbours are linked to each other, and the
 * lock's own list pointers are reset to NULL.
 */
#define	REMOVE_SLEEP_QUEUE(lock)	\
	{ \
		ASSERT(IS_SLEEPING(lock) || IS_GRANTED(lock) || \
		    IS_INTERRUPTED(lock) || IS_CANCELLED(lock)); \
		(lock)->l_state &= ~SLEEPING_LOCK; \
		(lock)->l_next->l_prev = (lock)->l_prev; \
		(lock)->l_prev->l_next = (lock)->l_next; \
		(lock)->l_prev = (lock_descriptor_t *)NULL; \
		(lock)->l_next = (lock_descriptor_t *)NULL; \
	}
369
/*
 * True when the lock's incoming-edge list is empty, i.e. the list head
 * embedded in the lock points back at itself.
 */
#define	NO_DEPENDENTS(lock)	\
	(&(lock)->l_edge == (lock)->l_edge.edge_in_next)
372
/*
 * Mark the lock as granted without signalling anything: GRANTED_LOCK is
 * set in l_state first and flk_set_state() is called afterwards (the
 * opposite order from GRANT_WAKEUP).
 */
#define	GRANT(lock)	\
	{ \
	(lock)->l_state |= GRANTED_LOCK; \
	flk_set_state(lock, FLK_GRANTED_STATE); \
	}
378
/*
 * Edge list accessors.  The l_edge member embedded in a lock is the head
 * of both of its edge lists; FIRST_IN/FIRST_ADJ give the first real
 * entry (or the head itself when the list is empty) and NEXT_IN/NEXT_ADJ
 * step from one edge to the next.
 */
#define	HEAD(lock)	(&(lock)->l_edge)
#define	FIRST_ADJ(lock)	((lock)->l_edge.edge_adj_next)
#define	FIRST_IN(lock)	((lock)->l_edge.edge_in_next)
#define	NEXT_ADJ(ep)	((ep)->edge_adj_next)
#define	NEXT_IN(ep)	((ep)->edge_in_next)

/* Make both edge lists of a lock empty: every link points at the head */
#define	IN_ADJ_INIT(lock)	\
{	\
	(lock)->l_edge.edge_adj_prev = &(lock)->l_edge;	\
	(lock)->l_edge.edge_adj_next = &(lock)->l_edge;	\
	(lock)->l_edge.edge_in_prev = &(lock)->l_edge;	\
	(lock)->l_edge.edge_in_next = &(lock)->l_edge;	\
}
389
/*
 * Copy the fields that describe a request from lock2 into lock1: graph,
 * vnode, type, l_state flag bits, range, the original flock request,
 * zone id and proc vertex index.  All other members of lock1 (list and
 * edge linkage, l_status, l_cv, ...) are left as they were.
 * NOTE(review): l_ofd is not copied even though SAME_OWNER() compares
 * it -- confirm that this is intended for every caller.
 */
#define	COPY(lock1, lock2)	\
{	\
	(lock1)->l_graph = (lock2)->l_graph;	\
	(lock1)->l_vnode = (lock2)->l_vnode;	\
	(lock1)->l_type = (lock2)->l_type;	\
	(lock1)->l_state = (lock2)->l_state;	\
	(lock1)->l_start = (lock2)->l_start;	\
	(lock1)->l_end = (lock2)->l_end;	\
	(lock1)->l_flock = (lock2)->l_flock;	\
	(lock1)->l_zoneid = (lock2)->l_zoneid;	\
	(lock1)->pvertex = (lock2)->pvertex;	\
}
402
403/*
404 * Clustering
405 */
/* Set and fetch the NLM state recorded in a lock request */
#define	GET_NLM_STATE(lock)	((lock)->l_nlm_state)
#define	SET_NLM_STATE(lock, nlm_state)	((lock)->l_nlm_state = (nlm_state))
/*
 * NLM registry abstraction:
 *   Abstraction overview:
 *   The registry tracks, by nlmid, the NLM servers that have requested
 *   locks at the LLM the registry is associated with.
 *   In these macros the registry is an array indexed by nlmid whose
 *   elements hold NLM state values.
 */
/* Routines to manipulate the NLM registry object state */
#define	FLK_REGISTRY_IS_NLM_UNKNOWN(nlmreg, nlmid)	\
	((nlmreg)[nlmid] == FLK_NLM_UNKNOWN)
#define	FLK_REGISTRY_IS_NLM_UP(nlmreg, nlmid)	\
	((nlmreg)[nlmid] == FLK_NLM_UP)
#define	FLK_REGISTRY_ADD_NLMID(nlmreg, nlmid)	\
	((nlmreg)[nlmid] = FLK_NLM_UP)
#define	FLK_REGISTRY_CHANGE_NLM_STATE(nlmreg, nlmid, state)	\
	((nlmreg)[nlmid] = (state))
425
/*
 * Effect that executing a request has on an existing lock.  These are
 * distinct values, not independent bits (FLK_UPGRADE is 0x3).
 */

#define	FLK_UNLOCK	0x1	/* existing lock gets unlocked */
#define	FLK_DOWNGRADE	0x2	/* existing lock gets downgraded */
#define	FLK_UPGRADE	0x3	/* existing lock gets upgraded */
#define	FLK_STAY_SAME	0x4	/* request type equals the existing lock's */
432
433
434/*	proc graph definitions	*/
435
436/*
437 * Proc graph is the global process graph that maintains information
438 * about the dependencies between processes. An edge is added between two
439 * processes represented by proc_vertex's A and B, iff there exists l1
440 * owned by process A in any of the lock_graph's dependent on l2
441 * (thus having an edge to l2) owned by process B.
442 */
/* One vertex of the process graph, identified by (pid, sysid) */
typedef struct proc_vertex {
	pid_t			pid;		/* pid of the process */
	long			sysid;		/* sysid of the process */
	struct proc_edge	*edge;		/* adjacent edges of this process */
	int			incount;	/* number of inedges to this process */
	struct proc_edge	*p_sedge;	/* used for implementing stack alg. */
	struct proc_vertex	*p_stack;	/* used for stack alg. */
	int			atime;		/* used for cycle detection algorithm */
	int			dtime;		/* used for cycle detection algorithm */
	int			index;		/* index into the proc_graph vertex array */
} proc_vertex_t;
456
/* One edge of the process graph; edges form a singly linked adjacency list */
typedef struct proc_edge {
	struct proc_edge	*next;		/* next edge in adjacency list */
	int			refcount;	/* reference count of this edge */
	struct proc_vertex	*to_proc;	/* process this edge points to */
} proc_edge_t;
464
465
#define	PROC_CHUNK	100

/* The process graph: a list of proc_vertex pointers plus bookkeeping */
typedef struct proc_graph {
	struct proc_vertex	**proc;		/* list of proc_vertexes */
	int			gcount;		/* list size */
	int			free;		/* number of free slots in the list */
	int			mark;		/* used for graph coloring */
} proc_graph_t;

extern	struct proc_graph	pgraph;
478
/* True when the lock's pid and sysid both match those of the proc vertex */
#define	PROC_SAME_OWNER(lock, pvertex)	\
	((lock)->l_flock.l_pid == (pvertex)->pid &&	\
	(lock)->l_flock.l_sysid == (pvertex)->sysid)

/*
 * Arrival/departure stamps: a vertex has arrived (departed) when its
 * atime (dtime) equals the current pgraph.mark; PROC_ARRIVE and
 * PROC_DEPART stamp the vertex with that mark.
 */
#define	PROC_ARRIVED(pvertex)	((pvertex)->atime == pgraph.mark)
#define	PROC_DEPARTED(pvertex)	((pvertex)->dtime == pgraph.mark)
#define	PROC_ARRIVE(pvertex)	((pvertex)->atime = pgraph.mark)
#define	PROC_DEPART(pvertex)	((pvertex)->dtime = pgraph.mark)
487
488#ifdef	__cplusplus
489}
490#endif
491
492#endif	/* _SYS_FLOCK_IMPL_H */
493