xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_state.c (revision aab20b47)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2018 Nexenta Systems, Inc.
28  * Copyright 2019 Nexenta by DDN, Inc.
29  */
30 
31 #include <sys/systm.h>
32 #include <sys/kmem.h>
33 #include <sys/cmn_err.h>
34 #include <sys/atomic.h>
35 #include <sys/clconf.h>
36 #include <sys/cladm.h>
37 #include <sys/flock.h>
38 #include <nfs/export.h>
39 #include <nfs/nfs.h>
40 #include <nfs/nfs4.h>
41 #include <nfs/nfssys.h>
42 #include <nfs/lm.h>
43 #include <sys/pathname.h>
44 #include <sys/sdt.h>
45 #include <sys/nvpair.h>
46 
47 extern u_longlong_t nfs4_srv_caller_id;
48 
49 extern uint_t nfs4_srv_vkey;
50 
/*
 * The first "special" stateid: sequence number zero and every byte of the
 * twelve-byte opaque portion zero.
 */
stateid4 special0 = {
	0,
	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};

/*
 * The second "special" stateid: sequence number 0xffffffff and every byte
 * of the twelve-byte opaque portion 0xff.
 */
stateid4 special1 = {
	0xffffffff,
	{
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff
	}
};


/*
 * Non-zero when stateid4_cmp() reports a match between `id' and either of
 * the two special stateids above.
 * NOTE(review): stateid4_cmp() is defined elsewhere; presumably it returns
 * non-zero on equality -- confirm against its definition.
 */
#define	ISSPECIAL(id)  (stateid4_cmp(id, &special0) || \
			stateid4_cmp(id, &special1))
68 
69 /* For embedding the cluster nodeid into our clientid */
70 #define	CLUSTER_NODEID_SHIFT	24
71 #define	CLUSTER_MAX_NODEID	255
72 
73 #ifdef DEBUG
74 int rfs4_debug;
75 #endif
76 
77 rfs4_db_mem_cache_t rfs4_db_mem_cache_table[RFS4_DB_MEM_CACHE_NUM];
78 static uint32_t rfs4_database_debug = 0x00;
79 
80 /* CSTYLED */
81 static void rfs4_ss_clid_write(nfs4_srv_t *nsrv4, rfs4_client_t *cp, char *leaf);
82 static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
83 static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
84 static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
85 
86 /*
87  * Couple of simple init/destroy functions for a general waiter
88  */
89 void
rfs4_sw_init(rfs4_state_wait_t * swp)90 rfs4_sw_init(rfs4_state_wait_t *swp)
91 {
92 	mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
93 	cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
94 	swp->sw_active = FALSE;
95 	swp->sw_wait_count = 0;
96 }
97 
98 void
rfs4_sw_destroy(rfs4_state_wait_t * swp)99 rfs4_sw_destroy(rfs4_state_wait_t *swp)
100 {
101 	mutex_destroy(swp->sw_cv_lock);
102 	cv_destroy(swp->sw_cv);
103 }
104 
105 void
rfs4_sw_enter(rfs4_state_wait_t * swp)106 rfs4_sw_enter(rfs4_state_wait_t *swp)
107 {
108 	mutex_enter(swp->sw_cv_lock);
109 	while (swp->sw_active) {
110 		swp->sw_wait_count++;
111 		cv_wait(swp->sw_cv, swp->sw_cv_lock);
112 		swp->sw_wait_count--;
113 	}
114 	ASSERT(swp->sw_active == FALSE);
115 	swp->sw_active = TRUE;
116 	mutex_exit(swp->sw_cv_lock);
117 }
118 
119 void
rfs4_sw_exit(rfs4_state_wait_t * swp)120 rfs4_sw_exit(rfs4_state_wait_t *swp)
121 {
122 	mutex_enter(swp->sw_cv_lock);
123 	ASSERT(swp->sw_active == TRUE);
124 	swp->sw_active = FALSE;
125 	if (swp->sw_wait_count != 0)
126 		cv_broadcast(swp->sw_cv);
127 	mutex_exit(swp->sw_cv_lock);
128 }
129 
130 static void
deep_lock_copy(LOCK4res * dres,LOCK4res * sres)131 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
132 {
133 	lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
134 	lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
135 
136 	if (sres->status == NFS4ERR_DENIED) {
137 		dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
138 		bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
139 	}
140 }
141 
142 /*
143  * CPR callback id -- not related to v4 callbacks
144  */
145 static callb_id_t cpr_id = 0;
146 
147 static void
deep_lock_free(LOCK4res * res)148 deep_lock_free(LOCK4res *res)
149 {
150 	lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
151 
152 	if (res->status == NFS4ERR_DENIED)
153 		kmem_free(lo->owner_val, lo->owner_len);
154 }
155 
156 static void
deep_open_copy(OPEN4res * dres,OPEN4res * sres)157 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
158 {
159 	nfsace4 *sacep, *dacep;
160 
161 	if (sres->status != NFS4_OK) {
162 		return;
163 	}
164 
165 	dres->attrset = sres->attrset;
166 
167 	switch (sres->delegation.delegation_type) {
168 	case OPEN_DELEGATE_NONE:
169 		return;
170 	case OPEN_DELEGATE_READ:
171 		sacep = &sres->delegation.open_delegation4_u.read.permissions;
172 		dacep = &dres->delegation.open_delegation4_u.read.permissions;
173 		break;
174 	case OPEN_DELEGATE_WRITE:
175 		sacep = &sres->delegation.open_delegation4_u.write.permissions;
176 		dacep = &dres->delegation.open_delegation4_u.write.permissions;
177 		break;
178 	}
179 	dacep->who.utf8string_val =
180 	    kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
181 	bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
182 	    sacep->who.utf8string_len);
183 }
184 
185 static void
deep_open_free(OPEN4res * res)186 deep_open_free(OPEN4res *res)
187 {
188 	nfsace4 *acep;
189 	if (res->status != NFS4_OK)
190 		return;
191 
192 	switch (res->delegation.delegation_type) {
193 	case OPEN_DELEGATE_NONE:
194 		return;
195 	case OPEN_DELEGATE_READ:
196 		acep = &res->delegation.open_delegation4_u.read.permissions;
197 		break;
198 	case OPEN_DELEGATE_WRITE:
199 		acep = &res->delegation.open_delegation4_u.write.permissions;
200 		break;
201 	}
202 
203 	if (acep->who.utf8string_val) {
204 		kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
205 		acep->who.utf8string_val = NULL;
206 	}
207 }
208 
209 void
rfs4_free_reply(nfs_resop4 * rp)210 rfs4_free_reply(nfs_resop4 *rp)
211 {
212 	switch (rp->resop) {
213 	case OP_LOCK:
214 		deep_lock_free(&rp->nfs_resop4_u.oplock);
215 		break;
216 	case OP_OPEN:
217 		deep_open_free(&rp->nfs_resop4_u.opopen);
218 	default:
219 		break;
220 	}
221 }
222 
223 void
rfs4_copy_reply(nfs_resop4 * dst,nfs_resop4 * src)224 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
225 {
226 	*dst = *src;
227 
228 	/* Handle responses that need deep copy */
229 	switch (src->resop) {
230 	case OP_LOCK:
231 		deep_lock_copy(&dst->nfs_resop4_u.oplock,
232 		    &src->nfs_resop4_u.oplock);
233 		break;
234 	case OP_OPEN:
235 		deep_open_copy(&dst->nfs_resop4_u.opopen,
236 		    &src->nfs_resop4_u.opopen);
237 		break;
238 	default:
239 		break;
240 	};
241 }
242 
243 /*
244  * This is the implementation of the underlying state engine. The
245  * public interface to this engine is described by
246  * nfs4_state.h. Callers to the engine should hold no state engine
247  * locks when they call in to it. If the protocol needs to lock data
248  * structures it should do so after acquiring all references to them
249  * first and then follow the following lock order:
250  *
251  *	client > openowner > state > lo_state > lockowner > file.
252  *
253  * Internally we only allow a thread to hold one hash bucket lock at a
254  * time and the lock is higher in the lock order (must be acquired
255  * first) than the data structure that is on that hash list.
256  *
257  * If a new reference was acquired by the caller, that reference needs
258  * to be released after releasing all acquired locks with the
259  * corresponding rfs4_*_rele routine.
260  */
261 
/*
 * This code is somewhat prototypical for now. Its purpose currently is to
 * implement the interfaces sufficiently to finish the higher protocol
 * elements. This will be replaced by dynamically resizeable tables
 * backed by the kmem_cache allocator. However, synchronization is handled
 * correctly (I hope) and will not change by much.  The mutexes for
 * the hash buckets that can be used to create new instances of data
 * structures might be good candidates to evolve into reader writer
 * locks. If it has to do a creation, it would be holding the
 * mutex across a kmem_alloc with KM_SLEEP specified.
 */
273 
/*
 * Default table size; deliberately tiny under DEBUG.
 */
#ifdef DEBUG
#define	TABSIZE 17
#else
#define	TABSIZE 2047
#endif

/* Hash an address by discarding its three low-order bits. */
#define	ADDRHASH(key) ((unsigned long)(key) >> 3)

/*
 * Parenthesized so the product stays a single operand wherever the macro
 * is expanded (e.g. "x / MAXTABSZ" or "x % MAXTABSZ").
 */
#define	MAXTABSZ (1024 * 1024)
283 
284 /* The values below are rfs4_lease_time units */
285 
286 #ifdef DEBUG
287 #define	CLIENT_CACHE_TIME 1
288 #define	OPENOWNER_CACHE_TIME 1
289 #define	STATE_CACHE_TIME 1
290 #define	LO_STATE_CACHE_TIME 1
291 #define	LOCKOWNER_CACHE_TIME 1
292 #define	FILE_CACHE_TIME 3
293 #define	DELEG_STATE_CACHE_TIME 1
294 #else
295 #define	CLIENT_CACHE_TIME 10
296 #define	OPENOWNER_CACHE_TIME 5
297 #define	STATE_CACHE_TIME 1
298 #define	LO_STATE_CACHE_TIME 1
299 #define	LOCKOWNER_CACHE_TIME 3
300 #define	FILE_CACHE_TIME 40
301 #define	DELEG_STATE_CACHE_TIME 1
302 #endif
303 
/*
 * NFSv4 server state databases
 *
 * Initialized when the module is loaded and used by NFSv4 state tables.
 * These kmem_cache databases are global; the tables that make use of these
 * are per zone.
 */
311 kmem_cache_t *rfs4_client_mem_cache;
312 kmem_cache_t *rfs4_clntIP_mem_cache;
313 kmem_cache_t *rfs4_openown_mem_cache;
314 kmem_cache_t *rfs4_openstID_mem_cache;
315 kmem_cache_t *rfs4_lockstID_mem_cache;
316 kmem_cache_t *rfs4_lockown_mem_cache;
317 kmem_cache_t *rfs4_file_mem_cache;
318 kmem_cache_t *rfs4_delegstID_mem_cache;
319 
320 /*
321  * NFSv4 state table functions
322  */
323 static bool_t rfs4_client_create(rfs4_entry_t, void *);
324 static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
325 static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
326 static void rfs4_client_destroy(rfs4_entry_t);
327 static bool_t rfs4_client_expiry(rfs4_entry_t);
328 static uint32_t clientid_hash(void *);
329 static bool_t clientid_compare(rfs4_entry_t, void *);
330 static void *clientid_mkkey(rfs4_entry_t);
331 static uint32_t nfsclnt_hash(void *);
332 static bool_t nfsclnt_compare(rfs4_entry_t, void *);
333 static void *nfsclnt_mkkey(rfs4_entry_t);
334 static bool_t rfs4_clntip_expiry(rfs4_entry_t);
335 static void rfs4_clntip_destroy(rfs4_entry_t);
336 static bool_t rfs4_clntip_create(rfs4_entry_t, void *);
337 static uint32_t clntip_hash(void *);
338 static bool_t clntip_compare(rfs4_entry_t, void *);
339 static void *clntip_mkkey(rfs4_entry_t);
340 static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
341 static void rfs4_openowner_destroy(rfs4_entry_t);
342 static bool_t rfs4_openowner_expiry(rfs4_entry_t);
343 static uint32_t openowner_hash(void *);
344 static bool_t openowner_compare(rfs4_entry_t, void *);
345 static void *openowner_mkkey(rfs4_entry_t);
346 static bool_t rfs4_state_create(rfs4_entry_t, void *);
347 static void rfs4_state_destroy(rfs4_entry_t);
348 static bool_t rfs4_state_expiry(rfs4_entry_t);
349 static uint32_t state_hash(void *);
350 static bool_t state_compare(rfs4_entry_t, void *);
351 static void *state_mkkey(rfs4_entry_t);
352 static uint32_t state_owner_file_hash(void *);
353 static bool_t state_owner_file_compare(rfs4_entry_t, void *);
354 static void *state_owner_file_mkkey(rfs4_entry_t);
355 static uint32_t state_file_hash(void *);
356 static bool_t state_file_compare(rfs4_entry_t, void *);
357 static void *state_file_mkkey(rfs4_entry_t);
358 static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
359 static void rfs4_lo_state_destroy(rfs4_entry_t);
360 static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
361 static uint32_t lo_state_hash(void *);
362 static bool_t lo_state_compare(rfs4_entry_t, void *);
363 static void *lo_state_mkkey(rfs4_entry_t);
364 static uint32_t lo_state_lo_hash(void *);
365 static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
366 static void *lo_state_lo_mkkey(rfs4_entry_t);
367 static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
368 static void rfs4_lockowner_destroy(rfs4_entry_t);
369 static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
370 static uint32_t lockowner_hash(void *);
371 static bool_t lockowner_compare(rfs4_entry_t, void *);
372 static void *lockowner_mkkey(rfs4_entry_t);
373 static uint32_t pid_hash(void *);
374 static bool_t pid_compare(rfs4_entry_t, void *);
375 static void *pid_mkkey(rfs4_entry_t);
376 static bool_t rfs4_file_create(rfs4_entry_t, void *);
377 static void rfs4_file_destroy(rfs4_entry_t);
378 static uint32_t file_hash(void *);
379 static bool_t file_compare(rfs4_entry_t, void *);
380 static void *file_mkkey(rfs4_entry_t);
381 static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
382 static void rfs4_deleg_state_destroy(rfs4_entry_t);
383 static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
384 static uint32_t deleg_hash(void *);
385 static bool_t deleg_compare(rfs4_entry_t, void *);
386 static void *deleg_mkkey(rfs4_entry_t);
387 static uint32_t deleg_state_hash(void *);
388 static bool_t deleg_state_compare(rfs4_entry_t, void *);
389 static void *deleg_state_mkkey(rfs4_entry_t);
390 
391 static void rfs4_state_rele_nounlock(rfs4_state_t *);
392 
393 static int rfs4_ss_enabled = 0;
394 
395 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
396 
397 void
rfs4_ss_pnfree(rfs4_ss_pn_t * ss_pn)398 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
399 {
400 	kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
401 }
402 
403 static rfs4_ss_pn_t *
rfs4_ss_pnalloc(char * dir,char * leaf)404 rfs4_ss_pnalloc(char *dir, char *leaf)
405 {
406 	rfs4_ss_pn_t *ss_pn;
407 	int dir_len, leaf_len;
408 
409 	/*
410 	 * validate we have a resonable path
411 	 * (account for the '/' and trailing null)
412 	 */
413 	if ((dir_len = strlen(dir)) > MAXPATHLEN ||
414 	    (leaf_len = strlen(leaf)) > MAXNAMELEN ||
415 	    (dir_len + leaf_len + 2) > MAXPATHLEN) {
416 		return (NULL);
417 	}
418 
419 	ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
420 
421 	(void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
422 	/* Handy pointer to just the leaf name */
423 	ss_pn->leaf = ss_pn->pn + dir_len + 1;
424 	return (ss_pn);
425 }
426 
427 
428 /*
429  * Move the "leaf" filename from "sdir" directory
430  * to the "ddir" directory. Return the pathname of
431  * the destination unless the rename fails in which
432  * case we need to return the source pathname.
433  */
434 static rfs4_ss_pn_t *
rfs4_ss_movestate(char * sdir,char * ddir,char * leaf)435 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
436 {
437 	rfs4_ss_pn_t *src, *dst;
438 
439 	if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
440 		return (NULL);
441 
442 	if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
443 		rfs4_ss_pnfree(src);
444 		return (NULL);
445 	}
446 
447 	/*
448 	 * If the rename fails we shall return the src
449 	 * pathname and free the dst. Otherwise we need
450 	 * to free the src and return the dst pathanme.
451 	 */
452 	if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
453 		rfs4_ss_pnfree(dst);
454 		return (src);
455 	}
456 	rfs4_ss_pnfree(src);
457 	return (dst);
458 }
459 
460 
461 static rfs4_oldstate_t *
rfs4_ss_getstate(vnode_t * dvp,rfs4_ss_pn_t * ss_pn)462 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
463 {
464 	struct uio uio;
465 	struct iovec iov[3];
466 
467 	rfs4_oldstate_t *cl_ss = NULL;
468 	vnode_t *vp;
469 	vattr_t va;
470 	uint_t id_len;
471 	int err, kill_file, file_vers;
472 
473 	if (ss_pn == NULL)
474 		return (NULL);
475 
476 	/*
477 	 * open the state file.
478 	 */
479 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
480 		return (NULL);
481 	}
482 
483 	if (vp->v_type != VREG) {
484 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
485 		VN_RELE(vp);
486 		return (NULL);
487 	}
488 
489 	err = VOP_ACCESS(vp, VREAD, 0, CRED(), NULL);
490 	if (err) {
491 		/*
492 		 * We don't have read access? better get the heck out.
493 		 */
494 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
495 		VN_RELE(vp);
496 		return (NULL);
497 	}
498 
499 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
500 	/*
501 	 * get the file size to do some basic validation
502 	 */
503 	va.va_mask = AT_SIZE;
504 	err = VOP_GETATTR(vp, &va, 0, CRED(), NULL);
505 
506 	kill_file = (va.va_size == 0 || va.va_size <
507 	    (NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
508 
509 	if (err || kill_file) {
510 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
511 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
512 		VN_RELE(vp);
513 		if (kill_file) {
514 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
515 		}
516 		return (NULL);
517 	}
518 
519 	cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
520 
521 	/*
522 	 * build iovecs to read in the file_version, verifier and id_len
523 	 */
524 	iov[0].iov_base = (caddr_t)&file_vers;
525 	iov[0].iov_len = sizeof (int);
526 	iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
527 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
528 	iov[2].iov_base = (caddr_t)&id_len;
529 	iov[2].iov_len = sizeof (uint_t);
530 
531 	uio.uio_iov = iov;
532 	uio.uio_iovcnt = 3;
533 	uio.uio_segflg = UIO_SYSSPACE;
534 	uio.uio_loffset = 0;
535 	uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
536 
537 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
538 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
539 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
540 		VN_RELE(vp);
541 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
542 		return (NULL);
543 	}
544 
545 	/*
546 	 * if the file_version doesn't match or if the
547 	 * id_len is zero or the combination of the verifier,
548 	 * id_len and id_val is bigger than the file we have
549 	 * a problem. If so ditch the file.
550 	 */
551 	kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
552 	    (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
553 
554 	if (err || kill_file) {
555 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
556 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
557 		VN_RELE(vp);
558 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
559 		if (kill_file) {
560 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
561 		}
562 		return (NULL);
563 	}
564 
565 	/*
566 	 * now get the client id value
567 	 */
568 	cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
569 	iov[0].iov_base = cl_ss->cl_id4.id_val;
570 	iov[0].iov_len = id_len;
571 
572 	uio.uio_iov = iov;
573 	uio.uio_iovcnt = 1;
574 	uio.uio_segflg = UIO_SYSSPACE;
575 	uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
576 
577 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
578 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
579 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
580 		VN_RELE(vp);
581 		kmem_free(cl_ss->cl_id4.id_val, id_len);
582 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
583 		return (NULL);
584 	}
585 
586 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
587 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
588 	VN_RELE(vp);
589 	return (cl_ss);
590 }
591 
592 #ifdef	nextdp
593 #undef nextdp
594 #endif
595 #define	nextdp(dp)	((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
596 
597 /*
598  * Add entries from statedir to supplied oldstate list.
599  * Optionally, move all entries from statedir -> destdir.
600  */
601 void
rfs4_ss_oldstate(rfs4_oldstate_t * oldstate,char * statedir,char * destdir)602 rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
603 {
604 	rfs4_ss_pn_t *ss_pn;
605 	rfs4_oldstate_t *cl_ss = NULL;
606 	char	*dirt = NULL;
607 	int	err, dir_eof = 0, size = 0;
608 	vnode_t *dvp;
609 	struct iovec iov;
610 	struct uio uio;
611 	struct dirent64 *dep;
612 	offset_t dirchunk_offset = 0;
613 
614 	/*
615 	 * open the state directory
616 	 */
617 	if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
618 		return;
619 
620 	if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED(), NULL))
621 		goto out;
622 
623 	dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
624 
625 	/*
626 	 * Get and process the directory entries
627 	 */
628 	while (!dir_eof) {
629 		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
630 		iov.iov_base = dirt;
631 		iov.iov_len = RFS4_SS_DIRSIZE;
632 		uio.uio_iov = &iov;
633 		uio.uio_iovcnt = 1;
634 		uio.uio_segflg = UIO_SYSSPACE;
635 		uio.uio_loffset = dirchunk_offset;
636 		uio.uio_resid = RFS4_SS_DIRSIZE;
637 
638 		err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof, NULL, 0);
639 		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
640 		if (err)
641 			goto out;
642 
643 		size = RFS4_SS_DIRSIZE - uio.uio_resid;
644 
645 		/*
646 		 * Process all the directory entries in this
647 		 * readdir chunk
648 		 */
649 		for (dep = (struct dirent64 *)dirt; size > 0;
650 		    dep = nextdp(dep)) {
651 
652 			size -= dep->d_reclen;
653 			dirchunk_offset = dep->d_off;
654 
655 			/*
656 			 * Skip '.' and '..'
657 			 */
658 			if (NFS_IS_DOTNAME(dep->d_name))
659 				continue;
660 
661 			ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
662 			if (ss_pn == NULL)
663 				continue;
664 
665 			if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
666 				if (destdir != NULL) {
667 					rfs4_ss_pnfree(ss_pn);
668 					cl_ss->ss_pn = rfs4_ss_movestate(
669 					    statedir, destdir, dep->d_name);
670 				} else {
671 					cl_ss->ss_pn = ss_pn;
672 				}
673 				insque(cl_ss, oldstate);
674 			} else {
675 				rfs4_ss_pnfree(ss_pn);
676 			}
677 		}
678 	}
679 
680 out:
681 	(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED(), NULL);
682 	VN_RELE(dvp);
683 	if (dirt)
684 		kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
685 }
686 
687 static void
rfs4_ss_init(nfs4_srv_t * nsrv4)688 rfs4_ss_init(nfs4_srv_t *nsrv4)
689 {
690 	int npaths = 1;
691 	char *default_dss_path = NFS4_DSS_VAR_DIR;
692 
693 	/* read the default stable storage state */
694 	rfs4_dss_readstate(nsrv4, npaths, &default_dss_path);
695 
696 	rfs4_ss_enabled = 1;
697 }
698 
699 static void
rfs4_ss_fini(nfs4_srv_t * nsrv4)700 rfs4_ss_fini(nfs4_srv_t *nsrv4)
701 {
702 	rfs4_servinst_t *sip;
703 
704 	mutex_enter(&nsrv4->servinst_lock);
705 	sip = nsrv4->nfs4_cur_servinst;
706 	while (sip != NULL) {
707 		rfs4_dss_clear_oldstate(sip);
708 		sip = sip->next;
709 	}
710 	mutex_exit(&nsrv4->servinst_lock);
711 }
712 
713 /*
714  * Remove all oldstate files referenced by this servinst.
715  */
716 static void
rfs4_dss_clear_oldstate(rfs4_servinst_t * sip)717 rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
718 {
719 	rfs4_oldstate_t *os_head, *osp;
720 
721 	rw_enter(&sip->oldstate_lock, RW_WRITER);
722 	os_head = sip->oldstate;
723 
724 	if (os_head == NULL) {
725 		rw_exit(&sip->oldstate_lock);
726 		return;
727 	}
728 
729 	/* skip dummy entry */
730 	osp = os_head->next;
731 	while (osp != os_head) {
732 		char *leaf = osp->ss_pn->leaf;
733 		rfs4_oldstate_t *os_next;
734 
735 		rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf);
736 
737 		if (osp->cl_id4.id_val)
738 			kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
739 		rfs4_ss_pnfree(osp->ss_pn);
740 
741 		os_next = osp->next;
742 		remque(osp);
743 		kmem_free(osp, sizeof (rfs4_oldstate_t));
744 		osp = os_next;
745 	}
746 
747 	rw_exit(&sip->oldstate_lock);
748 }
749 
750 /*
751  * Form the state and oldstate paths, and read in the stable storage files.
752  */
753 void
rfs4_dss_readstate(nfs4_srv_t * nsrv4,int npaths,char ** paths)754 rfs4_dss_readstate(nfs4_srv_t *nsrv4, int npaths, char **paths)
755 {
756 	int i;
757 	char *state, *oldstate;
758 
759 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
760 	oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
761 
762 	for (i = 0; i < npaths; i++) {
763 		char *path = paths[i];
764 
765 		(void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF);
766 		(void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF);
767 
768 		/*
769 		 * Populate the current server instance's oldstate list.
770 		 *
771 		 * 1. Read stable storage data from old state directory,
772 		 *    leaving its contents alone.
773 		 *
774 		 * 2. Read stable storage data from state directory,
775 		 *    and move the latter's contents to old state
776 		 *    directory.
777 		 */
778 		/* CSTYLED */
779 		rfs4_ss_oldstate(nsrv4->nfs4_cur_servinst->oldstate, oldstate, NULL);
780 		/* CSTYLED */
781 		rfs4_ss_oldstate(nsrv4->nfs4_cur_servinst->oldstate, state, oldstate);
782 	}
783 
784 	kmem_free(state, MAXPATHLEN);
785 	kmem_free(oldstate, MAXPATHLEN);
786 }
787 
788 
789 /*
790  * Check if we are still in grace and if the client can be
791  * granted permission to perform reclaims.
792  */
793 void
rfs4_ss_chkclid(nfs4_srv_t * nsrv4,rfs4_client_t * cp)794 rfs4_ss_chkclid(nfs4_srv_t *nsrv4, rfs4_client_t *cp)
795 {
796 	rfs4_servinst_t *sip;
797 
798 	/*
799 	 * It should be sufficient to check the oldstate data for just
800 	 * this client's instance. However, since our per-instance
801 	 * client grouping is solely temporal, HA-NFSv4 RG failover
802 	 * might result in clients of the same RG being partitioned into
803 	 * separate instances.
804 	 *
805 	 * Until the client grouping is improved, we must check the
806 	 * oldstate data for all instances with an active grace period.
807 	 *
808 	 * This also serves as the mechanism to remove stale oldstate data.
809 	 * The first time we check an instance after its grace period has
810 	 * expired, the oldstate data should be cleared.
811 	 *
812 	 * Start at the current instance, and walk the list backwards
813 	 * to the first.
814 	 */
815 	mutex_enter(&nsrv4->servinst_lock);
816 	for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
817 		rfs4_ss_chkclid_sip(cp, sip);
818 
819 		/* if the above check found this client, we're done */
820 		if (cp->rc_can_reclaim)
821 			break;
822 	}
823 	mutex_exit(&nsrv4->servinst_lock);
824 }
825 
826 static void
rfs4_ss_chkclid_sip(rfs4_client_t * cp,rfs4_servinst_t * sip)827 rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
828 {
829 	rfs4_oldstate_t *osp, *os_head;
830 
831 	/* short circuit everything if this server instance has no oldstate */
832 	rw_enter(&sip->oldstate_lock, RW_READER);
833 	os_head = sip->oldstate;
834 	rw_exit(&sip->oldstate_lock);
835 	if (os_head == NULL)
836 		return;
837 
838 	/*
839 	 * If this server instance is no longer in a grace period then
840 	 * the client won't be able to reclaim. No further need for this
841 	 * instance's oldstate data, so it can be cleared.
842 	 */
843 	if (!rfs4_servinst_in_grace(sip))
844 		return;
845 
846 	/* this instance is still in grace; search for the clientid */
847 
848 	rw_enter(&sip->oldstate_lock, RW_READER);
849 
850 	os_head = sip->oldstate;
851 	/* skip dummy entry */
852 	osp = os_head->next;
853 	while (osp != os_head) {
854 		if (osp->cl_id4.id_len == cp->rc_nfs_client.id_len) {
855 			if (bcmp(osp->cl_id4.id_val, cp->rc_nfs_client.id_val,
856 			    osp->cl_id4.id_len) == 0) {
857 				cp->rc_can_reclaim = 1;
858 				break;
859 			}
860 		}
861 		osp = osp->next;
862 	}
863 
864 	rw_exit(&sip->oldstate_lock);
865 }
866 
867 /*
868  * Place client information into stable storage: 1/3.
869  * First, generate the leaf filename, from the client's IP address and
870  * the server-generated short-hand clientid.
871  */
872 void
rfs4_ss_clid(nfs4_srv_t * nsrv4,rfs4_client_t * cp)873 rfs4_ss_clid(nfs4_srv_t *nsrv4, rfs4_client_t *cp)
874 {
875 	const char *kinet_ntop6(uchar_t *, char *, size_t);
876 	char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
877 	struct sockaddr *ca;
878 	uchar_t *b;
879 
880 	if (rfs4_ss_enabled == 0) {
881 		return;
882 	}
883 
884 	buf[0] = 0;
885 
886 	ca = (struct sockaddr *)&cp->rc_addr;
887 
888 	/*
889 	 * Convert the caller's IP address to a dotted string
890 	 */
891 	if (ca->sa_family == AF_INET) {
892 		b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
893 		(void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
894 		    b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
895 	} else if (ca->sa_family == AF_INET6) {
896 		struct sockaddr_in6 *sin6;
897 
898 		sin6 = (struct sockaddr_in6 *)ca;
899 		(void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
900 		    buf, INET6_ADDRSTRLEN);
901 	}
902 
903 	(void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
904 	    (longlong_t)cp->rc_clientid);
905 	rfs4_ss_clid_write(nsrv4, cp, leaf);
906 }
907 
908 /*
909  * Place client information into stable storage: 2/3.
910  * DSS: distributed stable storage: the file may need to be written to
911  * multiple directories.
912  */
913 static void
rfs4_ss_clid_write(nfs4_srv_t * nsrv4,rfs4_client_t * cp,char * leaf)914 rfs4_ss_clid_write(nfs4_srv_t *nsrv4, rfs4_client_t *cp, char *leaf)
915 {
916 	rfs4_servinst_t *sip;
917 
918 	/*
919 	 * It should be sufficient to write the leaf file to (all) DSS paths
920 	 * associated with just this client's instance. However, since our
921 	 * per-instance client grouping is solely temporal, HA-NFSv4 RG
922 	 * failover might result in us losing DSS data.
923 	 *
924 	 * Until the client grouping is improved, we must write the DSS data
925 	 * to all instances' paths. Start at the current instance, and
926 	 * walk the list backwards to the first.
927 	 */
928 	mutex_enter(&nsrv4->servinst_lock);
929 	for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
930 		int i, npaths = sip->dss_npaths;
931 
932 		/* write the leaf file to all DSS paths */
933 		for (i = 0; i < npaths; i++) {
934 			rfs4_dss_path_t *dss_path = sip->dss_paths[i];
935 
936 			/* HA-NFSv4 path might have been failed-away from us */
937 			if (dss_path == NULL)
938 				continue;
939 
940 			rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
941 		}
942 	}
943 	mutex_exit(&nsrv4->servinst_lock);
944 }
945 
946 /*
947  * Place client information into stable storage: 3/3.
948  * Write the stable storage data to the requested file.
949  */
950 static void
rfs4_ss_clid_write_one(rfs4_client_t * cp,char * dss_path,char * leaf)951 rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
952 {
953 	int ioflag;
954 	int file_vers = NFS4_SS_VERSION;
955 	size_t dirlen;
956 	struct uio uio;
957 	struct iovec iov[4];
958 	char *dir;
959 	rfs4_ss_pn_t *ss_pn;
960 	vnode_t *vp;
961 	nfs_client_id4 *cl_id4 = &(cp->rc_nfs_client);
962 
963 	/* allow 2 extra bytes for '/' & NUL */
964 	dirlen = strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2;
965 	dir = kmem_alloc(dirlen, KM_SLEEP);
966 	(void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
967 
968 	ss_pn = rfs4_ss_pnalloc(dir, leaf);
969 	/* rfs4_ss_pnalloc takes its own copy */
970 	kmem_free(dir, dirlen);
971 	if (ss_pn == NULL)
972 		return;
973 
974 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
975 	    CRCREAT, 0)) {
976 		rfs4_ss_pnfree(ss_pn);
977 		return;
978 	}
979 
980 	/*
981 	 * We need to record leaf - i.e. the filename - so that we know
982 	 * what to remove, in the future. However, the dir part of cp->ss_pn
983 	 * should never be referenced directly, since it's potentially only
984 	 * one of several paths with this leaf in it.
985 	 */
986 	if (cp->rc_ss_pn != NULL) {
987 		if (strcmp(cp->rc_ss_pn->leaf, leaf) == 0) {
988 			/* we've already recorded *this* leaf */
989 			rfs4_ss_pnfree(ss_pn);
990 		} else {
991 			/* replace with this leaf */
992 			rfs4_ss_pnfree(cp->rc_ss_pn);
993 			cp->rc_ss_pn = ss_pn;
994 		}
995 	} else {
996 		cp->rc_ss_pn = ss_pn;
997 	}
998 
999 	/*
1000 	 * Build a scatter list that points to the nfs_client_id4
1001 	 */
1002 	iov[0].iov_base = (caddr_t)&file_vers;
1003 	iov[0].iov_len = sizeof (int);
1004 	iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
1005 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
1006 	iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
1007 	iov[2].iov_len = sizeof (uint_t);
1008 	iov[3].iov_base = (caddr_t)cl_id4->id_val;
1009 	iov[3].iov_len = cl_id4->id_len;
1010 
1011 	uio.uio_iov = iov;
1012 	uio.uio_iovcnt = 4;
1013 	uio.uio_loffset = 0;
1014 	uio.uio_segflg = UIO_SYSSPACE;
1015 	uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
1016 	uio.uio_resid = cl_id4->id_len + sizeof (int) +
1017 	    NFS4_VERIFIER_SIZE + sizeof (uint_t);
1018 
1019 	ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1020 	uio.uio_extflg = UIO_COPY_DEFAULT;
1021 
1022 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1023 	/* write the full client id to the file. */
1024 	(void) VOP_WRITE(vp, &uio, ioflag, CRED(), NULL);
1025 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1026 
1027 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
1028 	VN_RELE(vp);
1029 }
1030 
1031 /*
1032  * DSS: distributed stable storage.
1033  * Unpack the list of paths passed by nfsd.
1034  * Use nvlist_alloc(9F) to manage the data.
1035  * The caller is responsible for allocating and freeing the buffer.
1036  */
1037 int
rfs4_dss_setpaths(char * buf,size_t buflen)1038 rfs4_dss_setpaths(char *buf, size_t buflen)
1039 {
1040 	int error;
1041 
1042 	/*
1043 	 * If this is a "warm start", i.e. we previously had DSS paths,
1044 	 * preserve the old paths.
1045 	 */
1046 	if (rfs4_dss_paths != NULL) {
1047 		/*
1048 		 * Before we lose the ptr, destroy the nvlist and pathnames
1049 		 * array from the warm start before this one.
1050 		 */
1051 		nvlist_free(rfs4_dss_oldpaths);
1052 		rfs4_dss_oldpaths = rfs4_dss_paths;
1053 	}
1054 
1055 	/* unpack the buffer into a searchable nvlist */
1056 	error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
1057 	if (error)
1058 		return (error);
1059 
1060 	/*
1061 	 * Search the nvlist for the pathnames nvpair (which is the only nvpair
1062 	 * in the list, and record its location.
1063 	 */
1064 	error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
1065 	    &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
1066 	return (error);
1067 }
1068 
1069 /*
1070  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1071  * to find and mark the client for forced expire.
1072  */
1073 static void
rfs4_client_scrub(rfs4_entry_t ent,void * arg)1074 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1075 {
1076 	rfs4_client_t *cp = (rfs4_client_t *)ent;
1077 	struct nfs4clrst_args *clr = arg;
1078 	struct sockaddr_in6 *ent_sin6;
1079 	struct in6_addr  clr_in6;
1080 	struct sockaddr_in  *ent_sin;
1081 	struct in_addr   clr_in;
1082 
1083 	if (clr->addr_type != cp->rc_addr.ss_family) {
1084 		return;
1085 	}
1086 
1087 	switch (clr->addr_type) {
1088 
1089 	case AF_INET6:
1090 		/* copyin the address from user space */
1091 		if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1092 			break;
1093 		}
1094 
1095 		ent_sin6 = (struct sockaddr_in6 *)&cp->rc_addr;
1096 
1097 		/*
1098 		 * now compare, and if equivalent mark entry
1099 		 * for forced expiration
1100 		 */
1101 		if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1102 			cp->rc_forced_expire = 1;
1103 		}
1104 		break;
1105 
1106 	case AF_INET:
1107 		/* copyin the address from user space */
1108 		if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1109 			break;
1110 		}
1111 
1112 		ent_sin = (struct sockaddr_in *)&cp->rc_addr;
1113 
1114 		/*
1115 		 * now compare, and if equivalent mark entry
1116 		 * for forced expiration
1117 		 */
1118 		if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1119 			cp->rc_forced_expire = 1;
1120 		}
1121 		break;
1122 
1123 	default:
1124 		/* force this assert to fail */
1125 		ASSERT(clr->addr_type != clr->addr_type);
1126 	}
1127 }
1128 
1129 /*
1130  * This is called from nfssys() in order to clear server state
1131  * for the specified client IP Address.
1132  */
1133 void
rfs4_clear_client_state(struct nfs4clrst_args * clr)1134 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1135 {
1136 	nfs4_srv_t *nsrv4;
1137 	nsrv4 = nfs4_get_srv();
1138 	(void) rfs4_dbe_walk(nsrv4->rfs4_client_tab, rfs4_client_scrub, clr);
1139 }
1140 
1141 /*
1142  * Used to initialize the NFSv4 server's state or database.  All of
1143  * the tables are created and timers are set.
1144  */
1145 void
rfs4_state_g_init()1146 rfs4_state_g_init()
1147 {
1148 	extern boolean_t rfs4_cpr_callb(void *, int);
1149 	/*
1150 	 * Add a CPR callback so that we can update client
1151 	 * access times to extend the lease after a suspend
1152 	 * and resume (using the same class as rpcmod/connmgr)
1153 	 */
1154 	cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");
1155 
1156 	/*
1157 	 * NFSv4 server state databases
1158 	 *
1159 	 * Initialized when the module is loaded and used by NFSv4 state
1160 	 * tables.  These kmem_cache free pools are used globally, the NFSv4
1161 	 * state tables which make use of these kmem_cache free pools are per
1162 	 * zone.
1163 	 *
1164 	 * initialize the global kmem_cache free pools which will be used by
1165 	 * the NFSv4 state tables.
1166 	 */
1167 	/* CSTYLED */
1168 	rfs4_client_mem_cache = nfs4_init_mem_cache("Client_entry_cache", 2, sizeof (rfs4_client_t), 0);
1169 	/* CSTYLED */
1170 	rfs4_clntIP_mem_cache = nfs4_init_mem_cache("ClntIP_entry_cache", 1, sizeof (rfs4_clntip_t), 1);
1171 	/* CSTYLED */
1172 	rfs4_openown_mem_cache = nfs4_init_mem_cache("OpenOwner_entry_cache", 1, sizeof (rfs4_openowner_t), 2);
1173 	/* CSTYLED */
1174 	rfs4_openstID_mem_cache = nfs4_init_mem_cache("OpenStateID_entry_cache", 3, sizeof (rfs4_state_t), 3);
1175 	/* CSTYLED */
1176 	rfs4_lockstID_mem_cache = nfs4_init_mem_cache("LockStateID_entry_cache", 3, sizeof (rfs4_lo_state_t), 4);
1177 	/* CSTYLED */
1178 	rfs4_lockown_mem_cache = nfs4_init_mem_cache("Lockowner_entry_cache", 2, sizeof (rfs4_lockowner_t), 5);
1179 	/* CSTYLED */
1180 	rfs4_file_mem_cache = nfs4_init_mem_cache("File_entry_cache", 1, sizeof (rfs4_file_t), 6);
1181 	/* CSTYLED */
1182 	rfs4_delegstID_mem_cache = nfs4_init_mem_cache("DelegStateID_entry_cache", 2, sizeof (rfs4_deleg_state_t), 7);
1183 
1184 	rfs4_client_clrst = rfs4_clear_client_state;
1185 }
1186 
1187 
1188 /*
1189  * Used at server shutdown to cleanup all of the NFSv4 server's structures
1190  * and other state.
1191  */
1192 void
rfs4_state_g_fini()1193 rfs4_state_g_fini()
1194 {
1195 	int i;
1196 	/*
1197 	 * Cleanup the CPR callback.
1198 	 */
1199 	if (cpr_id)
1200 		(void) callb_delete(cpr_id);
1201 
1202 	rfs4_client_clrst = NULL;
1203 
1204 	/* free the NFSv4 state databases */
1205 	for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
1206 		kmem_cache_destroy(rfs4_db_mem_cache_table[i].r_db_mem_cache);
1207 		rfs4_db_mem_cache_table[i].r_db_mem_cache = NULL;
1208 	}
1209 
1210 	rfs4_client_mem_cache = NULL;
1211 	rfs4_clntIP_mem_cache = NULL;
1212 	rfs4_openown_mem_cache = NULL;
1213 	rfs4_openstID_mem_cache = NULL;
1214 	rfs4_lockstID_mem_cache = NULL;
1215 	rfs4_lockown_mem_cache = NULL;
1216 	rfs4_file_mem_cache = NULL;
1217 	rfs4_delegstID_mem_cache = NULL;
1218 
1219 	/* DSS: distributed stable storage */
1220 	nvlist_free(rfs4_dss_oldpaths);
1221 	nvlist_free(rfs4_dss_paths);
1222 	rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
1223 }
1224 
1225 /*
1226  * Used to initialize the per zone NFSv4 server's state
1227  */
1228 void
rfs4_state_zone_init(nfs4_srv_t * nsrv4)1229 rfs4_state_zone_init(nfs4_srv_t *nsrv4)
1230 {
1231 	time_t start_time;
1232 	int start_grace;
1233 	char *dss_path = NFS4_DSS_VAR_DIR;
1234 
1235 	/* DSS: distributed stable storage: initialise served paths list */
1236 	nsrv4->dss_pathlist = NULL;
1237 
1238 	/*
1239 	 * Set the boot time.  If the server
1240 	 * has been restarted quickly and has had the opportunity to
1241 	 * service clients, then the start_time needs to be bumped
1242 	 * regardless.  A small window but it exists...
1243 	 */
1244 	start_time = gethrestime_sec();
1245 	if (nsrv4->rfs4_start_time < start_time)
1246 		nsrv4->rfs4_start_time = start_time;
1247 	else
1248 		nsrv4->rfs4_start_time++;
1249 
1250 	/*
1251 	 * Create the first server instance, or a new one if the server has
1252 	 * been restarted; see above comments on rfs4_start_time. Don't
1253 	 * start its grace period; that will be done later, to maximise the
1254 	 * clients' recovery window.
1255 	 */
1256 	start_grace = 0;
1257 	if (curzone == global_zone && rfs4_dss_numnewpaths > 0) {
1258 		int i;
1259 		char **dss_allpaths = NULL;
1260 		dss_allpaths = kmem_alloc(sizeof (char *) *
1261 		    (rfs4_dss_numnewpaths + 1), KM_SLEEP);
1262 		/*
1263 		 * Add the default path into the list of paths for saving
1264 		 * state informantion.
1265 		 */
1266 		dss_allpaths[0] = dss_path;
1267 		for (i = 0; i < rfs4_dss_numnewpaths; i++) {
1268 			dss_allpaths[i + 1] = rfs4_dss_newpaths[i];
1269 		}
1270 		rfs4_servinst_create(nsrv4, start_grace,
1271 		    (rfs4_dss_numnewpaths + 1), dss_allpaths);
1272 		kmem_free(dss_allpaths,
1273 		    (sizeof (char *) * (rfs4_dss_numnewpaths + 1)));
1274 	} else {
1275 		rfs4_servinst_create(nsrv4, start_grace, 1, &dss_path);
1276 	}
1277 
1278 	/* reset the "first NFSv4 request" status */
1279 	nsrv4->seen_first_compound = 0;
1280 
1281 	mutex_enter(&nsrv4->state_lock);
1282 
1283 	/*
1284 	 * If the server state database has already been initialized,
1285 	 * skip it
1286 	 */
1287 	if (nsrv4->nfs4_server_state != NULL) {
1288 		mutex_exit(&nsrv4->state_lock);
1289 		return;
1290 	}
1291 
1292 	rw_init(&nsrv4->rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);
1293 
1294 	/* set the various cache timers for table creation */
1295 	if (nsrv4->rfs4_client_cache_time == 0)
1296 		nsrv4->rfs4_client_cache_time = CLIENT_CACHE_TIME;
1297 	if (nsrv4->rfs4_openowner_cache_time == 0)
1298 		nsrv4->rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
1299 	if (nsrv4->rfs4_state_cache_time == 0)
1300 		nsrv4->rfs4_state_cache_time = STATE_CACHE_TIME;
1301 	if (nsrv4->rfs4_lo_state_cache_time == 0)
1302 		nsrv4->rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
1303 	if (nsrv4->rfs4_lockowner_cache_time == 0)
1304 		nsrv4->rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
1305 	if (nsrv4->rfs4_file_cache_time == 0)
1306 		nsrv4->rfs4_file_cache_time = FILE_CACHE_TIME;
1307 	if (nsrv4->rfs4_deleg_state_cache_time == 0)
1308 		nsrv4->rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;
1309 
1310 	/* Create the overall database to hold all server state */
1311 	nsrv4->nfs4_server_state = rfs4_database_create(rfs4_database_debug);
1312 
1313 	/* Now create the individual tables */
1314 	nsrv4->rfs4_client_cache_time *= rfs4_lease_time;
1315 	nsrv4->rfs4_client_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1316 	    "Client",
1317 	    nsrv4->rfs4_client_cache_time,
1318 	    2,
1319 	    rfs4_client_create,
1320 	    rfs4_client_destroy,
1321 	    rfs4_client_expiry,
1322 	    sizeof (rfs4_client_t),
1323 	    TABSIZE,
1324 	    MAXTABSZ/8, 100);
1325 	nsrv4->rfs4_nfsclnt_idx = rfs4_index_create(nsrv4->rfs4_client_tab,
1326 	    "nfs_client_id4", nfsclnt_hash,
1327 	    nfsclnt_compare, nfsclnt_mkkey,
1328 	    TRUE);
1329 	nsrv4->rfs4_clientid_idx = rfs4_index_create(nsrv4->rfs4_client_tab,
1330 	    "client_id", clientid_hash,
1331 	    clientid_compare, clientid_mkkey,
1332 	    FALSE);
1333 
1334 	nsrv4->rfs4_clntip_cache_time = 86400 * 365;	/* about a year */
1335 	nsrv4->rfs4_clntip_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1336 	    "ClntIP",
1337 	    nsrv4->rfs4_clntip_cache_time,
1338 	    1,
1339 	    rfs4_clntip_create,
1340 	    rfs4_clntip_destroy,
1341 	    rfs4_clntip_expiry,
1342 	    sizeof (rfs4_clntip_t),
1343 	    TABSIZE,
1344 	    MAXTABSZ, 100);
1345 	nsrv4->rfs4_clntip_idx = rfs4_index_create(nsrv4->rfs4_clntip_tab,
1346 	    "client_ip", clntip_hash,
1347 	    clntip_compare, clntip_mkkey,
1348 	    TRUE);
1349 
1350 	nsrv4->rfs4_openowner_cache_time *= rfs4_lease_time;
1351 	nsrv4->rfs4_openowner_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1352 	    "OpenOwner",
1353 	    nsrv4->rfs4_openowner_cache_time,
1354 	    1,
1355 	    rfs4_openowner_create,
1356 	    rfs4_openowner_destroy,
1357 	    rfs4_openowner_expiry,
1358 	    sizeof (rfs4_openowner_t),
1359 	    TABSIZE,
1360 	    MAXTABSZ, 100);
1361 	nsrv4->rfs4_openowner_idx = rfs4_index_create(nsrv4->rfs4_openowner_tab,
1362 	    "open_owner4", openowner_hash,
1363 	    openowner_compare,
1364 	    openowner_mkkey, TRUE);
1365 
1366 	nsrv4->rfs4_state_cache_time *= rfs4_lease_time;
1367 	nsrv4->rfs4_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1368 	    "OpenStateID",
1369 	    nsrv4->rfs4_state_cache_time,
1370 	    3,
1371 	    rfs4_state_create,
1372 	    rfs4_state_destroy,
1373 	    rfs4_state_expiry,
1374 	    sizeof (rfs4_state_t),
1375 	    TABSIZE,
1376 	    MAXTABSZ, 100);
1377 
1378 	/* CSTYLED */
1379 	nsrv4->rfs4_state_owner_file_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
1380 	    "Openowner-File",
1381 	    state_owner_file_hash,
1382 	    state_owner_file_compare,
1383 	    state_owner_file_mkkey, TRUE);
1384 
1385 	nsrv4->rfs4_state_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
1386 	    "State-id", state_hash,
1387 	    state_compare, state_mkkey, FALSE);
1388 
1389 	nsrv4->rfs4_state_file_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
1390 	    "File", state_file_hash,
1391 	    state_file_compare, state_file_mkkey,
1392 	    FALSE);
1393 
1394 	nsrv4->rfs4_lo_state_cache_time *= rfs4_lease_time;
1395 	nsrv4->rfs4_lo_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1396 	    "LockStateID",
1397 	    nsrv4->rfs4_lo_state_cache_time,
1398 	    2,
1399 	    rfs4_lo_state_create,
1400 	    rfs4_lo_state_destroy,
1401 	    rfs4_lo_state_expiry,
1402 	    sizeof (rfs4_lo_state_t),
1403 	    TABSIZE,
1404 	    MAXTABSZ, 100);
1405 
1406 	/* CSTYLED */
1407 	nsrv4->rfs4_lo_state_owner_idx = rfs4_index_create(nsrv4->rfs4_lo_state_tab,
1408 	    "lockownerxstate",
1409 	    lo_state_lo_hash,
1410 	    lo_state_lo_compare,
1411 	    lo_state_lo_mkkey, TRUE);
1412 
1413 	nsrv4->rfs4_lo_state_idx = rfs4_index_create(nsrv4->rfs4_lo_state_tab,
1414 	    "State-id",
1415 	    lo_state_hash, lo_state_compare,
1416 	    lo_state_mkkey, FALSE);
1417 
1418 	nsrv4->rfs4_lockowner_cache_time *= rfs4_lease_time;
1419 
1420 	nsrv4->rfs4_lockowner_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1421 	    "Lockowner",
1422 	    nsrv4->rfs4_lockowner_cache_time,
1423 	    2,
1424 	    rfs4_lockowner_create,
1425 	    rfs4_lockowner_destroy,
1426 	    rfs4_lockowner_expiry,
1427 	    sizeof (rfs4_lockowner_t),
1428 	    TABSIZE,
1429 	    MAXTABSZ, 100);
1430 
1431 	nsrv4->rfs4_lockowner_idx = rfs4_index_create(nsrv4->rfs4_lockowner_tab,
1432 	    "lock_owner4", lockowner_hash,
1433 	    lockowner_compare,
1434 	    lockowner_mkkey, TRUE);
1435 
1436 	/* CSTYLED */
1437 	nsrv4->rfs4_lockowner_pid_idx = rfs4_index_create(nsrv4->rfs4_lockowner_tab,
1438 	    "pid", pid_hash,
1439 	    pid_compare, pid_mkkey,
1440 	    FALSE);
1441 
1442 	nsrv4->rfs4_file_cache_time *= rfs4_lease_time;
1443 	nsrv4->rfs4_file_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1444 	    "File",
1445 	    nsrv4->rfs4_file_cache_time,
1446 	    1,
1447 	    rfs4_file_create,
1448 	    rfs4_file_destroy,
1449 	    NULL,
1450 	    sizeof (rfs4_file_t),
1451 	    TABSIZE,
1452 	    MAXTABSZ, -1);
1453 
1454 	nsrv4->rfs4_file_idx = rfs4_index_create(nsrv4->rfs4_file_tab,
1455 	    "Filehandle", file_hash,
1456 	    file_compare, file_mkkey, TRUE);
1457 
1458 	nsrv4->rfs4_deleg_state_cache_time *= rfs4_lease_time;
1459 	/* CSTYLED */
1460 	nsrv4->rfs4_deleg_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1461 	    "DelegStateID",
1462 	    nsrv4->rfs4_deleg_state_cache_time,
1463 	    2,
1464 	    rfs4_deleg_state_create,
1465 	    rfs4_deleg_state_destroy,
1466 	    rfs4_deleg_state_expiry,
1467 	    sizeof (rfs4_deleg_state_t),
1468 	    TABSIZE,
1469 	    MAXTABSZ, 100);
1470 	nsrv4->rfs4_deleg_idx = rfs4_index_create(nsrv4->rfs4_deleg_state_tab,
1471 	    "DelegByFileClient",
1472 	    deleg_hash,
1473 	    deleg_compare,
1474 	    deleg_mkkey, TRUE);
1475 
1476 	/* CSTYLED */
1477 	nsrv4->rfs4_deleg_state_idx = rfs4_index_create(nsrv4->rfs4_deleg_state_tab,
1478 	    "DelegState",
1479 	    deleg_state_hash,
1480 	    deleg_state_compare,
1481 	    deleg_state_mkkey, FALSE);
1482 
1483 	mutex_exit(&nsrv4->state_lock);
1484 
1485 	/*
1486 	 * Init the stable storage.
1487 	 */
1488 	rfs4_ss_init(nsrv4);
1489 }
1490 
1491 /*
1492  * Used at server shutdown to cleanup all of NFSv4 server's zone structures
1493  * and state.
1494  */
1495 void
rfs4_state_zone_fini()1496 rfs4_state_zone_fini()
1497 {
1498 	rfs4_database_t *dbp;
1499 	nfs4_srv_t *nsrv4;
1500 	nsrv4 = nfs4_get_srv();
1501 
1502 	rfs4_set_deleg_policy(nsrv4, SRV_NEVER_DELEGATE);
1503 
1504 	/*
1505 	 * Clean up any dangling stable storage structures BEFORE calling
1506 	 * rfs4_servinst_destroy_all() so there are no dangling structures
1507 	 * (i.e. the srvinsts are all cleared of danglers BEFORE they get
1508 	 * freed).
1509 	 */
1510 	rfs4_ss_fini(nsrv4);
1511 
1512 	mutex_enter(&nsrv4->state_lock);
1513 
1514 	if (nsrv4->nfs4_server_state == NULL) {
1515 		mutex_exit(&nsrv4->state_lock);
1516 		return;
1517 	}
1518 
1519 	/* destroy server instances and current instance ptr */
1520 	rfs4_servinst_destroy_all(nsrv4);
1521 
1522 	/* reset the "first NFSv4 request" status */
1523 	nsrv4->seen_first_compound = 0;
1524 
1525 	dbp = nsrv4->nfs4_server_state;
1526 	nsrv4->nfs4_server_state = NULL;
1527 
1528 	rw_destroy(&nsrv4->rfs4_findclient_lock);
1529 
1530 	/* First stop all of the reaper threads in the database */
1531 	rfs4_database_shutdown(dbp);
1532 
1533 	/*
1534 	 * WARNING: There may be consumers of the rfs4 database still
1535 	 * active as we destroy these.  IF that's the case, consider putting
1536 	 * some of their _zone_fini()-like functions into the zsd key as
1537 	 * ~~SHUTDOWN~~ functions instead of ~~DESTROY~~ functions.  We can
1538 	 * maintain some ordering guarantees better that way.
1539 	 */
1540 	/* Now destroy/release the database tables */
1541 	rfs4_database_destroy(dbp);
1542 
1543 	/* Reset the cache timers for next time */
1544 	nsrv4->rfs4_client_cache_time = 0;
1545 	nsrv4->rfs4_openowner_cache_time = 0;
1546 	nsrv4->rfs4_state_cache_time = 0;
1547 	nsrv4->rfs4_lo_state_cache_time = 0;
1548 	nsrv4->rfs4_lockowner_cache_time = 0;
1549 	nsrv4->rfs4_file_cache_time = 0;
1550 	nsrv4->rfs4_deleg_state_cache_time = 0;
1551 
1552 	mutex_exit(&nsrv4->state_lock);
1553 }
1554 
1555 typedef union {
1556 	struct {
1557 		uint32_t start_time;
1558 		uint32_t c_id;
1559 	} impl_id;
1560 	clientid4 id4;
1561 } cid;
1562 
1563 static int foreign_stateid(stateid_t *id);
1564 static int foreign_clientid(cid *cidp);
1565 static void embed_nodeid(cid *cidp);
1566 
1567 typedef union {
1568 	struct {
1569 		uint32_t c_id;
1570 		uint32_t gen_num;
1571 	} cv_impl;
1572 	verifier4	confirm_verf;
1573 } scid_confirm_verf;
1574 
1575 static uint32_t
clientid_hash(void * key)1576 clientid_hash(void *key)
1577 {
1578 	cid *idp = key;
1579 
1580 	return (idp->impl_id.c_id);
1581 }
1582 
1583 static bool_t
clientid_compare(rfs4_entry_t entry,void * key)1584 clientid_compare(rfs4_entry_t entry, void *key)
1585 {
1586 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1587 	clientid4 *idp = key;
1588 
1589 	return (*idp == cp->rc_clientid);
1590 }
1591 
1592 static void *
clientid_mkkey(rfs4_entry_t entry)1593 clientid_mkkey(rfs4_entry_t entry)
1594 {
1595 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1596 
1597 	return (&cp->rc_clientid);
1598 }
1599 
1600 static uint32_t
nfsclnt_hash(void * key)1601 nfsclnt_hash(void *key)
1602 {
1603 	nfs_client_id4 *client = key;
1604 	int i;
1605 	uint32_t hash = 0;
1606 
1607 	for (i = 0; i < client->id_len; i++) {
1608 		hash <<= 1;
1609 		hash += (uint_t)client->id_val[i];
1610 	}
1611 	return (hash);
1612 }
1613 
1614 
1615 static bool_t
nfsclnt_compare(rfs4_entry_t entry,void * key)1616 nfsclnt_compare(rfs4_entry_t entry, void *key)
1617 {
1618 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1619 	nfs_client_id4 *nfs_client = key;
1620 
1621 	if (cp->rc_nfs_client.id_len != nfs_client->id_len)
1622 		return (FALSE);
1623 
1624 	return (bcmp(cp->rc_nfs_client.id_val, nfs_client->id_val,
1625 	    nfs_client->id_len) == 0);
1626 }
1627 
1628 static void *
nfsclnt_mkkey(rfs4_entry_t entry)1629 nfsclnt_mkkey(rfs4_entry_t entry)
1630 {
1631 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1632 
1633 	return (&cp->rc_nfs_client);
1634 }
1635 
1636 static bool_t
rfs4_client_expiry(rfs4_entry_t u_entry)1637 rfs4_client_expiry(rfs4_entry_t u_entry)
1638 {
1639 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1640 	bool_t cp_expired;
1641 
1642 	if (rfs4_dbe_is_invalid(cp->rc_dbe)) {
1643 		cp->rc_ss_remove = 1;
1644 		return (TRUE);
1645 	}
1646 	/*
1647 	 * If the sysadmin has used clear_locks for this
1648 	 * entry then forced_expire will be set and we
1649 	 * want this entry to be reaped. Or the entry
1650 	 * has exceeded its lease period.
1651 	 */
1652 	cp_expired = (cp->rc_forced_expire ||
1653 	    (gethrestime_sec() - cp->rc_last_access
1654 	    > rfs4_lease_time));
1655 
1656 	if (!cp->rc_ss_remove && cp_expired)
1657 		cp->rc_ss_remove = 1;
1658 	return (cp_expired);
1659 }
1660 
1661 /*
1662  * Remove the leaf file from all distributed stable storage paths.
1663  */
1664 static void
rfs4_dss_remove_cpleaf(rfs4_client_t * cp)1665 rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
1666 {
1667 	nfs4_srv_t *nsrv4;
1668 	rfs4_servinst_t *sip;
1669 	char *leaf = cp->rc_ss_pn->leaf;
1670 
1671 	/*
1672 	 * since the state files are written to all DSS
1673 	 * paths we must remove this leaf file instance
1674 	 * from all server instances.
1675 	 */
1676 
1677 	nsrv4 = nfs4_get_srv();
1678 	mutex_enter(&nsrv4->servinst_lock);
1679 	for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
1680 		/* remove the leaf file associated with this server instance */
1681 		rfs4_dss_remove_leaf(sip, NFS4_DSS_STATE_LEAF, leaf);
1682 	}
1683 	mutex_exit(&nsrv4->servinst_lock);
1684 }
1685 
1686 static void
rfs4_dss_remove_leaf(rfs4_servinst_t * sip,char * dir_leaf,char * leaf)1687 rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
1688 {
1689 	int i, npaths = sip->dss_npaths;
1690 
1691 	for (i = 0; i < npaths; i++) {
1692 		rfs4_dss_path_t *dss_path = sip->dss_paths[i];
1693 		char *path, *dir;
1694 		size_t pathlen;
1695 
1696 		/* the HA-NFSv4 path might have been failed-over away from us */
1697 		if (dss_path == NULL)
1698 			continue;
1699 
1700 		dir = dss_path->path;
1701 
1702 		/* allow 3 extra bytes for two '/' & a NUL */
1703 		pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
1704 		path = kmem_alloc(pathlen, KM_SLEEP);
1705 		(void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
1706 
1707 		(void) vn_remove(path, UIO_SYSSPACE, RMFILE);
1708 
1709 		kmem_free(path, pathlen);
1710 	}
1711 }
1712 
1713 static void
rfs4_client_destroy(rfs4_entry_t u_entry)1714 rfs4_client_destroy(rfs4_entry_t u_entry)
1715 {
1716 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1717 
1718 	mutex_destroy(cp->rc_cbinfo.cb_lock);
1719 	cv_destroy(cp->rc_cbinfo.cb_cv);
1720 	cv_destroy(cp->rc_cbinfo.cb_cv_nullcaller);
1721 	list_destroy(&cp->rc_openownerlist);
1722 
1723 	/* free callback info */
1724 	rfs4_cbinfo_free(&cp->rc_cbinfo);
1725 
1726 	if (cp->rc_cp_confirmed)
1727 		rfs4_client_rele(cp->rc_cp_confirmed);
1728 
1729 	if (cp->rc_ss_pn) {
1730 		/* check if the stable storage files need to be removed */
1731 		if (cp->rc_ss_remove)
1732 			rfs4_dss_remove_cpleaf(cp);
1733 		rfs4_ss_pnfree(cp->rc_ss_pn);
1734 	}
1735 
1736 	/* Free the client supplied client id */
1737 	kmem_free(cp->rc_nfs_client.id_val, cp->rc_nfs_client.id_len);
1738 
1739 	if (cp->rc_sysidt != LM_NOSYSID)
1740 		lm_free_sysidt(cp->rc_sysidt);
1741 }
1742 
1743 static bool_t
rfs4_client_create(rfs4_entry_t u_entry,void * arg)1744 rfs4_client_create(rfs4_entry_t u_entry, void *arg)
1745 {
1746 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1747 	nfs_client_id4 *client = (nfs_client_id4 *)arg;
1748 	struct sockaddr *ca;
1749 	cid *cidp;
1750 	scid_confirm_verf *scvp;
1751 	nfs4_srv_t *nsrv4;
1752 
1753 	nsrv4 = nfs4_get_srv();
1754 
1755 	/* Get a clientid to give to the client */
1756 	cidp = (cid *)&cp->rc_clientid;
1757 	cidp->impl_id.start_time = nsrv4->rfs4_start_time;
1758 	cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->rc_dbe);
1759 
1760 	/* If we are booted as a cluster node, embed our nodeid */
1761 	if (cluster_bootflags & CLUSTER_BOOTED)
1762 		embed_nodeid(cidp);
1763 
1764 	/* Allocate and copy client's client id value */
1765 	cp->rc_nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
1766 	cp->rc_nfs_client.id_len = client->id_len;
1767 	bcopy(client->id_val, cp->rc_nfs_client.id_val, client->id_len);
1768 	cp->rc_nfs_client.verifier = client->verifier;
1769 
1770 	/* Copy client's IP address */
1771 	ca = client->cl_addr;
1772 	if (ca->sa_family == AF_INET)
1773 		bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in));
1774 	else if (ca->sa_family == AF_INET6)
1775 		bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in6));
1776 	cp->rc_nfs_client.cl_addr = (struct sockaddr *)&cp->rc_addr;
1777 
1778 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1779 	scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1780 	scvp->cv_impl.c_id = cidp->impl_id.c_id;
1781 	scvp->cv_impl.gen_num = 0;
1782 
1783 	/* An F_UNLKSYS has been done for this client */
1784 	cp->rc_unlksys_completed = FALSE;
1785 
1786 	/* We need the client to ack us */
1787 	cp->rc_need_confirm = TRUE;
1788 	cp->rc_cp_confirmed = NULL;
1789 
1790 	/* TRUE all the time until the callback path actually fails */
1791 	cp->rc_cbinfo.cb_notified_of_cb_path_down = TRUE;
1792 
1793 	/* Initialize the access time to now */
1794 	cp->rc_last_access = gethrestime_sec();
1795 
1796 	cp->rc_cr_set = NULL;
1797 
1798 	cp->rc_sysidt = LM_NOSYSID;
1799 
1800 	list_create(&cp->rc_openownerlist, sizeof (rfs4_openowner_t),
1801 	    offsetof(rfs4_openowner_t, ro_node));
1802 
1803 	/* set up the callback control structure */
1804 	cp->rc_cbinfo.cb_state = CB_UNINIT;
1805 	mutex_init(cp->rc_cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
1806 	cv_init(cp->rc_cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
1807 	cv_init(cp->rc_cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);
1808 
1809 	/*
1810 	 * Associate the client_t with the current server instance.
1811 	 * The hold is solely to satisfy the calling requirement of
1812 	 * rfs4_servinst_assign(). In this case it's not strictly necessary.
1813 	 */
1814 	rfs4_dbe_hold(cp->rc_dbe);
1815 	rfs4_servinst_assign(nsrv4, cp, nsrv4->nfs4_cur_servinst);
1816 	rfs4_dbe_rele(cp->rc_dbe);
1817 
1818 	return (TRUE);
1819 }
1820 
1821 /*
1822  * Caller wants to generate/update the setclientid_confirm verifier
1823  * associated with a client.  This is done during the SETCLIENTID
1824  * processing.
1825  */
1826 void
rfs4_client_scv_next(rfs4_client_t * cp)1827 rfs4_client_scv_next(rfs4_client_t *cp)
1828 {
1829 	scid_confirm_verf *scvp;
1830 
1831 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1832 	scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1833 	scvp->cv_impl.gen_num++;
1834 }
1835 
1836 void
rfs4_client_rele(rfs4_client_t * cp)1837 rfs4_client_rele(rfs4_client_t *cp)
1838 {
1839 	rfs4_dbe_rele(cp->rc_dbe);
1840 }
1841 
1842 rfs4_client_t *
rfs4_findclient(nfs_client_id4 * client,bool_t * create,rfs4_client_t * oldcp)1843 rfs4_findclient(nfs_client_id4 *client, bool_t *create,	rfs4_client_t *oldcp)
1844 {
1845 	rfs4_client_t *cp;
1846 	nfs4_srv_t *nsrv4;
1847 	nsrv4 = nfs4_get_srv();
1848 
1849 
1850 	if (oldcp) {
1851 		rw_enter(&nsrv4->rfs4_findclient_lock, RW_WRITER);
1852 		rfs4_dbe_hide(oldcp->rc_dbe);
1853 	} else {
1854 		rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1855 	}
1856 
1857 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_nfsclnt_idx, client,
1858 	    create, (void *)client, RFS4_DBS_VALID);
1859 
1860 	if (oldcp)
1861 		rfs4_dbe_unhide(oldcp->rc_dbe);
1862 
1863 	rw_exit(&nsrv4->rfs4_findclient_lock);
1864 
1865 	return (cp);
1866 }
1867 
1868 rfs4_client_t *
rfs4_findclient_by_id(clientid4 clientid,bool_t find_unconfirmed)1869 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1870 {
1871 	rfs4_client_t *cp;
1872 	bool_t create = FALSE;
1873 	cid *cidp = (cid *)&clientid;
1874 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
1875 
1876 	/* If we're a cluster and the nodeid isn't right, short-circuit */
1877 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
1878 		return (NULL);
1879 
1880 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1881 
1882 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx, &clientid,
1883 	    &create, NULL, RFS4_DBS_VALID);
1884 
1885 	rw_exit(&nsrv4->rfs4_findclient_lock);
1886 
1887 	if (cp && cp->rc_need_confirm && find_unconfirmed == FALSE) {
1888 		rfs4_client_rele(cp);
1889 		return (NULL);
1890 	} else {
1891 		return (cp);
1892 	}
1893 }
1894 
1895 static uint32_t
clntip_hash(void * key)1896 clntip_hash(void *key)
1897 {
1898 	struct sockaddr *addr = key;
1899 	int i, len = 0;
1900 	uint32_t hash = 0;
1901 	char *ptr;
1902 
1903 	if (addr->sa_family == AF_INET) {
1904 		struct sockaddr_in *a = (struct sockaddr_in *)addr;
1905 		len = sizeof (struct in_addr);
1906 		ptr = (char *)&a->sin_addr;
1907 	} else if (addr->sa_family == AF_INET6) {
1908 		struct sockaddr_in6 *a = (struct sockaddr_in6 *)addr;
1909 		len = sizeof (struct in6_addr);
1910 		ptr = (char *)&a->sin6_addr;
1911 	} else
1912 		return (0);
1913 
1914 	for (i = 0; i < len; i++) {
1915 		hash <<= 1;
1916 		hash += (uint_t)ptr[i];
1917 	}
1918 	return (hash);
1919 }
1920 
1921 static bool_t
clntip_compare(rfs4_entry_t entry,void * key)1922 clntip_compare(rfs4_entry_t entry, void *key)
1923 {
1924 	rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1925 	struct sockaddr *addr = key;
1926 	int len = 0;
1927 	char *p1, *p2;
1928 
1929 	if (addr->sa_family == AF_INET) {
1930 		struct sockaddr_in *a1 = (struct sockaddr_in *)&cp->ri_addr;
1931 		struct sockaddr_in *a2 = (struct sockaddr_in *)addr;
1932 		len = sizeof (struct in_addr);
1933 		p1 = (char *)&a1->sin_addr;
1934 		p2 = (char *)&a2->sin_addr;
1935 	} else if (addr->sa_family == AF_INET6) {
1936 		struct sockaddr_in6 *a1 = (struct sockaddr_in6 *)&cp->ri_addr;
1937 		struct sockaddr_in6 *a2 = (struct sockaddr_in6 *)addr;
1938 		len = sizeof (struct in6_addr);
1939 		p1 = (char *)&a1->sin6_addr;
1940 		p2 = (char *)&a2->sin6_addr;
1941 	} else
1942 		return (0);
1943 
1944 	return (bcmp(p1, p2, len) == 0);
1945 }
1946 
1947 static void *
clntip_mkkey(rfs4_entry_t entry)1948 clntip_mkkey(rfs4_entry_t entry)
1949 {
1950 	rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1951 
1952 	return (&cp->ri_addr);
1953 }
1954 
1955 static bool_t
rfs4_clntip_expiry(rfs4_entry_t u_entry)1956 rfs4_clntip_expiry(rfs4_entry_t u_entry)
1957 {
1958 	rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1959 
1960 	if (rfs4_dbe_is_invalid(cp->ri_dbe))
1961 		return (TRUE);
1962 	return (FALSE);
1963 }
1964 
1965 /* ARGSUSED */
1966 static void
rfs4_clntip_destroy(rfs4_entry_t u_entry)1967 rfs4_clntip_destroy(rfs4_entry_t u_entry)
1968 {
1969 }
1970 
1971 static bool_t
rfs4_clntip_create(rfs4_entry_t u_entry,void * arg)1972 rfs4_clntip_create(rfs4_entry_t u_entry, void *arg)
1973 {
1974 	rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1975 	struct sockaddr *ca = (struct sockaddr *)arg;
1976 
1977 	/* Copy client's IP address */
1978 	if (ca->sa_family == AF_INET)
1979 		bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in));
1980 	else if (ca->sa_family == AF_INET6)
1981 		bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in6));
1982 	else
1983 		return (FALSE);
1984 	cp->ri_no_referrals = 1;
1985 
1986 	return (TRUE);
1987 }
1988 
1989 rfs4_clntip_t *
rfs4_find_clntip(struct sockaddr * addr,bool_t * create)1990 rfs4_find_clntip(struct sockaddr *addr, bool_t *create)
1991 {
1992 	rfs4_clntip_t *cp;
1993 	nfs4_srv_t *nsrv4;
1994 
1995 	nsrv4 = nfs4_get_srv();
1996 
1997 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1998 
1999 	cp = (rfs4_clntip_t *)rfs4_dbsearch(nsrv4->rfs4_clntip_idx, addr,
2000 	    create, addr, RFS4_DBS_VALID);
2001 
2002 	rw_exit(&nsrv4->rfs4_findclient_lock);
2003 
2004 	return (cp);
2005 }
2006 
2007 void
rfs4_invalidate_clntip(struct sockaddr * addr)2008 rfs4_invalidate_clntip(struct sockaddr *addr)
2009 {
2010 	rfs4_clntip_t *cp;
2011 	bool_t create = FALSE;
2012 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2013 
2014 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2015 
2016 	cp = (rfs4_clntip_t *)rfs4_dbsearch(nsrv4->rfs4_clntip_idx, addr,
2017 	    &create, NULL, RFS4_DBS_VALID);
2018 	if (cp == NULL) {
2019 		rw_exit(&nsrv4->rfs4_findclient_lock);
2020 		return;
2021 	}
2022 	rfs4_dbe_invalidate(cp->ri_dbe);
2023 	rfs4_dbe_rele(cp->ri_dbe);
2024 
2025 	rw_exit(&nsrv4->rfs4_findclient_lock);
2026 }
2027 
2028 bool_t
rfs4_lease_expired(rfs4_client_t * cp)2029 rfs4_lease_expired(rfs4_client_t *cp)
2030 {
2031 	bool_t rc;
2032 
2033 	rfs4_dbe_lock(cp->rc_dbe);
2034 
2035 	/*
2036 	 * If the admin has executed clear_locks for this
2037 	 * client id, force expire will be set, so no need
2038 	 * to calculate anything because it's "outa here".
2039 	 */
2040 	if (cp->rc_forced_expire) {
2041 		rc = TRUE;
2042 	} else {
2043 		rc = (gethrestime_sec() - cp->rc_last_access > rfs4_lease_time);
2044 	}
2045 
2046 	/*
2047 	 * If the lease has expired we will also want
2048 	 * to remove any stable storage state data. So
2049 	 * mark the client id accordingly.
2050 	 */
2051 	if (!cp->rc_ss_remove)
2052 		cp->rc_ss_remove = (rc == TRUE);
2053 
2054 	rfs4_dbe_unlock(cp->rc_dbe);
2055 
2056 	return (rc);
2057 }
2058 
2059 void
rfs4_update_lease(rfs4_client_t * cp)2060 rfs4_update_lease(rfs4_client_t *cp)
2061 {
2062 	rfs4_dbe_lock(cp->rc_dbe);
2063 	if (!cp->rc_forced_expire)
2064 		cp->rc_last_access = gethrestime_sec();
2065 	rfs4_dbe_unlock(cp->rc_dbe);
2066 }
2067 
2068 
2069 static bool_t
EQOPENOWNER(open_owner4 * a,open_owner4 * b)2070 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
2071 {
2072 	bool_t rc;
2073 
2074 	if (a->clientid != b->clientid)
2075 		return (FALSE);
2076 
2077 	if (a->owner_len != b->owner_len)
2078 		return (FALSE);
2079 
2080 	rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
2081 
2082 	return (rc);
2083 }
2084 
2085 static uint_t
openowner_hash(void * key)2086 openowner_hash(void *key)
2087 {
2088 	int i;
2089 	open_owner4 *openowner = key;
2090 	uint_t hash = 0;
2091 
2092 	for (i = 0; i < openowner->owner_len; i++) {
2093 		hash <<= 4;
2094 		hash += (uint_t)openowner->owner_val[i];
2095 	}
2096 	hash += (uint_t)openowner->clientid;
2097 	hash |= (openowner->clientid >> 32);
2098 
2099 	return (hash);
2100 }
2101 
2102 static bool_t
openowner_compare(rfs4_entry_t u_entry,void * key)2103 openowner_compare(rfs4_entry_t u_entry, void *key)
2104 {
2105 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2106 	open_owner4 *arg = key;
2107 
2108 	return (EQOPENOWNER(&oo->ro_owner, arg));
2109 }
2110 
2111 void *
openowner_mkkey(rfs4_entry_t u_entry)2112 openowner_mkkey(rfs4_entry_t u_entry)
2113 {
2114 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2115 
2116 	return (&oo->ro_owner);
2117 }
2118 
2119 /* ARGSUSED */
2120 static bool_t
rfs4_openowner_expiry(rfs4_entry_t u_entry)2121 rfs4_openowner_expiry(rfs4_entry_t u_entry)
2122 {
2123 	/* openstateid held us and did all needed delay */
2124 	return (TRUE);
2125 }
2126 
2127 static void
rfs4_openowner_destroy(rfs4_entry_t u_entry)2128 rfs4_openowner_destroy(rfs4_entry_t u_entry)
2129 {
2130 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2131 
2132 	/* Remove open owner from client's lists of open owners */
2133 	rfs4_dbe_lock(oo->ro_client->rc_dbe);
2134 	list_remove(&oo->ro_client->rc_openownerlist, oo);
2135 	rfs4_dbe_unlock(oo->ro_client->rc_dbe);
2136 
2137 	/* One less reference to the client */
2138 	rfs4_client_rele(oo->ro_client);
2139 	oo->ro_client = NULL;
2140 
2141 	/* Free the last reply for this lock owner */
2142 	rfs4_free_reply(&oo->ro_reply);
2143 
2144 	if (oo->ro_reply_fh.nfs_fh4_val) {
2145 		kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2146 		    oo->ro_reply_fh.nfs_fh4_len);
2147 		oo->ro_reply_fh.nfs_fh4_val = NULL;
2148 		oo->ro_reply_fh.nfs_fh4_len = 0;
2149 	}
2150 
2151 	rfs4_sw_destroy(&oo->ro_sw);
2152 	list_destroy(&oo->ro_statelist);
2153 
2154 	/* Free the lock owner id */
2155 	kmem_free(oo->ro_owner.owner_val, oo->ro_owner.owner_len);
2156 }
2157 
2158 void
rfs4_openowner_rele(rfs4_openowner_t * oo)2159 rfs4_openowner_rele(rfs4_openowner_t *oo)
2160 {
2161 	rfs4_dbe_rele(oo->ro_dbe);
2162 }
2163 
2164 static bool_t
rfs4_openowner_create(rfs4_entry_t u_entry,void * arg)2165 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
2166 {
2167 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2168 	rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
2169 	open_owner4 *openowner = &argp->ro_owner;
2170 	seqid4 seqid = argp->ro_open_seqid;
2171 	rfs4_client_t *cp;
2172 	bool_t create = FALSE;
2173 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2174 
2175 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2176 
2177 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx,
2178 	    &openowner->clientid,
2179 	    &create, NULL, RFS4_DBS_VALID);
2180 
2181 	rw_exit(&nsrv4->rfs4_findclient_lock);
2182 
2183 	if (cp == NULL)
2184 		return (FALSE);
2185 
2186 	oo->ro_reply_fh.nfs_fh4_len = 0;
2187 	oo->ro_reply_fh.nfs_fh4_val = NULL;
2188 
2189 	oo->ro_owner.clientid = openowner->clientid;
2190 	oo->ro_owner.owner_val =
2191 	    kmem_alloc(openowner->owner_len, KM_SLEEP);
2192 
2193 	bcopy(openowner->owner_val,
2194 	    oo->ro_owner.owner_val, openowner->owner_len);
2195 
2196 	oo->ro_owner.owner_len = openowner->owner_len;
2197 
2198 	oo->ro_need_confirm = TRUE;
2199 
2200 	rfs4_sw_init(&oo->ro_sw);
2201 
2202 	oo->ro_open_seqid = seqid;
2203 	bzero(&oo->ro_reply, sizeof (nfs_resop4));
2204 	oo->ro_client = cp;
2205 	oo->ro_cr_set = NULL;
2206 
2207 	list_create(&oo->ro_statelist, sizeof (rfs4_state_t),
2208 	    offsetof(rfs4_state_t, rs_node));
2209 
2210 	/* Insert openowner into client's open owner list */
2211 	rfs4_dbe_lock(cp->rc_dbe);
2212 	list_insert_tail(&cp->rc_openownerlist, oo);
2213 	rfs4_dbe_unlock(cp->rc_dbe);
2214 
2215 	return (TRUE);
2216 }
2217 
2218 rfs4_openowner_t *
rfs4_findopenowner(open_owner4 * openowner,bool_t * create,seqid4 seqid)2219 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
2220 {
2221 	rfs4_openowner_t *oo;
2222 	rfs4_openowner_t arg;
2223 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2224 
2225 	arg.ro_owner = *openowner;
2226 	arg.ro_open_seqid = seqid;
2227 	/* CSTYLED */
2228 	oo = (rfs4_openowner_t *)rfs4_dbsearch(nsrv4->rfs4_openowner_idx, openowner,
2229 	    create, &arg, RFS4_DBS_VALID);
2230 
2231 	return (oo);
2232 }
2233 
2234 void
rfs4_update_open_sequence(rfs4_openowner_t * oo)2235 rfs4_update_open_sequence(rfs4_openowner_t *oo)
2236 {
2237 
2238 	rfs4_dbe_lock(oo->ro_dbe);
2239 
2240 	oo->ro_open_seqid++;
2241 
2242 	rfs4_dbe_unlock(oo->ro_dbe);
2243 }
2244 
2245 void
rfs4_update_open_resp(rfs4_openowner_t * oo,nfs_resop4 * resp,nfs_fh4 * fh)2246 rfs4_update_open_resp(rfs4_openowner_t *oo, nfs_resop4 *resp, nfs_fh4 *fh)
2247 {
2248 
2249 	rfs4_dbe_lock(oo->ro_dbe);
2250 
2251 	rfs4_free_reply(&oo->ro_reply);
2252 
2253 	rfs4_copy_reply(&oo->ro_reply, resp);
2254 
2255 	/* Save the filehandle if provided and free if not used */
2256 	if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
2257 	    fh && fh->nfs_fh4_len) {
2258 		if (oo->ro_reply_fh.nfs_fh4_val == NULL)
2259 			oo->ro_reply_fh.nfs_fh4_val =
2260 			    kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2261 		nfs_fh4_copy(fh, &oo->ro_reply_fh);
2262 	} else {
2263 		if (oo->ro_reply_fh.nfs_fh4_val) {
2264 			kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2265 			    oo->ro_reply_fh.nfs_fh4_len);
2266 			oo->ro_reply_fh.nfs_fh4_val = NULL;
2267 			oo->ro_reply_fh.nfs_fh4_len = 0;
2268 		}
2269 	}
2270 
2271 	rfs4_dbe_unlock(oo->ro_dbe);
2272 }
2273 
2274 static bool_t
lockowner_compare(rfs4_entry_t u_entry,void * key)2275 lockowner_compare(rfs4_entry_t u_entry, void *key)
2276 {
2277 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2278 	lock_owner4 *b = (lock_owner4 *)key;
2279 
2280 	if (lo->rl_owner.clientid != b->clientid)
2281 		return (FALSE);
2282 
2283 	if (lo->rl_owner.owner_len != b->owner_len)
2284 		return (FALSE);
2285 
2286 	return (bcmp(lo->rl_owner.owner_val, b->owner_val,
2287 	    lo->rl_owner.owner_len) == 0);
2288 }
2289 
2290 void *
lockowner_mkkey(rfs4_entry_t u_entry)2291 lockowner_mkkey(rfs4_entry_t u_entry)
2292 {
2293 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2294 
2295 	return (&lo->rl_owner);
2296 }
2297 
2298 static uint32_t
lockowner_hash(void * key)2299 lockowner_hash(void *key)
2300 {
2301 	int i;
2302 	lock_owner4 *lockowner = key;
2303 	uint_t hash = 0;
2304 
2305 	for (i = 0; i < lockowner->owner_len; i++) {
2306 		hash <<= 4;
2307 		hash += (uint_t)lockowner->owner_val[i];
2308 	}
2309 	hash += (uint_t)lockowner->clientid;
2310 	hash |= (lockowner->clientid >> 32);
2311 
2312 	return (hash);
2313 }
2314 
/*
 * Hash a pid key.  The key is the pid value itself carried in a
 * pointer, so the hash is simply that value truncated to 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
	uintptr_t raw = (uintptr_t)key;

	return ((uint32_t)raw);
}
2320 
2321 static void *
pid_mkkey(rfs4_entry_t u_entry)2322 pid_mkkey(rfs4_entry_t u_entry)
2323 {
2324 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2325 
2326 	return ((void *)(uintptr_t)lo->rl_pid);
2327 }
2328 
2329 static bool_t
pid_compare(rfs4_entry_t u_entry,void * key)2330 pid_compare(rfs4_entry_t u_entry, void *key)
2331 {
2332 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2333 
2334 	return (lo->rl_pid == (pid_t)(uintptr_t)key);
2335 }
2336 
2337 static void
rfs4_lockowner_destroy(rfs4_entry_t u_entry)2338 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
2339 {
2340 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2341 
2342 	/* Free the lock owner id */
2343 	kmem_free(lo->rl_owner.owner_val, lo->rl_owner.owner_len);
2344 	rfs4_client_rele(lo->rl_client);
2345 }
2346 
2347 void
rfs4_lockowner_rele(rfs4_lockowner_t * lo)2348 rfs4_lockowner_rele(rfs4_lockowner_t *lo)
2349 {
2350 	rfs4_dbe_rele(lo->rl_dbe);
2351 }
2352 
2353 /* ARGSUSED */
2354 static bool_t
rfs4_lockowner_expiry(rfs4_entry_t u_entry)2355 rfs4_lockowner_expiry(rfs4_entry_t u_entry)
2356 {
2357 	/*
2358 	 * Since expiry is called with no other references on
2359 	 * this struct, go ahead and have it removed.
2360 	 */
2361 	return (TRUE);
2362 }
2363 
2364 static bool_t
rfs4_lockowner_create(rfs4_entry_t u_entry,void * arg)2365 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
2366 {
2367 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2368 	lock_owner4 *lockowner = (lock_owner4 *)arg;
2369 	rfs4_client_t *cp;
2370 	bool_t create = FALSE;
2371 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2372 
2373 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2374 
2375 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx,
2376 	    &lockowner->clientid,
2377 	    &create, NULL, RFS4_DBS_VALID);
2378 
2379 	rw_exit(&nsrv4->rfs4_findclient_lock);
2380 
2381 	if (cp == NULL)
2382 		return (FALSE);
2383 
2384 	/* Reference client */
2385 	lo->rl_client = cp;
2386 	lo->rl_owner.clientid = lockowner->clientid;
2387 	lo->rl_owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
2388 	bcopy(lockowner->owner_val, lo->rl_owner.owner_val,
2389 	    lockowner->owner_len);
2390 	lo->rl_owner.owner_len = lockowner->owner_len;
2391 	lo->rl_pid = rfs4_dbe_getid(lo->rl_dbe);
2392 
2393 	return (TRUE);
2394 }
2395 
2396 rfs4_lockowner_t *
rfs4_findlockowner(lock_owner4 * lockowner,bool_t * create)2397 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2398 {
2399 	rfs4_lockowner_t *lo;
2400 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2401 
2402 	/* CSTYLED */
2403 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(nsrv4->rfs4_lockowner_idx, lockowner,
2404 	    create, lockowner, RFS4_DBS_VALID);
2405 
2406 	return (lo);
2407 }
2408 
2409 rfs4_lockowner_t *
rfs4_findlockowner_by_pid(pid_t pid)2410 rfs4_findlockowner_by_pid(pid_t pid)
2411 {
2412 	rfs4_lockowner_t *lo;
2413 	bool_t create = FALSE;
2414 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2415 
2416 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(nsrv4->rfs4_lockowner_pid_idx,
2417 	    (void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2418 
2419 	return (lo);
2420 }
2421 
2422 
/*
 * Hash a file key by applying ADDRHASH() to the key pointer.
 */
static uint32_t
file_hash(void *key)
{
	uint32_t hval = ADDRHASH(key);

	return (hval);
}
2428 
2429 static void *
file_mkkey(rfs4_entry_t u_entry)2430 file_mkkey(rfs4_entry_t u_entry)
2431 {
2432 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2433 
2434 	return (fp->rf_vp);
2435 }
2436 
2437 static bool_t
file_compare(rfs4_entry_t u_entry,void * key)2438 file_compare(rfs4_entry_t u_entry, void *key)
2439 {
2440 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2441 
2442 	return (fp->rf_vp == (vnode_t *)key);
2443 }
2444 
2445 static void
rfs4_file_destroy(rfs4_entry_t u_entry)2446 rfs4_file_destroy(rfs4_entry_t u_entry)
2447 {
2448 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2449 
2450 	list_destroy(&fp->rf_delegstatelist);
2451 
2452 	if (fp->rf_filehandle.nfs_fh4_val)
2453 		kmem_free(fp->rf_filehandle.nfs_fh4_val,
2454 		    fp->rf_filehandle.nfs_fh4_len);
2455 	cv_destroy(fp->rf_dinfo.rd_recall_cv);
2456 	if (fp->rf_vp) {
2457 		vnode_t *vp = fp->rf_vp;
2458 
2459 		mutex_enter(&vp->v_vsd_lock);
2460 		(void) vsd_set(vp, nfs4_srv_vkey, NULL);
2461 		mutex_exit(&vp->v_vsd_lock);
2462 		VN_RELE(vp);
2463 		fp->rf_vp = NULL;
2464 	}
2465 	rw_destroy(&fp->rf_file_rwlock);
2466 }
2467 
2468 /*
2469  * Used to unlock the underlying dbe struct only
2470  */
2471 void
rfs4_file_rele(rfs4_file_t * fp)2472 rfs4_file_rele(rfs4_file_t *fp)
2473 {
2474 	rfs4_dbe_rele(fp->rf_dbe);
2475 }
2476 
2477 typedef struct {
2478     vnode_t *vp;
2479     nfs_fh4 *fh;
2480 } rfs4_fcreate_arg;
2481 
2482 static bool_t
rfs4_file_create(rfs4_entry_t u_entry,void * arg)2483 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2484 {
2485 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2486 	rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2487 	vnode_t *vp = ap->vp;
2488 	nfs_fh4 *fh = ap->fh;
2489 
2490 	VN_HOLD(vp);
2491 
2492 	fp->rf_filehandle.nfs_fh4_len = 0;
2493 	fp->rf_filehandle.nfs_fh4_val = NULL;
2494 	ASSERT(fh && fh->nfs_fh4_len);
2495 	if (fh && fh->nfs_fh4_len) {
2496 		fp->rf_filehandle.nfs_fh4_val =
2497 		    kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2498 		nfs_fh4_copy(fh, &fp->rf_filehandle);
2499 	}
2500 	fp->rf_vp = vp;
2501 
2502 	list_create(&fp->rf_delegstatelist, sizeof (rfs4_deleg_state_t),
2503 	    offsetof(rfs4_deleg_state_t, rds_node));
2504 
2505 	fp->rf_share_deny = fp->rf_share_access = fp->rf_access_read = 0;
2506 	fp->rf_access_write = fp->rf_deny_read = fp->rf_deny_write = 0;
2507 
2508 	mutex_init(fp->rf_dinfo.rd_recall_lock, NULL, MUTEX_DEFAULT, NULL);
2509 	cv_init(fp->rf_dinfo.rd_recall_cv, NULL, CV_DEFAULT, NULL);
2510 
2511 	fp->rf_dinfo.rd_dtype = OPEN_DELEGATE_NONE;
2512 
2513 	rw_init(&fp->rf_file_rwlock, NULL, RW_DEFAULT, NULL);
2514 
2515 	mutex_enter(&vp->v_vsd_lock);
2516 	VERIFY(vsd_set(vp, nfs4_srv_vkey, (void *)fp) == 0);
2517 	mutex_exit(&vp->v_vsd_lock);
2518 
2519 	return (TRUE);
2520 }
2521 
2522 rfs4_file_t *
rfs4_findfile(vnode_t * vp,nfs_fh4 * fh,bool_t * create)2523 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2524 {
2525 	rfs4_file_t *fp;
2526 	rfs4_fcreate_arg arg;
2527 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2528 
2529 	arg.vp = vp;
2530 	arg.fh = fh;
2531 
2532 	if (*create == TRUE)
2533 		/* CSTYLED */
2534 		fp = (rfs4_file_t *)rfs4_dbsearch(nsrv4->rfs4_file_idx, vp, create,
2535 		    &arg, RFS4_DBS_VALID);
2536 	else {
2537 		mutex_enter(&vp->v_vsd_lock);
2538 		fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2539 		if (fp) {
2540 			rfs4_dbe_lock(fp->rf_dbe);
2541 			if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2542 			    (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2543 				rfs4_dbe_unlock(fp->rf_dbe);
2544 				fp = NULL;
2545 			} else {
2546 				rfs4_dbe_hold(fp->rf_dbe);
2547 				rfs4_dbe_unlock(fp->rf_dbe);
2548 			}
2549 		}
2550 		mutex_exit(&vp->v_vsd_lock);
2551 	}
2552 	return (fp);
2553 }
2554 
2555 /*
2556  * Find a file in the db and once it is located, take the rw lock.
2557  * Need to check the vnode pointer and if it does not exist (it was
2558  * removed between the db location and check) redo the find.  This
2559  * assumes that a file struct that has a NULL vnode pointer is marked
2560  * at 'invalid' and will not be found in the db the second time
2561  * around.
2562  */
2563 rfs4_file_t *
rfs4_findfile_withlock(vnode_t * vp,nfs_fh4 * fh,bool_t * create)2564 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2565 {
2566 	rfs4_file_t *fp;
2567 	rfs4_fcreate_arg arg;
2568 	bool_t screate = *create;
2569 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2570 
2571 	if (screate == FALSE) {
2572 		mutex_enter(&vp->v_vsd_lock);
2573 		fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2574 		if (fp) {
2575 			rfs4_dbe_lock(fp->rf_dbe);
2576 			if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2577 			    (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2578 				rfs4_dbe_unlock(fp->rf_dbe);
2579 				mutex_exit(&vp->v_vsd_lock);
2580 				fp = NULL;
2581 			} else {
2582 				rfs4_dbe_hold(fp->rf_dbe);
2583 				rfs4_dbe_unlock(fp->rf_dbe);
2584 				mutex_exit(&vp->v_vsd_lock);
2585 				rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2586 				if (fp->rf_vp == NULL) {
2587 					rw_exit(&fp->rf_file_rwlock);
2588 					rfs4_file_rele(fp);
2589 					fp = NULL;
2590 				}
2591 			}
2592 		} else {
2593 			mutex_exit(&vp->v_vsd_lock);
2594 		}
2595 	} else {
2596 retry:
2597 		arg.vp = vp;
2598 		arg.fh = fh;
2599 
2600 		fp = (rfs4_file_t *)rfs4_dbsearch(nsrv4->rfs4_file_idx, vp,
2601 		    create, &arg, RFS4_DBS_VALID);
2602 		if (fp != NULL) {
2603 			rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2604 			if (fp->rf_vp == NULL) {
2605 				rw_exit(&fp->rf_file_rwlock);
2606 				rfs4_file_rele(fp);
2607 				*create = screate;
2608 				goto retry;
2609 			}
2610 		}
2611 	}
2612 
2613 	return (fp);
2614 }
2615 
2616 static uint32_t
lo_state_hash(void * key)2617 lo_state_hash(void *key)
2618 {
2619 	stateid_t *id = key;
2620 
2621 	return (id->bits.ident+id->bits.pid);
2622 }
2623 
2624 static bool_t
lo_state_compare(rfs4_entry_t u_entry,void * key)2625 lo_state_compare(rfs4_entry_t u_entry, void *key)
2626 {
2627 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2628 	stateid_t *id = key;
2629 	bool_t rc;
2630 
2631 	rc = (lsp->rls_lockid.bits.boottime == id->bits.boottime &&
2632 	    lsp->rls_lockid.bits.type == id->bits.type &&
2633 	    lsp->rls_lockid.bits.ident == id->bits.ident &&
2634 	    lsp->rls_lockid.bits.pid == id->bits.pid);
2635 
2636 	return (rc);
2637 }
2638 
2639 static void *
lo_state_mkkey(rfs4_entry_t u_entry)2640 lo_state_mkkey(rfs4_entry_t u_entry)
2641 {
2642 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2643 
2644 	return (&lsp->rls_lockid);
2645 }
2646 
2647 static bool_t
rfs4_lo_state_expiry(rfs4_entry_t u_entry)2648 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2649 {
2650 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2651 
2652 	if (rfs4_dbe_is_invalid(lsp->rls_dbe))
2653 		return (TRUE);
2654 	if (lsp->rls_state->rs_closed)
2655 		return (TRUE);
2656 	return ((gethrestime_sec() -
2657 	    lsp->rls_state->rs_owner->ro_client->rc_last_access
2658 	    > rfs4_lease_time));
2659 }
2660 
2661 static void
rfs4_lo_state_destroy(rfs4_entry_t u_entry)2662 rfs4_lo_state_destroy(rfs4_entry_t u_entry)
2663 {
2664 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2665 
2666 	rfs4_dbe_lock(lsp->rls_state->rs_dbe);
2667 	list_remove(&lsp->rls_state->rs_lostatelist, lsp);
2668 	rfs4_dbe_unlock(lsp->rls_state->rs_dbe);
2669 
2670 	rfs4_sw_destroy(&lsp->rls_sw);
2671 
2672 	/* Make sure to release the file locks */
2673 	if (lsp->rls_locks_cleaned == FALSE) {
2674 		lsp->rls_locks_cleaned = TRUE;
2675 		if (lsp->rls_locker->rl_client->rc_sysidt != LM_NOSYSID) {
2676 			/* Is the PxFS kernel module loaded? */
2677 			if (lm_remove_file_locks != NULL) {
2678 				int new_sysid;
2679 
2680 				/* Encode the cluster nodeid in new sysid */
2681 				new_sysid =
2682 				    lsp->rls_locker->rl_client->rc_sysidt;
2683 				lm_set_nlmid_flk(&new_sysid);
2684 
2685 				/*
2686 				 * This PxFS routine removes file locks for a
2687 				 * client over all nodes of a cluster.
2688 				 */
2689 				DTRACE_PROBE1(nfss_i_clust_rm_lck,
2690 				    int, new_sysid);
2691 				(*lm_remove_file_locks)(new_sysid);
2692 			} else {
2693 				(void) cleanlocks(
2694 				    lsp->rls_state->rs_finfo->rf_vp,
2695 				    lsp->rls_locker->rl_pid,
2696 				    lsp->rls_locker->rl_client->rc_sysidt);
2697 			}
2698 		}
2699 	}
2700 
2701 	/* Free the last reply for this state */
2702 	rfs4_free_reply(&lsp->rls_reply);
2703 
2704 	rfs4_lockowner_rele(lsp->rls_locker);
2705 	lsp->rls_locker = NULL;
2706 
2707 	rfs4_state_rele_nounlock(lsp->rls_state);
2708 	lsp->rls_state = NULL;
2709 }
2710 
2711 static bool_t
rfs4_lo_state_create(rfs4_entry_t u_entry,void * arg)2712 rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
2713 {
2714 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2715 	rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
2716 	rfs4_lockowner_t *lo = argp->rls_locker;
2717 	rfs4_state_t *sp = argp->rls_state;
2718 
2719 	lsp->rls_state = sp;
2720 
2721 	lsp->rls_lockid = sp->rs_stateid;
2722 	lsp->rls_lockid.bits.type = LOCKID;
2723 	lsp->rls_lockid.bits.chgseq = 0;
2724 	lsp->rls_lockid.bits.pid = lo->rl_pid;
2725 
2726 	lsp->rls_locks_cleaned = FALSE;
2727 	lsp->rls_lock_completed = FALSE;
2728 
2729 	rfs4_sw_init(&lsp->rls_sw);
2730 
2731 	/* Attached the supplied lock owner */
2732 	rfs4_dbe_hold(lo->rl_dbe);
2733 	lsp->rls_locker = lo;
2734 
2735 	rfs4_dbe_lock(sp->rs_dbe);
2736 	list_insert_tail(&sp->rs_lostatelist, lsp);
2737 	rfs4_dbe_hold(sp->rs_dbe);
2738 	rfs4_dbe_unlock(sp->rs_dbe);
2739 
2740 	return (TRUE);
2741 }
2742 
2743 void
rfs4_lo_state_rele(rfs4_lo_state_t * lsp,bool_t unlock_fp)2744 rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
2745 {
2746 	if (unlock_fp == TRUE)
2747 		rw_exit(&lsp->rls_state->rs_finfo->rf_file_rwlock);
2748 	rfs4_dbe_rele(lsp->rls_dbe);
2749 }
2750 
2751 static rfs4_lo_state_t *
rfs4_findlo_state(stateid_t * id,bool_t lock_fp)2752 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2753 {
2754 	rfs4_lo_state_t *lsp;
2755 	bool_t create = FALSE;
2756 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2757 
2758 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(nsrv4->rfs4_lo_state_idx, id,
2759 	    &create, NULL, RFS4_DBS_VALID);
2760 	if (lock_fp == TRUE && lsp != NULL)
2761 		rw_enter(&lsp->rls_state->rs_finfo->rf_file_rwlock, RW_READER);
2762 
2763 	return (lsp);
2764 }
2765 
2766 
2767 static uint32_t
lo_state_lo_hash(void * key)2768 lo_state_lo_hash(void *key)
2769 {
2770 	rfs4_lo_state_t *lsp = key;
2771 
2772 	return (ADDRHASH(lsp->rls_locker) ^ ADDRHASH(lsp->rls_state));
2773 }
2774 
2775 static bool_t
lo_state_lo_compare(rfs4_entry_t u_entry,void * key)2776 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2777 {
2778 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2779 	rfs4_lo_state_t *keyp = key;
2780 
2781 	return (keyp->rls_locker == lsp->rls_locker &&
2782 	    keyp->rls_state == lsp->rls_state);
2783 }
2784 
2785 static void *
lo_state_lo_mkkey(rfs4_entry_t u_entry)2786 lo_state_lo_mkkey(rfs4_entry_t u_entry)
2787 {
2788 	return (u_entry);
2789 }
2790 
2791 rfs4_lo_state_t *
rfs4_findlo_state_by_owner(rfs4_lockowner_t * lo,rfs4_state_t * sp,bool_t * create)2792 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo, rfs4_state_t *sp,
2793     bool_t *create)
2794 {
2795 	rfs4_lo_state_t *lsp;
2796 	rfs4_lo_state_t arg;
2797 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2798 
2799 	arg.rls_locker = lo;
2800 	arg.rls_state = sp;
2801 
2802 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(nsrv4->rfs4_lo_state_owner_idx,
2803 	    &arg, create, &arg, RFS4_DBS_VALID);
2804 
2805 	return (lsp);
2806 }
2807 
2808 static stateid_t
get_stateid(id_t eid)2809 get_stateid(id_t eid)
2810 {
2811 	stateid_t id;
2812 	nfs4_srv_t *nsrv4;
2813 
2814 	nsrv4 = nfs4_get_srv();
2815 
2816 	id.bits.boottime = nsrv4->rfs4_start_time;
2817 	id.bits.ident = eid;
2818 	id.bits.chgseq = 0;
2819 	id.bits.type = 0;
2820 	id.bits.pid = 0;
2821 
2822 	/*
2823 	 * If we are booted as a cluster node, embed our nodeid.
2824 	 * We've already done sanity checks in rfs4_client_create() so no
2825 	 * need to repeat them here.
2826 	 */
2827 	id.bits.clnodeid = (cluster_bootflags & CLUSTER_BOOTED) ?
2828 	    clconf_get_nodeid() : 0;
2829 
2830 	return (id);
2831 }
2832 
2833 /*
2834  * For use only when booted as a cluster node.
2835  * Returns TRUE if the embedded nodeid indicates that this stateid was
2836  * generated on another node.
2837  */
2838 static int
foreign_stateid(stateid_t * id)2839 foreign_stateid(stateid_t *id)
2840 {
2841 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2842 	return (id->bits.clnodeid != (uint32_t)clconf_get_nodeid());
2843 }
2844 
2845 /*
2846  * For use only when booted as a cluster node.
2847  * Returns TRUE if the embedded nodeid indicates that this clientid was
2848  * generated on another node.
2849  */
2850 static int
foreign_clientid(cid * cidp)2851 foreign_clientid(cid *cidp)
2852 {
2853 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2854 	return (cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT !=
2855 	    (uint32_t)clconf_get_nodeid());
2856 }
2857 
2858 /*
2859  * For use only when booted as a cluster node.
2860  * Embed our cluster nodeid into the clientid.
2861  */
2862 static void
embed_nodeid(cid * cidp)2863 embed_nodeid(cid *cidp)
2864 {
2865 	int clnodeid;
2866 	/*
2867 	 * Currently, our state tables are small enough that their
2868 	 * ids will leave enough bits free for the nodeid. If the
2869 	 * tables become larger, we mustn't overwrite the id.
2870 	 * Equally, we only have room for so many bits of nodeid, so
2871 	 * must check that too.
2872 	 */
2873 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2874 	ASSERT(cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT == 0);
2875 	clnodeid = clconf_get_nodeid();
2876 	ASSERT(clnodeid <= CLUSTER_MAX_NODEID);
2877 	ASSERT(clnodeid != NODEID_UNKNOWN);
2878 	cidp->impl_id.c_id |= (clnodeid << CLUSTER_NODEID_SHIFT);
2879 }
2880 
2881 static uint32_t
state_hash(void * key)2882 state_hash(void *key)
2883 {
2884 	stateid_t *ip = (stateid_t *)key;
2885 
2886 	return (ip->bits.ident);
2887 }
2888 
2889 static bool_t
state_compare(rfs4_entry_t u_entry,void * key)2890 state_compare(rfs4_entry_t u_entry, void *key)
2891 {
2892 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2893 	stateid_t *id = (stateid_t *)key;
2894 	bool_t rc;
2895 
2896 	rc = (sp->rs_stateid.bits.boottime == id->bits.boottime &&
2897 	    sp->rs_stateid.bits.ident == id->bits.ident);
2898 
2899 	return (rc);
2900 }
2901 
2902 static void *
state_mkkey(rfs4_entry_t u_entry)2903 state_mkkey(rfs4_entry_t u_entry)
2904 {
2905 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2906 
2907 	return (&sp->rs_stateid);
2908 }
2909 
2910 static void
rfs4_state_destroy(rfs4_entry_t u_entry)2911 rfs4_state_destroy(rfs4_entry_t u_entry)
2912 {
2913 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2914 
2915 	/* remove from openowner list */
2916 	rfs4_dbe_lock(sp->rs_owner->ro_dbe);
2917 	list_remove(&sp->rs_owner->ro_statelist, sp);
2918 	rfs4_dbe_unlock(sp->rs_owner->ro_dbe);
2919 
2920 	list_destroy(&sp->rs_lostatelist);
2921 
2922 	/* release any share locks for this stateid if it's still open */
2923 	if (!sp->rs_closed) {
2924 		rfs4_dbe_lock(sp->rs_dbe);
2925 		(void) rfs4_unshare(sp);
2926 		rfs4_dbe_unlock(sp->rs_dbe);
2927 	}
2928 
2929 	/* Were done with the file */
2930 	rfs4_file_rele(sp->rs_finfo);
2931 	sp->rs_finfo = NULL;
2932 
2933 	/* And now with the openowner */
2934 	rfs4_openowner_rele(sp->rs_owner);
2935 	sp->rs_owner = NULL;
2936 }
2937 
2938 static void
rfs4_state_rele_nounlock(rfs4_state_t * sp)2939 rfs4_state_rele_nounlock(rfs4_state_t *sp)
2940 {
2941 	rfs4_dbe_rele(sp->rs_dbe);
2942 }
2943 
2944 void
rfs4_state_rele(rfs4_state_t * sp)2945 rfs4_state_rele(rfs4_state_t *sp)
2946 {
2947 	rw_exit(&sp->rs_finfo->rf_file_rwlock);
2948 	rfs4_dbe_rele(sp->rs_dbe);
2949 }
2950 
2951 static uint32_t
deleg_hash(void * key)2952 deleg_hash(void *key)
2953 {
2954 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2955 
2956 	return (ADDRHASH(dsp->rds_client) ^ ADDRHASH(dsp->rds_finfo));
2957 }
2958 
2959 static bool_t
deleg_compare(rfs4_entry_t u_entry,void * key)2960 deleg_compare(rfs4_entry_t u_entry, void *key)
2961 {
2962 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2963 	rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2964 
2965 	return (dsp->rds_client == kdsp->rds_client &&
2966 	    dsp->rds_finfo == kdsp->rds_finfo);
2967 }
2968 
2969 static void *
deleg_mkkey(rfs4_entry_t u_entry)2970 deleg_mkkey(rfs4_entry_t u_entry)
2971 {
2972 	return (u_entry);
2973 }
2974 
2975 static uint32_t
deleg_state_hash(void * key)2976 deleg_state_hash(void *key)
2977 {
2978 	stateid_t *ip = (stateid_t *)key;
2979 
2980 	return (ip->bits.ident);
2981 }
2982 
2983 static bool_t
deleg_state_compare(rfs4_entry_t u_entry,void * key)2984 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2985 {
2986 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2987 	stateid_t *id = (stateid_t *)key;
2988 	bool_t rc;
2989 
2990 	if (id->bits.type != DELEGID)
2991 		return (FALSE);
2992 
2993 	rc = (dsp->rds_delegid.bits.boottime == id->bits.boottime &&
2994 	    dsp->rds_delegid.bits.ident == id->bits.ident);
2995 
2996 	return (rc);
2997 }
2998 
2999 static void *
deleg_state_mkkey(rfs4_entry_t u_entry)3000 deleg_state_mkkey(rfs4_entry_t u_entry)
3001 {
3002 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3003 
3004 	return (&dsp->rds_delegid);
3005 }
3006 
3007 static bool_t
rfs4_deleg_state_expiry(rfs4_entry_t u_entry)3008 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
3009 {
3010 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3011 
3012 	if (rfs4_dbe_is_invalid(dsp->rds_dbe))
3013 		return (TRUE);
3014 
3015 	if (dsp->rds_dtype == OPEN_DELEGATE_NONE)
3016 		return (TRUE);
3017 
3018 	if ((gethrestime_sec() - dsp->rds_client->rc_last_access
3019 	    > rfs4_lease_time)) {
3020 		rfs4_dbe_invalidate(dsp->rds_dbe);
3021 		return (TRUE);
3022 	}
3023 
3024 	return (FALSE);
3025 }
3026 
3027 static bool_t
rfs4_deleg_state_create(rfs4_entry_t u_entry,void * argp)3028 rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
3029 {
3030 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3031 	rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->rds_finfo;
3032 	rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->rds_client;
3033 
3034 	rfs4_dbe_hold(fp->rf_dbe);
3035 	rfs4_dbe_hold(cp->rc_dbe);
3036 
3037 	dsp->rds_delegid = get_stateid(rfs4_dbe_getid(dsp->rds_dbe));
3038 	dsp->rds_delegid.bits.type = DELEGID;
3039 	dsp->rds_finfo = fp;
3040 	dsp->rds_client = cp;
3041 	dsp->rds_dtype = OPEN_DELEGATE_NONE;
3042 
3043 	dsp->rds_time_granted = gethrestime_sec();	/* observability */
3044 	dsp->rds_time_revoked = 0;
3045 
3046 	list_link_init(&dsp->rds_node);
3047 
3048 	return (TRUE);
3049 }
3050 
3051 static void
rfs4_deleg_state_destroy(rfs4_entry_t u_entry)3052 rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
3053 {
3054 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3055 
3056 	/* return delegation if necessary */
3057 	rfs4_return_deleg(dsp, FALSE);
3058 
3059 	/* Were done with the file */
3060 	rfs4_file_rele(dsp->rds_finfo);
3061 	dsp->rds_finfo = NULL;
3062 
3063 	/* And now with the openowner */
3064 	rfs4_client_rele(dsp->rds_client);
3065 	dsp->rds_client = NULL;
3066 }
3067 
3068 rfs4_deleg_state_t *
rfs4_finddeleg(rfs4_state_t * sp,bool_t * create)3069 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
3070 {
3071 	rfs4_deleg_state_t ds, *dsp;
3072 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
3073 
3074 	ds.rds_client = sp->rs_owner->ro_client;
3075 	ds.rds_finfo = sp->rs_finfo;
3076 
3077 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(nsrv4->rfs4_deleg_idx, &ds,
3078 	    create, &ds, RFS4_DBS_VALID);
3079 
3080 	return (dsp);
3081 }
3082 
3083 rfs4_deleg_state_t *
rfs4_finddelegstate(stateid_t * id)3084 rfs4_finddelegstate(stateid_t *id)
3085 {
3086 	rfs4_deleg_state_t *dsp;
3087 	bool_t create = FALSE;
3088 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
3089 
3090 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(nsrv4->rfs4_deleg_state_idx,
3091 	    id, &create, NULL, RFS4_DBS_VALID);
3092 
3093 	return (dsp);
3094 }
3095 
3096 void
rfs4_deleg_state_rele(rfs4_deleg_state_t * dsp)3097 rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
3098 {
3099 	rfs4_dbe_rele(dsp->rds_dbe);
3100 }
3101 
3102 void
rfs4_update_lock_sequence(rfs4_lo_state_t * lsp)3103 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
3104 {
3105 
3106 	rfs4_dbe_lock(lsp->rls_dbe);
3107 
3108 	/*
3109 	 * If we are skipping sequence id checking, this means that
3110 	 * this is the first lock request and therefore the sequence
3111 	 * id does not need to be updated.  This only happens on the
3112 	 * first lock request for a lockowner
3113 	 */
3114 	if (!lsp->rls_skip_seqid_check)
3115 		lsp->rls_seqid++;
3116 
3117 	rfs4_dbe_unlock(lsp->rls_dbe);
3118 }
3119 
3120 void
rfs4_update_lock_resp(rfs4_lo_state_t * lsp,nfs_resop4 * resp)3121 rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
3122 {
3123 
3124 	rfs4_dbe_lock(lsp->rls_dbe);
3125 
3126 	rfs4_free_reply(&lsp->rls_reply);
3127 
3128 	rfs4_copy_reply(&lsp->rls_reply, resp);
3129 
3130 	rfs4_dbe_unlock(lsp->rls_dbe);
3131 }
3132 
3133 void
rfs4_free_opens(rfs4_openowner_t * oo,bool_t invalidate,bool_t close_of_client)3134 rfs4_free_opens(rfs4_openowner_t *oo, bool_t invalidate,
3135     bool_t close_of_client)
3136 {
3137 	rfs4_state_t *sp;
3138 
3139 	rfs4_dbe_lock(oo->ro_dbe);
3140 
3141 	for (sp = list_head(&oo->ro_statelist); sp != NULL;
3142 	    sp = list_next(&oo->ro_statelist, sp)) {
3143 		rfs4_state_close(sp, FALSE, close_of_client, CRED());
3144 		if (invalidate == TRUE)
3145 			rfs4_dbe_invalidate(sp->rs_dbe);
3146 	}
3147 
3148 	rfs4_dbe_invalidate(oo->ro_dbe);
3149 	rfs4_dbe_unlock(oo->ro_dbe);
3150 }
3151 
3152 static uint32_t
state_owner_file_hash(void * key)3153 state_owner_file_hash(void *key)