xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_db.c (revision 0dfe541e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2018 Nexenta Systems, Inc.
28  */
29 
30 #include <sys/systm.h>
31 #include <sys/cmn_err.h>
32 #include <sys/kmem.h>
33 #include <sys/disp.h>
34 #include <sys/id_space.h>
35 #include <sys/atomic.h>
36 #include <rpc/rpc.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfs4_db_impl.h>
39 #include <sys/sdt.h>
40 
/*
 * Upper bound on a table's reap interval.  rfs4_table_create() and
 * rfs4_dbe_tabreap_adjust() use the smaller of this value and the
 * table's maximum cache time as the base interval.
 */
static int rfs4_reap_interval = RFS4_REAP_INTERVAL;

/* Forward declarations for file-local routines defined further down. */
static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
static void rfs4_dbe_destroy(rfs4_dbe_t *);
static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
static void rfs4_start_reaper(rfs4_table_t *);

/*
 * t_lowat - integer percentage of table entries	/etc/system only
 * t_hiwat - integer percentage of table entries	/etc/system only
 * t_lreap - integer percentage of table reap time	mdb or /etc/system
 * t_hreap - integer percentage of table reap time	mdb or /etc/system
 *
 * The watermarks are turned into per-table entry counts once, when a
 * table is created; the reap percentages are re-read each time the
 * reap interval is adjusted.
 */
uint32_t	t_lowat = 50;	/* reap at t_lreap when id's in use hit 50% */
uint32_t	t_hiwat = 75;	/* reap at t_hreap when id's in use hit 75% */
time_t		t_lreap = 50;	/* default to 50% of table's reap interval */
time_t		t_hreap = 10;	/* default to 10% of table's reap interval */
58 
59 id_t
60 rfs4_dbe_getid(rfs4_dbe_t *entry)
61 {
62 	return (entry->dbe_id);
63 }
64 
65 void
66 rfs4_dbe_hold(rfs4_dbe_t *entry)
67 {
68 	atomic_inc_32(&entry->dbe_refcnt);
69 }
70 
71 /*
72  * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
73  */
74 void
75 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
76 {
77 	atomic_dec_32(&entry->dbe_refcnt);
78 }
79 
80 
81 uint32_t
82 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
83 {
84 	return (entry->dbe_refcnt);
85 }
86 
87 /*
88  * Mark an entry such that the dbsearch will skip it.
89  * Caller does not want this entry to be found any longer
90  */
91 void
92 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
93 {
94 	entry->dbe_invalid = TRUE;
95 	entry->dbe_skipsearch = TRUE;
96 }
97 
98 /*
99  * Is this entry invalid?
100  */
101 bool_t
102 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
103 {
104 	return (entry->dbe_invalid);
105 }
106 
107 time_t
108 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
109 {
110 	return (entry->dbe_time_rele);
111 }
112 
113 /*
114  * Use these to temporarily hide/unhide a db entry.
115  */
116 void
117 rfs4_dbe_hide(rfs4_dbe_t *entry)
118 {
119 	rfs4_dbe_lock(entry);
120 	entry->dbe_skipsearch = TRUE;
121 	rfs4_dbe_unlock(entry);
122 }
123 
124 void
125 rfs4_dbe_unhide(rfs4_dbe_t *entry)
126 {
127 	rfs4_dbe_lock(entry);
128 	entry->dbe_skipsearch = FALSE;
129 	rfs4_dbe_unlock(entry);
130 }
131 
132 void
133 rfs4_dbe_rele(rfs4_dbe_t *entry)
134 {
135 	mutex_enter(entry->dbe_lock);
136 	ASSERT(entry->dbe_refcnt > 1);
137 	atomic_dec_32(&entry->dbe_refcnt);
138 	entry->dbe_time_rele = gethrestime_sec();
139 	mutex_exit(entry->dbe_lock);
140 }
141 
142 void
143 rfs4_dbe_lock(rfs4_dbe_t *entry)
144 {
145 	mutex_enter(entry->dbe_lock);
146 }
147 
148 void
149 rfs4_dbe_unlock(rfs4_dbe_t *entry)
150 {
151 	mutex_exit(entry->dbe_lock);
152 }
153 
154 bool_t
155 rfs4_dbe_islocked(rfs4_dbe_t *entry)
156 {
157 	return (mutex_owned(entry->dbe_lock));
158 }
159 
160 clock_t
161 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
162 {
163 	return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
164 }
165 
166 void
167 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
168 {
169 	cv_broadcast(entry->dbe_cv);
170 }
171 
172 /* ARGSUSED */
173 static int
174 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
175 {
176 	rfs4_dbe_t *entry = obj;
177 
178 	mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
179 	cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
180 
181 	return (0);
182 }
183 
184 static void
185 rfs4_dbe_kmem_destructor(void *obj, void *private)
186 {
187 	rfs4_dbe_t *entry = obj;
188 	/*LINTED*/
189 	rfs4_table_t *table = private;
190 
191 	mutex_destroy(entry->dbe_lock);
192 	cv_destroy(entry->dbe_cv);
193 }
194 
195 rfs4_database_t *
196 rfs4_database_create(uint32_t flags)
197 {
198 	rfs4_database_t *db;
199 
200 	db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
201 	mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
202 	db->db_tables = NULL;
203 	db->db_debug_flags = flags;
204 	db->db_shutdown_count = 0;
205 	cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
206 	return (db);
207 }
208 
209 
210 /*
211  * The reaper threads that have been created for the tables in this
212  * database must be stopped and the entries in the tables released.
213  * Each table will be marked as "shutdown" and the reaper threads
214  * poked and they will see that a shutdown is in progress and cleanup
215  * and exit.  This function waits for all reaper threads to stop
216  * before returning to the caller.
217  */
218 void
219 rfs4_database_shutdown(rfs4_database_t *db)
220 {
221 	rfs4_table_t *table;
222 
223 	mutex_enter(db->db_lock);
224 	for (table = db->db_tables; table; table = table->dbt_tnext) {
225 		mutex_enter(&table->dbt_reaper_cv_lock);
226 		table->dbt_reaper_shutdown = TRUE;
227 		cv_broadcast(&table->dbt_reaper_wait);
228 		db->db_shutdown_count++;
229 		mutex_exit(&table->dbt_reaper_cv_lock);
230 	}
231 	while (db->db_shutdown_count > 0) {
232 		cv_wait(&db->db_shutdown_wait, db->db_lock);
233 	}
234 	mutex_exit(db->db_lock);
235 }
236 
237 /*
238  * Given a database that has been "shutdown" by the function above all
239  * of the table tables are destroyed and then the database itself
240  * freed.
241  */
242 void
243 rfs4_database_destroy(rfs4_database_t *db)
244 {
245 	rfs4_table_t *next, *tmp;
246 
247 	for (next = db->db_tables; next; ) {
248 		tmp = next;
249 		next = tmp->dbt_tnext;
250 		rfs4_table_destroy(db, tmp);
251 	}
252 
253 	mutex_destroy(db->db_lock);
254 	kmem_free(db, sizeof (rfs4_database_t));
255 }
256 
257 /*
258  * Used to get the correct kmem_cache database for the state table being
259  * created.
260  * Helper function for rfs4_table_create
261  */
262 static kmem_cache_t *
263 get_db_mem_cache(char *name)
264 {
265 	int i;
266 
267 	for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
268 		if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
269 			return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
270 	}
271 	/*
272 	 * There is no associated kmem cache for this NFS4 server state
273 	 * table name
274 	 */
275 	return (NULL);
276 }
277 
278 /*
279  * Used to initialize the global NFSv4 server state database.
280  * Helper funtion for rfs4_state_g_init and called when module is loaded.
281  */
282 kmem_cache_t *
283 /* CSTYLED */
284 nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
285 {
286 	kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
287 	    sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
288 	    0,
289 	    rfs4_dbe_kmem_constructor,
290 	    rfs4_dbe_kmem_destructor,
291 	    NULL,
292 	    NULL,
293 	    NULL,
294 	    0);
295 	(void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
296 	    strlen(cache_name) + 1);
297 	rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
298 	return (mem_cache);
299 }
300 
301 rfs4_table_t *
302 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
303     uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
304     void (*destroy)(rfs4_entry_t),
305     bool_t (*expiry)(rfs4_entry_t),
306     uint32_t size, uint32_t hashsize,
307     uint32_t maxentries, id_t start)
308 {
309 	rfs4_table_t	*table;
310 	int		 len;
311 	char		*cache_name;
312 	char		*id_name;
313 
314 	table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
315 	table->dbt_db = db;
316 	rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
317 	mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
318 	mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
319 	cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
320 
321 	len = strlen(tabname);
322 	table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
323 	cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
324 	(void) strcpy(table->dbt_name, tabname);
325 	(void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
326 	table->dbt_max_cache_time = max_cache_time;
327 	table->dbt_usize = size;
328 	table->dbt_len = hashsize;
329 	table->dbt_count = 0;
330 	table->dbt_idxcnt = 0;
331 	table->dbt_ccnt = 0;
332 	table->dbt_maxcnt = idxcnt;
333 	table->dbt_indices = NULL;
334 	table->dbt_id_space = NULL;
335 	table->dbt_reaper_shutdown = FALSE;
336 
337 	if (start >= 0) {
338 		if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
339 			maxentries = INT32_MAX - start;
340 		id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
341 		(void) sprintf(id_name, "%s_id_space", table->dbt_name);
342 		table->dbt_id_space = id_space_create(id_name, start,
343 		    maxentries + start);
344 		kmem_free(id_name, len + 10);
345 	}
346 	ASSERT(t_lowat != 0);
347 	table->dbt_id_lwat = (maxentries * t_lowat) / 100;
348 	ASSERT(t_hiwat != 0);
349 	table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
350 	table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
351 	table->dbt_maxentries = maxentries;
352 	table->dbt_create = create;
353 	table->dbt_destroy = destroy;
354 	table->dbt_expiry = expiry;
355 
356 	/*
357 	 * get the correct kmem_cache for this table type based on the name.
358 	 */
359 	table->dbt_mem_cache = get_db_mem_cache(cache_name);
360 
361 	kmem_free(cache_name, len+13);
362 
363 	table->dbt_debug = db->db_debug_flags;
364 
365 	mutex_enter(db->db_lock);
366 	table->dbt_tnext = db->db_tables;
367 	db->db_tables = table;
368 	mutex_exit(db->db_lock);
369 
370 	rfs4_start_reaper(table);
371 
372 	return (table);
373 }
374 
375 void
376 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
377 {
378 	rfs4_table_t *p;
379 	rfs4_index_t *idx;
380 
381 	ASSERT(table->dbt_count == 0);
382 
383 	mutex_enter(db->db_lock);
384 	if (table == db->db_tables)
385 		db->db_tables = table->dbt_tnext;
386 	else {
387 		for (p = db->db_tables; p; p = p->dbt_tnext)
388 			if (p->dbt_tnext == table) {
389 				p->dbt_tnext = table->dbt_tnext;
390 				table->dbt_tnext = NULL;
391 				break;
392 			}
393 		ASSERT(p != NULL);
394 	}
395 	mutex_exit(db->db_lock);
396 
397 	/* Destroy indices */
398 	while (table->dbt_indices) {
399 		idx = table->dbt_indices;
400 		table->dbt_indices = idx->dbi_inext;
401 		rfs4_index_destroy(idx);
402 	}
403 
404 	rw_destroy(table->dbt_t_lock);
405 	mutex_destroy(table->dbt_lock);
406 	mutex_destroy(&table->dbt_reaper_cv_lock);
407 	cv_destroy(&table->dbt_reaper_wait);
408 
409 	kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
410 	if (table->dbt_id_space)
411 		id_space_destroy(table->dbt_id_space);
412 	table->dbt_mem_cache = NULL;
413 	kmem_free(table, sizeof (rfs4_table_t));
414 }
415 
416 rfs4_index_t *
417 rfs4_index_create(rfs4_table_t *table, char *keyname,
418     uint32_t (*hash)(void *),
419     bool_t (compare)(rfs4_entry_t, void *),
420     void *(*mkkey)(rfs4_entry_t),
421     bool_t createable)
422 {
423 	rfs4_index_t *idx;
424 
425 	ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
426 
427 	idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
428 
429 	idx->dbi_table = table;
430 	idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
431 	(void) strcpy(idx->dbi_keyname, keyname);
432 	idx->dbi_hash = hash;
433 	idx->dbi_compare = compare;
434 	idx->dbi_mkkey = mkkey;
435 	idx->dbi_tblidx = table->dbt_idxcnt;
436 	table->dbt_idxcnt++;
437 	if (createable) {
438 		table->dbt_ccnt++;
439 		if (table->dbt_ccnt > 1)
440 			panic("Table %s currently can have only have one "
441 			    "index that will allow creation of entries",
442 			    table->dbt_name);
443 		idx->dbi_createable = TRUE;
444 	} else {
445 		idx->dbi_createable = FALSE;
446 	}
447 
448 	idx->dbi_inext = table->dbt_indices;
449 	table->dbt_indices = idx;
450 	idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
451 	    KM_SLEEP);
452 
453 	return (idx);
454 }
455 
456 void
457 rfs4_index_destroy(rfs4_index_t *idx)
458 {
459 	kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
460 	kmem_free(idx->dbi_buckets,
461 	    sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
462 	kmem_free(idx, sizeof (rfs4_index_t));
463 }
464 
465 static void
466 rfs4_dbe_destroy(rfs4_dbe_t *entry)
467 {
468 	rfs4_index_t *idx;
469 	void *key;
470 	int i;
471 	rfs4_bucket_t *bp;
472 	rfs4_table_t *table = entry->dbe_table;
473 	rfs4_link_t *l;
474 
475 	NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
476 	    (CE_NOTE, "Destroying entry %p from %s",
477 	    (void*)entry, table->dbt_name));
478 
479 	mutex_enter(entry->dbe_lock);
480 	ASSERT(entry->dbe_refcnt == 0);
481 	mutex_exit(entry->dbe_lock);
482 
483 	/* Unlink from all indices */
484 	for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
485 		l = &entry->dbe_indices[idx->dbi_tblidx];
486 		/* check and see if we were ever linked in to the index */
487 		if (INVALID_LINK(l)) {
488 			ASSERT(l->next == NULL && l->prev == NULL);
489 			continue;
490 		}
491 		key = idx->dbi_mkkey(entry->dbe_data);
492 		i = HASH(idx, key);
493 		bp = &idx->dbi_buckets[i];
494 		ASSERT(bp->dbk_head != NULL);
495 		DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
496 	}
497 
498 	/* Destroy user data */
499 	if (table->dbt_destroy)
500 		(*table->dbt_destroy)(entry->dbe_data);
501 
502 	if (table->dbt_id_space)
503 		id_free(table->dbt_id_space, entry->dbe_id);
504 
505 	mutex_enter(table->dbt_lock);
506 	table->dbt_count--;
507 	mutex_exit(table->dbt_lock);
508 
509 	/* Destroy the entry itself */
510 	kmem_cache_free(table->dbt_mem_cache, entry);
511 }
512 
513 
514 static rfs4_dbe_t *
515 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
516 {
517 	rfs4_dbe_t *entry;
518 	int i;
519 
520 	NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
521 	    (CE_NOTE, "Creating entry in table %s", table->dbt_name));
522 
523 	entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
524 
525 	entry->dbe_refcnt = 1;
526 	entry->dbe_invalid = FALSE;
527 	entry->dbe_skipsearch = FALSE;
528 	entry->dbe_time_rele = 0;
529 	entry->dbe_id = 0;
530 
531 	if (table->dbt_id_space)
532 		entry->dbe_id = id;
533 	entry->dbe_table = table;
534 
535 	for (i = 0; i < table->dbt_maxcnt; i++) {
536 		entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
537 		entry->dbe_indices[i].entry = entry;
538 		/*
539 		 * We mark the entry as not indexed by setting the low
540 		 * order bit, since address are word aligned. This has
541 		 * the advantage of causeing a trap if the address is
542 		 * used. After the entry is linked in to the
543 		 * corresponding index the bit will be cleared.
544 		 */
545 		INVALIDATE_ADDR(entry->dbe_indices[i].entry);
546 	}
547 
548 	entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
549 	bzero(entry->dbe_data, table->dbt_usize);
550 	entry->dbe_data->dbe = entry;
551 
552 	if (!(*table->dbt_create)(entry->dbe_data, data)) {
553 		kmem_cache_free(table->dbt_mem_cache, entry);
554 		return (NULL);
555 	}
556 
557 	mutex_enter(table->dbt_lock);
558 	table->dbt_count++;
559 	mutex_exit(table->dbt_lock);
560 
561 	return (entry);
562 }
563 
564 static void
565 rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
566 {
567 	clock_t		tabreap;
568 	clock_t		reap_int;
569 	uint32_t	in_use;
570 
571 	/*
572 	 * Adjust the table's reap interval based on the
573 	 * number of id's currently in use. Each table's
574 	 * default remains the same if id usage subsides.
575 	 */
576 	ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
577 	tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
578 
579 	in_use = table->dbt_count + 1;	/* see rfs4_dbe_create */
580 	if (in_use >= table->dbt_id_hwat) {
581 		ASSERT(t_hreap != 0);
582 		reap_int = (tabreap * t_hreap) / 100;
583 	} else if (in_use >= table->dbt_id_lwat) {
584 		ASSERT(t_lreap != 0);
585 		reap_int = (tabreap * t_lreap) / 100;
586 	} else {
587 		reap_int = tabreap;
588 	}
589 	table->dbt_id_reap = reap_int;
590 	DTRACE_PROBE2(table__reap__interval, char *,
591 	    table->dbt_name, time_t, table->dbt_id_reap);
592 }
593 
/*
 * Look up an entry by key in the given index, optionally creating it.
 *
 *	idx		index to search; its hash and compare callbacks
 *			are used to locate the entry
 *	key		key handed to the hash and compare callbacks
 *	create		in: whether a missing entry may be created;
 *			out: set to FALSE when an existing entry is
 *			returned, left untouched otherwise
 *	arg		passed through to the table's create callback
 *	dbsearch_type	RFS4_DBS_INVALID also matches entries that are
 *			marked to be skipped by searches
 *
 * Returns the user data of the matching or newly created entry, or
 * NULL when nothing matched and no entry was (or could be) created.
 * A returned existing entry carries one additional hold.  A newly
 * created entry has a reference count of two: the one it is created
 * with plus one for its presence in the table.
 */
rfs4_entry_t
rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
    rfs4_dbsearch_type_t dbsearch_type)
{
	int		 already_done;
	uint32_t	 i;
	rfs4_table_t	*table = idx->dbi_table;
	rfs4_index_t	*ip;
	rfs4_bucket_t	*bp;
	rfs4_link_t	*l;
	rfs4_dbe_t	*entry;
	id_t		 id = -1;	/* -1: no id allocated yet */

	i = HASH(idx, key);
	bp = &idx->dbi_buckets[i];

	NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
	    (CE_NOTE, "Searching for key %p in table %s by %s",
	    key, table->dbt_name, idx->dbi_keyname));

	rw_enter(bp->dbk_lock, RW_READER);
	/*
	 * The chain is rescanned from here every time the bucket lock
	 * has been dropped and re-taken, since another thread may have
	 * inserted the entry in the meantime.
	 */
retry:
	for (l = bp->dbk_head; l; l = l->next) {
		if (l->entry->dbe_refcnt > 0 &&
		    (l->entry->dbe_skipsearch == FALSE ||
		    (l->entry->dbe_skipsearch == TRUE &&
		    dbsearch_type == RFS4_DBS_INVALID)) &&
		    (*idx->dbi_compare)(l->entry->dbe_data, key)) {
			/*
			 * Re-check the reference count under the entry
			 * lock; the test above was made without it.
			 */
			mutex_enter(l->entry->dbe_lock);
			if (l->entry->dbe_refcnt == 0) {
				mutex_exit(l->entry->dbe_lock);
				continue;
			}

			/* place an additional hold since we are returning */
			rfs4_dbe_hold(l->entry);

			mutex_exit(l->entry->dbe_lock);
			rw_exit(bp->dbk_lock);

			*create = FALSE;

			NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
			    (CE_NOTE, "Found entry %p for %p in table %s",
			    (void *)l->entry, key, table->dbt_name));

			/* an id obtained on an earlier pass is not needed */
			if (id != -1)
				id_free(table->dbt_id_space, id);
			return (l->entry->dbe_data);
		}
	}

	/*
	 * Not found.  Give up unless the caller asked for creation, the
	 * table has a create callback, this index allows creation and
	 * the table is not already at its maximum entry count.
	 */
	if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
	    table->dbt_maxentries == table->dbt_count) {
		NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
		    (CE_NOTE, "Entry for %p in %s not found",
		    key, table->dbt_name));

		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);
		return (NULL);
	}

	/*
	 * The id is allocated with the bucket lock dropped, after which
	 * the lock is re-taken as writer and the chain is searched again.
	 */
	if (table->dbt_id_space && id == -1) {
		rw_exit(bp->dbk_lock);

		/* get an id, ok to sleep for it here */
		id = id_alloc(table->dbt_id_space);
		ASSERT(id != -1);

		mutex_enter(&table->dbt_reaper_cv_lock);
		rfs4_dbe_tabreap_adjust(table);
		mutex_exit(&table->dbt_reaper_cv_lock);

		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* get an exclusive lock on the bucket */
	if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
		NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
		    (CE_NOTE, "Trying to upgrade lock on "
		    "hash chain %d (%p) for  %s by %s",
		    i, (void*)bp, table->dbt_name, idx->dbi_keyname));

		/* upgrade failed: drop, re-take as writer and search again */
		rw_exit(bp->dbk_lock);
		rw_enter(bp->dbk_lock, RW_WRITER);
		goto retry;
	}

	/* create entry */
	entry = rfs4_dbe_create(table, id, arg);
	if (entry == NULL) {
		rw_exit(bp->dbk_lock);
		if (id != -1)
			id_free(table->dbt_id_space, id);

		NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
		    (CE_NOTE, "Constructor for table %s failed",
		    table->dbt_name));
		return (NULL);
	}

	/*
	 * Add one ref for entry into table's hash - only one
	 * reference added even though there may be multiple indices
	 */
	rfs4_dbe_hold(entry);
	ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
	VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);

	already_done = idx->dbi_tblidx;
	rw_exit(bp->dbk_lock);

	/*
	 * Link the new entry into the table's remaining indices, using
	 * each index's own key as derived from the entry's data.
	 */
	for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
		if (ip->dbi_tblidx == already_done)
			continue;
		l = &entry->dbe_indices[ip->dbi_tblidx];
		i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
		ASSERT(i < ip->dbi_table->dbt_len);
		bp = &ip->dbi_buckets[i];
		ENQUEUE_IDX(bp, l);
	}

	NFS4_DEBUG(
	    table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
	    (CE_NOTE, "Entry %p created for %s = %p in table %s",
	    (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));

	return (entry->dbe_data);
}
726 
727 /*ARGSUSED*/
728 boolean_t
729 rfs4_cpr_callb(void *arg, int code)
730 {
731 	rfs4_bucket_t *buckets, *bp;
732 	rfs4_link_t *l;
733 	rfs4_client_t *cp;
734 	int i;
735 
736 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
737 	rfs4_table_t *table = nsrv4->rfs4_client_tab;
738 
739 	/*
740 	 * We get called for Suspend and Resume events.
741 	 * For the suspend case we simply don't care!  Nor do we care if
742 	 * there are no clients.
743 	 */
744 	if (code == CB_CODE_CPR_CHKPT || table == NULL) {
745 		return (B_TRUE);
746 	}
747 
748 	buckets = table->dbt_indices->dbi_buckets;
749 
750 	/*
751 	 * When we get this far we are in the process of
752 	 * resuming the system from a previous suspend.
753 	 *
754 	 * We are going to blast through and update the
755 	 * last_access time for all the clients and in
756 	 * doing so extend them by one lease period.
757 	 */
758 	for (i = 0; i < table->dbt_len; i++) {
759 		bp = &buckets[i];
760 		for (l = bp->dbk_head; l; l = l->next) {
761 			cp = (rfs4_client_t *)l->entry->dbe_data;
762 			cp->rc_last_access = gethrestime_sec();
763 		}
764 	}
765 
766 	return (B_TRUE);
767 }
768 
769 /*
770  * Given a table, lock each of the buckets and walk all entries (in
771  * turn locking those) and calling the provided "callout" function
772  * with the provided parameter.  Obviously used to iterate across all
773  * entries in a particular table via the database locking hierarchy.
774  * Obviously the caller must not hold locks on any of the entries in
775  * the specified table.
776  */
777 void
778 rfs4_dbe_walk(rfs4_table_t *table,
779     void (*callout)(rfs4_entry_t, void *),
780     void *data)
781 {
782 	rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
783 	rfs4_link_t *l;
784 	rfs4_dbe_t *entry;
785 	int i;
786 
787 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
788 	    (CE_NOTE, "Walking entries in %s", table->dbt_name));
789 
790 	/* Walk the buckets looking for entries to release/destroy */
791 	for (i = 0; i < table->dbt_len; i++) {
792 		bp = &buckets[i];
793 		rw_enter(bp->dbk_lock, RW_READER);
794 		for (l = bp->dbk_head; l; l = l->next) {
795 			entry = l->entry;
796 			mutex_enter(entry->dbe_lock);
797 			(*callout)(entry->dbe_data, data);
798 			mutex_exit(entry->dbe_lock);
799 		}
800 		rw_exit(bp->dbk_lock);
801 	}
802 
803 	NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
804 	    (CE_NOTE, "Walking entries complete %s", table->dbt_name));
805 }
806 
807 
/*
 * Reap entries from a table.
 *
 *	table		table to reap; the buckets of its first index
 *			are walked
 *	cache_time	only used in the debug messages here
 *	desired		stop after this many entries have been reaped;
 *			zero means no limit.  Ignored during shutdown.
 *
 * Each bucket is handled in two passes.  Under the read lock, entries
 * whose only reference is the table's own and which are expired (or
 * the table has no expiry callback, or is shutting down) have their
 * reference count dropped to zero.  If any were found the lock is made
 * exclusive and every zero-reference entry is unlinked and destroyed.
 * During a table shutdown a bucket is revisited until it is empty.
 */
static void
rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
{
	rfs4_index_t *idx = table->dbt_indices;
	rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
	rfs4_link_t *l, *t;
	rfs4_dbe_t *entry;
	bool_t found;
	int i;
	int count = 0;	/* entries marked for destruction so far */

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
	    desired, cache_time, table->dbt_name));

	/* Walk the buckets looking for entries to release/destroy */
	for (i = 0; i < table->dbt_len; i++) {
		bp = &buckets[i];
		do {
			found = FALSE;
			rw_enter(bp->dbk_lock, RW_READER);
			for (l = bp->dbk_head; l; l = l->next) {
				entry = l->entry;
				/*
				 * Examine an entry.  Ref count of 1 means
				 * that the only reference is for the hash
				 * table reference.
				 */
				if (entry->dbe_refcnt != 1)
					continue;
				/* re-check under the entry lock */
				mutex_enter(entry->dbe_lock);
				if ((entry->dbe_refcnt == 1) &&
				    (table->dbt_reaper_shutdown ||
				    table->dbt_expiry == NULL ||
				    (*table->dbt_expiry)(entry->dbe_data))) {
					entry->dbe_refcnt--;
					count++;
					found = TRUE;
				}
				mutex_exit(entry->dbe_lock);
			}
			if (found) {
				/*
				 * Unlinking needs the bucket exclusively;
				 * if the upgrade fails, drop the lock and
				 * take it again as writer.
				 */
				if (!rw_tryupgrade(bp->dbk_lock)) {
					rw_exit(bp->dbk_lock);
					rw_enter(bp->dbk_lock, RW_WRITER);
				}

				l = bp->dbk_head;
				while (l) {
					/*
					 * Step to the next link first; the
					 * current one may be unlinked and
					 * its entry freed below.
					 */
					t = l;
					entry = t->entry;
					l = l->next;
					if (entry->dbe_refcnt == 0) {
						DEQUEUE(bp->dbk_head, t);
						t->next = NULL;
						t->prev = NULL;
						INVALIDATE_ADDR(t->entry);
						rfs4_dbe_destroy(entry);
					}
				}
			}
			rw_exit(bp->dbk_lock);
			/*
			 * delay slightly if there is more work to do
			 * with the expectation that other reaper
			 * threads are freeing data structures as well
			 * and in turn will reduce ref counts on
			 * entries in this table allowing them to be
			 * released.  This is only done in the
			 * instance that the tables are being shut down.
			 */
			if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
				delay(hz/100);
		/*
		 * If this is a table shutdown, keep going until
		 * everything is gone
		 */
		} while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);

		/* outside of shutdown, stop once the quota has been met */
		if (!table->dbt_reaper_shutdown && desired && count >= desired)
			break;
	}

	NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
	    (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
	    count, cache_time, table->dbt_name));
}
895 
/*
 * Body of a table's reaper thread; arg is the rfs4_table_t to reap.
 *
 * The thread sleeps on the table's reaper condition variable for the
 * table's current reap interval and then reaps the table, over and
 * over, until the wait returns zero or the table is flagged for
 * shutdown.  On the way out it decrements the database's shutdown
 * count and signals db_shutdown_wait, then exits.
 */
static void
reaper_thread(caddr_t *arg)
{
	rfs4_table_t	*table = (rfs4_table_t *)arg;
	clock_t		 rc;

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));

	CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
	    callb_generic_cpr, "nfsv4Reaper");

	mutex_enter(&table->dbt_reaper_cv_lock);
	do {
		/* the wait is bracketed as a CPR-safe region */
		CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
		rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
		    &table->dbt_reaper_cv_lock,
		    SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
		CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
		    &table->dbt_reaper_cv_lock);
		/* reap without an entry limit (desired == 0) */
		rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
	/*
	 * NOTE(review): the loop also ends when the wait returns 0
	 * (presumably an interrupting signal - confirm); the shutdown
	 * count is still decremented below in that case even though
	 * rfs4_database_shutdown() may not have counted this table yet.
	 */
	} while (rc != 0 && table->dbt_reaper_shutdown == FALSE);

	/* presumably also releases dbt_reaper_cv_lock - TODO confirm */
	CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);

	NFS4_DEBUG(table->dbt_debug,
	    (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));

	/* Notify the database shutdown processing that the table is shutdown */
	mutex_enter(table->dbt_db->db_lock);
	table->dbt_db->db_shutdown_count--;
	cv_signal(&table->dbt_db->db_shutdown_wait);
	mutex_exit(table->dbt_db->db_lock);
	zthread_exit();
}
931 
932 static void
933 rfs4_start_reaper(rfs4_table_t *table)
934 {
935 	if (table->dbt_max_cache_time == 0)
936 		return;
937 
938 	(void) zthread_create(NULL, 0, reaper_thread, table, 0,
939 	    minclsyspri);
940 }
941 
#ifdef DEBUG
/*
 * Debug aid: log the entry's address, the name of the table it belongs
 * to, its reference count and its id.
 */
void
rfs4_dbe_debug(rfs4_dbe_t *entry)
{
	rfs4_table_t *owner = entry->dbe_table;

	cmn_err(CE_NOTE, "Entry %p from table %s",
	    (void *)entry, owner->dbt_name);
	cmn_err(CE_CONT, "\trefcnt = %d id = %d",
	    entry->dbe_refcnt, entry->dbe_id);
}
#endif
952