1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright 2018 Nexenta Systems, Inc.
28 */
29
30 #include <sys/systm.h>
31 #include <sys/cmn_err.h>
32 #include <sys/kmem.h>
33 #include <sys/disp.h>
34 #include <sys/id_space.h>
35 #include <sys/atomic.h>
36 #include <rpc/rpc.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfs4_db_impl.h>
39 #include <sys/sdt.h>
40
41 static int rfs4_reap_interval = RFS4_REAP_INTERVAL;
42
43 static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
44 static void rfs4_dbe_destroy(rfs4_dbe_t *);
45 static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
46 static void rfs4_start_reaper(rfs4_table_t *);
47
/*
 * Tunables controlling how aggressively a table is reaped as its entry
 * count grows.  All four values are integer percentages.
 *
 * t_lowat - percentage of table entries		/etc/system only
 * t_hiwat - percentage of table entries		/etc/system only
 * t_lreap - percentage of table reap time	mdb or /etc/system
 * t_hreap - percentage of table reap time	mdb or /etc/system
 */

/* Watermarks: fraction of a table's maximum entries. */
uint32_t t_lowat = 50;	/* reap at t_lreap when id's in use hit 50% */
uint32_t t_hiwat = 75;	/* reap at t_hreap when id's in use hit 75% */

/* Reap intervals: fraction of the table's default reap interval. */
time_t t_lreap = 50;	/* default to 50% of table's reap interval */
time_t t_hreap = 10;	/* default to 10% of table's reap interval */
58
59 id_t
rfs4_dbe_getid(rfs4_dbe_t * entry)60 rfs4_dbe_getid(rfs4_dbe_t *entry)
61 {
62 return (entry->dbe_id);
63 }
64
65 void
rfs4_dbe_hold(rfs4_dbe_t * entry)66 rfs4_dbe_hold(rfs4_dbe_t *entry)
67 {
68 atomic_inc_32(&entry->dbe_refcnt);
69 }
70
71 /*
72 * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
73 */
74 void
rfs4_dbe_rele_nolock(rfs4_dbe_t * entry)75 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
76 {
77 atomic_dec_32(&entry->dbe_refcnt);
78 }
79
80
81 uint32_t
rfs4_dbe_refcnt(rfs4_dbe_t * entry)82 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
83 {
84 return (entry->dbe_refcnt);
85 }
86
87 /*
88 * Mark an entry such that the dbsearch will skip it.
89 * Caller does not want this entry to be found any longer
90 */
91 void
rfs4_dbe_invalidate(rfs4_dbe_t * entry)92 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
93 {
94 entry->dbe_invalid = TRUE;
95 entry->dbe_skipsearch = TRUE;
96 }
97
98 /*
99 * Is this entry invalid?
100 */
101 bool_t
rfs4_dbe_is_invalid(rfs4_dbe_t * entry)102 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
103 {
104 return (entry->dbe_invalid);
105 }
106
107 time_t
rfs4_dbe_get_timerele(rfs4_dbe_t * entry)108 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
109 {
110 return (entry->dbe_time_rele);
111 }
112
113 /*
114 * Use these to temporarily hide/unhide a db entry.
115 */
116 void
rfs4_dbe_hide(rfs4_dbe_t * entry)117 rfs4_dbe_hide(rfs4_dbe_t *entry)
118 {
119 rfs4_dbe_lock(entry);
120 entry->dbe_skipsearch = TRUE;
121 rfs4_dbe_unlock(entry);
122 }
123
124 void
rfs4_dbe_unhide(rfs4_dbe_t * entry)125 rfs4_dbe_unhide(rfs4_dbe_t *entry)
126 {
127 rfs4_dbe_lock(entry);
128 entry->dbe_skipsearch = FALSE;
129 rfs4_dbe_unlock(entry);
130 }
131
132 void
rfs4_dbe_rele(rfs4_dbe_t * entry)133 rfs4_dbe_rele(rfs4_dbe_t *entry)
134 {
135 mutex_enter(entry->dbe_lock);
136 ASSERT(entry->dbe_refcnt > 1);
137 atomic_dec_32(&entry->dbe_refcnt);
138 entry->dbe_time_rele = gethrestime_sec();
139 mutex_exit(entry->dbe_lock);
140 }
141
142 void
rfs4_dbe_lock(rfs4_dbe_t * entry)143 rfs4_dbe_lock(rfs4_dbe_t *entry)
144 {
145 mutex_enter(entry->dbe_lock);
146 }
147
148 void
rfs4_dbe_unlock(rfs4_dbe_t * entry)149 rfs4_dbe_unlock(rfs4_dbe_t *entry)
150 {
151 mutex_exit(entry->dbe_lock);
152 }
153
154 bool_t
rfs4_dbe_islocked(rfs4_dbe_t * entry)155 rfs4_dbe_islocked(rfs4_dbe_t *entry)
156 {
157 return (mutex_owned(entry->dbe_lock));
158 }
159
160 clock_t
rfs4_dbe_twait(rfs4_dbe_t * entry,clock_t timeout)161 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
162 {
163 return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
164 }
165
166 void
rfs4_dbe_cv_broadcast(rfs4_dbe_t * entry)167 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
168 {
169 cv_broadcast(entry->dbe_cv);
170 }
171
172 /* ARGSUSED */
173 static int
rfs4_dbe_kmem_constructor(void * obj,void * private,int kmflag)174 rfs4_dbe_kmem_constructor(void *obj, void *private, int kmflag)
175 {
176 rfs4_dbe_t *entry = obj;
177
178 mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
179 cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
180
181 return (0);
182 }
183
184 static void
rfs4_dbe_kmem_destructor(void * obj,void * private)185 rfs4_dbe_kmem_destructor(void *obj, void *private)
186 {
187 rfs4_dbe_t *entry = obj;
188 /*LINTED*/
189 rfs4_table_t *table = private;
190
191 mutex_destroy(entry->dbe_lock);
192 cv_destroy(entry->dbe_cv);
193 }
194
195 rfs4_database_t *
rfs4_database_create(uint32_t flags)196 rfs4_database_create(uint32_t flags)
197 {
198 rfs4_database_t *db;
199
200 db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
201 mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
202 db->db_tables = NULL;
203 db->db_debug_flags = flags;
204 db->db_shutdown_count = 0;
205 cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
206 return (db);
207 }
208
209
210 /*
211 * The reaper threads that have been created for the tables in this
212 * database must be stopped and the entries in the tables released.
213 * Each table will be marked as "shutdown" and the reaper threads
214 * poked and they will see that a shutdown is in progress and cleanup
215 * and exit. This function waits for all reaper threads to stop
216 * before returning to the caller.
217 */
218 void
rfs4_database_shutdown(rfs4_database_t * db)219 rfs4_database_shutdown(rfs4_database_t *db)
220 {
221 rfs4_table_t *table;
222
223 mutex_enter(db->db_lock);
224 for (table = db->db_tables; table; table = table->dbt_tnext) {
225 mutex_enter(&table->dbt_reaper_cv_lock);
226 table->dbt_reaper_shutdown = TRUE;
227 cv_broadcast(&table->dbt_reaper_wait);
228 db->db_shutdown_count++;
229 mutex_exit(&table->dbt_reaper_cv_lock);
230 }
231 while (db->db_shutdown_count > 0) {
232 cv_wait(&db->db_shutdown_wait, db->db_lock);
233 }
234 mutex_exit(db->db_lock);
235 }
236
237 /*
238 * Given a database that has been "shutdown" by the function above all
239 * of the table tables are destroyed and then the database itself
240 * freed.
241 */
242 void
rfs4_database_destroy(rfs4_database_t * db)243 rfs4_database_destroy(rfs4_database_t *db)
244 {
245 rfs4_table_t *next, *tmp;
246
247 for (next = db->db_tables; next; ) {
248 tmp = next;
249 next = tmp->dbt_tnext;
250 rfs4_table_destroy(db, tmp);
251 }
252
253 mutex_destroy(db->db_lock);
254 kmem_free(db, sizeof (rfs4_database_t));
255 }
256
257 /*
258 * Used to get the correct kmem_cache database for the state table being
259 * created.
260 * Helper function for rfs4_table_create
261 */
262 static kmem_cache_t *
get_db_mem_cache(char * name)263 get_db_mem_cache(char *name)
264 {
265 int i;
266
267 for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
268 if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
269 return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
270 }
271 /*
272 * There is no associated kmem cache for this NFS4 server state
273 * table name
274 */
275 return (NULL);
276 }
277
278 /*
279 * Used to initialize the global NFSv4 server state database.
280 * Helper funtion for rfs4_state_g_init and called when module is loaded.
281 */
282 kmem_cache_t *
283 /* CSTYLED */
nfs4_init_mem_cache(char * cache_name,uint32_t idxcnt,uint32_t size,uint32_t idx)284 nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
285 {
286 kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
287 sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
288 0,
289 rfs4_dbe_kmem_constructor,
290 rfs4_dbe_kmem_destructor,
291 NULL,
292 NULL,
293 NULL,
294 0);
295 (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
296 strlen(cache_name) + 1);
297 rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
298 return (mem_cache);
299 }
300
301 rfs4_table_t *
rfs4_table_create(rfs4_database_t * db,char * tabname,time_t max_cache_time,uint32_t idxcnt,bool_t (* create)(rfs4_entry_t,void *),void (* destroy)(rfs4_entry_t),bool_t (* expiry)(rfs4_entry_t),uint32_t size,uint32_t hashsize,uint32_t maxentries,id_t start)302 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
303 uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
304 void (*destroy)(rfs4_entry_t),
305 bool_t (*expiry)(rfs4_entry_t),
306 uint32_t size, uint32_t hashsize,
307 uint32_t maxentries, id_t start)
308 {
309 rfs4_table_t *table;
310 int len;
311 char *cache_name;
312 char *id_name;
313
314 table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
315 table->dbt_db = db;
316 rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
317 mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
318 mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
319 cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
320
321 len = strlen(tabname);
322 table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
323 cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
324 (void) strcpy(table->dbt_name, tabname);
325 (void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
326 table->dbt_max_cache_time = max_cache_time;
327 table->dbt_usize = size;
328 table->dbt_len = hashsize;
329 table->dbt_count = 0;
330 table->dbt_idxcnt = 0;
331 table->dbt_ccnt = 0;
332 table->dbt_maxcnt = idxcnt;
333 table->dbt_indices = NULL;
334 table->dbt_id_space = NULL;
335 table->dbt_reaper_shutdown = FALSE;
336
337 if (start >= 0) {
338 if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
339 maxentries = INT32_MAX - start;
340 id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
341 (void) sprintf(id_name, "%s_id_space", table->dbt_name);
342 table->dbt_id_space = id_space_create(id_name, start,
343 maxentries + start);
344 kmem_free(id_name, len + 10);
345 }
346 ASSERT(t_lowat != 0);
347 table->dbt_id_lwat = (maxentries * t_lowat) / 100;
348 ASSERT(t_hiwat != 0);
349 table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
350 table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
351 table->dbt_maxentries = maxentries;
352 table->dbt_create = create;
353 table->dbt_destroy = destroy;
354 table->dbt_expiry = expiry;
355
356 /*
357 * get the correct kmem_cache for this table type based on the name.
358 */
359 table->dbt_mem_cache = get_db_mem_cache(cache_name);
360
361 kmem_free(cache_name, len+13);
362
363 table->dbt_debug = db->db_debug_flags;
364
365 mutex_enter(db->db_lock);
366 table->dbt_tnext = db->db_tables;
367 db->db_tables = table;
368 mutex_exit(db->db_lock);
369
370 rfs4_start_reaper(table);
371
372 return (table);
373 }
374
375 void
rfs4_table_destroy(rfs4_database_t * db,rfs4_table_t * table)376 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
377 {
378 rfs4_table_t *p;
379 rfs4_index_t *idx;
380
381 ASSERT(table->dbt_count == 0);
382
383 mutex_enter(db->db_lock);
384 if (table == db->db_tables)
385 db->db_tables = table->dbt_tnext;
386 else {
387 for (p = db->db_tables; p; p = p->dbt_tnext)
388 if (p->dbt_tnext == table) {
389 p->dbt_tnext = table->dbt_tnext;
390 table->dbt_tnext = NULL;
391 break;
392 }
393 ASSERT(p != NULL);
394 }
395 mutex_exit(db->db_lock);
396
397 /* Destroy indices */
398 while (table->dbt_indices) {
399 idx = table->dbt_indices;
400 table->dbt_indices = idx->dbi_inext;
401 rfs4_index_destroy(idx);
402 }
403
404 rw_destroy(table->dbt_t_lock);
405 mutex_destroy(table->dbt_lock);
406 mutex_destroy(&table->dbt_reaper_cv_lock);
407 cv_destroy(&table->dbt_reaper_wait);
408
409 kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
410 if (table->dbt_id_space)
411 id_space_destroy(table->dbt_id_space);
412 table->dbt_mem_cache = NULL;
413 kmem_free(table, sizeof (rfs4_table_t));
414 }
415
416 rfs4_index_t *
rfs4_index_create(rfs4_table_t * table,char * keyname,uint32_t (* hash)(void *),bool_t (compare)(rfs4_entry_t,void *),void * (* mkkey)(rfs4_entry_t),bool_t createable)417 rfs4_index_create(rfs4_table_t *table, char *keyname,
418 uint32_t (*hash)(void *),
419 bool_t (compare)(rfs4_entry_t, void *),
420 void *(*mkkey)(rfs4_entry_t),
421 bool_t createable)
422 {
423 rfs4_index_t *idx;
424
425 ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
426
427 idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
428
429 idx->dbi_table = table;
430 idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
431 (void) strcpy(idx->dbi_keyname, keyname);
432 idx->dbi_hash = hash;
433 idx->dbi_compare = compare;
434 idx->dbi_mkkey = mkkey;
435 idx->dbi_tblidx = table->dbt_idxcnt;
436 table->dbt_idxcnt++;
437 if (createable) {
438 table->dbt_ccnt++;
439 if (table->dbt_ccnt > 1)
440 panic("Table %s currently can have only have one "
441 "index that will allow creation of entries",
442 table->dbt_name);
443 idx->dbi_createable = TRUE;
444 } else {
445 idx->dbi_createable = FALSE;
446 }
447
448 idx->dbi_inext = table->dbt_indices;
449 table->dbt_indices = idx;
450 idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
451 KM_SLEEP);
452
453 return (idx);
454 }
455
456 void
rfs4_index_destroy(rfs4_index_t * idx)457 rfs4_index_destroy(rfs4_index_t *idx)
458 {
459 kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
460 kmem_free(idx->dbi_buckets,
461 sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
462 kmem_free(idx, sizeof (rfs4_index_t));
463 }
464
465 static void
rfs4_dbe_destroy(rfs4_dbe_t * entry)466 rfs4_dbe_destroy(rfs4_dbe_t *entry)
467 {
468 rfs4_index_t *idx;
469 void *key;
470 int i;
471 rfs4_bucket_t *bp;
472 rfs4_table_t *table = entry->dbe_table;
473 rfs4_link_t *l;
474
475 NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
476 (CE_NOTE, "Destroying entry %p from %s",
477 (void*)entry, table->dbt_name));
478
479 mutex_enter(entry->dbe_lock);
480 ASSERT(entry->dbe_refcnt == 0);
481 mutex_exit(entry->dbe_lock);
482
483 /* Unlink from all indices */
484 for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
485 l = &entry->dbe_indices[idx->dbi_tblidx];
486 /* check and see if we were ever linked in to the index */
487 if (INVALID_LINK(l)) {
488 ASSERT(l->next == NULL && l->prev == NULL);
489 continue;
490 }
491 key = idx->dbi_mkkey(entry->dbe_data);
492 i = HASH(idx, key);
493 bp = &idx->dbi_buckets[i];
494 ASSERT(bp->dbk_head != NULL);
495 DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
496 }
497
498 /* Destroy user data */
499 if (table->dbt_destroy)
500 (*table->dbt_destroy)(entry->dbe_data);
501
502 if (table->dbt_id_space)
503 id_free(table->dbt_id_space, entry->dbe_id);
504
505 mutex_enter(table->dbt_lock);
506 table->dbt_count--;
507 mutex_exit(table->dbt_lock);
508
509 /* Destroy the entry itself */
510 kmem_cache_free(table->dbt_mem_cache, entry);
511 }
512
513
514 static rfs4_dbe_t *
rfs4_dbe_create(rfs4_table_t * table,id_t id,rfs4_entry_t data)515 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
516 {
517 rfs4_dbe_t *entry;
518 int i;
519
520 NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
521 (CE_NOTE, "Creating entry in table %s", table->dbt_name));
522
523 entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
524
525 entry->dbe_refcnt = 1;
526 entry->dbe_invalid = FALSE;
527 entry->dbe_skipsearch = FALSE;
528 entry->dbe_time_rele = 0;
529 entry->dbe_id = 0;
530
531 if (table->dbt_id_space)
532 entry->dbe_id = id;
533 entry->dbe_table = table;
534
535 for (i = 0; i < table->dbt_maxcnt; i++) {
536 entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
537 entry->dbe_indices[i].entry = entry;
538 /*
539 * We mark the entry as not indexed by setting the low
540 * order bit, since address are word aligned. This has
541 * the advantage of causeing a trap if the address is
542 * used. After the entry is linked in to the
543 * corresponding index the bit will be cleared.
544 */
545 INVALIDATE_ADDR(entry->dbe_indices[i].entry);
546 }
547
548 entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
549 bzero(entry->dbe_data, table->dbt_usize);
550 entry->dbe_data->dbe = entry;
551
552 if (!(*table->dbt_create)(entry->dbe_data, data)) {
553 kmem_cache_free(table->dbt_mem_cache, entry);
554 return (NULL);
555 }
556
557 mutex_enter(table->dbt_lock);
558 table->dbt_count++;
559 mutex_exit(table->dbt_lock);
560
561 return (entry);
562 }
563
564 static void
rfs4_dbe_tabreap_adjust(rfs4_table_t * table)565 rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
566 {
567 clock_t tabreap;
568 clock_t reap_int;
569 uint32_t in_use;
570
571 /*
572 * Adjust the table's reap interval based on the
573 * number of id's currently in use. Each table's
574 * default remains the same if id usage subsides.
575 */
576 ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
577 tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
578
579 in_use = table->dbt_count + 1; /* see rfs4_dbe_create */
580 if (in_use >= table->dbt_id_hwat) {
581 ASSERT(t_hreap != 0);
582 reap_int = (tabreap * t_hreap) / 100;
583 } else if (in_use >= table->dbt_id_lwat) {
584 ASSERT(t_lreap != 0);
585 reap_int = (tabreap * t_lreap) / 100;
586 } else {
587 reap_int = tabreap;
588 }
589 table->dbt_id_reap = reap_int;
590 DTRACE_PROBE2(table__reap__interval, char *,
591 table->dbt_name, time_t, table->dbt_id_reap);
592 }
593
594 rfs4_entry_t
rfs4_dbsearch(rfs4_index_t * idx,void * key,bool_t * create,void * arg,rfs4_dbsearch_type_t dbsearch_type)595 rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
596 rfs4_dbsearch_type_t dbsearch_type)
597 {
598 int already_done;
599 uint32_t i;
600 rfs4_table_t *table = idx->dbi_table;
601 rfs4_index_t *ip;
602 rfs4_bucket_t *bp;
603 rfs4_link_t *l;
604 rfs4_dbe_t *entry;
605 id_t id = -1;
606
607 i = HASH(idx, key);
608 bp = &idx->dbi_buckets[i];
609
610 NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
611 (CE_NOTE, "Searching for key %p in table %s by %s",
612 key, table->dbt_name, idx->dbi_keyname));
613
614 rw_enter(bp->dbk_lock, RW_READER);
615 retry:
616 for (l = bp->dbk_head; l; l = l->next) {
617 if (l->entry->dbe_refcnt > 0 &&
618 (l->entry->dbe_skipsearch == FALSE ||
619 (l->entry->dbe_skipsearch == TRUE &&
620 dbsearch_type == RFS4_DBS_INVALID)) &&
621 (*idx->dbi_compare)(l->entry->dbe_data, key)) {
622 mutex_enter(l->entry->dbe_lock);
623 if (l->entry->dbe_refcnt == 0) {
624 mutex_exit(l->entry->dbe_lock);
625 continue;
626 }
627
628 /* place an additional hold since we are returning */
629 rfs4_dbe_hold(l->entry);
630
631 mutex_exit(l->entry->dbe_lock);
632 rw_exit(bp->dbk_lock);
633
634 *create = FALSE;
635
636 NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
637 (CE_NOTE, "Found entry %p for %p in table %s",
638 (void *)l->entry, key, table->dbt_name));
639
640 if (id != -1)
641 id_free(table->dbt_id_space, id);
642 return (l->entry->dbe_data);
643 }
644 }
645
646 if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
647 table->dbt_maxentries == table->dbt_count) {
648 NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
649 (CE_NOTE, "Entry for %p in %s not found",
650 key, table->dbt_name));
651
652 rw_exit(bp->dbk_lock);
653 if (id != -1)
654 id_free(table->dbt_id_space, id);
655 return (NULL);
656 }
657
658 if (table->dbt_id_space && id == -1) {
659 rw_exit(bp->dbk_lock);
660
661 /* get an id, ok to sleep for it here */
662 id = id_alloc(table->dbt_id_space);
663 ASSERT(id != -1);
664
665 mutex_enter(&table->dbt_reaper_cv_lock);
666 rfs4_dbe_tabreap_adjust(table);
667 mutex_exit(&table->dbt_reaper_cv_lock);
668
669 rw_enter(bp->dbk_lock, RW_WRITER);
670 goto retry;
671 }
672
673 /* get an exclusive lock on the bucket */
674 if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
675 NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
676 (CE_NOTE, "Trying to upgrade lock on "
677 "hash chain %d (%p) for %s by %s",
678 i, (void*)bp, table->dbt_name, idx->dbi_keyname));
679
680 rw_exit(bp->dbk_lock);
681 rw_enter(bp->dbk_lock, RW_WRITER);
682 goto retry;
683 }
684
685 /* create entry */
686 entry = rfs4_dbe_create(table, id, arg);
687 if (entry == NULL) {
688 rw_exit(bp->dbk_lock);
689 if (id != -1)
690 id_free(table->dbt_id_space, id);
691
692 NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
693 (CE_NOTE, "Constructor for table %s failed",
694 table->dbt_name));
695 return (NULL);
696 }
697
698 /*
699 * Add one ref for entry into table's hash - only one
700 * reference added even though there may be multiple indices
701 */
702 rfs4_dbe_hold(entry);
703 ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
704 VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);
705
706 already_done = idx->dbi_tblidx;
707 rw_exit(bp->dbk_lock);
708
709 for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
710 if (ip->dbi_tblidx == already_done)
711 continue;
712 l = &entry->dbe_indices[ip->dbi_tblidx];
713 i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
714 ASSERT(i < ip->dbi_table->dbt_len);
715 bp = &ip->dbi_buckets[i];
716 ENQUEUE_IDX(bp, l);
717 }
718
719 NFS4_DEBUG(
720 table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
721 (CE_NOTE, "Entry %p created for %s = %p in table %s",
722 (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));
723
724 return (entry->dbe_data);
725 }
726
727 /*ARGSUSED*/
728 boolean_t
rfs4_cpr_callb(void * arg,int code)729 rfs4_cpr_callb(void *arg, int code)
730 {
731 rfs4_bucket_t *buckets, *bp;
732 rfs4_link_t *l;
733 rfs4_client_t *cp;
734 int i;
735
736 nfs4_srv_t *nsrv4 = nfs4_get_srv();
737 rfs4_table_t *table = nsrv4->rfs4_client_tab;
738
739 /*
740 * We get called for Suspend and Resume events.
741 * For the suspend case we simply don't care! Nor do we care if
742 * there are no clients.
743 */
744 if (code == CB_CODE_CPR_CHKPT || table == NULL) {
745 return (B_TRUE);
746 }
747
748 buckets = table->dbt_indices->dbi_buckets;
749
750 /*
751 * When we get this far we are in the process of
752 * resuming the system from a previous suspend.
753 *
754 * We are going to blast through and update the
755 * last_access time for all the clients and in
756 * doing so extend them by one lease period.
757 */
758 for (i = 0; i < table->dbt_len; i++) {
759 bp = &buckets[i];
760 for (l = bp->dbk_head; l; l = l->next) {
761 cp = (rfs4_client_t *)l->entry->dbe_data;
762 cp->rc_last_access = gethrestime_sec();
763 }
764 }
765
766 return (B_TRUE);
767 }
768
769 /*
770 * Given a table, lock each of the buckets and walk all entries (in
771 * turn locking those) and calling the provided "callout" function
772 * with the provided parameter. Obviously used to iterate across all
773 * entries in a particular table via the database locking hierarchy.
774 * Obviously the caller must not hold locks on any of the entries in
775 * the specified table.
776 */
777 void
rfs4_dbe_walk(rfs4_table_t * table,void (* callout)(rfs4_entry_t,void *),void * data)778 rfs4_dbe_walk(rfs4_table_t *table,
779 void (*callout)(rfs4_entry_t, void *),
780 void *data)
781 {
782 rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
783 rfs4_link_t *l;
784 rfs4_dbe_t *entry;
785 int i;
786
787 NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
788 (CE_NOTE, "Walking entries in %s", table->dbt_name));
789
790 /* Walk the buckets looking for entries to release/destroy */
791 for (i = 0; i < table->dbt_len; i++) {
792 bp = &buckets[i];
793 rw_enter(bp->dbk_lock, RW_READER);
794 for (l = bp->dbk_head; l; l = l->next) {
795 entry = l->entry;
796 mutex_enter(entry->dbe_lock);
797 (*callout)(entry->dbe_data, data);
798 mutex_exit(entry->dbe_lock);
799 }
800 rw_exit(bp->dbk_lock);
801 }
802
803 NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
804 (CE_NOTE, "Walking entries complete %s", table->dbt_name));
805 }
806
807
808 static void
rfs4_dbe_reap(rfs4_table_t * table,time_t cache_time,uint32_t desired)809 rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
810 {
811 rfs4_index_t *idx = table->dbt_indices;
812 rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
813 rfs4_link_t *l, *t;
814 rfs4_dbe_t *entry;
815 bool_t found;
816 int i;
817 int count = 0;
818
819 NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
820 (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
821 desired, cache_time, table->dbt_name));
822
823 /* Walk the buckets looking for entries to release/destroy */
824 for (i = 0; i < table->dbt_len; i++) {
825 bp = &buckets[i];
826 do {
827 found = FALSE;
828 rw_enter(bp->dbk_lock, RW_READER);
829 for (l = bp->dbk_head; l; l = l->next) {
830 entry = l->entry;
831 /*
832 * Examine an entry. Ref count of 1 means
833 * that the only reference is for the hash
834 * table reference.
835 */
836 if (entry->dbe_refcnt != 1)
837 continue;
838 mutex_enter(entry->dbe_lock);
839 if ((entry->dbe_refcnt == 1) &&
840 (table->dbt_reaper_shutdown ||
841 table->dbt_expiry == NULL ||
842 (*table->dbt_expiry)(entry->dbe_data))) {
843 entry->dbe_refcnt--;
844 count++;
845 found = TRUE;
846 }
847 mutex_exit(entry->dbe_lock);
848 }
849 if (found) {
850 if (!rw_tryupgrade(bp->dbk_lock)) {
851 rw_exit(bp->dbk_lock);
852 rw_enter(bp->dbk_lock, RW_WRITER);
853 }
854
855 l = bp->dbk_head;
856 while (l) {
857 t = l;
858 entry = t->entry;
859 l = l->next;
860 if (entry->dbe_refcnt == 0) {
861 DEQUEUE(bp->dbk_head, t);
862 t->next = NULL;
863 t->prev = NULL;
864 INVALIDATE_ADDR(t->entry);
865 rfs4_dbe_destroy(entry);
866 }
867 }
868 }
869 rw_exit(bp->dbk_lock);
870 /*
871 * delay slightly if there is more work to do
872 * with the expectation that other reaper
873 * threads are freeing data structures as well
874 * and in turn will reduce ref counts on
875 * entries in this table allowing them to be
876 * released. This is only done in the
877 * instance that the tables are being shut down.
878 */
879 if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
880 delay(hz/100);
881 /*
882 * If this is a table shutdown, keep going until
883 * everything is gone
884 */
885 } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
886
887 if (!table->dbt_reaper_shutdown && desired && count >= desired)
888 break;
889 }
890
891 NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
892 (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
893 count, cache_time, table->dbt_name));
894 }
895
896 static void
reaper_thread(caddr_t * arg)897 reaper_thread(caddr_t *arg)
898 {
899 rfs4_table_t *table = (rfs4_table_t *)arg;
900 clock_t rc;
901
902 NFS4_DEBUG(table->dbt_debug,
903 (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
904
905 CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
906 callb_generic_cpr, "nfsv4Reaper");
907
908 mutex_enter(&table->dbt_reaper_cv_lock);
909 do {
910 CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
911 rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
912 &table->dbt_reaper_cv_lock,
913 SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
914 CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
915 &table->dbt_reaper_cv_lock);
916 rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
917 } while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
918
919 CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
920
921 NFS4_DEBUG(table->dbt_debug,
922 (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
923
924 /* Notify the database shutdown processing that the table is shutdown */
925 mutex_enter(table->dbt_db->db_lock);
926 table->dbt_db->db_shutdown_count--;
927 cv_signal(&table->dbt_db->db_shutdown_wait);
928 mutex_exit(table->dbt_db->db_lock);
929 zthread_exit();
930 }
931
932 static void
rfs4_start_reaper(rfs4_table_t * table)933 rfs4_start_reaper(rfs4_table_t *table)
934 {
935 if (table->dbt_max_cache_time == 0)
936 return;
937
938 (void) zthread_create(NULL, 0, reaper_thread, table, 0,
939 minclsyspri);
940 }
941
#ifdef DEBUG
/*
 * Log an entry's address, the name of its table, its reference count
 * and its id.  Only built in DEBUG kernels.
 */
void
rfs4_dbe_debug(rfs4_dbe_t *entry)
{
	rfs4_table_t *owner = entry->dbe_table;

	cmn_err(CE_NOTE, "Entry %p from table %s",
	    (void *)entry, owner->dbt_name);
	cmn_err(CE_CONT, "\trefcnt = %d id = %d",
	    entry->dbe_refcnt, entry->dbe_id);
}
#endif
952