1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*
27 * Copyright (c) 2008, The Ohio State University. All rights reserved.
28 *
29 * Portions of this source code is developed by the team members of
30 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31 * headed by Professor Dhabaleswar K. (DK) Panda.
32 *
33 * Acknowledgements to contributions from developors:
34 *   Ranjit Noronha: noronha@cse.ohio-state.edu
35 *   Lei Chai      : chail@cse.ohio-state.edu
36 *   Weikuan Yu    : yuw@cse.ohio-state.edu
37 *
38 */
39
40#include <sys/systm.h>
41#include <sys/kstat.h>
42#include <sys/modctl.h>
43#include <sys/sdt.h>
44#include <rpc/rpc_rdma.h>
45
46#include <sys/ib/ibtl/ibti.h>
47
48uint_t rdma_minchunk = RDMA_MINCHUNK;
49
50/*
51 * Globals
52 */
53int rdma_modloaded = 0;		/* flag to load RDMA plugin modules */
54int rdma_dev_available = 0;	/* if any RDMA device is loaded */
55kmutex_t rdma_modload_lock;	/* protects rdma_modloaded flag */
56
57rdma_svc_wait_t rdma_wait;
58
59rdma_registry_t	*rdma_mod_head = NULL;	/* head for RDMA modules */
60krwlock_t	rdma_lock;		/* protects rdma_mod_head list */
61ldi_ident_t rpcmod_li = NULL;	/* identifies us with ldi_ framework */
62
63kmem_cache_t *clist_cache = NULL;
64
65/*
66 * Statics
67 */
68ldi_handle_t rpcib_handle = NULL;
69
70/*
71 * Externs
72 */
73extern	kstat_named_t	*rdmarcstat_ptr;
74extern	uint_t		rdmarcstat_ndata;
75extern	kstat_named_t	*rdmarsstat_ptr;
76extern	uint_t		rdmarsstat_ndata;
77
78void rdma_kstat_init();
79
80/*
81 * RDMATF module registration routine.
82 * This routine is expected to be called by the init routine in
83 * the plugin modules.
84 */
85rdma_stat
86rdma_register_mod(rdma_mod_t *mod)
87{
88	rdma_registry_t **mp, *m;
89
90	if (mod->rdma_version != RDMATF_VERS) {
91		return (RDMA_BADVERS);
92	}
93
94	rw_enter(&rdma_lock, RW_WRITER);
95	/*
96	 * Ensure not already registered
97	 */
98	mp = &rdma_mod_head;
99	while (*mp != NULL) {
100		if (strncmp((*mp)->r_mod->rdma_api, mod->rdma_api,
101		    KNC_STRSIZE) == 0) {
102			if ((*mp)->r_mod_state == RDMA_MOD_INACTIVE) {
103				(*mp)->r_mod_state = RDMA_MOD_ACTIVE;
104				(*mp)->r_mod->rdma_ops = mod->rdma_ops;
105				(*mp)->r_mod->rdma_count = mod->rdma_count;
106				goto announce_hca;
107			}
108			rw_exit(&rdma_lock);
109			return (RDMA_REG_EXIST);
110		}
111		mp = &((*mp)->r_next);
112	}
113
114	/*
115	 * New one, create and add to registry
116	 */
117	m = kmem_alloc(sizeof (rdma_registry_t), KM_SLEEP);
118	m->r_mod = kmem_alloc(sizeof (rdma_mod_t), KM_SLEEP);
119	*m->r_mod = *mod;
120	m->r_next = NULL;
121	m->r_mod->rdma_api = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
122	(void) strncpy(m->r_mod->rdma_api, mod->rdma_api, KNC_STRSIZE);
123	m->r_mod->rdma_api[KNC_STRSIZE - 1] = '\0';
124	m->r_mod_state = RDMA_MOD_ACTIVE;
125	*mp = m;
126
127announce_hca:
128	rw_exit(&rdma_lock);
129	/*
130	 * Start the nfs service on the rdma xprts.
131	 * (this notification mechanism will need to change when we support
132	 * multiple hcas and have support for multiple rdma plugins).
133	 */
134	mutex_enter(&rdma_wait.svc_lock);
135	rdma_wait.svc_stat = RDMA_HCA_ATTACH;
136	cv_signal(&rdma_wait.svc_cv);
137	mutex_exit(&rdma_wait.svc_lock);
138
139	return (RDMA_SUCCESS);
140}
141
142/*
143 * RDMATF module unregistration routine.
144 * This routine is expected to be called by the fini routine in
145 * the plugin modules.
146 */
147rdma_stat
148rdma_unregister_mod(rdma_mod_t *mod)
149{
150	rdma_registry_t **m, *mmod = NULL;
151
152	rw_enter(&rdma_lock, RW_WRITER);
153
154	m = &rdma_mod_head;
155	while (*m != NULL) {
156		if (strncmp((*m)->r_mod->rdma_api, mod->rdma_api,
157		    KNC_STRSIZE) != 0) {
158			m = &((*m)->r_next);
159			continue;
160		}
161		/*
162		 * Check if any device attached, if so return error
163		 */
164		if (mod->rdma_count != 0) {
165			rw_exit(&rdma_lock);
166			return (RDMA_FAILED);
167		}
168		/*
169		 * Found entry. Mark it inactive.
170		 */
171		mmod = *m;
172		mmod->r_mod->rdma_count = 0;
173		mmod->r_mod_state = RDMA_MOD_INACTIVE;
174		break;
175	}
176
177	rdma_modloaded = 0;
178	rdma_dev_available = 0;
179	rw_exit(&rdma_lock);
180
181	/*
182	 * Stop the nfs service running on the rdma xprts.
183	 * (this notification mechanism will need to change when we support
184	 * multiple hcas and have support for multiple rdma plugins).
185	 */
186	mutex_enter(&rdma_wait.svc_lock);
187	rdma_wait.svc_stat = RDMA_HCA_DETACH;
188	cv_signal(&rdma_wait.svc_cv);
189	mutex_exit(&rdma_wait.svc_lock);
190
191	/*
192	 * Not found.
193	 */
194	return (RDMA_SUCCESS);
195}
196
197struct clist *
198clist_alloc(void)
199{
200	struct clist *clp;
201
202	clp = kmem_cache_alloc(clist_cache, KM_SLEEP);
203
204	bzero(clp, sizeof (*clp));
205
206	return (clp);
207}
208
209uint32_t
210clist_len(struct clist *cl)
211{
212	uint32_t len = 0;
213	while (cl) {
214		len += cl->c_len;
215		cl = cl->c_next;
216	}
217	return (len);
218}
219
220void
221clist_zero_len(struct clist *cl)
222{
223	while (cl != NULL) {
224		if (cl->c_dmemhandle.mrc_rmr == 0)
225			break;
226		cl->c_len = 0;
227		cl = cl->c_next;
228	}
229}
230
231/*
232 * Creates a new chunk list entry, and
233 * adds it to the end of a chunk list.
234 */
235void
236clist_add(struct clist **clp, uint32_t xdroff, int len,
237    struct mrc *shandle, caddr_t saddr,
238    struct mrc *dhandle, caddr_t daddr)
239{
240	struct clist *cl;
241
242	/* Find the end of the list */
243
244	while (*clp != NULL)
245		clp = &((*clp)->c_next);
246
247	cl = clist_alloc();
248	cl->c_xdroff = xdroff;
249	cl->c_len = len;
250	cl->w.c_saddr = (uint64_t)(uintptr_t)saddr;
251	if (shandle)
252		cl->c_smemhandle = *shandle;
253	cl->u.c_daddr = (uint64_t)(uintptr_t)daddr;
254	if (dhandle)
255		cl->c_dmemhandle = *dhandle;
256	cl->c_next = NULL;
257
258	*clp = cl;
259}
260
261rdma_stat
262clist_register(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
263{
264	struct clist *c;
265	int status;
266
267	for (c = cl; c; c = c->c_next) {
268		if (c->c_len <= 0)
269			continue;
270
271		c->c_regtype = dstsrc;
272
273		switch (dstsrc) {
274		case CLIST_REG_SOURCE:
275			status = RDMA_REGMEMSYNC(conn,
276			    (caddr_t)(struct as *)c->c_adspc,
277			    (caddr_t)(uintptr_t)c->w.c_saddr3, c->c_len,
278			    &c->c_smemhandle, (void **)&c->c_ssynchandle,
279			    (void *)c->rb_longbuf.rb_private);
280			break;
281		case CLIST_REG_DST:
282			status = RDMA_REGMEMSYNC(conn,
283			    (caddr_t)(struct as *)c->c_adspc,
284			    (caddr_t)(uintptr_t)c->u.c_daddr3, c->c_len,
285			    &c->c_dmemhandle, (void **)&c->c_dsynchandle,
286			    (void *)c->rb_longbuf.rb_private);
287			break;
288		default:
289			return (RDMA_INVAL);
290		}
291		if (status != RDMA_SUCCESS) {
292			(void) clist_deregister(conn, cl);
293			return (status);
294		}
295	}
296
297	return (RDMA_SUCCESS);
298}
299
300rdma_stat
301clist_deregister(CONN *conn, struct clist *cl)
302{
303	struct clist *c;
304
305	for (c = cl; c; c = c->c_next) {
306		switch (c->c_regtype) {
307		case CLIST_REG_SOURCE:
308			if (c->c_smemhandle.mrc_rmr != 0) {
309				(void) RDMA_DEREGMEMSYNC(conn,
310				    (caddr_t)(uintptr_t)c->w.c_saddr3,
311				    c->c_smemhandle,
312				    (void *)(uintptr_t)c->c_ssynchandle,
313				    (void *)c->rb_longbuf.rb_private);
314				c->c_smemhandle.mrc_rmr = 0;
315				c->c_ssynchandle = 0;
316			}
317			break;
318		case CLIST_REG_DST:
319			if (c->c_dmemhandle.mrc_rmr != 0) {
320				(void) RDMA_DEREGMEMSYNC(conn,
321				    (caddr_t)(uintptr_t)c->u.c_daddr3,
322				    c->c_dmemhandle,
323				    (void *)(uintptr_t)c->c_dsynchandle,
324				    (void *)c->rb_longbuf.rb_private);
325				c->c_dmemhandle.mrc_rmr = 0;
326				c->c_dsynchandle = 0;
327			}
328			break;
329		default:
330			/* clist unregistered. continue */
331			break;
332		}
333	}
334
335	return (RDMA_SUCCESS);
336}
337
338rdma_stat
339clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc dstsrc)
340{
341	struct clist *c;
342	rdma_stat status;
343
344	c = cl;
345	switch (dstsrc) {
346	case CLIST_REG_SOURCE:
347		while (c != NULL) {
348			if (c->c_ssynchandle) {
349				status = RDMA_SYNCMEM(conn,
350				    (void *)(uintptr_t)c->c_ssynchandle,
351				    (caddr_t)(uintptr_t)c->w.c_saddr3,
352				    c->c_len, 0);
353				if (status != RDMA_SUCCESS)
354					return (status);
355			}
356			c = c->c_next;
357		}
358		break;
359	case CLIST_REG_DST:
360		while (c != NULL) {
361			if (c->c_ssynchandle) {
362				status = RDMA_SYNCMEM(conn,
363				    (void *)(uintptr_t)c->c_dsynchandle,
364				    (caddr_t)(uintptr_t)c->u.c_daddr3,
365				    c->c_len, 1);
366				if (status != RDMA_SUCCESS)
367					return (status);
368			}
369			c = c->c_next;
370		}
371		break;
372	default:
373		return (RDMA_INVAL);
374	}
375
376	return (RDMA_SUCCESS);
377}
378
379/*
380 * Frees up entries in chunk list
381 */
382void
383clist_free(struct clist *cl)
384{
385	struct clist *c = cl;
386
387	while (c != NULL) {
388		cl = cl->c_next;
389		kmem_cache_free(clist_cache, c);
390		c = cl;
391	}
392}
393
394rdma_stat
395rdma_clnt_postrecv(CONN *conn, uint32_t xid)
396{
397	struct clist *cl = NULL;
398	rdma_stat retval;
399	rdma_buf_t rbuf = {0};
400
401	rbuf.type = RECV_BUFFER;
402	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
403		return (RDMA_NORESOURCE);
404	}
405
406	clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
407	    NULL, NULL);
408	retval = RDMA_CLNT_RECVBUF(conn, cl, xid);
409	clist_free(cl);
410
411	return (retval);
412}
413
414rdma_stat
415rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid)
416{
417	return (RDMA_CLNT_RECVBUF_REMOVE(conn, xid));
418}
419
420rdma_stat
421rdma_svc_postrecv(CONN *conn)
422{
423	struct clist *cl = NULL;
424	rdma_stat retval;
425	rdma_buf_t rbuf = {0};
426
427	rbuf.type = RECV_BUFFER;
428	if (RDMA_BUF_ALLOC(conn, &rbuf)) {
429		retval = RDMA_NORESOURCE;
430	} else {
431		clist_add(&cl, 0, rbuf.len, &rbuf.handle, rbuf.addr,
432		    NULL, NULL);
433		retval = RDMA_SVC_RECVBUF(conn, cl);
434		clist_free(cl);
435	}
436	return (retval);
437}
438
439rdma_stat
440rdma_buf_alloc(CONN *conn, rdma_buf_t *rbuf)
441{
442	return (RDMA_BUF_ALLOC(conn, rbuf));
443}
444
445void
446rdma_buf_free(CONN *conn, rdma_buf_t *rbuf)
447{
448	if (!rbuf || rbuf->addr == NULL) {
449		return;
450	}
451	RDMA_BUF_FREE(conn, rbuf);
452	bzero(rbuf, sizeof (rdma_buf_t));
453}
454
455/*
456 * Caller is holding rdma_modload_lock mutex
457 */
458int
459rdma_modload()
460{
461	int status;
462	ASSERT(MUTEX_HELD(&rdma_modload_lock));
463	/*
464	 * Load all available RDMA plugins which right now is only IB plugin.
465	 * If no IB hardware is present, then quit right away.
466	 * ENODEV -- For no device on the system
467	 * EPROTONOSUPPORT -- For module not avilable either due to failure to
468	 * load or some other reason.
469	 */
470	rdma_modloaded = 1;
471	if (ibt_hw_is_present() == 0) {
472		rdma_dev_available = 0;
473		return (ENODEV);
474	}
475
476	rdma_dev_available = 1;
477	if (rpcmod_li == NULL)
478		return (EPROTONOSUPPORT);
479
480	status = ldi_open_by_name("/devices/ib/rpcib@0:rpcib",
481	    FREAD | FWRITE, kcred,
482	    &rpcib_handle, rpcmod_li);
483
484	if (status != 0)
485		return (EPROTONOSUPPORT);
486
487
488	/*
489	 * We will need to reload the plugin module after it was unregistered
490	 * but the resources below need to allocated only the first time.
491	 */
492	if (!clist_cache) {
493		clist_cache = kmem_cache_create("rdma_clist",
494		    sizeof (struct clist), _POINTER_ALIGNMENT, NULL,
495		    NULL, NULL, NULL, 0, 0);
496		rdma_kstat_init();
497	}
498
499	(void) ldi_close(rpcib_handle, FREAD|FWRITE, kcred);
500
501	return (0);
502}
503
504void
505rdma_kstat_init(void)
506{
507	kstat_t *ksp;
508
509	/*
510	 * The RDMA framework doesn't know how to deal with Zones, and is
511	 * only available in the global zone.
512	 */
513	ASSERT(INGLOBALZONE(curproc));
514	ksp = kstat_create_zone("unix", 0, "rpc_rdma_client", "rpc",
515	    KSTAT_TYPE_NAMED, rdmarcstat_ndata,
516	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID);
517	if (ksp) {
518		ksp->ks_data = (void *) rdmarcstat_ptr;
519		kstat_install(ksp);
520	}
521
522	ksp = kstat_create_zone("unix", 0, "rpc_rdma_server", "rpc",
523	    KSTAT_TYPE_NAMED, rdmarsstat_ndata,
524	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, GLOBAL_ZONEID);
525	if (ksp) {
526		ksp->ks_data = (void *) rdmarsstat_ptr;
527		kstat_install(ksp);
528	}
529}
530
531rdma_stat
532rdma_kwait(void)
533{
534	int ret;
535	rdma_stat stat;
536
537	mutex_enter(&rdma_wait.svc_lock);
538
539	ret = cv_wait_sig(&rdma_wait.svc_cv, &rdma_wait.svc_lock);
540
541	/*
542	 * If signalled by a hca attach/detach, pass the right
543	 * stat back.
544	 */
545
546	if (ret)
547		stat =  rdma_wait.svc_stat;
548	else
549		stat = RDMA_INTR;
550
551	mutex_exit(&rdma_wait.svc_lock);
552
553	return (stat);
554}
555