xref: /illumos-gate/usr/src/uts/common/rpc/ib.h (revision 7f379ad1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2007, The Ohio State University. All rights reserved.
27  *
28  * Portions of this source code is developed by the team members of
29  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30  * headed by Professor Dhabaleswar K. (DK) Panda.
31  *
32  * Acknowledgements to contributions from developors:
33  *   Ranjit Noronha: noronha@cse.ohio-state.edu
34  *   Lei Chai      : chail@cse.ohio-state.edu
35  *   Weikuan Yu    : yuw@cse.ohio-state.edu
36  *
37  */
38 
39 
40 #ifndef _IB_H
41 #define	_IB_H
42 
43 /*
44  * ib.h, rpcib plugin interface.
45  */
46 
47 #include <sys/types.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/conf.h>
51 #include <sys/stat.h>
52 #include <rpc/rpc.h>
53 #include <rpc/rpc_rdma.h>
54 #include <sys/ib/ibtl/ibti.h>
55 #include <sys/avl.h>
56 
57 #ifdef __cplusplus
58 extern "C" {
59 #endif
60 
/*
 * Sizing limits for the buffer pools and the send/receive/completion
 * queues.
 */
#define	MAX_BUFS	1024	/* max no. of buffers per pool */

/*
 * Default CQ size.
 *
 * Tavor returns the next higher power of 2 CQ entries than the
 * requested size.  For instance, if you request (2^12 - 1) CQ
 * entries, Tavor returns 2^12 entries.  4K CQ entries suffice.
 * Hence, 4096 - 1.
 *
 * The replacement list is parenthesized so the macro acts as a single
 * operand when it is used inside a larger expression, e.g.
 * 2 * DEF_CQ_SIZE or DEF_CQ_SIZE / 2.
 */
#define	DEF_CQ_SIZE	(4096 - 1)

#define	DEF_SQ_SIZE	128	/* default SendQ size */
#define	DEF_RQ_SIZE	256	/* default RecvQ size */
#define	DSEG_MAX	2	/* dimension of send_wid.sbufaddr[] */
#define	RQ_DSEG_MAX	1	/* default RQ data seg */
#define	IBSRM_HB	0x8000	/* high order bit of pkey */

/* max no. of refresh attempts on IBT_CM_CONN_STALE error */
#define	REFRESH_ATTEMPTS	3
79 
/*
 * Forward declarations.  The HCA, QP and CQ structures point at one
 * another, so their typedef names are introduced before any of the
 * structure bodies are defined.
 */
typedef struct rib_hca_s rib_hca_t;
typedef struct rib_qp_s rib_qp_t;
typedef struct rib_cq_s rib_cq_t;
83 
84 /*
85  * Notification for RDMA_DONE is based on xid
86  */
87 struct rdma_done_list {
88 	uint32_t	xid;		/* XID waiting for RDMA_DONE */
89 	kcondvar_t	rdma_done_cv;	/* cv for RDMA_DONE */
90 	struct rdma_done_list	*next;
91 	struct rdma_done_list	*prev;
92 };
93 
/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests
 * NO_ACCEPT = not accepting new connections and requests
 */
#define	ACCEPT		1
#define	NO_ACCEPT	2

/*
 * Send Wait states
 *
 * Negative constants are parenthesized so they remain a single operand
 * wherever the macro is expanded.
 */
#define	SEND_WAIT	(-1)

/*
 * Reply states
 */
#define	REPLY_WAIT	(-1)
111 
/*
 * Opaque pointer type, and the sync-memory handle type built on it.
 * Both are plain `void *`.
 */
typedef void *rib_pvoid;
typedef rib_pvoid RIB_SYNCMEM_HANDLE;
114 
115 /*
116  * IB buffer pool management structure
117  */
118 
119 /*
120  * Buffer pool info
121  */
122 typedef struct {
123 	kmutex_t	buflock;	/* lock for this structure */
124 	caddr_t		buf;		/* pool address */
125 	uint32_t	bufhandle;	/* rkey for this pool */
126 	ulong_t		bufsize;	/* size of pool */
127 	int		rsize;		/* size of each element */
128 	int		numelems;	/* no. of elements allocated */
129 	int		buffree;	/* no. of free elements */
130 	void		*buflist[1];	/* free elements in pool */
131 } bufpool_t;
132 
133 typedef struct {
134 	bufpool_t	*bpool;
135 	ibt_mr_hdl_t	*mr_hdl;
136 	ibt_mr_desc_t	*mr_desc;	/* vaddr, lkey, rkey */
137 } rib_bufpool_t;
138 
/*
 * ATS related defines and structures.
 */
#define	ATS_AR_DATA_LEN	16
#define	IBD_NAME	"ibd"
#define	N_IBD_INSTANCES	4
145 
146 
/*
 * Service types supported by RPCIB
 * For now only NFS is supported.
 */
#define	NFS		1
#define	NLM		2
153 
/*
 * Tracks consumer state (client or server).
 *
 * No explicit values are assigned, so RIB_SERVER is 0 and RIB_CLIENT
 * is 1.
 */
typedef enum {
	RIB_SERVER,
	RIB_CLIENT
} rib_mode_t;
161 
162 /*
163  * CQ structure
164  */
165 struct rib_cq_s {
166 	rib_hca_t		*rib_hca;
167 	ibt_cq_hdl_t		rib_cq_hdl;
168 };
169 
170 /*
171  * Each registered service's data structure.
172  */
173 typedef struct rib_service_s rib_service_t;
174 struct rib_service_s {
175 	uint32_t		srv_type;	/* i.e, NFS, NLM, v4CBD */
176 	ibt_srv_hdl_t		srv_hdl;	/* from ibt_register call */
177 	ib_svc_id_t		srv_id;
178 	rib_service_t		*next;
179 };
180 
181 /*
182  * RPCIB plugin state
183  */
184 typedef struct rpcib_state {
185 	ibt_clnt_hdl_t		ibt_clnt_hdl;
186 	uint32_t		hca_count;
187 	uint32_t		nhca_inited;
188 	rib_hca_t		*hcas_list;
189 	krwlock_t		hcas_list_lock;	/* protects hcas_list */
190 	int			refcount;
191 	kmutex_t		open_hca_lock;
192 	queue_t			*q;		/* up queue for a serv_type */
193 	void			*private;
194 	rib_service_t		*service_list;
195 	krwlock_t		service_list_lock;
196 	kmutex_t		listen_lock;
197 } rpcib_state_t;
198 
199 /*
200  * Connection lists
201  */
202 typedef struct {
203 	krwlock_t	conn_lock;	/* list lock */
204 	CONN		*conn_hd;	/* list head */
205 } rib_conn_list_t;
206 
/*
 * HCA state, stored in rib_hca_s.state.
 *
 * There is deliberately no comma after the last enumerator: a trailing
 * comma is not accepted by C89 or C++03 compilers, and this header is
 * wrapped in extern "C" for C++ consumers.
 */
enum hca_state {
	HCA_DETACHED,		/* hca in detached state */
	HCA_INITED		/* hca in up and running state */
};
211 
212 typedef struct rib_hca_service_s rib_hca_service_t;
213 struct rib_hca_service_s {
214 	ib_svc_id_t	srv_id;
215 	ib_gid_t	gid;
216 	ibt_sbind_hdl_t	sbind_hdl;
217 	rib_hca_service_t *next;
218 };
219 
220 /*
221  * RPCIB per HCA structure
222  */
223 struct rib_hca_s {
224 	ibt_clnt_hdl_t		ibt_clnt_hdl;
225 
226 	/*
227 	 * per HCA.
228 	 */
229 	ibt_hca_hdl_t		hca_hdl;	/* HCA handle */
230 	ibt_hca_attr_t		hca_attrs;	/* HCA attributes */
231 	ibt_pd_hdl_t		pd_hdl;
232 	rib_hca_service_t	*bound_services;
233 	krwlock_t		bound_services_lock;
234 	ib_guid_t		hca_guid;
235 	uint32_t		hca_nports;
236 	ibt_hca_portinfo_t	*hca_ports;
237 	size_t			hca_pinfosz;
238 	enum hca_state		state;		/* state of HCA */
239 	krwlock_t		state_lock;	/* protects state field */
240 	bool_t			inuse;		/* indicates HCA usage */
241 	kmutex_t		inuse_lock;	/* protects inuse field */
242 
243 	rib_conn_list_t		cl_conn_list;	/* client conn list */
244 	rib_conn_list_t		srv_conn_list;	/* server conn list */
245 
246 	rib_cq_t		*clnt_scq;
247 	rib_cq_t		*clnt_rcq;
248 	rib_cq_t		*svc_scq;
249 	rib_cq_t		*svc_rcq;
250 	kmutex_t		cb_lock;
251 	kcondvar_t		cb_cv;
252 
253 	rib_bufpool_t		*recv_pool;	/* recv buf pool */
254 	rib_bufpool_t		*send_pool;	/* send buf pool */
255 
256 	void			*iblock;	/* interrupt cookie */
257 
258 	kmem_cache_t	*server_side_cache;	/* long reply pool */
259 	avl_tree_t	avl_tree;
260 	kmutex_t	avl_lock;
261 	krwlock_t	avl_rw_lock;
262 	volatile bool_t avl_init;
263 	kmutex_t	cache_allocation_lock;
264 	ddi_taskq_t	*cleanup_helper;
265 	ib_svc_id_t	srv_id;
266 	ibt_srv_hdl_t 	srv_hdl;
267 	uint_t		reg_state;
268 
269 	volatile uint64_t	cache_allocation;
270 	uint64_t	cache_hits;
271 	uint64_t	cache_misses;
272 	uint64_t	cache_cold_misses;
273 	uint64_t	cache_hot_misses;
274 	uint64_t	cache_misses_above_the_limit;
275 
276 	struct rib_hca_s *next;
277 };
278 
279 
280 /*
281  * Structure on wait state of a post send
282  */
283 struct send_wid {
284 	uint32_t 	xid;
285 	int		cv_sig;
286 	kmutex_t	sendwait_lock;
287 	kcondvar_t	wait_cv;
288 	uint_t		status;
289 	rib_qp_t	*qp;
290 	int		nsbufs;			/* # of send buffers posted */
291 	uint64_t	sbufaddr[DSEG_MAX];	/* posted send buffers */
292 	caddr_t		c;
293 	caddr_t		c1;
294 	int		l1;
295 	caddr_t		c2;
296 	int		l2;
297 	int		wl, rl;
298 };
299 
300 /*
301  * Structure on reply descriptor for recv queue.
302  * Different from the above posting of a descriptor.
303  */
304 struct reply {
305 	uint32_t 	xid;
306 	uint_t		status;
307 	uint64_t	vaddr_cq;	/* buf addr from CQ */
308 	uint_t		bytes_xfer;
309 	kcondvar_t	wait_cv;
310 	struct reply	*next;
311 	struct reply 	*prev;
312 };
313 
314 struct svc_recv {
315 	rib_qp_t	*qp;
316 	uint64_t	vaddr;
317 	uint_t		bytes_xfer;
318 };
319 
320 struct recv_wid {
321 	uint32_t 	xid;
322 	rib_qp_t	*qp;
323 	uint64_t	addr;	/* posted buf addr */
324 };
325 
326 /*
327  * Per QP data structure
328  */
329 struct rib_qp_s {
330 	rib_hca_t		*hca;
331 	rib_mode_t		mode;	/* RIB_SERVER or RIB_CLIENT */
332 	CONN			rdmaconn;
333 	ibt_channel_hdl_t	qp_hdl;
334 	uint_t			port_num;
335 	ib_qpn_t		qpn;
336 	int			chan_flags;
337 	clock_t			timeout;
338 	ibt_rc_chan_query_attr_t	qp_q_attrs;
339 	rib_cq_t		*send_cq;	/* send CQ */
340 	rib_cq_t		*recv_cq;	/* recv CQ */
341 
342 	/*
343 	 * Number of pre-posted rbufs
344 	 */
345 	uint_t			n_posted_rbufs;
346 	kcondvar_t 		posted_rbufs_cv;
347 	kmutex_t		posted_rbufs_lock;
348 
349 	/*
350 	 * Number of SENDs pending completion
351 	 */
352 
353 	uint_t			n_send_rbufs;
354 	kcondvar_t 		send_rbufs_cv;
355 	kmutex_t		send_rbufs_lock;
356 
357 	/*
358 	 * RPC reply
359 	 */
360 	uint_t			rep_list_size;
361 	struct reply		*replylist;
362 	kmutex_t		replylist_lock;
363 
364 	/*
365 	 * server only, RDMA_DONE
366 	 */
367 	struct rdma_done_list	*rdlist;
368 	kmutex_t		rdlist_lock;
369 
370 	kmutex_t		cb_lock;
371 	kcondvar_t 		cb_conn_cv;
372 
373 	caddr_t			q;	/* upstream queue */
374 	struct send_wid		wd;
375 };
376 
/*
 * Conversions between a CONN and its QP.
 *
 * ctoqp(conn)	evaluates to conn->c_private cast to rib_qp_t *.
 * qptoc(rqp)	evaluates to the address of the rdmaconn member of
 *		*rqp, cast to CONN *.
 *
 * Each macro evaluates its argument exactly once.
 */
#define	ctoqp(conn)	((rib_qp_t *)((conn)->c_private))
#define	qptoc(rqp)	((CONN *)&((rqp)->rdmaconn))
379 
/*
 * Timeout for various calls
 *
 * NOTE(review): the unit of these values is not stated in this header;
 * check how the callers convert them before changing any of them.
 */
#define	CONN_WAIT_TIME	40
#define	SEND_WAIT_TIME	40	/* time for send completion */

#define	REPLY_WAIT_TIME	40	/* time to get reply from remote QP */
387 
388 #ifdef __cplusplus
389 }
390 #endif
391 
392 #endif	/* !_IB_H */
393