xref: /illumos-gate/usr/src/uts/common/sys/ib/clients/ibd/ibd.h (revision 71be8d8f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef _SYS_IB_CLIENTS_IBD_H
28 #define	_SYS_IB_CLIENTS_IBD_H
29 
30 #ifdef __cplusplus
31 extern "C" {
32 #endif
33 
/*
 * IPoIB encapsulation header as defined by the IETF: a 2-byte ethertype
 * followed by 2 reserved bytes.  It sits at the very start of every
 * datagram the driver puts on, or takes off, the wire.
 *
 * Because this is an on-the-wire layout, the fields are declared with
 * fixed-width 16-bit types, matching the fixed-width uint32_t fields used
 * by the other wire structures in this header.
 */
typedef struct ipoib_header {
	uint16_t	ipoib_type;	/* ethertype */
	uint16_t	ipoib_mbz;	/* reserved bytes */
} ipoib_hdr_t;

/* Size in bytes of the encapsulation header. */
#define	IPOIB_HDRSIZE	(sizeof (ipoib_hdr_t))
45 
/*
 * IPoIB link-layer address as defined by the IETF and as reported via
 * ARP: the IBA QPN first, then the GID split into its prefix and suffix
 * halves.
 */
typedef struct ipoib_mac {
	uint32_t	ipoib_qpn;		/* IBA QPN */
	uint32_t	ipoib_gidpref[2];	/* GID prefix */
	uint32_t	ipoib_gidsuff[2];	/* GID suffix */
} ipoib_mac_t;

/* Length in bytes of an IPoIB link address. */
#define	IPOIB_ADDRL	(sizeof (ipoib_mac_t))
57 
58 /*
59  * Pseudo header prepended to datagram in DLIOCRAW transmit path
60  * and when GLD hands the datagram to the gldm_send entry point.
61  */
62 typedef struct ipoib_ptxhdr {
63 	ipoib_mac_t	ipoib_dest;
64 	ipoib_hdr_t	ipoib_rhdr;
65 } ipoib_ptxhdr_t;
66 
/*
 * Yield a pointer to the ipoib_ptxhdr_t that starts `offset' bytes past
 * `p'.  Both arguments are parenthesized so that a compound expression
 * (a conditional, a shift, ...) passed as `offset' is evaluated as a
 * whole before the byte addition.
 */
#define	IPOIBDLSAP(p, offset)	((ipoib_ptxhdr_t *)((caddr_t)(p) + (offset)))
68 
/*
 * Pseudo-GRH.  This structure precedes the data in a receive buffer and
 * is overlaid on the real GRH.
 *
 * When the pseudo-GRH carries no valid information the driver sets
 * ipoib_vertcflow to 0.  When it is marked valid, the driver must also
 * store the sender's QPN, in network byte order, in ipoib_sqpn; the
 * remaining fields were DMA'ed in by the IBA hardware and must be left
 * untouched.
 */
typedef struct ipoib_pgrh {
	uint32_t	ipoib_vertcflow;	/* 0 => contents not valid */
	uint32_t	ipoib_sqpn;		/* sender QPN, network order */
	uint32_t	ipoib_sgid_pref[2];
	uint32_t	ipoib_sgid_suff[2];
	uint32_t	ipoib_dgid_pref[2];
	uint32_t	ipoib_dgid_suff[2];
} ipoib_pgrh_t;

/*
 * The GRH is DMA'ed into receive buffers along with the data, so each
 * receive buffer must reserve this many bytes for it.
 */
#define	IPOIB_GRH_SIZE	(sizeof (ipoib_pgrh_t))
92 
93 #if defined(_KERNEL) && !defined(_BOOT)
94 
95 #include <sys/ib/ibtl/ibti.h>
96 #include <sys/ib/ib_pkt_hdrs.h>
97 #include <sys/list.h>
98 #include <sys/mac_provider.h>
99 #include <sys/mac_ib.h>
100 #include <sys/modhash.h>
101 
102 /*
103  * Structure to encapsulate various types of async requests.
104  */
105 typedef struct ibd_acache_rq {
106 	struct list_node 	rq_list; 	/* list of pending work */
107 	int			rq_op;		/* what operation */
108 	ipoib_mac_t		rq_mac;
109 	ib_gid_t		rq_gid;
110 	void			*rq_ptr;
111 } ibd_req_t;
112 
113 typedef struct ibd_mcache {
114 	struct list_node	mc_list;	/* full/non list */
115 	uint8_t			mc_jstate;
116 	boolean_t		mc_fullreap;
117 	ibt_mcg_info_t		mc_info;
118 	ibd_req_t		mc_req;		/* to queue LEAVE req */
119 } ibd_mce_t;
120 
121 typedef struct ibd_acache_s {
122 	struct list_node	ac_list;	/* free/active list */
123 	ibt_ud_dest_hdl_t	ac_dest;
124 	ipoib_mac_t		ac_mac;
125 	uint32_t		ac_ref;
126 	ibd_mce_t		*ac_mce;	/* for MCG AHs */
127 } ibd_ace_t;
128 
/*
 * Scatter/gather segment limits.  IBD_MAX_SQSEG sizes the w_sgl[] array
 * embedded in each send WQE (ibd_swqe_t).
 */
#define	IBD_MAX_SQSEG		59
#define	IBD_MAX_RQSEG		1
131 
/*
 * Direction of a work queue element.
 */
typedef enum {
	IBD_WQE_SEND = 0,
	IBD_WQE_RECV = 1
} ibd_wqe_type_t;
136 
/*
 * Kind of buffer attached to a send WQE (see ibd_swqe_t's w_buftype).
 * Numbering starts at 1; the later enumerators follow on implicitly.
 */
typedef enum {
	IBD_WQE_TXBUF = 1,
	IBD_WQE_LSOBUF,
	IBD_WQE_MAPPED
} ibd_wqe_buftype_t;
142 
143 /*
144  * Pre-registered copybuf used for send and receive
145  */
146 typedef struct ibd_copybuf_s {
147 	ibt_wr_ds_t		ic_sgl;
148 	uint8_t			*ic_bufaddr;
149 } ibd_copybuf_t;
150 
151 typedef struct ibd_wqe_s {
152 	struct ibd_wqe_s	*w_next;
153 	ibd_wqe_type_t		w_type;
154 	ibd_copybuf_t		w_copybuf;
155 	mblk_t			*im_mblk;
156 } ibd_wqe_t;
157 
158 /*
159  * Send WQE
160  */
161 typedef struct ibd_swqe_s {
162 	ibd_wqe_t		w_ibd_swqe;
163 	ibd_wqe_buftype_t	w_buftype;
164 	ibt_send_wr_t		w_swr;
165 	ibd_ace_t		*w_ahandle;
166 	ibt_mi_hdl_t		w_mi_hdl;
167 	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
168 } ibd_swqe_t;
169 
/*
 * Shorthand names for the common WQE fields embedded in an ibd_swqe_t,
 * followed by conversions between the send-WQE view and the common-WQE
 * view of the same object.
 *
 * The conversion macros are fully parenthesized (argument and result) so
 * that they can be used safely inside larger expressions, e.g.
 * WQE_TO_SWQE(wqe)->w_buftype.  WQE_TO_SWQE() is a plain cast and relies
 * on w_ibd_swqe being the first member of ibd_swqe_t.
 */
#define	swqe_next		w_ibd_swqe.w_next
#define	swqe_type		w_ibd_swqe.w_type
#define	swqe_copybuf		w_ibd_swqe.w_copybuf
#define	swqe_im_mblk		w_ibd_swqe.im_mblk
#define	SWQE_TO_WQE(swqe)	((ibd_wqe_t *)&((swqe)->w_ibd_swqe))
#define	WQE_TO_SWQE(wqe)	((ibd_swqe_t *)(wqe))
176 
177 /*
178  * Receive WQE
179  */
180 typedef struct ibd_rwqe_s {
181 	ibd_wqe_t		w_ibd_rwqe;
182 	struct ibd_state_s	*w_state;
183 	ibt_recv_wr_t		w_rwr;
184 	boolean_t		w_freeing_wqe;
185 	frtn_t			w_freemsg_cb;
186 } ibd_rwqe_t;
187 
/*
 * Shorthand names for the common WQE fields embedded in an ibd_rwqe_t,
 * followed by conversions between the receive-WQE view and the common-WQE
 * view of the same object.
 *
 * The conversion macros are fully parenthesized (argument and result) so
 * that they can be used safely inside larger expressions, e.g.
 * WQE_TO_RWQE(wqe)->w_state.  WQE_TO_RWQE() is a plain cast and relies on
 * w_ibd_rwqe being the first member of ibd_rwqe_t.
 */
#define	rwqe_next		w_ibd_rwqe.w_next
#define	rwqe_type		w_ibd_rwqe.w_type
#define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
#define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
#define	RWQE_TO_WQE(rwqe)	((ibd_wqe_t *)&((rwqe)->w_ibd_rwqe))
#define	WQE_TO_RWQE(wqe)	((ibd_rwqe_t *)(wqe))
194 
195 typedef struct ibd_list_s {
196 	kmutex_t		dl_mutex;
197 	ibd_wqe_t		*dl_head;
198 	union {
199 		boolean_t	pending_sends;
200 		uint32_t	bufs_outstanding;
201 	} ustat;
202 	uint32_t		dl_cnt;
203 } ibd_list_t;
204 
205 #define	dl_pending_sends	ustat.pending_sends
206 #define	dl_bufs_outstanding	ustat.bufs_outstanding
207 
/*
 * LSO buffers
 *
 * Ordinarily no buffer larger than the MTU should ever be required.
 * However, an IB HCA limits the length of an SGL to far fewer entries
 * than a regular ethernet NIC does, and the network layer makes no
 * attempt to bound the number of mblk fragments in a send mp chain.
 * As a result the driver occasionally has to fall back on these buffers,
 * which are larger than the MTU (strictly, larger than id_tx_buf_sz).
 */
typedef struct ibd_lsobuf_s {
	struct ibd_lsobuf_s	*lb_next;	/* next buffer in chain */
	uint8_t			*lb_buf;
	int			lb_isfree;
} ibd_lsobuf_t;
224 
225 typedef struct ibd_lsobkt_s {
226 	uint8_t		*bkt_mem;
227 	ibd_lsobuf_t	*bkt_bufl;
228 	ibd_lsobuf_t	*bkt_free_head;
229 	ibt_mr_hdl_t	bkt_mr_hdl;
230 	ibt_mr_desc_t	bkt_mr_desc;
231 	uint_t		bkt_nelem;
232 	uint_t		bkt_nfree;
233 } ibd_lsobkt_t;
234 
235 /*
236  * Posting to a single software rx post queue is contentious,
237  * so break it out to (multiple) an array of queues.
238  *
239  * Try to ensure rx_queue structs fall in different cache lines using a filler.
240  * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
241  */
242 #define	RX_QUEUE_CACHE_LINE \
243 	(64 - ((sizeof (kmutex_t) + 2 * sizeof (ibd_wqe_t *) + \
244 	2 * sizeof (uint32_t))))
245 typedef struct ibd_rx_queue_s {
246 	kmutex_t		rx_post_lock;
247 	ibd_wqe_t		*rx_head;
248 	ibd_wqe_t		*rx_tail;
249 	uint32_t		rx_stat;
250 	uint32_t		rx_cnt;
251 	uint8_t			rx_cache_filler[RX_QUEUE_CACHE_LINE];
252 } ibd_rx_queue_t;
253 
254 /*
255  * This structure maintains information per port per HCA
256  * (per network interface).
257  */
258 typedef struct ibd_state_s {
259 	dev_info_t		*id_dip;
260 	ibt_clnt_hdl_t		id_ibt_hdl;
261 	ibt_hca_hdl_t		id_hca_hdl;
262 	ibt_pd_hdl_t		id_pd_hdl;
263 	kmem_cache_t		*id_req_kmc;
264 
265 	ibd_list_t		id_tx_rel_list;
266 
267 	uint32_t		id_max_sqseg;
268 	uint32_t		id_max_sqseg_hiwm;
269 	ibd_list_t		id_tx_list;
270 	ddi_softintr_t		id_tx;
271 	uint32_t		id_tx_sends;
272 
273 	kmutex_t		id_txpost_lock;
274 	ibd_swqe_t		*id_tx_head;
275 	ibd_swqe_t		*id_tx_tail;
276 	int			id_tx_busy;
277 
278 	uint_t			id_tx_buf_sz;
279 	uint8_t			*id_tx_bufs;
280 	ibd_swqe_t		*id_tx_wqes;
281 	ibt_mr_hdl_t		id_tx_mr_hdl;
282 	ibt_mr_desc_t		id_tx_mr_desc;
283 
284 	kmutex_t		id_lso_lock;
285 	ibd_lsobkt_t		*id_lso;
286 
287 	kmutex_t		id_scq_poll_lock;
288 	int			id_scq_poll_busy;
289 
290 	ibt_cq_hdl_t		id_scq_hdl;
291 	ibt_wc_t		*id_txwcs;
292 	uint32_t		id_txwcs_size;
293 
294 	kmutex_t		id_rx_post_lock;
295 	int			id_rx_post_busy;
296 	int			id_rx_nqueues;
297 	ibd_rx_queue_t		*id_rx_queues;
298 	ibd_wqe_t		*id_rx_post_head;
299 
300 	ibd_rwqe_t		*id_rx_wqes;
301 	uint8_t			*id_rx_bufs;
302 	ibt_mr_hdl_t		id_rx_mr_hdl;
303 	ibt_mr_desc_t		id_rx_mr_desc;
304 	uint_t			id_rx_buf_sz;
305 	uint32_t		id_num_rwqe;
306 	ibd_list_t		id_rx_list;
307 	ddi_softintr_t		id_rx;
308 	uint32_t		id_rx_bufs_outstanding_limit;
309 	uint32_t		id_rx_allocb;
310 	uint32_t		id_rx_allocb_failed;
311 	ibd_list_t		id_rx_free_list;
312 
313 	kmutex_t		id_rcq_poll_lock;
314 	int			id_rcq_poll_busy;
315 	uint32_t		id_rxwcs_size;
316 	ibt_wc_t		*id_rxwcs;
317 	ibt_cq_hdl_t		id_rcq_hdl;
318 
319 	ibt_channel_hdl_t	id_chnl_hdl;
320 	ib_pkey_t		id_pkey;
321 	uint16_t		id_pkix;
322 	uint8_t			id_port;
323 	ibt_mcg_info_t		*id_mcinfo;
324 
325 	mac_handle_t		id_mh;
326 	mac_resource_handle_t	id_rh;
327 	ib_gid_t		id_sgid;
328 	ib_qpn_t		id_qpnum;
329 	ipoib_mac_t		id_macaddr;
330 	ib_gid_t		id_mgid;
331 	ipoib_mac_t		id_bcaddr;
332 
333 	int			id_mtu;
334 	uchar_t			id_scope;
335 
336 	kmutex_t		id_acache_req_lock;
337 	kcondvar_t		id_acache_req_cv;
338 	struct list		id_req_list;
339 	kt_did_t		id_async_thrid;
340 
341 	kmutex_t		id_ac_mutex;
342 	ibd_ace_t		*id_ac_hot_ace;
343 	struct list		id_ah_active;
344 	struct list		id_ah_free;
345 	ipoib_mac_t		id_ah_addr;
346 	ibd_req_t		id_ah_req;
347 	char			id_ah_op;
348 	uint64_t		id_ah_error;
349 	ibd_ace_t		*id_ac_list;
350 	mod_hash_t		*id_ah_active_hash;
351 
352 	kmutex_t		id_mc_mutex;
353 	struct list		id_mc_full;
354 	struct list		id_mc_non;
355 
356 	kmutex_t		id_trap_lock;
357 	kcondvar_t		id_trap_cv;
358 	boolean_t		id_trap_stop;
359 	uint32_t		id_trap_inprog;
360 
361 	char			id_prom_op;
362 
363 	kmutex_t		id_sched_lock;
364 	int			id_sched_needed;
365 	int			id_sched_cnt;
366 	int			id_sched_lso_cnt;
367 
368 	kmutex_t		id_link_mutex;
369 	link_state_t		id_link_state;
370 	uint64_t		id_link_speed;
371 
372 	uint64_t		id_num_intrs;
373 	uint64_t		id_tx_short;
374 	uint32_t		id_num_swqe;
375 
376 	uint64_t		id_xmt_bytes;
377 	uint64_t		id_rcv_bytes;
378 	uint64_t		id_multi_xmt;
379 	uint64_t		id_brd_xmt;
380 	uint64_t		id_multi_rcv;
381 	uint64_t		id_brd_rcv;
382 	uint64_t		id_xmt_pkt;
383 	uint64_t		id_rcv_pkt;
384 
385 	uint32_t		id_hwcksum_capab;
386 	boolean_t		id_lso_policy;
387 	boolean_t		id_lso_capable;
388 	uint_t			id_lso_maxlen;
389 	int			id_hca_res_lkey_capab;
390 	ibt_lkey_t		id_res_lkey;
391 
392 	boolean_t		id_bgroup_created;
393 	kmutex_t		id_macst_lock;
394 	kcondvar_t		id_macst_cv;
395 	uint32_t		id_mac_state;
396 } ibd_state_t;
397 
398 #endif /* _KERNEL && !_BOOT */
399 
400 #ifdef __cplusplus
401 }
402 #endif
403 
404 #endif	/* _SYS_IB_CLIENTS_IBD_H */
405