1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2018 Joyent, Inc.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 /*
29  * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
30  */
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/atomic.h>		/* for atomic_add*() */
45 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
46 #include <netinet/in.h>		/* for netinet/ip.h below */
47 #include <netinet/ip.h>		/* for struct ip */
48 #include <inet/common.h>	/* for inet/ip.h below */
49 #include <inet/ip.h>		/* for ipha_t */
50 #include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
51 #include <inet/ip6.h>		/* for ip6_t */
52 #include <netinet/icmp6.h>	/* for icmp6_t */
53 
54 #include <sys/ib/clients/ibd/ibd.h>
55 
extern ibd_global_state_t ibd_gstate;
/*
 * Period, in seconds, at which ibd_rc_conn_timeout_call() re-arms itself
 * (it is converted with SEC_TO_TICK() before being handed to timeout()).
 */
extern int ibd_rc_conn_timeout;
/*
 * When set to 1, ibd_rc_alloc_chan() registers a DDI_SOFTINT_LOW soft
 * interrupt running ibd_rc_tx_recycle() for every Tx channel, and
 * ibd_rc_free_chan() removes it again.
 */
uint_t ibd_rc_tx_softintr = 1;
/*
 * If the number of WRs in the receive queue of an RC connection is less than
 * IBD_RC_RX_WR_THRESHOLD, we will post more receive WRs into it.
 */
#define	IBD_RC_RX_WR_THRESHOLD		0x20

/*
 * If the number of free SWQEs (or large Tx bufs) is larger than or equal to
 * IBD_RC_TX_FREE_THRESH, we will call mac_tx_update to notify GLD to continue
 * transmitting packets.
 */
#define	IBD_RC_TX_FREE_THRESH		8

/*
 * Build a 64-bit service ID by OR-ing the low 24 bits of a QPN into
 * IBD_RC_SERVICE_ID.
 */
#define	IBD_RC_QPN_TO_SID(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))

/*
 * For interop with legacy OFED: same construction as IBD_RC_QPN_TO_SID(),
 * but based on IBD_RC_SERVICE_ID_OFED_INTEROP.
 */
#define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))

/*
 * Internet Header + 64 bits of Data Datagram. Refer to RFC 792.
 *
 * Amount of the offending packet, beyond its IP header(s), that
 * ibd_async_rc_process_too_big() keeps in the ICMP error it builds.
 * NOTE(review): RFC 792 speaks of 64 bits, while this value is applied as
 * a byte count (64 bytes) -- confirm that this is intentional.
 */
#define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64
81 
82 
/* Functions for Reliable Connected Mode */
/* Connection Setup/Close Functions */
static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static void ibd_rc_act_close(ibd_rc_chan_t *, boolean_t);

/*
 * Channel list helpers. Each one takes the list's chan_list_mutex
 * internally, so callers must not already hold it.
 */
static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
    ibd_rc_chan_list_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);

/*
 * CQ handlers. ibd_rc_alloc_chan() installs ibd_rc_rcq_handler() on a
 * channel's receive CQ and ibd_rc_scq_handler() on its send CQ.
 */
static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);

/* Receive Functions */
static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
static void ibd_rc_srq_freemsg_cb(char *);
static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *);

static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_freemsg_cb(char *);
static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);


/* Send Functions */
static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
static int ibd_rc_init_txlist(ibd_rc_chan_t *);
static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
/* Registered as the Tx soft interrupt handler by ibd_rc_alloc_chan(). */
static uint_t ibd_rc_tx_recycle(caddr_t);
120 
121 
122 void
ibd_async_rc_close_act_chan(ibd_state_t * state,ibd_req_t * req)123 ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
124 {
125 	ibd_rc_chan_t *rc_chan = req->rq_ptr;
126 	ibd_ace_t *ace;
127 
128 	while (rc_chan != NULL) {
129 		ace = rc_chan->ace;
130 		ASSERT(ace != NULL);
131 		/* Close old RC channel */
132 		ibd_rc_act_close(rc_chan, B_TRUE);
133 		mutex_enter(&state->id_ac_mutex);
134 		ASSERT(ace->ac_ref != 0);
135 		atomic_dec_32(&ace->ac_ref);
136 		ace->ac_chan = NULL;
137 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
138 			IBD_ACACHE_INSERT_FREE(state, ace);
139 			ace->ac_ref = 0;
140 		} else {
141 			ace->ac_ref |= CYCLEVAL;
142 			state->rc_delay_ace_recycle++;
143 		}
144 		mutex_exit(&state->id_ac_mutex);
145 		rc_chan = ibd_rc_rm_header_chan_list(
146 		    &state->rc_obs_act_chan_list);
147 	}
148 }
149 
150 void
ibd_async_rc_recycle_ace(ibd_state_t * state,ibd_req_t * req)151 ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
152 {
153 	ibd_ace_t *ace = req->rq_ptr;
154 	ibd_rc_chan_t *rc_chan;
155 
156 	ASSERT(ace != NULL);
157 	rc_chan = ace->ac_chan;
158 	ASSERT(rc_chan != NULL);
159 	/* Close old RC channel */
160 	ibd_rc_act_close(rc_chan, B_TRUE);
161 	mutex_enter(&state->id_ac_mutex);
162 	ASSERT(ace->ac_ref != 0);
163 	atomic_dec_32(&ace->ac_ref);
164 	ace->ac_chan = NULL;
165 	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
166 		IBD_ACACHE_INSERT_FREE(state, ace);
167 		ace->ac_ref = 0;
168 	} else {
169 		ace->ac_ref |= CYCLEVAL;
170 		state->rc_delay_ace_recycle++;
171 	}
172 	mutex_exit(&state->id_ac_mutex);
173 	mutex_enter(&state->rc_ace_recycle_lock);
174 	state->rc_ace_recycle = NULL;
175 	mutex_exit(&state->rc_ace_recycle_lock);
176 }
177 
/*
 * Simple ICMP IP Header Template.
 *
 * ibd_async_rc_process_too_big() copies this into the IP header of the ICMP
 * message it builds and then fills in the source, destination, TTL, length
 * and header checksum for the specific packet. The initializer is
 * positional, so it relies on the member order of ipha_t.
 */
static const ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
182 
183 /* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
184 void
ibd_async_rc_process_too_big(ibd_state_t * state,ibd_req_t * req)185 ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
186 {
187 	mblk_t *mp = req->rq_ptr;
188 	ibd_ace_t *ace = req->rq_ptr2;
189 	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
190 	uint_t	len_needed;
191 	size_t	msg_len;
192 	mblk_t	*pmtu_mp;
193 	ushort_t	sap;
194 	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
195 	/*
196 	 * ipha: IP header for pmtu_pkt
197 	 * old_ipha: IP header for old packet
198 	 */
199 	ipha_t *ipha, *old_ipha;
200 	icmph_t	*icmph;
201 
202 	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);
203 
204 	if (!pullupmsg(mp, -1)) {
205 		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
206 		goto too_big_fail;
207 	}
208 	/* move to IP header. */
209 	mp->b_rptr += IPOIB_HDRSIZE;
210 	old_ipha = (ipha_t *)mp->b_rptr;
211 
212 	len_needed = IPH_HDR_LENGTH(old_ipha);
213 	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
214 		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
215 		    len_needed));
216 	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
217 		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
218 		    + len_needed);
219 		len_needed += ip_hdr_length_v6(mp, ip6h);
220 	}
221 	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
222 	msg_len = msgdsize(mp);
223 	if (msg_len > len_needed) {
224 		(void) adjmsg(mp, len_needed - msg_len);
225 		msg_len = len_needed;
226 	}
227 
228 	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
229 	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
230 		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
231 		goto too_big_fail;
232 	}
233 	pmtu_mp->b_cont = mp;
234 	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
235 	    + sizeof (ipha_t) + sizeof (icmph_t);
236 
237 	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;
238 
239 	/* Fill IB header */
240 	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
241 	/*
242 	 * If the GRH is not valid, indicate to GLDv3 by setting
243 	 * the VerTcFlow field to 0.
244 	 */
245 	ibha->ib_grh.ipoib_vertcflow = 0;
246 	ibha->ipib_rhdr.ipoib_type = htons(sap);
247 	ibha->ipib_rhdr.ipoib_mbz = 0;
248 
249 	/* Fill IP header */
250 	ipha = (ipha_t *)&ibha[1];
251 	*ipha = icmp_ipha;
252 	ipha->ipha_src = old_ipha->ipha_dst;
253 	ipha->ipha_dst = old_ipha->ipha_src;
254 	ipha->ipha_ttl = old_ipha->ipha_ttl;
255 	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
256 	if (msg_len > IP_MAXPACKET) {
257 		ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
258 		    "> IP_MAXPACKET", (uint32_t)msg_len);
259 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
260 		msg_len = IP_MAXPACKET;
261 	}
262 	ipha->ipha_length = htons((uint16_t)msg_len);
263 	ipha->ipha_hdr_checksum = 0;
264 	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
265 
266 	/* Fill ICMP body */
267 	icmph = (icmph_t *)&ipha[1];
268 	bzero(icmph, sizeof (icmph_t));
269 	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
270 	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
271 	icmph->icmph_du_mtu = htons(mtu);
272 	icmph->icmph_checksum = 0;
273 	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
274 	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);
275 
276 	mac_hcksum_set(pmtu_mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
277 
278 	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
279 	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
280 	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
281 	    len_needed, (uint32_t)msg_len);
282 
283 	mac_rx(state->id_mh, state->id_rh, pmtu_mp);
284 
285 	mutex_enter(&ace->tx_too_big_mutex);
286 	ace->tx_too_big_ongoing = B_FALSE;
287 	mutex_exit(&ace->tx_too_big_mutex);
288 	return;
289 
290 too_big_fail:
291 	/* Drop packet */
292 	freemsg(mp);
293 	mutex_enter(&ace->tx_too_big_mutex);
294 	ace->tx_too_big_ongoing = B_FALSE;
295 	mutex_exit(&ace->tx_too_big_mutex);
296 }
297 
298 /*
299  * Check all active/passive channels. If any ative/passive
300  * channel has not been used for a long time, close it.
301  */
302 void
ibd_rc_conn_timeout_call(void * carg)303 ibd_rc_conn_timeout_call(void *carg)
304 {
305 	ibd_state_t *state = carg;
306 	ibd_ace_t *ace, *pre_ace;
307 	ibd_rc_chan_t *chan, *pre_chan, *next_chan;
308 	ibd_req_t *req;
309 
310 	/* Check all active channels. If chan->is_used == B_FALSE, close it */
311 	mutex_enter(&state->id_ac_mutex);
312 	ace = list_head(&state->id_ah_active);
313 	while ((pre_ace = ace) != NULL) {
314 		ace = list_next(&state->id_ah_active, ace);
315 		if (pre_ace->ac_chan != NULL) {
316 			chan = pre_ace->ac_chan;
317 			ASSERT(state->id_enable_rc == B_TRUE);
318 			if (chan->chan_state == IBD_RC_STATE_ACT_ESTAB) {
319 				if (chan->is_used == B_FALSE) {
320 					state->rc_timeout_act++;
321 					INC_REF(pre_ace, 1);
322 					IBD_ACACHE_PULLOUT_ACTIVE(state,
323 					    pre_ace);
324 					chan->chan_state =
325 					    IBD_RC_STATE_ACT_CLOSING;
326 					ibd_rc_signal_act_close(state, pre_ace);
327 				} else {
328 					chan->is_used = B_FALSE;
329 				}
330 			}
331 		}
332 	}
333 	mutex_exit(&state->id_ac_mutex);
334 
335 	/* Check all passive channels. If chan->is_used == B_FALSE, close it */
336 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
337 	next_chan = state->rc_pass_chan_list.chan_list;
338 	pre_chan = NULL;
339 	while ((chan = next_chan) != NULL) {
340 		next_chan = chan->next;
341 		if (chan->is_used == B_FALSE) {
342 			req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
343 			if (req != NULL) {
344 				/* remove it */
345 				state->rc_timeout_pas++;
346 				req->rq_ptr = chan;
347 				ibd_queue_work_slot(state, req,
348 				    IBD_ASYNC_RC_CLOSE_PAS_CHAN);
349 			} else {
350 				ibd_print_warn(state, "ibd_rc_conn_timeout: "
351 				    "alloc ibd_req_t fail");
352 				if (pre_chan == NULL) {
353 					state->rc_pass_chan_list.chan_list =
354 					    chan;
355 				} else {
356 					pre_chan->next = chan;
357 				}
358 				pre_chan = chan;
359 			}
360 		} else {
361 			if (pre_chan == NULL) {
362 				state->rc_pass_chan_list.chan_list = chan;
363 			} else {
364 				pre_chan->next = chan;
365 			}
366 			pre_chan = chan;
367 			chan->is_used = B_FALSE;
368 		}
369 	}
370 	if (pre_chan != NULL) {
371 		pre_chan->next = NULL;
372 	} else {
373 		state->rc_pass_chan_list.chan_list = NULL;
374 	}
375 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
376 
377 	mutex_enter(&state->rc_timeout_lock);
378 	if (state->rc_timeout_start == B_TRUE) {
379 		state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
380 		    SEC_TO_TICK(ibd_rc_conn_timeout));
381 	}
382 	mutex_exit(&state->rc_timeout_lock);
383 }
384 
385 #ifdef DEBUG
386 /*
387  * ibd_rc_update_stats - update driver private kstat counters
388  *
389  * This routine will dump the internal statistics counters for ibd's
390  * Reliable Connected Mode. The current stats dump values will
391  * be sent to the kernel status area.
392  */
393 static int
ibd_rc_update_stats(kstat_t * ksp,int rw)394 ibd_rc_update_stats(kstat_t *ksp, int rw)
395 {
396 	ibd_state_t *state;
397 	ibd_rc_stat_t *ibd_rc_ksp;
398 
399 	if (rw == KSTAT_WRITE)
400 		return (EACCES);
401 
402 	state = (ibd_state_t *)ksp->ks_private;
403 	ASSERT(state != NULL);
404 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
405 
406 	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
407 	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
408 	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
409 	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
410 	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;
411 
412 	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
413 
414 	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;
415 
416 	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
417 	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
418 	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
419 	    state->rc_xmt_fragmented_pkt;
420 	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
421 	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
422 	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;
423 
424 	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
425 	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
426 	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
427 	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
428 	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
429 	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
430 	    state->rc_xmt_buf_mac_update;
431 
432 	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
433 	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
434 	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
435 	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;
436 
437 	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
438 	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
439 	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
440 	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
441 	    state->rc_act_close_simultaneous;
442 	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
443 	ibd_rc_ksp->rc_timeout_act.value.ul = state->rc_timeout_act;
444 	ibd_rc_ksp->rc_timeout_pas.value.ul = state->rc_timeout_pas;
445 
446 	return (0);
447 }
448 
449 
450 /*
451  * ibd_rc_init_stats - initialize kstat data structures
452  *
453  * This routine will create and initialize the driver private
454  * statistics counters.
455  */
456 int
ibd_rc_init_stats(ibd_state_t * state)457 ibd_rc_init_stats(ibd_state_t *state)
458 {
459 	kstat_t *ksp;
460 	ibd_rc_stat_t *ibd_rc_ksp;
461 	char stat_name[KSTAT_STRLEN];
462 	int inst;
463 
464 	/*
465 	 * Create and init kstat
466 	 */
467 	inst = ddi_get_instance(state->id_dip);
468 	(void) snprintf(stat_name, KSTAT_STRLEN, "statistics%d_%x_%u", inst,
469 	    state->id_pkey, state->id_plinkid);
470 	ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
471 	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);
472 
473 	if (ksp == NULL) {
474 		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
475 		    "kernel statistics");
476 		return (DDI_FAILURE);
477 	}
478 
479 	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */
480 
481 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
482 
483 	/*
484 	 * Initialize all the statistics
485 	 */
486 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
487 	    "transfer mode", KSTAT_DATA_ULONG);
488 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
489 	    "transfer mode", KSTAT_DATA_ULONG);
490 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
491 	    "copy mode", KSTAT_DATA_ULONG);
492 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
493 	    "copy mode", KSTAT_DATA_ULONG);
494 	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
495 	    KSTAT_DATA_ULONG);
496 
497 	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
498 	    KSTAT_DATA_ULONG);
499 
500 	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
501 	    KSTAT_DATA_ULONG);
502 
503 	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
504 	    KSTAT_DATA_ULONG);
505 	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
506 	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
507 	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
508 	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
509 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
510 	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
511 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
512 	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
513 	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
514 	    KSTAT_DATA_ULONG);
515 
516 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
517 	    "recycle", KSTAT_DATA_ULONG);
518 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
519 	    "after recycle", KSTAT_DATA_ULONG);
520 	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
521 	    KSTAT_DATA_ULONG);
522 	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
523 	    "#, swqe available", KSTAT_DATA_ULONG);
524 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
525 	    "ibd_send", KSTAT_DATA_ULONG);
526 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
527 	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
528 
529 	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
530 	    KSTAT_DATA_ULONG);
531 	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
532 	    KSTAT_DATA_ULONG);
533 	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
534 	    "pkt", KSTAT_DATA_ULONG);
535 	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
536 	    "state", KSTAT_DATA_ULONG);
537 
538 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
539 	    KSTAT_DATA_ULONG);
540 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
541 	    KSTAT_DATA_ULONG);
542 	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
543 	    "recycle", KSTAT_DATA_ULONG);
544 	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
545 	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
546 	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
547 	    KSTAT_DATA_ULONG);
548 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: timeout act side",
549 	    KSTAT_DATA_ULONG);
550 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: timeout pas side",
551 	    KSTAT_DATA_ULONG);
552 
553 	/*
554 	 * Function to provide kernel stat update on demand
555 	 */
556 	ksp->ks_update = ibd_rc_update_stats;
557 
558 	/*
559 	 * Pointer into provider's raw statistics
560 	 */
561 	ksp->ks_private = (void *)state;
562 
563 	/*
564 	 * Add kstat to systems kstat chain
565 	 */
566 	kstat_install(ksp);
567 
568 	return (DDI_SUCCESS);
569 }
570 #endif
571 
572 static ibt_status_t
ibd_rc_alloc_chan(ibd_rc_chan_t ** ret_chan,ibd_state_t * state,boolean_t is_tx_chan)573 ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
574     boolean_t is_tx_chan)
575 {
576 	ibt_status_t result;
577 	ibd_rc_chan_t *chan;
578 	ibt_rc_chan_alloc_args_t alloc_args;
579 	ibt_chan_alloc_flags_t alloc_flags;
580 	ibt_chan_sizes_t sizes;
581 	ibt_cq_attr_t cq_atts;
582 	int rv;
583 
584 	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);
585 
586 	chan->state = state;
587 	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
588 	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
589 	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
590 	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
591 	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
592 	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);
593 
594 	/* Allocate IB structures for a new RC channel. */
595 	if (is_tx_chan) {
596 		chan->scq_size = state->id_rc_num_swqe;
597 		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
598 	} else {
599 		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
600 		chan->rcq_size = state->id_rc_num_rwqe;
601 	}
602 	cq_atts.cq_size = chan->scq_size;
603 	cq_atts.cq_sched = NULL;
604 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
605 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
606 	    &chan->scq_size);
607 	if (result != IBT_SUCCESS) {
608 		DPRINT(40, "ibd_rc_alloc_chan: error <%d>"
609 		    "create scq completion queue (size <%d>)",
610 		    result, chan->scq_size);
611 		goto alloc_scq_err;
612 	}	/* if failure to alloc cq */
613 
614 	if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count,
615 	    state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) {
616 		DPRINT(30, "ibd_rc_alloc_chan: Send CQ "
617 		    "interrupt moderation failed");
618 	}
619 
620 	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
621 	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
622 	    (void *) (uintptr_t)chan);
623 
624 	cq_atts.cq_size = chan->rcq_size;
625 	cq_atts.cq_sched = NULL;
626 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
627 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
628 	    &chan->rcq_size);
629 	if (result != IBT_SUCCESS) {
630 		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
631 		    "rx completion queue (size <%d>)", result, chan->rcq_size);
632 		goto alloc_rcq_err;
633 	}	/* if failure to alloc cq */
634 
635 	if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count,
636 	    state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) {
637 		DPRINT(30, "ibd_rc_alloc_chan: Receive CQ "
638 		    "interrupt moderation failed");
639 	}
640 
641 	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
642 	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
643 	    (void *)(uintptr_t)chan);
644 
645 	if (is_tx_chan) {
646 		chan->is_tx_chan = B_TRUE;
647 		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
648 			ibd_print_warn(state, "ibd_rc_alloc_chan: "
649 			    "ibd_rc_init_txlist failed");
650 			goto init_txlist_err;
651 		}
652 		if (ibd_rc_tx_softintr == 1) {
653 			if ((rv = ddi_add_softintr(state->id_dip,
654 			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
655 			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
656 			    DDI_SUCCESS) {
657 				DPRINT(10, "ibd_rc_alloc_chan: failed in "
658 				    "ddi_add_softintr(scq_softintr), ret=%d",
659 				    rv);
660 				goto alloc_softintr_err;
661 			}
662 		}
663 	} else {
664 		chan->is_tx_chan = B_FALSE;
665 	}
666 
667 	/*
668 	 * enable completions
669 	 */
670 	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
671 	if (result != IBT_SUCCESS) {
672 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
673 		    "(scq) failed: status %d\n", result);
674 		goto alloc_scq_enable_err;
675 	}
676 
677 	/* We will enable chan->rcq_hdl later. */
678 
679 	/* alloc a RC channel */
680 	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
681 	bzero(&sizes, sizeof (ibt_chan_sizes_t));
682 
683 	alloc_args.rc_flags = IBT_WR_SIGNALED;
684 	alloc_args.rc_control = IBT_CEP_NO_FLAGS;
685 
686 	alloc_args.rc_scq = chan->scq_hdl;
687 	alloc_args.rc_rcq = chan->rcq_hdl;
688 	alloc_args.rc_pd = state->id_pd_hdl;
689 
690 	alloc_args.rc_hca_port_num = state->id_port;
691 	alloc_args.rc_clone_chan = NULL;
692 
693 	/* scatter/gather */
694 	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;
695 
696 	/*
697 	 * For the number of SGL elements in receive side, I think it
698 	 * should be 1. Because ibd driver allocates a whole block memory
699 	 * for each ibt_post_recv().
700 	 */
701 	alloc_args.rc_sizes.cs_rq_sgl = 1;
702 
703 	/* The send queue size and the receive queue size */
704 	alloc_args.rc_sizes.cs_sq = chan->scq_size;
705 	alloc_args.rc_sizes.cs_rq = chan->rcq_size;
706 
707 	if (state->id_hca_res_lkey_capab) {
708 		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
709 	} else {
710 		DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
711 	}
712 
713 	if (state->rc_enable_srq) {
714 		alloc_flags = IBT_ACHAN_USES_SRQ;
715 		alloc_args.rc_srq = state->rc_srq_hdl;
716 	} else {
717 		alloc_flags = IBT_ACHAN_NO_FLAGS;
718 	}
719 
720 	result = ibt_alloc_rc_channel(state->id_hca_hdl,
721 	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
722 	if (result != IBT_SUCCESS) {
723 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel"
724 		    " fail:<%d>", result);
725 		goto alloc_scq_enable_err;
726 	}
727 
728 	if (is_tx_chan)
729 		atomic_inc_32(&state->rc_num_tx_chan);
730 	else
731 		atomic_inc_32(&state->rc_num_rx_chan);
732 
733 	/* For the connection reaper routine ibd_rc_conn_timeout_call() */
734 	chan->is_used = B_TRUE;
735 
736 	*ret_chan = chan;
737 	return (IBT_SUCCESS);
738 
739 alloc_scq_enable_err:
740 	if (is_tx_chan) {
741 		if (ibd_rc_tx_softintr == 1) {
742 			ddi_remove_softintr(chan->scq_softintr);
743 		}
744 	}
745 alloc_softintr_err:
746 	if (is_tx_chan) {
747 		ibd_rc_fini_txlist(chan);
748 	}
749 init_txlist_err:
750 	(void) ibt_free_cq(chan->rcq_hdl);
751 alloc_rcq_err:
752 	(void) ibt_free_cq(chan->scq_hdl);
753 alloc_scq_err:
754 	mutex_destroy(&chan->tx_poll_lock);
755 	mutex_destroy(&chan->tx_post_lock);
756 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
757 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
758 	mutex_destroy(&chan->rx_free_list.dl_mutex);
759 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
760 	kmem_free(chan, sizeof (ibd_rc_chan_t));
761 	return (result);
762 }
763 
764 static void
ibd_rc_free_chan(ibd_rc_chan_t * chan)765 ibd_rc_free_chan(ibd_rc_chan_t *chan)
766 {
767 	ibt_status_t ret;
768 
769 	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */
770 
771 	if (chan->chan_hdl != NULL) {
772 		ret = ibt_free_channel(chan->chan_hdl);
773 		if (ret != IBT_SUCCESS) {
774 			DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, "
775 			    "chan=%p, returned: %d", chan, ret);
776 			return;
777 		}
778 		chan->chan_hdl = NULL;
779 	}
780 
781 	if (chan->rcq_hdl != NULL) {
782 		ret = ibt_free_cq(chan->rcq_hdl);
783 		if (ret != IBT_SUCCESS) {
784 			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, "
785 			    "chan=%p, returned: %d", chan, ret);
786 			return;
787 		}
788 		chan->rcq_hdl = NULL;
789 	}
790 
791 	if (chan->scq_hdl != NULL) {
792 		ret = ibt_free_cq(chan->scq_hdl);
793 		if (ret != IBT_SUCCESS) {
794 			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, "
795 			    "chan=%p, returned: %d", chan, ret);
796 			return;
797 		}
798 		chan->scq_hdl = NULL;
799 	}
800 
801 	/* Free buffers */
802 	if (chan->is_tx_chan) {
803 		ibd_rc_fini_txlist(chan);
804 		if (ibd_rc_tx_softintr == 1) {
805 			ddi_remove_softintr(chan->scq_softintr);
806 		}
807 		atomic_dec_32(&chan->state->rc_num_tx_chan);
808 	} else {
809 		if (!chan->state->rc_enable_srq) {
810 			ibd_rc_fini_rxlist(chan);
811 		}
812 		atomic_dec_32(&chan->state->rc_num_rx_chan);
813 	}
814 
815 	mutex_destroy(&chan->tx_poll_lock);
816 	mutex_destroy(&chan->tx_post_lock);
817 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
818 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
819 	mutex_destroy(&chan->rx_free_list.dl_mutex);
820 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
821 
822 	/*
823 	 * If it is a passive channel, must make sure it has been removed
824 	 * from chan->state->rc_pass_chan_list
825 	 */
826 	kmem_free(chan, sizeof (ibd_rc_chan_t));
827 }
828 
829 /* Add a RC channel */
830 static inline void
ibd_rc_add_to_chan_list(ibd_rc_chan_list_t * list,ibd_rc_chan_t * chan)831 ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
832 {
833 	mutex_enter(&list->chan_list_mutex);
834 	if (list->chan_list == NULL) {
835 		list->chan_list = chan;
836 		chan->next = NULL;
837 	} else {
838 		chan->next = list->chan_list;
839 		list->chan_list = chan;
840 	}
841 	mutex_exit(&list->chan_list_mutex);
842 }
843 
844 static boolean_t
ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t * chan)845 ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t *chan)
846 {
847 	ibd_state_t *state = chan->state;
848 
849 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
850 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0) {
851 		mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
852 		return (B_FALSE);
853 	} else {
854 		if (state->rc_pass_chan_list.chan_list == NULL) {
855 			state->rc_pass_chan_list.chan_list = chan;
856 			chan->next = NULL;
857 		} else {
858 			chan->next = state->rc_pass_chan_list.chan_list;
859 			state->rc_pass_chan_list.chan_list = chan;
860 		}
861 		mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
862 		return (B_TRUE);
863 	}
864 }
865 
866 /* Remove a RC channel */
867 static inline ibd_rc_chan_t *
ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t * list,ibd_rc_chan_t * chan)868 ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
869 {
870 	ibd_rc_chan_t *pre_chan;
871 
872 	mutex_enter(&list->chan_list_mutex);
873 	if (list->chan_list == chan) {
874 		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
875 		    " in chan_list", chan);
876 		list->chan_list = chan->next;
877 	} else {
878 		pre_chan = list->chan_list;
879 		while (pre_chan != NULL) {
880 			if (pre_chan->next == chan) {
881 				DPRINT(30, "ibd_rc_rm_from_chan_list"
882 				    "(middle): found chan(%p)", chan);
883 				pre_chan->next = chan->next;
884 				break;
885 			}
886 			pre_chan = pre_chan->next;
887 		}
888 		if (pre_chan == NULL)
889 			chan = NULL;
890 	}
891 	mutex_exit(&list->chan_list_mutex);
892 	return (chan);
893 }
894 
895 static inline ibd_rc_chan_t *
ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t * list)896 ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
897 {
898 	ibd_rc_chan_t *rc_chan;
899 
900 	mutex_enter(&list->chan_list_mutex);
901 	rc_chan = list->chan_list;
902 	if (rc_chan != NULL) {
903 		list->chan_list = rc_chan->next;
904 	}
905 	mutex_exit(&list->chan_list_mutex);
906 	return (rc_chan);
907 }
908 
909 static int
ibd_rc_alloc_srq_copybufs(ibd_state_t * state)910 ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
911 {
912 	ibt_mr_attr_t mem_attr;
913 	uint_t rc_rx_bufs_sz;
914 
915 	/*
916 	 * Allocate one big chunk for all regular rx copy bufs
917 	 */
918 	rc_rx_bufs_sz =  (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;
919 
920 	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
921 
922 	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
923 	    sizeof (ibd_rwqe_t), KM_SLEEP);
924 
925 	/*
926 	 * Do one memory registration on the entire rxbuf area
927 	 */
928 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
929 	mem_attr.mr_len = rc_rx_bufs_sz;
930 	mem_attr.mr_as = NULL;
931 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
932 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
933 	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
934 	    != IBT_SUCCESS) {
935 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
936 		    "failed");
937 		kmem_free(state->rc_srq_rwqes,
938 		    state->rc_srq_size * sizeof (ibd_rwqe_t));
939 		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
940 		state->rc_srq_rx_bufs = NULL;
941 		state->rc_srq_rwqes = NULL;
942 		return (DDI_FAILURE);
943 	}
944 
945 	return (DDI_SUCCESS);
946 }
947 
948 static void
ibd_rc_free_srq_copybufs(ibd_state_t * state)949 ibd_rc_free_srq_copybufs(ibd_state_t *state)
950 {
951 	uint_t rc_rx_buf_sz;
952 
953 	/*
954 	 * Don't change the value of state->rc_mtu at the period from call
955 	 * ibd_rc_alloc_srq_copybufs() to call ibd_rc_free_srq_copybufs().
956 	 */
957 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
958 
959 	/*
960 	 * Unregister rxbuf mr
961 	 */
962 	if (ibt_deregister_mr(state->id_hca_hdl,
963 	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
964 		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
965 		    " failed");
966 	}
967 	state->rc_srq_rx_mr_hdl = NULL;
968 
969 	/*
970 	 * Free rxbuf memory
971 	 */
972 	kmem_free(state->rc_srq_rwqes,
973 	    state->rc_srq_size * sizeof (ibd_rwqe_t));
974 	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
975 	state->rc_srq_rwqes = NULL;
976 	state->rc_srq_rx_bufs = NULL;
977 }
978 
979 /*
980  * Allocate and post a certain number of SRQ receive buffers and WRs.
981  */
982 int
ibd_rc_init_srq_list(ibd_state_t * state)983 ibd_rc_init_srq_list(ibd_state_t *state)
984 {
985 	ibd_rwqe_t *rwqe;
986 	ibt_lkey_t lkey;
987 	int i;
988 	uint_t len;
989 	uint8_t *bufaddr;
990 	ibt_srq_sizes_t srq_sizes;
991 	ibt_srq_sizes_t	 srq_real_sizes;
992 	ibt_status_t ret;
993 
994 	srq_sizes.srq_sgl_sz = 1;
995 	srq_sizes.srq_wr_sz = state->id_rc_num_srq;
996 	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
997 	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
998 	if (ret != IBT_SUCCESS) {
999 		/*
1000 		 * The following code is for CR 6932460 (can't configure ibd
1001 		 * interface on 32 bits x86 systems). 32 bits x86 system has
1002 		 * less memory resource than 64 bits x86 system. If current
1003 		 * resource request can't be satisfied, we request less
1004 		 * resource here.
1005 		 */
1006 		len = state->id_rc_num_srq;
1007 		while ((ret == IBT_HCA_WR_EXCEEDED) &&
1008 		    (len >= 2 * IBD_RC_MIN_CQ_SIZE)) {
1009 			len = len/2;
1010 			srq_sizes.srq_sgl_sz = 1;
1011 			srq_sizes.srq_wr_sz = len;
1012 			ret = ibt_alloc_srq(state->id_hca_hdl,
1013 			    IBT_SRQ_NO_FLAGS, state->id_pd_hdl, &srq_sizes,
1014 			    &state->rc_srq_hdl, &srq_real_sizes);
1015 		}
1016 		if (ret != IBT_SUCCESS) {
1017 			DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
1018 			    "req_sgl_sz=%d, req_wr_sz=0x%x, final_req_wr_sz="
1019 			    "0x%x, ret=%d", srq_sizes.srq_sgl_sz,
1020 			    srq_sizes.srq_wr_sz, len, ret);
1021 			return (DDI_FAILURE);
1022 		}
1023 		state->id_rc_num_srq = len;
1024 		state->id_rc_num_rwqe = state->id_rc_num_srq + 1;
1025 	}
1026 
1027 	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
1028 	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
1029 		ret = ibt_free_srq(state->rc_srq_hdl);
1030 		if (ret != IBT_SUCCESS) {
1031 			ibd_print_warn(state, "ibd_rc_init_srq_list: "
1032 			    "ibt_free_srq fail, ret=%d", ret);
1033 		}
1034 		return (DDI_FAILURE);
1035 	}
1036 
1037 	/*
1038 	 * Allocate and setup the rwqe list
1039 	 */
1040 	lkey = state->rc_srq_rx_mr_desc.md_lkey;
1041 	rwqe = state->rc_srq_rwqes;
1042 	bufaddr = state->rc_srq_rx_bufs;
1043 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1044 	state->rc_srq_rwqe_list.dl_cnt = 0;
1045 	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
1046 	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
1047 		rwqe->w_state = state;
1048 		rwqe->w_freeing_wqe = B_FALSE;
1049 		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
1050 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1051 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1052 
1053 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1054 		    &rwqe->w_freemsg_cb)) == NULL) {
1055 			DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
1056 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1057 			if (atomic_dec_32_nv(&state->id_running) != 0) {
1058 				cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
1059 				    "id_running was not 1\n");
1060 			}
1061 			ibd_rc_fini_srq_list(state);
1062 			atomic_inc_32(&state->id_running);
1063 			return (DDI_FAILURE);
1064 		}
1065 
1066 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1067 		/* Leave IPOIB_GRH_SIZE space */
1068 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1069 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1070 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1071 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1072 		rwqe->w_rwr.wr_nds = 1;
1073 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1074 		(void) ibd_rc_post_srq(state, rwqe);
1075 	}
1076 
1077 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1078 	state->rc_srq_free_list.dl_head = NULL;
1079 	state->rc_srq_free_list.dl_cnt = 0;
1080 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1081 
1082 	return (DDI_SUCCESS);
1083 }
1084 
1085 /*
1086  * Free the statically allocated Rx buffer list for SRQ.
1087  */
1088 void
ibd_rc_fini_srq_list(ibd_state_t * state)1089 ibd_rc_fini_srq_list(ibd_state_t *state)
1090 {
1091 	ibd_rwqe_t *rwqe;
1092 	int i;
1093 	ibt_status_t ret;
1094 
1095 	ASSERT(state->id_running == 0);
1096 	ret = ibt_free_srq(state->rc_srq_hdl);
1097 	if (ret != IBT_SUCCESS) {
1098 		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
1099 		    "ibt_free_srq fail, ret=%d", ret);
1100 	}
1101 
1102 	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
1103 	rwqe = state->rc_srq_rwqes;
1104 	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
1105 		if (rwqe->rwqe_im_mblk != NULL) {
1106 			rwqe->w_freeing_wqe = B_TRUE;
1107 			freemsg(rwqe->rwqe_im_mblk);
1108 		}
1109 	}
1110 	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);
1111 
1112 	ibd_rc_free_srq_copybufs(state);
1113 }
1114 
1115 /* Repost the elements in state->ib_rc_free_list */
1116 int
ibd_rc_repost_srq_free_list(ibd_state_t * state)1117 ibd_rc_repost_srq_free_list(ibd_state_t *state)
1118 {
1119 	ibd_rwqe_t *rwqe;
1120 	ibd_wqe_t *list;
1121 	uint_t len;
1122 
1123 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1124 	if (state->rc_srq_free_list.dl_head != NULL) {
1125 		/* repost them */
1126 		len = state->rc_mtu + IPOIB_GRH_SIZE;
1127 		list = state->rc_srq_free_list.dl_head;
1128 		state->rc_srq_free_list.dl_head = NULL;
1129 		state->rc_srq_free_list.dl_cnt = 0;
1130 		mutex_exit(&state->rc_srq_free_list.dl_mutex);
1131 		while (list != NULL) {
1132 			rwqe = WQE_TO_RWQE(list);
1133 			if ((rwqe->rwqe_im_mblk == NULL) &&
1134 			    ((rwqe->rwqe_im_mblk = desballoc(
1135 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
1136 			    &rwqe->w_freemsg_cb)) == NULL)) {
1137 				DPRINT(40, "ibd_rc_repost_srq_free_list: "
1138 				    "failed in desballoc()");
1139 				do {
1140 					ibd_rc_srq_free_rwqe(state, rwqe);
1141 					list = list->w_next;
1142 					rwqe = WQE_TO_RWQE(list);
1143 				} while (list != NULL);
1144 				return (DDI_FAILURE);
1145 			}
1146 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1147 				ibd_rc_srq_free_rwqe(state, rwqe);
1148 			}
1149 			list = list->w_next;
1150 		}
1151 		return (DDI_SUCCESS);
1152 	}
1153 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1154 	return (DDI_SUCCESS);
1155 }
1156 
1157 /*
1158  * Free an allocated recv wqe.
1159  */
1160 static void
ibd_rc_srq_free_rwqe(ibd_state_t * state,ibd_rwqe_t * rwqe)1161 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
1162 {
1163 	/*
1164 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1165 	 *
1166 	 * This rwqe is placed on a free list so that it
1167 	 * can be reinstated in future.
1168 	 *
1169 	 * NOTE: no code currently exists to reinstate
1170 	 * these "lost" rwqes.
1171 	 */
1172 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1173 	state->rc_srq_free_list.dl_cnt++;
1174 	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
1175 	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
1176 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1177 }
1178 
1179 static void
ibd_rc_srq_freemsg_cb(char * arg)1180 ibd_rc_srq_freemsg_cb(char *arg)
1181 {
1182 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1183 	ibd_state_t *state = rwqe->w_state;
1184 
1185 	ASSERT(state->rc_enable_srq);
1186 
1187 	/*
1188 	 * If the driver is stopped, just free the rwqe.
1189 	 */
1190 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
1191 		if (!rwqe->w_freeing_wqe) {
1192 			atomic_dec_32(
1193 			    &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1194 			DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed");
1195 			rwqe->rwqe_im_mblk = NULL;
1196 			ibd_rc_srq_free_rwqe(state, rwqe);
1197 		}
1198 		return;
1199 	}
1200 
1201 	atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding);
1202 
1203 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1204 	ASSERT(!rwqe->w_freeing_wqe);
1205 
1206 	/*
1207 	 * Upper layer has released held mblk, so we have
1208 	 * no more use for keeping the old pointer in
1209 	 * our rwqe.
1210 	 */
1211 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1212 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1213 	if (rwqe->rwqe_im_mblk == NULL) {
1214 		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
1215 		ibd_rc_srq_free_rwqe(state, rwqe);
1216 		return;
1217 	}
1218 
1219 	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1220 		ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq"
1221 		    " failed");
1222 		ibd_rc_srq_free_rwqe(state, rwqe);
1223 		return;
1224 	}
1225 }
1226 
1227 /*
1228  * Post a rwqe to the hardware and add it to the Rx list.
1229  */
1230 static int
ibd_rc_post_srq(ibd_state_t * state,ibd_rwqe_t * rwqe)1231 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
1232 {
1233 	/*
1234 	 * Here we should add dl_cnt before post recv, because
1235 	 * we would have to make sure dl_cnt is updated before
1236 	 * the corresponding ibd_rc_process_rx() is called.
1237 	 */
1238 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1239 	atomic_inc_32(&state->rc_srq_rwqe_list.dl_cnt);
1240 	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
1241 	    IBT_SUCCESS) {
1242 		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
1243 		DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed");
1244 		return (DDI_FAILURE);
1245 	}
1246 
1247 	return (DDI_SUCCESS);
1248 }
1249 
1250 /*
1251  * Post a rwqe to the hardware and add it to the Rx list.
1252  */
1253 static int
ibd_rc_post_rwqe(ibd_rc_chan_t * chan,ibd_rwqe_t * rwqe)1254 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1255 {
1256 	/*
1257 	 * Here we should add dl_cnt before post recv, because we would
1258 	 * have to make sure dl_cnt has already updated before
1259 	 * corresponding ibd_rc_process_rx() is called.
1260 	 */
1261 	atomic_inc_32(&chan->rx_wqe_list.dl_cnt);
1262 	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
1263 	    IBT_SUCCESS) {
1264 		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
1265 		DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()");
1266 		return (DDI_FAILURE);
1267 	}
1268 	return (DDI_SUCCESS);
1269 }
1270 
1271 static int
ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t * chan)1272 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
1273 {
1274 	ibd_state_t *state = chan->state;
1275 	ibt_mr_attr_t mem_attr;
1276 	uint_t rc_rx_bufs_sz;
1277 
1278 	/*
1279 	 * Allocate one big chunk for all regular rx copy bufs
1280 	 */
1281 	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;
1282 
1283 	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
1284 
1285 	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
1286 	    sizeof (ibd_rwqe_t), KM_SLEEP);
1287 
1288 	/*
1289 	 * Do one memory registration on the entire rxbuf area
1290 	 */
1291 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
1292 	mem_attr.mr_len = rc_rx_bufs_sz;
1293 	mem_attr.mr_as = NULL;
1294 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
1295 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1296 	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
1297 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed");
1298 		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1299 		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
1300 		chan->rx_bufs = NULL;
1301 		chan->rx_rwqes = NULL;
1302 		return (DDI_FAILURE);
1303 	}
1304 
1305 	return (DDI_SUCCESS);
1306 }
1307 
1308 static void
ibd_rc_free_rx_copybufs(ibd_rc_chan_t * chan)1309 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
1310 {
1311 	ibd_state_t *state = chan->state;
1312 	uint_t rc_rx_buf_sz;
1313 
1314 	ASSERT(!state->rc_enable_srq);
1315 	ASSERT(chan->rx_rwqes != NULL);
1316 	ASSERT(chan->rx_bufs != NULL);
1317 
1318 	/*
1319 	 * Don't change the value of state->rc_mtu at the period from call
1320 	 * ibd_rc_alloc_rx_copybufs() to call ibd_rc_free_rx_copybufs().
1321 	 */
1322 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
1323 
1324 	/*
1325 	 * Unregister rxbuf mr
1326 	 */
1327 	if (ibt_deregister_mr(state->id_hca_hdl,
1328 	    chan->rx_mr_hdl) != IBT_SUCCESS) {
1329 		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
1330 	}
1331 	chan->rx_mr_hdl = NULL;
1332 
1333 	/*
1334 	 * Free rxbuf memory
1335 	 */
1336 	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1337 	chan->rx_rwqes = NULL;
1338 
1339 	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
1340 	chan->rx_bufs = NULL;
1341 }
1342 
1343 /*
1344  * Post a certain number of receive buffers and WRs on a RC channel.
1345  */
1346 static int
ibd_rc_init_rxlist(ibd_rc_chan_t * chan)1347 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1348 {
1349 	ibd_state_t *state = chan->state;
1350 	ibd_rwqe_t *rwqe;
1351 	ibt_lkey_t lkey;
1352 	int i;
1353 	uint_t len;
1354 	uint8_t *bufaddr;
1355 
1356 	ASSERT(!state->rc_enable_srq);
1357 	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1358 		return (DDI_FAILURE);
1359 
1360 	/*
1361 	 * Allocate and setup the rwqe list
1362 	 */
1363 	lkey = chan->rx_mr_desc.md_lkey;
1364 	rwqe = chan->rx_rwqes;
1365 	bufaddr = chan->rx_bufs;
1366 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1367 	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1368 		rwqe->w_state = state;
1369 		rwqe->w_chan = chan;
1370 		rwqe->w_freeing_wqe = B_FALSE;
1371 		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1372 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1373 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1374 
1375 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1376 		    &rwqe->w_freemsg_cb)) == NULL) {
1377 			DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
1378 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1379 			ibd_rc_fini_rxlist(chan);
1380 			return (DDI_FAILURE);
1381 		}
1382 
1383 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1384 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1385 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1386 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1387 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1388 		rwqe->w_rwr.wr_nds = 1;
1389 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1390 		(void) ibd_rc_post_rwqe(chan, rwqe);
1391 	}
1392 
1393 	return (DDI_SUCCESS);
1394 }
1395 
1396 /*
1397  * Free the statically allocated Rx buffer list for SRQ.
1398  */
1399 static void
ibd_rc_fini_rxlist(ibd_rc_chan_t * chan)1400 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1401 {
1402 	ibd_rwqe_t *rwqe;
1403 	int i;
1404 
1405 	if (chan->rx_bufs == NULL) {
1406 		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1407 		return;
1408 	}
1409 
1410 	/* bufs_outstanding must be 0 */
1411 	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1412 	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1413 
1414 	mutex_enter(&chan->rx_wqe_list.dl_mutex);
1415 	rwqe = chan->rx_rwqes;
1416 	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1417 		if (rwqe->rwqe_im_mblk != NULL) {
1418 			rwqe->w_freeing_wqe = B_TRUE;
1419 			freemsg(rwqe->rwqe_im_mblk);
1420 		}
1421 	}
1422 	mutex_exit(&chan->rx_wqe_list.dl_mutex);
1423 
1424 	ibd_rc_free_rx_copybufs(chan);
1425 }
1426 
1427 /*
1428  * Free an allocated recv wqe.
1429  */
1430 static void
ibd_rc_free_rwqe(ibd_rc_chan_t * chan,ibd_rwqe_t * rwqe)1431 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1432 {
1433 	/*
1434 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1435 	 *
1436 	 * This rwqe is placed on a free list so that it
1437 	 * can be reinstated in future.
1438 	 *
1439 	 * NOTE: no code currently exists to reinstate
1440 	 * these "lost" rwqes.
1441 	 */
1442 	mutex_enter(&chan->rx_free_list.dl_mutex);
1443 	chan->rx_free_list.dl_cnt++;
1444 	rwqe->rwqe_next = chan->rx_free_list.dl_head;
1445 	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1446 	mutex_exit(&chan->rx_free_list.dl_mutex);
1447 }
1448 
1449 /*
1450  * Processing to be done after receipt of a packet; hand off to GLD
1451  * in the format expected by GLD.
1452  */
1453 static void
ibd_rc_process_rx(ibd_rc_chan_t * chan,ibd_rwqe_t * rwqe,ibt_wc_t * wc)1454 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1455 {
1456 	ibd_state_t *state = chan->state;
1457 	ib_header_info_t *phdr;
1458 	ipoib_hdr_t *ipibp;
1459 	mblk_t *mp;
1460 	mblk_t *mpc;
1461 	int rxcnt;
1462 	ip6_t *ip6h;
1463 	int len;
1464 
1465 	/*
1466 	 * Track number handed to upper layer, and number still
1467 	 * available to receive packets.
1468 	 */
1469 	if (state->rc_enable_srq) {
1470 		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1471 	} else {
1472 		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1473 	}
1474 
1475 	/*
1476 	 * It can not be a IBA multicast packet.
1477 	 */
1478 	ASSERT(!wc->wc_flags & IBT_WC_GRH_PRESENT);
1479 
1480 	/* For the connection reaper routine ibd_rc_conn_timeout_call() */
1481 	chan->is_used = B_TRUE;
1482 
1483 #ifdef DEBUG
1484 	if (rxcnt < state->id_rc_rx_rwqe_thresh) {
1485 		state->rc_rwqe_short++;
1486 	}
1487 #endif
1488 
1489 	/*
1490 	 * Possibly replenish the Rx pool if needed.
1491 	 */
1492 	if ((rxcnt >= state->id_rc_rx_rwqe_thresh) &&
1493 	    (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) {
1494 		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1495 		atomic_inc_64(&state->rc_rcv_trans_pkt);
1496 
1497 		/*
1498 		 * Record how many rwqe has been occupied by upper
1499 		 * network layer
1500 		 */
1501 		if (state->rc_enable_srq) {
1502 			atomic_inc_32(
1503 			    &state->rc_srq_rwqe_list.dl_bufs_outstanding);
1504 		} else {
1505 			atomic_inc_32(&chan->rx_wqe_list.dl_bufs_outstanding);
1506 		}
1507 		mp = rwqe->rwqe_im_mblk;
1508 	} else {
1509 		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1510 		atomic_inc_64(&state->rc_rcv_copy_pkt);
1511 
1512 		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1513 		    BPRI_HI)) == NULL) {	/* no memory */
1514 			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1515 			state->rc_rcv_alloc_fail++;
1516 			if (state->rc_enable_srq) {
1517 				if (ibd_rc_post_srq(state, rwqe) ==
1518 				    DDI_FAILURE) {
1519 					ibd_rc_srq_free_rwqe(state, rwqe);
1520 				}
1521 			} else {
1522 				if (ibd_rc_post_rwqe(chan, rwqe) ==
1523 				    DDI_FAILURE) {
1524 					ibd_rc_free_rwqe(chan, rwqe);
1525 				}
1526 			}
1527 			return;
1528 		}
1529 
1530 		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1531 		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1532 
1533 		if (state->rc_enable_srq) {
1534 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1535 				ibd_rc_srq_free_rwqe(state, rwqe);
1536 			}
1537 		} else {
1538 			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1539 				ibd_rc_free_rwqe(chan, rwqe);
1540 			}
1541 		}
1542 	}
1543 
1544 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1545 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1546 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1547 		len = ntohs(ip6h->ip6_plen);
1548 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1549 			/* LINTED: E_CONSTANT_CONDITION */
1550 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1551 		}
1552 	}
1553 
1554 	phdr = (ib_header_info_t *)mp->b_rptr;
1555 	phdr->ib_grh.ipoib_vertcflow = 0;
1556 	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1557 	    sizeof (ipoib_mac_t));
1558 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer+ IPOIB_GRH_SIZE;
1559 
1560 	/*
1561 	 * Can RC mode in IB guarantee its checksum correctness?
1562 	 *
1563 	 * mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
1564 	 */
1565 
1566 	/*
1567 	 * Make sure this is NULL or we're in trouble.
1568 	 */
1569 	if (mp->b_next != NULL) {
1570 		ibd_print_warn(state,
1571 		    "ibd_rc_process_rx: got duplicate mp from rcq?");
1572 		mp->b_next = NULL;
1573 	}
1574 
1575 	/*
1576 	 * Add this mp to the list of processed mp's to send to
1577 	 * the nw layer
1578 	 */
1579 	if (state->rc_enable_srq) {
1580 		mutex_enter(&state->rc_rx_lock);
1581 		if (state->rc_rx_mp) {
1582 			ASSERT(state->rc_rx_mp_tail != NULL);
1583 			state->rc_rx_mp_tail->b_next = mp;
1584 		} else {
1585 			ASSERT(state->rc_rx_mp_tail == NULL);
1586 			state->rc_rx_mp = mp;
1587 		}
1588 
1589 		state->rc_rx_mp_tail = mp;
1590 		state->rc_rx_mp_len++;
1591 
1592 		if (state->rc_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1593 			mpc = state->rc_rx_mp;
1594 
1595 			state->rc_rx_mp = NULL;
1596 			state->rc_rx_mp_tail = NULL;
1597 			state->rc_rx_mp_len = 0;
1598 			mutex_exit(&state->rc_rx_lock);
1599 			mac_rx(state->id_mh, NULL, mpc);
1600 		} else {
1601 			mutex_exit(&state->rc_rx_lock);
1602 		}
1603 	} else {
1604 		mutex_enter(&chan->rx_lock);
1605 		if (chan->rx_mp) {
1606 			ASSERT(chan->rx_mp_tail != NULL);
1607 			chan->rx_mp_tail->b_next = mp;
1608 		} else {
1609 			ASSERT(chan->rx_mp_tail == NULL);
1610 			chan->rx_mp = mp;
1611 		}
1612 
1613 		chan->rx_mp_tail = mp;
1614 		chan->rx_mp_len++;
1615 
1616 		if (chan->rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1617 			mpc = chan->rx_mp;
1618 
1619 			chan->rx_mp = NULL;
1620 			chan->rx_mp_tail = NULL;
1621 			chan->rx_mp_len = 0;
1622 			mutex_exit(&chan->rx_lock);
1623 			mac_rx(state->id_mh, NULL, mpc);
1624 		} else {
1625 			mutex_exit(&chan->rx_lock);
1626 		}
1627 	}
1628 }
1629 
1630 /*
1631  * Callback code invoked from STREAMs when the recv data buffer is free
1632  * for recycling.
1633  */
1634 static void
ibd_rc_freemsg_cb(char * arg)1635 ibd_rc_freemsg_cb(char *arg)
1636 {
1637 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1638 	ibd_rc_chan_t *chan = rwqe->w_chan;
1639 	ibd_state_t *state = rwqe->w_state;
1640 
1641 	/*
1642 	 * If the wqe is being destructed, do not attempt recycling.
1643 	 */
1644 	if (rwqe->w_freeing_wqe == B_TRUE) {
1645 		return;
1646 	}
1647 
1648 	ASSERT(!state->rc_enable_srq);
1649 	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1650 
1651 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1652 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1653 	if (rwqe->rwqe_im_mblk == NULL) {
1654 		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1655 		ibd_rc_free_rwqe(chan, rwqe);
1656 		return;
1657 	}
1658 
1659 	/*
1660 	 * Post back to h/w. We could actually have more than
1661 	 * id_num_rwqe WQEs on the list if there were multiple
1662 	 * ibd_freemsg_cb() calls outstanding (since the lock is
1663 	 * not held the entire time). This will start getting
1664 	 * corrected over subsequent ibd_freemsg_cb() calls.
1665 	 */
1666 	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1667 		ibd_rc_free_rwqe(chan, rwqe);
1668 		return;
1669 	}
1670 	atomic_dec_32(&chan->rx_wqe_list.dl_bufs_outstanding);
1671 }
1672 
1673 /*
1674  * Common code for interrupt handling as well as for polling
1675  * for all completed wqe's while detaching.
1676  */
1677 static void
ibd_rc_poll_rcq(ibd_rc_chan_t * chan,ibt_cq_hdl_t cq_hdl)1678 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1679 {
1680 	ibd_wqe_t *wqe;
1681 	ibt_wc_t *wc, *wcs;
1682 	uint_t numwcs, real_numwcs;
1683 	int i;
1684 
1685 	wcs = chan->rx_wc;
1686 	numwcs = IBD_RC_MAX_CQ_WC;
1687 
1688 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1689 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1690 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1691 			if (wc->wc_status != IBT_WC_SUCCESS) {
1692 				chan->state->rc_rcq_err++;
1693 				/*
1694 				 * Channel being torn down.
1695 				 */
1696 				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1697 				    "SUCC, chan=%p", wc->wc_status, chan);
1698 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1699 					/*
1700 					 * Do not invoke Rx handler because
1701 					 * it might add buffers to the Rx pool
1702 					 * when we are trying to deinitialize.
1703 					 */
1704 					continue;
1705 				}
1706 			}
1707 			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1708 		}
1709 	}
1710 }
1711 
1712 /* Receive CQ handler */
1713 /* ARGSUSED */
1714 static void
ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl,void * arg)1715 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1716 {
1717 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
1718 	ibd_state_t *state = chan->state;
1719 
1720 	atomic_inc_32(&chan->rcq_invoking);
1721 	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);
1722 
1723 	/*
1724 	 * Poll for completed entries; the CQ will not interrupt any
1725 	 * more for incoming (or transmitted) packets.
1726 	 */
1727 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1728 
1729 	/*
1730 	 * Now enable CQ notifications; all packets that arrive now
1731 	 * (or complete transmission) will cause new interrupts.
1732 	 */
1733 	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
1734 	    IBT_SUCCESS) {
1735 		/*
1736 		 * We do not expect a failure here.
1737 		 */
1738 		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
1739 	}
1740 
1741 	/*
1742 	 * Repoll to catch all packets that might have arrived after
1743 	 * we finished the first poll loop and before interrupts got
1744 	 * armed.
1745 	 */
1746 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1747 
1748 	if (state->rc_enable_srq) {
1749 		mutex_enter(&state->rc_rx_lock);
1750 
1751 		if (state->rc_rx_mp != NULL) {
1752 			mblk_t *mpc;
1753 			mpc = state->rc_rx_mp;
1754 
1755 			state->rc_rx_mp = NULL;
1756 			state->rc_rx_mp_tail = NULL;
1757 			state->rc_rx_mp_len = 0;
1758 
1759 			mutex_exit(&state->rc_rx_lock);
1760 			mac_rx(state->id_mh, NULL, mpc);
1761 		} else {
1762 			mutex_exit(&state->rc_rx_lock);
1763 		}
1764 	} else {
1765 		mutex_enter(&chan->rx_lock);
1766 
1767 		if (chan->rx_mp != NULL) {
1768 			mblk_t *mpc;
1769 			mpc = chan->rx_mp;
1770 
1771 			chan->rx_mp = NULL;
1772 			chan->rx_mp_tail = NULL;
1773 			chan->rx_mp_len = 0;
1774 
1775 			mutex_exit(&chan->rx_lock);
1776 			mac_rx(state->id_mh, NULL, mpc);
1777 		} else {
1778 			mutex_exit(&chan->rx_lock);
1779 		}
1780 	}
1781 	atomic_dec_32(&chan->rcq_invoking);
1782 }
1783 
1784 /*
1785  * Allocate the statically allocated Tx buffer list.
1786  */
1787 int
ibd_rc_init_tx_largebuf_list(ibd_state_t * state)1788 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1789 {
1790 	ibd_rc_tx_largebuf_t *lbufp;
1791 	ibd_rc_tx_largebuf_t *tail;
1792 	uint8_t *memp;
1793 	ibt_mr_attr_t mem_attr;
1794 	uint32_t num_swqe;
1795 	size_t  mem_size;
1796 	int i;
1797 
1798 	num_swqe = state->id_rc_num_swqe - 1;
1799 
1800 	/*
1801 	 * Allocate one big chunk for all Tx large copy bufs
1802 	 */
1803 	/* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */
1804 	mem_size = num_swqe * state->rc_mtu;
1805 	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1806 
1807 	mem_attr.mr_len = mem_size;
1808 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1809 	mem_attr.mr_as = NULL;
1810 	mem_attr.mr_flags = IBT_MR_SLEEP;
1811 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1812 	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1813 		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1814 		    "failed");
1815 		kmem_free(state->rc_tx_mr_bufs, mem_size);
1816 		state->rc_tx_mr_bufs = NULL;
1817 		return (DDI_FAILURE);
1818 	}
1819 
1820 	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1821 	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1822 
1823 	/*
1824 	 * Set up the buf chain
1825 	 */
1826 	memp = state->rc_tx_mr_bufs;
1827 	mutex_enter(&state->rc_tx_large_bufs_lock);
1828 	lbufp = state->rc_tx_largebuf_desc_base;
1829 	for (i = 0; i < num_swqe; i++) {
1830 		lbufp->lb_buf = memp;
1831 		lbufp->lb_next = lbufp + 1;
1832 
1833 		tail = lbufp;
1834 
1835 		memp += state->rc_mtu;
1836 		lbufp++;
1837 	}
1838 	tail->lb_next = NULL;
1839 
1840 	/*
1841 	 * Set up the buffer information in ibd state
1842 	 */
1843 	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1844 	state->rc_tx_largebuf_nfree = num_swqe;
1845 	mutex_exit(&state->rc_tx_large_bufs_lock);
1846 	return (DDI_SUCCESS);
1847 }
1848 
1849 void
ibd_rc_fini_tx_largebuf_list(ibd_state_t * state)1850 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1851 {
1852 	uint32_t num_swqe;
1853 
1854 	num_swqe = state->id_rc_num_swqe - 1;
1855 
1856 	if (ibt_deregister_mr(state->id_hca_hdl,
1857 	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1858 		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1859 		    "failed");
1860 	}
1861 	state->rc_tx_mr_hdl = NULL;
1862 
1863 	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1864 	state->rc_tx_mr_bufs = NULL;
1865 
1866 	kmem_free(state->rc_tx_largebuf_desc_base,
1867 	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1868 	state->rc_tx_largebuf_desc_base = NULL;
1869 }
1870 
1871 static int
ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t * chan)1872 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1873 {
1874 	ibt_mr_attr_t mem_attr;
1875 	ibd_state_t *state;
1876 
1877 	state = chan->state;
1878 	ASSERT(state != NULL);
1879 
1880 	/*
1881 	 * Allocate one big chunk for all regular tx copy bufs
1882 	 */
1883 	mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh;
1884 
1885 	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
1886 
1887 	/*
1888 	 * Do one memory registration on the entire txbuf area
1889 	 */
1890 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1891 	mem_attr.mr_as = NULL;
1892 	mem_attr.mr_flags = IBT_MR_SLEEP;
1893 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1894 	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1895 		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1896 		ASSERT(mem_attr.mr_len ==
1897 		    chan->scq_size * state->id_rc_tx_copy_thresh);
1898 		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1899 		chan->tx_mr_bufs = NULL;
1900 		return (DDI_FAILURE);
1901 	}
1902 
1903 	return (DDI_SUCCESS);
1904 }
1905 
1906 /*
1907  * Allocate the statically allocated Tx buffer list.
1908  */
1909 static int
ibd_rc_init_txlist(ibd_rc_chan_t * chan)1910 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1911 {
1912 	ibd_swqe_t *swqe;
1913 	int i;
1914 	ibt_lkey_t lkey;
1915 	ibd_state_t *state = chan->state;
1916 
1917 	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1918 		return (DDI_FAILURE);
1919 
1920 	/*
1921 	 * Allocate and setup the swqe list
1922 	 */
1923 	lkey = chan->tx_mr_desc.md_lkey;
1924 	chan->tx_wqes = kmem_zalloc(chan->scq_size *
1925 	    sizeof (ibd_swqe_t), KM_SLEEP);
1926 	swqe = chan->tx_wqes;
1927 	for (i = 0; i < chan->scq_size; i++, swqe++) {
1928 		swqe->swqe_next = NULL;
1929 		swqe->swqe_im_mblk = NULL;
1930 
1931 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1932 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1933 
1934 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1935 		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1936 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1937 		    (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh);
1938 		swqe->w_swr.wr_trans = IBT_RC_SRV;
1939 
1940 		/* Add to list */
1941 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
1942 		chan->tx_wqe_list.dl_cnt++;
1943 		swqe->swqe_next = chan->tx_wqe_list.dl_head;
1944 		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1945 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
1946 	}
1947 
1948 	return (DDI_SUCCESS);
1949 }
1950 
1951 /*
1952  * Free the statically allocated Tx buffer list.
1953  */
1954 static void
ibd_rc_fini_txlist(ibd_rc_chan_t * chan)1955 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1956 {
1957 	ibd_state_t *state = chan->state;
1958 	if (chan->tx_mr_hdl != NULL) {
1959 		if (ibt_deregister_mr(chan->state->id_hca_hdl,
1960 		    chan->tx_mr_hdl) != IBT_SUCCESS) {
1961 			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1962 			    "failed");
1963 		}
1964 		chan->tx_mr_hdl = NULL;
1965 	}
1966 
1967 	if (chan->tx_mr_bufs != NULL) {
1968 		kmem_free(chan->tx_mr_bufs, chan->scq_size *
1969 		    state->id_rc_tx_copy_thresh);
1970 		chan->tx_mr_bufs = NULL;
1971 	}
1972 
1973 	if (chan->tx_wqes != NULL) {
1974 		kmem_free(chan->tx_wqes, chan->scq_size *
1975 		    sizeof (ibd_swqe_t));
1976 		chan->tx_wqes = NULL;
1977 	}
1978 }
1979 
1980 /*
1981  * Acquire send wqe from free list.
1982  * Returns error number and send wqe pointer.
1983  */
1984 ibd_swqe_t *
ibd_rc_acquire_swqes(ibd_rc_chan_t * chan)1985 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1986 {
1987 	ibd_swqe_t *wqe;
1988 
1989 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1990 	if (chan->tx_rel_list.dl_head != NULL) {
1991 		/* transfer id_tx_rel_list to id_tx_list */
1992 		chan->tx_wqe_list.dl_head =
1993 		    chan->tx_rel_list.dl_head;
1994 		chan->tx_wqe_list.dl_cnt =
1995 		    chan->tx_rel_list.dl_cnt;
1996 		chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1997 
1998 		/* clear id_tx_rel_list */
1999 		chan->tx_rel_list.dl_head = NULL;
2000 		chan->tx_rel_list.dl_cnt = 0;
2001 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2002 
2003 		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
2004 		chan->tx_wqe_list.dl_cnt -= 1;
2005 		chan->tx_wqe_list.dl_head = wqe->swqe_next;
2006 	} else {	/* no free swqe */
2007 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2008 		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
2009 		wqe = NULL;
2010 	}
2011 	return (wqe);
2012 }
2013 
2014 /*
2015  * Release send wqe back into free list.
2016  */
2017 static void
ibd_rc_release_swqe(ibd_rc_chan_t * chan,ibd_swqe_t * swqe)2018 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
2019 {
2020 	/*
2021 	 * Add back on Tx list for reuse.
2022 	 */
2023 	swqe->swqe_next = NULL;
2024 	mutex_enter(&chan->tx_rel_list.dl_mutex);
2025 	chan->tx_rel_list.dl_pending_sends = B_FALSE;
2026 	swqe->swqe_next = chan->tx_rel_list.dl_head;
2027 	chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
2028 	chan->tx_rel_list.dl_cnt++;
2029 	mutex_exit(&chan->tx_rel_list.dl_mutex);
2030 }
2031 
2032 void
ibd_rc_post_send(ibd_rc_chan_t * chan,ibd_swqe_t * node)2033 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
2034 {
2035 	uint_t		i;
2036 	uint_t		num_posted;
2037 	uint_t		n_wrs;
2038 	ibt_status_t	ibt_status;
2039 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
2040 	ibd_swqe_t	*tx_head, *elem;
2041 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
2042 
2043 	/* post the one request, then check for more */
2044 	ibt_status = ibt_post_send(chan->chan_hdl,
2045 	    &node->w_swr, 1, NULL);
2046 	if (ibt_status != IBT_SUCCESS) {
2047 		ibd_print_warn(chan->state, "ibd_post_send: "
2048 		    "posting one wr failed: ret=%d", ibt_status);
2049 		ibd_rc_tx_cleanup(node);
2050 	}
2051 
2052 	tx_head = NULL;
2053 	for (;;) {
2054 		if (tx_head == NULL) {
2055 			mutex_enter(&chan->tx_post_lock);
2056 			tx_head = chan->tx_head;
2057 			if (tx_head == NULL) {
2058 				chan->tx_busy = 0;
2059 				mutex_exit(&chan->tx_post_lock);
2060 				return;
2061 			}
2062 			chan->tx_head = NULL;
2063 			mutex_exit(&chan->tx_post_lock);
2064 		}
2065 
2066 		/*
2067 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
2068 		 * at a time if possible, and keep posting them.
2069 		 */
2070 		for (n_wrs = 0, elem = tx_head;
2071 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
2072 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
2073 			nodes[n_wrs] = elem;
2074 			wrs[n_wrs] = elem->w_swr;
2075 		}
2076 		tx_head = elem;
2077 
2078 		ASSERT(n_wrs != 0);
2079 
2080 		/*
2081 		 * If posting fails for some reason, we'll never receive
2082 		 * completion intimation, so we'll need to cleanup. But
2083 		 * we need to make sure we don't clean up nodes whose
2084 		 * wrs have been successfully posted. We assume that the
2085 		 * hca driver returns on the first failure to post and
2086 		 * therefore the first 'num_posted' entries don't need
2087 		 * cleanup here.
2088 		 */
2089 		num_posted = 0;
2090 		ibt_status = ibt_post_send(chan->chan_hdl,
2091 		    wrs, n_wrs, &num_posted);
2092 		if (ibt_status != IBT_SUCCESS) {
2093 			ibd_print_warn(chan->state, "ibd_post_send: "
2094 			    "posting multiple wrs failed: "
2095 			    "requested=%d, done=%d, ret=%d",
2096 			    n_wrs, num_posted, ibt_status);
2097 
2098 			for (i = num_posted; i < n_wrs; i++)
2099 				ibd_rc_tx_cleanup(nodes[i]);
2100 		}
2101 	}
2102 }
2103 
2104 /*
2105  * Common code that deals with clean ups after a successful or
2106  * erroneous transmission attempt.
2107  */
2108 void
ibd_rc_tx_cleanup(ibd_swqe_t * swqe)2109 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
2110 {
2111 	ibd_ace_t *ace = swqe->w_ahandle;
2112 	ibd_state_t *state;
2113 
2114 	ASSERT(ace != NULL);
2115 	ASSERT(ace->ac_chan != NULL);
2116 
2117 	state = ace->ac_chan->state;
2118 
2119 	/*
2120 	 * If this was a dynamic registration in ibd_send(),
2121 	 * deregister now.
2122 	 */
2123 	if (swqe->swqe_im_mblk != NULL) {
2124 		ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
2125 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
2126 			ibd_unmap_mem(state, swqe);
2127 		}
2128 		freemsg(swqe->swqe_im_mblk);
2129 		swqe->swqe_im_mblk = NULL;
2130 	} else {
2131 		ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
2132 	}
2133 
2134 	if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
2135 		ibd_rc_tx_largebuf_t *lbufp;
2136 
2137 		lbufp = swqe->w_rc_tx_largebuf;
2138 		ASSERT(lbufp != NULL);
2139 
2140 		mutex_enter(&state->rc_tx_large_bufs_lock);
2141 		lbufp->lb_next = state->rc_tx_largebuf_free_head;
2142 		state->rc_tx_largebuf_free_head = lbufp;
2143 		state->rc_tx_largebuf_nfree ++;
2144 		mutex_exit(&state->rc_tx_large_bufs_lock);
2145 		swqe->w_rc_tx_largebuf = NULL;
2146 	}
2147 
2148 
2149 	/*
2150 	 * Release the send wqe for reuse.
2151 	 */
2152 	ibd_rc_release_swqe(ace->ac_chan, swqe);
2153 
2154 	/*
2155 	 * Drop the reference count on the AH; it can be reused
2156 	 * now for a different destination if there are no more
2157 	 * posted sends that will use it. This can be eliminated
2158 	 * if we can always associate each Tx buffer with an AH.
2159 	 * The ace can be null if we are cleaning up from the
2160 	 * ibd_send() error path.
2161 	 */
2162 	ibd_dec_ref_ace(state, ace);
2163 }
2164 
2165 void
ibd_rc_drain_scq(ibd_rc_chan_t * chan,ibt_cq_hdl_t cq_hdl)2166 ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
2167 {
2168 	ibd_state_t *state = chan->state;
2169 	ibd_wqe_t *wqe;
2170 	ibt_wc_t *wc, *wcs;
2171 	ibd_ace_t *ace;
2172 	uint_t numwcs, real_numwcs;
2173 	int i;
2174 	boolean_t encount_error;
2175 
2176 	wcs = chan->tx_wc;
2177 	numwcs = IBD_RC_MAX_CQ_WC;
2178 	encount_error = B_FALSE;
2179 
2180 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
2181 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
2182 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
2183 			if (wc->wc_status != IBT_WC_SUCCESS) {
2184 				if (encount_error == B_FALSE) {
2185 					/*
2186 					 * This RC channle is in error status,
2187 					 * remove it.
2188 					 */
2189 					encount_error = B_TRUE;
2190 					mutex_enter(&state->id_ac_mutex);
2191 					if ((chan->chan_state ==
2192 					    IBD_RC_STATE_ACT_ESTAB) &&
2193 					    (chan->state->id_link_state ==
2194 					    LINK_STATE_UP) &&
2195 					    ((ace = ibd_acache_find(state,
2196 					    &chan->ace->ac_mac, B_FALSE, 0))
2197 					    != NULL) && (ace == chan->ace)) {
2198 						ASSERT(ace->ac_mce == NULL);
2199 						INC_REF(ace, 1);
2200 						IBD_ACACHE_PULLOUT_ACTIVE(
2201 						    state, ace);
2202 						chan->chan_state =
2203 						    IBD_RC_STATE_ACT_CLOSING;
2204 						mutex_exit(&state->id_ac_mutex);
2205 						state->rc_reset_cnt++;
2206 						DPRINT(30, "ibd_rc_drain_scq: "
2207 						    "wc_status(%d) != SUCC, "
2208 						    "chan=%p, ace=%p, "
2209 						    "link_state=%d"
2210 						    "reset RC channel",
2211 						    wc->wc_status, chan,
2212 						    chan->ace, chan->state->
2213 						    id_link_state);
2214 						ibd_rc_signal_act_close(
2215 						    state, ace);
2216 					} else {
2217 						mutex_exit(&state->id_ac_mutex);
2218 						state->
2219 						    rc_act_close_simultaneous++;
2220 						DPRINT(40, "ibd_rc_drain_scq: "
2221 						    "wc_status(%d) != SUCC, "
2222 						    "chan=%p, chan_state=%d,"
2223 						    "ace=%p, link_state=%d."
2224 						    "other thread is closing "
2225 						    "it", wc->wc_status, chan,
2226 						    chan->chan_state, chan->ace,
2227 						    chan->state->id_link_state);
2228 					}
2229 				}
2230 			}
2231 			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
2232 		}
2233 
2234 		mutex_enter(&state->id_sched_lock);
2235 		if (state->id_sched_needed == 0) {
2236 			mutex_exit(&state->id_sched_lock);
2237 		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
2238 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2239 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2240 			if ((chan->tx_rel_list.dl_cnt +
2241 			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
2242 				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
2243 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2244 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2245 				mutex_exit(&state->id_sched_lock);
2246 				state->rc_swqe_mac_update++;
2247 				mac_tx_update(state->id_mh);
2248 			} else {
2249 				state->rc_scq_no_swqe++;
2250 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2251 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2252 				mutex_exit(&state->id_sched_lock);
2253 			}
2254 		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
2255 			mutex_enter(&state->rc_tx_large_bufs_lock);
2256 			if (state->rc_tx_largebuf_nfree >
2257 			    IBD_RC_TX_FREE_THRESH) {
2258 				ASSERT(state->rc_tx_largebuf_free_head != NULL);
2259 				state->id_sched_needed &=
2260 				    ~IBD_RSRC_RC_TX_LARGEBUF;
2261 				mutex_exit(&state->rc_tx_large_bufs_lock);
2262 				mutex_exit(&state->id_sched_lock);
2263 				state->rc_xmt_buf_mac_update++;
2264 				mac_tx_update(state->id_mh);
2265 			} else {
2266 				state->rc_scq_no_largebuf++;
2267 				mutex_exit(&state->rc_tx_large_bufs_lock);
2268 				mutex_exit(&state->id_sched_lock);
2269 			}
2270 		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
2271 			mutex_enter(&state->id_tx_list.dl_mutex);
2272 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
2273 			if ((state->id_tx_list.dl_cnt +
2274 			    state->id_tx_rel_list.dl_cnt)
2275 			    > IBD_FREE_SWQES_THRESH) {
2276 				state->id_sched_needed &= ~IBD_RSRC_SWQE;
2277 				state->id_sched_cnt++;
2278 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2279 				mutex_exit(&state->id_tx_list.dl_mutex);
2280 				mutex_exit(&state->id_sched_lock);
2281 				mac_tx_update(state->id_mh);
2282 			} else {
2283 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2284 				mutex_exit(&state->id_tx_list.dl_mutex);
2285 				mutex_exit(&state->id_sched_lock);
2286 			}
2287 		} else {
2288 			mutex_exit(&state->id_sched_lock);
2289 		}
2290 	}
2291 }
2292 
2293 /* Send CQ handler, call ibd_rx_tx_cleanup to recycle Tx buffers */
2294 /* ARGSUSED */
2295 static void
ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl,void * arg)2296 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2297 {
2298 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2299 
2300 	if (ibd_rc_tx_softintr == 1) {
2301 		mutex_enter(&chan->tx_poll_lock);
2302 		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2303 			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2304 			mutex_exit(&chan->tx_poll_lock);
2305 			return;
2306 		} else {
2307 			mutex_exit(&chan->tx_poll_lock);
2308 			ddi_trigger_softintr(chan->scq_softintr);
2309 		}
2310 	} else
2311 		(void) ibd_rc_tx_recycle(arg);
2312 }
2313 
2314 static uint_t
ibd_rc_tx_recycle(caddr_t arg)2315 ibd_rc_tx_recycle(caddr_t arg)
2316 {
2317 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2318 	ibd_state_t *state = chan->state;
2319 	int flag, redo_flag;
2320 	int redo = 1;
2321 
2322 	flag = IBD_CQ_POLLING;
2323 	redo_flag = IBD_REDO_CQ_POLLING;
2324 
2325 	mutex_enter(&chan->tx_poll_lock);
2326 	if (chan->tx_poll_busy & flag) {
2327 		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2328 		    "threads");
2329 		chan->tx_poll_busy |= redo_flag;
2330 		mutex_exit(&chan->tx_poll_lock);
2331 		return (DDI_INTR_CLAIMED);
2332 	}
2333 	chan->tx_poll_busy |= flag;
2334 	mutex_exit(&chan->tx_poll_lock);
2335 
2336 	/*
2337 	 * Poll for completed entries; the CQ will not interrupt any
2338 	 * more for completed packets.
2339 	 */
2340 	ibd_rc_drain_scq(chan, chan->scq_hdl);
2341 
2342 	/*
2343 	 * Now enable CQ notifications; all completions originating now
2344 	 * will cause new interrupts.
2345 	 */
2346 	do {
2347 		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2348 		    IBT_SUCCESS) {
2349 			/*
2350 			 * We do not expect a failure here.
2351 			 */
2352 			DPRINT(40, "ibd_rc_scq_handler: ibt_enable_cq_notify()"
2353 			    " failed");
2354 		}
2355 
2356 		ibd_rc_drain_scq(chan, chan->scq_hdl);
2357 
2358 		mutex_enter(&chan->tx_poll_lock);
2359 		if (chan->tx_poll_busy & redo_flag)
2360 			chan->tx_poll_busy &= ~redo_flag;
2361 		else {
2362 			chan->tx_poll_busy &= ~flag;
2363 			redo = 0;
2364 		}
2365 		mutex_exit(&chan->tx_poll_lock);
2366 
2367 	} while (redo);
2368 
2369 	return (DDI_INTR_CLAIMED);
2370 }
2371 
2372 static ibt_status_t
ibd_register_service(ibt_srv_desc_t * srv,ib_svc_id_t sid,int num_sids,ibt_srv_hdl_t * srv_hdl,ib_svc_id_t * ret_sid)2373 ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
2374     int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
2375 {
2376 	ibd_service_t *p;
2377 	ibt_status_t status;
2378 
2379 	mutex_enter(&ibd_gstate.ig_mutex);
2380 	for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
2381 		if (p->is_sid == sid) {
2382 			p->is_ref_cnt++;
2383 			*srv_hdl = p->is_srv_hdl;
2384 			*ret_sid = sid;
2385 			mutex_exit(&ibd_gstate.ig_mutex);
2386 			return (IBT_SUCCESS);
2387 		}
2388 	}
2389 	status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
2390 	    num_sids, srv_hdl, ret_sid);
2391 	if (status == IBT_SUCCESS) {
2392 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2393 		p->is_srv_hdl = *srv_hdl;
2394 		p->is_sid = sid;
2395 		p->is_ref_cnt = 1;
2396 		p->is_link = ibd_gstate.ig_service_list;
2397 		ibd_gstate.ig_service_list = p;
2398 	}
2399 	mutex_exit(&ibd_gstate.ig_mutex);
2400 	return (status);
2401 }
2402 
2403 static ibt_status_t
ibd_deregister_service(ibt_srv_hdl_t srv_hdl)2404 ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
2405 {
2406 	ibd_service_t *p, **pp;
2407 	ibt_status_t status;
2408 
2409 	mutex_enter(&ibd_gstate.ig_mutex);
2410 	for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
2411 	    pp = &((*pp)->is_link)) {
2412 		p = *pp;
2413 		if (p->is_srv_hdl == srv_hdl) {	/* Found it */
2414 			if (--p->is_ref_cnt == 0) {
2415 				status = ibt_deregister_service(
2416 				    ibd_gstate.ig_ibt_hdl, srv_hdl);
2417 				*pp = p->is_link; /* link prev to next */
2418 				kmem_free(p, sizeof (*p));
2419 			} else {
2420 				status = IBT_SUCCESS;
2421 			}
2422 			mutex_exit(&ibd_gstate.ig_mutex);
2423 			return (status);
2424 		}
2425 	}
2426 	/* Should not ever get here */
2427 	mutex_exit(&ibd_gstate.ig_mutex);
2428 	return (IBT_FAILURE);
2429 }
2430 
2431 /* Listen with corresponding service ID */
2432 ibt_status_t
ibd_rc_listen(ibd_state_t * state)2433 ibd_rc_listen(ibd_state_t *state)
2434 {
2435 	ibt_srv_desc_t srvdesc;
2436 	ib_svc_id_t ret_sid;
2437 	ibt_status_t status;
2438 	ib_gid_t gid;
2439 
2440 	if (state->rc_listen_hdl != NULL) {
2441 		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
2442 		return (IBT_FAILURE);
2443 	}
2444 
2445 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2446 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2447 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2448 
2449 	/*
2450 	 * Register the service with service id
2451 	 * Incoming connection requests should arrive on this service id.
2452 	 */
2453 	status = ibd_register_service(&srvdesc,
2454 	    IBD_RC_QPN_TO_SID(state->id_qpnum),
2455 	    1, &state->rc_listen_hdl, &ret_sid);
2456 	if (status != IBT_SUCCESS) {
2457 		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
2458 		    "ret=%d", status);
2459 		return (status);
2460 	}
2461 
2462 	gid = state->id_sgid;
2463 
2464 	/* pass state as cm_private */
2465 	status = ibt_bind_service(state->rc_listen_hdl,
2466 	    gid, NULL, state, &state->rc_listen_bind);
2467 	if (status != IBT_SUCCESS) {
2468 		DPRINT(40, "ibd_rc_listen:"
2469 		    " fail to bind port: <%d>", status);
2470 		(void) ibd_deregister_service(state->rc_listen_hdl);
2471 		return (status);
2472 	}
2473 
2474 	/*
2475 	 * Legacy OFED had used a wrong service ID (one additional zero digit)
2476 	 * for many years. To interop with legacy OFED, we support this wrong
2477 	 * service ID here.
2478 	 */
2479 	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
2480 
2481 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2482 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2483 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2484 
2485 	/*
2486 	 * Register the service with service id
2487 	 * Incoming connection requests should arrive on this service id.
2488 	 */
2489 	status = ibd_register_service(&srvdesc,
2490 	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
2491 	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
2492 	if (status != IBT_SUCCESS) {
2493 		DPRINT(40,
2494 		    "ibd_rc_listen: Service Registration for Legacy OFED "
2495 		    "Failed %d", status);
2496 		(void) ibt_unbind_service(state->rc_listen_hdl,
2497 		    state->rc_listen_bind);
2498 		(void) ibd_deregister_service(state->rc_listen_hdl);
2499 		return (status);
2500 	}
2501 
2502 	gid = state->id_sgid;
2503 
2504 	/* pass state as cm_private */
2505 	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
2506 	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
2507 	if (status != IBT_SUCCESS) {
2508 		DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
2509 		    "Legacy OFED listener", status);
2510 		(void) ibd_deregister_service(
2511 		    state->rc_listen_hdl_OFED_interop);
2512 		(void) ibt_unbind_service(state->rc_listen_hdl,
2513 		    state->rc_listen_bind);
2514 		(void) ibd_deregister_service(state->rc_listen_hdl);
2515 		return (status);
2516 	}
2517 
2518 	return (IBT_SUCCESS);
2519 }
2520 
2521 void
ibd_rc_stop_listen(ibd_state_t * state)2522 ibd_rc_stop_listen(ibd_state_t *state)
2523 {
2524 	int ret;
2525 
2526 	/* Disable incoming connection requests */
2527 	if (state->rc_listen_hdl != NULL) {
2528 		ret = ibt_unbind_all_services(state->rc_listen_hdl);
2529 		if (ret != 0) {
2530 			DPRINT(40, "ibd_rc_stop_listen:"
2531 			    "ibt_unbind_all_services() failed, ret=%d", ret);
2532 		}
2533 		ret = ibd_deregister_service(state->rc_listen_hdl);
2534 		if (ret != 0) {
2535 			DPRINT(40, "ibd_rc_stop_listen:"
2536 			    "ibd_deregister_service() failed, ret=%d", ret);
2537 		} else {
2538 			state->rc_listen_hdl = NULL;
2539 		}
2540 	}
2541 
2542 	/* Disable incoming connection requests */
2543 	if (state->rc_listen_hdl_OFED_interop != NULL) {
2544 		ret = ibt_unbind_all_services(
2545 		    state->rc_listen_hdl_OFED_interop);
2546 		if (ret != 0) {
2547 			DPRINT(40, "ibd_rc_stop_listen:"
2548 			    "ibt_unbind_all_services() failed: %d", ret);
2549 		}
2550 		ret = ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
2551 		if (ret != 0) {
2552 			DPRINT(40, "ibd_rc_stop_listen:"
2553 			    "ibd_deregister_service() failed: %d", ret);
2554 		} else {
2555 			state->rc_listen_hdl_OFED_interop = NULL;
2556 		}
2557 	}
2558 }
2559 
2560 void
ibd_rc_close_all_chan(ibd_state_t * state)2561 ibd_rc_close_all_chan(ibd_state_t *state)
2562 {
2563 	ibd_rc_chan_t *rc_chan;
2564 	ibd_ace_t *ace, *pre_ace;
2565 	uint_t attempts;
2566 
2567 	/* Disable all Rx routines */
2568 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2569 	rc_chan = state->rc_pass_chan_list.chan_list;
2570 	while (rc_chan != NULL) {
2571 		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
2572 		rc_chan = rc_chan->next;
2573 	}
2574 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2575 
2576 	if (state->rc_enable_srq) {
2577 		attempts = 10;
2578 		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
2579 			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
2580 			delay(drv_usectohz(100000));
2581 			if (--attempts == 0) {
2582 				/*
2583 				 * There are pending bufs with the network
2584 				 * layer and we have no choice but to wait
2585 				 * for them to be done with. Reap all the
2586 				 * Tx/Rx completions that were posted since
2587 				 * we turned off the notification and
2588 				 * return failure.
2589 				 */
2590 				break;
2591 			}
2592 		}
2593 	}
2594 
2595 	/* Close all passive RC channels */
2596 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2597 	while (rc_chan != NULL) {
2598 		(void) ibd_rc_pas_close(rc_chan, B_TRUE, B_FALSE);
2599 		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2600 	}
2601 
2602 	/* Close all active RC channels */
2603 	mutex_enter(&state->id_ac_mutex);
2604 	state->id_ac_hot_ace = NULL;
2605 	ace = list_head(&state->id_ah_active);
2606 	while ((pre_ace = ace) != NULL) {
2607 		ace = list_next(&state->id_ah_active, ace);
2608 		if (pre_ace->ac_chan != NULL) {
2609 			INC_REF(pre_ace, 1);
2610 			IBD_ACACHE_PULLOUT_ACTIVE(state, pre_ace);
2611 			pre_ace->ac_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
2612 			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
2613 			    pre_ace->ac_chan);
2614 		}
2615 	}
2616 	mutex_exit(&state->id_ac_mutex);
2617 
2618 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
2619 	while (rc_chan != NULL) {
2620 		ace = rc_chan->ace;
2621 		ibd_rc_act_close(rc_chan, B_TRUE);
2622 		if (ace != NULL) {
2623 			mutex_enter(&state->id_ac_mutex);
2624 			ASSERT(ace->ac_ref != 0);
2625 			atomic_dec_32(&ace->ac_ref);
2626 			ace->ac_chan = NULL;
2627 			if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
2628 				IBD_ACACHE_INSERT_FREE(state, ace);
2629 				ace->ac_ref = 0;
2630 			} else {
2631 				ace->ac_ref |= CYCLEVAL;
2632 				state->rc_delay_ace_recycle++;
2633 			}
2634 			mutex_exit(&state->id_ac_mutex);
2635 		}
2636 		rc_chan = ibd_rc_rm_header_chan_list(
2637 		    &state->rc_obs_act_chan_list);
2638 	}
2639 
2640 	attempts = 400;
2641 	while (((state->rc_num_tx_chan != 0) ||
2642 	    (state->rc_num_rx_chan != 0)) && (attempts > 0)) {
2643 		/* Other thread is closing CM channel, wait it */
2644 		delay(drv_usectohz(100000));
2645 		attempts--;
2646 	}
2647 }
2648 
2649 void
ibd_rc_try_connect(ibd_state_t * state,ibd_ace_t * ace,ibt_path_info_t * path)2650 ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path)
2651 {
2652 	ibt_status_t status;
2653 
2654 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2655 		return;
2656 
2657 	status = ibd_rc_connect(state, ace, path,
2658 	    IBD_RC_SERVICE_ID_OFED_INTEROP);
2659 
2660 	if (status != IBT_SUCCESS) {
2661 		/* wait peer side remove stale channel */
2662 		delay(drv_usectohz(10000));
2663 		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2664 			return;
2665 		status = ibd_rc_connect(state, ace, path,
2666 		    IBD_RC_SERVICE_ID_OFED_INTEROP);
2667 	}
2668 
2669 	if (status != IBT_SUCCESS) {
2670 		/* wait peer side remove stale channel */
2671 		delay(drv_usectohz(10000));
2672 		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
2673 			return;
2674 		(void) ibd_rc_connect(state, ace, path,
2675 		    IBD_RC_SERVICE_ID);
2676 	}
2677 }
2678 
2679 /*
2680  * Allocates channel and sets the ace->ac_chan to it.
2681  * Opens the channel.
2682  */
2683 ibt_status_t
ibd_rc_connect(ibd_state_t * state,ibd_ace_t * ace,ibt_path_info_t * path,uint64_t ietf_cm_service_id)2684 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path,
2685     uint64_t ietf_cm_service_id)
2686 {
2687 	ibt_status_t status = 0;
2688 	ibt_rc_returns_t open_returns;
2689 	ibt_chan_open_args_t open_args;
2690 	ibd_rc_msg_hello_t hello_req_msg;
2691 	ibd_rc_msg_hello_t *hello_ack_msg;
2692 	ibd_rc_chan_t *chan;
2693 	ibt_ud_dest_query_attr_t dest_attrs;
2694 
2695 	ASSERT(ace != NULL);
2696 	ASSERT(ace->ac_mce == NULL);
2697 	ASSERT(ace->ac_chan == NULL);
2698 
2699 	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2700 		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2701 		return (status);
2702 	}
2703 
2704 	ace->ac_chan = chan;
2705 	chan->state = state;
2706 	chan->ace = ace;
2707 
2708 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2709 
2710 	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2711 
2712 	/*
2713 	 * open the channels
2714 	 */
2715 	bzero(&open_args, sizeof (ibt_chan_open_args_t));
2716 	bzero(&open_returns, sizeof (ibt_rc_returns_t));
2717 
2718 	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2719 	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2720 
2721 	/*
2722 	 * update path record with the SID
2723 	 */
2724 	if ((status = ibt_query_ud_dest(ace->ac_dest, &dest_attrs))
2725 	    != IBT_SUCCESS) {
2726 		DPRINT(40, "ibd_rc_connect: ibt_query_ud_dest() failed, "
2727 		    "ret=%d", status);
2728 		return (status);
2729 	}
2730 
2731 	path->pi_sid =
2732 	    ietf_cm_service_id | ((dest_attrs.ud_dst_qpn) & 0xffffff);
2733 
2734 
2735 	/* pre-allocate memory for hello ack message */
2736 	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2737 	open_returns.rc_priv_data = hello_ack_msg;
2738 
2739 	open_args.oc_path = path;
2740 
2741 	open_args.oc_path_rnr_retry_cnt	= 1;
2742 	open_args.oc_path_retry_cnt = 1;
2743 
2744 	/* We don't do RDMA */
2745 	open_args.oc_rdma_ra_out = 0;
2746 	open_args.oc_rdma_ra_in	= 0;
2747 
2748 	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2749 	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2750 	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2751 	open_args.oc_priv_data = (void *)(&hello_req_msg);
2752 
2753 	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2754 	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2755 	ASSERT(open_args.oc_cm_handler != NULL);
2756 
2757 	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2758 	    IBT_BLOCKING, &open_args, &open_returns);
2759 
2760 	if (status == IBT_SUCCESS) {
2761 		/* Success! */
2762 		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
2763 		state->rc_conn_succ++;
2764 		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2765 		return (IBT_SUCCESS);
2766 	}
2767 
2768 	/* failure */
2769 	(void) ibt_flush_channel(chan->chan_hdl);
2770 	ibd_rc_free_chan(chan);
2771 	ace->ac_chan = NULL;
2772 
2773 	/* check open_returns report error and exit */
2774 	DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail."
2775 	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2776 	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2777 	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2778 	    dest_attrs.ud_dst_qpn);
2779 	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2780 	return (status);
2781 }
2782 
2783 void
ibd_rc_signal_act_close(ibd_state_t * state,ibd_ace_t * ace)2784 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2785 {
2786 	ibd_req_t *req;
2787 
2788 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2789 	if (req == NULL) {
2790 		ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
2791 		    "ibd_req_t fail");
2792 		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2793 		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2794 		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2795 		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2796 	} else {
2797 		req->rq_ptr = ace->ac_chan;
2798 		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2799 	}
2800 }
2801 
2802 void
ibd_rc_signal_ace_recycle(ibd_state_t * state,ibd_ace_t * ace)2803 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
2804 {
2805 	ibd_req_t *req;
2806 
2807 	mutex_enter(&state->rc_ace_recycle_lock);
2808 	if (state->rc_ace_recycle != NULL) {
2809 		mutex_exit(&state->rc_ace_recycle_lock);
2810 		return;
2811 	}
2812 
2813 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2814 	if (req == NULL) {
2815 		mutex_exit(&state->rc_ace_recycle_lock);
2816 		return;
2817 	}
2818 
2819 	state->rc_ace_recycle = ace;
2820 	mutex_exit(&state->rc_ace_recycle_lock);
2821 	ASSERT(ace->ac_mce == NULL);
2822 	INC_REF(ace, 1);
2823 	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2824 	req->rq_ptr = ace;
2825 	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
2826 }
2827 
2828 /*
2829  * Close an active channel
2830  *
2831  * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
2832  */
2833 static void
ibd_rc_act_close(ibd_rc_chan_t * chan,boolean_t is_close_rc_chan)2834 ibd_rc_act_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan)
2835 {
2836 	ibd_state_t *state;
2837 	ibd_ace_t *ace;
2838 	uint_t times;
2839 	ibt_status_t ret;
2840 
2841 	ASSERT(chan != NULL);
2842 
2843 	chan->state->rc_act_close++;
2844 	switch (chan->chan_state) {
2845 	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
2846 	case IBD_RC_STATE_ACT_ESTAB:
2847 		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
2848 		    "act_state=%d, chan=%p", chan->chan_state, chan);
2849 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2850 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2851 		/*
2852 		 * Wait send queue empty. Its old value is 50 (5 seconds). But
2853 		 * in my experiment, 5 seconds is not enough time to let IBTL
2854 		 * return all buffers and ace->ac_ref. I tried 25 seconds, it
2855 		 * works well. As another evidence, I saw IBTL takes about 17
2856 		 * seconds every time it cleans a stale RC channel.
2857 		 */
2858 		times = 250;
2859 		ace = chan->ace;
2860 		ASSERT(ace != NULL);
2861 		state = chan->state;
2862 		ASSERT(state != NULL);
2863 		mutex_enter(&state->id_ac_mutex);
2864 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
2865 		mutex_enter(&chan->tx_rel_list.dl_mutex);
2866 		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
2867 		    != chan->scq_size) || ((ace->ac_ref != 1) &&
2868 		    (ace->ac_ref != (CYCLEVAL+1)))) {
2869 			mutex_exit(&chan->tx_rel_list.dl_mutex);
2870 			mutex_exit(&chan->tx_wqe_list.dl_mutex);
2871 			mutex_exit(&state->id_ac_mutex);
2872 			times--;
2873 			if (times == 0) {
2874 				state->rc_act_close_not_clean++;
2875 				DPRINT(40, "ibd_rc_act_close: dl_cnt(tx_wqe_"
2876 				    "list=%d, tx_rel_list=%d) != chan->"
2877 				    "scq_size=%d, OR ac_ref(=%d) not clean",
2878 				    chan->tx_wqe_list.dl_cnt,
2879 				    chan->tx_rel_list.dl_cnt,
2880 				    chan->scq_size, ace->ac_ref);
2881 				break;
2882 			}
2883 			mutex_enter(&chan->tx_poll_lock);
2884 			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2885 				DPRINT(40, "ibd_rc_act_close: multiple "
2886 				    "polling threads");
2887 				mutex_exit(&chan->tx_poll_lock);
2888 			} else {
2889 				chan->tx_poll_busy = IBD_CQ_POLLING;
2890 				mutex_exit(&chan->tx_poll_lock);
2891 				ibd_rc_drain_scq(chan, chan->scq_hdl);
2892 				mutex_enter(&chan->tx_poll_lock);
2893 				chan->tx_poll_busy = 0;
2894 				mutex_exit(&chan->tx_poll_lock);
2895 			}
2896 			delay(drv_usectohz(100000));
2897 			mutex_enter(&state->id_ac_mutex);
2898 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2899 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2900 		}
2901 		if (times != 0) {
2902 			mutex_exit(&chan->tx_rel_list.dl_mutex);
2903 			mutex_exit(&chan->tx_wqe_list.dl_mutex);
2904 			mutex_exit(&state->id_ac_mutex);
2905 		}
2906 
2907 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2908 		if (is_close_rc_chan) {
2909 			ret = ibt_close_rc_channel(chan->chan_hdl,
2910 			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
2911 			    0);
2912 			if (ret != IBT_SUCCESS) {
2913 				DPRINT(40, "ibd_rc_act_close: ibt_close_rc_"
2914 				    "channel fail, chan=%p, ret=%d",
2915 				    chan, ret);
2916 			} else {
2917 				DPRINT(30, "ibd_rc_act_close: ibt_close_rc_"
2918 				    "channel succ, chan=%p", chan);
2919 			}
2920 		}
2921 
2922 		ibd_rc_free_chan(chan);
2923 		break;
2924 	case IBD_RC_STATE_ACT_REP_RECV:
2925 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2926 		(void) ibt_flush_channel(chan->chan_hdl);
2927 		ibd_rc_free_chan(chan);
2928 		break;
2929 	case IBD_RC_STATE_ACT_ERROR:
2930 		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
2931 		break;
2932 	default:
2933 		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
2934 		    "chan=%p", chan->chan_state, chan);
2935 	}
2936 }
2937 
2938 /*
2939  * Close a passive channel
2940  *
2941  * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
2942  *
2943  * is_timeout_close: if B_TRUE, this function is called by the connection
2944  * reaper (refer to function ibd_rc_conn_timeout_call). When the connection
2945  * reaper calls ibd_rc_pas_close(), and if it finds that dl_bufs_outstanding
2946  * or chan->rcq_invoking is non-zero, then it can simply put that channel back
2947  * on the passive channels list and move on, since it might be an indication
 * that the channel became active again by the time we started its cleanup.
2949  * It is costlier to do the cleanup and then reinitiate the channel
2950  * establishment and hence it will help to be conservative when we do the
2951  * cleanup.
2952  */
2953 int
ibd_rc_pas_close(ibd_rc_chan_t * chan,boolean_t is_close_rc_chan,boolean_t is_timeout_close)2954 ibd_rc_pas_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan,
2955     boolean_t is_timeout_close)
2956 {
2957 	uint_t times;
2958 	ibt_status_t ret;
2959 
2960 	ASSERT(chan != NULL);
2961 	chan->state->rc_pas_close++;
2962 
2963 	switch (chan->chan_state) {
2964 	case IBD_RC_STATE_PAS_ESTAB:
2965 		if (is_timeout_close) {
2966 			if ((chan->rcq_invoking != 0) ||
2967 			    ((!chan->state->rc_enable_srq) &&
2968 			    (chan->rx_wqe_list.dl_bufs_outstanding > 0))) {
2969 				if (ibd_rc_re_add_to_pas_chan_list(chan)) {
2970 					return (DDI_FAILURE);
2971 				}
2972 			}
2973 		}
2974 		/*
2975 		 * First, stop receive interrupts; this stops the
2976 		 * connection from handing up buffers to higher layers.
2977 		 * Wait for receive buffers to be returned; give up
2978 		 * after 5 seconds.
2979 		 */
2980 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2981 		/* Wait 0.01 second to let ibt_set_cq_handler() take effect */
2982 		delay(drv_usectohz(10000));
2983 		if (!chan->state->rc_enable_srq) {
2984 			times = 50;
2985 			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
2986 				delay(