1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
29  */
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/atomic.h>		/* for atomic_add*() */
44 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
45 #include <netinet/in.h>		/* for netinet/ip.h below */
46 #include <netinet/ip.h>		/* for struct ip */
47 #include <inet/common.h>	/* for inet/ip.h below */
48 #include <inet/ip.h>		/* for ipha_t */
49 #include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
50 #include <inet/ip6.h>		/* for ip6_t */
51 #include <netinet/icmp6.h>	/* for icmp6_t */
52 
53 #include <sys/ib/clients/ibd/ibd.h>
54 
55 extern ibd_global_state_t ibd_gstate;
56 extern int ibd_rc_conn_timeout;
57 uint_t ibd_rc_tx_softintr = 1;
58 /*
59  * If the number of WRs in receive queue of each RC connection less than
60  * IBD_RC_RX_WR_THRESHOLD, we will post more receive WRs into it.
61  */
62 #define	IBD_RC_RX_WR_THRESHOLD		0x20
63 
64 /*
65  * If the number of free SWQEs (or large Tx buf) is larger than or equal to
66  * IBD_RC_TX_FREE_THRESH, we will call mac_tx_update to notify GLD to continue
67  * transmitting packets.
68  */
69 #define	IBD_RC_TX_FREE_THRESH		8
70 
71 #define	IBD_RC_QPN_TO_SID(qpn) \
72 	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))
73 
74 /* For interop with legacy OFED */
75 #define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
76 	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))
77 
78 /* Internet Header + 64 bits of Data Datagram. Refer to RFC 792 */
79 #define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64
80 
81 
82 /* Functions for Reliable Connected Mode */
83 /* Connection Setup/Close Functions */
84 static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
85     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
86 static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
87     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
88 static void ibd_rc_act_close(ibd_rc_chan_t *, boolean_t);
89 
90 static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
91     ibd_rc_chan_t *);
92 static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
93     ibd_rc_chan_list_t *);
94 static inline ibd_rc_chan_t *ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
95     ibd_rc_chan_t *);
96 
97 /* CQ handlers */
98 static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
99 static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
100 static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);
101 
102 /* Receive Functions */
103 static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
104 static void ibd_rc_srq_freemsg_cb(char *);
105 static void ibd_rc_srq_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
106 
107 static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
108 static void ibd_rc_freemsg_cb(char *);
109 static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
110 static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
111 static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);
112 
113 
114 /* Send Functions */
115 static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
116 static int ibd_rc_init_txlist(ibd_rc_chan_t *);
117 static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
118 static uint_t ibd_rc_tx_recycle(caddr_t);
119 
120 
/*
 * Async work handler: close the active-side RC channel passed in
 * req->rq_ptr, then drain and close every channel queued on
 * state->rc_obs_act_chan_list (obsolete active channels).
 *
 * For each channel's address-cache entry (ace), drop the reference that
 * was taken when the close was signalled.  If no other references
 * remain (ac_ref is 0, or only the CYCLEVAL marker is left), the ace is
 * put back on the free list; otherwise the CYCLEVAL flag is set so the
 * entry is recycled later, once the outstanding references drain.
 */
void
ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
{
	ibd_rc_chan_t *rc_chan = req->rq_ptr;
	ibd_ace_t *ace;

	while (rc_chan != NULL) {
		ace = rc_chan->ace;
		ASSERT(ace != NULL);
		/* Close old RC channel */
		ibd_rc_act_close(rc_chan, B_TRUE);
		mutex_enter(&state->id_ac_mutex);
		ASSERT(ace->ac_ref != 0);
		atomic_dec_32(&ace->ac_ref);
		ace->ac_chan = NULL;
		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
			IBD_ACACHE_INSERT_FREE(state, ace);
			ace->ac_ref = 0;
		} else {
			/* still referenced; defer the ace recycle */
			ace->ac_ref |= CYCLEVAL;
			state->rc_delay_ace_recycle++;
		}
		mutex_exit(&state->id_ac_mutex);
		/* pick up the next obsolete active channel, if any */
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}
}
148 
/*
 * Async work handler: close the RC channel attached to the ace in
 * req->rq_ptr and recycle the ace itself.  Reference handling mirrors
 * ibd_async_rc_close_act_chan(): the ace goes to the free list only
 * when no references (other than the CYCLEVAL marker) remain;
 * otherwise recycling is deferred via the CYCLEVAL flag.  Finally
 * clear state->rc_ace_recycle so another recycle can be scheduled.
 */
void
ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
{
	ibd_ace_t *ace = req->rq_ptr;
	ibd_rc_chan_t *rc_chan;

	ASSERT(ace != NULL);
	rc_chan = ace->ac_chan;
	ASSERT(rc_chan != NULL);
	/* Close old RC channel */
	ibd_rc_act_close(rc_chan, B_TRUE);
	mutex_enter(&state->id_ac_mutex);
	ASSERT(ace->ac_ref != 0);
	atomic_dec_32(&ace->ac_ref);
	ace->ac_chan = NULL;
	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
		IBD_ACACHE_INSERT_FREE(state, ace);
		ace->ac_ref = 0;
	} else {
		/* still referenced; defer the ace recycle */
		ace->ac_ref |= CYCLEVAL;
		state->rc_delay_ace_recycle++;
	}
	mutex_exit(&state->id_ac_mutex);
	/* allow a new recycle request to be queued */
	mutex_enter(&state->rc_ace_recycle_lock);
	state->rc_ace_recycle = NULL;
	mutex_exit(&state->rc_ace_recycle_lock);
}
176 
/*
 * Simple ICMP IP Header Template: IPv4 simple header, protocol ICMP.
 * The remaining fields (length, checksum, addresses, TTL) are filled
 * in per packet by ibd_async_rc_process_too_big().
 */
static const ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
181 
/*
 * Packet is too big for the RC path MTU.  Build an ICMP "destination
 * unreachable / fragmentation needed" message (type 3 code 4) that
 * quotes the offending IP header plus the first 64 bytes of payload
 * (RFC 792), and loop it back up the stack via mac_rx() so the sender
 * lowers its path MTU.
 *
 * The original packet mp is consumed in all cases: chained behind the
 * ICMP reply on success, freed on failure.  The ace's
 * tx_too_big_ongoing flag is always cleared before returning.
 */
void
ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
{
	mblk_t *mp = req->rq_ptr;
	ibd_ace_t *ace = req->rq_ptr2;
	/* MTU advertised in the ICMP error: IB MTU minus IPoIB header */
	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
	uint_t	len_needed;
	size_t	msg_len;
	mblk_t	*pmtu_mp;
	ushort_t	sap;
	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
	/*
	 * ipha: IP header for pmtu_pkt
	 * old_ipha: IP header for old packet
	 */
	ipha_t *ipha, *old_ipha;
	icmph_t	*icmph;

	/* remember the SAP before the IPoIB header is stripped below */
	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);

	/* linearize mp so the header parsing can use flat pointers */
	if (!pullupmsg(mp, -1)) {
		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
		goto too_big_fail;
	}
	/* move to IP header. */
	mp->b_rptr += IPOIB_HDRSIZE;
	old_ipha = (ipha_t *)mp->b_rptr;

	/*
	 * Compute how much of the offending packet to quote: its IP
	 * header, any inner IPv4/IPv6 header for encapsulated traffic,
	 * and IBD_RC_IP_ICMP_RETURN_DATA_BYTES of payload.
	 */
	len_needed = IPH_HDR_LENGTH(old_ipha);
	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
		/* IP-in-IP: also include the inner IPv4 header */
		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
		    len_needed));
	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
		/* IPv6-in-IP: also include the inner IPv6 header chain */
		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
		    + len_needed);
		len_needed += ip_hdr_length_v6(mp, ip6h);
	}
	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		/* trim the quoted packet down to len_needed bytes */
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}

	/* leading mblk holds IB header + IP header + ICMP header */
	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
		goto too_big_fail;
	}
	pmtu_mp->b_cont = mp;
	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
	    + sizeof (ipha_t) + sizeof (icmph_t);

	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;

	/* Fill IB header */
	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
	/*
	 * If the GRH is not valid, indicate to GLDv3 by setting
	 * the VerTcFlow field to 0.
	 */
	ibha->ib_grh.ipoib_vertcflow = 0;
	ibha->ipib_rhdr.ipoib_type = htons(sap);
	ibha->ipib_rhdr.ipoib_mbz = 0;

	/* Fill IP header */
	ipha = (ipha_t *)&ibha[1];
	*ipha = icmp_ipha;
	/* swap src/dst: the error travels back toward the sender */
	ipha->ipha_src = old_ipha->ipha_dst;
	ipha->ipha_dst = old_ipha->ipha_src;
	ipha->ipha_ttl = old_ipha->ipha_ttl;
	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
	if (msg_len > IP_MAXPACKET) {
		/* clamp to the maximum IPv4 datagram size */
		ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
		    "> IP_MAXPACKET", (uint32_t)msg_len);
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);

	/* Fill ICMP body */
	icmph = (icmph_t *)&ipha[1];
	bzero(icmph, sizeof (icmph_t));
	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
	icmph->icmph_du_mtu = htons(mtu);	/* suggested path MTU */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);

	/* mark checksum as already verified so IP accepts the packet */
	(void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);

	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
	    len_needed, (uint32_t)msg_len);

	/* hand the ICMP error up the stack as if it were received */
	mac_rx(state->id_mh, state->id_rh, pmtu_mp);

	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
	return;

too_big_fail:
	/* Drop packet */
	freemsg(mp);
	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
}
297 
298 /*
299  * Check all active/passive channels. If any ative/passive
300  * channel has not been used for a long time, close it.
301  */
302 void
303 ibd_rc_conn_timeout_call(void *carg)
304 {
305 	ibd_state_t *state = carg;
306 	ibd_ace_t *ace, *pre_ace;
307 	ibd_rc_chan_t *chan, *pre_chan, *next_chan;
308 	ibd_req_t *req;
309 
310 	/* Check all active channels. If chan->is_used == B_FALSE, close it */
311 	mutex_enter(&state->id_ac_mutex);
312 	ace = list_head(&state->id_ah_active);
313 	while ((pre_ace = ace) != NULL) {
314 		ace = list_next(&state->id_ah_active, ace);
315 		if (pre_ace->ac_chan != NULL) {
316 			chan = pre_ace->ac_chan;
317 			ASSERT(state->id_enable_rc == B_TRUE);
318 			if (chan->chan_state == IBD_RC_STATE_ACT_ESTAB) {
319 				if (chan->is_used == B_FALSE) {
320 					state->rc_timeout_act++;
321 					INC_REF(pre_ace, 1);
322 					IBD_ACACHE_PULLOUT_ACTIVE(state,
323 					    pre_ace);
324 					chan->chan_state =
325 					    IBD_RC_STATE_ACT_CLOSING;
326 					ibd_rc_signal_act_close(state, pre_ace);
327 				} else {
328 					chan->is_used = B_FALSE;
329 				}
330 			}
331 		}
332 	}
333 	mutex_exit(&state->id_ac_mutex);
334 
335 	/* Check all passive channels. If chan->is_used == B_FALSE, close it */
336 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
337 	next_chan = state->rc_pass_chan_list.chan_list;
338 	pre_chan = NULL;
339 	while ((chan = next_chan) != NULL) {
340 		next_chan = chan->next;
341 		if (chan->is_used == B_FALSE) {
342 			req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
343 			if (req != NULL) {
344 				/* remove it */
345 				state->rc_timeout_pas++;
346 				req->rq_ptr = chan;
347 				ibd_queue_work_slot(state, req,
348 				    IBD_ASYNC_RC_CLOSE_PAS_CHAN);
349 			} else {
350 				ibd_print_warn(state, "ibd_rc_conn_timeout: "
351 				    "alloc ibd_req_t fail");
352 				if (pre_chan == NULL) {
353 					state->rc_pass_chan_list.chan_list =
354 					    chan;
355 				} else {
356 					pre_chan->next = chan;
357 				}
358 				pre_chan = chan;
359 			}
360 		} else {
361 			if (pre_chan == NULL) {
362 				state->rc_pass_chan_list.chan_list = chan;
363 			} else {
364 				pre_chan->next = chan;
365 			}
366 			pre_chan = chan;
367 			chan->is_used = B_FALSE;
368 		}
369 	}
370 	if (pre_chan != NULL) {
371 		pre_chan->next = NULL;
372 	} else {
373 		state->rc_pass_chan_list.chan_list = NULL;
374 	}
375 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
376 
377 	mutex_enter(&state->rc_timeout_lock);
378 	if (state->rc_timeout_start == B_TRUE) {
379 		state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
380 		    SEC_TO_TICK(ibd_rc_conn_timeout));
381 	}
382 	mutex_exit(&state->rc_timeout_lock);
383 }
384 
385 #ifdef DEBUG
386 /*
387  * ibd_rc_update_stats - update driver private kstat counters
388  *
389  * This routine will dump the internal statistics counters for ibd's
390  * Reliable Connected Mode. The current stats dump values will
391  * be sent to the kernel status area.
392  */
393 static int
394 ibd_rc_update_stats(kstat_t *ksp, int rw)
395 {
396 	ibd_state_t *state;
397 	ibd_rc_stat_t *ibd_rc_ksp;
398 
399 	if (rw == KSTAT_WRITE)
400 		return (EACCES);
401 
402 	state = (ibd_state_t *)ksp->ks_private;
403 	ASSERT(state != NULL);
404 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
405 
406 	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
407 	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
408 	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
409 	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
410 	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;
411 
412 	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
413 
414 	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;
415 
416 	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
417 	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
418 	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
419 	    state->rc_xmt_fragmented_pkt;
420 	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
421 	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
422 	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;
423 
424 	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
425 	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
426 	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
427 	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
428 	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
429 	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
430 	    state->rc_xmt_buf_mac_update;
431 
432 	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
433 	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
434 	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
435 	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;
436 
437 	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
438 	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
439 	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
440 	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
441 	    state->rc_act_close_simultaneous;
442 	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
443 	ibd_rc_ksp->rc_timeout_act.value.ul = state->rc_timeout_act;
444 	ibd_rc_ksp->rc_timeout_pas.value.ul = state->rc_timeout_pas;
445 
446 	return (0);
447 }
448 
449 
450 /*
451  * ibd_rc_init_stats - initialize kstat data structures
452  *
453  * This routine will create and initialize the driver private
454  * statistics counters.
455  */
456 int
457 ibd_rc_init_stats(ibd_state_t *state)
458 {
459 	kstat_t *ksp;
460 	ibd_rc_stat_t *ibd_rc_ksp;
461 	char stat_name[KSTAT_STRLEN];
462 	int inst;
463 
464 	/*
465 	 * Create and init kstat
466 	 */
467 	inst = ddi_get_instance(state->id_dip);
468 	(void) snprintf(stat_name, KSTAT_STRLEN, "statistics%d_%x_%u", inst,
469 	    state->id_pkey, state->id_plinkid);
470 	ksp = kstat_create("ibd", 0, stat_name, "net", KSTAT_TYPE_NAMED,
471 	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);
472 
473 	if (ksp == NULL) {
474 		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
475 		    "kernel statistics");
476 		return (DDI_FAILURE);
477 	}
478 
479 	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */
480 
481 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
482 
483 	/*
484 	 * Initialize all the statistics
485 	 */
486 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
487 	    "transfer mode", KSTAT_DATA_ULONG);
488 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
489 	    "transfer mode", KSTAT_DATA_ULONG);
490 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
491 	    "copy mode", KSTAT_DATA_ULONG);
492 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
493 	    "copy mode", KSTAT_DATA_ULONG);
494 	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
495 	    KSTAT_DATA_ULONG);
496 
497 	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
498 	    KSTAT_DATA_ULONG);
499 
500 	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
501 	    KSTAT_DATA_ULONG);
502 
503 	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
504 	    KSTAT_DATA_ULONG);
505 	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
506 	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
507 	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
508 	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
509 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
510 	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
511 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
512 	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
513 	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
514 	    KSTAT_DATA_ULONG);
515 
516 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
517 	    "recycle", KSTAT_DATA_ULONG);
518 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
519 	    "after recycle", KSTAT_DATA_ULONG);
520 	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
521 	    KSTAT_DATA_ULONG);
522 	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
523 	    "#, swqe available", KSTAT_DATA_ULONG);
524 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
525 	    "ibd_send", KSTAT_DATA_ULONG);
526 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
527 	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
528 
529 	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
530 	    KSTAT_DATA_ULONG);
531 	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
532 	    KSTAT_DATA_ULONG);
533 	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
534 	    "pkt", KSTAT_DATA_ULONG);
535 	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
536 	    "state", KSTAT_DATA_ULONG);
537 
538 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
539 	    KSTAT_DATA_ULONG);
540 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
541 	    KSTAT_DATA_ULONG);
542 	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
543 	    "recycle", KSTAT_DATA_ULONG);
544 	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
545 	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
546 	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
547 	    KSTAT_DATA_ULONG);
548 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: timeout act side",
549 	    KSTAT_DATA_ULONG);
550 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: timeout pas side",
551 	    KSTAT_DATA_ULONG);
552 
553 	/*
554 	 * Function to provide kernel stat update on demand
555 	 */
556 	ksp->ks_update = ibd_rc_update_stats;
557 
558 	/*
559 	 * Pointer into provider's raw statistics
560 	 */
561 	ksp->ks_private = (void *)state;
562 
563 	/*
564 	 * Add kstat to systems kstat chain
565 	 */
566 	kstat_install(ksp);
567 
568 	return (DDI_SUCCESS);
569 }
570 #endif
571 
/*
 * Allocate an ibd_rc_chan_t and all IB resources it needs: a send CQ,
 * a receive CQ, an optional Tx soft interrupt, the Tx WQE list (for
 * Tx channels), and finally the RC channel itself.  is_tx_chan
 * selects which direction gets the full-size work queue; the other
 * side is sized at IBD_RC_MIN_CQ_SIZE.
 *
 * On success, *ret_chan is set and IBT_SUCCESS is returned.  On
 * failure, everything allocated so far is unwound via the goto
 * cleanup ladder and the ibt_* error status is returned.
 */
static ibt_status_t
ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
    boolean_t is_tx_chan)
{
	ibt_status_t result;
	ibd_rc_chan_t *chan;
	ibt_rc_chan_alloc_args_t alloc_args;
	ibt_chan_alloc_flags_t alloc_flags;
	ibt_chan_sizes_t sizes;
	ibt_cq_attr_t cq_atts;
	int rv;

	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);

	chan->state = state;
	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);

	/* Allocate IB structures for a new RC channel. */
	if (is_tx_chan) {
		chan->scq_size = state->id_rc_num_swqe;
		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
	} else {
		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
		chan->rcq_size = state->id_rc_num_rwqe;
	}
	/* send-side completion queue; actual size returned in scq_size */
	cq_atts.cq_size = chan->scq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
	    &chan->scq_size);
	if (result != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_chan: error <%d>"
		    "create scq completion queue (size <%d>)",
		    result, chan->scq_size);
		goto alloc_scq_err;
	}	/* if failure to alloc cq */

	/* interrupt moderation is best-effort; failure is non-fatal */
	if (ibt_modify_cq(chan->scq_hdl, state->id_rc_tx_comp_count,
	    state->id_rc_tx_comp_usec, 0) != IBT_SUCCESS) {
		DPRINT(30, "ibd_rc_alloc_chan: Send CQ "
		    "interrupt moderation failed");
	}

	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
	    (void *) (uintptr_t)chan);

	/* receive-side completion queue */
	cq_atts.cq_size = chan->rcq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
	    &chan->rcq_size);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
		    "rx completion queue (size <%d>)", result, chan->rcq_size);
		goto alloc_rcq_err;
	}	/* if failure to alloc cq */

	/* interrupt moderation is best-effort; failure is non-fatal */
	if (ibt_modify_cq(chan->rcq_hdl, state->id_rc_rx_comp_count,
	    state->id_rc_rx_comp_usec, 0) != IBT_SUCCESS) {
		DPRINT(30, "ibd_rc_alloc_chan: Receive CQ "
		    "interrupt moderation failed");
	}

	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
	    (void *)(uintptr_t)chan);

	if (is_tx_chan) {
		chan->is_tx_chan = B_TRUE;
		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: "
			    "ibd_rc_init_txlist failed");
			goto init_txlist_err;
		}
		/*
		 * Optionally recycle Tx completions from a low-priority
		 * soft interrupt instead of the CQ handler context
		 * (controlled by the ibd_rc_tx_softintr tunable).
		 */
		if (ibd_rc_tx_softintr == 1) {
			if ((rv = ddi_add_softintr(state->id_dip,
			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
			    DDI_SUCCESS) {
				DPRINT(10, "ibd_rc_alloc_chan: failed in "
				    "ddi_add_softintr(scq_softintr), ret=%d",
				    rv);
				goto alloc_softintr_err;
			}
		}
	} else {
		chan->is_tx_chan = B_FALSE;
	}

	/*
	 * enable completions
	 */
	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
		    "(scq) failed: status %d\n", result);
		goto alloc_scq_enable_err;
	}

	/* We will enable chan->rcq_hdl later. */

	/* alloc a RC channel */
	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
	bzero(&sizes, sizeof (ibt_chan_sizes_t));

	alloc_args.rc_flags = IBT_WR_SIGNALED;
	alloc_args.rc_control = IBT_CEP_NO_FLAGS;

	alloc_args.rc_scq = chan->scq_hdl;
	alloc_args.rc_rcq = chan->rcq_hdl;
	alloc_args.rc_pd = state->id_pd_hdl;

	alloc_args.rc_hca_port_num = state->id_port;
	alloc_args.rc_clone_chan = NULL;

	/* scatter/gather */
	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;

	/*
	 * A single SGL element suffices on the receive side because
	 * the driver posts one contiguous buffer per ibt_post_recv().
	 */
	alloc_args.rc_sizes.cs_rq_sgl = 1;

	/* The send queue size and the receive queue size */
	alloc_args.rc_sizes.cs_sq = chan->scq_size;
	alloc_args.rc_sizes.cs_rq = chan->rcq_size;

	/* use the HCA's reserved lkey when the hardware supports it */
	if (state->id_hca_res_lkey_capab) {
		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
	} else {
		DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
	}

	/* share the SRQ across channels when SRQ mode is enabled */
	if (state->rc_enable_srq) {
		alloc_flags = IBT_ACHAN_USES_SRQ;
		alloc_args.rc_srq = state->rc_srq_hdl;
	} else {
		alloc_flags = IBT_ACHAN_NO_FLAGS;
	}

	result = ibt_alloc_rc_channel(state->id_hca_hdl,
	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel"
		    " fail:<%d>", result);
		goto alloc_scq_enable_err;
	}

	if (is_tx_chan)
		atomic_inc_32(&state->rc_num_tx_chan);
	else
		atomic_inc_32(&state->rc_num_rx_chan);

	/* For the connection reaper routine ibd_rc_conn_timeout_call() */
	chan->is_used = B_TRUE;

	*ret_chan = chan;
	return (IBT_SUCCESS);

/* cleanup ladder: each label unwinds everything allocated above it */
alloc_scq_enable_err:
	if (is_tx_chan) {
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
	}
alloc_softintr_err:
	if (is_tx_chan) {
		ibd_rc_fini_txlist(chan);
	}
init_txlist_err:
	(void) ibt_free_cq(chan->rcq_hdl);
alloc_rcq_err:
	(void) ibt_free_cq(chan->scq_hdl);
alloc_scq_err:
	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
	kmem_free(chan, sizeof (ibd_rc_chan_t));
	return (result);
}
763 
/*
 * Free an ibd_rc_chan_t and its IB resources in the reverse order of
 * ibd_rc_alloc_chan(): channel, rcq, scq, then WQE lists/softintr and
 * the mutexes.
 *
 * NOTE(review): on any ibt_free_* failure this returns early, leaving
 * the remaining resources (and the chan structure itself) allocated —
 * presumably to avoid touching handles IBTF still considers live;
 * the channel is effectively leaked in that case.
 */
static void
ibd_rc_free_chan(ibd_rc_chan_t *chan)
{
	ibt_status_t ret;

	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */

	if (chan->chan_hdl != NULL) {
		ret = ibt_free_channel(chan->chan_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->chan_hdl = NULL;
	}

	if (chan->rcq_hdl != NULL) {
		ret = ibt_free_cq(chan->rcq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->rcq_hdl = NULL;
	}

	if (chan->scq_hdl != NULL) {
		ret = ibt_free_cq(chan->scq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->scq_hdl = NULL;
	}

	/* Free buffers */
	if (chan->is_tx_chan) {
		ibd_rc_fini_txlist(chan);
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
		atomic_dec_32(&chan->state->rc_num_tx_chan);
	} else {
		/* with SRQ the rx list is shared, not per-channel */
		if (!chan->state->rc_enable_srq) {
			ibd_rc_fini_rxlist(chan);
		}
		atomic_dec_32(&chan->state->rc_num_rx_chan);
	}

	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);

	/*
	 * If it is a passive channel, must make sure it has been removed
	 * from chan->state->rc_pass_chan_list
	 */
	kmem_free(chan, sizeof (ibd_rc_chan_t));
}
828 
829 /* Add a RC channel */
830 static inline void
831 ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
832 {
833 	mutex_enter(&list->chan_list_mutex);
834 	if (list->chan_list == NULL) {
835 		list->chan_list = chan;
836 		chan->next = NULL;
837 	} else {
838 		chan->next = list->chan_list;
839 		list->chan_list = chan;
840 	}
841 	mutex_exit(&list->chan_list_mutex);
842 }
843 
844 static boolean_t
845 ibd_rc_re_add_to_pas_chan_list(ibd_rc_chan_t *chan)
846 {
847 	ibd_state_t *state = chan->state;
848 
849 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
850 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0) {
851 		mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
852 		return (B_FALSE);
853 	} else {
854 		if (state->rc_pass_chan_list.chan_list == NULL) {
855 			state->rc_pass_chan_list.chan_list = chan;
856 			chan->next = NULL;
857 		} else {
858 			chan->next = state->rc_pass_chan_list.chan_list;
859 			state->rc_pass_chan_list.chan_list = chan;
860 		}
861 		mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
862 		return (B_TRUE);
863 	}
864 }
865 
866 /* Remove a RC channel */
867 static inline ibd_rc_chan_t *
868 ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
869 {
870 	ibd_rc_chan_t *pre_chan;
871 
872 	mutex_enter(&list->chan_list_mutex);
873 	if (list->chan_list == chan) {
874 		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
875 		    " in chan_list", chan);
876 		list->chan_list = chan->next;
877 	} else {
878 		pre_chan = list->chan_list;
879 		while (pre_chan != NULL) {
880 			if (pre_chan->next == chan) {
881 				DPRINT(30, "ibd_rc_rm_from_chan_list"
882 				    "(middle): found chan(%p)", chan);
883 				pre_chan->next = chan->next;
884 				break;
885 			}
886 			pre_chan = pre_chan->next;
887 		}
888 		if (pre_chan == NULL)
889 			chan = NULL;
890 	}
891 	mutex_exit(&list->chan_list_mutex);
892 	return (chan);
893 }
894 
895 static inline ibd_rc_chan_t *
896 ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
897 {
898 	ibd_rc_chan_t *rc_chan;
899 
900 	mutex_enter(&list->chan_list_mutex);
901 	rc_chan = list->chan_list;
902 	if (rc_chan != NULL) {
903 		list->chan_list = rc_chan->next;
904 	}
905 	mutex_exit(&list->chan_list_mutex);
906 	return (rc_chan);
907 }
908 
/*
 * Allocate the SRQ receive resources: one contiguous chunk holding
 * all rx copy buffers (rc_srq_size buffers of rc_mtu + GRH bytes
 * each), the rwqe array, and a single memory registration covering
 * the entire buffer area.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE if the MR registration fails
 * (in which case both allocations are released and the state
 * pointers reset to NULL).
 */
static int
ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
{
	ibt_mr_attr_t mem_attr;
	uint_t rc_rx_bufs_sz;

	/*
	 * Allocate one big chunk for all regular rx copy bufs
	 */
	rc_rx_bufs_sz =  (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;

	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
	    sizeof (ibd_rwqe_t), KM_SLEEP);

	/*
	 * Do one memory registration on the entire rxbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
	mem_attr.mr_len = rc_rx_bufs_sz;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
	    != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
		    "failed");
		/* unwind both allocations and leave a clean state */
		kmem_free(state->rc_srq_rwqes,
		    state->rc_srq_size * sizeof (ibd_rwqe_t));
		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
		state->rc_srq_rx_bufs = NULL;
		state->rc_srq_rwqes = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
947 
948 static void
949 ibd_rc_free_srq_copybufs(ibd_state_t *state)
950 {
951 	uint_t rc_rx_buf_sz;
952 
953 	/*
954 	 * Don't change the value of state->rc_mtu at the period from call
955 	 * ibd_rc_alloc_srq_copybufs() to call ibd_rc_free_srq_copybufs().
956 	 */
957 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
958 
959 	/*
960 	 * Unregister rxbuf mr
961 	 */
962 	if (ibt_deregister_mr(state->id_hca_hdl,
963 	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
964 		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
965 		    " failed");
966 	}
967 	state->rc_srq_rx_mr_hdl = NULL;
968 
969 	/*
970 	 * Free rxbuf memory
971 	 */
972 	kmem_free(state->rc_srq_rwqes,
973 	    state->rc_srq_size * sizeof (ibd_rwqe_t));
974 	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
975 	state->rc_srq_rwqes = NULL;
976 	state->rc_srq_rx_bufs = NULL;
977 }
978 
979 /*
980  * Allocate and post a certain number of SRQ receive buffers and WRs.
981  */
982 int
983 ibd_rc_init_srq_list(ibd_state_t *state)
984 {
985 	ibd_rwqe_t *rwqe;
986 	ibt_lkey_t lkey;
987 	int i;
988 	uint_t len;
989 	uint8_t *bufaddr;
990 	ibt_srq_sizes_t srq_sizes;
991 	ibt_srq_sizes_t	 srq_real_sizes;
992 	ibt_status_t ret;
993 
994 	srq_sizes.srq_sgl_sz = 1;
995 	srq_sizes.srq_wr_sz = state->id_rc_num_srq;
996 	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
997 	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
998 	if (ret != IBT_SUCCESS) {
999 		/*
1000 		 * The following code is for CR 6932460 (can't configure ibd
1001 		 * interface on 32 bits x86 systems). 32 bits x86 system has
1002 		 * less memory resource than 64 bits x86 system. If current
1003 		 * resource request can't be satisfied, we request less
1004 		 * resource here.
1005 		 */
1006 		len = state->id_rc_num_srq;
1007 		while ((ret == IBT_HCA_WR_EXCEEDED) &&
1008 		    (len >= 2 * IBD_RC_MIN_CQ_SIZE)) {
1009 			len = len/2;
1010 			srq_sizes.srq_sgl_sz = 1;
1011 			srq_sizes.srq_wr_sz = len;
1012 			ret = ibt_alloc_srq(state->id_hca_hdl,
1013 			    IBT_SRQ_NO_FLAGS, state->id_pd_hdl, &srq_sizes,
1014 			    &state->rc_srq_hdl, &srq_real_sizes);
1015 		}
1016 		if (ret != IBT_SUCCESS) {
1017 			DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
1018 			    "req_sgl_sz=%d, req_wr_sz=0x%x, final_req_wr_sz="
1019 			    "0x%x, ret=%d", srq_sizes.srq_sgl_sz,
1020 			    srq_sizes.srq_wr_sz, len, ret);
1021 			return (DDI_FAILURE);
1022 		}
1023 		state->id_rc_num_srq = len;
1024 		state->id_rc_num_rwqe = state->id_rc_num_srq + 1;
1025 	}
1026 
1027 	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
1028 	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
1029 		ret = ibt_free_srq(state->rc_srq_hdl);
1030 		if (ret != IBT_SUCCESS) {
1031 			ibd_print_warn(state, "ibd_rc_init_srq_list: "
1032 			    "ibt_free_srq fail, ret=%d", ret);
1033 		}
1034 		return (DDI_FAILURE);
1035 	}
1036 
1037 	/*
1038 	 * Allocate and setup the rwqe list
1039 	 */
1040 	lkey = state->rc_srq_rx_mr_desc.md_lkey;
1041 	rwqe = state->rc_srq_rwqes;
1042 	bufaddr = state->rc_srq_rx_bufs;
1043 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1044 	state->rc_srq_rwqe_list.dl_cnt = 0;
1045 	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
1046 	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
1047 		rwqe->w_state = state;
1048 		rwqe->w_freeing_wqe = B_FALSE;
1049 		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
1050 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1051 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1052 
1053 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1054 		    &rwqe->w_freemsg_cb)) == NULL) {
1055 			DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
1056 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1057 			if (atomic_dec_32_nv(&state->id_running) != 0) {
1058 				cmn_err(CE_WARN, "ibd_rc_init_srq_list: "
1059 				    "id_running was not 1\n");
1060 			}
1061 			ibd_rc_fini_srq_list(state);
1062 			atomic_inc_32(&state->id_running);
1063 			return (DDI_FAILURE);
1064 		}
1065 
1066 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1067 		/* Leave IPOIB_GRH_SIZE space */
1068 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1069 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1070 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1071 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1072 		rwqe->w_rwr.wr_nds = 1;
1073 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1074 		(void) ibd_rc_post_srq(state, rwqe);
1075 	}
1076 
1077 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1078 	state->rc_srq_free_list.dl_head = NULL;
1079 	state->rc_srq_free_list.dl_cnt = 0;
1080 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1081 
1082 	return (DDI_SUCCESS);
1083 }
1084 
1085 /*
1086  * Free the statically allocated Rx buffer list for SRQ.
1087  */
1088 void
1089 ibd_rc_fini_srq_list(ibd_state_t *state)
1090 {
1091 	ibd_rwqe_t *rwqe;
1092 	int i;
1093 	ibt_status_t ret;
1094 
1095 	ASSERT(state->id_running == 0);
1096 	ret = ibt_free_srq(state->rc_srq_hdl);
1097 	if (ret != IBT_SUCCESS) {
1098 		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
1099 		    "ibt_free_srq fail, ret=%d", ret);
1100 	}
1101 
1102 	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
1103 	rwqe = state->rc_srq_rwqes;
1104 	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
1105 		if (rwqe->rwqe_im_mblk != NULL) {
1106 			rwqe->w_freeing_wqe = B_TRUE;
1107 			freemsg(rwqe->rwqe_im_mblk);
1108 		}
1109 	}
1110 	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);
1111 
1112 	ibd_rc_free_srq_copybufs(state);
1113 }
1114 
1115 /* Repost the elements in state->ib_rc_free_list */
1116 int
1117 ibd_rc_repost_srq_free_list(ibd_state_t *state)
1118 {
1119 	ibd_rwqe_t *rwqe;
1120 	ibd_wqe_t *list;
1121 	uint_t len;
1122 
1123 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1124 	if (state->rc_srq_free_list.dl_head != NULL) {
1125 		/* repost them */
1126 		len = state->rc_mtu + IPOIB_GRH_SIZE;
1127 		list = state->rc_srq_free_list.dl_head;
1128 		state->rc_srq_free_list.dl_head = NULL;
1129 		state->rc_srq_free_list.dl_cnt = 0;
1130 		mutex_exit(&state->rc_srq_free_list.dl_mutex);
1131 		while (list != NULL) {
1132 			rwqe = WQE_TO_RWQE(list);
1133 			if ((rwqe->rwqe_im_mblk == NULL) &&
1134 			    ((rwqe->rwqe_im_mblk = desballoc(
1135 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
1136 			    &rwqe->w_freemsg_cb)) == NULL)) {
1137 				DPRINT(40, "ibd_rc_repost_srq_free_list: "
1138 				    "failed in desballoc()");
1139 				do {
1140 					ibd_rc_srq_free_rwqe(state, rwqe);
1141 					list = list->w_next;
1142 					rwqe = WQE_TO_RWQE(list);
1143 				} while (list != NULL);
1144 				return (DDI_FAILURE);
1145 			}
1146 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1147 				ibd_rc_srq_free_rwqe(state, rwqe);
1148 			}
1149 			list = list->w_next;
1150 		}
1151 		return (DDI_SUCCESS);
1152 	}
1153 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1154 	return (DDI_SUCCESS);
1155 }
1156 
1157 /*
1158  * Free an allocated recv wqe.
1159  */
1160 static void
1161 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
1162 {
1163 	/*
1164 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1165 	 *
1166 	 * This rwqe is placed on a free list so that it
1167 	 * can be reinstated in future.
1168 	 *
1169 	 * NOTE: no code currently exists to reinstate
1170 	 * these "lost" rwqes.
1171 	 */
1172 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1173 	state->rc_srq_free_list.dl_cnt++;
1174 	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
1175 	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
1176 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1177 }
1178 
/*
 * STREAMS free callback for an SRQ rx mblk: when the upper layer has
 * released the buffer, re-wrap it in a fresh mblk and repost it to the
 * SRQ, unless the driver is stopping or the wqe is being torn down.
 */
static void
ibd_rc_srq_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	ASSERT(state->rc_enable_srq);

	/*
	 * If the driver is stopped, just free the rwqe.
	 */
	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
		if (!rwqe->w_freeing_wqe) {
			atomic_dec_32(
			    &state->rc_srq_rwqe_list.dl_bufs_outstanding);
			DPRINT(6, "ibd_rc_srq_freemsg_cb: wqe being freed");
			rwqe->rwqe_im_mblk = NULL;
			ibd_rc_srq_free_rwqe(state, rwqe);
		}
		return;
	}

	/* The upper layer no longer holds this buffer. */
	atomic_dec_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding);

	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
	ASSERT(!rwqe->w_freeing_wqe);

	/*
	 * Upper layer has released held mblk, so we have
	 * no more use for keeping the old pointer in
	 * our rwqe.
	 */
	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
		/* Cannot rewrap; park the rwqe on the free list. */
		ibd_rc_srq_free_rwqe(state, rwqe);
		return;
	}

	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
		ibd_print_warn(state, "ibd_rc_srq_freemsg_cb: ibd_rc_post_srq"
		    " failed");
		ibd_rc_srq_free_rwqe(state, rwqe);
		return;
	}
}
1226 
1227 /*
1228  * Post a rwqe to the hardware and add it to the Rx list.
1229  */
1230 static int
1231 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
1232 {
1233 	/*
1234 	 * Here we should add dl_cnt before post recv, because
1235 	 * we would have to make sure dl_cnt is updated before
1236 	 * the corresponding ibd_rc_process_rx() is called.
1237 	 */
1238 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1239 	atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1);
1240 	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
1241 	    IBT_SUCCESS) {
1242 		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
1243 		DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed");
1244 		return (DDI_FAILURE);
1245 	}
1246 
1247 	return (DDI_SUCCESS);
1248 }
1249 
1250 /*
1251  * Post a rwqe to the hardware and add it to the Rx list.
1252  */
1253 static int
1254 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1255 {
1256 	/*
1257 	 * Here we should add dl_cnt before post recv, because we would
1258 	 * have to make sure dl_cnt has already updated before
1259 	 * corresponding ibd_rc_process_rx() is called.
1260 	 */
1261 	atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1);
1262 	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
1263 	    IBT_SUCCESS) {
1264 		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
1265 		DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()");
1266 		return (DDI_FAILURE);
1267 	}
1268 	return (DDI_SUCCESS);
1269 }
1270 
1271 static int
1272 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
1273 {
1274 	ibd_state_t *state = chan->state;
1275 	ibt_mr_attr_t mem_attr;
1276 	uint_t rc_rx_bufs_sz;
1277 
1278 	/*
1279 	 * Allocate one big chunk for all regular rx copy bufs
1280 	 */
1281 	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;
1282 
1283 	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
1284 
1285 	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
1286 	    sizeof (ibd_rwqe_t), KM_SLEEP);
1287 
1288 	/*
1289 	 * Do one memory registration on the entire rxbuf area
1290 	 */
1291 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
1292 	mem_attr.mr_len = rc_rx_bufs_sz;
1293 	mem_attr.mr_as = NULL;
1294 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
1295 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1296 	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
1297 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed");
1298 		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1299 		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
1300 		chan->rx_bufs = NULL;
1301 		chan->rx_rwqes = NULL;
1302 		return (DDI_FAILURE);
1303 	}
1304 
1305 	return (DDI_SUCCESS);
1306 }
1307 
1308 static void
1309 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
1310 {
1311 	ibd_state_t *state = chan->state;
1312 	uint_t rc_rx_buf_sz;
1313 
1314 	ASSERT(!state->rc_enable_srq);
1315 	ASSERT(chan->rx_rwqes != NULL);
1316 	ASSERT(chan->rx_bufs != NULL);
1317 
1318 	/*
1319 	 * Don't change the value of state->rc_mtu at the period from call
1320 	 * ibd_rc_alloc_rx_copybufs() to call ibd_rc_free_rx_copybufs().
1321 	 */
1322 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
1323 
1324 	/*
1325 	 * Unregister rxbuf mr
1326 	 */
1327 	if (ibt_deregister_mr(state->id_hca_hdl,
1328 	    chan->rx_mr_hdl) != IBT_SUCCESS) {
1329 		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
1330 	}
1331 	chan->rx_mr_hdl = NULL;
1332 
1333 	/*
1334 	 * Free rxbuf memory
1335 	 */
1336 	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1337 	chan->rx_rwqes = NULL;
1338 
1339 	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
1340 	chan->rx_bufs = NULL;
1341 }
1342 
1343 /*
1344  * Post a certain number of receive buffers and WRs on a RC channel.
1345  */
1346 static int
1347 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1348 {
1349 	ibd_state_t *state = chan->state;
1350 	ibd_rwqe_t *rwqe;
1351 	ibt_lkey_t lkey;
1352 	int i;
1353 	uint_t len;
1354 	uint8_t *bufaddr;
1355 
1356 	ASSERT(!state->rc_enable_srq);
1357 	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1358 		return (DDI_FAILURE);
1359 
1360 	/*
1361 	 * Allocate and setup the rwqe list
1362 	 */
1363 	lkey = chan->rx_mr_desc.md_lkey;
1364 	rwqe = chan->rx_rwqes;
1365 	bufaddr = chan->rx_bufs;
1366 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1367 	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1368 		rwqe->w_state = state;
1369 		rwqe->w_chan = chan;
1370 		rwqe->w_freeing_wqe = B_FALSE;
1371 		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1372 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1373 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1374 
1375 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1376 		    &rwqe->w_freemsg_cb)) == NULL) {
1377 			DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
1378 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1379 			ibd_rc_fini_rxlist(chan);
1380 			return (DDI_FAILURE);
1381 		}
1382 
1383 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1384 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1385 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1386 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1387 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1388 		rwqe->w_rwr.wr_nds = 1;
1389 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1390 		(void) ibd_rc_post_rwqe(chan, rwqe);
1391 	}
1392 
1393 	return (DDI_SUCCESS);
1394 }
1395 
1396 /*
1397  * Free the statically allocated Rx buffer list for SRQ.
1398  */
1399 static void
1400 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1401 {
1402 	ibd_rwqe_t *rwqe;
1403 	int i;
1404 
1405 	if (chan->rx_bufs == NULL) {
1406 		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1407 		return;
1408 	}
1409 
1410 	/* bufs_outstanding must be 0 */
1411 	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1412 	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1413 
1414 	mutex_enter(&chan->rx_wqe_list.dl_mutex);
1415 	rwqe = chan->rx_rwqes;
1416 	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1417 		if (rwqe->rwqe_im_mblk != NULL) {
1418 			rwqe->w_freeing_wqe = B_TRUE;
1419 			freemsg(rwqe->rwqe_im_mblk);
1420 		}
1421 	}
1422 	mutex_exit(&chan->rx_wqe_list.dl_mutex);
1423 
1424 	ibd_rc_free_rx_copybufs(chan);
1425 }
1426 
1427 /*
1428  * Free an allocated recv wqe.
1429  */
1430 static void
1431 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1432 {
1433 	/*
1434 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1435 	 *
1436 	 * This rwqe is placed on a free list so that it
1437 	 * can be reinstated in future.
1438 	 *
1439 	 * NOTE: no code currently exists to reinstate
1440 	 * these "lost" rwqes.
1441 	 */
1442 	mutex_enter(&chan->rx_free_list.dl_mutex);
1443 	chan->rx_free_list.dl_cnt++;
1444 	rwqe->rwqe_next = chan->rx_free_list.dl_head;
1445 	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1446 	mutex_exit(&chan->rx_free_list.dl_mutex);
1447 }
1448 
1449 /*
1450  * Processing to be done after receipt of a packet; hand off to GLD
1451  * in the format expected by GLD.
1452  */
1453 static void
1454 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1455 {
1456 	ibd_state_t *state = chan->state;
1457 	ib_header_info_t *phdr;
1458 	ipoib_hdr_t *ipibp;
1459 	mblk_t *mp;
1460 	mblk_t *mpc;
1461 	int rxcnt;
1462 	ip6_t *ip6h;
1463 	int len;
1464 
1465 	/*
1466 	 * Track number handed to upper layer, and number still
1467 	 * available to receive packets.
1468 	 */
1469 	if (state->rc_enable_srq) {
1470 		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1471 	} else {
1472 		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1473 	}
1474 
1475 	/*
1476 	 * It can not be a IBA multicast packet.
1477 	 */
1478 	ASSERT(!wc->wc_flags & IBT_WC_GRH_PRESENT);
1479 
1480 	/* For the connection reaper routine ibd_rc_conn_timeout_call() */
1481 	chan->is_used = B_TRUE;
1482 
1483 #ifdef DEBUG
1484 	if (rxcnt < state->id_rc_rx_rwqe_thresh) {
1485 		state->rc_rwqe_short++;
1486 	}
1487 #endif
1488 
1489 	/*
1490 	 * Possibly replenish the Rx pool if needed.
1491 	 */
1492 	if ((rxcnt >= state->id_rc_rx_rwqe_thresh) &&
1493 	    (wc->wc_bytes_xfer > state->id_rc_rx_copy_thresh)) {
1494 		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1495 		atomic_inc_64(&state->rc_rcv_trans_pkt);
1496 
1497 		/*
1498 		 * Record how many rwqe has been occupied by upper
1499 		 * network layer
1500 		 */
1501 		if (state->rc_enable_srq) {
1502 			atomic_add_32(&state->rc_srq_rwqe_list.
1503 			    dl_bufs_outstanding, 1);
1504 		} else {
1505 			atomic_add_32(&chan->rx_wqe_list.
1506 			    dl_bufs_outstanding, 1);
1507 		}
1508 		mp = rwqe->rwqe_im_mblk;
1509 	} else {
1510 		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1511 		atomic_inc_64(&state->rc_rcv_copy_pkt);
1512 
1513 		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1514 		    BPRI_HI)) == NULL) {	/* no memory */
1515 			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1516 			state->rc_rcv_alloc_fail++;
1517 			if (state->rc_enable_srq) {
1518 				if (ibd_rc_post_srq(state, rwqe) ==
1519 				    DDI_FAILURE) {
1520 					ibd_rc_srq_free_rwqe(state, rwqe);
1521 				}
1522 			} else {
1523 				if (ibd_rc_post_rwqe(chan, rwqe) ==
1524 				    DDI_FAILURE) {
1525 					ibd_rc_free_rwqe(chan, rwqe);
1526 				}
1527 			}
1528 			return;
1529 		}
1530 
1531 		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1532 		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1533 
1534 		if (state->rc_enable_srq) {
1535 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1536 				ibd_rc_srq_free_rwqe(state, rwqe);
1537 			}
1538 		} else {
1539 			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1540 				ibd_rc_free_rwqe(chan, rwqe);
1541 			}
1542 		}
1543 	}
1544 
1545 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1546 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1547 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1548 		len = ntohs(ip6h->ip6_plen);
1549 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1550 			/* LINTED: E_CONSTANT_CONDITION */
1551 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1552 		}
1553 	}
1554 
1555 	phdr = (ib_header_info_t *)mp->b_rptr;
1556 	phdr->ib_grh.ipoib_vertcflow = 0;
1557 	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1558 	    sizeof (ipoib_mac_t));
1559 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer+ IPOIB_GRH_SIZE;
1560 
1561 	/*
1562 	 * Can RC mode in IB guarantee its checksum correctness?
1563 	 *
1564 	 *	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
1565 	 *	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
1566 	 */
1567 
1568 	/*
1569 	 * Make sure this is NULL or we're in trouble.
1570 	 */
1571 	if (mp->b_next != NULL) {
1572 		ibd_print_warn(state,
1573 		    "ibd_rc_process_rx: got duplicate mp from rcq?");
1574 		mp->b_next = NULL;
1575 	}
1576 
1577 	/*
1578 	 * Add this mp to the list of processed mp's to send to
1579 	 * the nw layer
1580 	 */
1581 	if (state->rc_enable_srq) {
1582 		mutex_enter(&state->rc_rx_lock);
1583 		if (state->rc_rx_mp) {
1584 			ASSERT(state->rc_rx_mp_tail != NULL);
1585 			state->rc_rx_mp_tail->b_next = mp;
1586 		} else {
1587 			ASSERT(state->rc_rx_mp_tail == NULL);
1588 			state->rc_rx_mp = mp;
1589 		}
1590 
1591 		state->rc_rx_mp_tail = mp;
1592 		state->rc_rx_mp_len++;
1593 
1594 		if (state->rc_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1595 			mpc = state->rc_rx_mp;
1596 
1597 			state->rc_rx_mp = NULL;
1598 			state->rc_rx_mp_tail = NULL;
1599 			state->rc_rx_mp_len = 0;
1600 			mutex_exit(&state->rc_rx_lock);
1601 			mac_rx(state->id_mh, NULL, mpc);
1602 		} else {
1603 			mutex_exit(&state->rc_rx_lock);
1604 		}
1605 	} else {
1606 		mutex_enter(&chan->rx_lock);
1607 		if (chan->rx_mp) {
1608 			ASSERT(chan->rx_mp_tail != NULL);
1609 			chan->rx_mp_tail->b_next = mp;
1610 		} else {
1611 			ASSERT(chan->rx_mp_tail == NULL);
1612 			chan->rx_mp = mp;
1613 		}
1614 
1615 		chan->rx_mp_tail = mp;
1616 		chan->rx_mp_len++;
1617 
1618 		if (chan->rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1619 			mpc = chan->rx_mp;
1620 
1621 			chan->rx_mp = NULL;
1622 			chan->rx_mp_tail = NULL;
1623 			chan->rx_mp_len = 0;
1624 			mutex_exit(&chan->rx_lock);
1625 			mac_rx(state->id_mh, NULL, mpc);
1626 		} else {
1627 			mutex_exit(&chan->rx_lock);
1628 		}
1629 	}
1630 }
1631 
1632 /*
1633  * Callback code invoked from STREAMs when the recv data buffer is free
1634  * for recycling.
1635  */
1636 static void
1637 ibd_rc_freemsg_cb(char *arg)
1638 {
1639 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1640 	ibd_rc_chan_t *chan = rwqe->w_chan;
1641 	ibd_state_t *state = rwqe->w_state;
1642 
1643 	/*
1644 	 * If the wqe is being destructed, do not attempt recycling.
1645 	 */
1646 	if (rwqe->w_freeing_wqe == B_TRUE) {
1647 		return;
1648 	}
1649 
1650 	ASSERT(!state->rc_enable_srq);
1651 	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1652 
1653 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1654 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1655 	if (rwqe->rwqe_im_mblk == NULL) {
1656 		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1657 		ibd_rc_free_rwqe(chan, rwqe);
1658 		return;
1659 	}
1660 
1661 	/*
1662 	 * Post back to h/w. We could actually have more than
1663 	 * id_num_rwqe WQEs on the list if there were multiple
1664 	 * ibd_freemsg_cb() calls outstanding (since the lock is
1665 	 * not held the entire time). This will start getting
1666 	 * corrected over subsequent ibd_freemsg_cb() calls.
1667 	 */
1668 	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1669 		ibd_rc_free_rwqe(chan, rwqe);
1670 		return;
1671 	}
1672 	atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1);
1673 }
1674 
1675 /*
1676  * Common code for interrupt handling as well as for polling
1677  * for all completed wqe's while detaching.
1678  */
1679 static void
1680 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1681 {
1682 	ibd_wqe_t *wqe;
1683 	ibt_wc_t *wc, *wcs;
1684 	uint_t numwcs, real_numwcs;
1685 	int i;
1686 
1687 	wcs = chan->rx_wc;
1688 	numwcs = IBD_RC_MAX_CQ_WC;
1689 
1690 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1691 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1692 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1693 			if (wc->wc_status != IBT_WC_SUCCESS) {
1694 				chan->state->rc_rcq_err++;
1695 				/*
1696 				 * Channel being torn down.
1697 				 */
1698 				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1699 				    "SUCC, chan=%p", wc->wc_status, chan);
1700 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1701 					/*
1702 					 * Do not invoke Rx handler because
1703 					 * it might add buffers to the Rx pool
1704 					 * when we are trying to deinitialize.
1705 					 */
1706 					continue;
1707 				}
1708 			}
1709 			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1710 		}
1711 	}
1712 }
1713 
/* Receive CQ handler */
/* ARGSUSED */
static void
ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
	ibd_state_t *state = chan->state;

	/*
	 * rcq_invoking is bumped for the duration of the handler;
	 * presumably the teardown path waits for it to drain before
	 * destroying the channel — confirm against the close code.
	 */
	atomic_inc_32(&chan->rcq_invoking);
	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);

	/*
	 * Poll for completed entries; the CQ will not interrupt any
	 * more for incoming (or transmitted) packets.
	 */
	ibd_rc_poll_rcq(chan, chan->rcq_hdl);

	/*
	 * Now enable CQ notifications; all packets that arrive now
	 * (or complete transmission) will cause new interrupts.
	 */
	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
	    IBT_SUCCESS) {
		/*
		 * We do not expect a failure here.
		 */
		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
	}

	/*
	 * Repoll to catch all packets that might have arrived after
	 * we finished the first poll loop and before interrupts got
	 * armed.
	 */
	ibd_rc_poll_rcq(chan, chan->rcq_hdl);

	/*
	 * Flush any mblks batched by ibd_rc_process_rx() but not yet
	 * handed to mac_rx() (the batch threshold was not reached).
	 */
	if (state->rc_enable_srq) {
		mutex_enter(&state->rc_rx_lock);

		if (state->rc_rx_mp != NULL) {
			mblk_t *mpc;
			mpc = state->rc_rx_mp;

			state->rc_rx_mp = NULL;
			state->rc_rx_mp_tail = NULL;
			state->rc_rx_mp_len = 0;

			mutex_exit(&state->rc_rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&state->rc_rx_lock);
		}
	} else {
		mutex_enter(&chan->rx_lock);

		if (chan->rx_mp != NULL) {
			mblk_t *mpc;
			mpc = chan->rx_mp;

			chan->rx_mp = NULL;
			chan->rx_mp_tail = NULL;
			chan->rx_mp_len = 0;

			mutex_exit(&chan->rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&chan->rx_lock);
		}
	}
	atomic_dec_32(&chan->rcq_invoking);
}
1785 
1786 /*
1787  * Allocate the statically allocated Tx buffer list.
1788  */
1789 int
1790 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1791 {
1792 	ibd_rc_tx_largebuf_t *lbufp;
1793 	ibd_rc_tx_largebuf_t *tail;
1794 	uint8_t *memp;
1795 	ibt_mr_attr_t mem_attr;
1796 	uint32_t num_swqe;
1797 	size_t  mem_size;
1798 	int i;
1799 
1800 	num_swqe = state->id_rc_num_swqe - 1;
1801 
1802 	/*
1803 	 * Allocate one big chunk for all Tx large copy bufs
1804 	 */
1805 	/* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */
1806 	mem_size = num_swqe * state->rc_mtu;
1807 	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1808 
1809 	mem_attr.mr_len = mem_size;
1810 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1811 	mem_attr.mr_as = NULL;
1812 	mem_attr.mr_flags = IBT_MR_SLEEP;
1813 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1814 	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1815 		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1816 		    "failed");
1817 		kmem_free(state->rc_tx_mr_bufs, mem_size);
1818 		state->rc_tx_mr_bufs = NULL;
1819 		return (DDI_FAILURE);
1820 	}
1821 
1822 	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1823 	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1824 
1825 	/*
1826 	 * Set up the buf chain
1827 	 */
1828 	memp = state->rc_tx_mr_bufs;
1829 	mutex_enter(&state->rc_tx_large_bufs_lock);
1830 	lbufp = state->rc_tx_largebuf_desc_base;
1831 	for (i = 0; i < num_swqe; i++) {
1832 		lbufp->lb_buf = memp;
1833 		lbufp->lb_next = lbufp + 1;
1834 
1835 		tail = lbufp;
1836 
1837 		memp += state->rc_mtu;
1838 		lbufp++;
1839 	}
1840 	tail->lb_next = NULL;
1841 
1842 	/*
1843 	 * Set up the buffer information in ibd state
1844 	 */
1845 	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1846 	state->rc_tx_largebuf_nfree = num_swqe;
1847 	mutex_exit(&state->rc_tx_large_bufs_lock);
1848 	return (DDI_SUCCESS);
1849 }
1850 
1851 void
1852 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1853 {
1854 	uint32_t num_swqe;
1855 
1856 	num_swqe = state->id_rc_num_swqe - 1;
1857 
1858 	if (ibt_deregister_mr(state->id_hca_hdl,
1859 	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1860 		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1861 		    "failed");
1862 	}
1863 	state->rc_tx_mr_hdl = NULL;
1864 
1865 	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1866 	state->rc_tx_mr_bufs = NULL;
1867 
1868 	kmem_free(state->rc_tx_largebuf_desc_base,
1869 	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1870 	state->rc_tx_largebuf_desc_base = NULL;
1871 }
1872 
1873 static int
1874 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1875 {
1876 	ibt_mr_attr_t mem_attr;
1877 	ibd_state_t *state;
1878 
1879 	state = chan->state;
1880 	ASSERT(state != NULL);
1881 
1882 	/*
1883 	 * Allocate one big chunk for all regular tx copy bufs
1884 	 */
1885 	mem_attr.mr_len = chan->scq_size * state->id_rc_tx_copy_thresh;
1886 
1887 	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
1888 
1889 	/*
1890 	 * Do one memory registration on the entire txbuf area
1891 	 */
1892 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1893 	mem_attr.mr_as = NULL;
1894 	mem_attr.mr_flags = IBT_MR_SLEEP;
1895 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1896 	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1897 		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1898 		ASSERT(mem_attr.mr_len ==
1899 		    chan->scq_size * state->id_rc_tx_copy_thresh);
1900 		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1901 		chan->tx_mr_bufs = NULL;
1902 		return (DDI_FAILURE);
1903 	}
1904 
1905 	return (DDI_SUCCESS);
1906 }
1907 
1908 /*
1909  * Allocate the statically allocated Tx buffer list.
1910  */
1911 static int
1912 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1913 {
1914 	ibd_swqe_t *swqe;
1915 	int i;
1916 	ibt_lkey_t lkey;
1917 	ibd_state_t *state = chan->state;
1918 
1919 	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1920 		return (DDI_FAILURE);
1921 
1922 	/*
1923 	 * Allocate and setup the swqe list
1924 	 */
1925 	lkey = chan->tx_mr_desc.md_lkey;
1926 	chan->tx_wqes = kmem_zalloc(chan->scq_size *
1927 	    sizeof (ibd_swqe_t), KM_SLEEP);
1928 	swqe = chan->tx_wqes;
1929 	for (i = 0; i < chan->scq_size; i++, swqe++) {
1930 		swqe->swqe_next = NULL;
1931 		swqe->swqe_im_mblk = NULL;
1932 
1933 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1934 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1935 
1936 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1937 		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1938 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1939 		    (chan->tx_mr_bufs + i * state->id_rc_tx_copy_thresh);
1940 		swqe->w_swr.wr_trans = IBT_RC_SRV;
1941 
1942 		/* Add to list */
1943 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
1944 		chan->tx_wqe_list.dl_cnt++;
1945 		swqe->swqe_next = chan->tx_wqe_list.dl_head;
1946 		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1947 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
1948 	}
1949 
1950 	return (DDI_SUCCESS);
1951 }
1952 
1953 /*
1954  * Free the statically allocated Tx buffer list.
1955  */
1956 static void
1957 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1958 {
1959 	ibd_state_t *state = chan->state;
1960 	if (chan->tx_mr_hdl != NULL) {
1961 		if (ibt_deregister_mr(chan->state->id_hca_hdl,
1962 		    chan->tx_mr_hdl) != IBT_SUCCESS) {
1963 			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1964 			    "failed");
1965 		}
1966 		chan->tx_mr_hdl = NULL;
1967 	}
1968 
1969 	if (chan->tx_mr_bufs != NULL) {
1970 		kmem_free(chan->tx_mr_bufs, chan->scq_size *
1971 		    state->id_rc_tx_copy_thresh);
1972 		chan->tx_mr_bufs = NULL;
1973 	}
1974 
1975 	if (chan->tx_wqes != NULL) {
1976 		kmem_free(chan->tx_wqes, chan->scq_size *
1977 		    sizeof (ibd_swqe_t));
1978 		chan->tx_wqes = NULL;
1979 	}
1980 }
1981 
1982 /*
1983  * Acquire send wqe from free list.
1984  * Returns error number and send wqe pointer.
1985  */
1986 ibd_swqe_t *
1987 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1988 {
1989 	ibd_swqe_t *wqe;
1990 
1991 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1992 	if (chan->tx_rel_list.dl_head != NULL) {
1993 		/* transfer id_tx_rel_list to id_tx_list */
1994 		chan->tx_wqe_list.dl_head =
1995 		    chan->tx_rel_list.dl_head;
1996 		chan->tx_wqe_list.dl_cnt =
1997 		    chan->tx_rel_list.dl_cnt;
1998 		chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1999 
2000 		/* clear id_tx_rel_list */
2001 		chan->tx_rel_list.dl_head = NULL;
2002 		chan->tx_rel_list.dl_cnt = 0;
2003 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2004 
2005 		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
2006 		chan->tx_wqe_list.dl_cnt -= 1;
2007 		chan->tx_wqe_list.dl_head = wqe->swqe_next;
2008 	} else {	/* no free swqe */
2009 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2010 		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
2011 		wqe = NULL;
2012 	}
2013 	return (wqe);
2014 }
2015 
2016 /*
2017  * Release send wqe back into free list.
2018  */
2019 static void
2020 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
2021 {
2022 	/*
2023 	 * Add back on Tx list for reuse.
2024 	 */
2025 	swqe->swqe_next = NULL;
2026 	mutex_enter(&chan->tx_rel_list.dl_mutex);
2027 	chan->tx_rel_list.dl_pending_sends = B_FALSE;
2028 	swqe->swqe_next = chan->tx_rel_list.dl_head;
2029 	chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
2030 	chan->tx_rel_list.dl_cnt++;
2031 	mutex_exit(&chan->tx_rel_list.dl_mutex);
2032 }
2033 
/*
 * Post one send work request on the RC channel, then keep draining and
 * posting any requests other senders queued on chan->tx_head while this
 * thread held tx_busy, batching up to IBD_MAX_TX_POST_MULTIPLE wrs per
 * ibt_post_send() call. Failed posts are reclaimed via
 * ibd_rc_tx_cleanup().
 */
void
ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
{
	uint_t		i;
	uint_t		num_posted;
	uint_t		n_wrs;
	ibt_status_t	ibt_status;
	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
	ibd_swqe_t	*tx_head, *elem;
	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];

	/* post the one request, then check for more */
	ibt_status = ibt_post_send(chan->chan_hdl,
	    &node->w_swr, 1, NULL);
	if (ibt_status != IBT_SUCCESS) {
		ibd_print_warn(chan->state, "ibd_post_send: "
		    "posting one wr failed: ret=%d", ibt_status);
		/* Reclaim the failed node; queued wrs are still processed */
		ibd_rc_tx_cleanup(node);
	}

	tx_head = NULL;
	for (;;) {
		if (tx_head == NULL) {
			/*
			 * Take the queued chain (if any) under the lock;
			 * if empty, clear tx_busy and we are done —
			 * other senders may then post directly again.
			 */
			mutex_enter(&chan->tx_post_lock);
			tx_head = chan->tx_head;
			if (tx_head == NULL) {
				chan->tx_busy = 0;
				mutex_exit(&chan->tx_post_lock);
				return;
			}
			chan->tx_head = NULL;
			mutex_exit(&chan->tx_post_lock);
		}

		/*
		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
		 * at a time if possible, and keep posting them.
		 */
		for (n_wrs = 0, elem = tx_head;
		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
			nodes[n_wrs] = elem;
			wrs[n_wrs] = elem->w_swr;
		}
		tx_head = elem;

		ASSERT(n_wrs != 0);

		/*
		 * If posting fails for some reason, we'll never receive
		 * completion intimation, so we'll need to cleanup. But
		 * we need to make sure we don't clean up nodes whose
		 * wrs have been successfully posted. We assume that the
		 * hca driver returns on the first failure to post and
		 * therefore the first 'num_posted' entries don't need
		 * cleanup here.
		 */
		num_posted = 0;
		ibt_status = ibt_post_send(chan->chan_hdl,
		    wrs, n_wrs, &num_posted);
		if (ibt_status != IBT_SUCCESS) {
			ibd_print_warn(chan->state, "ibd_post_send: "
			    "posting multiple wrs failed: "
			    "requested=%d, done=%d, ret=%d",
			    n_wrs, num_posted, ibt_status);

			for (i = num_posted; i < n_wrs; i++)
				ibd_rc_tx_cleanup(nodes[i]);
		}
	}
}
2105 
2106 /*
2107  * Common code that deals with clean ups after a successful or
2108  * erroneous transmission attempt.
2109  */
2110 void
2111 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
2112 {
2113 	ibd_ace_t *ace = swqe->w_ahandle;
2114 	ibd_state_t *state;
2115 
2116 	ASSERT(ace != NULL);
2117 	ASSERT(ace->ac_chan != NULL);
2118 
2119 	state = ace->ac_chan->state;
2120 
2121 	/*
2122 	 * If this was a dynamic registration in ibd_send(),
2123 	 * deregister now.
2124 	 */
2125 	if (swqe->swqe_im_mblk != NULL) {
2126 		ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
2127 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
2128 			ibd_unmap_mem(state, swqe);
2129 		}
2130 		freemsg(swqe->swqe_im_mblk);
2131 		swqe->swqe_im_mblk = NULL;
2132 	} else {
2133 		ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
2134 	}
2135 
2136 	if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
2137 		ibd_rc_tx_largebuf_t *lbufp;
2138 
2139 		lbufp = swqe->w_rc_tx_largebuf;
2140 		ASSERT(lbufp != NULL);
2141 
2142 		mutex_enter(&state->rc_tx_large_bufs_lock);
2143 		lbufp->lb_next = state->rc_tx_largebuf_free_head;
2144 		state->rc_tx_largebuf_free_head = lbufp;
2145 		state->rc_tx_largebuf_nfree ++;
2146 		mutex_exit(&state->rc_tx_large_bufs_lock);
2147 		swqe->w_rc_tx_largebuf = NULL;
2148 	}
2149 
2150 
2151 	/*
2152 	 * Release the send wqe for reuse.
2153 	 */
2154 	ibd_rc_release_swqe(ace->ac_chan, swqe);
2155 
2156 	/*
2157 	 * Drop the reference count on the AH; it can be reused
2158 	 * now for a different destination if there are no more
2159 	 * posted sends that will use it. This can be eliminated
2160 	 * if we can always associate each Tx buffer with an AH.
2161 	 * The ace can be null if we are cleaning up from the
2162 	 * ibd_send() error path.
2163 	 */
2164 	ibd_dec_ref_ace(state, ace);
2165 }
2166 
/*
 * Drain the send CQ: reclaim completed swqes and, on the first send
 * completion error seen, initiate teardown of this RC channel. After
 * each poll batch, if the Tx path had blocked on a resource shortage
 * (swqes or large copy buffers), re-check availability and call
 * mac_tx_update() to restart transmission.
 */
void
ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
{
	ibd_state_t *state = chan->state;
	ibd_wqe_t *wqe;
	ibt_wc_t *wc, *wcs;
	ibd_ace_t *ace;
	uint_t numwcs, real_numwcs;
	int i;
	boolean_t encount_error;

	wcs = chan->tx_wc;
	numwcs = IBD_RC_MAX_CQ_WC;
	encount_error = B_FALSE;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				/* Only react to the first error seen */
				if (encount_error == B_FALSE) {
					/*
					 * This RC channel is in error status,
					 * remove it.
					 */
					encount_error = B_TRUE;
					mutex_enter(&state->id_ac_mutex);
					/*
					 * Initiate the close only if the
					 * channel is still established, the
					 * link is up, and our ace is still
					 * the active cache entry for this
					 * destination; otherwise another
					 * thread is already closing it.
					 */
					if ((chan->chan_state ==
					    IBD_RC_STATE_ACT_ESTAB) &&
					    (chan->state->id_link_state ==
					    LINK_STATE_UP) &&
					    ((ace = ibd_acache_find(state,
					    &chan->ace->ac_mac, B_FALSE, 0))
					    != NULL) && (ace == chan->ace)) {
						ASSERT(ace->ac_mce == NULL);
						INC_REF(ace, 1);
						IBD_ACACHE_PULLOUT_ACTIVE(
						    state, ace);
						chan->chan_state =
						    IBD_RC_STATE_ACT_CLOSING;
						mutex_exit(&state->id_ac_mutex);
						state->rc_reset_cnt++;
						DPRINT(30, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, ace=%p, "
						    "link_state=%d"
						    "reset RC channel",
						    wc->wc_status, chan,
						    chan->ace, chan->state->
						    id_link_state);
						ibd_rc_signal_act_close(
						    state, ace);
					} else {
						mutex_exit(&state->id_ac_mutex);
						state->
						    rc_act_close_simultaneous++;
						DPRINT(40, "ibd_rc_drain_scq: "
						    "wc_status(%d) != SUCC, "
						    "chan=%p, chan_state=%d,"
						    "ace=%p, link_state=%d."
						    "other thread is closing "
						    "it", wc->wc_status, chan,
						    chan->chan_state, chan->ace,
						    chan->state->id_link_state);
					}
				}
			}
			/* Reclaim the swqe whether the send succeeded or not */
			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
		}

		mutex_enter(&state->id_sched_lock);
		if (state->id_sched_needed == 0) {
			mutex_exit(&state->id_sched_lock);
		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
			/* Tx had blocked waiting for RC swqes */
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
			if ((chan->tx_rel_list.dl_cnt +
			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				state->rc_swqe_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_swqe++;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
			/* Tx had blocked waiting for a large copy buffer */
			mutex_enter(&state->rc_tx_large_bufs_lock);
			if (state->rc_tx_largebuf_nfree >
			    IBD_RC_TX_FREE_THRESH) {
				ASSERT(state->rc_tx_largebuf_free_head != NULL);
				state->id_sched_needed &=
				    ~IBD_RSRC_RC_TX_LARGEBUF;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
				state->rc_xmt_buf_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_largebuf++;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
			/* Tx had blocked on plain (non-RC) swqes */
			mutex_enter(&state->id_tx_list.dl_mutex);
			mutex_enter(&state->id_tx_rel_list.dl_mutex);
			if ((state->id_tx_list.dl_cnt +
			    state->id_tx_rel_list.dl_cnt)
			    > IBD_FREE_SWQES_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_SWQE;
				state->id_sched_cnt++;
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				mac_tx_update(state->id_mh);
			} else {
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else {
			mutex_exit(&state->id_sched_lock);
		}
	}
}
2294 
2295 /* Send CQ handler, call ibd_rx_tx_cleanup to recycle Tx buffers */
2296 /* ARGSUSED */
2297 static void
2298 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2299 {
2300 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2301 
2302 	if (ibd_rc_tx_softintr == 1) {
2303 		mutex_enter(&chan->tx_poll_lock);
2304 		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2305 			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2306 			mutex_exit(&chan->tx_poll_lock);
2307 			return;
2308 		} else {
2309 			mutex_exit(&chan->tx_poll_lock);
2310 			ddi_trigger_softintr(chan->scq_softintr);
2311 		}
2312 	} else
2313 		(void) ibd_rc_tx_recycle(arg);
2314 }
2315 
/*
 * Drain the send CQ and re-arm its completion notifications.
 *
 * Only one thread polls at a time (tx_poll_busy & IBD_CQ_POLLING); a
 * concurrent caller just sets IBD_REDO_CQ_POLLING, which makes the
 * active poller loop again so no completion is left unreaped.
 * Always returns DDI_INTR_CLAIMED.
 */
static uint_t
ibd_rc_tx_recycle(caddr_t arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
	ibd_state_t *state = chan->state;
	int flag, redo_flag;
	int redo = 1;

	flag = IBD_CQ_POLLING;
	redo_flag = IBD_REDO_CQ_POLLING;

	/* Become the single active poller, or hand off a redo request */
	mutex_enter(&chan->tx_poll_lock);
	if (chan->tx_poll_busy & flag) {
		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
		    "threads");
		chan->tx_poll_busy |= redo_flag;
		mutex_exit(&chan->tx_poll_lock);
		return (DDI_INTR_CLAIMED);
	}
	chan->tx_poll_busy |= flag;
	mutex_exit(&chan->tx_poll_lock);

	/*
	 * Poll for completed entries; the CQ will not interrupt any
	 * more for completed packets.
	 */
	ibd_rc_drain_scq(chan, chan->scq_hdl);

	/*
	 * Now enable CQ notifications; all completions originating now
	 * will cause new interrupts.
	 */
	do {
		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			/*
			 * We do not expect a failure here.
			 */
			DPRINT(40, "ibd_rc_scq_handler: ibt_enable_cq_notify()"
			    " failed");
		}

		/* Drain again to close the race with re-enabled notify */
		ibd_rc_drain_scq(chan, chan->scq_hdl);

		mutex_enter(&chan->tx_poll_lock);
		if (chan->tx_poll_busy & redo_flag)
			chan->tx_poll_busy &= ~redo_flag;
		else {
			chan->tx_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&chan->tx_poll_lock);

	} while (redo);

	return (DDI_INTR_CLAIMED);
}
2373 
2374 static ibt_status_t
2375 ibd_register_service(ibt_srv_desc_t *srv, ib_svc_id_t sid,
2376     int num_sids, ibt_srv_hdl_t *srv_hdl, ib_svc_id_t *ret_sid)
2377 {
2378 	ibd_service_t *p;
2379 	ibt_status_t status;
2380 
2381 	mutex_enter(&ibd_gstate.ig_mutex);
2382 	for (p = ibd_gstate.ig_service_list; p != NULL; p = p->is_link) {
2383 		if (p->is_sid == sid) {
2384 			p->is_ref_cnt++;
2385 			*srv_hdl = p->is_srv_hdl;
2386 			*ret_sid = sid;
2387 			mutex_exit(&ibd_gstate.ig_mutex);
2388 			return (IBT_SUCCESS);
2389 		}
2390 	}
2391 	status = ibt_register_service(ibd_gstate.ig_ibt_hdl, srv, sid,
2392 	    num_sids, srv_hdl, ret_sid);
2393 	if (status == IBT_SUCCESS) {
2394 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2395 		p->is_srv_hdl = *srv_hdl;
2396 		p->is_sid = sid;
2397 		p->is_ref_cnt = 1;
2398 		p->is_link = ibd_gstate.ig_service_list;
2399 		ibd_gstate.ig_service_list = p;
2400 	}
2401 	mutex_exit(&ibd_gstate.ig_mutex);
2402 	return (status);
2403 }
2404 
2405 static ibt_status_t
2406 ibd_deregister_service(ibt_srv_hdl_t srv_hdl)
2407 {
2408 	ibd_service_t *p, **pp;
2409 	ibt_status_t status;
2410 
2411 	mutex_enter(&ibd_gstate.ig_mutex);
2412 	for (pp = &ibd_gstate.ig_service_list; *pp != NULL;
2413 	    pp = &((*pp)->is_link)) {
2414 		p = *pp;
2415 		if (p->is_srv_hdl == srv_hdl) {	/* Found it */
2416 			if (--p->is_ref_cnt == 0) {
2417 				status = ibt_deregister_service(
2418 				    ibd_gstate.ig_ibt_hdl, srv_hdl);
2419 				*pp = p->is_link; /* link prev to next */
2420 				kmem_free(p, sizeof (*p));
2421 			} else {
2422 				status = IBT_SUCCESS;
2423 			}
2424 			mutex_exit(&ibd_gstate.ig_mutex);
2425 			return (status);
2426 		}
2427 	}
2428 	/* Should not ever get here */
2429 	mutex_exit(&ibd_gstate.ig_mutex);
2430 	return (IBT_FAILURE);
2431 }
2432 
/*
 * Register and bind the IPoIB-CM listen service(s) for this port.
 *
 * Two listeners are set up: one on the correct service id derived
 * from our QPN, and one on the wrong (extra zero digit) service id
 * that legacy OFED used, for interoperability. Each failure path
 * unwinds everything that was set up before it.
 */
ibt_status_t
ibd_rc_listen(ibd_state_t *state)
{
	ibt_srv_desc_t srvdesc;
	ib_svc_id_t ret_sid;
	ibt_status_t status;
	ib_gid_t gid;

	if (state->rc_listen_hdl != NULL) {
		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
		return (IBT_FAILURE);
	}

	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with service id
	 * Incoming connection requests should arrive on this service id.
	 */
	status = ibd_register_service(&srvdesc,
	    IBD_RC_QPN_TO_SID(state->id_qpnum),
	    1, &state->rc_listen_hdl, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
		    "ret=%d", status);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl,
	    gid, NULL, state, &state->rc_listen_bind);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen:"
		    " fail to bind port: <%d>", status);
		/* Unwind: drop the service registered above */
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	/*
	 * Legacy OFED had used a wrong service ID (one additional zero digit)
	 * for many years. To interop with legacy OFED, we support this wrong
	 * service ID here.
	 */
	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);

	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with service id
	 * Incoming connection requests should arrive on this service id.
	 */
	status = ibd_register_service(&srvdesc,
	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40,
		    "ibd_rc_listen: Service Registration for Legacy OFED "
		    "Failed %d", status);
		/* Unwind: unbind and deregister the primary service */
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
		    "Legacy OFED listener", status);
		/* Unwind: interop service, then the primary bind/service */
		(void) ibd_deregister_service(
		    state->rc_listen_hdl_OFED_interop);
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibd_deregister_service(state->rc_listen_hdl);
		return (status);
	}

	return (IBT_SUCCESS);
}
2522 
/*
 * Undo ibd_rc_listen(): unbind and deregister both the primary and the
 * legacy-OFED-interop listen services. A handle is NULLed only when
 * its deregistration succeeds, so a failed teardown can be retried.
 */
void
ibd_rc_stop_listen(ibd_state_t *state)
{
	int ret;

	/* Disable incoming connection requests */
	if (state->rc_listen_hdl != NULL) {
		ret = ibt_unbind_all_services(state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_unbind_all_services() failed, ret=%d", ret);
		}
		ret = ibd_deregister_service(state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibd_deregister_service() failed, ret=%d", ret);
		} else {
			state->rc_listen_hdl = NULL;
		}
	}

	/* Disable incoming connection requests (legacy OFED service id) */
	if (state->rc_listen_hdl_OFED_interop != NULL) {
		ret = ibt_unbind_all_services(
		    state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_unbind_all_services() failed: %d", ret);
		}
		ret = ibd_deregister_service(state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibd_deregister_service() failed: %d", ret);
		} else {
			state->rc_listen_hdl_OFED_interop = NULL;
		}
	}
}
2561 
/*
 * Tear down all RC channels (passive and active) for this port.
 *
 * Rx completion delivery is disabled first, then passive channels are
 * closed, then every connected ace is pulled from the active address
 * cache and its channel closed; finally we wait (bounded) for any
 * channels that other threads are still closing.
 */
void
ibd_rc_close_all_chan(ibd_state_t *state)
{
	ibd_rc_chan_t *rc_chan;
	ibd_ace_t *ace, *pre_ace;
	uint_t attempts;

	/* Disable all Rx routines */
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	rc_chan = state->rc_pass_chan_list.chan_list;
	while (rc_chan != NULL) {
		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
		rc_chan = rc_chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);

	/* Wait up to ~1s for the network layer to return SRQ rx bufs */
	if (state->rc_enable_srq) {
		attempts = 10;
		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are pending bufs with the network
				 * layer and we have no choice but to wait
				 * for them to be done with. Reap all the
				 * Tx/Rx completions that were posted since
				 * we turned off the notification and
				 * return failure.
				 */
				break;
			}
		}
	}

	/* Close all passive RC channels */
	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	while (rc_chan != NULL) {
		(void) ibd_rc_pas_close(rc_chan, B_TRUE, B_FALSE);
		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	}

	/*
	 * Close all active RC channels: pull each connected ace out of
	 * the active cache and park its channel on the obsolete list.
	 */
	mutex_enter(&state->id_ac_mutex);
	state->id_ac_hot_ace = NULL;
	ace = list_head(&state->id_ah_active);
	while ((pre_ace = ace) != NULL) {
		ace = list_next(&state->id_ah_active, ace);
		if (pre_ace->ac_chan != NULL) {
			INC_REF(pre_ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, pre_ace);
			pre_ace->ac_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
			    pre_ace->ac_chan);
		}
	}
	mutex_exit(&state->id_ac_mutex);

	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
	while (rc_chan != NULL) {
		ace = rc_chan->ace;
		ibd_rc_act_close(rc_chan, B_TRUE);
		if (ace != NULL) {
			/* Drop the ref taken above; free or mark the ace */
			mutex_enter(&state->id_ac_mutex);
			ASSERT(ace->ac_ref != 0);
			atomic_dec_32(&ace->ac_ref);
			ace->ac_chan = NULL;
			if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
				IBD_ACACHE_INSERT_FREE(state, ace);
				ace->ac_ref = 0;
			} else {
				/* Still referenced; flag for later recycle */
				ace->ac_ref |= CYCLEVAL;
				state->rc_delay_ace_recycle++;
			}
			mutex_exit(&state->id_ac_mutex);
		}
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}

	attempts = 400;
	while (((state->rc_num_tx_chan != 0) ||
	    (state->rc_num_rx_chan != 0)) && (attempts > 0)) {
		/* Other threads are closing CM channels; wait for them */
		delay(drv_usectohz(100000));
		attempts--;
	}
}
2650 
/*
 * Try to establish an RC connection for ace over the given path,
 * giving the peer time to clean up any stale channel between attempts:
 * two tries with the legacy-OFED interop service id, then a final try
 * with the standard service id. Bails out early whenever the driver is
 * no longer started.
 */
void
ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path)
{
	ibt_status_t status;

	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return;

	status = ibd_rc_connect(state, ace, path,
	    IBD_RC_SERVICE_ID_OFED_INTEROP);

	if (status != IBT_SUCCESS) {
		/* wait for the peer side to remove the stale channel */
		delay(drv_usectohz(10000));
		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
			return;
		status = ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID_OFED_INTEROP);
	}

	if (status != IBT_SUCCESS) {
		/* wait for the peer side to remove the stale channel */
		delay(drv_usectohz(10000));
		if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
			return;
		/* last attempt, standard SID; result intentionally ignored */
		(void) ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID);
	}
}
2680 
2681 /*
2682  * Allocates channel and sets the ace->ac_chan to it.
2683  * Opens the channel.
2684  */
2685 ibt_status_t
2686 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path,
2687     uint64_t ietf_cm_service_id)
2688 {
2689 	ibt_status_t status = 0;
2690 	ibt_rc_returns_t open_returns;
2691 	ibt_chan_open_args_t open_args;
2692 	ibd_rc_msg_hello_t hello_req_msg;
2693 	ibd_rc_msg_hello_t *hello_ack_msg;
2694 	ibd_rc_chan_t *chan;
2695 	ibt_ud_dest_query_attr_t dest_attrs;
2696 
2697 	ASSERT(ace != NULL);
2698 	ASSERT(ace->ac_mce == NULL);
2699 	ASSERT(ace->ac_chan == NULL);
2700 
2701 	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2702 		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2703 		return (status);
2704 	}
2705 
2706 	ace->ac_chan = chan;
2707 	chan->state = state;
2708 	chan->ace = ace;
2709 
2710 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2711 
2712 	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2713 
2714 	/*
2715 	 * open the channels
2716 	 */
2717 	bzero(&open_args, sizeof (ibt_chan_open_args_t));
2718 	bzero(&open_returns, sizeof (ibt_rc_returns_t));
2719 
2720 	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2721 	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2722 
2723 	/*
2724 	 * update path record with the SID
2725 	 */
2726 	if ((status = ibt_query_ud_dest(ace->ac_dest, &dest_attrs))
2727 	    != IBT_SUCCESS) {
2728 		DPRINT(40, "ibd_rc_connect: ibt_query_ud_dest() failed, "
2729 		    "ret=%d", status);
2730 		return (status);
2731 	}
2732 
2733 	path->pi_sid =
2734 	    ietf_cm_service_id | ((dest_attrs.ud_dst_qpn) & 0xffffff);
2735 
2736 
2737 	/* pre-allocate memory for hello ack message */
2738 	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2739 	open_returns.rc_priv_data = hello_ack_msg;
2740 
2741 	open_args.oc_path = path;
2742 
2743 	open_args.oc_path_rnr_retry_cnt	= 1;
2744 	open_args.oc_path_retry_cnt = 1;
2745 
2746 	/* We don't do RDMA */
2747 	open_args.oc_rdma_ra_out = 0;
2748 	open_args.oc_rdma_ra_in	= 0;
2749 
2750 	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2751 	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2752 	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2753 	open_args.oc_priv_data = (void *)(&hello_req_msg);
2754 
2755 	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2756 	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2757 	ASSERT(open_args.oc_cm_handler != NULL);
2758 
2759 	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2760 	    IBT_BLOCKING, &open_args, &open_returns);
2761 
2762 	if (status == IBT_SUCCESS) {
2763 		/* Success! */
2764 		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
2765 		state->rc_conn_succ++;
2766 		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2767 		return (IBT_SUCCESS);
2768 	}
2769 
2770 	/* failure */
2771 	(void) ibt_flush_channel(chan->chan_hdl);
2772 	ibd_rc_free_chan(chan);
2773 	ace->ac_chan = NULL;
2774 
2775 	/* check open_returns report error and exit */
2776 	DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail."
2777 	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2778 	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2779 	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2780 	    dest_attrs.ud_dst_qpn);
2781 	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2782 	return (status);
2783 }
2784 
2785 void
2786 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2787 {
2788 	ibd_req_t *req;
2789 
2790 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2791 	if (req == NULL) {
2792 		ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
2793 		    "ibd_req_t fail");
2794 		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2795 		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2796 		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2797 		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2798 	} else {
2799 		req->rq_ptr = ace->ac_chan;
2800 		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2801 	}
2802 }
2803 
/*
 * Hand an ace to the async thread for recycling, best effort: do
 * nothing if a recycle is already in flight or if no request structure
 * can be allocated.
 *
 * NOTE(review): INC_REF/IBD_ACACHE_PULLOUT_ACTIVE are invoked here
 * without taking id_ac_mutex locally — presumably the caller holds it;
 * confirm against call sites.
 */
void
ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
{
	ibd_req_t *req;

	mutex_enter(&state->rc_ace_recycle_lock);
	if (state->rc_ace_recycle != NULL) {
		/* Another recycle is already in progress; skip */
		mutex_exit(&state->rc_ace_recycle_lock);
		return;
	}

	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req == NULL) {
		/* No memory; silently skip (best effort) */
		mutex_exit(&state->rc_ace_recycle_lock);
		return;
	}

	state->rc_ace_recycle = ace;
	mutex_exit(&state->rc_ace_recycle_lock);
	ASSERT(ace->ac_mce == NULL);
	INC_REF(ace, 1);
	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
	req->rq_ptr = ace;
	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
}
2829 
2830 /*
2831  * Close an active channel
2832  *
2833  * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
2834  */
2835 static void
2836 ibd_rc_act_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan)
2837 {
2838 	ibd_state_t *state;
2839 	ibd_ace_t *ace;
2840 	uint_t times;
2841 	ibt_status_t ret;
2842 
2843 	ASSERT(chan != NULL);
2844 
2845 	chan->state->rc_act_close++;
2846 	switch (chan->chan_state) {
2847 	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
2848 	case IBD_RC_STATE_ACT_ESTAB:
2849 		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
2850 		    "act_state=%d, chan=%p", chan->chan_state, chan);
2851 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2852 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2853 		/*
2854 		 * Wait send queue empty. Its old value is 50 (5 seconds). But
2855 		 * in my experiment, 5 seconds is not enough time to let IBTL
2856 		 * return all buffers and ace->ac_ref. I tried 25 seconds, it
2857 		 * works well. As another evidence, I saw IBTL takes about 17
2858 		 * seconds every time it cleans a stale RC channel.
2859 		 */
2860 		times = 250;
2861 		ace = chan->ace;
2862 		ASSERT(ace != NULL);
2863 		state = chan->state;
2864 		ASSERT(state != NULL);
2865 		mutex_enter(&state->id_ac_mutex);
2866 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
2867 		mutex_enter(&chan->tx_rel_list.dl_mutex);
2868 		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
2869 		    != chan->scq_size) || ((ace->ac_ref != 1) &&
2870 		    (ace->ac_ref != (CYCLEVAL+1)))) {
2871 			mutex_exit(&chan->tx_rel_list.dl_mutex);
2872 			mutex_exit(&chan->tx_wqe_list.dl_mutex);
2873 			mutex_exit(&state->id_ac_mutex);
2874 			times--;
2875 			if (times == 0) {
2876 				state->rc_act_close_not_clean++;
2877 				DPRINT(40, "ibd_rc_act_close: dl_cnt(tx_wqe_"
2878 				    "list=%d, tx_rel_list=%d) != chan->"
2879 				    "scq_size=%d, OR ac_ref(=%d) not clean",
2880 				    chan->tx_wqe_list.dl_cnt,
2881 				    chan->tx_rel_list.dl_cnt,
2882 				    chan->scq_size, ace->ac_ref);
2883 				break;
2884 			}
2885 			mutex_enter(&chan->tx_poll_lock);
2886 			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2887 				DPRINT(40, "ibd_rc_act_close: multiple "
2888 				    "polling threads");
2889 				mutex_exit(&chan->tx_poll_lock);
2890 			} else {
2891 				chan->tx_poll_busy = IBD_CQ_POLLING;
2892 				mutex_exit(&chan->tx_poll_lock);
2893 				ibd_rc_drain_scq(chan, chan->scq_hdl);
2894 				mutex_enter(&chan->tx_poll_lock);
2895 				chan->tx_poll_busy = 0;
2896 				mutex_exit(&chan->tx_poll_lock);
2897 			}
2898 			delay(drv_usectohz(100000));
2899 			mutex_enter(&state->id_ac_mutex);
2900 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2901 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2902 		}
2903 		if (times != 0) {
2904 			mutex_exit(&chan->tx_rel_list.dl_mutex);
2905 			mutex_exit(&chan->tx_wqe_list.dl_mutex);
2906 			mutex_exit(&state->id_ac_mutex);
2907 		}
2908 
2909 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2910 		if (is_close_rc_chan) {
2911 			ret = ibt_close_rc_channel(chan->chan_hdl,
2912 			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
2913 			    0);
2914 			if (ret != IBT_SUCCESS) {
2915 				DPRINT(40, "ibd_rc_act_close: ibt_close_rc_"
2916 				    "channel fail, chan=%p, ret=%d",
2917 				    chan, ret);
2918 			} else {
2919 				DPRINT(30, "ibd_rc_act_close: ibt_close_rc_"
2920 				    "channel succ, chan=%p", chan);
2921 			}
2922 		}
2923 
2924 		ibd_rc_free_chan(chan);
2925 		break;
2926 	case IBD_RC_STATE_ACT_REP_RECV:
2927 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2928 		(void) ibt_flush_channel(chan->chan_hdl);
2929 		ibd_rc_free_chan(chan);
2930 		break;
2931 	case IBD_RC_STATE_ACT_ERROR:
2932 		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
2933 		break;
2934 	default:
2935 		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
2936 		    "chan=%p", chan->chan_state, chan);
2937 	}
2938 }
2939 
2940 /*
2941  * Close a passive channel
2942  *
2943  * is_close_rc_chan: if B_TRUE, we will call ibt_close_rc_channel()
2944  *
2945  * is_timeout_close: if B_TRUE, this function is called by the connection
2946  * reaper (refer to function ibd_rc_conn_timeout_call). When the connection
2947  * reaper calls ibd_rc_pas_close(), and if it finds that dl_bufs_outstanding
2948  * or chan->rcq_invoking is non-zero, then it can simply put that channel back
2949  * on the passive channels list and move on, since it might be an indication
 * that the channel became active again by the time we started its cleanup.
2951  * It is costlier to do the cleanup and then reinitiate the channel
2952  * establishment and hence it will help to be conservative when we do the
2953  * cleanup.
2954  */
int
ibd_rc_pas_close(ibd_rc_chan_t *chan, boolean_t is_close_rc_chan,
    boolean_t is_timeout_close)
{
	uint_t times;		/* retry budget: 50 x 100ms = 5 seconds */
	ibt_status_t ret;

	ASSERT(chan != NULL);
	chan->state->rc_pas_close++;	/* statistic: passive closes attempted */

	switch (chan->chan_state) {
	case IBD_RC_STATE_PAS_ESTAB:
		if (is_timeout_close) {
			/*
			 * Called from the connection reaper.  If the rcq
			 * handler is currently running, or (non-SRQ mode) rx
			 * buffers are still loaned out to upper layers, the
			 * channel may have become active again: put it back
			 * on the passive channel list and bail out instead
			 * of tearing it down (see block comment above).
			 */
			if ((chan->rcq_invoking != 0) ||
			    ((!chan->state->rc_enable_srq) &&
			    (chan->rx_wqe_list.dl_bufs_outstanding > 0))) {
				if (ibd_rc_re_add_to_pas_chan_list(chan)) {
					return (DDI_FAILURE);
				}
				/*
				 * NOTE(review): if re-adding fails we fall
				 * through and close anyway -- presumably the
				 * list is being torn down; confirm.
				 */
			}
		}
		/*
		 * First, stop receive interrupts; this stops the
		 * connection from handing up buffers to higher layers.
		 * Wait for receive buffers to be returned; give up
		 * after 5 seconds.
		 */
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/* Wait 0.01 second to let ibt_set_cq_handler() take effect */
		delay(drv_usectohz(10000));
		if (!chan->state->rc_enable_srq) {
			times = 50;
			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
				delay(drv_usectohz(100000));
				if (--times == 0) {
					DPRINT(40, "ibd_rc_pas_close : "
					    "reclaiming failed");
					/*
					 * Buffers still outstanding after 5s:
					 * drain the rcq once, re-arm the rcq
					 * handler so the channel keeps
					 * working, and let the caller retry.
					 */
					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
					ibt_set_cq_handler(chan->rcq_hdl,
					    ibd_rc_rcq_handler,
					    (void *)(uintptr_t)chan);
					return (DDI_FAILURE);
				}
			}
		}
		/*
		 * Also wait (up to 5 seconds) for an in-flight rcq handler
		 * invocation to finish; if it doesn't, record the event and
		 * proceed with the close anyway.
		 */
		times = 50;
		while (chan->rcq_invoking != 0) {
			delay(drv_usectohz(100000));
			if (--times == 0) {
				DPRINT(40, "ibd_rc_pas_close : "
				    "rcq handler is being invoked");
				chan->state->rc_pas_close_rcq_invoking++;
				break;
			}
		}
		/* Quiesce the send side too, then close and free the channel */
		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
		    "chan_state=%d, chan=%p", chan->chan_state, chan);
		if (is_close_rc_chan) {
			ret = ibt_close_rc_channel(chan->chan_hdl,
			    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL,
			    0);
			if (ret != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_pas_close: ibt_close_rc_"
				    "channel() fail, chan=%p, ret=%d", chan,
				    ret);
			} else {
				DPRINT(30, "ibd_rc_pas_close: ibt_close_rc_"
				    "channel() succ, chan=%p", chan);
			}
		}
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_PAS_REQ_RECV:
		/*
		 * REQ was received but the connection was never established:
		 * flush the channel and release its resources.
		 */
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	default:
		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
		    chan->chan_state, chan);
	}
	return (DDI_SUCCESS);
}
3040 
3041 /*
3042  * Passive Side:
3043  *	Handle an incoming CM REQ from active side.
3044  *
3045  *	If success, this function allocates an ibd_rc_chan_t, then
3046  * assigns it to "*ret_conn".
3047  */
3048 static ibt_cm_status_t
3049 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
3050     ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
3051     void *ret_priv_data)
3052 {
3053 	ibd_rc_msg_hello_t *hello_msg;
3054 	ibd_state_t *state = (ibd_state_t *)arg;
3055 	ibd_rc_chan_t *chan;
3056 
3057 	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
3058 		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
3059 		return (IBT_CM_REJECT);
3060 	}
3061 
3062 	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
3063 
3064 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
3065 
3066 	if (!state->rc_enable_srq) {
3067 		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
3068 			ibd_rc_free_chan(chan);
3069 			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
3070 			    "failed");
3071 			return (IBT_CM_REJECT);
3072 		}
3073 	}
3074 
3075 	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
3076 
3077 	/* We don't do RDMA */
3078 	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
3079 	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
3080 
3081 	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
3082 	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
3083 
3084 	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
3085 	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
3086 	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
3087 
3088 	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
3089 	hello_msg->reserved_qpn = htonl(state->id_qpnum);
3090 	hello_msg->rx_mtu = htonl(state->rc_mtu);
3091 
3092 	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
3093 	*ret_conn = chan;
3094 
3095 	return (IBT_CM_ACCEPT);
3096 }
3097 
3098 /*
3099  * ibd_rc_handle_act_estab -- handler for connection established completion
3100  * for active side.
3101  */
3102 static ibt_cm_status_t
3103 ibd_rc_handle_act_estab(ibd_ace_t *ace)
3104 {
3105 	ibt_status_t result;
3106 
3107 	switch (ace->ac_chan->chan_state) {
3108 		case IBD_RC_STATE_ACT_REP_RECV:
3109 			ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
3110 			result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
3111 			    IBT_NEXT_COMPLETION);
3112 			if (result != IBT_SUCCESS) {
3113 				DPRINT(40, "ibd_rc_handle_act_estab: "
3114 				    "ibt_enable_cq_notify(rcq) "
3115 				    "failed: status %d", result);
3116 				return (IBT_CM_REJECT);
3117 			}
3118 			break;
3119 		default:
3120 			DPRINT(40, "ibd_rc_handle_act_estab: default "
3121 			    "branch, act_state=%d", ace->ac_chan->chan_state);
3122 			return (IBT_CM_REJECT);
3123 	}
3124 	return (IBT_CM_ACCEPT);
3125 }
3126 
3127 /*
3128  * ibd_rc_handle_pas_estab -- handler for connection established completion
3129  * for passive side.
3130  */
3131 static ibt_cm_status_t
3132 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
3133 {
3134 	ibt_status_t result;
3135 
3136 	switch (chan->chan_state) {
3137 		case IBD_RC_STATE_PAS_REQ_RECV:
3138 			chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
3139 
3140 			result = ibt_enable_cq_notify(chan->rcq_hdl,
3141 			    IBT_NEXT_COMPLETION);
3142 			if (result != IBT_SUCCESS) {
3143 				DPRINT(40, "ibd_rc_handle_pas_estab: "
3144 				    "ibt_enable_cq_notify(rcq) "
3145 				    "failed: status %d", result);
3146 				return (IBT_CM_REJECT);
3147 			}
3148 			break;
3149 		default:
3150 			DPRINT(40, "ibd_rc_handle_pas_estab: default "
3151 			    "branch, chan_state=%d", chan->chan_state);
3152 			return (IBT_CM_REJECT);
3153 	}
3154 	return (IBT_CM_ACCEPT);
3155 }
3156 
/*
 * ibd_rc_dispatch_actv_mad - CM event dispatcher for active-side (outbound)
 * RC connections.  "arg" is the ibd_ace_t whose address cache entry owns
 * the channel being connected.  Returns IBT_CM_ACCEPT unless an individual
 * event handler rejects.
 */
/* ARGSUSED */
static ibt_cm_status_t
ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
    ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    ibt_priv_data_len_t ret_len_max)
{
	ibt_cm_status_t result = IBT_CM_ACCEPT;
	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
	ibd_rc_chan_t *rc_chan;
	ibd_state_t *state;
	ibd_rc_msg_hello_t *hello_ack;

	switch (ibt_cm_event->cm_type) {
	case IBT_CM_EVENT_REP_RCV:
		/* Peer accepted our REQ; its hello carries its qpn and mtu */
		ASSERT(ace->ac_chan != NULL);
		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
		DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, "
		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
		    ntohl(hello_ack->reserved_qpn));
		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
		break;

	case IBT_CM_EVENT_CONN_EST:
		ASSERT(ace->ac_chan != NULL);
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
		    "ace=%p, act_state=%d, chan=%p",
		    ace, ace->ac_chan->chan_state, ace->ac_chan);
		result = ibd_rc_handle_act_estab(ace);
		break;

	case IBT_CM_EVENT_CONN_CLOSED:
		rc_chan = ace->ac_chan;
		if (rc_chan == NULL) {
			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
			    "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
			return (IBT_CM_ACCEPT);
		}
		state = rc_chan->state;
		mutex_enter(&state->id_ac_mutex);
		/*
		 * Re-look up the ace under id_ac_mutex and verify it is
		 * still bound to this channel in the established state; if
		 * not, another thread is already closing the channel and we
		 * must not tear it down a second time.
		 */
		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
		    != NULL) && (ace == rc_chan->ace)) {
			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
			ASSERT(ace->ac_mce == NULL);
			/* Hold the ace so it can't be recycled mid-close */
			INC_REF(ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
			mutex_exit(&state->id_ac_mutex);
			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
			    "reason=%d", ace, rc_chan,
			    ibt_cm_event->cm_event.closed);
		} else {
			mutex_exit(&state->id_ac_mutex);
			state->rc_act_close_simultaneous++;
			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
			    "chan_state=%d", rc_chan->chan_state);
			return (IBT_CM_ACCEPT);
		}
		/* Drop the mutex across the close; it may block/delay */
		ibd_rc_act_close(rc_chan, B_FALSE);
		mutex_enter(&state->id_ac_mutex);
		ace->ac_chan = NULL;
		ASSERT(ace->ac_ref != 0);
		atomic_dec_32(&ace->ac_ref);
		/*
		 * Release the hold taken above.  If ours was the last
		 * reference, return the ace to the free list; otherwise tag
		 * it with CYCLEVAL so the final release recycles it
		 * (NOTE(review): CYCLEVAL semantics assumed from ibd.h --
		 * confirm).
		 */
		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
			IBD_ACACHE_INSERT_FREE(state, ace);
			ace->ac_ref = 0;
		} else {
			ace->ac_ref |= CYCLEVAL;
			state->rc_delay_ace_recycle++;
		}
		mutex_exit(&state->id_ac_mutex);
		break;

	case IBT_CM_EVENT_FAILURE:
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
		    ace, ace->ac_chan,
		    ibt_cm_event->cm_event.failed.cf_code,
		    ibt_cm_event->cm_event.failed.cf_msg,
		    ibt_cm_event->cm_event.failed.cf_reason);
		/*
		 * Don't need free resource here. The resource is freed
		 * at function ibd_rc_connect()
		 */
		break;

	case IBT_CM_EVENT_MRA_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
		break;
	case IBT_CM_EVENT_LAP_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
		break;
	case IBT_CM_EVENT_APR_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
		break;
	default:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
		break;
	}

	return (result);
}
3262 
3263 /* ARGSUSED */
3264 static ibt_cm_status_t
3265 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3266     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3267     ibt_priv_data_len_t ret_len_max)
3268 {
3269 	ibt_cm_status_t result = IBT_CM_ACCEPT;
3270 	ibd_rc_chan_t *chan;
3271 
3272 	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
3273 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
3274 		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
3275 		/* Receive an incoming CM REQ from active side */
3276 		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
3277 		    ret_priv_data);
3278 		return (result);
3279 	}
3280 
3281 	if (ibt_cm_event->cm_channel == 0) {
3282 		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
3283 		    "ERROR ibt_cm_event->cm_channel == 0");
3284 		return (IBT_CM_REJECT);
3285 	}
3286 
3287 	chan =
3288 	    (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
3289 	if (chan == NULL) {
3290 		DPRINT(40, "ibd_rc_dispatch_pass_mad: conn == 0");
3291 		return (IBT_CM_REJECT);
3292 	}
3293 
3294 	switch (ibt_cm_event->cm_type) {
3295 	case IBT_CM_EVENT_CONN_EST:
3296 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
3297 		    "chan=%p", chan);
3298 		result = ibd_rc_handle_pas_estab(chan);
3299 		break;
3300 	case IBT_CM_EVENT_CONN_CLOSED:
3301 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
3302 		    " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
3303 		chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3304 		    chan);
3305 		if (chan != NULL)
3306 			(void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3307 		break;
3308 	case IBT_CM_EVENT_FAILURE:
3309 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
3310 		    " chan=%p, code: %d, msg: %d, reason=%d", chan,
3311 		    ibt_cm_event->cm_event.failed.cf_code,
3312 		    ibt_cm_event->cm_event.failed.cf_msg,
3313 		    ibt_cm_event->cm_event.failed.cf_reason);
3314 		chan = ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list,
3315 		    chan);
3316 		if (chan != NULL)
3317 			(void) ibd_rc_pas_close(chan, B_FALSE, B_FALSE);
3318 		return (IBT_CM_ACCEPT);
3319 	case IBT_CM_EVENT_MRA_RCV:
3320 		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
3321 		break;
3322 	case IBT_CM_EVENT_LAP_RCV:
3323 		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
3324 		break;
3325 	case IBT_CM_EVENT_APR_RCV:
3326 		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
3327 		break;
3328 	default:
3329 		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
3330 		    "chan=%p", ibt_cm_event->cm_type, chan);
3331 		break;
3332 	}
3333 
3334 	return (result);
3335 }
3336