xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c (revision 93c426a1dbbb1987537f122c9e8342b2d9347210)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 /*
29  * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
30  */
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/atomic.h>		/* for atomic_add*() */
45 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
46 #include <netinet/in.h>		/* for netinet/ip.h below */
47 #include <netinet/ip.h>		/* for struct ip */
48 #include <inet/common.h>	/* for inet/ip.h below */
49 #include <inet/ip.h>		/* for ipha_t */
50 #include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
51 #include <inet/ip6.h>		/* for ip6_t */
52 #include <netinet/icmp6.h>	/* for icmp6_t */
53 #include <sys/ib/ibtl/ibvti.h>	/* for ace->ac_dest->ud_dst_qpn */
54 
55 #include <sys/ib/clients/ibd/ibd.h>
56 
57 
58 /* Per-interface tunables (for developers) */
59 extern uint_t ibd_rc_tx_copy_thresh;
60 /*
61  * ibd_rc_rx_copy_thresh
62  *     If (the size of the incoming buffer <= ibd_rc_rx_copy_thresh), ibd will
63  * attempt to allocate a buffer and do a bcopy of the incoming data into
64  * the allocated buffer.
65  *
66  * ibd_rc_rx_rwqe_thresh
67  *     If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd will
68  * attempt to allocate a buffer and do a bcopy of the incoming data into
69  * the allocated buffer.
70  */
71 uint_t ibd_rc_rx_copy_thresh = 0x1000;
72 uint_t ibd_rc_rx_rwqe_thresh = 0x200;	/* previous default was 32 */
73 
74 /*
75  * ibd_rc_num_swqe
76  *	1) Send CQ size = ibd_rc_num_swqe
77  *	2) The send queue size = ibd_rc_num_swqe - 1
78  *	3) Number of pre-allocated Tx buffers for ibt_post_send() =
79  * ibd_rc_num_swqe - 1.
80  */
81 uint_t ibd_rc_num_swqe = 0x1ff;
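/*
 * With the default ibd_rc_num_swqe of 0x1ff (511), the rules above give a
 * send queue of 510 entries and 510 pre-allocated large Tx buffers.
 */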
82 
83 /*
84  * ibd_rc_num_rwqe
85  *	1) For non-SRQ, we pre-post ibd_rc_num_rwqe number of WRs
86  * via ibt_post_receive() for receive queue of each RC channel.
87  *	2) For SRQ and non-SRQ, receive CQ size = ibd_rc_num_rwqe
88  */
89 uint_t ibd_rc_num_rwqe = 0x7ff;
90 
91 /*
92  * For SRQ
93  *	If SRQ is used, we allocate ibd_rc_num_srq buffers (each buffer the
94  * size of the RC mtu) and post them via ibt_post_srq().
95  *
96  *	ibd_rc_num_srq should not be larger than ibd_rc_num_rwqe; otherwise
97  * the HCA may report errors such as the following:
98  * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
99  * NOTICE: hermon0: Device Error: EQE local work queue catastrophic error
100  * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff catastrophic
101  * channel error
102  * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff completion queue
103  * error
104  */
105 uint_t ibd_rc_num_srq = 0x7fe;
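/*
 * The default of 0x7fe (2046) satisfies the constraint above, since
 * ibd_rc_num_rwqe defaults to 0x7ff (2047).
 */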
106 
107 boolean_t ibd_rc_enable_cq_moderation = B_TRUE;
108 
109 /*
110  * Send CQ moderation parameters
111  */
112 uint_t ibd_rc_txcomp_count = 10;
113 uint_t ibd_rc_txcomp_usec = 300;
114 
115 /*
116  * Receive CQ moderation parameters
117  */
118 uint_t ibd_rc_rxcomp_count = 4;
119 uint_t ibd_rc_rxcomp_usec = 10;
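/*
 * A rough sketch of how the moderation parameters are applied (see
 * ibd_rc_alloc_chan()): when ibd_rc_enable_cq_moderation is set, each CQ is
 * passed its count/usec pair via ibt_modify_cq(), so a completion interrupt
 * is expected about once per "count" completions or per "usec" microseconds,
 * whichever comes first.
 */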
120 
121 uint_t ibd_rc_tx_softintr = 1;
122 
123 /*
124  * If the number of WRs in the receive queue of an RC connection drops below
125  * IBD_RC_RX_WR_THRESHOLD, we will post more receive WRs to it.
126  */
127 #define	IBD_RC_RX_WR_THRESHOLD		0x20
128 
129 /*
130  * If the number of free SWQEs (or large Tx bufs) is greater than or equal to
131  * IBD_RC_TX_FREE_THRESH, we will call mac_tx_update() to notify GLD that it
132  * may continue transmitting packets.
133  */
134 #define	IBD_RC_TX_FREE_THRESH		8
135 
136 #define	IBD_RC_QPN_TO_SID(qpn) \
137 	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))
138 
139 /* For interop with legacy OFED */
140 #define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
141 	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))
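/*
 * For example (hypothetical QPN value): with qpn == 0x12abcdef, only the low
 * 24 bits are kept, so the resulting service ID is the base service ID with
 * 0xabcdef OR'ed into its low bits.
 */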
142 
143 /* Internet Header + 64 bytes of Data Datagram (RFC 792 requires 64 bits min.) */
144 #define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64
145 
146 
147 /* Functions for Reliable Connected Mode */
148 /* Connection Setup/Close Functions */
149 static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
150     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
151 static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
152     ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
153 static int ibd_rc_pas_close(ibd_rc_chan_t *);
154 static void ibd_rc_act_close(ibd_rc_chan_t *);
155 
156 static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
157     ibd_rc_chan_t *);
158 static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
159     ibd_rc_chan_list_t *);
160 static inline void ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
161     ibd_rc_chan_t *);
162 
163 /* CQ handlers */
164 static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
165 static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
166 static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);
167 
168 /* Receive Functions */
169 static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
170 static void ibd_rc_srq_freemsg_cb(char *);
171 
172 static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
173 static void ibd_rc_freemsg_cb(char *);
174 static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
175 static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
176 static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);
177 
178 
179 /* Send Functions */
180 static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
181 static int ibd_rc_init_txlist(ibd_rc_chan_t *);
182 static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
183 static uint_t ibd_rc_tx_recycle(caddr_t);
184 
185 
186 void
187 ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
188 {
189 	ibd_rc_chan_t *rc_chan = req->rq_ptr;
190 	ibd_ace_t *ace;
191 
192 	while (rc_chan != NULL) {
193 		ace = rc_chan->ace;
194 		ASSERT(ace != NULL);
195 		/* Close old RC channel */
196 		ibd_rc_act_close(rc_chan);
197 		mutex_enter(&state->id_ac_mutex);
198 		ASSERT(ace->ac_ref != 0);
199 		atomic_dec_32(&ace->ac_ref);
200 		ace->ac_chan = NULL;
201 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
202 			IBD_ACACHE_INSERT_FREE(state, ace);
203 			ace->ac_ref = 0;
204 		} else {
205 			ace->ac_ref |= CYCLEVAL;
206 			state->rc_delay_ace_recycle++;
207 		}
208 		mutex_exit(&state->id_ac_mutex);
209 		rc_chan = ibd_rc_rm_header_chan_list(
210 		    &state->rc_obs_act_chan_list);
211 	}
212 }
213 
214 void
215 ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
216 {
217 	ibd_ace_t *ace = req->rq_ptr;
218 	ibd_rc_chan_t *rc_chan;
219 
220 	ASSERT(ace != NULL);
221 	rc_chan = ace->ac_chan;
222 	ASSERT(rc_chan != NULL);
223 	/* Close old RC channel */
224 	ibd_rc_act_close(rc_chan);
225 	mutex_enter(&state->id_ac_mutex);
226 	ASSERT(ace->ac_ref != 0);
227 	atomic_dec_32(&ace->ac_ref);
228 	ace->ac_chan = NULL;
229 	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
230 		IBD_ACACHE_INSERT_FREE(state, ace);
231 		ace->ac_ref = 0;
232 	} else {
233 		ace->ac_ref |= CYCLEVAL;
234 		state->rc_delay_ace_recycle++;
235 	}
236 	mutex_exit(&state->id_ac_mutex);
237 	mutex_enter(&state->rc_ace_recycle_lock);
238 	state->rc_ace_recycle = NULL;
239 	mutex_exit(&state->rc_ace_recycle_lock);
240 }
241 
242 /* Simple ICMP IP Header Template */
243 static const ipha_t icmp_ipha = {
244 	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
245 };
246 
247 /* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
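/*
 * A sketch of the mblk chain handed up to GLD by this routine (based on the
 * allocations below):
 *	pmtu_mp:         [ ib_header_info_t | ipha_t | icmph_t ]
 *	pmtu_mp->b_cont: [ original IP header(s) plus up to
 *	                   IBD_RC_IP_ICMP_RETURN_DATA_BYTES of payload ]
 */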
248 void
249 ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
250 {
251 	mblk_t *mp = req->rq_ptr;
252 	ibd_ace_t *ace = req->rq_ptr2;
253 	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
254 	uint_t	len_needed;
255 	size_t	msg_len;
256 	mblk_t	*pmtu_mp;
257 	ushort_t	sap;
258 	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
259 	/*
260 	 * ipha: IP header for pmtu_pkt
261 	 * old_ipha: IP header for old packet
262 	 */
263 	ipha_t *ipha, *old_ipha;
264 	icmph_t	*icmph;
265 
266 	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);
267 
268 	if (!pullupmsg(mp, -1)) {
269 		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
270 		goto too_big_fail;
271 	}
272 	/* move to IP header. */
273 	mp->b_rptr += IPOIB_HDRSIZE;
274 	old_ipha = (ipha_t *)mp->b_rptr;
275 
276 	len_needed = IPH_HDR_LENGTH(old_ipha);
277 	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
278 		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
279 		    len_needed));
280 	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
281 		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
282 		    + len_needed);
283 		len_needed += ip_hdr_length_v6(mp, ip6h);
284 	}
285 	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
286 	msg_len = msgdsize(mp);
287 	if (msg_len > len_needed) {
288 		(void) adjmsg(mp, len_needed - msg_len);
289 		msg_len = len_needed;
290 	}
291 
292 	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
293 	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
294 		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
295 		goto too_big_fail;
296 	}
297 	pmtu_mp->b_cont = mp;
298 	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
299 	    + sizeof (ipha_t) + sizeof (icmph_t);
300 
301 	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;
302 
303 	/* Fill IB header */
304 	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
305 	/*
306 	 * If the GRH is not valid, indicate to GLDv3 by setting
307 	 * the VerTcFlow field to 0.
308 	 */
309 	ibha->ib_grh.ipoib_vertcflow = 0;
310 	ibha->ipib_rhdr.ipoib_type = htons(sap);
311 	ibha->ipib_rhdr.ipoib_mbz = 0;
312 
313 	/* Fill IP header */
314 	ipha = (ipha_t *)&ibha[1];
315 	*ipha = icmp_ipha;
316 	ipha->ipha_src = old_ipha->ipha_dst;
317 	ipha->ipha_dst = old_ipha->ipha_src;
318 	ipha->ipha_ttl = old_ipha->ipha_ttl;
319 	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
320 	if (msg_len > IP_MAXPACKET) {
321 		ibd_print_warn(state, "ibd_async_rc_process_too_big: msg_len(%d) "
322 		    "> IP_MAXPACKET", (uint32_t)msg_len);
323 		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
324 		msg_len = IP_MAXPACKET;
325 	}
326 	ipha->ipha_length = htons((uint16_t)msg_len);
327 	ipha->ipha_hdr_checksum = 0;
328 	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
329 
330 	/* Fill ICMP body */
331 	icmph = (icmph_t *)&ipha[1];
332 	bzero(icmph, sizeof (icmph_t));
333 	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
334 	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
335 	icmph->icmph_du_mtu = htons(mtu);
336 	icmph->icmph_checksum = 0;
337 	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
338 	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);
339 
340 	(void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
341 	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
342 
343 	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
344 	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
345 	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
346 	    len_needed, (uint32_t)msg_len);
347 
348 	mac_rx(state->id_mh, state->id_rh, pmtu_mp);
349 
350 	mutex_enter(&ace->tx_too_big_mutex);
351 	ace->tx_too_big_ongoing = B_FALSE;
352 	mutex_exit(&ace->tx_too_big_mutex);
353 	return;
354 
355 too_big_fail:
356 	/* Drop packet */
357 	freemsg(mp);
358 	mutex_enter(&ace->tx_too_big_mutex);
359 	ace->tx_too_big_ongoing = B_FALSE;
360 	mutex_exit(&ace->tx_too_big_mutex);
361 }
362 
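/*
 * A hypothetical ibd.conf fragment that enables RC mode on instances 0 and 1
 * but not on instance 2 (the property is an int array indexed by instance):
 *
 *	enable_rc=1,1,0;
 */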
363 void
364 ibd_rc_get_conf(ibd_state_t *state)
365 {
366 	int *props;
367 	uint_t num_props;
368 	int instance;
369 
370 	instance = ddi_get_instance(state->id_dip);
371 
372 	/*
373 	 * Get the array of "enable_rc" properties from the ibd.conf file
374 	 */
375 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, state->id_dip,
376 	    DDI_PROP_DONTPASS, "enable_rc", &props, &num_props)
377 	    == DDI_PROP_SUCCESS) {
378 		if (instance < num_props) {
379 			if (props[instance] == 1) {
380 				state->id_enable_rc = B_TRUE;
381 			} else {
382 				state->id_enable_rc = B_FALSE;
383 			}
384 		} else {
385 			/* not enough properties configured */
386 			state->id_enable_rc = B_FALSE;
387 			DPRINT(40, "ibd_rc_get_conf: Not enough "
388 			    "enable_rc values in ibd.conf,"
389 			    " disable RC mode, instance=%d", instance);
390 		}
391 
392 		/* free memory allocated for properties */
393 		ddi_prop_free(props);
394 	} else {
395 		state->id_enable_rc = B_FALSE;
396 		DPRINT(30, "ibd_rc_get_conf: fail to find "
397 		    "enable_rc in ibd.conf, disable RC mode");
398 	}
399 
400 	state->rc_mtu = 65524;
401 	state->rc_enable_srq = B_TRUE;
402 }
403 
404 #ifdef DEBUG
405 /*
406  * ibd_rc_update_stats - update driver private kstat counters
407  *
408  * This routine will dump the internal statistics counters for ibd's
409  * Reliable Connected Mode. The current stats dump values will
410  * be sent to the kernel status area.
411  */
412 static int
413 ibd_rc_update_stats(kstat_t *ksp, int rw)
414 {
415 	ibd_state_t *state;
416 	ibd_rc_stat_t *ibd_rc_ksp;
417 
418 	if (rw == KSTAT_WRITE)
419 		return (EACCES);
420 
421 	state = (ibd_state_t *)ksp->ks_private;
422 	ASSERT(state != NULL);
423 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
424 
425 	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
426 	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
427 	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
428 	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
429 	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;
430 
431 	ibd_rc_ksp->rc_rcq_invoke.value.ul = state->rc_rcq_invoke;
432 	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
433 	ibd_rc_ksp->rc_scq_invoke.value.ul = state->rc_scq_invoke;
434 
435 	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;
436 
437 	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
438 	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
439 	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
440 	    state->rc_xmt_fragmented_pkt;
441 	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
442 	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
443 	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;
444 
445 	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
446 	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
447 	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
448 	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
449 	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
450 	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
451 	    state->rc_xmt_buf_mac_update;
452 
453 	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
454 	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
455 	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
456 	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;
457 
458 	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
459 	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
460 	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
461 	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
462 	    state->rc_act_close_simultaneous;
463 	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;
464 
465 	return (0);
466 }
467 
468 
469 /*
470  * ibd_rc_init_stats - initialize kstat data structures
471  *
472  * This routine will create and initialize the driver private
473  * statistics counters.
474  */
475 int
476 ibd_rc_init_stats(ibd_state_t *state)
477 {
478 	kstat_t *ksp;
479 	ibd_rc_stat_t *ibd_rc_ksp;
480 
481 	/*
482 	 * Create and init kstat
483 	 */
484 	ksp = kstat_create("ibd", ddi_get_instance(state->id_dip),
485 	    "statistics", "net", KSTAT_TYPE_NAMED,
486 	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);
487 
488 	if (ksp == NULL) {
489 		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
490 		    "kernel statistics");
491 		return (DDI_FAILURE);
492 	}
493 
494 	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */
495 
496 	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;
497 
498 	/*
499 	 * Initialize all the statistics
500 	 */
501 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
502 	    "transfer mode", KSTAT_DATA_ULONG);
503 	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
504 	    "transfer mode", KSTAT_DATA_ULONG);
505 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
506 	    "copy mode", KSTAT_DATA_ULONG);
507 	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
508 	    "copy mode", KSTAT_DATA_ULONG);
509 	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
510 	    KSTAT_DATA_ULONG);
511 
512 	kstat_named_init(&ibd_rc_ksp->rc_rcq_invoke, "RC: invoke of Recv CQ "
513 	    "handler", KSTAT_DATA_ULONG);
514 	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
515 	    KSTAT_DATA_ULONG);
516 
517 	kstat_named_init(&ibd_rc_ksp->rc_scq_invoke, "RC: invoke of Send CQ "
518 	    "handler", KSTAT_DATA_ULONG);
519 
520 	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
521 	    KSTAT_DATA_ULONG);
522 
523 	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
524 	    KSTAT_DATA_ULONG);
525 	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
526 	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
527 	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
528 	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
529 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
530 	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
531 	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
532 	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
533 	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
534 	    KSTAT_DATA_ULONG);
535 
536 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
537 	    "recycle", KSTAT_DATA_ULONG);
538 	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
539 	    "after recycle", KSTAT_DATA_ULONG);
540 	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
541 	    KSTAT_DATA_ULONG);
542 	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
543 	    "#, swqe available", KSTAT_DATA_ULONG);
544 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
545 	    "ibd_send", KSTAT_DATA_ULONG);
546 	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
547 	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);
548 
549 	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
550 	    KSTAT_DATA_ULONG);
551 	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
552 	    KSTAT_DATA_ULONG);
553 	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
554 	    "pkt", KSTAT_DATA_ULONG);
555 	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
556 	    "state", KSTAT_DATA_ULONG);
557 
558 	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
559 	    KSTAT_DATA_ULONG);
560 	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
561 	    KSTAT_DATA_ULONG);
562 	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
563 	    "recycle", KSTAT_DATA_ULONG);
564 	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
565 	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
566 	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
567 	    KSTAT_DATA_ULONG);
568 
569 	/*
570 	 * Function to provide kernel stat update on demand
571 	 */
572 	ksp->ks_update = ibd_rc_update_stats;
573 
574 	/*
575 	 * Pointer into provider's raw statistics
576 	 */
577 	ksp->ks_private = (void *)state;
578 
579 	/*
580 	 * Add kstat to systems kstat chain
581 	 */
582 	kstat_install(ksp);
583 
584 	return (DDI_SUCCESS);
585 }
586 #endif
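/*
 * On DEBUG builds the counters above can be inspected from userland with
 * kstat(1M), for example:
 *
 *	kstat -m ibd -n statistics
 *
 * (module "ibd" and name "statistics", as passed to kstat_create() above).
 */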
587 
588 static ibt_status_t
589 ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
590     boolean_t is_tx_chan)
591 {
592 	ibt_status_t result;
593 	ibd_rc_chan_t *chan;
594 	ibt_rc_chan_alloc_args_t alloc_args;
595 	ibt_chan_alloc_flags_t alloc_flags;
596 	ibt_chan_sizes_t sizes;
597 	ibt_cq_attr_t cq_atts;
598 	int rv;
599 
600 	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);
601 
602 	chan->state = state;
603 	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
604 	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
605 	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
606 	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
607 	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
608 	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);
609 
610 	/* Allocate IB structures for a new RC channel. */
611 	if (is_tx_chan) {
612 		chan->scq_size = ibd_rc_num_swqe;
613 		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
614 	} else {
615 		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
616 		chan->rcq_size = ibd_rc_num_rwqe;
617 	}
618 	cq_atts.cq_size = chan->scq_size;
619 	cq_atts.cq_sched = NULL;
620 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
621 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
622 	    &chan->scq_size);
623 	if (result != IBT_SUCCESS) {
624 		DPRINT(40, "ibd_rc_alloc_chan: error <%d> creating "
625 		    "send completion queue (size <%d>)",
626 		    result, chan->scq_size);
627 		goto alloc_scq_err;
628 	}	/* if failure to alloc cq */
629 
630 	if (ibd_rc_enable_cq_moderation) {
631 		if (ibt_modify_cq(chan->scq_hdl, ibd_rc_txcomp_count,
632 		    ibd_rc_txcomp_usec, 0) != IBT_SUCCESS) {
633 			ibd_print_warn(state, "ibd_rc_alloc_chan: Send CQ "
634 			    "interrupt moderation failed");
635 		}
636 	}
637 
638 	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
639 	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
640 	    (void *) (uintptr_t)chan);
641 
642 	cq_atts.cq_size = chan->rcq_size;
643 	cq_atts.cq_sched = NULL;
644 	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
645 	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
646 	    &chan->rcq_size);
647 	if (result != IBT_SUCCESS) {
648 		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
649 		    "rx completion queue (size <%d>)", result, chan->rcq_size);
650 		goto alloc_rcq_err;
651 	}	/* if failure to alloc cq */
652 
653 	if (ibd_rc_enable_cq_moderation) {
654 		if (ibt_modify_cq(chan->rcq_hdl, ibd_rc_rxcomp_count,
655 		    ibd_rc_rxcomp_usec, 0) != IBT_SUCCESS) {
656 			ibd_print_warn(state, "ibd_rc_alloc_chan: Receive CQ "
657 			    "interrupt moderation failed");
658 		}
659 	}
660 	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
661 	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
662 	    (void *)(uintptr_t)chan);
663 
664 	if (is_tx_chan) {
665 		chan->is_tx_chan = B_TRUE;
666 		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
667 			ibd_print_warn(state, "ibd_rc_alloc_chan: "
668 			    "ibd_rc_init_txlist failed");
669 			goto init_txlist_err;
670 		}
671 		if (ibd_rc_tx_softintr == 1) {
672 			if ((rv = ddi_add_softintr(state->id_dip,
673 			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
674 			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
675 			    DDI_SUCCESS) {
676 				DPRINT(10, "ibd_rc_alloc_chan: failed in "
677 				    "ddi_add_softintr(scq_softintr), ret=%d",
678 				    rv);
679 				goto alloc_softintr_err;
680 			}
681 		}
682 	} else {
683 		chan->is_tx_chan = B_FALSE;
684 	}
685 
686 	/*
687 	 * enable completions
688 	 */
689 	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
690 	if (result != IBT_SUCCESS) {
691 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
692 		    "(scq) failed: status %d\n", result);
693 		goto alloc_scq_enable_err;
694 	}
695 
696 	/* We will enable chan->rcq_hdl later. */
697 
698 	/* alloc a RC channel */
699 	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
700 	bzero(&sizes, sizeof (ibt_chan_sizes_t));
701 
702 	alloc_args.rc_flags = IBT_WR_SIGNALED;
703 	alloc_args.rc_control = IBT_CEP_NO_FLAGS;
704 
705 	alloc_args.rc_scq = chan->scq_hdl;
706 	alloc_args.rc_rcq = chan->rcq_hdl;
707 	alloc_args.rc_pd = state->id_pd_hdl;
708 
709 	alloc_args.rc_hca_port_num = state->id_port;
710 	alloc_args.rc_clone_chan = NULL;
711 
712 	/* scatter/gather */
713 	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;
714 
715 	/*
716 	 * Use a single SGL element on the receive side, since the ibd
717 	 * driver allocates one contiguous buffer for each
718 	 * ibt_post_recv().
719 	 */
720 	alloc_args.rc_sizes.cs_rq_sgl = 1;
721 
722 	/* The send queue size and the receive queue size */
723 	alloc_args.rc_sizes.cs_sq = chan->scq_size;
724 	alloc_args.rc_sizes.cs_rq = chan->rcq_size;
725 
726 	if (state->id_hca_res_lkey_capab) {
727 		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
728 	} else {
729 		DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
730 		DPRINT(40, "ibd_rc_alloc_chan: reserved lkey not supported");
731 
732 	if (state->rc_enable_srq) {
733 		alloc_flags = IBT_ACHAN_USES_SRQ;
734 		alloc_args.rc_srq = state->rc_srq_hdl;
735 	} else {
736 		alloc_flags = IBT_ACHAN_NO_FLAGS;
737 	}
738 
739 	result = ibt_alloc_rc_channel(state->id_hca_hdl,
740 	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
741 	if (result != IBT_SUCCESS) {
742 		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_alloc_rc_channel"
743 		    " fail:<%d>", result);
744 		goto alloc_scq_enable_err;
745 	}
746 
747 	*ret_chan = chan;
748 	return (IBT_SUCCESS);
749 
750 alloc_scq_enable_err:
751 	if (is_tx_chan) {
752 		if (ibd_rc_tx_softintr == 1) {
753 			ddi_remove_softintr(chan->scq_softintr);
754 		}
755 	}
756 alloc_softintr_err:
757 	if (is_tx_chan) {
758 		ibd_rc_fini_txlist(chan);
759 	}
760 init_txlist_err:
761 	(void) ibt_free_cq(chan->rcq_hdl);
762 alloc_rcq_err:
763 	(void) ibt_free_cq(chan->scq_hdl);
764 alloc_scq_err:
765 	mutex_destroy(&chan->tx_poll_lock);
766 	mutex_destroy(&chan->tx_post_lock);
767 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
768 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
769 	mutex_destroy(&chan->rx_free_list.dl_mutex);
770 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
771 	kmem_free(chan, sizeof (ibd_rc_chan_t));
772 	return (result);
773 }
774 
775 static void
776 ibd_rc_free_chan(ibd_rc_chan_t *chan)
777 {
778 	ibt_status_t ret;
779 
780 	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */
781 
782 	if (chan->chan_hdl != NULL) {
783 		ret = ibt_free_channel(chan->chan_hdl);
784 		if (ret != IBT_SUCCESS) {
785 			DPRINT(40, "ibd_rc_free_chan: ibt_free_channel failed, "
786 			    "chan=%p, returned: %d", chan, ret);
787 			return;
788 		}
789 		chan->chan_hdl = NULL;
790 	}
791 
792 	if (chan->rcq_hdl != NULL) {
793 		ret = ibt_free_cq(chan->rcq_hdl);
794 		if (ret != IBT_SUCCESS) {
795 			DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(rcq) failed, "
796 			    "chan=%p, returned: %d", chan, ret);
797 			return;
798 		}
799 		chan->rcq_hdl = NULL;
800 	}
801 
802 	if (chan->scq_hdl != NULL) {
803 		ret = ibt_free_cq(chan->scq_hdl);
804 		if (ret != IBT_SUCCESS) {
805 			DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(scq) failed, "
806 			    "chan=%p, returned: %d", chan, ret);
807 			return;
808 		}
809 		chan->scq_hdl = NULL;
810 	}
811 
812 	/* Free buffers */
813 	if (chan->is_tx_chan) {
814 		ibd_rc_fini_txlist(chan);
815 		if (ibd_rc_tx_softintr == 1) {
816 			ddi_remove_softintr(chan->scq_softintr);
817 		}
818 	} else {
819 		if (!chan->state->rc_enable_srq) {
820 			ibd_rc_fini_rxlist(chan);
821 		}
822 	}
823 
824 	mutex_destroy(&chan->tx_poll_lock);
825 	mutex_destroy(&chan->tx_post_lock);
826 	mutex_destroy(&chan->tx_rel_list.dl_mutex);
827 	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
828 	mutex_destroy(&chan->rx_free_list.dl_mutex);
829 	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
830 
831 	/*
832 	 * If this is a passive channel, the caller must make sure it has been
833 	 * removed from chan->state->rc_pass_chan_list.
834 	 */
835 	kmem_free(chan, sizeof (ibd_rc_chan_t));
836 }
837 
838 /* Add a RC channel */
839 static inline void
840 ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
841 {
842 	mutex_enter(&list->chan_list_mutex);
843 	if (list->chan_list == NULL) {
844 		list->chan_list = chan;
845 	} else {
846 		chan->next = list->chan_list;
847 		list->chan_list = chan;
848 	}
849 	mutex_exit(&list->chan_list_mutex);
850 }
851 
852 /* Remove a RC channel */
853 static inline void
854 ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
855 {
856 	ibd_rc_chan_t *pre_chan;
857 
858 	mutex_enter(&list->chan_list_mutex);
859 	if (list->chan_list == chan) {
860 		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
861 		    " in chan_list", chan);
862 		list->chan_list = chan->next;
863 	} else {
864 		pre_chan = list->chan_list;
865 		while (pre_chan != NULL) {
866 			if (pre_chan->next == chan) {
867 				DPRINT(30, "ibd_rc_rm_from_chan_list"
868 				    "(middle): found chan(%p) in "
869 				    "chan_list", chan);
870 				pre_chan->next = chan->next;
871 				break;
872 			}
873 			pre_chan = pre_chan->next;
874 		}
875 	}
876 	mutex_exit(&list->chan_list_mutex);
877 }
878 
879 static inline ibd_rc_chan_t *
880 ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
881 {
882 	ibd_rc_chan_t *rc_chan;
883 
884 	mutex_enter(&list->chan_list_mutex);
885 	rc_chan = list->chan_list;
886 	if (rc_chan != NULL) {
887 		list->chan_list = rc_chan->next;
888 	}
889 	mutex_exit(&list->chan_list_mutex);
890 	return (rc_chan);
891 }
892 
893 static int
894 ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
895 {
896 	ibt_mr_attr_t mem_attr;
897 	uint_t rc_rx_bufs_sz;
898 
899 	/*
900 	 * Allocate one big chunk for all regular rx copy bufs
901 	 */
902 	rc_rx_bufs_sz =  (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;
903 
904 	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
905 
906 	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
907 	    sizeof (ibd_rwqe_t), KM_SLEEP);
908 
909 	/*
910 	 * Do one memory registration on the entire rxbuf area
911 	 */
912 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
913 	mem_attr.mr_len = rc_rx_bufs_sz;
914 	mem_attr.mr_as = NULL;
915 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
916 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
917 	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
918 	    != IBT_SUCCESS) {
919 		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
920 		    "failed");
921 		kmem_free(state->rc_srq_rwqes,
922 		    state->rc_srq_size * sizeof (ibd_rwqe_t));
923 		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
924 		state->rc_srq_rx_bufs = NULL;
925 		state->rc_srq_rwqes = NULL;
926 		return (DDI_FAILURE);
927 	}
928 
929 	return (DDI_SUCCESS);
930 }
931 
932 static void
933 ibd_rc_free_srq_copybufs(ibd_state_t *state)
934 {
935 	uint_t rc_rx_buf_sz;
936 
937 	/*
938 	 * The value of state->rc_mtu must not change between the call to
939 	 * ibd_rc_alloc_srq_copybufs() and this call to ibd_rc_free_srq_copybufs().
940 	 */
941 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
942 
943 	/*
944 	 * Unregister rxbuf mr
945 	 */
946 	if (ibt_deregister_mr(state->id_hca_hdl,
947 	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
948 		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
949 		    " failed");
950 	}
951 	state->rc_srq_rx_mr_hdl = NULL;
952 
953 	/*
954 	 * Free rxbuf memory
955 	 */
956 	kmem_free(state->rc_srq_rwqes,
957 	    state->rc_srq_size * sizeof (ibd_rwqe_t));
958 	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
959 	state->rc_srq_rwqes = NULL;
960 	state->rc_srq_rx_bufs = NULL;
961 }
962 
963 /*
964  * Allocate and post a certain number of SRQ receive buffers and WRs.
965  */
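/*
 * A sketch of each SRQ receive buffer as it is set up below: the first
 * IPOIB_GRH_SIZE bytes are left as headroom for the ib_header_info/pseudo
 * GRH handed to GLD, and the SGL points past that headroom:
 *
 *	[ IPOIB_GRH_SIZE headroom | rc_mtu bytes of receive data (ds_va) ]
 */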
966 int
967 ibd_rc_init_srq_list(ibd_state_t *state)
968 {
969 	ibd_rwqe_t *rwqe;
970 	ibt_lkey_t lkey;
971 	int i;
972 	uint_t len;
973 	uint8_t *bufaddr;
974 	ibt_srq_sizes_t srq_sizes;
975 	ibt_srq_sizes_t	 srq_real_sizes;
976 	ibt_status_t ret;
977 
978 	srq_sizes.srq_sgl_sz = 1;
979 	srq_sizes.srq_wr_sz = ibd_rc_num_srq;
980 	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
981 	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
982 	if (ret != IBT_SUCCESS) {
983 		DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed. "
984 		    "req_sgl_sz=%d, req_wr_sz=0x%x, ret=%d",
985 		    srq_sizes.srq_sgl_sz, srq_sizes.srq_wr_sz, ret);
986 		return (DDI_FAILURE);
987 	}
988 
989 	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
990 	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
991 		ret = ibt_free_srq(state->rc_srq_hdl);
992 		if (ret != IBT_SUCCESS) {
993 			ibd_print_warn(state, "ibd_rc_init_srq_list: "
994 			    "ibt_free_srq fail, ret=%d", ret);
995 		}
996 		return (DDI_FAILURE);
997 	}
998 
999 	/*
1000 	 * Allocate and setup the rwqe list
1001 	 */
1002 	lkey = state->rc_srq_rx_mr_desc.md_lkey;
1003 	rwqe = state->rc_srq_rwqes;
1004 	bufaddr = state->rc_srq_rx_bufs;
1005 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1006 	state->rc_srq_rwqe_list.dl_cnt = 0;
1007 	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
1008 	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
1009 		rwqe->w_state = state;
1010 		rwqe->w_freeing_wqe = B_FALSE;
1011 		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
1012 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1013 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1014 
1015 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1016 		    &rwqe->w_freemsg_cb)) == NULL) {
1017 			DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
1018 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1019 			ibd_rc_fini_srq_list(state);
1020 			return (DDI_FAILURE);
1021 		}
1022 
1023 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1024 		/* Leave IPOIB_GRH_SIZE space */
1025 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1026 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1027 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1028 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1029 		rwqe->w_rwr.wr_nds = 1;
1030 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1031 		(void) ibd_rc_post_srq(state, rwqe);
1032 	}
1033 
1034 	return (DDI_SUCCESS);
1035 }
1036 
1037 /*
1038  * Free the statically allocated Rx buffer list for SRQ.
1039  */
1040 void
1041 ibd_rc_fini_srq_list(ibd_state_t *state)
1042 {
1043 	ibd_rwqe_t *rwqe;
1044 	int i;
1045 	ibt_status_t ret;
1046 
1047 	ret = ibt_free_srq(state->rc_srq_hdl);
1048 	if (ret != IBT_SUCCESS) {
1049 		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
1050 		    "ibt_free_srq fail, ret=%d", ret);
1051 	}
1052 
1053 	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
1054 	rwqe = state->rc_srq_rwqes;
1055 	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
1056 		if (rwqe->rwqe_im_mblk != NULL) {
1057 			rwqe->w_freeing_wqe = B_TRUE;
1058 			freemsg(rwqe->rwqe_im_mblk);
1059 		}
1060 	}
1061 	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);
1062 
1063 	ibd_rc_free_srq_copybufs(state);
1064 }
1065 
1066 /*
1067  * Free an allocated recv wqe.
1068  */
1069 void
1070 ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
1071 {
1072 	/*
1073 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1074 	 *
1075 	 * This rwqe is placed on a free list so that it
1076 	 * can be reinstated in future.
1077 	 *
1078 	 * NOTE: no code currently exists to reinstate
1079 	 * these "lost" rwqes.
1080 	 */
1081 	mutex_enter(&state->rc_srq_free_list.dl_mutex);
1082 	state->rc_srq_free_list.dl_cnt++;
1083 	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
1084 	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
1085 	mutex_exit(&state->rc_srq_free_list.dl_mutex);
1086 }
1087 
1088 static void
1089 ibd_rc_srq_freemsg_cb(char *arg)
1090 {
1091 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1092 	ibd_state_t *state = rwqe->w_state;
1093 
1094 	ASSERT(state->rc_enable_srq);
1095 
1096 	/*
1097 	 * If the wqe is being destructed, do not attempt recycling.
1098 	 */
1099 	if (rwqe->w_freeing_wqe == B_TRUE) {
1100 		return;
1101 	}
1102 
1103 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1104 
1105 	/*
1106 	 * The upper layer has released the mblk it held, so we
1107 	 * no longer need to keep the old pointer in
1108 	 * our rwqe.
1109 	 */
1110 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1111 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1112 	if (rwqe->rwqe_im_mblk == NULL) {
1113 		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
1114 		ibd_rc_srq_free_rwqe(state, rwqe);
1115 		return;
1116 	}
1117 
1118 	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1119 		ibd_rc_srq_free_rwqe(state, rwqe);
1120 		return;
1121 	}
1122 
1123 	atomic_add_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding, -1);
1124 }
1125 
1126 /*
1127  * Post a rwqe to the hardware and add it to the Rx list.
1128  */
1129 static int
1130 ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
1131 {
1132 	/*
1133 	 * Increment dl_cnt before posting the receive WR, because
1134 	 * dl_cnt must already be updated by the time the
1135 	 * corresponding ibd_rc_process_rx() is called.
1136 	 */
1137 	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
1138 	atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1);
1139 	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
1140 	    IBT_SUCCESS) {
1141 		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
1142 		DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed");
1143 		return (DDI_FAILURE);
1144 	}
1145 
1146 	return (DDI_SUCCESS);
1147 }
1148 
1149 /*
1150  * Post a rwqe to the hardware and add it to the Rx list.
1151  */
1152 static int
1153 ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1154 {
1155 	/*
1156 	 * Increment dl_cnt before posting the receive WR, because dl_cnt
1157 	 * must already be updated by the time the corresponding
1158 	 * ibd_rc_process_rx() is called.
1159 	 */
1160 	atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1);
1161 	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
1162 	    IBT_SUCCESS) {
1163 		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
1164 		DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()");
1165 		return (DDI_FAILURE);
1166 	}
1167 	return (DDI_SUCCESS);
1168 }
1169 
1170 static int
1171 ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
1172 {
1173 	ibd_state_t *state = chan->state;
1174 	ibt_mr_attr_t mem_attr;
1175 	uint_t rc_rx_bufs_sz;
1176 
1177 	/*
1178 	 * Allocate one big chunk for all regular rx copy bufs
1179 	 */
1180 	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;
1181 
1182 	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);
1183 
1184 	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
1185 	    sizeof (ibd_rwqe_t), KM_SLEEP);
1186 
1187 	/*
1188 	 * Do one memory registration on the entire rxbuf area
1189 	 */
1190 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
1191 	mem_attr.mr_len = rc_rx_bufs_sz;
1192 	mem_attr.mr_as = NULL;
1193 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
1194 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1195 	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
1196 		DPRINT(40, "ibd_rc_alloc_rx_copybufs: ibt_register_mr failed");
1197 		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1198 		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
1199 		chan->rx_bufs = NULL;
1200 		chan->rx_rwqes = NULL;
1201 		return (DDI_FAILURE);
1202 	}
1203 
1204 	return (DDI_SUCCESS);
1205 }
1206 
1207 static void
1208 ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
1209 {
1210 	ibd_state_t *state = chan->state;
1211 	uint_t rc_rx_buf_sz;
1212 
1213 	ASSERT(!state->rc_enable_srq);
1214 	ASSERT(chan->rx_rwqes != NULL);
1215 	ASSERT(chan->rx_bufs != NULL);
1216 
1217 	/*
1218 	 * The value of state->rc_mtu must not change between the call to
1219 	 * ibd_rc_alloc_rx_copybufs() and this call to ibd_rc_free_rx_copybufs().
1220 	 */
1221 	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;
1222 
1223 	/*
1224 	 * Unregister rxbuf mr
1225 	 */
1226 	if (ibt_deregister_mr(state->id_hca_hdl,
1227 	    chan->rx_mr_hdl) != IBT_SUCCESS) {
1228 		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
1229 	}
1230 	chan->rx_mr_hdl = NULL;
1231 
1232 	/*
1233 	 * Free rxbuf memory
1234 	 */
1235 	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
1236 	chan->rx_rwqes = NULL;
1237 
1238 	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
1239 	chan->rx_bufs = NULL;
1240 }
1241 
1242 /*
1243  * Post a certain number of receive buffers and WRs on a RC channel.
1244  */
1245 static int
1246 ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
1247 {
1248 	ibd_state_t *state = chan->state;
1249 	ibd_rwqe_t *rwqe;
1250 	ibt_lkey_t lkey;
1251 	int i;
1252 	uint_t len;
1253 	uint8_t *bufaddr;
1254 
1255 	ASSERT(!state->rc_enable_srq);
1256 	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
1257 		return (DDI_FAILURE);
1258 
1259 	/*
1260 	 * Allocate and setup the rwqe list
1261 	 */
1262 	lkey = chan->rx_mr_desc.md_lkey;
1263 	rwqe = chan->rx_rwqes;
1264 	bufaddr = chan->rx_bufs;
1265 	len = state->rc_mtu + IPOIB_GRH_SIZE;
1266 	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
1267 		rwqe->w_state = state;
1268 		rwqe->w_chan = chan;
1269 		rwqe->w_freeing_wqe = B_FALSE;
1270 		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
1271 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
1272 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
1273 
1274 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
1275 		    &rwqe->w_freemsg_cb)) == NULL) {
1276 			DPRINT(40, "ibd_rc_init_rxlist: desballoc() failed");
1277 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
1278 			ibd_rc_fini_rxlist(chan);
1279 			return (DDI_FAILURE);
1280 		}
1281 
1282 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
1283 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
1284 		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
1285 		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
1286 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
1287 		rwqe->w_rwr.wr_nds = 1;
1288 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
1289 		(void) ibd_rc_post_rwqe(chan, rwqe);
1290 	}
1291 
1292 	return (DDI_SUCCESS);
1293 }
1294 
1295 /*
1296  * Free the statically allocated Rx buffer list of a non-SRQ RC channel.
1297  */
1298 static void
1299 ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
1300 {
1301 	ibd_rwqe_t *rwqe;
1302 	int i;
1303 
1304 	if (chan->rx_bufs == NULL) {
1305 		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
1306 		return;
1307 	}
1308 
1309 	/* bufs_outstanding must be 0 */
1310 	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
1311 	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));
1312 
1313 	mutex_enter(&chan->rx_wqe_list.dl_mutex);
1314 	rwqe = chan->rx_rwqes;
1315 	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
1316 		if (rwqe->rwqe_im_mblk != NULL) {
1317 			rwqe->w_freeing_wqe = B_TRUE;
1318 			freemsg(rwqe->rwqe_im_mblk);
1319 		}
1320 	}
1321 	mutex_exit(&chan->rx_wqe_list.dl_mutex);
1322 
1323 	ibd_rc_free_rx_copybufs(chan);
1324 }
1325 
1326 /*
1327  * Free an allocated recv wqe.
1328  */
1329 static void
1330 ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
1331 {
1332 	/*
1333 	 * desballoc() failed (no memory) or the posting of rwqe failed.
1334 	 *
1335 	 * This rwqe is placed on a free list so that it
1336 	 * can be reinstated in future.
1337 	 *
1338 	 * NOTE: no code currently exists to reinstate
1339 	 * these "lost" rwqes.
1340 	 */
1341 	mutex_enter(&chan->rx_free_list.dl_mutex);
1342 	chan->rx_free_list.dl_cnt++;
1343 	rwqe->rwqe_next = chan->rx_free_list.dl_head;
1344 	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
1345 	mutex_exit(&chan->rx_free_list.dl_mutex);
1346 }
1347 
1348 /*
1349  * Processing to be done after receipt of a packet; hand off to GLD
1350  * in the format expected by GLD.
1351  */
1352 static void
1353 ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
1354 {
1355 	ibd_state_t *state = chan->state;
1356 	ib_header_info_t *phdr;
1357 	ipoib_hdr_t *ipibp;
1358 	mblk_t *mp;
1359 	mblk_t *mpc;
1360 	int rxcnt;
1361 	ip6_t *ip6h;
1362 	int len;
1363 
1364 	/*
1365 	 * Track number handed to upper layer, and number still
1366 	 * available to receive packets.
1367 	 */
1368 	if (state->rc_enable_srq) {
1369 		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
1370 	} else {
1371 		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
1372 	}
1373 
1374 	/*
1375 	 * It cannot be an IBA multicast packet.
1376 	 */
1377 	ASSERT((wc->wc_flags & IBT_WC_GRH_PRESENT) == 0);
1378 
1379 
1380 #ifdef DEBUG
1381 	if (rxcnt < ibd_rc_rx_rwqe_thresh) {
1382 		state->rc_rwqe_short++;
1383 	}
1384 #endif
1385 
1386 	/*
1387 	 * Possibly replenish the Rx pool if needed.
1388 	 */
1389 	if ((rxcnt >= ibd_rc_rx_rwqe_thresh) &&
1390 	    (wc->wc_bytes_xfer > ibd_rc_rx_copy_thresh)) {
1391 		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
1392 		atomic_inc_64(&state->rc_rcv_trans_pkt);
1393 
1394 		/*
1395 		 * Record how many rwqes are held by the upper
1396 		 * network layer.
1397 		 */
1398 		if (state->rc_enable_srq) {
1399 			atomic_add_32(&state->rc_srq_rwqe_list.
1400 			    dl_bufs_outstanding, 1);
1401 		} else {
1402 			atomic_add_32(&chan->rx_wqe_list.
1403 			    dl_bufs_outstanding, 1);
1404 		}
1405 		mp = rwqe->rwqe_im_mblk;
1406 	} else {
1407 		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
1408 		atomic_inc_64(&state->rc_rcv_copy_pkt);
1409 
1410 		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
1411 		    BPRI_HI)) == NULL) {	/* no memory */
1412 			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
1413 			state->rc_rcv_alloc_fail++;
1414 			if (state->rc_enable_srq) {
1415 				if (ibd_rc_post_srq(state, rwqe) ==
1416 				    DDI_FAILURE) {
1417 					ibd_rc_srq_free_rwqe(state, rwqe);
1418 				}
1419 			} else {
1420 				if (ibd_rc_post_rwqe(chan, rwqe) ==
1421 				    DDI_FAILURE) {
1422 					ibd_rc_free_rwqe(chan, rwqe);
1423 				}
1424 			}
1425 			return;
1426 		}
1427 
1428 		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
1429 		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);
1430 
1431 		if (state->rc_enable_srq) {
1432 			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
1433 				ibd_rc_srq_free_rwqe(state, rwqe);
1434 			}
1435 		} else {
1436 			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1437 				ibd_rc_free_rwqe(chan, rwqe);
1438 			}
1439 		}
1440 	}
1441 
1442 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
1443 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
1444 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
1445 		len = ntohs(ip6h->ip6_plen);
1446 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1447 			/* LINTED: E_CONSTANT_CONDITION */
1448 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
1449 		}
1450 	}
1451 
1452 	phdr = (ib_header_info_t *)mp->b_rptr;
1453 	phdr->ib_grh.ipoib_vertcflow = 0;
1454 	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
1455 	    sizeof (ipoib_mac_t));
1456 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer + IPOIB_GRH_SIZE;
1457 
1458 	/*
1459 	 * Can RC mode in IB guarantee its checksum correctness?
1460 	 *
1461 	 *	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
1462 	 *	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
1463 	 */
1464 
1465 	/*
1466 	 * Make sure this is NULL or we're in trouble.
1467 	 */
1468 	if (mp->b_next != NULL) {
1469 		ibd_print_warn(state,
1470 		    "ibd_rc_process_rx: got duplicate mp from rcq?");
1471 		mp->b_next = NULL;
1472 	}
1473 
1474 	/*
1475 	 * Add this mp to the chain of received mp's that is handed up to
1476 	 * the network layer in batches of IBD_MAX_RX_MP_LEN
1477 	 */
1478 	if (state->rc_enable_srq) {
1479 		mutex_enter(&state->rc_rx_lock);
1480 		if (state->rc_rx_mp) {
1481 			ASSERT(state->rc_rx_mp_tail != NULL);
1482 			state->rc_rx_mp_tail->b_next = mp;
1483 		} else {
1484 			ASSERT(state->rc_rx_mp_tail == NULL);
1485 			state->rc_rx_mp = mp;
1486 		}
1487 
1488 		state->rc_rx_mp_tail = mp;
1489 		state->rc_rx_mp_len++;
1490 
1491 		if (state->rc_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1492 			mpc = state->rc_rx_mp;
1493 
1494 			state->rc_rx_mp = NULL;
1495 			state->rc_rx_mp_tail = NULL;
1496 			state->rc_rx_mp_len = 0;
1497 			mutex_exit(&state->rc_rx_lock);
1498 			mac_rx(state->id_mh, NULL, mpc);
1499 		} else {
1500 			mutex_exit(&state->rc_rx_lock);
1501 		}
1502 	} else {
1503 		mutex_enter(&chan->rx_lock);
1504 		if (chan->rx_mp) {
1505 			ASSERT(chan->rx_mp_tail != NULL);
1506 			chan->rx_mp_tail->b_next = mp;
1507 		} else {
1508 			ASSERT(chan->rx_mp_tail == NULL);
1509 			chan->rx_mp = mp;
1510 		}
1511 
1512 		chan->rx_mp_tail = mp;
1513 		chan->rx_mp_len++;
1514 
1515 		if (chan->rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
1516 			mpc = chan->rx_mp;
1517 
1518 			chan->rx_mp = NULL;
1519 			chan->rx_mp_tail = NULL;
1520 			chan->rx_mp_len = 0;
1521 			mutex_exit(&chan->rx_lock);
1522 			mac_rx(state->id_mh, NULL, mpc);
1523 		} else {
1524 			mutex_exit(&chan->rx_lock);
1525 		}
1526 	}
1527 }
1528 
1529 /*
1530  * Callback code invoked from STREAMS when the recv data buffer is free
1531  * for recycling.
1532  */
1533 static void
1534 ibd_rc_freemsg_cb(char *arg)
1535 {
1536 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
1537 	ibd_rc_chan_t *chan = rwqe->w_chan;
1538 	ibd_state_t *state = rwqe->w_state;
1539 
1540 	/*
1541 	 * If the wqe is being destructed, do not attempt recycling.
1542 	 */
1543 	if (rwqe->w_freeing_wqe == B_TRUE) {
1544 		return;
1545 	}
1546 
1547 	ASSERT(!state->rc_enable_srq);
1548 	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);
1549 
1550 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
1551 	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
1552 	if (rwqe->rwqe_im_mblk == NULL) {
1553 		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
1554 		ibd_rc_free_rwqe(chan, rwqe);
1555 		return;
1556 	}
1557 
1558 	/*
1559 	 * Post back to h/w. We could actually have more than
1560 	 * id_num_rwqe WQEs on the list if there were multiple
1561 	 * ibd_freemsg_cb() calls outstanding (since the lock is
1562 	 * not held the entire time). This will start getting
1563 	 * corrected over subsequent ibd_freemsg_cb() calls.
1564 	 */
1565 	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
1566 		ibd_rc_free_rwqe(chan, rwqe);
1567 		return;
1568 	}
1569 	atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1);
1570 }
1571 
1572 /*
1573  * Common code for interrupt handling as well as for polling
1574  * for all completed wqe's while detaching.
1575  */
1576 static void
1577 ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
1578 {
1579 	ibd_wqe_t *wqe;
1580 	ibt_wc_t *wc, *wcs;
1581 	uint_t numwcs, real_numwcs;
1582 	int i;
1583 
1584 	wcs = chan->rx_wc;
1585 	numwcs = IBD_RC_MAX_CQ_WC;
1586 
1587 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
1588 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
1589 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
1590 			if (wc->wc_status != IBT_WC_SUCCESS) {
1591 				chan->state->rc_rcq_err++;
1592 				/*
1593 				 * Channel being torn down.
1594 				 */
1595 				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
1596 				    "SUCC, chan=%p", wc->wc_status, chan);
1597 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
1598 					/*
1599 					 * Do not invoke Rx handler because
1600 					 * it might add buffers to the Rx pool
1601 					 * when we are trying to deinitialize.
1602 					 */
1603 					continue;
1604 				}
1605 			}
1606 			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
1607 		}
1608 	}
1609 }
1610 
1611 /* Receive CQ handler */
1612 /* ARGSUSED */
1613 static void
1614 ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1615 {
1616 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
1617 	ibd_state_t *state = chan->state;
1618 
1619 	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);
1620 
1621 	/*
1622 	 * Poll for completed entries; the CQ will not interrupt any
1623 	 * more for incoming (or transmitted) packets.
1624 	 */
1625 	state->rc_rcq_invoke++;
1626 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1627 
1628 	/*
1629 	 * Now enable CQ notifications; all packets that arrive now
1630 	 * (or complete transmission) will cause new interrupts.
1631 	 */
1632 	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
1633 	    IBT_SUCCESS) {
1634 		/*
1635 		 * We do not expect a failure here.
1636 		 */
1637 		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
1638 	}
1639 
1640 	/*
1641 	 * Repoll to catch all packets that might have arrived after
1642 	 * we finished the first poll loop and before interrupts got
1643 	 * armed.
1644 	 */
1645 	ibd_rc_poll_rcq(chan, chan->rcq_hdl);
1646 
1647 	if (state->rc_enable_srq) {
1648 		mutex_enter(&state->rc_rx_lock);
1649 
1650 		if (state->rc_rx_mp != NULL) {
1651 			mblk_t *mpc;
1652 			mpc = state->rc_rx_mp;
1653 
1654 			state->rc_rx_mp = NULL;
1655 			state->rc_rx_mp_tail = NULL;
1656 			state->rc_rx_mp_len = 0;
1657 
1658 			mutex_exit(&state->rc_rx_lock);
1659 			mac_rx(state->id_mh, NULL, mpc);
1660 		} else {
1661 			mutex_exit(&state->rc_rx_lock);
1662 		}
1663 	} else {
1664 		mutex_enter(&chan->rx_lock);
1665 
1666 		if (chan->rx_mp != NULL) {
1667 			mblk_t *mpc;
1668 			mpc = chan->rx_mp;
1669 
1670 			chan->rx_mp = NULL;
1671 			chan->rx_mp_tail = NULL;
1672 			chan->rx_mp_len = 0;
1673 
1674 			mutex_exit(&chan->rx_lock);
1675 			mac_rx(state->id_mh, NULL, mpc);
1676 		} else {
1677 			mutex_exit(&chan->rx_lock);
1678 		}
1679 	}
1680 }
1681 
1682 /*
1683  * Allocate the statically allocated Tx buffer list.
1684  */
1685 int
1686 ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
1687 {
1688 	ibd_rc_tx_largebuf_t *lbufp;
1689 	ibd_rc_tx_largebuf_t *tail;
1690 	uint8_t *memp;
1691 	ibt_mr_attr_t mem_attr;
1692 	uint32_t num_swqe;
1693 	size_t  mem_size;
1694 	int i;
1695 
1696 	num_swqe = ibd_rc_num_swqe - 1;
1697 
1698 	/*
1699 	 * Allocate one big chunk for all Tx large copy bufs
1700 	 */
1701 	/* The IPOIB_GRH_SIZE (40 bytes) pseudo header is not transferred */
1702 	mem_size = num_swqe * state->rc_mtu;
1703 	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);
1704 
1705 	mem_attr.mr_len = mem_size;
1706 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
1707 	mem_attr.mr_as = NULL;
1708 	mem_attr.mr_flags = IBT_MR_SLEEP;
1709 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1710 	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
1711 		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
1712 		    "failed");
1713 		kmem_free(state->rc_tx_mr_bufs, mem_size);
1714 		state->rc_tx_mr_bufs = NULL;
1715 		return (DDI_FAILURE);
1716 	}
1717 
1718 	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
1719 	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);
1720 
1721 	/*
1722 	 * Set up the buf chain
1723 	 */
1724 	memp = state->rc_tx_mr_bufs;
1725 	mutex_enter(&state->rc_tx_large_bufs_lock);
1726 	lbufp = state->rc_tx_largebuf_desc_base;
1727 	for (i = 0; i < num_swqe; i++) {
1728 		lbufp->lb_buf = memp;
1729 		lbufp->lb_next = lbufp + 1;
1730 
1731 		tail = lbufp;
1732 
1733 		memp += state->rc_mtu;
1734 		lbufp++;
1735 	}
1736 	tail->lb_next = NULL;
1737 
1738 	/*
1739 	 * Set up the buffer information in ibd state
1740 	 */
1741 	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
1742 	state->rc_tx_largebuf_nfree = num_swqe;
1743 	mutex_exit(&state->rc_tx_large_bufs_lock);
1744 	return (DDI_SUCCESS);
1745 }
1746 
1747 void
1748 ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
1749 {
1750 	uint32_t num_swqe;
1751 
1752 	num_swqe = ibd_rc_num_swqe - 1;
1753 
1754 	if (ibt_deregister_mr(state->id_hca_hdl,
1755 	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
1756 		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
1757 		    "failed");
1758 	}
1759 	state->rc_tx_mr_hdl = NULL;
1760 
1761 	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
1762 	state->rc_tx_mr_bufs = NULL;
1763 
1764 	kmem_free(state->rc_tx_largebuf_desc_base,
1765 	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
1766 	state->rc_tx_largebuf_desc_base = NULL;
1767 }
1768 
1769 static int
1770 ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
1771 {
1772 	ibt_mr_attr_t mem_attr;
1773 	ibd_state_t *state;
1774 
1775 	state = chan->state;
1776 	ASSERT(state != NULL);
1777 
1778 	/*
1779 	 * Allocate one big chunk for all regular tx copy bufs
1780 	 */
1781 	mem_attr.mr_len = chan->scq_size * ibd_rc_tx_copy_thresh;
1782 
1783 	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);
1784 
1785 	/*
1786 	 * Do one memory registration on the entire txbuf area
1787 	 */
1788 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
1789 	mem_attr.mr_as = NULL;
1790 	mem_attr.mr_flags = IBT_MR_SLEEP;
1791 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
1792 	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
1793 		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
1794 		ASSERT(mem_attr.mr_len ==
1795 		    chan->scq_size * ibd_rc_tx_copy_thresh);
1796 		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
1797 		chan->tx_mr_bufs = NULL;
1798 		return (DDI_FAILURE);
1799 	}
1800 
1801 	return (DDI_SUCCESS);
1802 }
1803 
1804 /*
1805  * Allocate the statically allocated Tx buffer list.
1806  */
1807 static int
1808 ibd_rc_init_txlist(ibd_rc_chan_t *chan)
1809 {
1810 	ibd_swqe_t *swqe;
1811 	int i;
1812 	ibt_lkey_t lkey;
1813 
1814 	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
1815 		return (DDI_FAILURE);
1816 
1817 	/*
1818 	 * Allocate and setup the swqe list
1819 	 */
1820 	lkey = chan->tx_mr_desc.md_lkey;
1821 	chan->tx_wqes = kmem_zalloc(chan->scq_size *
1822 	    sizeof (ibd_swqe_t), KM_SLEEP);
1823 	swqe = chan->tx_wqes;
1824 	for (i = 0; i < chan->scq_size; i++, swqe++) {
1825 		swqe->swqe_next = NULL;
1826 		swqe->swqe_im_mblk = NULL;
1827 
1828 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
1829 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
1830 
1831 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
1832 		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
1833 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
1834 		    (chan->tx_mr_bufs + i * ibd_rc_tx_copy_thresh);
1835 		swqe->w_swr.wr_trans = IBT_RC_SRV;
1836 
1837 		/* Add to list */
1838 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
1839 		chan->tx_wqe_list.dl_cnt++;
1840 		swqe->swqe_next = chan->tx_wqe_list.dl_head;
1841 		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
1842 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
1843 	}
1844 
1845 	return (DDI_SUCCESS);
1846 }
1847 
1848 /*
1849  * Free the statically allocated Tx buffer list.
1850  */
1851 static void
1852 ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
1853 {
1854 	if (chan->tx_mr_hdl != NULL) {
1855 		if (ibt_deregister_mr(chan->state->id_hca_hdl,
1856 		    chan->tx_mr_hdl) != IBT_SUCCESS) {
1857 			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
1858 			    "failed");
1859 		}
1860 		chan->tx_mr_hdl = NULL;
1861 	}
1862 
1863 	if (chan->tx_mr_bufs != NULL) {
1864 		kmem_free(chan->tx_mr_bufs, chan->scq_size *
1865 		    ibd_rc_tx_copy_thresh);
1866 		chan->tx_mr_bufs = NULL;
1867 	}
1868 
1869 	if (chan->tx_wqes != NULL) {
1870 		kmem_free(chan->tx_wqes, chan->scq_size *
1871 		    sizeof (ibd_swqe_t));
1872 		chan->tx_wqes = NULL;
1873 	}
1874 }
1875 
1876 /*
1877  * Acquire send wqe from free list.
1878  * Returns error number and send wqe pointer.
1879  */
1880 ibd_swqe_t *
1881 ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
1882 {
1883 	ibd_swqe_t *wqe;
1884 
1885 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1886 	if (chan->tx_rel_list.dl_head != NULL) {
		/* transfer tx_rel_list to tx_wqe_list */
1888 		chan->tx_wqe_list.dl_head =
1889 		    chan->tx_rel_list.dl_head;
1890 		chan->tx_wqe_list.dl_cnt =
1891 		    chan->tx_rel_list.dl_cnt;
1892 		chan->tx_wqe_list.dl_pending_sends = B_FALSE;
1893 
		/* clear tx_rel_list */
1895 		chan->tx_rel_list.dl_head = NULL;
1896 		chan->tx_rel_list.dl_cnt = 0;
1897 		mutex_exit(&chan->tx_rel_list.dl_mutex);
1898 
1899 		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
1900 		chan->tx_wqe_list.dl_cnt -= 1;
1901 		chan->tx_wqe_list.dl_head = wqe->swqe_next;
1902 	} else {	/* no free swqe */
1903 		mutex_exit(&chan->tx_rel_list.dl_mutex);
1904 		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
1905 		wqe = NULL;
1906 	}
1907 	return (wqe);
1908 }
1909 
1910 /*
1911  * Release send wqe back into free list.
1912  */
1913 static void
1914 ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
1915 {
1916 	/*
1917 	 * Add back on Tx list for reuse.
1918 	 */
1919 	swqe->swqe_next = NULL;
1920 	mutex_enter(&chan->tx_rel_list.dl_mutex);
1921 	chan->tx_rel_list.dl_pending_sends = B_FALSE;
1922 	swqe->swqe_next = chan->tx_rel_list.dl_head;
1923 	chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
1924 	chan->tx_rel_list.dl_cnt++;
1925 	mutex_exit(&chan->tx_rel_list.dl_mutex);
1926 }
1927 
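/*
 * Post node->w_swr, then keep draining any WRs that have been queued on
 * chan->tx_head, posting up to IBD_MAX_TX_POST_MULTIPLE of them per
 * ibt_post_send() call; tx_busy is cleared once the queue is empty.
 *
 * Rough swqe life cycle (a sketch only, not the literal ibd_send() code):
 *
 *	swqe = ibd_rc_acquire_swqes(chan);	(NULL => no free swqe)
 *	... fill swqe->swqe_copybuf.ic_sgl and swqe->w_swr ...
 *	ibd_rc_post_send(chan, swqe);
 *	... send CQ completion reaped by ibd_rc_drain_scq() ...
 *	ibd_rc_tx_cleanup(swqe);	(releases it via ibd_rc_release_swqe())
 */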
1928 void
1929 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
1930 {
1931 	uint_t		i;
1932 	uint_t		num_posted;
1933 	uint_t		n_wrs;
1934 	ibt_status_t	ibt_status;
1935 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
1936 	ibd_swqe_t	*tx_head, *elem;
1937 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
1938 
1939 	/* post the one request, then check for more */
1940 	ibt_status = ibt_post_send(chan->chan_hdl,
1941 	    &node->w_swr, 1, NULL);
1942 	if (ibt_status != IBT_SUCCESS) {
		ibd_print_warn(chan->state, "ibd_rc_post_send: "
1944 		    "posting one wr failed: ret=%d", ibt_status);
1945 		ibd_rc_tx_cleanup(node);
1946 	}
1947 
1948 	tx_head = NULL;
1949 	for (;;) {
1950 		if (tx_head == NULL) {
1951 			mutex_enter(&chan->tx_post_lock);
1952 			tx_head = chan->tx_head;
1953 			if (tx_head == NULL) {
1954 				chan->tx_busy = 0;
1955 				mutex_exit(&chan->tx_post_lock);
1956 				return;
1957 			}
1958 			chan->tx_head = NULL;
1959 			mutex_exit(&chan->tx_post_lock);
1960 		}
1961 
1962 		/*
1963 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
1964 		 * at a time if possible, and keep posting them.
1965 		 */
1966 		for (n_wrs = 0, elem = tx_head;
1967 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
1968 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
1969 			nodes[n_wrs] = elem;
1970 			wrs[n_wrs] = elem->w_swr;
1971 		}
1972 		tx_head = elem;
1973 
1974 		ASSERT(n_wrs != 0);
1975 
1976 		/*
1977 		 * If posting fails for some reason, we'll never receive
1978 		 * completion intimation, so we'll need to cleanup. But
1979 		 * we need to make sure we don't clean up nodes whose
1980 		 * wrs have been successfully posted. We assume that the
1981 		 * hca driver returns on the first failure to post and
1982 		 * therefore the first 'num_posted' entries don't need
1983 		 * cleanup here.
1984 		 */
1985 		num_posted = 0;
1986 		ibt_status = ibt_post_send(chan->chan_hdl,
1987 		    wrs, n_wrs, &num_posted);
1988 		if (ibt_status != IBT_SUCCESS) {
			ibd_print_warn(chan->state, "ibd_rc_post_send: "
1990 			    "posting multiple wrs failed: "
1991 			    "requested=%d, done=%d, ret=%d",
1992 			    n_wrs, num_posted, ibt_status);
1993 
1994 			for (i = num_posted; i < n_wrs; i++)
1995 				ibd_rc_tx_cleanup(nodes[i]);
1996 		}
1997 	}
1998 }
1999 
2000 /*
2001  * Common code that deals with clean ups after a successful or
2002  * erroneous transmission attempt.
2003  */
2004 void
2005 ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
2006 {
2007 	ibd_ace_t *ace = swqe->w_ahandle;
2008 	ibd_state_t *state;
2009 
2010 	ASSERT(ace != NULL);
2011 	ASSERT(ace->ac_chan != NULL);
2012 
2013 	state = ace->ac_chan->state;
2014 
2015 	/*
2016 	 * If this was a dynamic registration in ibd_send(),
2017 	 * deregister now.
2018 	 */
2019 	if (swqe->swqe_im_mblk != NULL) {
2020 		ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
2021 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
2022 			ibd_unmap_mem(state, swqe);
2023 		}
2024 		freemsg(swqe->swqe_im_mblk);
2025 		swqe->swqe_im_mblk = NULL;
2026 	} else {
2027 		ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
2028 	}
2029 
2030 	if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
2031 		ibd_rc_tx_largebuf_t *lbufp;
2032 
2033 		lbufp = swqe->w_rc_tx_largebuf;
2034 		ASSERT(lbufp != NULL);
2035 
2036 		mutex_enter(&state->rc_tx_large_bufs_lock);
2037 		lbufp->lb_next = state->rc_tx_largebuf_free_head;
2038 		state->rc_tx_largebuf_free_head = lbufp;
2039 		state->rc_tx_largebuf_nfree ++;
2040 		mutex_exit(&state->rc_tx_large_bufs_lock);
2041 		swqe->w_rc_tx_largebuf = NULL;
2042 	}
2043 
2045 	/*
2046 	 * Release the send wqe for reuse.
2047 	 */
2048 	ibd_rc_release_swqe(ace->ac_chan, swqe);
2049 
2050 	/*
2051 	 * Drop the reference count on the AH; it can be reused
2052 	 * now for a different destination if there are no more
2053 	 * posted sends that will use it. This can be eliminated
2054 	 * if we can always associate each Tx buffer with an AH.
2055 	 * The ace can be null if we are cleaning up from the
2056 	 * ibd_send() error path.
2057 	 */
2058 	ibd_dec_ref_ace(state, ace);
2059 }
2060 
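/*
 * Drain the send CQ: reap completed WRs, clean up their swqes, and, when
 * enough Tx resources (RC swqes, RC large bufs or UD swqes) have been
 * freed up for a stalled transmit path, call mac_tx_update().
 */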
2061 void
2062 ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
2063 {
2064 	ibd_state_t *state = chan->state;
2065 	ibd_wqe_t *wqe;
2066 	ibt_wc_t *wc, *wcs;
2067 	uint_t numwcs, real_numwcs;
2068 	int i;
2069 
2070 	wcs = chan->tx_wc;
2071 	numwcs = IBD_RC_MAX_CQ_WC;
2072 
2073 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
2074 		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
2075 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
2076 			if (wc->wc_status != IBT_WC_SUCCESS) {
2077 				chan->tx_trans_error_cnt ++;
2078 				DPRINT(30, "ibd_rc_drain_scq: "
2079 				    "wc_status(%d) != SUCC, "
2080 				    "chan=%p, ace=%p, link_state=%d",
2081 				    wc->wc_status, chan, chan->ace,
2082 				    chan->state->id_link_state);
2083 			} else {
2084 				chan->tx_trans_error_cnt = 0;
2085 			}
2086 			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
2087 		}
2088 
2089 		mutex_enter(&state->id_sched_lock);
2090 		if (state->id_sched_needed == 0) {
2091 			mutex_exit(&state->id_sched_lock);
2092 		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
2093 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2094 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2095 			if ((chan->tx_rel_list.dl_cnt +
2096 			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
2097 				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
2098 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2099 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2100 				mutex_exit(&state->id_sched_lock);
2101 				state->rc_swqe_mac_update++;
2102 				mac_tx_update(state->id_mh);
2103 			} else {
2104 				state->rc_scq_no_swqe++;
2105 				mutex_exit(&chan->tx_rel_list.dl_mutex);
2106 				mutex_exit(&chan->tx_wqe_list.dl_mutex);
2107 				mutex_exit(&state->id_sched_lock);
2108 			}
2109 		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
2110 			mutex_enter(&state->rc_tx_large_bufs_lock);
2111 			if (state->rc_tx_largebuf_nfree >
2112 			    IBD_RC_TX_FREE_THRESH) {
2113 				ASSERT(state->rc_tx_largebuf_free_head != NULL);
2114 				state->id_sched_needed &=
2115 				    ~IBD_RSRC_RC_TX_LARGEBUF;
2116 				mutex_exit(&state->rc_tx_large_bufs_lock);
2117 				mutex_exit(&state->id_sched_lock);
2118 				state->rc_xmt_buf_mac_update++;
2119 				mac_tx_update(state->id_mh);
2120 			} else {
2121 				state->rc_scq_no_largebuf++;
2122 				mutex_exit(&state->rc_tx_large_bufs_lock);
2123 				mutex_exit(&state->id_sched_lock);
2124 			}
2125 		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
2126 			mutex_enter(&state->id_tx_list.dl_mutex);
2127 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
2128 			if ((state->id_tx_list.dl_cnt +
2129 			    state->id_tx_rel_list.dl_cnt)
2130 			    > IBD_FREE_SWQES_THRESH) {
2131 				state->id_sched_needed &= ~IBD_RSRC_SWQE;
2132 				state->id_sched_cnt++;
2133 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2134 				mutex_exit(&state->id_tx_list.dl_mutex);
2135 				mutex_exit(&state->id_sched_lock);
2136 				mac_tx_update(state->id_mh);
2137 			} else {
2138 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2139 				mutex_exit(&state->id_tx_list.dl_mutex);
2140 				mutex_exit(&state->id_sched_lock);
2141 			}
2142 		} else {
2143 			mutex_exit(&state->id_sched_lock);
2144 		}
2145 	}
2146 }
2147 
/* Send CQ handler; triggers ibd_rc_tx_recycle() to recycle Tx buffers */
2149 /* ARGSUSED */
2150 static void
2151 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2152 {
2153 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2154 
2155 	chan->state->rc_scq_invoke++;
2156 
2157 	if (ibd_rc_tx_softintr == 1) {
2158 		mutex_enter(&chan->tx_poll_lock);
2159 		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2160 			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2161 			mutex_exit(&chan->tx_poll_lock);
2162 			return;
2163 		} else {
2164 			mutex_exit(&chan->tx_poll_lock);
2165 			ddi_trigger_softintr(chan->scq_softintr);
2166 		}
2167 	} else
2168 		(void) ibd_rc_tx_recycle(arg);
2169 }
2170 
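/*
 * Reap Tx completions for a channel; runs either as a soft interrupt
 * (when ibd_rc_tx_softintr is set) or directly from ibd_rc_scq_handler().
 * It drains the send CQ, re-arms CQ notification, and re-drains to close
 * the race with completions arriving in between.  After more than 3
 * consecutive transmit errors the channel is torn down and reset instead.
 */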
2171 static uint_t
2172 ibd_rc_tx_recycle(caddr_t arg)
2173 {
2174 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2175 	ibd_ace_t *ace;
2176 	ibd_state_t *state = chan->state;
2177 	int flag, redo_flag;
2178 	int redo = 1;
2179 
2180 	flag = IBD_CQ_POLLING;
2181 	redo_flag = IBD_REDO_CQ_POLLING;
2182 
2183 	mutex_enter(&chan->tx_poll_lock);
2184 	if (chan->tx_poll_busy & flag) {
2185 		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2186 		    "threads");
2187 		chan->tx_poll_busy |= redo_flag;
2188 		mutex_exit(&chan->tx_poll_lock);
2189 		return (DDI_INTR_CLAIMED);
2190 	}
2191 	chan->tx_poll_busy |= flag;
2192 	mutex_exit(&chan->tx_poll_lock);
2193 
2194 	/*
2195 	 * Poll for completed entries; the CQ will not interrupt any
2196 	 * more for completed packets.
2197 	 */
2198 	ibd_rc_drain_scq(chan, chan->scq_hdl);
2199 
2200 	/*
2201 	 * Now enable CQ notifications; all completions originating now
2202 	 * will cause new interrupts.
2203 	 */
2204 	do {
2205 		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2206 		    IBT_SUCCESS) {
2207 			/*
2208 			 * We do not expect a failure here.
2209 			 */
2210 			DPRINT(40, "ibd_rc_scq_handler: ibt_enable_cq_notify()"
2211 			    " failed");
2212 		}
2213 
2214 		ibd_rc_drain_scq(chan, chan->scq_hdl);
2215 
2216 		if (chan->tx_trans_error_cnt > 3) {
2217 			mutex_enter(&chan->tx_poll_lock);
2218 			chan->tx_poll_busy = 0;
2219 			mutex_exit(&chan->tx_poll_lock);
2220 			goto error_reset_chan;
2221 		}
2222 		mutex_enter(&chan->tx_poll_lock);
2223 		if (chan->tx_poll_busy & redo_flag)
2224 			chan->tx_poll_busy &= ~redo_flag;
2225 		else {
2226 			chan->tx_poll_busy &= ~flag;
2227 			redo = 0;
2228 		}
2229 		mutex_exit(&chan->tx_poll_lock);
2230 
2231 	} while (redo);
2232 
2233 	return (DDI_INTR_CLAIMED);
2234 
2235 error_reset_chan:
2236 	/*
2237 	 * Channel being torn down.
2238 	 */
2239 	mutex_enter(&state->id_ac_mutex);
2240 	if ((chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
2241 	    (chan->state->id_link_state == LINK_STATE_UP) &&
2242 	    ((ace = ibd_acache_find(state, &chan->ace->ac_mac, B_FALSE, 0))
2243 	    != NULL) && (ace == chan->ace)) {
2244 		ASSERT(ace->ac_mce == NULL);
2245 		INC_REF(ace, 1);
2246 		IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2247 		chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
2248 		mutex_exit(&state->id_ac_mutex);
2249 		state->rc_reset_cnt++;
2250 		DPRINT(30, "ibd_rc_tx_recycle(chan=%p, ace=%p): "
2251 		    " reset RC channel", chan, chan->ace);
2252 		ibd_rc_signal_act_close(state, ace);
2253 	} else {
2254 		mutex_exit(&state->id_ac_mutex);
2255 		state->rc_act_close_simultaneous++;
2256 		DPRINT(40, "ibd_rc_tx_recycle: other thread is closing"
2257 		    " it. chan=%p, act_state=%d, link_state=%d, ace=%p",
2258 		    chan, chan->chan_state, state->id_link_state, ace);
2259 	}
2260 	return (DDI_INTR_CLAIMED);
2261 }
2262 
/* Listen on the IPoIB-CM service IDs derived from our UD QP number */
2264 ibt_status_t
2265 ibd_rc_listen(ibd_state_t *state)
2266 {
2267 	ibt_srv_desc_t srvdesc;
2268 	ib_svc_id_t ret_sid;
2269 	ibt_status_t status;
2270 	ib_gid_t gid;
2271 
2272 	if (state->rc_listen_hdl != NULL) {
2273 		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
2274 		return (IBT_FAILURE);
2275 	}
2276 
2277 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2278 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2279 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2280 
2281 	/*
2282 	 * Register the service with service id
2283 	 * Incoming connection requests should arrive on this service id.
2284 	 */
2285 	status = ibt_register_service(state->id_ibt_hdl, &srvdesc,
2286 	    IBD_RC_QPN_TO_SID(state->id_qpnum),
2287 	    1, &state->rc_listen_hdl, &ret_sid);
2288 	if (status != IBT_SUCCESS) {
2289 		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
2290 		    "ret=%d", status);
2291 		return (status);
2292 	}
2293 
2294 	gid = state->id_sgid;
2295 
2296 	/* pass state as cm_private */
2297 	status = ibt_bind_service(state->rc_listen_hdl,
2298 	    gid, NULL, state, &state->rc_listen_bind);
2299 	if (status != IBT_SUCCESS) {
2300 		DPRINT(40, "ibd_rc_listen:"
2301 		    " fail to bind port: <%d>", status);
2302 		(void) ibt_deregister_service(state->id_ibt_hdl,
2303 		    state->rc_listen_hdl);
2304 		return (status);
2305 	}
2306 
2307 	/*
2308 	 * Legacy OFED had used a wrong service ID (one additional zero digit)
2309 	 * for many years. To interop with legacy OFED, we support this wrong
2310 	 * service ID here.
2311 	 */
2312 	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);
2313 
2314 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
2315 	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
2316 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
2317 
2318 	/*
2319 	 * Register the service with service id
2320 	 * Incoming connection requests should arrive on this service id.
2321 	 */
2322 	status = ibt_register_service(state->id_ibt_hdl, &srvdesc,
2323 	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
2324 	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
2325 	if (status != IBT_SUCCESS) {
2326 		DPRINT(40,
2327 		    "ibd_rc_listen: Service Registration for Legacy OFED "
2328 		    "Failed %d", status);
2329 		(void) ibt_unbind_service(state->rc_listen_hdl,
2330 		    state->rc_listen_bind);
2331 		(void) ibt_deregister_service(state->id_ibt_hdl,
2332 		    state->rc_listen_hdl);
2333 		return (status);
2334 	}
2335 
2336 	gid = state->id_sgid;
2337 
2338 	/* pass state as cm_private */
2339 	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
2340 	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
2341 	if (status != IBT_SUCCESS) {
2342 		DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
2343 		    "Legacy OFED listener", status);
2344 		(void) ibt_deregister_service(state->id_ibt_hdl,
2345 		    state->rc_listen_hdl_OFED_interop);
2346 		(void) ibt_unbind_service(state->rc_listen_hdl,
2347 		    state->rc_listen_bind);
2348 		(void) ibt_deregister_service(state->id_ibt_hdl,
2349 		    state->rc_listen_hdl);
2350 		return (status);
2351 	}
2352 
2353 	return (IBT_SUCCESS);
2354 }
2355 
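/*
 * Tear down the listeners created by ibd_rc_listen(): unbind and
 * deregister both the standard service and the legacy OFED-interop
 * service.
 */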
2356 void
2357 ibd_rc_stop_listen(ibd_state_t *state)
2358 {
2359 	int ret;
2360 
2361 	/* Disable incoming connection requests */
2362 	if (state->rc_listen_hdl != NULL) {
2363 		ret = ibt_unbind_all_services(state->rc_listen_hdl);
2364 		if (ret != 0) {
2365 			DPRINT(40, "ibd_rc_stop_listen:"
2366 			    "ibt_unbind_all_services() failed, ret=%d", ret);
2367 		}
2368 		ret = ibt_deregister_service(state->id_ibt_hdl,
2369 		    state->rc_listen_hdl);
2370 		if (ret != 0) {
2371 			DPRINT(40, "ibd_rc_stop_listen:"
2372 			    "ibt_deregister_service() failed, ret=%d", ret);
2373 		} else {
2374 			state->rc_listen_hdl = NULL;
2375 		}
2376 	}
2377 
	/* Likewise for the legacy OFED-interop listener */
2379 	if (state->rc_listen_hdl_OFED_interop != NULL) {
2380 		ret = ibt_unbind_all_services(
2381 		    state->rc_listen_hdl_OFED_interop);
2382 		if (ret != 0) {
2383 			DPRINT(40, "ibd_rc_stop_listen:"
2384 			    "ibt_unbind_all_services() failed: %d", ret);
2385 		}
2386 		ret = ibt_deregister_service(state->id_ibt_hdl,
2387 		    state->rc_listen_hdl_OFED_interop);
2388 		if (ret != 0) {
2389 			DPRINT(40, "ibd_rc_stop_listen:"
2390 			    "ibt_deregister_service() failed: %d", ret);
2391 		} else {
2392 			state->rc_listen_hdl_OFED_interop = NULL;
2393 		}
2394 	}
2395 }
2396 
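/*
 * Quiesce and close every RC channel (passive first, then active).
 * Returns DDI_FAILURE if receive buffers are still outstanding with the
 * network layer after waiting ~5 seconds, or if a passive channel cannot
 * be closed; in that case the Rx CQ handlers are re-armed before
 * returning.
 */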
2397 int
2398 ibd_rc_close_all_chan(ibd_state_t *state)
2399 {
2400 	ibd_rc_chan_t *rc_chan, *rc_chan1;
2401 	ibd_ace_t *ace;
2402 	uint_t attempts;
2403 
2404 	/* Disable all Rx routines */
2405 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2406 	rc_chan = state->rc_pass_chan_list.chan_list;
2407 	while (rc_chan != NULL) {
2408 		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
2409 		rc_chan = rc_chan->next;
2410 	}
2411 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2412 
2413 	if (state->rc_enable_srq) {
2414 		attempts = 50;
2415 		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
2416 			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
2417 			delay(drv_usectohz(100000));
2418 			if (--attempts == 0) {
2419 				/*
2420 				 * There are pending bufs with the network
2421 				 * layer and we have no choice but to wait
2422 				 * for them to be done with. Reap all the
2423 				 * Tx/Rx completions that were posted since
2424 				 * we turned off the notification and
2425 				 * return failure.
2426 				 */
2427 				mutex_enter(
2428 				    &state->rc_pass_chan_list.chan_list_mutex);
2429 				rc_chan = state->rc_pass_chan_list.chan_list;
2430 				while (rc_chan != NULL) {
2431 					ibd_rc_poll_rcq
2432 					    (rc_chan, rc_chan->rcq_hdl);
2433 					ibt_set_cq_handler(rc_chan->rcq_hdl,
2434 					    ibd_rc_rcq_handler, rc_chan);
2435 					rc_chan = rc_chan->next;
2436 				}
2437 				mutex_exit(
2438 				    &state->rc_pass_chan_list.chan_list_mutex);
2439 				return (DDI_FAILURE);
2440 			}
2441 		}
2442 	}
2443 
2444 	/* Close all passive RC channels */
2445 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2446 	while (rc_chan != NULL) {
2447 		if (ibd_rc_pas_close(rc_chan) != DDI_SUCCESS) {
2448 			mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2449 			rc_chan1 = state->rc_pass_chan_list.chan_list;
2450 			while (rc_chan1 != NULL) {
2451 				ibd_rc_poll_rcq(rc_chan1, rc_chan1->rcq_hdl);
2452 				ibt_set_cq_handler(rc_chan1->rcq_hdl,
2453 				    ibd_rc_rcq_handler, rc_chan1);
2454 				rc_chan1 = rc_chan1->next;
2455 			}
2456 			mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2457 			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
2458 			    rc_chan);
2459 			DPRINT(40, "ibd_rc_close_all_chan: ibd_rc_pas_close() "
2460 			    "failed");
2461 			return (DDI_FAILURE);
2462 		}
2463 		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2464 	}
2465 
2466 	/* Close all active RC channels */
2467 	mutex_enter(&state->id_ac_mutex);
2468 	ace = list_head(&state->id_ah_active);
2469 	while (ace != NULL) {
2470 		if (ace->ac_chan != NULL) {
2471 			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
2472 			    ace->ac_chan);
2473 		}
2474 		ace = list_next(&state->id_ah_active, ace);
2475 	}
2476 	mutex_exit(&state->id_ac_mutex);
2477 
2478 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
2479 	while (rc_chan != NULL) {
2480 		ace = rc_chan->ace;
2481 		ibd_rc_act_close(rc_chan);
2482 		if (ace != NULL)
2483 			ace->ac_chan = NULL;
2484 		rc_chan = ibd_rc_rm_header_chan_list(
2485 		    &state->rc_obs_act_chan_list);
2486 	}
2487 	return (DDI_SUCCESS);
2488 }
2489 
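/*
 * Try to bring up an RC connection to the peer described by "ace"/"path".
 * The legacy OFED-interop service ID is tried twice before falling back
 * to the standard IBD_RC_SERVICE_ID; between attempts we pause briefly so
 * the peer can remove a stale channel.  The SID used for each attempt is
 * the service ID ORed with the low 24 bits of the peer's UD QPN (see
 * ibd_rc_connect()).
 */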
2490 void
2491 ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path)
2492 {
2493 	ibt_status_t status;
2494 
2495 	status = ibd_rc_connect(state, ace, path,
2496 	    IBD_RC_SERVICE_ID_OFED_INTEROP);
2497 
2498 	if (status != IBT_SUCCESS) {
		/* wait for the peer to remove the stale channel */
2500 		delay(drv_usectohz(10000));
2501 		status = ibd_rc_connect(state, ace, path,
2502 		    IBD_RC_SERVICE_ID_OFED_INTEROP);
2503 	}
2504 
2505 	if (status != IBT_SUCCESS) {
		/* wait for the peer to remove the stale channel */
2507 		delay(drv_usectohz(10000));
2508 		(void) ibd_rc_connect(state, ace, path,
2509 		    IBD_RC_SERVICE_ID);
2510 	}
2511 }
2512 
2513 /*
2514  * Allocates channel and sets the ace->ac_chan to it.
2515  * Opens the channel.
2516  */
2517 ibt_status_t
2518 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path,
2519     uint64_t ietf_cm_service_id)
2520 {
2521 	ibt_status_t status = 0;
2522 	ibt_rc_returns_t open_returns;
2523 	ibt_chan_open_args_t open_args;
2524 	ibd_rc_msg_hello_t hello_req_msg;
2525 	ibd_rc_msg_hello_t *hello_ack_msg;
2526 	ibd_rc_chan_t *chan;
2527 
2528 	ASSERT(ace != NULL);
2529 	ASSERT(ace->ac_mce == NULL);
2530 	ASSERT(ace->ac_chan == NULL);
2531 
2532 	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
2533 		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
2534 		return (status);
2535 	}
2536 
2537 	ace->ac_chan = chan;
2538 	chan->state = state;
2539 	chan->ace = ace;
2540 
2541 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);
2542 
2543 	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);
2544 
2545 	/*
2546 	 * open the channels
2547 	 */
2548 	bzero(&open_args, sizeof (ibt_chan_open_args_t));
2549 	bzero(&open_returns, sizeof (ibt_rc_returns_t));
2550 
2551 	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
2552 	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;
2553 
2554 	/*
2555 	 * update path record with the SID
2556 	 */
2557 	path->pi_sid =
2558 	    ietf_cm_service_id | ((ace->ac_dest->ud_dst_qpn) & 0xffffff);
2559 
2561 	/* pre-allocate memory for hello ack message */
2562 	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2563 	open_returns.rc_priv_data = hello_ack_msg;
2564 
2565 	open_args.oc_path = path;
2566 
2567 	open_args.oc_path_rnr_retry_cnt	= 7;
2568 	open_args.oc_path_retry_cnt = 7;
2569 
2570 	/* We don't do RDMA */
2571 	open_args.oc_rdma_ra_out = 0;
2572 	open_args.oc_rdma_ra_in	= 0;
2573 
2574 	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
2575 	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
2576 	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
2577 	open_args.oc_priv_data = (void *)(&hello_req_msg);
2578 
2579 	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
2580 	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
2581 	ASSERT(open_args.oc_cm_handler != NULL);
2582 
2583 	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
2584 	    IBT_BLOCKING, &open_args, &open_returns);
2585 
2586 	if (status == IBT_SUCCESS) {
2587 		/* Success! */
2588 		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
2589 		state->rc_conn_succ++;
2590 		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2591 		return (IBT_SUCCESS);
2592 	}
2593 
2594 	/* failure */
2595 	(void) ibt_flush_channel(chan->chan_hdl);
2596 	ibd_rc_free_chan(chan);
2597 	ace->ac_chan = NULL;
2598 
	/* Failure: report the error from open_returns and exit */
2600 	DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail."
2601 	    "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
2602 	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
2603 	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
2604 	    ace->ac_dest->ud_dst_qpn);
2605 	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
2606 	return (status);
2607 }
2608 
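/*
 * Ask the async thread to close an active-side RC channel.  If no request
 * structure can be allocated, park the channel on rc_obs_act_chan_list so
 * it is cleaned up later.
 */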
2609 void
2610 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
2611 {
2612 	ibd_req_t *req;
2613 
2614 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2615 	if (req == NULL) {
2616 		ibd_print_warn(state, "ibd_rc_signal_act_close: alloc "
2617 		    "ibd_req_t fail");
2618 		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
2619 		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
2620 		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
2621 		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
2622 	} else {
2623 		req->rq_ptr = ace->ac_chan;
2624 		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
2625 	}
2626 }
2627 
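/*
 * Ask the async thread to recycle this ace (and its RC channel).  Only
 * one recycle request is outstanding at a time (rc_ace_recycle).
 */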
2628 void
2629 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
2630 {
2631 	ibd_req_t *req;
2632 
2633 	mutex_enter(&state->rc_ace_recycle_lock);
2634 	if (state->rc_ace_recycle != NULL) {
2635 		mutex_exit(&state->rc_ace_recycle_lock);
2636 		return;
2637 	}
2638 
2639 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
2640 	if (req == NULL) {
2641 		mutex_exit(&state->rc_ace_recycle_lock);
2642 		return;
2643 	}
2644 
2645 	state->rc_ace_recycle = ace;
2646 	mutex_exit(&state->rc_ace_recycle_lock);
2647 	ASSERT(ace->ac_mce == NULL);
2648 	INC_REF(ace, 1);
2649 	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
2650 	req->rq_ptr = ace;
2651 	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
2652 }
2653 
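/*
 * Active-side channel close.  For an established channel, quiesce the
 * CQ handlers, wait up to ~5 seconds for the send queue to drain, then
 * close the RC channel and free it.
 */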
2654 static void
2655 ibd_rc_act_close(ibd_rc_chan_t *chan)
2656 {
2657 	uint_t times;
2658 	ibt_status_t ret;
2659 
2660 	ASSERT(chan != NULL);
2661 
2662 	chan->state->rc_act_close++;
2663 	switch (chan->chan_state) {
2664 	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
2665 	case IBD_RC_STATE_ACT_ESTAB:
2666 		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
2667 		    "act_state=%d, chan=%p", chan->chan_state, chan);
2668 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2669 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/* Wait until the send queue is empty */
2671 		times = 0;
2672 		mutex_enter(&chan->tx_wqe_list.dl_mutex);
2673 		mutex_enter(&chan->tx_rel_list.dl_mutex);
2674 		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
2675 		    != chan->scq_size) && (times < 50)) {
2676 			DPRINT(30, "ibd_rc_act_close: dl_cnt(tx_wqe_list=%d,"
2677 			    " tx_rel_list=%d) != chan->scq_size=%d",
2678 			    chan->tx_wqe_list.dl_cnt, chan->tx_rel_list.dl_cnt,
2679 			    chan->scq_size);
2680 			mutex_exit(&chan->tx_rel_list.dl_mutex);
2681 			mutex_exit(&chan->tx_wqe_list.dl_mutex);
2682 			mutex_enter(&chan->tx_poll_lock);
2683 			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2684 				DPRINT(40, "ibd_rc_act_close: multiple "
2685 				    "polling threads");
2686 				mutex_exit(&chan->tx_poll_lock);
2687 			} else {
2688 				chan->tx_poll_busy = IBD_CQ_POLLING;
2689 				mutex_exit(&chan->tx_poll_lock);
2690 				ibd_rc_drain_scq(chan, chan->scq_hdl);
2691 				mutex_enter(&chan->tx_poll_lock);
2692 				chan->tx_poll_busy = 0;
2693 				mutex_exit(&chan->tx_poll_lock);
2694 			}
2695 			delay(drv_usectohz(100000));
2696 			times++;
2697 			mutex_enter(&chan->tx_wqe_list.dl_mutex);
2698 			mutex_enter(&chan->tx_rel_list.dl_mutex);
2699 		}
2700 		mutex_exit(&chan->tx_rel_list.dl_mutex);
2701 		mutex_exit(&chan->tx_wqe_list.dl_mutex);
2702 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2703 		ret = ibt_close_rc_channel(chan->chan_hdl,
2704 		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
2705 		if (ret != IBT_SUCCESS) {
2706 			DPRINT(40, "ibd_rc_act_close-2: ibt_close_rc_channel "
2707 			    "fail, chan=%p, returned=%d", chan, ret);
2708 		} else {
2709 			DPRINT(30, "ibd_rc_act_close-2: ibt_close_rc_channel "
2710 			    "succ, chan=%p", chan);
2711 		}
2712 
2713 		ibd_rc_free_chan(chan);
2714 		break;
2715 	case IBD_RC_STATE_ACT_REP_RECV:
2716 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2717 		(void) ibt_flush_channel(chan->chan_hdl);
2718 		ibd_rc_free_chan(chan);
2719 		break;
2720 	case IBD_RC_STATE_ACT_ERROR:
2721 		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
2722 		break;
2723 	default:
2724 		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
2725 		    "chan=%p", chan->chan_state, chan);
2726 	}
2727 }
2728 
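/*
 * Passive-side channel close.  For an established channel, stop Rx
 * interrupts, wait up to ~5 seconds for outstanding receive buffers to be
 * returned (non-SRQ case), then close the RC channel and free it.
 * Returns DDI_FAILURE if the receive buffers cannot be reclaimed.
 */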
2729 static int
2730 ibd_rc_pas_close(ibd_rc_chan_t *chan)
2731 {
2732 	uint_t times;
2733 	ibt_status_t ret;
2734 
2735 	ASSERT(chan != NULL);
2736 	chan->state->rc_pas_close++;
2737 
2738 	switch (chan->chan_state) {
2739 	case IBD_RC_STATE_PAS_ESTAB:
2740 		/*
2741 		 * First, stop receive interrupts; this stops the
2742 		 * connection from handing up buffers to higher layers.
2743 		 * Wait for receive buffers to be returned; give up
2744 		 * after 5 seconds.
2745 		 */
2746 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2747 		if (!chan->state->rc_enable_srq) {
2748 			times = 50;
2749 			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
2750 				delay(drv_usectohz(100000));
2751 				if (--times == 0) {
2752 					DPRINT(40, "ibd_rc_pas_close : "
2753 					    "reclaiming failed");
2754 					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
2755 					ibt_set_cq_handler(chan->rcq_hdl,
2756 					    ibd_rc_rcq_handler,
2757 					    (void *)(uintptr_t)chan);
2758 					return (DDI_FAILURE);
2759 				}
2760 			}
2761 		}
2762 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2763 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2764 		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
2765 		    "chan_state=%d, chan=%p", chan->chan_state, chan);
2766 		ret = ibt_close_rc_channel(chan->chan_hdl,
2767 		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
2768 		if (ret != IBT_SUCCESS) {
2769 			DPRINT(40, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2770 			    " fail, chan=%p, returned=%d", chan, ret);
2771 		} else {
2772 			DPRINT(30, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2773 			    " succ, chan=%p", chan);
2774 		}
2775 
2776 		ibd_rc_free_chan(chan);
2777 		break;
2778 	case IBD_RC_STATE_PAS_REQ_RECV:
2779 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2780 		(void) ibt_flush_channel(chan->chan_hdl);
2781 		ibd_rc_free_chan(chan);
2782 		break;
2783 	default:
2784 		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
2785 		    chan->chan_state, chan);
2786 	}
2787 	return (DDI_SUCCESS);
2788 }
2789 
2790 /*
2791  * Remove duplicate RC channel which comes from the same mac
2792  *
2793  * From the IP point of view, we could check for same MAC:
2794  * GID, P_Key (or QPN, though in a reboot this is likely to
2795  * change so P_Key is better). The GID usually will equate to
2796  * port (since typically it uses the port GUID in the low 64 bits).
2797  * These fields exists in the REQ messages.
2798  */
2799 void
2800 ibd_rc_handle_req_rm_dup(ibd_state_t *state, ibt_cm_event_t *ibt_cm_event)
2801 {
2802 	ibd_rc_chan_t *chan, *pre_chan;
2803 
2804 	pre_chan = NULL;
2805 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2806 	chan = state->rc_pass_chan_list.chan_list;
2807 	while (chan != NULL) {
2808 		if ((bcmp(&chan->requester_gid,
2809 		    &ibt_cm_event->cm_event.req.req_prim_addr.av_dgid,
2810 		    sizeof (ib_gid_t)) == 0) && (chan->requester_pkey ==
2811 		    ibt_cm_event->cm_event.req.req_pkey)) {
2812 			if (pre_chan == NULL) {
2813 				state->rc_pass_chan_list.chan_list = chan->next;
2814 			} else {
2815 				pre_chan->next = chan->next;
2816 			}
2817 			break;
2818 		}
2819 		pre_chan = chan;
2820 		chan = chan->next;
2821 	}
2822 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2823 	if (chan) {
2824 		DPRINT(30, "ibd_rc_handle_req_rm_dup: same gid and pkey, "
2825 		    "remove duplicate channal, chan=%p", chan);
2826 		if (ibd_rc_pas_close(chan) != DDI_SUCCESS) {
2827 			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
2828 			    chan);
2829 		}
2830 	}
2831 }
2832 
2833 /*
2834  * Passive Side:
2835  *	Handle an incoming CM REQ from active side.
2836  *
2837  *	If success, this function allocates an ibd_rc_chan_t, then
2838  * assigns it to "*ret_conn".
2839  */
2840 static ibt_cm_status_t
2841 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
2842     ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
2843     void *ret_priv_data)
2844 {
2845 	ibd_rc_msg_hello_t *hello_msg;
2846 	ibd_state_t *state = (ibd_state_t *)arg;
2847 	ibd_rc_chan_t *chan;
2848 
2849 	ibd_rc_handle_req_rm_dup(state, ibt_cm_event);
2850 
2851 	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
2852 		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
2853 		return (IBT_CM_REJECT);
2854 	}
2855 
2856 	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
2857 
2858 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
2859 
2860 	if (!state->rc_enable_srq) {
2861 		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
2862 			ibd_rc_free_chan(chan);
2863 			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
2864 			    "failed");
2865 			return (IBT_CM_REJECT);
2866 		}
2867 	}
2868 
2869 	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
2870 
2871 	/* We don't do RDMA */
2872 	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
2873 	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
2874 
2875 	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
2876 	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
2877 
2878 	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
2879 	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
2880 	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
2881 
2882 	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
2883 	hello_msg->reserved_qpn = htonl(state->id_qpnum);
2884 	hello_msg->rx_mtu = htonl(state->rc_mtu);
2885 
2886 	chan->requester_gid = ibt_cm_event->cm_event.req.req_prim_addr.av_dgid;
2887 	chan->requester_pkey = ibt_cm_event->cm_event.req.req_pkey;
2888 	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
2889 	*ret_conn = chan;
2890 
2891 	return (IBT_CM_ACCEPT);
2892 }
2893 
2894 /*
2895  * ibd_rc_handle_act_estab -- handler for connection established completion
2896  * for active side.
2897  */
2898 static ibt_cm_status_t
2899 ibd_rc_handle_act_estab(ibd_ace_t *ace)
2900 {
2901 	ibt_status_t result;
2902 
2903 	switch (ace->ac_chan->chan_state) {
2904 		case IBD_RC_STATE_ACT_REP_RECV:
2905 			ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
2906 			result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
2907 			    IBT_NEXT_COMPLETION);
2908 			if (result != IBT_SUCCESS) {
2909 				DPRINT(40, "ibd_rc_handle_act_estab: "
2910 				    "ibt_enable_cq_notify(rcq) "
2911 				    "failed: status %d", result);
2912 				return (IBT_CM_REJECT);
2913 			}
2914 			break;
2915 		default:
2916 			DPRINT(40, "ibd_rc_handle_act_estab: default "
2917 			    "branch, act_state=%d", ace->ac_chan->chan_state);
2918 			return (IBT_CM_REJECT);
2919 	}
2920 	return (IBT_CM_ACCEPT);
2921 }
2922 
2923 /*
2924  * ibd_rc_handle_pas_estab -- handler for connection established completion
2925  * for passive side.
2926  */
2927 static ibt_cm_status_t
2928 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
2929 {
2930 	ibt_status_t result;
2931 
2932 	switch (chan->chan_state) {
2933 		case IBD_RC_STATE_PAS_REQ_RECV:
2934 			chan->chan_state = IBD_RC_STATE_PAS_ESTAB;
2935 
2936 			result = ibt_enable_cq_notify(chan->rcq_hdl,
2937 			    IBT_NEXT_COMPLETION);
2938 			if (result != IBT_SUCCESS) {
2939 				DPRINT(40, "ibd_rc_handle_pas_estab: "
2940 				    "ibt_enable_cq_notify(rcq) "
2941 				    "failed: status %d", result);
2942 				return (IBT_CM_REJECT);
2943 			}
2944 			break;
2945 		default:
2946 			DPRINT(40, "ibd_rc_handle_pas_estab: default "
2947 			    "branch, chan_state=%d", chan->chan_state);
2948 			return (IBT_CM_REJECT);
2949 	}
2950 	return (IBT_CM_ACCEPT);
2951 }
2952 
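/*
 * CM event dispatcher for active-side (outgoing) connections; passed to
 * ibt_open_rc_channel() as oc_cm_handler in ibd_rc_connect().
 */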
2953 /* ARGSUSED */
2954 static ibt_cm_status_t
2955 ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
2956     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
2957     ibt_priv_data_len_t ret_len_max)
2958 {
2959 	ibt_cm_status_t result = IBT_CM_ACCEPT;
2960 	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
2961 	ibd_rc_chan_t *rc_chan;
2962 	ibd_state_t *state;
2963 	ibd_rc_msg_hello_t *hello_ack;
2964 	uint_t times;
2965 
2966 	switch (ibt_cm_event->cm_type) {
2967 	case IBT_CM_EVENT_REP_RCV:
2968 		ASSERT(ace->ac_chan != NULL);
2969 		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
2970 		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
2971 		DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, "
2972 		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
2973 		    ntohl(hello_ack->reserved_qpn));
2974 		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
2975 		break;
2976 
2977 	case IBT_CM_EVENT_CONN_EST:
2978 		ASSERT(ace->ac_chan != NULL);
2979 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
2980 		    "ace=%p, act_state=%d, chan=%p",
2981 		    ace, ace->ac_chan->chan_state, ace->ac_chan);
2982 		result = ibd_rc_handle_act_estab(ace);
2983 		break;
2984 
2985 	case IBT_CM_EVENT_CONN_CLOSED:
2986 		rc_chan = ace->ac_chan;
2987 		if (rc_chan == NULL) {
2988 			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
2989 			    "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
2990 			return (IBT_CM_ACCEPT);
2991 		}
2992 		state = rc_chan->state;
2993 		mutex_enter(&state->id_ac_mutex);
2994 		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
2995 		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
2996 		    != NULL) && (ace == rc_chan->ace)) {
2997 			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
2998 			ASSERT(ace->ac_mce == NULL);
2999 			INC_REF(ace, 1);
3000 			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
3001 			mutex_exit(&state->id_ac_mutex);
3002 			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3003 			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
3004 			    "reason=%d", ace, rc_chan,
3005 			    ibt_cm_event->cm_event.closed);
3006 		} else {
3007 			mutex_exit(&state->id_ac_mutex);
3008 			state->rc_act_close_simultaneous++;
3009 			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
3010 			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
3011 			    "chan_state=%d", rc_chan->chan_state);
3012 			return (IBT_CM_ACCEPT);
3013 		}
		/* wait until the send queue is empty */
3015 		times = 0;
3016 		mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3017 		mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3018 		while (((rc_chan->tx_wqe_list.dl_cnt +
3019 		    rc_chan->tx_rel_list.dl_cnt)
3020 		    != rc_chan->scq_size) && (times < 50)) {
3021 			DPRINT(40, "ibd_rc_dispatch_act_mad: dl_cnt"
3022 			    "(tx_wqe_list=%d, tx_rel_list=%d) != "
3023 			    "chan->scq_size=%d",
3024 			    rc_chan->tx_wqe_list.dl_cnt,
3025 			    rc_chan->tx_rel_list.dl_cnt,
3026 			    rc_chan->scq_size);
3027 			mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3028 			mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3029 			mutex_enter(&rc_chan->tx_poll_lock);
3030 			if (rc_chan->tx_poll_busy & IBD_CQ_POLLING) {
3031 				DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3032 				    "multiple polling threads");
3033 				mutex_exit(&rc_chan->tx_poll_lock);
3034 			} else {
3035 				rc_chan->tx_poll_busy = IBD_CQ_POLLING;
3036 				mutex_exit(&rc_chan->tx_poll_lock);
3037 				ibd_rc_drain_scq(rc_chan, rc_chan->scq_hdl);
3038 				mutex_enter(&rc_chan->tx_poll_lock);
3039 				rc_chan->tx_poll_busy = 0;
3040 				mutex_exit(&rc_chan->tx_poll_lock);
3041 			}
3042 			delay(drv_usectohz(100000));
3043 			times++;
3044 			mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3045 			mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3046 		}
3047 		mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3048 		mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3049 		rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
3050 		ibd_rc_free_chan(rc_chan);
3051 		DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3052 		    "IBT_CM_EVENT_CONN_CLOSED, ref=%x", ace->ac_ref);
3053 		mutex_enter(&state->id_ac_mutex);
3054 		ace->ac_chan = NULL;
3055 		ASSERT(ace->ac_ref != 0);
3056 		atomic_dec_32(&ace->ac_ref);
3057 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
3058 			IBD_ACACHE_INSERT_FREE(state, ace);
3059 			ace->ac_ref = 0;
3060 		} else {
3061 			ace->ac_ref |= CYCLEVAL;
3062 			state->rc_delay_ace_recycle++;
3063 		}
3064 		mutex_exit(&state->id_ac_mutex);
3065 		break;
3066 
3067 	case IBT_CM_EVENT_FAILURE:
3068 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
3069 		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
3070 		    ace, ace->ac_chan,
3071 		    ibt_cm_event->cm_event.failed.cf_code,
3072 		    ibt_cm_event->cm_event.failed.cf_msg,
3073 		    ibt_cm_event->cm_event.failed.cf_reason);
3074 		/*
3075 		 * Don't need free resource here. The resource is freed
3076 		 * at function ibd_rc_connect()
3077 		 */
3078 		break;
3079 
3080 	case IBT_CM_EVENT_MRA_RCV:
3081 		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
3082 		break;
3083 	case IBT_CM_EVENT_LAP_RCV:
3084 		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
3085 		break;
3086 	case IBT_CM_EVENT_APR_RCV:
3087 		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
3088 		break;
3089 	default:
3090 		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
3091 		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
3092 		break;
3093 	}
3094 
3095 	return (result);
3096 }
3097 
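/*
 * CM event dispatcher for passive-side (incoming) connections; registered
 * as the service handler (sd_handler) in ibd_rc_listen().
 */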
3098 /* ARGSUSED */
3099 static ibt_cm_status_t
3100 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
3101     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
3102     ibt_priv_data_len_t ret_len_max)
3103 {
3104 	ibt_cm_status_t result = IBT_CM_ACCEPT;
3105 	ibd_rc_chan_t *chan;
3106 
3107 	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
3108 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
3109 		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
3110 		/* Receive an incoming CM REQ from active side */
3111 		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
3112 		    ret_priv_data);
3113 		return (result);
3114 	}
3115 
3116 	if (ibt_cm_event->cm_channel == 0) {
3117 		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
3118 		    "ERROR ibt_cm_event->cm_channel == 0");
3119 		return (IBT_CM_REJECT);
3120 	}
3121 
3122 	chan =
3123 	    (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
3124 	if (chan == NULL) {
3125 		DPRINT(40, "ibd_rc_dispatch_pass_mad: conn == 0");
3126 		return (IBT_CM_REJECT);
3127 	}
3128 
3129 	switch (ibt_cm_event->cm_type) {
3130 	case IBT_CM_EVENT_CONN_EST:
3131 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
3132 		    "chan=%p", chan);
3133 		result = ibd_rc_handle_pas_estab(chan);
3134 		break;
3135 	case IBT_CM_EVENT_CONN_CLOSED:
3136 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
3137 		    " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
3138 		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
3139 		ibd_rc_free_chan(chan);
3140 		break;
3141 	case IBT_CM_EVENT_FAILURE:
3142 		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
3143 		    " chan=%p, code: %d, msg: %d, reason=%d", chan,
3144 		    ibt_cm_event->cm_event.failed.cf_code,
3145 		    ibt_cm_event->cm_event.failed.cf_msg,
3146 		    ibt_cm_event->cm_event.failed.cf_reason);
3147 
3148 		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
3149 		ibd_rc_free_chan(chan);
3150 		return (IBT_CM_ACCEPT);
3151 	case IBT_CM_EVENT_MRA_RCV:
3152 		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
3153 		break;
3154 	case IBT_CM_EVENT_LAP_RCV:
3155 		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
3156 		break;
3157 	case IBT_CM_EVENT_APR_RCV:
3158 		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
3159 		break;
3160 	default:
3161 		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
3162 		    "chan=%p", ibt_cm_event->cm_type, chan);
3163 		break;
3164 	}
3165 
3166 	return (result);
3167 }
3168