/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
 */
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/ib/ibtl/ibvti.h>	/* for ace->ac_dest->ud_dst_qpn */

#include <sys/ib/clients/ibd/ibd.h>

/* Per-interface tunables (for developers) */
extern uint_t ibd_rc_tx_copy_thresh;
/*
 * ibd_rc_rx_copy_thresh
 *	If (the size of the incoming buffer <= ibd_rc_rx_copy_thresh), ibd
 *	will attempt to allocate a buffer and do a bcopy of the incoming
 *	data into the allocated buffer.
 *
 * ibd_rc_rx_rwqe_thresh
 *	If (the number of available rwqes < ibd_rc_rx_rwqe_thresh), ibd
 *	will attempt to allocate a buffer and do a bcopy of the incoming
 *	data into the allocated buffer.
 */
uint_t ibd_rc_rx_copy_thresh = 0x1000;
uint_t ibd_rc_rx_rwqe_thresh = 0x200;	/* old value was 32 */

/*
 * ibd_rc_num_swqe
 *	1) Send CQ size = ibd_rc_num_swqe
 *	2) The send queue size = ibd_rc_num_swqe - 1
 *	3) Number of pre-allocated Tx buffers for ibt_post_send() =
 *	   ibd_rc_num_swqe - 1.
 */
uint_t ibd_rc_num_swqe = 0x1ff;

/*
 * ibd_rc_num_rwqe
 *	1) For non-SRQ, we pre-post ibd_rc_num_rwqe number of WRs
 *	   via ibt_post_receive() for the receive queue of each RC channel.
 *	2) For SRQ and non-SRQ, receive CQ size = ibd_rc_num_rwqe
 */
uint_t ibd_rc_num_rwqe = 0x7ff;

/*
 * For SRQ
 *	If using SRQ, we allocate ibd_rc_num_srq number of buffers (the
 *	size of each buffer is equal to the RC MTU) and post them via
 *	ibt_post_srq().
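 *
 *	Rough sizing sketch (an estimate, not a measured figure):
 *	ibd_rc_alloc_srq_copybufs() below allocates
 *	(rc_mtu + IPOIB_GRH_SIZE) * rc_srq_size bytes in one chunk, so,
 *	assuming the 40-byte IPOIB_GRH_SIZE and the default rc_mtu of
 *	65524 set in ibd_rc_get_conf(), ibd_rc_num_srq = 0x7fe implies an
 *	SRQ copybuf pool of about 65564 * 2046 bytes, i.e. roughly 128 MB
 *	of kernel memory. These tunables therefore directly bound memory
 *	consumption.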
 *
 *	ibd_rc_num_srq should not be larger than ibd_rc_num_rwqe,
 *	otherwise it will cause a bug with the following warnings:
 * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
 * NOTICE: hermon0: Device Error: EQE local work queue catastrophic error
 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff catastrophic
 * channel error
 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff completion queue
 * error
 */
uint_t ibd_rc_num_srq = 0x7fe;

boolean_t ibd_rc_enable_cq_moderation = B_TRUE;

/*
 * Send CQ moderation parameters
 */
uint_t ibd_rc_txcomp_count = 10;
uint_t ibd_rc_txcomp_usec = 300;

/*
 * Receive CQ moderation parameters
 */
uint_t ibd_rc_rxcomp_count = 4;
uint_t ibd_rc_rxcomp_usec = 10;

uint_t ibd_rc_tx_softintr = 1;

/*
 * If the number of WRs in the receive queue of each RC connection is less
 * than IBD_RC_RX_WR_THRESHOLD, we will post more receive WRs into it.
 */
#define	IBD_RC_RX_WR_THRESHOLD		0x20

/*
 * If the number of free SWQEs (or large Tx bufs) is larger than or equal
 * to IBD_RC_TX_FREE_THRESH, we will call mac_tx_update() to notify GLD to
 * continue transmitting packets.
 */
#define	IBD_RC_TX_FREE_THRESH		8

#define	IBD_RC_QPN_TO_SID(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))

/* For interop with legacy OFED */
#define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))

/* Internet Header + 64 bits of Data Datagram. Refer to RFC 792 */
#define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64

/* Functions for Reliable Connected Mode */
/* Connection Setup/Close Functions */
static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static int ibd_rc_pas_close(ibd_rc_chan_t *);
static void ibd_rc_act_close(ibd_rc_chan_t *);

static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
    ibd_rc_chan_list_t *);
static inline void ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);

/* CQ handlers */
static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);

/* Receive Functions */
static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
static void ibd_rc_srq_freemsg_cb(char *);

static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_freemsg_cb(char *);
static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);

/* Send Functions */
static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
static int ibd_rc_init_txlist(ibd_rc_chan_t *);
static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
static uint_t ibd_rc_tx_recycle(caddr_t);

void
ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
{
	ibd_rc_chan_t *rc_chan = req->rq_ptr;
	ibd_ace_t *ace;

	while (rc_chan != NULL) {
		ace = rc_chan->ace;
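		/*
		 * Drop this channel's hold on the ace below, mirroring
		 * ibd_async_rc_recycle_ace(): the ace goes back on the
		 * free list once no other reference remains, otherwise
		 * it is tagged with CYCLEVAL so the recycle completes
		 * when the last holder releases it.
		 */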
		ASSERT(ace != NULL);
		/* Close old RC channel */
		ibd_rc_act_close(rc_chan);
		mutex_enter(&state->id_ac_mutex);
		ASSERT(ace->ac_ref != 0);
		atomic_dec_32(&ace->ac_ref);
		ace->ac_chan = NULL;
		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
			IBD_ACACHE_INSERT_FREE(state, ace);
			ace->ac_ref = 0;
		} else {
			ace->ac_ref |= CYCLEVAL;
			state->rc_delay_ace_recycle++;
		}
		mutex_exit(&state->id_ac_mutex);
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}
}

void
ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
{
	ibd_ace_t *ace = req->rq_ptr;
	ibd_rc_chan_t *rc_chan;

	ASSERT(ace != NULL);
	rc_chan = ace->ac_chan;
	ASSERT(rc_chan != NULL);
	/* Close old RC channel */
	ibd_rc_act_close(rc_chan);
	mutex_enter(&state->id_ac_mutex);
	ASSERT(ace->ac_ref != 0);
	atomic_dec_32(&ace->ac_ref);
	ace->ac_chan = NULL;
	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
		IBD_ACACHE_INSERT_FREE(state, ace);
		ace->ac_ref = 0;
	} else {
		ace->ac_ref |= CYCLEVAL;
		state->rc_delay_ace_recycle++;
	}
	mutex_exit(&state->id_ac_mutex);
	mutex_enter(&state->rc_ace_recycle_lock);
	state->rc_ace_recycle = NULL;
	mutex_exit(&state->rc_ace_recycle_lock);
}

/* Simple ICMP IP Header Template */
static const ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
void
ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
{
	mblk_t *mp = req->rq_ptr;
	ibd_ace_t *ace = req->rq_ptr2;
	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
	uint_t len_needed;
	size_t msg_len;
	mblk_t *pmtu_mp;
	ushort_t sap;
	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
	/*
	 * ipha: IP header for pmtu_pkt
	 * old_ipha: IP header for old packet
	 */
	ipha_t *ipha, *old_ipha;
	icmph_t *icmph;

	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);

	if (!pullupmsg(mp, -1)) {
		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
		goto too_big_fail;
	}
	/* move to IP header. */
	mp->b_rptr += IPOIB_HDRSIZE;
	old_ipha = (ipha_t *)mp->b_rptr;

	len_needed = IPH_HDR_LENGTH(old_ipha);
	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
		    len_needed));
	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
		    + len_needed);
		len_needed += ip_hdr_length_v6(mp, ip6h);
	}
	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}

	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
		goto too_big_fail;
	}
	pmtu_mp->b_cont = mp;
	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
	    + sizeof (ipha_t) + sizeof (icmph_t);

	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;

	/* Fill IB header */
	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
	/*
	 * If the GRH is not valid, indicate to GLDv3 by setting
	 * the VerTcFlow field to 0.
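	 *
	 * The ICMP "fragmentation needed" message assembled here is looped
	 * back up the stack through mac_rx() below, as if a remote node had
	 * sent it, so that IP can shrink its path MTU for this destination.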
308 */ 309 ibha->ib_grh.ipoib_vertcflow = 0; 310 ibha->ipib_rhdr.ipoib_type = htons(sap); 311 ibha->ipib_rhdr.ipoib_mbz = 0; 312 313 /* Fill IP header */ 314 ipha = (ipha_t *)&ibha[1]; 315 *ipha = icmp_ipha; 316 ipha->ipha_src = old_ipha->ipha_dst; 317 ipha->ipha_dst = old_ipha->ipha_src; 318 ipha->ipha_ttl = old_ipha->ipha_ttl; 319 msg_len += sizeof (icmp_ipha) + sizeof (icmph_t); 320 if (msg_len > IP_MAXPACKET) { 321 ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) " 322 "> IP_MAXPACKET", (uint32_t)msg_len); 323 (void) adjmsg(mp, IP_MAXPACKET - msg_len); 324 msg_len = IP_MAXPACKET; 325 } 326 ipha->ipha_length = htons((uint16_t)msg_len); 327 ipha->ipha_hdr_checksum = 0; 328 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); 329 330 /* Fill ICMP body */ 331 icmph = (icmph_t *)&ipha[1]; 332 bzero(icmph, sizeof (icmph_t)); 333 icmph->icmph_type = ICMP_DEST_UNREACHABLE; 334 icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED; 335 icmph->icmph_du_mtu = htons(mtu); 336 icmph->icmph_checksum = 0; 337 icmph->icmph_checksum = IP_CSUM(pmtu_mp, 338 (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0); 339 340 (void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0, 341 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 342 343 DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, " 344 "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d", 345 sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl, 346 len_needed, (uint32_t)msg_len); 347 348 mac_rx(state->id_mh, state->id_rh, pmtu_mp); 349 350 mutex_enter(&ace->tx_too_big_mutex); 351 ace->tx_too_big_ongoing = B_FALSE; 352 mutex_exit(&ace->tx_too_big_mutex); 353 return; 354 355 too_big_fail: 356 /* Drop packet */ 357 freemsg(mp); 358 mutex_enter(&ace->tx_too_big_mutex); 359 ace->tx_too_big_ongoing = B_FALSE; 360 mutex_exit(&ace->tx_too_big_mutex); 361 } 362 363 void 364 ibd_rc_get_conf(ibd_state_t *state) 365 { 366 int *props; 367 uint_t num_props; 368 int instance; 369 370 instance = ddi_get_instance(state->id_dip); 371 372 /* 373 * Get the array of "enable_rc" properties from "ibd.conf" file 374 */ 375 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, state->id_dip, 376 DDI_PROP_DONTPASS, "enable_rc", &props, &num_props) 377 == DDI_PROP_SUCCESS) { 378 if (instance < num_props) { 379 if (props[instance] == 1) { 380 state->id_enable_rc = B_TRUE; 381 } else { 382 state->id_enable_rc = B_FALSE; 383 } 384 } else { 385 /* not enough properties configured */ 386 state->id_enable_rc = B_FALSE; 387 DPRINT(40, "ibd_rc_get_conf: Not enough " 388 "enable_rc values in ibd.conf," 389 " disable RC mode, instance=%d", instance); 390 } 391 392 /* free memory allocated for properties */ 393 ddi_prop_free(props); 394 } else { 395 state->id_enable_rc = B_FALSE; 396 DPRINT(30, "ibd_rc_get_conf: fail to find " 397 "enable_rc in ibd.conf, disable RC mode"); 398 } 399 400 state->rc_mtu = 65524; 401 state->rc_enable_srq = B_TRUE; 402 } 403 404 #ifdef DEBUG 405 /* 406 * ibd_rc_update_stats - update driver private kstat counters 407 * 408 * This routine will dump the internal statistics counters for ibd's 409 * Reliable Connected Mode. The current stats dump values will 410 * be sent to the kernel status area. 
 */
static int
ibd_rc_update_stats(kstat_t *ksp, int rw)
{
	ibd_state_t *state;
	ibd_rc_stat_t *ibd_rc_ksp;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	state = (ibd_state_t *)ksp->ks_private;
	ASSERT(state != NULL);
	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;

	ibd_rc_ksp->rc_rcq_invoke.value.ul = state->rc_rcq_invoke;
	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
	ibd_rc_ksp->rc_scq_invoke.value.ul = state->rc_scq_invoke;

	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;

	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
	    state->rc_xmt_fragmented_pkt;
	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;

	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
	    state->rc_xmt_buf_mac_update;

	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;

	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
	    state->rc_act_close_simultaneous;
	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;

	return (0);
}

/*
 * ibd_rc_init_stats - initialize kstat data structures
 *
 * This routine will create and initialize the driver private
 * statistics counters.
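 *
 * The named-kstat count below is computed as
 * sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), which assumes that
 * ibd_rc_stat_t is made up exclusively of kstat_named_t members.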
 */
int
ibd_rc_init_stats(ibd_state_t *state)
{
	kstat_t *ksp;
	ibd_rc_stat_t *ibd_rc_ksp;

	/*
	 * Create and init kstat
	 */
	ksp = kstat_create("ibd", ddi_get_instance(state->id_dip),
	    "statistics", "net", KSTAT_TYPE_NAMED,
	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);

	if (ksp == NULL) {
		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
		    "kernel statistics");
		return (DDI_FAILURE);
	}

	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */

	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

	/*
	 * Initialize all the statistics
	 */
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_rcq_invoke, "RC: invoke of Recv CQ "
	    "handler", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_scq_invoke, "RC: invoke of Send CQ "
	    "handler", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
	    "recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
	    "after recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
	    "#, swqe available", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
	    "ibd_send", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
	    "pkt", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
	    "state", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle,
	    "RC: delay ace recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
	    KSTAT_DATA_ULONG);

	/*
	 * Function to provide kernel stat update on demand
	 */
	ksp->ks_update = ibd_rc_update_stats;

	/*
	 * Pointer into provider's raw statistics
	 */
	ksp->ks_private = (void *)state;

	/*
	 * Add kstat to the system's kstat chain
	 */
	kstat_install(ksp);

	return (DDI_SUCCESS);
}
#endif

static ibt_status_t
ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
    boolean_t is_tx_chan)
{
	ibt_status_t result;
	ibd_rc_chan_t *chan;
	ibt_rc_chan_alloc_args_t alloc_args;
	ibt_chan_alloc_flags_t alloc_flags;
	ibt_chan_sizes_t sizes;
	ibt_cq_attr_t cq_atts;
	int rv;

	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);

	chan->state = state;
	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);

	/* Allocate IB structures for a new RC channel. */
	if (is_tx_chan) {
		chan->scq_size = ibd_rc_num_swqe;
		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
	} else {
		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
		chan->rcq_size = ibd_rc_num_rwqe;
	}
	cq_atts.cq_size = chan->scq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
	    &chan->scq_size);
	if (result != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_chan: error <%d> creating send "
		    "completion queue (size <%d>)", result, chan->scq_size);
		goto alloc_scq_err;
	}	/* if failure to alloc cq */

	if (ibd_rc_enable_cq_moderation) {
		if (ibt_modify_cq(chan->scq_hdl, ibd_rc_txcomp_count,
		    ibd_rc_txcomp_usec, 0) != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: Send CQ "
			    "interrupt moderation failed");
		}
	}

	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
	    (void *) (uintptr_t)chan);

	cq_atts.cq_size = chan->rcq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
	    &chan->rcq_size);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
		    "rx completion queue (size <%d>)", result, chan->rcq_size);
		goto alloc_rcq_err;
	}	/* if failure to alloc cq */

	if (ibd_rc_enable_cq_moderation) {
		if (ibt_modify_cq(chan->rcq_hdl, ibd_rc_rxcomp_count,
		    ibd_rc_rxcomp_usec, 0) != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: Receive CQ "
			    "interrupt moderation failed");
		}
	}
	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
	    (void *)(uintptr_t)chan);

	if (is_tx_chan) {
		chan->is_tx_chan = B_TRUE;
		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: "
			    "ibd_rc_init_txlist failed");
			goto init_txlist_err;
		}
		if (ibd_rc_tx_softintr == 1) {
			if ((rv = ddi_add_softintr(state->id_dip,
			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
			    DDI_SUCCESS) {
				DPRINT(10, "ibd_rc_alloc_chan: failed in "
				    "ddi_add_softintr(scq_softintr), ret=%d",
				    rv);
				goto alloc_softintr_err;
			}
		}
	} else {
		chan->is_tx_chan = B_FALSE;
	}

	/*
	 * enable completions
	 */
	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
		    "(scq) failed: status %d\n", result);
		goto alloc_scq_enable_err;
	}

	/* We will enable chan->rcq_hdl later. */

	/* alloc a RC channel */
	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
	bzero(&sizes, sizeof (ibt_chan_sizes_t));

	alloc_args.rc_flags = IBT_WR_SIGNALED;
	alloc_args.rc_control = IBT_CEP_NO_FLAGS;

	alloc_args.rc_scq = chan->scq_hdl;
	alloc_args.rc_rcq = chan->rcq_hdl;
	alloc_args.rc_pd = state->id_pd_hdl;

	alloc_args.rc_hca_port_num = state->id_port;
	alloc_args.rc_clone_chan = NULL;

	/* scatter/gather */
	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;

	/*
	 * The number of SGL elements on the receive side should be 1:
	 * the ibd driver allocates one whole, contiguous buffer for
	 * each ibt_post_recv().
	 */
	alloc_args.rc_sizes.cs_rq_sgl = 1;

	/* The send queue size and the receive queue size */
	alloc_args.rc_sizes.cs_sq = chan->scq_size;
	alloc_args.rc_sizes.cs_rq = chan->rcq_size;

	if (state->id_hca_res_lkey_capab) {
		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
	} else {
		DPRINT(40, "ibd_rc_alloc_chan: reserved lkey not supported");
	}

	if (state->rc_enable_srq) {
		alloc_flags = IBT_ACHAN_USES_SRQ;
		alloc_args.rc_srq = state->rc_srq_hdl;
	} else {
		alloc_flags = IBT_ACHAN_NO_FLAGS;
	}

	result = ibt_alloc_rc_channel(state->id_hca_hdl,
	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: "
		    "ibt_alloc_rc_channel failed: <%d>", result);
		goto alloc_scq_enable_err;
	}

	*ret_chan = chan;
	return (IBT_SUCCESS);

alloc_scq_enable_err:
	if (is_tx_chan) {
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
	}
alloc_softintr_err:
	if (is_tx_chan) {
		ibd_rc_fini_txlist(chan);
	}
init_txlist_err:
	(void) ibt_free_cq(chan->rcq_hdl);
alloc_rcq_err:
	(void) ibt_free_cq(chan->scq_hdl);
alloc_scq_err:
	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
	kmem_free(chan, sizeof (ibd_rc_chan_t));
	return (result);
}

static void
ibd_rc_free_chan(ibd_rc_chan_t *chan)
{
	ibt_status_t ret;

	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */

	if (chan->chan_hdl != NULL) {
		ret = ibt_free_channel(chan->chan_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_free_chan: ibt_free_channel "
			    "failed, chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->chan_hdl = NULL;
	}

	if (chan->rcq_hdl != NULL) {
		ret = ibt_free_cq(chan->rcq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(rcq) "
			    "failed, chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->rcq_hdl = NULL;
	}

	if (chan->scq_hdl != NULL) {
		ret = ibt_free_cq(chan->scq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_free_chan: ibt_free_cq(scq) "
			    "failed, chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->scq_hdl = NULL;
	}

	/* Free buffers */
	if (chan->is_tx_chan) {
		ibd_rc_fini_txlist(chan);
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
	} else {
		if (!chan->state->rc_enable_srq) {
			ibd_rc_fini_rxlist(chan);
		}
	}

	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);

	/*
	 * If it is a passive channel, the caller must make sure it has
	 * already been removed from chan->state->rc_pass_chan_list.
	 */
	kmem_free(chan, sizeof (ibd_rc_chan_t));
}

/* Add a RC channel */
static inline void
ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
	mutex_enter(&list->chan_list_mutex);
	if (list->chan_list == NULL) {
		list->chan_list = chan;
	} else {
		chan->next = list->chan_list;
		list->chan_list = chan;
	}
	mutex_exit(&list->chan_list_mutex);
}

/* Remove a RC channel */
static inline void
ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
	ibd_rc_chan_t *pre_chan;

	mutex_enter(&list->chan_list_mutex);
	if (list->chan_list == chan) {
		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
		    " in chan_list", chan);
		list->chan_list = chan->next;
	} else {
		pre_chan = list->chan_list;
		while (pre_chan != NULL) {
			if (pre_chan->next == chan) {
				DPRINT(30, "ibd_rc_rm_from_chan_list"
				    "(middle): found chan(%p) in "
				    "rc_pass_chan_list", chan);
				pre_chan->next = chan->next;
				break;
			}
			pre_chan = pre_chan->next;
		}
	}
	mutex_exit(&list->chan_list_mutex);
}

static inline ibd_rc_chan_t *
ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
{
	ibd_rc_chan_t *rc_chan;

	mutex_enter(&list->chan_list_mutex);
	rc_chan = list->chan_list;
	if (rc_chan != NULL) {
		list->chan_list = rc_chan->next;
	}
	mutex_exit(&list->chan_list_mutex);
	return (rc_chan);
}

static int
ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
{
	ibt_mr_attr_t mem_attr;
	uint_t rc_rx_bufs_sz;

	/*
	 * Allocate one big chunk for all regular rx copy bufs
	 */
	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;

	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
	    sizeof (ibd_rwqe_t), KM_SLEEP);

	/*
	 * Do one memory registration on the entire rxbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
	mem_attr.mr_len = rc_rx_bufs_sz;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
	    != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
		    "failed");
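		/* Unwind both allocations above before failing */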
		kmem_free(state->rc_srq_rwqes,
		    state->rc_srq_size * sizeof (ibd_rwqe_t));
		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
		state->rc_srq_rx_bufs = NULL;
		state->rc_srq_rwqes = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static void
ibd_rc_free_srq_copybufs(ibd_state_t *state)
{
	uint_t rc_rx_buf_sz;

	/*
	 * state->rc_mtu must not change between the calls to
	 * ibd_rc_alloc_srq_copybufs() and ibd_rc_free_srq_copybufs().
	 */
	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;

	/*
	 * Unregister rxbuf mr
	 */
	if (ibt_deregister_mr(state->id_hca_hdl,
	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
		    " failed");
	}
	state->rc_srq_rx_mr_hdl = NULL;

	/*
	 * Free rxbuf memory
	 */
	kmem_free(state->rc_srq_rwqes,
	    state->rc_srq_size * sizeof (ibd_rwqe_t));
	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
	state->rc_srq_rwqes = NULL;
	state->rc_srq_rx_bufs = NULL;
}

/*
 * Allocate and post a certain number of SRQ receive buffers and WRs.
 */
int
ibd_rc_init_srq_list(ibd_state_t *state)
{
	ibd_rwqe_t *rwqe;
	ibt_lkey_t lkey;
	int i;
	uint_t len;
	uint8_t *bufaddr;
	ibt_srq_sizes_t srq_sizes;
	ibt_srq_sizes_t srq_real_sizes;
	ibt_status_t ret;

	srq_sizes.srq_sgl_sz = 1;
	srq_sizes.srq_wr_sz = ibd_rc_num_srq;
	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
	if (ret != IBT_SUCCESS) {
		DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
		    " req_sgl_sz=%d, req_wr_sz=0x%x, ret=%d",
		    srq_sizes.srq_sgl_sz, srq_sizes.srq_wr_sz, ret);
		return (DDI_FAILURE);
	}

	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
		ret = ibt_free_srq(state->rc_srq_hdl);
		if (ret != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_init_srq_list: "
			    "ibt_free_srq fail, ret=%d", ret);
		}
		return (DDI_FAILURE);
	}

	/*
	 * Allocate and setup the rwqe list
	 */
	lkey = state->rc_srq_rx_mr_desc.md_lkey;
	rwqe = state->rc_srq_rwqes;
	bufaddr = state->rc_srq_rx_bufs;
	len = state->rc_mtu + IPOIB_GRH_SIZE;
	state->rc_srq_rwqe_list.dl_cnt = 0;
	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
		rwqe->w_state = state;
		rwqe->w_freeing_wqe = B_FALSE;
		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;

		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
		    &rwqe->w_freemsg_cb)) == NULL) {
			DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
			ibd_rc_fini_srq_list(state);
			return (DDI_FAILURE);
		}

		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
		/* Leave IPOIB_GRH_SIZE space */
		rwqe->rwqe_copybuf.ic_sgl.ds_va =
		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
		rwqe->w_rwr.wr_nds = 1;
		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
		(void) ibd_rc_post_srq(state, rwqe);
	}

	return (DDI_SUCCESS);
}

/*
 * Free the statically allocated Rx buffer list for SRQ.
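 *
 * The SRQ itself is freed first, so the HCA can no longer DMA into the
 * buffers; mblks still held by the network stack are then torn down with
 * w_freeing_wqe set, which stops their free callbacks from reposting.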
 */
void
ibd_rc_fini_srq_list(ibd_state_t *state)
{
	ibd_rwqe_t *rwqe;
	int i;
	ibt_status_t ret;

	ret = ibt_free_srq(state->rc_srq_hdl);
	if (ret != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
		    "ibt_free_srq fail, ret=%d", ret);
	}

	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
	rwqe = state->rc_srq_rwqes;
	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
		if (rwqe->rwqe_im_mblk != NULL) {
			rwqe->w_freeing_wqe = B_TRUE;
			freemsg(rwqe->rwqe_im_mblk);
		}
	}
	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);

	ibd_rc_free_srq_copybufs(state);
}

/*
 * Free an allocated recv wqe.
 */
void
ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	/*
	 * desballoc() failed (no memory) or the posting of the rwqe failed.
	 *
	 * This rwqe is placed on a free list so that it
	 * can be reinstated in the future.
	 *
	 * NOTE: no code currently exists to reinstate
	 * these "lost" rwqes.
	 */
	mutex_enter(&state->rc_srq_free_list.dl_mutex);
	state->rc_srq_free_list.dl_cnt++;
	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
	mutex_exit(&state->rc_srq_free_list.dl_mutex);
}

static void
ibd_rc_srq_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	ASSERT(state->rc_enable_srq);

	/*
	 * If the wqe is being destructed, do not attempt recycling.
	 */
	if (rwqe->w_freeing_wqe == B_TRUE) {
		return;
	}

	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);

	/*
	 * The upper layer has released the held mblk, so we have
	 * no more use for keeping the old pointer in
	 * our rwqe.
	 */
	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
		ibd_rc_srq_free_rwqe(state, rwqe);
		return;
	}

	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
		ibd_rc_srq_free_rwqe(state, rwqe);
		return;
	}

	atomic_add_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding, -1);
}

/*
 * Post a rwqe to the hardware and add it to the Rx list.
 */
static int
ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	/*
	 * Increment dl_cnt before posting the receive WR: dl_cnt must be
	 * updated before the corresponding ibd_rc_process_rx() runs.
	 */
	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
	atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1);
	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
	    IBT_SUCCESS) {
		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
		DPRINT(40, "ibd_rc_post_srq: ibt_post_srq() failed");
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Post a rwqe to the hardware and add it to the Rx list.
 */
static int
ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
{
	/*
	 * Increment dl_cnt before posting the receive WR: dl_cnt must be
	 * updated before the corresponding ibd_rc_process_rx() runs.
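	 *
	 * (Were the increment done after ibt_post_recv(), the completion
	 * handler could fire first and decrement dl_cnt below its true
	 * value.)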
	 */
	atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1);
	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
	    IBT_SUCCESS) {
		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
		DPRINT(40, "ibd_rc_post_rwqe: failed in ibt_post_recv()");
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

static int
ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
{
	ibd_state_t *state = chan->state;
	ibt_mr_attr_t mem_attr;
	uint_t rc_rx_bufs_sz;

	/*
	 * Allocate one big chunk for all regular rx copy bufs
	 */
	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;

	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
	    sizeof (ibd_rwqe_t), KM_SLEEP);

	/*
	 * Do one memory registration on the entire rxbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
	mem_attr.mr_len = rc_rx_bufs_sz;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_rx_copybufs: ibt_register_mr failed");
		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
		chan->rx_bufs = NULL;
		chan->rx_rwqes = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static void
ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
{
	ibd_state_t *state = chan->state;
	uint_t rc_rx_buf_sz;

	ASSERT(!state->rc_enable_srq);
	ASSERT(chan->rx_rwqes != NULL);
	ASSERT(chan->rx_bufs != NULL);

	/*
	 * state->rc_mtu must not change between the calls to
	 * ibd_rc_alloc_rx_copybufs() and ibd_rc_free_rx_copybufs().
	 */
	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;

	/*
	 * Unregister rxbuf mr
	 */
	if (ibt_deregister_mr(state->id_hca_hdl,
	    chan->rx_mr_hdl) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
	}
	chan->rx_mr_hdl = NULL;

	/*
	 * Free rxbuf memory
	 */
	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
	chan->rx_rwqes = NULL;

	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
	chan->rx_bufs = NULL;
}

/*
 * Post a certain number of receive buffers and WRs on a RC channel.
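 *
 * This is the non-SRQ path: it mirrors ibd_rc_init_srq_list() above,
 * except that each channel owns a private buffer pool and the WRs are
 * posted with ibt_post_recv() on the channel rather than ibt_post_srq().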
 */
static int
ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
{
	ibd_state_t *state = chan->state;
	ibd_rwqe_t *rwqe;
	ibt_lkey_t lkey;
	int i;
	uint_t len;
	uint8_t *bufaddr;

	ASSERT(!state->rc_enable_srq);
	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/*
	 * Allocate and setup the rwqe list
	 */
	lkey = chan->rx_mr_desc.md_lkey;
	rwqe = chan->rx_rwqes;
	bufaddr = chan->rx_bufs;
	len = state->rc_mtu + IPOIB_GRH_SIZE;
	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
		rwqe->w_state = state;
		rwqe->w_chan = chan;
		rwqe->w_freeing_wqe = B_FALSE;
		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;

		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
		    &rwqe->w_freemsg_cb)) == NULL) {
			DPRINT(40, "ibd_rc_init_rxlist: desballoc() failed");
			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
			ibd_rc_fini_rxlist(chan);
			return (DDI_FAILURE);
		}

		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
		rwqe->rwqe_copybuf.ic_sgl.ds_va =
		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
		rwqe->w_rwr.wr_nds = 1;
		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
		(void) ibd_rc_post_rwqe(chan, rwqe);
	}

	return (DDI_SUCCESS);
}

/*
 * Free the statically allocated, per-channel Rx buffer list (used when
 * SRQ is disabled).
 */
static void
ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
{
	ibd_rwqe_t *rwqe;
	int i;

	if (chan->rx_bufs == NULL) {
		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
		return;
	}

	/* bufs_outstanding must be 0 */
	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));

	mutex_enter(&chan->rx_wqe_list.dl_mutex);
	rwqe = chan->rx_rwqes;
	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
		if (rwqe->rwqe_im_mblk != NULL) {
			rwqe->w_freeing_wqe = B_TRUE;
			freemsg(rwqe->rwqe_im_mblk);
		}
	}
	mutex_exit(&chan->rx_wqe_list.dl_mutex);

	ibd_rc_free_rx_copybufs(chan);
}

/*
 * Free an allocated recv wqe.
 */
static void
ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
{
	/*
	 * desballoc() failed (no memory) or the posting of the rwqe failed.
	 *
	 * This rwqe is placed on a free list so that it
	 * can be reinstated in the future.
	 *
	 * NOTE: no code currently exists to reinstate
	 * these "lost" rwqes.
	 */
	mutex_enter(&chan->rx_free_list.dl_mutex);
	chan->rx_free_list.dl_cnt++;
	rwqe->rwqe_next = chan->rx_free_list.dl_head;
	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
	mutex_exit(&chan->rx_free_list.dl_mutex);
}

/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD.
 */
static void
ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ibd_state_t *state = chan->state;
	ib_header_info_t *phdr;
	ipoib_hdr_t *ipibp;
	mblk_t *mp;
	mblk_t *mpc;
	int rxcnt;
	ip6_t *ip6h;
	int len;

	/*
	 * Track number handed to upper layer, and number still
	 * available to receive packets.
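	 *
	 * The decremented count (rxcnt) feeds the copy-vs-transfer choice
	 * below: the DMA buffer is only handed up while plenty of rwqes
	 * remain posted.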
	 */
	if (state->rc_enable_srq) {
		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
	} else {
		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
	}

	/*
	 * It cannot be an IBA multicast packet.
	 */
	ASSERT(!(wc->wc_flags & IBT_WC_GRH_PRESENT));

#ifdef DEBUG
	if (rxcnt < ibd_rc_rx_rwqe_thresh) {
		state->rc_rwqe_short++;
	}
#endif

	/*
	 * Hand the rwqe's mblk straight up the stack (transfer mode) when
	 * enough rwqes remain posted and the packet is large; otherwise
	 * copy the data into a fresh allocb() buffer and repost the rwqe
	 * immediately (copy mode).
	 */
	if ((rxcnt >= ibd_rc_rx_rwqe_thresh) &&
	    (wc->wc_bytes_xfer > ibd_rc_rx_copy_thresh)) {
		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
		atomic_inc_64(&state->rc_rcv_trans_pkt);

		/*
		 * Record how many rwqes have been occupied by the upper
		 * network layer
		 */
		if (state->rc_enable_srq) {
			atomic_add_32(&state->rc_srq_rwqe_list.
			    dl_bufs_outstanding, 1);
		} else {
			atomic_add_32(&chan->rx_wqe_list.
			    dl_bufs_outstanding, 1);
		}
		mp = rwqe->rwqe_im_mblk;
	} else {
		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
		atomic_inc_64(&state->rc_rcv_copy_pkt);

		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
		    BPRI_HI)) == NULL) {	/* no memory */
			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
			state->rc_rcv_alloc_fail++;
			if (state->rc_enable_srq) {
				if (ibd_rc_post_srq(state, rwqe) ==
				    DDI_FAILURE) {
					ibd_rc_srq_free_rwqe(state, rwqe);
				}
			} else {
				if (ibd_rc_post_rwqe(chan, rwqe) ==
				    DDI_FAILURE) {
					ibd_rc_free_rwqe(chan, rwqe);
				}
			}
			return;
		}

		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);

		if (state->rc_enable_srq) {
			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
				ibd_rc_srq_free_rwqe(state, rwqe);
			}
		} else {
			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
				ibd_rc_free_rwqe(chan, rwqe);
			}
		}
	}

	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	phdr = (ib_header_info_t *)mp->b_rptr;
	phdr->ib_grh.ipoib_vertcflow = 0;
	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
	    sizeof (ipoib_mac_t));
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer + IPOIB_GRH_SIZE;

	/*
	 * Can RC mode in IB guarantee its checksum correctness?
	 *
	 *	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
	 *	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
	 */

	/*
	 * Make sure this is NULL or we're in trouble.
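	 *
	 * (b_next is used below to chain completed mblks for a batched
	 * mac_rx() hand-off, so a stale pointer here would corrupt the
	 * chain.)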
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_rc_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * Add this mp to the list of processed mp's to send to
	 * the nw layer
	 */
	if (state->rc_enable_srq) {
		mutex_enter(&state->rc_rx_lock);
		if (state->rc_rx_mp) {
			ASSERT(state->rc_rx_mp_tail != NULL);
			state->rc_rx_mp_tail->b_next = mp;
		} else {
			ASSERT(state->rc_rx_mp_tail == NULL);
			state->rc_rx_mp = mp;
		}

		state->rc_rx_mp_tail = mp;
		state->rc_rx_mp_len++;

		if (state->rc_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
			mpc = state->rc_rx_mp;

			state->rc_rx_mp = NULL;
			state->rc_rx_mp_tail = NULL;
			state->rc_rx_mp_len = 0;
			mutex_exit(&state->rc_rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&state->rc_rx_lock);
		}
	} else {
		mutex_enter(&chan->rx_lock);
		if (chan->rx_mp) {
			ASSERT(chan->rx_mp_tail != NULL);
			chan->rx_mp_tail->b_next = mp;
		} else {
			ASSERT(chan->rx_mp_tail == NULL);
			chan->rx_mp = mp;
		}

		chan->rx_mp_tail = mp;
		chan->rx_mp_len++;

		if (chan->rx_mp_len >= IBD_MAX_RX_MP_LEN) {
			mpc = chan->rx_mp;

			chan->rx_mp = NULL;
			chan->rx_mp_tail = NULL;
			chan->rx_mp_len = 0;
			mutex_exit(&chan->rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&chan->rx_lock);
		}
	}
}

/*
 * Callback code invoked from STREAMS when the recv data buffer is free
 * for recycling.
 */
static void
ibd_rc_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_rc_chan_t *chan = rwqe->w_chan;
	ibd_state_t *state = rwqe->w_state;

	/*
	 * If the wqe is being destructed, do not attempt recycling.
	 */
	if (rwqe->w_freeing_wqe == B_TRUE) {
		return;
	}

	ASSERT(!state->rc_enable_srq);
	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);

	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
		ibd_rc_free_rwqe(chan, rwqe);
		return;
	}

	/*
	 * Post back to h/w. We could actually have more than
	 * id_num_rwqe WQEs on the list if there were multiple
	 * ibd_freemsg_cb() calls outstanding (since the lock is
	 * not held the entire time). This will start getting
	 * corrected over subsequent ibd_freemsg_cb() calls.
	 */
	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
		ibd_rc_free_rwqe(chan, rwqe);
		return;
	}
	atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1);
}

/*
 * Common code for interrupt handling as well as for polling
 * for all completed wqe's while detaching.
 */
static void
ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
{
	ibd_wqe_t *wqe;
	ibt_wc_t *wc, *wcs;
	uint_t numwcs, real_numwcs;
	int i;

	wcs = chan->rx_wc;
	numwcs = IBD_RC_MAX_CQ_WC;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				chan->state->rc_rcq_err++;
				/*
				 * Channel being torn down.
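				 * Flushed completions are skipped below so
				 * that the Rx handler cannot repost buffers
				 * while the channel is being deinitialized.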
				 */
				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
				    "SUCC, chan=%p", wc->wc_status, chan);
				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
					/*
					 * Do not invoke Rx handler because
					 * it might add buffers to the Rx pool
					 * when we are trying to deinitialize.
					 */
					continue;
				}
			}
			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
		}
	}
}

/* Receive CQ handler */
/* ARGSUSED */
static void
ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
	ibd_state_t *state = chan->state;

	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);

	/*
	 * Poll for completed entries; the CQ will not interrupt any
	 * more for incoming (or transmitted) packets.
	 */
	state->rc_rcq_invoke++;
	ibd_rc_poll_rcq(chan, chan->rcq_hdl);

	/*
	 * Now enable CQ notifications; all packets that arrive now
	 * (or complete transmission) will cause new interrupts.
	 */
	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
	    IBT_SUCCESS) {
		/*
		 * We do not expect a failure here.
		 */
		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
	}

	/*
	 * Repoll to catch all packets that might have arrived after
	 * we finished the first poll loop and before interrupts got
	 * armed.
	 */
	ibd_rc_poll_rcq(chan, chan->rcq_hdl);

	if (state->rc_enable_srq) {
		mutex_enter(&state->rc_rx_lock);

		if (state->rc_rx_mp != NULL) {
			mblk_t *mpc;
			mpc = state->rc_rx_mp;

			state->rc_rx_mp = NULL;
			state->rc_rx_mp_tail = NULL;
			state->rc_rx_mp_len = 0;

			mutex_exit(&state->rc_rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&state->rc_rx_lock);
		}
	} else {
		mutex_enter(&chan->rx_lock);

		if (chan->rx_mp != NULL) {
			mblk_t *mpc;
			mpc = chan->rx_mp;

			chan->rx_mp = NULL;
			chan->rx_mp_tail = NULL;
			chan->rx_mp_len = 0;

			mutex_exit(&chan->rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&chan->rx_lock);
		}
	}
}

/*
 * Allocate the statically allocated Tx buffer list.
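 *
 * Rough sizing sketch (an estimate from the defaults above, not a
 * measured figure): (ibd_rc_num_swqe - 1) large bufs of rc_mtu bytes
 * each, i.e. 0x1fe * 65524 bytes, roughly 32 MB shared by all of the
 * interface's RC channels.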
 */
int
ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
{
	ibd_rc_tx_largebuf_t *lbufp;
	ibd_rc_tx_largebuf_t *tail;
	uint8_t *memp;
	ibt_mr_attr_t mem_attr;
	uint32_t num_swqe;
	size_t mem_size;
	int i;

	num_swqe = ibd_rc_num_swqe - 1;

	/*
	 * Allocate one big chunk for all Tx large copy bufs
	 */
	/* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */
	mem_size = num_swqe * state->rc_mtu;
	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);

	mem_attr.mr_len = mem_size;
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
		    "failed");
		kmem_free(state->rc_tx_mr_bufs, mem_size);
		state->rc_tx_mr_bufs = NULL;
		return (DDI_FAILURE);
	}

	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);

	/*
	 * Set up the buf chain
	 */
	memp = state->rc_tx_mr_bufs;
	mutex_enter(&state->rc_tx_large_bufs_lock);
	lbufp = state->rc_tx_largebuf_desc_base;
	for (i = 0; i < num_swqe; i++) {
		lbufp->lb_buf = memp;
		lbufp->lb_next = lbufp + 1;

		tail = lbufp;

		memp += state->rc_mtu;
		lbufp++;
	}
	tail->lb_next = NULL;

	/*
	 * Set up the buffer information in ibd state
	 */
	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
	state->rc_tx_largebuf_nfree = num_swqe;
	mutex_exit(&state->rc_tx_large_bufs_lock);
	return (DDI_SUCCESS);
}

void
ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
{
	uint32_t num_swqe;

	num_swqe = ibd_rc_num_swqe - 1;

	if (ibt_deregister_mr(state->id_hca_hdl,
	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
		    "failed");
	}
	state->rc_tx_mr_hdl = NULL;

	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
	state->rc_tx_mr_bufs = NULL;

	kmem_free(state->rc_tx_largebuf_desc_base,
	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
	state->rc_tx_largebuf_desc_base = NULL;
}

static int
ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
{
	ibt_mr_attr_t mem_attr;
	ibd_state_t *state;

	state = chan->state;
	ASSERT(state != NULL);

	/*
	 * Allocate one big chunk for all regular tx copy bufs
	 */
	mem_attr.mr_len = chan->scq_size * ibd_rc_tx_copy_thresh;

	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);

	/*
	 * Do one memory registration on the entire txbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
		ASSERT(mem_attr.mr_len ==
		    chan->scq_size * ibd_rc_tx_copy_thresh);
		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
		chan->tx_mr_bufs = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Allocate the statically allocated Tx buffer list.
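 *
 * Each swqe is paired with its own slice of the copybuf chunk: swqe i's
 * SGL points at tx_mr_bufs + i * ibd_rc_tx_copy_thresh, so a copy-path
 * send never needs a per-packet memory registration.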
 */
static int
ibd_rc_init_txlist(ibd_rc_chan_t *chan)
{
	ibd_swqe_t *swqe;
	int i;
	ibt_lkey_t lkey;

	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/*
	 * Allocate and setup the swqe list
	 */
	lkey = chan->tx_mr_desc.md_lkey;
	chan->tx_wqes = kmem_zalloc(chan->scq_size *
	    sizeof (ibd_swqe_t), KM_SLEEP);
	swqe = chan->tx_wqes;
	for (i = 0; i < chan->scq_size; i++, swqe++) {
		swqe->swqe_next = NULL;
		swqe->swqe_im_mblk = NULL;

		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
		swqe->swqe_copybuf.ic_sgl.ds_len = 0;	/* set in send */

		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
		    (chan->tx_mr_bufs + i * ibd_rc_tx_copy_thresh);
		swqe->w_swr.wr_trans = IBT_RC_SRV;

		/* Add to list */
		mutex_enter(&chan->tx_wqe_list.dl_mutex);
		chan->tx_wqe_list.dl_cnt++;
		swqe->swqe_next = chan->tx_wqe_list.dl_head;
		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
		mutex_exit(&chan->tx_wqe_list.dl_mutex);
	}

	return (DDI_SUCCESS);
}

/*
 * Free the statically allocated Tx buffer list.
 */
static void
ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
{
	if (chan->tx_mr_hdl != NULL) {
		if (ibt_deregister_mr(chan->state->id_hca_hdl,
		    chan->tx_mr_hdl) != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
			    "failed");
		}
		chan->tx_mr_hdl = NULL;
	}

	if (chan->tx_mr_bufs != NULL) {
		kmem_free(chan->tx_mr_bufs, chan->scq_size *
		    ibd_rc_tx_copy_thresh);
		chan->tx_mr_bufs = NULL;
	}

	if (chan->tx_wqes != NULL) {
		kmem_free(chan->tx_wqes, chan->scq_size *
		    sizeof (ibd_swqe_t));
		chan->tx_wqes = NULL;
	}
}

/*
 * Acquire a send wqe from the free list.
 * Returns the acquired swqe pointer, or NULL if none is available.
 */
ibd_swqe_t *
ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
{
	ibd_swqe_t *wqe;

	mutex_enter(&chan->tx_rel_list.dl_mutex);
	if (chan->tx_rel_list.dl_head != NULL) {
		/* transfer tx_rel_list to tx_wqe_list */
		chan->tx_wqe_list.dl_head =
		    chan->tx_rel_list.dl_head;
		chan->tx_wqe_list.dl_cnt =
		    chan->tx_rel_list.dl_cnt;
		chan->tx_wqe_list.dl_pending_sends = B_FALSE;

		/* clear tx_rel_list */
		chan->tx_rel_list.dl_head = NULL;
		chan->tx_rel_list.dl_cnt = 0;
		mutex_exit(&chan->tx_rel_list.dl_mutex);

		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
		chan->tx_wqe_list.dl_cnt -= 1;
		chan->tx_wqe_list.dl_head = wqe->swqe_next;
	} else {	/* no free swqe */
		mutex_exit(&chan->tx_rel_list.dl_mutex);
		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
		wqe = NULL;
	}
	return (wqe);
}

/*
 * Release send wqe back into free list.
 */
static void
ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
{
	/*
	 * Add back on Tx list for reuse.
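	 *
	 * Completed swqes accumulate here on tx_rel_list; the send path
	 * refills its working list from it wholesale in
	 * ibd_rc_acquire_swqes() above, so acquire and release mostly
	 * touch different lists.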
1918 */ 1919 swqe->swqe_next = NULL; 1920 mutex_enter(&chan->tx_rel_list.dl_mutex); 1921 chan->tx_rel_list.dl_pending_sends = B_FALSE; 1922 swqe->swqe_next = chan->tx_rel_list.dl_head; 1923 chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe); 1924 chan->tx_rel_list.dl_cnt++; 1925 mutex_exit(&chan->tx_rel_list.dl_mutex); 1926 } 1927 1928 void 1929 ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node) 1930 { 1931 uint_t i; 1932 uint_t num_posted; 1933 uint_t n_wrs; 1934 ibt_status_t ibt_status; 1935 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 1936 ibd_swqe_t *tx_head, *elem; 1937 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 1938 1939 /* post the one request, then check for more */ 1940 ibt_status = ibt_post_send(chan->chan_hdl, 1941 &node->w_swr, 1, NULL); 1942 if (ibt_status != IBT_SUCCESS) { 1943 ibd_print_warn(chan->state, "ibd_post_send: " 1944 "posting one wr failed: ret=%d", ibt_status); 1945 ibd_rc_tx_cleanup(node); 1946 } 1947 1948 tx_head = NULL; 1949 for (;;) { 1950 if (tx_head == NULL) { 1951 mutex_enter(&chan->tx_post_lock); 1952 tx_head = chan->tx_head; 1953 if (tx_head == NULL) { 1954 chan->tx_busy = 0; 1955 mutex_exit(&chan->tx_post_lock); 1956 return; 1957 } 1958 chan->tx_head = NULL; 1959 mutex_exit(&chan->tx_post_lock); 1960 } 1961 1962 /* 1963 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 1964 * at a time if possible, and keep posting them. 1965 */ 1966 for (n_wrs = 0, elem = tx_head; 1967 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 1968 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 1969 nodes[n_wrs] = elem; 1970 wrs[n_wrs] = elem->w_swr; 1971 } 1972 tx_head = elem; 1973 1974 ASSERT(n_wrs != 0); 1975 1976 /* 1977 * If posting fails for some reason, we'll never receive 1978 * completion intimation, so we'll need to cleanup. But 1979 * we need to make sure we don't clean up nodes whose 1980 * wrs have been successfully posted. We assume that the 1981 * hca driver returns on the first failure to post and 1982 * therefore the first 'num_posted' entries don't need 1983 * cleanup here. 1984 */ 1985 num_posted = 0; 1986 ibt_status = ibt_post_send(chan->chan_hdl, 1987 wrs, n_wrs, &num_posted); 1988 if (ibt_status != IBT_SUCCESS) { 1989 ibd_print_warn(chan->state, "ibd_post_send: " 1990 "posting multiple wrs failed: " 1991 "requested=%d, done=%d, ret=%d", 1992 n_wrs, num_posted, ibt_status); 1993 1994 for (i = num_posted; i < n_wrs; i++) 1995 ibd_rc_tx_cleanup(nodes[i]); 1996 } 1997 } 1998 } 1999 2000 /* 2001 * Common code that deals with clean ups after a successful or 2002 * erroneous transmission attempt. 2003 */ 2004 void 2005 ibd_rc_tx_cleanup(ibd_swqe_t *swqe) 2006 { 2007 ibd_ace_t *ace = swqe->w_ahandle; 2008 ibd_state_t *state; 2009 2010 ASSERT(ace != NULL); 2011 ASSERT(ace->ac_chan != NULL); 2012 2013 state = ace->ac_chan->state; 2014 2015 /* 2016 * If this was a dynamic registration in ibd_send(), 2017 * deregister now. 
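	 *
	 * A sketch of how the cleanup below is keyed off w_buftype
	 * (an illustration of this function, not new behavior):
	 *
	 *	IBD_WQE_MAPPED     - ibd_unmap_mem(), then freemsg()
	 *	IBD_WQE_RC_COPYBUF - put the large Tx buf back on the
	 *	                     rc_tx_largebuf free list
	 *	other copybuf sends - nothing to unmap or free; the
	 *	                     pre-registered copybuf is reused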
2018 */ 2019 if (swqe->swqe_im_mblk != NULL) { 2020 ASSERT(swqe->w_buftype == IBD_WQE_MAPPED); 2021 if (swqe->w_buftype == IBD_WQE_MAPPED) { 2022 ibd_unmap_mem(state, swqe); 2023 } 2024 freemsg(swqe->swqe_im_mblk); 2025 swqe->swqe_im_mblk = NULL; 2026 } else { 2027 ASSERT(swqe->w_buftype != IBD_WQE_MAPPED); 2028 } 2029 2030 if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) { 2031 ibd_rc_tx_largebuf_t *lbufp; 2032 2033 lbufp = swqe->w_rc_tx_largebuf; 2034 ASSERT(lbufp != NULL); 2035 2036 mutex_enter(&state->rc_tx_large_bufs_lock); 2037 lbufp->lb_next = state->rc_tx_largebuf_free_head; 2038 state->rc_tx_largebuf_free_head = lbufp; 2039 state->rc_tx_largebuf_nfree ++; 2040 mutex_exit(&state->rc_tx_large_bufs_lock); 2041 swqe->w_rc_tx_largebuf = NULL; 2042 } 2043 2044 2045 /* 2046 * Release the send wqe for reuse. 2047 */ 2048 ibd_rc_release_swqe(ace->ac_chan, swqe); 2049 2050 /* 2051 * Drop the reference count on the AH; it can be reused 2052 * now for a different destination if there are no more 2053 * posted sends that will use it. This can be eliminated 2054 * if we can always associate each Tx buffer with an AH. 2055 * The ace can be null if we are cleaning up from the 2056 * ibd_send() error path. 2057 */ 2058 ibd_dec_ref_ace(state, ace); 2059 } 2060 2061 void 2062 ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl) 2063 { 2064 ibd_state_t *state = chan->state; 2065 ibd_wqe_t *wqe; 2066 ibt_wc_t *wc, *wcs; 2067 uint_t numwcs, real_numwcs; 2068 int i; 2069 2070 wcs = chan->tx_wc; 2071 numwcs = IBD_RC_MAX_CQ_WC; 2072 2073 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { 2074 for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { 2075 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 2076 if (wc->wc_status != IBT_WC_SUCCESS) { 2077 chan->tx_trans_error_cnt ++; 2078 DPRINT(30, "ibd_rc_drain_scq: " 2079 "wc_status(%d) != SUCC, " 2080 "chan=%p, ace=%p, link_state=%d", 2081 wc->wc_status, chan, chan->ace, 2082 chan->state->id_link_state); 2083 } else { 2084 chan->tx_trans_error_cnt = 0; 2085 } 2086 ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe)); 2087 } 2088 2089 mutex_enter(&state->id_sched_lock); 2090 if (state->id_sched_needed == 0) { 2091 mutex_exit(&state->id_sched_lock); 2092 } else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) { 2093 mutex_enter(&chan->tx_wqe_list.dl_mutex); 2094 mutex_enter(&chan->tx_rel_list.dl_mutex); 2095 if ((chan->tx_rel_list.dl_cnt + 2096 chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) { 2097 state->id_sched_needed &= ~IBD_RSRC_RC_SWQE; 2098 mutex_exit(&chan->tx_rel_list.dl_mutex); 2099 mutex_exit(&chan->tx_wqe_list.dl_mutex); 2100 mutex_exit(&state->id_sched_lock); 2101 state->rc_swqe_mac_update++; 2102 mac_tx_update(state->id_mh); 2103 } else { 2104 state->rc_scq_no_swqe++; 2105 mutex_exit(&chan->tx_rel_list.dl_mutex); 2106 mutex_exit(&chan->tx_wqe_list.dl_mutex); 2107 mutex_exit(&state->id_sched_lock); 2108 } 2109 } else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) { 2110 mutex_enter(&state->rc_tx_large_bufs_lock); 2111 if (state->rc_tx_largebuf_nfree > 2112 IBD_RC_TX_FREE_THRESH) { 2113 ASSERT(state->rc_tx_largebuf_free_head != NULL); 2114 state->id_sched_needed &= 2115 ~IBD_RSRC_RC_TX_LARGEBUF; 2116 mutex_exit(&state->rc_tx_large_bufs_lock); 2117 mutex_exit(&state->id_sched_lock); 2118 state->rc_xmt_buf_mac_update++; 2119 mac_tx_update(state->id_mh); 2120 } else { 2121 state->rc_scq_no_largebuf++; 2122 mutex_exit(&state->rc_tx_large_bufs_lock); 2123 mutex_exit(&state->id_sched_lock); 2124 } 2125 } else if (state->id_sched_needed & IBD_RSRC_SWQE) { 
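			/*
			 * UD (datagram-mode) swqes ran short earlier. With
			 * both UD Tx lists locked (taken inside
			 * id_sched_lock, all dropped before the upcall),
			 * clear the sched bit and kick GLDv3 with
			 * mac_tx_update() once enough swqes are free.
			 */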
2126 			mutex_enter(&state->id_tx_list.dl_mutex);
2127 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
2128 			if ((state->id_tx_list.dl_cnt +
2129 			    state->id_tx_rel_list.dl_cnt)
2130 			    > IBD_FREE_SWQES_THRESH) {
2131 				state->id_sched_needed &= ~IBD_RSRC_SWQE;
2132 				state->id_sched_cnt++;
2133 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2134 				mutex_exit(&state->id_tx_list.dl_mutex);
2135 				mutex_exit(&state->id_sched_lock);
2136 				mac_tx_update(state->id_mh);
2137 			} else {
2138 				mutex_exit(&state->id_tx_rel_list.dl_mutex);
2139 				mutex_exit(&state->id_tx_list.dl_mutex);
2140 				mutex_exit(&state->id_sched_lock);
2141 			}
2142 		} else {
2143 			mutex_exit(&state->id_sched_lock);
2144 		}
2145 	}
2146 }
2147 
2148 /* Send CQ handler, call ibd_rc_tx_cleanup to recycle Tx buffers */
2149 /* ARGSUSED */
2150 static void
2151 ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
2152 {
2153 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2154 
2155 	chan->state->rc_scq_invoke++;
2156 
2157 	if (ibd_rc_tx_softintr == 1) {
2158 		mutex_enter(&chan->tx_poll_lock);
2159 		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
2160 			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
2161 			mutex_exit(&chan->tx_poll_lock);
2162 			return;
2163 		} else {
2164 			mutex_exit(&chan->tx_poll_lock);
2165 			ddi_trigger_softintr(chan->scq_softintr);
2166 		}
2167 	} else
2168 		(void) ibd_rc_tx_recycle(arg);
2169 }
2170 
2171 static uint_t
2172 ibd_rc_tx_recycle(caddr_t arg)
2173 {
2174 	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
2175 	ibd_ace_t *ace;
2176 	ibd_state_t *state = chan->state;
2177 	int flag, redo_flag;
2178 	int redo = 1;
2179 
2180 	flag = IBD_CQ_POLLING;
2181 	redo_flag = IBD_REDO_CQ_POLLING;
2182 
2183 	mutex_enter(&chan->tx_poll_lock);
2184 	if (chan->tx_poll_busy & flag) {
2185 		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
2186 		    "threads");
2187 		chan->tx_poll_busy |= redo_flag;
2188 		mutex_exit(&chan->tx_poll_lock);
2189 		return (DDI_INTR_CLAIMED);
2190 	}
2191 	chan->tx_poll_busy |= flag;
2192 	mutex_exit(&chan->tx_poll_lock);
2193 
2194 	/*
2195 	 * Poll for completed entries; the CQ will not interrupt any
2196 	 * more for completed packets.
2197 	 */
2198 	ibd_rc_drain_scq(chan, chan->scq_hdl);
2199 
2200 	/*
2201 	 * Now enable CQ notifications; all completions originating now
2202 	 * will cause new interrupts.
2203 	 */
2204 	do {
2205 		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
2206 		    IBT_SUCCESS) {
2207 			/*
2208 			 * We do not expect a failure here.
2209 			 */
2210 			DPRINT(40, "ibd_rc_tx_recycle: ibt_enable_cq_notify()"
2211 			    " failed");
2212 		}
2213 
2214 		ibd_rc_drain_scq(chan, chan->scq_hdl);
2215 
2216 		if (chan->tx_trans_error_cnt > 3) {
2217 			mutex_enter(&chan->tx_poll_lock);
2218 			chan->tx_poll_busy = 0;
2219 			mutex_exit(&chan->tx_poll_lock);
2220 			goto error_reset_chan;
2221 		}
2222 		mutex_enter(&chan->tx_poll_lock);
2223 		if (chan->tx_poll_busy & redo_flag)
2224 			chan->tx_poll_busy &= ~redo_flag;
2225 		else {
2226 			chan->tx_poll_busy &= ~flag;
2227 			redo = 0;
2228 		}
2229 		mutex_exit(&chan->tx_poll_lock);
2230 
2231 	} while (redo);
2232 
2233 	return (DDI_INTR_CLAIMED);
2234 
2235 error_reset_chan:
2236 	/*
2237 	 * Channel being torn down.
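	 * We get here after more than three consecutive Tx completion
	 * errors, so the channel is presumed dead. If this thread wins
	 * the race for the ace (channel still ACT_ESTAB, link up, and
	 * the active cache still maps the MAC to this ace), it takes a
	 * reference, pulls the ace out of the active cache, marks the
	 * channel ACT_CLOSING and queues an asynchronous close;
	 * otherwise another thread is already tearing the channel down.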
2238 */ 2239 mutex_enter(&state->id_ac_mutex); 2240 if ((chan->chan_state == IBD_RC_STATE_ACT_ESTAB) && 2241 (chan->state->id_link_state == LINK_STATE_UP) && 2242 ((ace = ibd_acache_find(state, &chan->ace->ac_mac, B_FALSE, 0)) 2243 != NULL) && (ace == chan->ace)) { 2244 ASSERT(ace->ac_mce == NULL); 2245 INC_REF(ace, 1); 2246 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 2247 chan->chan_state = IBD_RC_STATE_ACT_CLOSING; 2248 mutex_exit(&state->id_ac_mutex); 2249 state->rc_reset_cnt++; 2250 DPRINT(30, "ibd_rc_tx_recycle(chan=%p, ace=%p): " 2251 " reset RC channel", chan, chan->ace); 2252 ibd_rc_signal_act_close(state, ace); 2253 } else { 2254 mutex_exit(&state->id_ac_mutex); 2255 state->rc_act_close_simultaneous++; 2256 DPRINT(40, "ibd_rc_tx_recycle: other thread is closing" 2257 " it. chan=%p, act_state=%d, link_state=%d, ace=%p", 2258 chan, chan->chan_state, state->id_link_state, ace); 2259 } 2260 return (DDI_INTR_CLAIMED); 2261 } 2262 2263 /* Listen with corresponding service ID */ 2264 ibt_status_t 2265 ibd_rc_listen(ibd_state_t *state) 2266 { 2267 ibt_srv_desc_t srvdesc; 2268 ib_svc_id_t ret_sid; 2269 ibt_status_t status; 2270 ib_gid_t gid; 2271 2272 if (state->rc_listen_hdl != NULL) { 2273 DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL"); 2274 return (IBT_FAILURE); 2275 } 2276 2277 bzero(&srvdesc, sizeof (ibt_srv_desc_t)); 2278 srvdesc.sd_handler = ibd_rc_dispatch_pass_mad; 2279 srvdesc.sd_flags = IBT_SRV_NO_FLAGS; 2280 2281 /* 2282 * Register the service with service id 2283 * Incoming connection requests should arrive on this service id. 2284 */ 2285 status = ibt_register_service(state->id_ibt_hdl, &srvdesc, 2286 IBD_RC_QPN_TO_SID(state->id_qpnum), 2287 1, &state->rc_listen_hdl, &ret_sid); 2288 if (status != IBT_SUCCESS) { 2289 DPRINT(40, "ibd_rc_listen: Service Registration Failed, " 2290 "ret=%d", status); 2291 return (status); 2292 } 2293 2294 gid = state->id_sgid; 2295 2296 /* pass state as cm_private */ 2297 status = ibt_bind_service(state->rc_listen_hdl, 2298 gid, NULL, state, &state->rc_listen_bind); 2299 if (status != IBT_SUCCESS) { 2300 DPRINT(40, "ibd_rc_listen:" 2301 " fail to bind port: <%d>", status); 2302 (void) ibt_deregister_service(state->id_ibt_hdl, 2303 state->rc_listen_hdl); 2304 return (status); 2305 } 2306 2307 /* 2308 * Legacy OFED had used a wrong service ID (one additional zero digit) 2309 * for many years. To interop with legacy OFED, we support this wrong 2310 * service ID here. 2311 */ 2312 ASSERT(state->rc_listen_hdl_OFED_interop == NULL); 2313 2314 bzero(&srvdesc, sizeof (ibt_srv_desc_t)); 2315 srvdesc.sd_handler = ibd_rc_dispatch_pass_mad; 2316 srvdesc.sd_flags = IBT_SRV_NO_FLAGS; 2317 2318 /* 2319 * Register the service with service id 2320 * Incoming connection requests should arrive on this service id. 
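	 * For illustration (qpn value hypothetical): with qpn 0x4a, the
	 * two listeners differ only in the service-id prefix,
	 *
	 *	IBD_RC_QPN_TO_SID(0x4a)              = prefix  | 0x00004a
	 *	IBD_RC_QPN_TO_SID_OFED_INTEROP(0x4a) = prefix' | 0x00004a
	 *
	 * i.e. service_id | (qpn & 0xffffff), which matches the pi_sid
	 * the active side builds in ibd_rc_connect().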
2321 */ 2322 status = ibt_register_service(state->id_ibt_hdl, &srvdesc, 2323 IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum), 2324 1, &state->rc_listen_hdl_OFED_interop, &ret_sid); 2325 if (status != IBT_SUCCESS) { 2326 DPRINT(40, 2327 "ibd_rc_listen: Service Registration for Legacy OFED " 2328 "Failed %d", status); 2329 (void) ibt_unbind_service(state->rc_listen_hdl, 2330 state->rc_listen_bind); 2331 (void) ibt_deregister_service(state->id_ibt_hdl, 2332 state->rc_listen_hdl); 2333 return (status); 2334 } 2335 2336 gid = state->id_sgid; 2337 2338 /* pass state as cm_private */ 2339 status = ibt_bind_service(state->rc_listen_hdl_OFED_interop, 2340 gid, NULL, state, &state->rc_listen_bind_OFED_interop); 2341 if (status != IBT_SUCCESS) { 2342 DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for " 2343 "Legacy OFED listener", status); 2344 (void) ibt_deregister_service(state->id_ibt_hdl, 2345 state->rc_listen_hdl_OFED_interop); 2346 (void) ibt_unbind_service(state->rc_listen_hdl, 2347 state->rc_listen_bind); 2348 (void) ibt_deregister_service(state->id_ibt_hdl, 2349 state->rc_listen_hdl); 2350 return (status); 2351 } 2352 2353 return (IBT_SUCCESS); 2354 } 2355 2356 void 2357 ibd_rc_stop_listen(ibd_state_t *state) 2358 { 2359 int ret; 2360 2361 /* Disable incoming connection requests */ 2362 if (state->rc_listen_hdl != NULL) { 2363 ret = ibt_unbind_all_services(state->rc_listen_hdl); 2364 if (ret != 0) { 2365 DPRINT(40, "ibd_rc_stop_listen:" 2366 "ibt_unbind_all_services() failed, ret=%d", ret); 2367 } 2368 ret = ibt_deregister_service(state->id_ibt_hdl, 2369 state->rc_listen_hdl); 2370 if (ret != 0) { 2371 DPRINT(40, "ibd_rc_stop_listen:" 2372 "ibt_deregister_service() failed, ret=%d", ret); 2373 } else { 2374 state->rc_listen_hdl = NULL; 2375 } 2376 } 2377 2378 /* Disable incoming connection requests */ 2379 if (state->rc_listen_hdl_OFED_interop != NULL) { 2380 ret = ibt_unbind_all_services( 2381 state->rc_listen_hdl_OFED_interop); 2382 if (ret != 0) { 2383 DPRINT(40, "ibd_rc_stop_listen:" 2384 "ibt_unbind_all_services() failed: %d", ret); 2385 } 2386 ret = ibt_deregister_service(state->id_ibt_hdl, 2387 state->rc_listen_hdl_OFED_interop); 2388 if (ret != 0) { 2389 DPRINT(40, "ibd_rc_stop_listen:" 2390 "ibt_deregister_service() failed: %d", ret); 2391 } else { 2392 state->rc_listen_hdl_OFED_interop = NULL; 2393 } 2394 } 2395 } 2396 2397 int 2398 ibd_rc_close_all_chan(ibd_state_t *state) 2399 { 2400 ibd_rc_chan_t *rc_chan, *rc_chan1; 2401 ibd_ace_t *ace; 2402 uint_t attempts; 2403 2404 /* Disable all Rx routines */ 2405 mutex_enter(&state->rc_pass_chan_list.chan_list_mutex); 2406 rc_chan = state->rc_pass_chan_list.chan_list; 2407 while (rc_chan != NULL) { 2408 ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0); 2409 rc_chan = rc_chan->next; 2410 } 2411 mutex_exit(&state->rc_pass_chan_list.chan_list_mutex); 2412 2413 if (state->rc_enable_srq) { 2414 attempts = 50; 2415 while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) { 2416 DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0"); 2417 delay(drv_usectohz(100000)); 2418 if (--attempts == 0) { 2419 /* 2420 * There are pending bufs with the network 2421 * layer and we have no choice but to wait 2422 * for them to be done with. Reap all the 2423 * Tx/Rx completions that were posted since 2424 * we turned off the notification and 2425 * return failure. 
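				 *
				 * The wait above is bounded: 50 passes of
				 * delay(drv_usectohz(100000)), i.e.
				 * 50 x 100 ms = 5 seconds, before giving
				 * up and failing the attempt.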
2426 			 */
2427 				mutex_enter(
2428 				    &state->rc_pass_chan_list.chan_list_mutex);
2429 				rc_chan = state->rc_pass_chan_list.chan_list;
2430 				while (rc_chan != NULL) {
2431 					ibd_rc_poll_rcq
2432 					    (rc_chan, rc_chan->rcq_hdl);
2433 					ibt_set_cq_handler(rc_chan->rcq_hdl,
2434 					    ibd_rc_rcq_handler, rc_chan);
2435 					rc_chan = rc_chan->next;
2436 				}
2437 				mutex_exit(
2438 				    &state->rc_pass_chan_list.chan_list_mutex);
2439 				return (DDI_FAILURE);
2440 			}
2441 		}
2442 	}
2443 
2444 	/* Close all passive RC channels */
2445 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2446 	while (rc_chan != NULL) {
2447 		if (ibd_rc_pas_close(rc_chan) != DDI_SUCCESS) {
2448 			mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2449 			rc_chan1 = state->rc_pass_chan_list.chan_list;
2450 			while (rc_chan1 != NULL) {
2451 				ibd_rc_poll_rcq(rc_chan1, rc_chan1->rcq_hdl);
2452 				ibt_set_cq_handler(rc_chan1->rcq_hdl,
2453 				    ibd_rc_rcq_handler, rc_chan1);
2454 				rc_chan1 = rc_chan1->next;
2455 			}
2456 			mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2457 			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
2458 			    rc_chan);
2459 			DPRINT(40, "ibd_rc_close_all_chan: ibd_rc_pas_close() "
2460 			    "failed");
2461 			return (DDI_FAILURE);
2462 		}
2463 		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
2464 	}
2465 
2466 	/* Close all active RC channels */
2467 	mutex_enter(&state->id_ac_mutex);
2468 	ace = list_head(&state->id_ah_active);
2469 	while (ace != NULL) {
2470 		if (ace->ac_chan != NULL) {
2471 			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
2472 			    ace->ac_chan);
2473 		}
2474 		ace = list_next(&state->id_ah_active, ace);
2475 	}
2476 	mutex_exit(&state->id_ac_mutex);
2477 
2478 	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
2479 	while (rc_chan != NULL) {
2480 		ace = rc_chan->ace;
2481 		ibd_rc_act_close(rc_chan);
2482 		if (ace != NULL)
2483 			ace->ac_chan = NULL;
2484 		rc_chan = ibd_rc_rm_header_chan_list(
2485 		    &state->rc_obs_act_chan_list);
2486 	}
2487 	return (DDI_SUCCESS);
2488 }
2489 
2490 void
2491 ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path)
2492 {
2493 	ibt_status_t status;
2494 
2495 	status = ibd_rc_connect(state, ace, path,
2496 	    IBD_RC_SERVICE_ID_OFED_INTEROP);
2497 
2498 	if (status != IBT_SUCCESS) {
2499 		/* wait for the peer side to remove the stale channel */
2500 		delay(drv_usectohz(10000));
2501 		status = ibd_rc_connect(state, ace, path,
2502 		    IBD_RC_SERVICE_ID_OFED_INTEROP);
2503 	}
2504 
2505 	if (status != IBT_SUCCESS) {
2506 		/* wait for the peer side to remove the stale channel */
2507 		delay(drv_usectohz(10000));
2508 		(void) ibd_rc_connect(state, ace, path,
2509 		    IBD_RC_SERVICE_ID);
2510 	}
2511 }
2512 
2513 /*
2514  * Allocates channel and sets the ace->ac_chan to it.
2515  * Opens the channel.
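 *
 * The REQ carries a private-data "hello" and the REP returns one,
 * letting each side learn the peer's UD QPN and RC MTU. A sketch of
 * what goes on the wire (fields in network byte order, as below):
 *
 *	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
 *	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
 *
 * ibd_rc_try_connect() tries the legacy OFED service ID twice (with a
 * short delay so the peer can reap a stale channel) before falling
 * back to the PSARC 2009/593 service ID.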
2516 */ 2517 ibt_status_t 2518 ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace, ibt_path_info_t *path, 2519 uint64_t ietf_cm_service_id) 2520 { 2521 ibt_status_t status = 0; 2522 ibt_rc_returns_t open_returns; 2523 ibt_chan_open_args_t open_args; 2524 ibd_rc_msg_hello_t hello_req_msg; 2525 ibd_rc_msg_hello_t *hello_ack_msg; 2526 ibd_rc_chan_t *chan; 2527 2528 ASSERT(ace != NULL); 2529 ASSERT(ace->ac_mce == NULL); 2530 ASSERT(ace->ac_chan == NULL); 2531 2532 if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) { 2533 DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed"); 2534 return (status); 2535 } 2536 2537 ace->ac_chan = chan; 2538 chan->state = state; 2539 chan->ace = ace; 2540 2541 ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace); 2542 2543 hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP); 2544 2545 /* 2546 * open the channels 2547 */ 2548 bzero(&open_args, sizeof (ibt_chan_open_args_t)); 2549 bzero(&open_returns, sizeof (ibt_rc_returns_t)); 2550 2551 open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad; 2552 open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace; 2553 2554 /* 2555 * update path record with the SID 2556 */ 2557 path->pi_sid = 2558 ietf_cm_service_id | ((ace->ac_dest->ud_dst_qpn) & 0xffffff); 2559 2560 2561 /* pre-allocate memory for hello ack message */ 2562 open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t); 2563 open_returns.rc_priv_data = hello_ack_msg; 2564 2565 open_args.oc_path = path; 2566 2567 open_args.oc_path_rnr_retry_cnt = 7; 2568 open_args.oc_path_retry_cnt = 7; 2569 2570 /* We don't do RDMA */ 2571 open_args.oc_rdma_ra_out = 0; 2572 open_args.oc_rdma_ra_in = 0; 2573 2574 hello_req_msg.reserved_qpn = htonl(state->id_qpnum); 2575 hello_req_msg.rx_mtu = htonl(state->rc_mtu); 2576 open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t); 2577 open_args.oc_priv_data = (void *)(&hello_req_msg); 2578 2579 ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ); 2580 ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ); 2581 ASSERT(open_args.oc_cm_handler != NULL); 2582 2583 status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS, 2584 IBT_BLOCKING, &open_args, &open_returns); 2585 2586 if (status == IBT_SUCCESS) { 2587 /* Success! */ 2588 DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!"); 2589 state->rc_conn_succ++; 2590 kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t)); 2591 return (IBT_SUCCESS); 2592 } 2593 2594 /* failure */ 2595 (void) ibt_flush_channel(chan->chan_hdl); 2596 ibd_rc_free_chan(chan); 2597 ace->ac_chan = NULL; 2598 2599 /* check open_returns report error and exit */ 2600 DPRINT(30, "ibd_rc_connect: call ibt_open_rc_chan fail." 
2601 "ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x," 2602 " peer qpn=0x%x", status, (int)open_returns.rc_status, ace, 2603 hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn, 2604 ace->ac_dest->ud_dst_qpn); 2605 kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t)); 2606 return (status); 2607 } 2608 2609 void 2610 ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace) 2611 { 2612 ibd_req_t *req; 2613 2614 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 2615 if (req == NULL) { 2616 ibd_print_warn(state, "ibd_rc_signal_act_close: alloc " 2617 "ibd_req_t fail"); 2618 mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex); 2619 ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list; 2620 state->rc_obs_act_chan_list.chan_list = ace->ac_chan; 2621 mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex); 2622 } else { 2623 req->rq_ptr = ace->ac_chan; 2624 ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN); 2625 } 2626 } 2627 2628 void 2629 ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace) 2630 { 2631 ibd_req_t *req; 2632 2633 mutex_enter(&state->rc_ace_recycle_lock); 2634 if (state->rc_ace_recycle != NULL) { 2635 mutex_exit(&state->rc_ace_recycle_lock); 2636 return; 2637 } 2638 2639 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 2640 if (req == NULL) { 2641 mutex_exit(&state->rc_ace_recycle_lock); 2642 return; 2643 } 2644 2645 state->rc_ace_recycle = ace; 2646 mutex_exit(&state->rc_ace_recycle_lock); 2647 ASSERT(ace->ac_mce == NULL); 2648 INC_REF(ace, 1); 2649 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 2650 req->rq_ptr = ace; 2651 ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE); 2652 } 2653 2654 static void 2655 ibd_rc_act_close(ibd_rc_chan_t *chan) 2656 { 2657 uint_t times; 2658 ibt_status_t ret; 2659 2660 ASSERT(chan != NULL); 2661 2662 chan->state->rc_act_close++; 2663 switch (chan->chan_state) { 2664 case IBD_RC_STATE_ACT_CLOSING: /* stale, close it */ 2665 case IBD_RC_STATE_ACT_ESTAB: 2666 DPRINT(30, "ibd_rc_act_close-1: close and free chan, " 2667 "act_state=%d, chan=%p", chan->chan_state, chan); 2668 chan->chan_state = IBD_RC_STATE_ACT_CLOSED; 2669 ibt_set_cq_handler(chan->rcq_hdl, 0, 0); 2670 /* Wait send queue empty */ 2671 times = 0; 2672 mutex_enter(&chan->tx_wqe_list.dl_mutex); 2673 mutex_enter(&chan->tx_rel_list.dl_mutex); 2674 while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt) 2675 != chan->scq_size) && (times < 50)) { 2676 DPRINT(30, "ibd_rc_act_close: dl_cnt(tx_wqe_list=%d," 2677 " tx_rel_list=%d) != chan->scq_size=%d", 2678 chan->tx_wqe_list.dl_cnt, chan->tx_rel_list.dl_cnt, 2679 chan->scq_size); 2680 mutex_exit(&chan->tx_rel_list.dl_mutex); 2681 mutex_exit(&chan->tx_wqe_list.dl_mutex); 2682 mutex_enter(&chan->tx_poll_lock); 2683 if (chan->tx_poll_busy & IBD_CQ_POLLING) { 2684 DPRINT(40, "ibd_rc_act_close: multiple " 2685 "polling threads"); 2686 mutex_exit(&chan->tx_poll_lock); 2687 } else { 2688 chan->tx_poll_busy = IBD_CQ_POLLING; 2689 mutex_exit(&chan->tx_poll_lock); 2690 ibd_rc_drain_scq(chan, chan->scq_hdl); 2691 mutex_enter(&chan->tx_poll_lock); 2692 chan->tx_poll_busy = 0; 2693 mutex_exit(&chan->tx_poll_lock); 2694 } 2695 delay(drv_usectohz(100000)); 2696 times++; 2697 mutex_enter(&chan->tx_wqe_list.dl_mutex); 2698 mutex_enter(&chan->tx_rel_list.dl_mutex); 2699 } 2700 mutex_exit(&chan->tx_rel_list.dl_mutex); 2701 mutex_exit(&chan->tx_wqe_list.dl_mutex); 2702 ibt_set_cq_handler(chan->scq_hdl, 0, 0); 2703 ret = ibt_close_rc_channel(chan->chan_hdl, 2704 IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, 
		    NULL, 0);
2705 		if (ret != IBT_SUCCESS) {
2706 			DPRINT(40, "ibd_rc_act_close-2: ibt_close_rc_channel "
2707 			    "fail, chan=%p, returned=%d", chan, ret);
2708 		} else {
2709 			DPRINT(30, "ibd_rc_act_close-2: ibt_close_rc_channel "
2710 			    "succ, chan=%p", chan);
2711 		}
2712 
2713 		ibd_rc_free_chan(chan);
2714 		break;
2715 	case IBD_RC_STATE_ACT_REP_RECV:
2716 		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
2717 		(void) ibt_flush_channel(chan->chan_hdl);
2718 		ibd_rc_free_chan(chan);
2719 		break;
2720 	case IBD_RC_STATE_ACT_ERROR:
2721 		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
2722 		break;
2723 	default:
2724 		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
2725 		    "chan=%p", chan->chan_state, chan);
2726 	}
2727 }
2728 
2729 static int
2730 ibd_rc_pas_close(ibd_rc_chan_t *chan)
2731 {
2732 	uint_t times;
2733 	ibt_status_t ret;
2734 
2735 	ASSERT(chan != NULL);
2736 	chan->state->rc_pas_close++;
2737 
2738 	switch (chan->chan_state) {
2739 	case IBD_RC_STATE_PAS_ESTAB:
2740 		/*
2741 		 * First, stop receive interrupts; this stops the
2742 		 * connection from handing up buffers to higher layers.
2743 		 * Wait for receive buffers to be returned; give up
2744 		 * after 5 seconds.
2745 		 */
2746 		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
2747 		if (!chan->state->rc_enable_srq) {
2748 			times = 50;
2749 			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
2750 				delay(drv_usectohz(100000));
2751 				if (--times == 0) {
2752 					DPRINT(40, "ibd_rc_pas_close : "
2753 					    "reclaiming failed");
2754 					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
2755 					ibt_set_cq_handler(chan->rcq_hdl,
2756 					    ibd_rc_rcq_handler,
2757 					    (void *)(uintptr_t)chan);
2758 					return (DDI_FAILURE);
2759 				}
2760 			}
2761 		}
2762 		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
2763 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2764 		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
2765 		    "chan_state=%d, chan=%p", chan->chan_state, chan);
2766 		ret = ibt_close_rc_channel(chan->chan_hdl,
2767 		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
2768 		if (ret != IBT_SUCCESS) {
2769 			DPRINT(40, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2770 			    " fail, chan=%p, returned=%d", chan, ret);
2771 		} else {
2772 			DPRINT(30, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
2773 			    " succ, chan=%p", chan);
2774 		}
2775 
2776 		ibd_rc_free_chan(chan);
2777 		break;
2778 	case IBD_RC_STATE_PAS_REQ_RECV:
2779 		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
2780 		(void) ibt_flush_channel(chan->chan_hdl);
2781 		ibd_rc_free_chan(chan);
2782 		break;
2783 	default:
2784 		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
2785 		    chan->chan_state, chan);
2786 	}
2787 	return (DDI_SUCCESS);
2788 }
2789 
2790 /*
2791  * Remove a duplicate RC channel that comes from the same MAC.
2792  *
2793  * From the IP point of view, we could check for same MAC:
2794  * GID, P_Key (or QPN, though in a reboot this is likely to
2795  * change so P_Key is better). The GID usually will equate to
2796  * port (since typically it uses the port GUID in the low 64 bits).
2797  * These fields exist in the REQ messages.
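 *
 * A sketch of the duplicate test applied below (illustration only;
 * "req" stands for ibt_cm_event->cm_event.req):
 *
 *	dup = (bcmp(&chan->requester_gid, &req->req_prim_addr.av_dgid,
 *	    sizeof (ib_gid_t)) == 0) &&
 *	    (chan->requester_pkey == req->req_pkey);
 *
 * The first matching channel is unlinked from rc_pass_chan_list and
 * closed; if the close fails, it is put back on the list.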
2798  */
2799 void
2800 ibd_rc_handle_req_rm_dup(ibd_state_t *state, ibt_cm_event_t *ibt_cm_event)
2801 {
2802 	ibd_rc_chan_t *chan, *pre_chan;
2803 
2804 	pre_chan = NULL;
2805 	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
2806 	chan = state->rc_pass_chan_list.chan_list;
2807 	while (chan != NULL) {
2808 		if ((bcmp(&chan->requester_gid,
2809 		    &ibt_cm_event->cm_event.req.req_prim_addr.av_dgid,
2810 		    sizeof (ib_gid_t)) == 0) && (chan->requester_pkey ==
2811 		    ibt_cm_event->cm_event.req.req_pkey)) {
2812 			if (pre_chan == NULL) {
2813 				state->rc_pass_chan_list.chan_list = chan->next;
2814 			} else {
2815 				pre_chan->next = chan->next;
2816 			}
2817 			break;
2818 		}
2819 		pre_chan = chan;
2820 		chan = chan->next;
2821 	}
2822 	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
2823 	if (chan) {
2824 		DPRINT(30, "ibd_rc_handle_req_rm_dup: same gid and pkey, "
2825 		    "remove duplicate channel, chan=%p", chan);
2826 		if (ibd_rc_pas_close(chan) != DDI_SUCCESS) {
2827 			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
2828 			    chan);
2829 		}
2830 	}
2831 }
2832 
2833 /*
2834  * Passive Side:
2835  * Handle an incoming CM REQ from active side.
2836  *
2837  * If success, this function allocates an ibd_rc_chan_t, then
2838  * assigns it to "*ret_conn".
2839  */
2840 static ibt_cm_status_t
2841 ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
2842     ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
2843     void *ret_priv_data)
2844 {
2845 	ibd_rc_msg_hello_t *hello_msg;
2846 	ibd_state_t *state = (ibd_state_t *)arg;
2847 	ibd_rc_chan_t *chan;
2848 
2849 	ibd_rc_handle_req_rm_dup(state, ibt_cm_event);
2850 
2851 	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
2852 		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
2853 		return (IBT_CM_REJECT);
2854 	}
2855 
2856 	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);
2857 
2858 	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);
2859 
2860 	if (!state->rc_enable_srq) {
2861 		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
2862 			ibd_rc_free_chan(chan);
2863 			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
2864 			    "failed");
2865 			return (IBT_CM_REJECT);
2866 		}
2867 	}
2868 
2869 	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;
2870 
2871 	/* We don't do RDMA */
2872 	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
2873 	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;
2874 
2875 	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
2876 	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);
2877 
2878 	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
2879 	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
2880 	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));
2881 
2882 	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
2883 	hello_msg->reserved_qpn = htonl(state->id_qpnum);
2884 	hello_msg->rx_mtu = htonl(state->rc_mtu);
2885 
2886 	chan->requester_gid = ibt_cm_event->cm_event.req.req_prim_addr.av_dgid;
2887 	chan->requester_pkey = ibt_cm_event->cm_event.req.req_pkey;
2888 	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV; /* ready to receive */
2889 	*ret_conn = chan;
2890 
2891 	return (IBT_CM_ACCEPT);
2892 }
2893 
2894 /*
2895  * ibd_rc_handle_act_estab -- handler for connection established completion
2896  * for active side.
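 *
 * Active-side channel state progression, as used in this file (a
 * summary of existing behavior, not an exhaustive list):
 *
 *	INIT -> ACT_REP_RECV -> ACT_ESTAB -> ACT_CLOSING -> ACT_CLOSED
 *
 * This handler performs the ACT_REP_RECV -> ACT_ESTAB step and arms
 * the receive CQ; any other starting state is rejected.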
2897 */ 2898 static ibt_cm_status_t 2899 ibd_rc_handle_act_estab(ibd_ace_t *ace) 2900 { 2901 ibt_status_t result; 2902 2903 switch (ace->ac_chan->chan_state) { 2904 case IBD_RC_STATE_ACT_REP_RECV: 2905 ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB; 2906 result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl, 2907 IBT_NEXT_COMPLETION); 2908 if (result != IBT_SUCCESS) { 2909 DPRINT(40, "ibd_rc_handle_act_estab: " 2910 "ibt_enable_cq_notify(rcq) " 2911 "failed: status %d", result); 2912 return (IBT_CM_REJECT); 2913 } 2914 break; 2915 default: 2916 DPRINT(40, "ibd_rc_handle_act_estab: default " 2917 "branch, act_state=%d", ace->ac_chan->chan_state); 2918 return (IBT_CM_REJECT); 2919 } 2920 return (IBT_CM_ACCEPT); 2921 } 2922 2923 /* 2924 * ibd_rc_handle_pas_estab -- handler for connection established completion 2925 * for passive side. 2926 */ 2927 static ibt_cm_status_t 2928 ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan) 2929 { 2930 ibt_status_t result; 2931 2932 switch (chan->chan_state) { 2933 case IBD_RC_STATE_PAS_REQ_RECV: 2934 chan->chan_state = IBD_RC_STATE_PAS_ESTAB; 2935 2936 result = ibt_enable_cq_notify(chan->rcq_hdl, 2937 IBT_NEXT_COMPLETION); 2938 if (result != IBT_SUCCESS) { 2939 DPRINT(40, "ibd_rc_handle_pas_estab: " 2940 "ibt_enable_cq_notify(rcq) " 2941 "failed: status %d", result); 2942 return (IBT_CM_REJECT); 2943 } 2944 break; 2945 default: 2946 DPRINT(40, "ibd_rc_handle_pas_estab: default " 2947 "branch, chan_state=%d", chan->chan_state); 2948 return (IBT_CM_REJECT); 2949 } 2950 return (IBT_CM_ACCEPT); 2951 } 2952 2953 /* ARGSUSED */ 2954 static ibt_cm_status_t 2955 ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event, 2956 ibt_cm_return_args_t *ret_args, void *ret_priv_data, 2957 ibt_priv_data_len_t ret_len_max) 2958 { 2959 ibt_cm_status_t result = IBT_CM_ACCEPT; 2960 ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg; 2961 ibd_rc_chan_t *rc_chan; 2962 ibd_state_t *state; 2963 ibd_rc_msg_hello_t *hello_ack; 2964 uint_t times; 2965 2966 switch (ibt_cm_event->cm_type) { 2967 case IBT_CM_EVENT_REP_RCV: 2968 ASSERT(ace->ac_chan != NULL); 2969 ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT); 2970 hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data; 2971 DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, " 2972 "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu), 2973 ntohl(hello_ack->reserved_qpn)); 2974 ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV; 2975 break; 2976 2977 case IBT_CM_EVENT_CONN_EST: 2978 ASSERT(ace->ac_chan != NULL); 2979 DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, " 2980 "ace=%p, act_state=%d, chan=%p", 2981 ace, ace->ac_chan->chan_state, ace->ac_chan); 2982 result = ibd_rc_handle_act_estab(ace); 2983 break; 2984 2985 case IBT_CM_EVENT_CONN_CLOSED: 2986 rc_chan = ace->ac_chan; 2987 if (rc_chan == NULL) { 2988 DPRINT(40, "ibd_rc_dispatch_actv_mad: " 2989 "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED"); 2990 return (IBT_CM_ACCEPT); 2991 } 2992 state = rc_chan->state; 2993 mutex_enter(&state->id_ac_mutex); 2994 if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) && 2995 ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0)) 2996 != NULL) && (ace == rc_chan->ace)) { 2997 rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING; 2998 ASSERT(ace->ac_mce == NULL); 2999 INC_REF(ace, 1); 3000 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 3001 mutex_exit(&state->id_ac_mutex); 3002 DPRINT(30, "ibd_rc_dispatch_actv_mad: " 3003 "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, " 3004 "reason=%d", ace, rc_chan, 3005 ibt_cm_event->cm_event.closed); 3006 } 
		else {
3007 			mutex_exit(&state->id_ac_mutex);
3008 			state->rc_act_close_simultaneous++;
3009 			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
3010 			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
3011 			    "chan_state=%d", rc_chan->chan_state);
3012 			return (IBT_CM_ACCEPT);
3013 		}
3014 		/* wait until the send queue is drained */
3015 		times = 0;
3016 		mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3017 		mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3018 		while (((rc_chan->tx_wqe_list.dl_cnt +
3019 		    rc_chan->tx_rel_list.dl_cnt)
3020 		    != rc_chan->scq_size) && (times < 50)) {
3021 			DPRINT(40, "ibd_rc_dispatch_actv_mad: dl_cnt"
3022 			    "(tx_wqe_list=%d, tx_rel_list=%d) != "
3023 			    "chan->scq_size=%d",
3024 			    rc_chan->tx_wqe_list.dl_cnt,
3025 			    rc_chan->tx_rel_list.dl_cnt,
3026 			    rc_chan->scq_size);
3027 			mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3028 			mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3029 			mutex_enter(&rc_chan->tx_poll_lock);
3030 			if (rc_chan->tx_poll_busy & IBD_CQ_POLLING) {
3031 				DPRINT(40, "ibd_rc_dispatch_actv_mad: "
3032 				    "multiple polling threads");
3033 				mutex_exit(&rc_chan->tx_poll_lock);
3034 			} else {
3035 				rc_chan->tx_poll_busy = IBD_CQ_POLLING;
3036 				mutex_exit(&rc_chan->tx_poll_lock);
3037 				ibd_rc_drain_scq(rc_chan, rc_chan->scq_hdl);
3038 				mutex_enter(&rc_chan->tx_poll_lock);
3039 				rc_chan->tx_poll_busy = 0;
3040 				mutex_exit(&rc_chan->tx_poll_lock);
3041 			}
3042 			delay(drv_usectohz(100000));
3043 			times++;
3044 			mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
3045 			mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
3046 		}
3047 		mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
3048 		mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
3049 		rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
3050 		ibd_rc_free_chan(rc_chan);
3051 		DPRINT(30, "ibd_rc_dispatch_actv_mad: "
3052 		    "IBT_CM_EVENT_CONN_CLOSED, ref=%x", ace->ac_ref);
3053 		mutex_enter(&state->id_ac_mutex);
3054 		ace->ac_chan = NULL;
3055 		ASSERT(ace->ac_ref != 0);
3056 		atomic_dec_32(&ace->ac_ref);
3057 		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
3058 			IBD_ACACHE_INSERT_FREE(state, ace);
3059 			ace->ac_ref = 0;
3060 		} else {
3061 			ace->ac_ref |= CYCLEVAL;
3062 			state->rc_delay_ace_recycle++;
3063 		}
3064 		mutex_exit(&state->id_ac_mutex);
3065 		break;
3066 
3067 	case IBT_CM_EVENT_FAILURE:
3068 		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
3069 		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
3070 		    ace, ace->ac_chan,
3071 		    ibt_cm_event->cm_event.failed.cf_code,
3072 		    ibt_cm_event->cm_event.failed.cf_msg,
3073 		    ibt_cm_event->cm_event.failed.cf_reason);
3074 		/*
3075 		 * No need to free resources here.
The resource is freed 3076 * at function ibd_rc_connect() 3077 */ 3078 break; 3079 3080 case IBT_CM_EVENT_MRA_RCV: 3081 DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV"); 3082 break; 3083 case IBT_CM_EVENT_LAP_RCV: 3084 DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received"); 3085 break; 3086 case IBT_CM_EVENT_APR_RCV: 3087 DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received"); 3088 break; 3089 default: 3090 DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, " 3091 "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type); 3092 break; 3093 } 3094 3095 return (result); 3096 } 3097 3098 /* ARGSUSED */ 3099 static ibt_cm_status_t 3100 ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event, 3101 ibt_cm_return_args_t *ret_args, void *ret_priv_data, 3102 ibt_priv_data_len_t ret_len_max) 3103 { 3104 ibt_cm_status_t result = IBT_CM_ACCEPT; 3105 ibd_rc_chan_t *chan; 3106 3107 if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) { 3108 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV," 3109 "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey); 3110 /* Receive an incoming CM REQ from active side */ 3111 result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args, 3112 ret_priv_data); 3113 return (result); 3114 } 3115 3116 if (ibt_cm_event->cm_channel == 0) { 3117 DPRINT(30, "ibd_rc_dispatch_pass_mad: " 3118 "ERROR ibt_cm_event->cm_channel == 0"); 3119 return (IBT_CM_REJECT); 3120 } 3121 3122 chan = 3123 (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel); 3124 if (chan == NULL) { 3125 DPRINT(40, "ibd_rc_dispatch_pass_mad: conn == 0"); 3126 return (IBT_CM_REJECT); 3127 } 3128 3129 switch (ibt_cm_event->cm_type) { 3130 case IBT_CM_EVENT_CONN_EST: 3131 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, " 3132 "chan=%p", chan); 3133 result = ibd_rc_handle_pas_estab(chan); 3134 break; 3135 case IBT_CM_EVENT_CONN_CLOSED: 3136 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED," 3137 " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed); 3138 ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan); 3139 ibd_rc_free_chan(chan); 3140 break; 3141 case IBT_CM_EVENT_FAILURE: 3142 DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE," 3143 " chan=%p, code: %d, msg: %d, reason=%d", chan, 3144 ibt_cm_event->cm_event.failed.cf_code, 3145 ibt_cm_event->cm_event.failed.cf_msg, 3146 ibt_cm_event->cm_event.failed.cf_reason); 3147 3148 ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan); 3149 ibd_rc_free_chan(chan); 3150 return (IBT_CM_ACCEPT); 3151 case IBT_CM_EVENT_MRA_RCV: 3152 DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV"); 3153 break; 3154 case IBT_CM_EVENT_LAP_RCV: 3155 DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received"); 3156 break; 3157 case IBT_CM_EVENT_APR_RCV: 3158 DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received"); 3159 break; 3160 default: 3161 DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, " 3162 "chan=%p", ibt_cm_event->cm_type, chan); 3163 break; 3164 } 3165 3166 return (result); 3167 } 3168
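/*
 * For reference, the passive-side CM dispatch above reduces to the
 * following (a summary of ibd_rc_dispatch_pass_mad(), not new
 * behavior):
 *
 *	REQ_RCV		- ibd_rc_handle_req() (no channel private yet)
 *	CONN_EST	- ibd_rc_handle_pas_estab()
 *	CONN_CLOSED	- unlink from rc_pass_chan_list, free channel
 *	FAILURE		- unlink from rc_pass_chan_list, free channel
 *	MRA/LAP/APR	- logged only
 */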