1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/rds.h> 58 59 #include <sys/ib/clients/of/ofed_kernel.h> 60 #include <sys/ib/clients/of/rdma/ib_addr.h> 61 #include <sys/ib/clients/of/rdma/rdma_cm.h> 62 63 #include <sys/ib/clients/rdsv3/rdsv3.h> 64 #include <sys/ib/clients/rdsv3/ib.h> 65 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 66 67 extern ddi_taskq_t *rdsv3_taskq; 68 69 /* 70 * Set the selected protocol version 71 */ 72 static void 73 rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version) 74 { 75 RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d", 76 conn, version); 77 conn->c_version = version; 78 } 79 80 /* 81 * Set up flow control 82 */ 83 static void 84 rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits) 85 { 86 struct rdsv3_ib_connection *ic = conn->c_transport_data; 87 88 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 89 "Enter: conn: %p credits: %d", conn, credits); 90 91 if (rdsv3_ib_sysctl_flow_control && credits != 0) { 92 /* We're doing flow control */ 93 ic->i_flowctl = 1; 94 rdsv3_ib_send_add_credits(conn, credits); 95 } else { 96 ic->i_flowctl = 0; 97 } 98 99 RDSV3_DPRINTF2("rdsv3_ib_set_flow_control", 100 "Return: conn: %p credits: %d", 101 conn, credits); 102 } 103 104 /* 105 * Tune RNR behavior. Without flow control, we use a rather 106 * low timeout, but not the absolute minimum - this should 107 * be tunable. 108 * 109 * We already set the RNR retry count to 7 (which is the 110 * smallest infinite number :-) above. 111 * If flow control is off, we want to change this back to 0 112 * so that we learn quickly when our credit accounting is 113 * buggy. 114 * 115 * Caller passes in a qp_attr pointer - don't waste stack spacv 116 * by allocation this twice. 117 */ 118 static void 119 rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr) 120 { 121 int ret; 122 123 RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p", 124 ic, attr); 125 126 attr->min_rnr_timer = IB_RNR_TIMER_000_32; 127 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER); 128 if (ret) 129 RDSV3_DPRINTF0("rdsv3_ib_tune_rnr", 130 "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret); 131 } 132 133 /* 134 * Connection established. 135 * We get here for both outgoing and incoming connection. 136 */ 137 void 138 rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, 139 struct rdma_cm_event *event) 140 { 141 const struct rdsv3_ib_connect_private *dp = NULL; 142 struct rdsv3_ib_connection *ic = conn->c_transport_data; 143 struct rdsv3_ib_device *rds_ibdev; 144 struct ib_qp_attr qp_attr; 145 int err; 146 147 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 148 "Enter conn: %p event: %p", conn, event); 149 150 if (event->param.conn.private_data_len >= sizeof (*dp)) { 151 dp = event->param.conn.private_data; 152 153 /* make sure it isn't empty data */ 154 if (dp->dp_protocol_major) { 155 rdsv3_ib_set_protocol(conn, 156 RDS_PROTOCOL(dp->dp_protocol_major, 157 dp->dp_protocol_minor)); 158 rdsv3_ib_set_flow_control(conn, 159 ntohl(dp->dp_credit)); 160 } 161 } 162 163 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 164 "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s", 165 NIPQUAD(conn->c_faddr), 166 RDS_PROTOCOL_MAJOR(conn->c_version), 167 RDS_PROTOCOL_MINOR(conn->c_version), 168 ic->i_flowctl ? ", flow control" : ""); 169 170 /* 171 * Init rings and fill recv. this needs to wait until protocol 172 * negotiation 173 * is complete, since ring layout is different from 3.0 to 3.1. 174 */ 175 rdsv3_ib_send_init_ring(ic); 176 rdsv3_ib_recv_init_ring(ic); 177 /* 178 * Post receive buffers - as a side effect, this will update 179 * the posted credit count. 180 */ 181 (void) rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0, 1); 182 183 /* Tune RNR behavior */ 184 rdsv3_ib_tune_rnr(ic, &qp_attr); 185 186 qp_attr.qp_state = IB_QPS_RTS; 187 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE); 188 if (err) 189 RDSV3_DPRINTF0("rdsv3_ib_cm_connect_complete", 190 "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err); 191 192 /* update ib_device with this local ipaddr & conn */ 193 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); 194 err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr); 195 if (err) 196 RDSV3_DPRINTF0("rdsv3_ib_cm_connect_complete", 197 "rdsv3_ib_update_ipaddr failed (%d)", err); 198 rdsv3_ib_add_conn(rds_ibdev, conn); 199 200 /* 201 * If the peer gave us the last packet it saw, process this as if 202 * we had received a regular ACK. 203 */ 204 if (dp && dp->dp_ack_seq) 205 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); 206 207 rdsv3_connect_complete(conn); 208 209 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", 210 "Return conn: %p event: %p", 211 conn, event); 212 } 213 214 static void 215 rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn, 216 struct rdma_conn_param *conn_param, 217 struct rdsv3_ib_connect_private *dp, 218 uint32_t protocol_version) 219 { 220 RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", 221 "Enter conn: %p conn_param: %p private: %p version: %d", 222 conn, conn_param, dp, protocol_version); 223 224 (void) memset(conn_param, 0, sizeof (struct rdma_conn_param)); 225 /* XXX tune these? */ 226 conn_param->responder_resources = 1; 227 conn_param->initiator_depth = 1; 228 conn_param->retry_count = min(rdsv3_ib_retry_count, 7); 229 conn_param->rnr_retry_count = 7; 230 231 if (dp) { 232 struct rdsv3_ib_connection *ic = conn->c_transport_data; 233 234 (void) memset(dp, 0, sizeof (*dp)); 235 dp->dp_saddr = conn->c_laddr; 236 dp->dp_daddr = conn->c_faddr; 237 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 238 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 239 dp->dp_protocol_minor_mask = 240 htons(RDSV3_IB_SUPPORTED_PROTOCOLS); 241 dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic); 242 243 /* Advertise flow control */ 244 if (ic->i_flowctl) { 245 unsigned int credits; 246 247 credits = IB_GET_POST_CREDITS( 248 atomic_get(&ic->i_credits)); 249 dp->dp_credit = htonl(credits); 250 atomic_add_32(&ic->i_credits, 251 -IB_SET_POST_CREDITS(credits)); 252 } 253 254 conn_param->private_data = dp; 255 conn_param->private_data_len = sizeof (*dp); 256 } 257 258 RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", 259 "Return conn: %p conn_param: %p private: %p version: %d", 260 conn, conn_param, dp, protocol_version); 261 } 262 263 static void 264 rdsv3_ib_cq_event_handler(struct ib_event *event, void *data) 265 { 266 RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p", 267 event->event, data); 268 } 269 270 static void 271 rdsv3_ib_qp_event_handler(struct ib_event *event, void *data) 272 { 273 struct rdsv3_connection *conn = data; 274 struct rdsv3_ib_connection *ic = conn->c_transport_data; 275 276 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u", 277 conn, ic, event->event); 278 279 switch (event->event) { 280 case IB_EVENT_COMM_EST: 281 (void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); 282 break; 283 default: 284 if (conn) { 285 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", 286 "RDS/IB: Fatal QP Event %u - " 287 "connection %u.%u.%u.%u ->%u.%u.%u.%u " 288 "...reconnecting", 289 event->event, NIPQUAD(conn->c_laddr), 290 NIPQUAD(conn->c_faddr)); 291 rdsv3_conn_drop(conn); 292 } else { 293 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", 294 "RDS/IB: Fatal QP Event %u - connection" 295 "...reconnecting", event->event); 296 } 297 break; 298 } 299 300 RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p", 301 conn, event); 302 } 303 304 extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev, 305 struct rdsv3_ib_connection *ic); 306 extern void rdsv3_ib_free_hdrs(ib_device_t *dev, 307 struct rdsv3_ib_connection *ic); 308 309 /* 310 * This needs to be very careful to not leave IS_ERR pointers around for 311 * cleanup to trip over. 312 */ 313 static int 314 rdsv3_ib_setup_qp(struct rdsv3_connection *conn) 315 { 316 struct rdsv3_ib_connection *ic = conn->c_transport_data; 317 struct ib_device *dev = ic->i_cm_id->device; 318 struct ib_qp_init_attr attr; 319 struct rdsv3_ib_device *rds_ibdev; 320 ibt_send_wr_t *wrp; 321 ibt_wr_ds_t *sgl; 322 int ret, i; 323 324 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn); 325 326 /* 327 * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device, 328 * and allocates a protection domain, memory range and FMR pool 329 * for each. If that fails for any reason, it will not register 330 * the rds_ibdev at all. 331 */ 332 rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client); 333 if (rds_ibdev == NULL) { 334 RDSV3_DPRINTF0("rdsv3_ib_setup_qp", 335 "RDS/IB: No client_data for device %s", dev->name); 336 return (-EOPNOTSUPP); 337 } 338 339 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) 340 rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); 341 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1) 342 rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1); 343 344 /* Protection domain and memory range */ 345 ic->i_pd = rds_ibdev->pd; 346 347 ic->i_send_cq = ib_create_cq(dev, rdsv3_ib_send_cq_comp_handler, 348 rdsv3_ib_cq_event_handler, conn, 349 ic->i_send_ring.w_nr + 1, 0); 350 if (IS_ERR(ic->i_send_cq)) { 351 ret = PTR_ERR(ic->i_send_cq); 352 ic->i_send_cq = NULL; 353 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 354 "ib_create_cq send failed: %d", ret); 355 goto out; 356 } 357 358 ic->i_recv_cq = ib_create_cq(dev, rdsv3_ib_recv_cq_comp_handler, 359 rdsv3_ib_cq_event_handler, conn, 360 ic->i_recv_ring.w_nr, 0); 361 if (IS_ERR(ic->i_recv_cq)) { 362 ret = PTR_ERR(ic->i_recv_cq); 363 ic->i_recv_cq = NULL; 364 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 365 "ib_create_cq recv failed: %d", ret); 366 goto out; 367 } 368 369 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); 370 if (ret) { 371 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 372 "ib_req_notify_cq send failed: %d", ret); 373 goto out; 374 } 375 376 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); 377 if (ret) { 378 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 379 "ib_req_notify_cq recv failed: %d", ret); 380 goto out; 381 } 382 383 /* XXX negotiate max send/recv with remote? */ 384 (void) memset(&attr, 0, sizeof (attr)); 385 attr.event_handler = rdsv3_ib_qp_event_handler; 386 attr.qp_context = conn; 387 /* + 1 to allow for the single ack message */ 388 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1; 389 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1; 390 attr.cap.max_send_sge = rds_ibdev->max_sge; 391 attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE; 392 attr.sq_sig_type = IB_SIGNAL_REQ_WR; 393 attr.qp_type = IB_QPT_RC; 394 attr.send_cq = ic->i_send_cq; 395 attr.recv_cq = ic->i_recv_cq; 396 397 /* 398 * XXX this can fail if max_*_wr is too large? Are we supposed 399 * to back off until we get a value that the hardware can support? 400 */ 401 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); 402 if (ret) { 403 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 404 "rdma_create_qp failed: %d", ret); 405 goto out; 406 } 407 408 ret = rdsv3_ib_alloc_hdrs(dev, ic); 409 if (ret != 0) { 410 ret = -ENOMEM; 411 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 412 "rdsv3_ib_alloc_hdrs failed: %d", ret); 413 goto out; 414 } 415 416 ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr * 417 sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP); 418 if (ic->i_sends == NULL) { 419 ret = -ENOMEM; 420 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 421 "send allocation failed: %d", ret); 422 goto out; 423 } 424 (void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr * 425 sizeof (struct rdsv3_ib_send_work)); 426 427 ic->i_send_wrs = 428 kmem_alloc(RDSV3_IB_SEND_WRS * (sizeof (ibt_send_wr_t) + 429 RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP); 430 if (ic->i_send_wrs == NULL) { 431 ret = -ENOMEM; 432 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 433 "WR allocation failed: %d", ret); 434 goto out; 435 } 436 sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs + 437 (RDSV3_IB_SEND_WRS * sizeof (ibt_send_wr_t))); 438 RDSV3_DPRINTF4("rdsv3_ib_setup_qp", "i_send_wrs: %p sgl: %p", 439 ic->i_send_wrs, sgl); 440 for (i = 0; i < RDSV3_IB_SEND_WRS; i++) { 441 wrp = &ic->i_send_wrs[i]; 442 wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE]; 443 } 444 445 ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr * 446 sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP); 447 if (ic->i_recvs == NULL) { 448 ret = -ENOMEM; 449 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", 450 "recv allocation failed: %d", ret); 451 goto out; 452 } 453 (void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * 454 sizeof (struct rdsv3_ib_recv_work)); 455 456 rdsv3_ib_recv_init_ack(ic); 457 458 RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p %p", 459 conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq); 460 461 out: 462 return (ret); 463 } 464 465 static uint32_t 466 rdsv3_ib_protocol_compatible(struct rdma_cm_event *event) 467 { 468 const struct rdsv3_ib_connect_private *dp = 469 event->param.conn.private_data; 470 uint16_t common; 471 uint32_t version = 0; 472 473 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p", 474 event); 475 476 /* 477 * rdma_cm private data is odd - when there is any private data in the 478 * request, we will be given a pretty large buffer without telling us 479 * the 480 * original size. The only way to tell the difference is by looking at 481 * the contents, which are initialized to zero. 482 * If the protocol version fields aren't set, 483 * this is a connection attempt 484 * from an older version. This could could be 3.0 or 2.0 - 485 * we can't tell. 486 * We really should have changed this for OFED 1.3 :-( 487 */ 488 489 /* Be paranoid. RDS always has privdata */ 490 if (!event->param.conn.private_data_len) { 491 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", 492 "RDS incoming connection has no private data, rejecting"); 493 return (0); 494 } 495 496 /* Even if len is crap *now* I still want to check it. -ASG */ 497 if (event->param.conn.private_data_len < sizeof (*dp) || 498 dp->dp_protocol_major == 0) 499 return (RDS_PROTOCOL_3_0); 500 501 common = ntohs(dp->dp_protocol_minor_mask) & 502 RDSV3_IB_SUPPORTED_PROTOCOLS; 503 if (dp->dp_protocol_major == 3 && common) { 504 version = RDS_PROTOCOL_3_0; 505 while ((common >>= 1) != 0) 506 version++; 507 } else { 508 RDSV3_DPRINTF0("rdsv3_ib_protocol_compatible", 509 "RDS: Connection from %u.%u.%u.%u using " 510 "incompatible protocol version %u.%u\n", 511 NIPQUAD(dp->dp_saddr), 512 dp->dp_protocol_major, 513 dp->dp_protocol_minor); 514 } 515 516 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p", 517 event); 518 519 return (version); 520 } 521 522 int 523 rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 524 struct rdma_cm_event *event) 525 { 526 uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id; 527 uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id; 528 const struct rdsv3_ib_connect_private *dp = 529 event->param.conn.private_data; 530 struct rdsv3_ib_connect_private dp_rep; 531 struct rdsv3_connection *conn = NULL; 532 struct rdsv3_ib_connection *ic = NULL; 533 struct rdma_conn_param conn_param; 534 uint32_t version; 535 int err, destroy = 1; 536 boolean_t conn_created = B_FALSE; 537 538 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 539 "Enter cm_id: %p event: %p", cm_id, event); 540 541 /* Check whether the remote protocol version matches ours. */ 542 version = rdsv3_ib_protocol_compatible(event); 543 if (!version) { 544 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 545 "version mismatch"); 546 goto out; 547 } 548 549 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 550 "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid " 551 "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr), 552 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 553 (unsigned long long)ntohll(lguid), 554 (unsigned long long)ntohll(fguid)); 555 556 conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr, 557 &rdsv3_ib_transport, KM_NOSLEEP); 558 if (IS_ERR(conn)) { 559 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 560 "rdsv3_conn_create failed (%ld)", PTR_ERR(conn)); 561 conn = NULL; 562 goto out; 563 } 564 565 /* 566 * The connection request may occur while the 567 * previous connection exist, e.g. in case of failover. 568 * But as connections may be initiated simultaneously 569 * by both hosts, we have a random backoff mechanism - 570 * see the comment above rdsv3_queue_reconnect() 571 */ 572 mutex_enter(&conn->c_cm_lock); 573 if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, 574 RDSV3_CONN_CONNECTING)) { 575 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 576 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 577 "incoming connect when connected: %p", 578 conn); 579 rdsv3_conn_drop(conn); 580 rdsv3_ib_stats_inc(s_ib_listen_closed_stale); 581 mutex_exit(&conn->c_cm_lock); 582 goto out; 583 } else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) { 584 /* Wait and see - our connect may still be succeeding */ 585 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 586 "peer-to-peer connection request: %p, " 587 "lguid: 0x%llx fguid: 0x%llx", 588 conn, lguid, fguid); 589 rdsv3_ib_stats_inc(s_ib_connect_raced); 590 } 591 mutex_exit(&conn->c_cm_lock); 592 goto out; 593 } 594 595 ic = conn->c_transport_data; 596 597 rdsv3_ib_set_protocol(conn, version); 598 rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit)); 599 600 /* 601 * If the peer gave us the last packet it saw, process this as if 602 * we had received a regular ACK. 603 */ 604 if (dp->dp_ack_seq) 605 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL); 606 607 ASSERT(!cm_id->context); 608 ASSERT(!ic->i_cm_id); 609 610 if (ic->i_cm_id != NULL) 611 RDSV3_PANIC(); 612 613 ic->i_cm_id = cm_id; 614 cm_id->context = conn; 615 616 /* 617 * We got halfway through setting up the ib_connection, if we 618 * fail now, we have to take the long route out of this mess. 619 */ 620 destroy = 0; 621 622 err = rdsv3_ib_setup_qp(conn); 623 if (err) { 624 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 625 "rdsv3_ib_setup_qp failed (%d)", err); 626 rdsv3_conn_drop(conn); 627 goto out; 628 } 629 630 rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); 631 632 /* rdma_accept() calls rdma_reject() internally if it fails */ 633 err = rdma_accept(cm_id, &conn_param); 634 mutex_exit(&conn->c_cm_lock); 635 if (err) { 636 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 637 "rdma_accept failed (%d)", err); 638 rdsv3_conn_drop(conn); 639 goto out; 640 } 641 642 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect", 643 "Return cm_id: %p event: %p", cm_id, event); 644 645 return (0); 646 647 out: 648 (void) rdma_reject(cm_id, NULL, 0); 649 return (destroy); 650 } 651 652 653 int 654 rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) 655 { 656 struct rdsv3_connection *conn = cm_id->context; 657 struct rdsv3_ib_connection *ic = conn->c_transport_data; 658 struct rdma_conn_param conn_param; 659 struct rdsv3_ib_connect_private dp; 660 int ret; 661 662 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p", 663 cm_id); 664 665 /* 666 * If the peer doesn't do protocol negotiation, we must 667 * default to RDSv3.0 668 */ 669 rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0); 670 ic->i_flowctl = 671 rdsv3_ib_sysctl_flow_control; /* advertise flow control */ 672 673 ret = rdsv3_ib_setup_qp(conn); 674 if (ret) { 675 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 676 "rdsv3_ib_setup_qp failed (%d)", ret); 677 rdsv3_conn_drop(conn); 678 goto out; 679 } 680 681 (void) rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp, 682 RDS_PROTOCOL_VERSION); 683 684 ret = rdma_connect(cm_id, &conn_param); 685 if (ret) { 686 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 687 "rdma_connect failed (%d)", ret); 688 rdsv3_conn_drop(conn); 689 } 690 691 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", 692 "Return: cm_id: %p", cm_id); 693 694 out: 695 /* 696 * Beware - returning non-zero tells the rdma_cm to destroy 697 * the cm_id. We should certainly not do it as long as we still 698 * "own" the cm_id. 699 */ 700 if (ret) { 701 if (ic->i_cm_id == cm_id) 702 ret = 0; 703 } 704 return (ret); 705 } 706 707 int 708 rdsv3_ib_conn_connect(struct rdsv3_connection *conn) 709 { 710 struct rdsv3_ib_connection *ic = conn->c_transport_data; 711 struct sockaddr_in src, dest; 712 ipaddr_t laddr, faddr; 713 int ret; 714 715 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn); 716 717 /* 718 * XXX I wonder what affect the port space has 719 */ 720 /* delegate cm event handler to rdma_transport */ 721 ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn, 722 RDMA_PS_TCP); 723 if (IS_ERR(ic->i_cm_id)) { 724 ret = PTR_ERR(ic->i_cm_id); 725 ic->i_cm_id = NULL; 726 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", 727 "rdma_create_id() failed: %d", ret); 728 goto out; 729 } 730 731 RDSV3_DPRINTF3("rdsv3_ib_conn_connect", 732 "created cm id %p for conn %p", ic->i_cm_id, conn); 733 734 /* The ipaddr should be in the network order */ 735 laddr = conn->c_laddr; 736 faddr = conn->c_faddr; 737 ret = rdsv3_sc_path_lookup(&laddr, &faddr); 738 if (ret == 0) { 739 RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)", 740 ntohl(laddr), ntohl(faddr)); 741 } 742 743 src.sin_family = AF_INET; 744 src.sin_addr.s_addr = (uint32_t)laddr; 745 src.sin_port = (uint16_t)htons(0); 746 747 dest.sin_family = AF_INET; 748 dest.sin_addr.s_addr = (uint32_t)faddr; 749 dest.sin_port = (uint16_t)htons(RDSV3_PORT); 750 751 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 752 (struct sockaddr *)&dest, 753 RDSV3_RDMA_RESOLVE_TIMEOUT_MS); 754 if (ret) { 755 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", 756 "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret); 757 rdma_destroy_id(ic->i_cm_id); 758 ic->i_cm_id = NULL; 759 } 760 761 RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn); 762 763 out: 764 return (ret); 765 } 766 767 /* 768 * This is so careful about only cleaning up resources that were built up 769 * so that it can be called at any point during startup. In fact it 770 * can be called multiple times for a given connection. 771 */ 772 void 773 rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) 774 { 775 struct rdsv3_ib_connection *ic = conn->c_transport_data; 776 int err = 0; 777 778 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 779 "cm %p pd %p cq %p %p qp %p", ic->i_cm_id, 780 ic->i_pd, ic->i_send_cq, ic->i_recv_cq, 781 ic->i_cm_id ? ic->i_cm_id->qp : NULL); 782 783 if (ic->i_cm_id) { 784 struct ib_device *dev = ic->i_cm_id->device; 785 786 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 787 "disconnecting cm %p", ic->i_cm_id); 788 err = rdma_disconnect(ic->i_cm_id); 789 if (err) { 790 /* 791 * Actually this may happen quite frequently, when 792 * an outgoing connect raced with an incoming connect. 793 */ 794 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", 795 "failed to disconnect, cm: %p err %d", 796 ic->i_cm_id, err); 797 } 798 799 if (ic->i_cm_id->qp) { 800 (void) ibt_flush_qp( 801 ib_get_ibt_channel_hdl(ic->i_cm_id)); 802 803 /* wait until all WRs are flushed */ 804 rdsv3_wait_event(rdsv3_ib_ring_empty_wait, 805 rdsv3_ib_ring_empty(&ic->i_send_ring) && 806 rdsv3_ib_ring_empty(&ic->i_recv_ring)); 807 808 rdma_destroy_qp(ic->i_cm_id); 809 } 810 811 812 if (ic->i_mr) 813 rdsv3_ib_free_hdrs(dev, ic); 814 815 if (ic->i_sends) 816 rdsv3_ib_send_clear_ring(ic); 817 if (ic->i_recvs) 818 rdsv3_ib_recv_clear_ring(ic); 819 820 if (ic->i_send_cq) 821 (void) ib_destroy_cq(ic->i_send_cq); 822 if (ic->i_recv_cq) 823 (void) ib_destroy_cq(ic->i_recv_cq); 824 rdma_destroy_id(ic->i_cm_id); 825 826 /* 827 * Move connection back to the nodev list. 828 */ 829 if (ic->rds_ibdev) 830 rdsv3_ib_remove_conn(ic->rds_ibdev, conn); 831 832 ic->i_cm_id = NULL; 833 ic->i_pd = NULL; 834 ic->i_mr = NULL; 835 ic->i_send_cq = NULL; 836 ic->i_recv_cq = NULL; 837 ic->i_send_hdrs = NULL; 838 ic->i_recv_hdrs = NULL; 839 ic->i_ack = NULL; 840 } 841 ASSERT(!ic->rds_ibdev); 842 843 /* Clear pending transmit */ 844 if (ic->i_rm) { 845 rdsv3_message_put(ic->i_rm); 846 ic->i_rm = NULL; 847 } 848 849 /* Clear the ACK state */ 850 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); 851 ic->i_ack_next = 0; 852 ic->i_ack_recv = 0; 853 854 /* Clear flow control state */ 855 ic->i_flowctl = 0; 856 ic->i_credits = 0; 857 858 rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr); 859 rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr); 860 861 if (ic->i_ibinc) { 862 rdsv3_inc_put(&ic->i_ibinc->ii_inc); 863 ic->i_ibinc = NULL; 864 } 865 866 if (ic->i_sends) { 867 kmem_free(ic->i_sends, 868 ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work)); 869 ic->i_sends = NULL; 870 } 871 if (ic->i_send_wrs) { 872 kmem_free(ic->i_send_wrs, RDSV3_IB_SEND_WRS * 873 (sizeof (ibt_send_wr_t) + 874 RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t))); 875 ic->i_send_wrs = NULL; 876 } 877 if (ic->i_recvs) { 878 kmem_free(ic->i_recvs, 879 ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work)); 880 ic->i_recvs = NULL; 881 } 882 883 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn); 884 } 885 886 /* 887 * the connection can be allocated from either rdsv3_conn_create_outgoing() 888 * or rdsv3_conn_create(), so ddi_taskq_create() can be called with the 889 * same string. This can print the kstat warning on the console. To prevent 890 * it, this counter value is used. 891 * Note that requests from rdsv3_conn_create_outgoing() refers to the cached 892 * value with the mutex lock before it allocates the connection, so that 893 * the warning cannot be produced in the case. (only between 894 * rdsv3_conn_create() and rdsv3_conn_create_outgoing(). 895 */ 896 static int conn_cnt; 897 898 /* ARGSUSED */ 899 int 900 rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp) 901 { 902 struct rdsv3_ib_connection *ic; 903 char tq_name[TASKQ_NAMELEN]; 904 905 RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn); 906 907 /* XXX too lazy? */ 908 ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp); 909 if (ic == NULL) 910 return (-ENOMEM); 911 912 list_link_init(&ic->ib_node); 913 (void) snprintf(tq_name, TASKQ_NAMELEN, "RDSV3_CONN_to_%x:%u", 914 htonl(conn->c_faddr), conn_cnt++ % 100); 915 ic->i_recv_tasklet = 916 ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0); 917 918 919 mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL); 920 mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL); 921 922 /* 923 * rdsv3_ib_conn_shutdown() waits for these to be emptied so they 924 * must be initialized before it can be called. 925 */ 926 rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr); 927 rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr); 928 929 ic->conn = conn; 930 conn->c_transport_data = ic; 931 932 mutex_enter(&ib_nodev_conns_lock); 933 list_insert_tail(&ib_nodev_conns, ic); 934 mutex_exit(&ib_nodev_conns_lock); 935 936 937 RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p", 938 conn, conn->c_transport_data); 939 return (0); 940 } 941 942 /* 943 * Free a connection. Connection must be shut down and not set for reconnect. 944 */ 945 void 946 rdsv3_ib_conn_free(void *arg) 947 { 948 struct rdsv3_ib_connection *ic = arg; 949 kmutex_t *lock_ptr; 950 951 RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic); 952 953 #ifndef __lock_lint 954 /* 955 * Conn is either on a dev's list or on the nodev list. 956 * A race with shutdown() or connect() would cause problems 957 * (since rds_ibdev would change) but that should never happen. 958 */ 959 lock_ptr = ic->rds_ibdev ? 960 &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; 961 962 mutex_enter(lock_ptr); 963 list_remove_node(&ic->ib_node); 964 mutex_exit(lock_ptr); 965 #endif 966 967 ddi_taskq_destroy(ic->i_recv_tasklet); 968 kmem_free(ic, sizeof (*ic)); 969 } 970 971 /* 972 * An error occurred on the connection 973 */ 974 void 975 __rdsv3_ib_conn_error(struct rdsv3_connection *conn) 976 { 977 rdsv3_conn_drop(conn); 978 } 979