/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/rds.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/rdma.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

static void
rdsv3_ib_send_rdma_complete(struct rdsv3_message *rm,
    int wc_status)
{
	int notify_status;

	RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d",
	    rm, wc_status);

	switch (wc_status) {
	case IBT_WC_WR_FLUSHED_ERR:
		return;

	case IBT_WC_SUCCESS:
		notify_status = RDSV3_RDMA_SUCCESS;
		break;

	case IBT_WC_REMOTE_ACCESS_ERR:
		notify_status = RDSV3_RDMA_REMOTE_ERROR;
		break;

	default:
		notify_status = RDSV3_RDMA_OTHER_ERROR;
		break;
	}
	rdsv3_rdma_send_complete(rm, notify_status);

	RDSV3_DPRINTF4("rdsv3_ib_send_rdma_complete", "rm: %p, wc_status: %d",
	    rm, wc_status);
}

static void rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev,
    uint_t num, struct rdsv3_rdma_sg scat[]);

void
rdsv3_ib_send_unmap_rdma(struct rdsv3_ib_connection *ic,
    struct rdsv3_rdma_op *op)
{
	RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rdma", "ic: %p, op: %p", ic, op);
	if (op->r_mapped) {
		op->r_mapped = 0;
		if (ic->i_cm_id) {
			rdsv3_ib_dma_unmap_sg_rdma(ic->i_cm_id->device,
			    op->r_nents, op->r_rdma_sg);
		} else {
			rdsv3_ib_dma_unmap_sg_rdma((struct ib_device *)NULL,
			    op->r_nents, op->r_rdma_sg);
		}
	}
}

static void
rdsv3_ib_send_unmap_rm(struct rdsv3_ib_connection *ic,
    struct rdsv3_ib_send_work *send,
    int wc_status)
{
	struct rdsv3_message *rm = send->s_rm;

	RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rm", "ic %p send %p rm %p\n",
	    ic, send, rm);

	mutex_enter(&rm->m_rs_lock);
	if (rm->m_count) {
		rdsv3_ib_dma_unmap_sg(ic->i_cm_id->device,
		    rm->m_sg, rm->m_count);
		rm->m_count = 0;
	}
	mutex_exit(&rm->m_rs_lock);

	if (rm->m_rdma_op != NULL) {
		rdsv3_ib_send_unmap_rdma(ic, rm->m_rdma_op);

		/*
		 * If the user asked for a completion notification on this
		 * message, we can implement three different semantics:
		 * 1. Notify when we received the ACK on the RDS message
		 *    that was queued with the RDMA. This provides reliable
		 *    notification of RDMA status at the expense of a one-way
		 *    packet delay.
		 * 2. Notify when the IB stack gives us the completion
		 *    event for the RDMA operation.
		 * 3. Notify when the IB stack gives us the completion
		 *    event for the accompanying RDS messages.
		 * Here, we implement approach #3. To implement approach #2,
		 * call rdsv3_rdma_send_complete from the cq_handler. To
		 * implement #1, don't call rdsv3_rdma_send_complete at all,
		 * and fall back to the notify handling in the ACK processing
		 * code.
		 *
		 * Note: There's no need to explicitly sync any RDMA buffers
		 * using ib_dma_sync_sg_for_cpu - the completion for the RDMA
		 * operation itself unmapped the RDMA buffers, which takes
		 * care of synching.
		 */
		rdsv3_ib_send_rdma_complete(rm, wc_status);

		if (rm->m_rdma_op->r_write)
			rdsv3_stats_add(s_send_rdma_bytes,
			    rm->m_rdma_op->r_bytes);
		else
			rdsv3_stats_add(s_recv_rdma_bytes,
			    rm->m_rdma_op->r_bytes);
	}

	/*
	 * If anyone waited for this message to get flushed out, wake
	 * them up now
	 */
	rdsv3_message_unmapped(rm);

	rdsv3_message_put(rm);
	send->s_rm = NULL;
}

void
rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic)
{
	struct rdsv3_ib_send_work *send;
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_send_init_ring", "ic: %p", ic);

	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		send->s_rm = NULL;
		send->s_op = NULL;
	}
}

void
rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic)
{
	struct rdsv3_ib_send_work *send;
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "ic: %p", ic);

	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		if (send->s_opcode == 0xdd)
			continue;
		if (send->s_rm)
			rdsv3_ib_send_unmap_rm(ic, send, IBT_WC_WR_FLUSHED_ERR);
		if (send->s_op)
			rdsv3_ib_send_unmap_rdma(ic, send->s_op);
	}

	RDSV3_DPRINTF4("rdsv3_ib_send_clear_ring", "Return: ic: %p", ic);
}

/*
 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
 * operations performed in the send path. As the sender allocs and potentially
 * unallocs the next free entry in the ring it doesn't alter which is the
 * next to be freed, which is what this is concerned with.
 */
void
rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc)
{
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_send_work *send;
	uint32_t completed, polled;
	uint32_t oldest;
	uint32_t i = 0;
	int ret;

	RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler",
	    "wc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
	    (unsigned long long)wc->wc_id, wc->wc_status,
	    wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));

	rdsv3_ib_stats_inc(s_ib_tx_cq_event);

	if (wc->wc_id == RDSV3_IB_ACK_WR_ID) {
		if (ic->i_ack_queued + HZ/2 < jiffies)
			rdsv3_ib_stats_inc(s_ib_tx_stalled);
		rdsv3_ib_ack_send_complete(ic);
		return;
	}

	oldest = rdsv3_ib_ring_oldest(&ic->i_send_ring);

	completed = rdsv3_ib_ring_completed(&ic->i_send_ring,
	    (wc->wc_id & ~RDSV3_IB_SEND_OP), oldest);

	for (i = 0; i < completed; i++) {
		send = &ic->i_sends[oldest];

		/*
		 * In the error case, wc->opcode sometimes contains
		 * garbage
		 */
		switch (send->s_opcode) {
		case IBT_WRC_SEND:
			if (send->s_rm)
				rdsv3_ib_send_unmap_rm(ic, send,
				    wc->wc_status);
			break;
		case IBT_WRC_RDMAW:
		case IBT_WRC_RDMAR:
			/*
			 * Nothing to be done - the SG list will be unmapped
			 * when the SEND completes.
			 */
			break;
		default:
#ifndef __lock_lint
			RDSV3_DPRINTF2("rdsv3_ib_send_cqe_handler",
			    "RDS/IB: %s: unexpected opcode 0x%x in WR!",
			    __func__, send->s_opcode);
#endif
			break;
		}

		send->s_opcode = 0xdd;
		if (send->s_queued + HZ/2 < jiffies)
			rdsv3_ib_stats_inc(s_ib_tx_stalled);

		/*
		 * If an RDMA operation produced an error, signal this right
		 * away. If we don't, the subsequent SEND that goes with this
		 * RDMA will be canceled with ERR_WFLUSH, and the application
		 * never learns that the RDMA failed.
		 */
		if (wc->wc_status ==
		    IBT_WC_REMOTE_ACCESS_ERR && send->s_op) {
			struct rdsv3_message *rm;

			rm = rdsv3_send_get_message(conn, send->s_op);
			if (rm) {
				if (rm->m_rdma_op != NULL)
					rdsv3_ib_send_unmap_rdma(ic,
					    rm->m_rdma_op);
				rdsv3_ib_send_rdma_complete(rm,
				    wc->wc_status);
				rdsv3_message_put(rm);
			}
		}

		oldest = (oldest + 1) % ic->i_send_ring.w_nr;
	}

	rdsv3_ib_ring_free(&ic->i_send_ring, completed);

	clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags);

	/* We expect errors as the qp is drained during shutdown */
	if (wc->wc_status != IBT_WC_SUCCESS && rdsv3_conn_up(conn)) {
		RDSV3_DPRINTF2("rdsv3_ib_send_cqe_handler",
		    "send completion on %u.%u.%u.%u "
		    "had status %u, disconnecting and reconnecting\n",
		    NIPQUAD(conn->c_faddr), wc->wc_status);
		rdsv3_conn_drop(conn);
	}

	RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler", "Return: conn: %p", ic);
}

/*
 * This is the main function for allocating credits when sending
 * messages.
 *
 * Conceptually, we have two counters:
 *  -	send credits: this tells us how many WRs we're allowed
 *	to submit without overrunning the receiver's queue. For
 *	each SEND WR we post, we decrement this by one.
 *
 *  -	posted credits: this tells us how many WRs we recently
 *	posted to the receive queue. This value is transferred
 *	to the peer as a "credit update" in an RDS header field.
 *	Every time we transmit credits to the peer, we subtract
 *	the amount of transferred credits from this counter.
 *
 * It is essential that we avoid situations where both sides have
 * exhausted their send credits, and are unable to send new credits
 * to the peer. We achieve this by requiring that we send at least
 * one credit update to the peer before exhausting our credits.
 * When new credits arrive, we subtract one credit that is withheld
 * until we've posted new buffers and are ready to transmit these
 * credits (see rdsv3_ib_send_add_credits below).
 *
 * The RDS send code is essentially single-threaded; rdsv3_send_xmit
 * grabs c_send_lock to ensure exclusive access to the send ring.
 * However, the ACK sending code is independent and can race with
 * message SENDs.
 *
 * In the send path, we need to update the counters for send credits
 * and the counter of posted buffers atomically - when we use the
 * last available credit, we cannot allow another thread to race us
 * and grab the posted credits counter. Hence, we have to use a
 * spinlock to protect the credit counter, or use atomics.
 *
 * Spinlocks shared between the send and the receive path are bad,
 * because they create unnecessary delays. An early implementation
 * using a spinlock showed a 5% degradation in throughput at some
 * loads.
 *
 * This implementation avoids spinlocks completely, putting both
 * counters into a single atomic, and updating that atomic using
 * atomic_add (in the receive path, when receiving fresh credits),
 * and using atomic_cmpxchg when updating the two counters.
 */
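/*
 * Illustrative sketch only, not compiled: one plausible layout for the
 * packed counter (an assumption for exposition; the authoritative
 * definitions are the IB_SET_*_CREDITS/IB_GET_*_CREDITS macros in the
 * RDS headers) is send credits in the low 16 bits and posted credits
 * in the high 16 bits of ic->i_credits. With that layout, the send
 * path consumes both counters in one lock-free retry loop:
 *
 *	oldval = newval = atomic_get(&ic->i_credits);
 *	newval -= IB_SET_SEND_CREDITS(got);		// take send credits
 *	newval -= IB_SET_POST_CREDITS(advertise);	// drain posted credits
 *	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
 *		retry from the top with the fresh value;
 *
 * while the receive path simply adds fresh credits with
 * atomic_add_32(&ic->i_credits, IB_SET_SEND_CREDITS(n)).
 */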
int
rdsv3_ib_send_grab_credits(struct rdsv3_ib_connection *ic,
    uint32_t wanted, uint32_t *adv_credits, int need_posted)
{
	unsigned int avail, posted, got = 0, advertise;
	long oldval, newval;

	RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d",
	    ic, wanted, *adv_credits, need_posted);

	*adv_credits = 0;
	if (!ic->i_flowctl)
		return (wanted);

try_again:
	advertise = 0;
	oldval = newval = atomic_get(&ic->i_credits);
	posted = IB_GET_POST_CREDITS(oldval);
	avail = IB_GET_SEND_CREDITS(oldval);

	RDSV3_DPRINTF5("rdsv3_ib_send_grab_credits",
	    "wanted (%u): credits=%u posted=%u\n", wanted, avail, posted);

	/* The last credit must be used to send a credit update. */
	if (avail && !posted)
		avail--;

	if (avail < wanted) {
		struct rdsv3_connection *conn = ic->i_cm_id->context;

		/* Oops, there aren't that many credits left! */
		set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags);
		got = avail;
	} else {
		/* Sometimes you get what you want, lalala. */
		got = wanted;
	}
	newval -= IB_SET_SEND_CREDITS(got);

	/*
	 * If need_posted is non-zero, then the caller wants the posted
	 * count advertised regardless of whether any send credits are
	 * available.
	 */
	if (posted && (got || need_posted)) {
		advertise = min(posted, RDSV3_MAX_ADV_CREDIT);
		newval -= IB_SET_POST_CREDITS(advertise);
	}

	/* Finally bill everything */
	if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
		goto try_again;

	*adv_credits = advertise;

	RDSV3_DPRINTF4("rdsv3_ib_send_grab_credits", "ic: %p, %d %d %d",
	    ic, got, *adv_credits, need_posted);

	return (got);
}

void
rdsv3_ib_send_add_credits(struct rdsv3_connection *conn, unsigned int credits)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	if (credits == 0)
		return;

	RDSV3_DPRINTF5("rdsv3_ib_send_add_credits",
	    "credits (%u): current=%u%s\n",
	    credits,
	    IB_GET_SEND_CREDITS(atomic_get(&ic->i_credits)),
	    test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) ?
	    ", ll_send_full" : "");

	atomic_add_32(&ic->i_credits, IB_SET_SEND_CREDITS(credits));
	if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags))
		rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0);

	ASSERT(!(IB_GET_SEND_CREDITS(credits) >= 16384));

	rdsv3_ib_stats_inc(s_ib_rx_credit_updates);

	RDSV3_DPRINTF4("rdsv3_ib_send_add_credits",
	    "Return: conn: %p, credits: %d",
	    conn, credits);
}

void
rdsv3_ib_advertise_credits(struct rdsv3_connection *conn, unsigned int posted)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF4("rdsv3_ib_advertise_credits", "conn: %p, posted: %d",
	    conn, posted);

	if (posted == 0)
		return;

	atomic_add_32(&ic->i_credits, IB_SET_POST_CREDITS(posted));

	/*
	 * Decide whether to send an update to the peer now.
	 * If we would send a credit update for every single buffer we
	 * post, we would end up with an ACK storm (ACK arrives,
	 * consumes buffer, we refill the ring, send ACK to remote
	 * advertising the newly posted buffer... ad inf)
	 *
	 * Performance pretty much depends on how often we send
	 * credit updates - too frequent updates mean lots of ACKs.
	 * Too infrequent updates, and the peer will run out of
	 * credits and have to throttle.
	 * For the time being, 16 seems to be a good compromise.
	 */
	if (IB_GET_POST_CREDITS(atomic_get(&ic->i_credits)) >= 16)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}

static inline void
rdsv3_ib_xmit_populate_wr(struct rdsv3_ib_connection *ic,
    ibt_send_wr_t *wr, unsigned int pos,
    struct rdsv3_scatterlist *scat, unsigned int off, unsigned int length,
    int send_flags)
{
	ibt_wr_ds_t *sge;

	RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr",
	    "ic: %p, wr: %p scat: %p %d %d %d %d",
	    ic, wr, scat, pos, off, length, send_flags);

	wr->wr_id = pos | RDSV3_IB_SEND_OP;
	wr->wr_trans = IBT_RC_SRV;
	wr->wr_flags = send_flags;
	wr->wr_opcode = IBT_WRC_SEND;

	if (length != 0) {
		int ix, len, assigned;
		ibt_wr_ds_t *sgl;

		ASSERT(length <= scat->length - off);

		sgl = scat->sgl;
		if (off != 0) {
			/* find the right sgl to begin with */
			while (sgl->ds_len <= off) {
				off -= sgl->ds_len;
				sgl++;
			}
		}

		ix = 1; /* first data sgl is at 1 */
		assigned = 0;
		len = length;
		do {
			sge = &wr->wr_sgl[ix++];
			sge->ds_va = sgl->ds_va + off;
			assigned = min(len, sgl->ds_len - off);
			sge->ds_len = assigned;
			sge->ds_key = sgl->ds_key;
			len -= assigned;
			if (len != 0) {
				sgl++;
				off = 0;
			}
		} while (len > 0);

		wr->wr_nds = ix;
	} else {
		/*
		 * We're sending a packet with no payload. There is only
		 * one SGE
		 */
		wr->wr_nds = 1;
	}

	sge = &wr->wr_sgl[0];
	sge->ds_va = ic->i_send_hdrs_dma + (pos * sizeof (struct rdsv3_header));
	sge->ds_len = sizeof (struct rdsv3_header);
	sge->ds_key = ic->i_mr->lkey;

	RDSV3_DPRINTF4("rdsv3_ib_xmit_populate_wr",
	    "Return: ic: %p, wr: %p scat: %p", ic, wr, scat);
}

/*
 * This can be called multiple times for a given message. The first time
 * we see a message we map its scatterlist into the IB device so that
 * we can provide that mapped address to the IB scatter gather entries
 * in the IB work requests. We translate the scatterlist into a series
 * of work requests that fragment the message. These work requests complete
 * in order so we pass ownership of the message to the completion handler
 * once we send the final fragment.
 *
 * The RDS core uses the c_send_lock to only enter this function once
 * per connection. This makes sure that the tx ring alloc/unalloc pairs
 * don't get out of sync and confuse the ring.
 */
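/*
 * Worked example for illustration (the 4 KB figure is an assumption for
 * the example, not taken from the headers): the payload is carved into
 * RDSV3_FRAG_SIZE-sized SEND work requests, so the number of ring slots
 * needed is the ceiling of h_len over the fragment size, with a single
 * slot reserved for a zero-length message. With a 4096-byte fragment
 * size, a 10000-byte payload maps to ceil(10000 / 4096) = 3 work
 * requests; each work request carries its data fragment in
 * wr_sgl[1..n] and the RDS header in wr_sgl[0], as set up by
 * rdsv3_ib_xmit_populate_wr above.
 */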
int
rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm,
    unsigned int hdr_off, unsigned int sg, unsigned int off)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct ib_device *dev = ic->i_cm_id->device;
	struct rdsv3_ib_send_work *send = NULL;
	struct rdsv3_ib_send_work *first;
	struct rdsv3_ib_send_work *prev;
	ibt_send_wr_t *wr;
	struct rdsv3_scatterlist *scat;
	uint32_t pos;
	uint32_t i;
	uint32_t work_alloc;
	uint32_t credit_alloc;
	uint32_t posted;
	uint32_t adv_credits = 0;
	int send_flags = 0;
	int sent;
	int ret;
	int flow_controlled = 0;

	RDSV3_DPRINTF4("rdsv3_ib_xmit", "conn: %p, rm: %p", conn, rm);

	ASSERT(!(off % RDSV3_FRAG_SIZE));
	ASSERT(!(hdr_off != 0 && hdr_off != sizeof (struct rdsv3_header)));

	/* Do not send cong updates to IB loopback */
	if (conn->c_loopback &&
	    rm->m_inc.i_hdr.h_flags & RDSV3_FLAG_CONG_BITMAP) {
		rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0);
		return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES);
	}

#ifndef __lock_lint
	/* FIXME we may overallocate here */
	if (ntohl(rm->m_inc.i_hdr.h_len) == 0)
		i = 1;
	else
		i = ceil(ntohl(rm->m_inc.i_hdr.h_len), RDSV3_FRAG_SIZE);
#endif

	work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, i, &pos);
	if (work_alloc != i) {
		rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags);
		rdsv3_ib_stats_inc(s_ib_tx_ring_full);
		ret = -ENOMEM;
		goto out;
	}

	credit_alloc = work_alloc;
	if (ic->i_flowctl) {
		credit_alloc = rdsv3_ib_send_grab_credits(ic, work_alloc,
		    &posted, 0);
		adv_credits += posted;
		if (credit_alloc < work_alloc) {
			rdsv3_ib_ring_unalloc(&ic->i_send_ring,
			    work_alloc - credit_alloc);
			work_alloc = credit_alloc;
			flow_controlled++;
		}
		if (work_alloc == 0) {
			rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
			rdsv3_ib_stats_inc(s_ib_tx_throttle);
			ret = -ENOMEM;
			goto out;
		}
	}

	/* map the message the first time we see it */
	if (ic->i_rm == NULL) {
		/*
		 * printk(KERN_NOTICE
		 * "rdsv3_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
		 * be16_to_cpu(rm->m_inc.i_hdr.h_dport),
		 * rm->m_inc.i_hdr.h_flags,
		 * be32_to_cpu(rm->m_inc.i_hdr.h_len));
		 */
		if (rm->m_nents) {
			rm->m_count = rdsv3_ib_dma_map_sg(dev,
			    rm->m_sg, rm->m_nents);
			RDSV3_DPRINTF5("rdsv3_ib_xmit",
			    "ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
			if (rm->m_count == 0) {
				rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure);
				rdsv3_ib_ring_unalloc(&ic->i_send_ring,
				    work_alloc);
				ret = -ENOMEM; /* XXX ? */
				RDSV3_DPRINTF2("rdsv3_ib_xmit",
				    "fail: ic %p mapping rm %p: %d\n",
				    ic, rm, rm->m_count);
				goto out;
			}
		} else {
			rm->m_count = 0;
		}

		ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs;
		ic->i_unsignaled_bytes = rdsv3_ib_sysctl_max_unsig_bytes;
		rdsv3_message_addref(rm);
		ic->i_rm = rm;

		/* Finalize the header */
		if (test_bit(RDSV3_MSG_ACK_REQUIRED, &rm->m_flags))
			rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_ACK_REQUIRED;
		if (test_bit(RDSV3_MSG_RETRANSMITTED, &rm->m_flags))
			rm->m_inc.i_hdr.h_flags |= RDSV3_FLAG_RETRANSMITTED;

		/*
		 * If it has an RDMA op, tell the peer we did it. This is
		 * used by the peer to release use-once RDMA MRs.
		 */
		if (rm->m_rdma_op) {
			struct rdsv3_ext_header_rdma ext_hdr;

			ext_hdr.h_rdma_rkey = htonl(rm->m_rdma_op->r_key);
			(void) rdsv3_message_add_extension(&rm->m_inc.i_hdr,
			    RDSV3_EXTHDR_RDMA, &ext_hdr,
			    sizeof (ext_hdr));
		}
		if (rm->m_rdma_cookie) {
			(void) rdsv3_message_add_rdma_dest_extension(
			    &rm->m_inc.i_hdr,
			    rdsv3_rdma_cookie_key(rm->m_rdma_cookie),
			    rdsv3_rdma_cookie_offset(rm->m_rdma_cookie));
		}

		/*
		 * Note - rdsv3_ib_piggyb_ack clears the ACK_REQUIRED bit, so
		 * we should not do this unless we have a chance of at least
		 * sticking the header into the send ring. Which is why we
		 * should call rdsv3_ib_ring_alloc first.
		 */
		rm->m_inc.i_hdr.h_ack = htonll(rdsv3_ib_piggyb_ack(ic));
		rdsv3_message_make_checksum(&rm->m_inc.i_hdr);

		/*
		 * Update adv_credits since we reset the ACK_REQUIRED bit.
		 */
		(void) rdsv3_ib_send_grab_credits(ic, 0, &posted, 1);
		adv_credits += posted;
		ASSERT(adv_credits <= 255);
	}

	send = &ic->i_sends[pos];
	first = send;
	prev = NULL;
	scat = &rm->m_sg[sg];
	sent = 0;
	i = 0;

	/*
	 * Sometimes you want to put a fence between an RDMA READ and the
	 * following SEND. We could either do this all the time or when
	 * requested by the user. Right now, we let the application choose.
	 */
	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
		send_flags = IBT_WR_SEND_FENCE;

	/*
	 * We could be copying the header into the unused tail of the page.
	 * That would need to be changed in the future when those pages might
	 * be mapped userspace pages or page cache pages. So instead we always
	 * use a second sge and our long-lived ring of mapped headers. We send
	 * the header after the data so that the data payload can be aligned on
	 * the receiver.
	 */

	/* handle a 0-len message */
	if (ntohl(rm->m_inc.i_hdr.h_len) == 0) {
		wr = &ic->i_send_wrs[0];
		rdsv3_ib_xmit_populate_wr(ic, wr, pos, NULL, 0, 0, send_flags);
		send->s_queued = jiffies;
		send->s_op = NULL;
		send->s_opcode = wr->wr_opcode;
		goto add_header;
	}

	/* if there's data reference it with a chain of work reqs */
	for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
		unsigned int len;

		send = &ic->i_sends[pos];

		wr = &ic->i_send_wrs[i];
		len = min(RDSV3_FRAG_SIZE,
		    rdsv3_ib_sg_dma_len(dev, scat) - off);
		rdsv3_ib_xmit_populate_wr(ic, wr, pos, scat, off, len,
		    send_flags);
		send->s_queued = jiffies;
		send->s_op = NULL;
		send->s_opcode = wr->wr_opcode;

		/*
		 * We want to delay signaling completions just enough to get
		 * the batching benefits but not so much that we create dead
		 * time on the wire.
		 */
		if (ic->i_unsignaled_wrs-- == 0) {
			ic->i_unsignaled_wrs = rdsv3_ib_sysctl_max_unsig_wrs;
			wr->wr_flags |=
			    IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
		}

		ic->i_unsignaled_bytes -= len;
		if (ic->i_unsignaled_bytes <= 0) {
			ic->i_unsignaled_bytes =
			    rdsv3_ib_sysctl_max_unsig_bytes;
			wr->wr_flags |=
			    IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
		}

		/*
		 * Always signal the last one if we're stopping due to flow
		 * control.
		 */
		if (flow_controlled && i == (work_alloc-1)) {
			wr->wr_flags |=
			    IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
		}

		RDSV3_DPRINTF5("rdsv3_ib_xmit", "send %p wr %p num_sge %u \n",
		    send, wr, wr->wr_nds);

		sent += len;
		off += len;
		if (off == rdsv3_ib_sg_dma_len(dev, scat)) {
			scat++;
			off = 0;
		}

add_header:
		/*
		 * Tack on the header after the data. The header SGE should
		 * already have been set up to point to the right header
		 * buffer.
		 */
		(void) memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
		    sizeof (struct rdsv3_header));

		if (0) {
			struct rdsv3_header *hdr = &ic->i_send_hdrs[pos];

			RDSV3_DPRINTF2("rdsv3_ib_xmit",
			    "send WR dport=%u flags=0x%x len=%d",
			    ntohs(hdr->h_dport),
			    hdr->h_flags,
			    ntohl(hdr->h_len));
		}
		if (adv_credits) {
			struct rdsv3_header *hdr = &ic->i_send_hdrs[pos];

			/* add credit and redo the header checksum */
			hdr->h_credit = adv_credits;
			rdsv3_message_make_checksum(hdr);
			adv_credits = 0;
			rdsv3_ib_stats_inc(s_ib_tx_credit_updates);
		}

		prev = send;

		pos = (pos + 1) % ic->i_send_ring.w_nr;
	}

	/*
	 * Account the RDS header in the number of bytes we sent, but just
	 * once. The caller has no concept of fragmentation.
	 */
	if (hdr_off == 0)
		sent += sizeof (struct rdsv3_header);

	/* if we finished the message then send completion owns it */
	if (scat == &rm->m_sg[rm->m_count]) {
		prev->s_rm = ic->i_rm;
		wr->wr_flags |= IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
		ic->i_rm = NULL;
	}

	if (i < work_alloc) {
		rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
		work_alloc = i;
	}
	if (ic->i_flowctl && i < credit_alloc)
		rdsv3_ib_send_add_credits(conn, credit_alloc - i);

	/* XXX need to worry about failed_wr and partial sends. */
	ret = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id),
	    ic->i_send_wrs, i, &posted);
	if (posted != i) {
		RDSV3_DPRINTF2("rdsv3_ib_xmit",
		    "ic %p first %p nwr: %d ret %d:%d",
		    ic, first, i, ret, posted);
	}
	if (ret) {
		RDSV3_DPRINTF2("rdsv3_ib_xmit",
		    "RDS/IB: ib_post_send to %u.%u.%u.%u "
		    "returned %d\n", NIPQUAD(conn->c_faddr), ret);
		rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		if (prev->s_rm) {
			ic->i_rm = prev->s_rm;
			prev->s_rm = NULL;
		}
		RDSV3_DPRINTF2("rdsv3_ib_xmit", "ibt_post_send failed\n");
		rdsv3_conn_drop(ic->conn);
		ret = -EAGAIN;
		goto out;
	}

	ret = sent;

	RDSV3_DPRINTF4("rdsv3_ib_xmit", "Return: conn: %p, rm: %p", conn, rm);
out:
	ASSERT(!adv_credits);
	return (ret);
}

static void
rdsv3_ib_dma_unmap_sg_rdma(struct ib_device *dev, uint_t num,
    struct rdsv3_rdma_sg scat[])
{
	ibt_hca_hdl_t hca_hdl;
	int i;
	int num_sgl;

	RDSV3_DPRINTF4("rdsv3_ib_dma_unmap_sg", "rdma_sg: %p", scat);

	if (dev) {
		hca_hdl = ib_get_ibt_hca_hdl(dev);
	} else {
		hca_hdl = scat[0].hca_hdl;
		RDSV3_DPRINTF2("rdsv3_ib_dma_unmap_sg_rdma",
		    "NULL dev use cached hca_hdl %p", hca_hdl);
	}

	if (hca_hdl == NULL)
		return;
	scat[0].hca_hdl = NULL;

	for (i = 0; i < num; i++) {
		if (scat[i].mihdl != NULL) {
			num_sgl = (scat[i].iovec.bytes / PAGESIZE) + 2;
			kmem_free(scat[i].swr.wr_sgl,
			    (num_sgl * sizeof (ibt_wr_ds_t)));
			scat[i].swr.wr_sgl = NULL;
			(void) ibt_unmap_mem_iov(hca_hdl, scat[i].mihdl);
			scat[i].mihdl = NULL;
		} else
			break;
	}
}

/* ARGSUSED */
uint_t
rdsv3_ib_dma_map_sg_rdma(struct ib_device *dev, struct rdsv3_rdma_sg scat[],
    uint_t num, struct rdsv3_scatterlist **scatl)
{
	ibt_hca_hdl_t hca_hdl;
	ibt_iov_attr_t iov_attr;
	struct buf *bp;
	uint_t i, j, k;
	uint_t count;
	struct rdsv3_scatterlist *sg;
	int ret;

	RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "scat: %p, num: %d",
	    scat, num);

	hca_hdl = ib_get_ibt_hca_hdl(dev);
	scat[0].hca_hdl = hca_hdl;
	bzero(&iov_attr, sizeof (ibt_iov_attr_t));
	iov_attr.iov_flags = IBT_IOV_BUF;
	iov_attr.iov_lso_hdr_sz = 0;

	for (i = 0, count = 0; i < num; i++) {
		/* transpose umem_cookie to buf structure */
		bp = ddi_umem_iosetup(scat[i].umem_cookie,
		    scat[i].iovec.addr & PAGEOFFSET, scat[i].iovec.bytes,
		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
		if (bp == NULL) {
			/* free resources and return error */
			goto out;
		}
		/* setup ibt_map_mem_iov() attributes */
		iov_attr.iov_buf = bp;
		iov_attr.iov_wr_nds = (scat[i].iovec.bytes / PAGESIZE) + 2;
		scat[i].swr.wr_sgl =
		    kmem_zalloc(iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t),
		    KM_SLEEP);

		ret = ibt_map_mem_iov(hca_hdl, &iov_attr,
		    (ibt_all_wr_t *)&scat[i].swr, &scat[i].mihdl);
		freerbuf(bp);
		if (ret != IBT_SUCCESS) {
			RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg_rdma",
			    "ibt_map_mem_iov returned: %d", ret);
			/* free resources and return error */
			kmem_free(scat[i].swr.wr_sgl,
			    iov_attr.iov_wr_nds * sizeof (ibt_wr_ds_t));
			goto out;
		}
		count += scat[i].swr.wr_nds;

#ifdef DEBUG
		for (j = 0; j < scat[i].swr.wr_nds; j++) {
			RDSV3_DPRINTF5("rdsv3_ib_dma_map_sg_rdma",
			    "sgl[%d] va %llx len %x", j,
			    scat[i].swr.wr_sgl[j].ds_va,
			    scat[i].swr.wr_sgl[j].ds_len);
		}
#endif
		RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma",
		    "iovec.bytes: 0x%x scat[%d]swr.wr_nds: %d",
		    scat[i].iovec.bytes, i, scat[i].swr.wr_nds);
	}

	count = ((count - 1) / RDSV3_IB_MAX_SGE) + 1;
	RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg_rdma", "Ret: num: %d", count);
	return (count);

out:
	rdsv3_ib_dma_unmap_sg_rdma(dev, num, scat);
	return (0);
}

int
rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_send_work *send = NULL;
	struct rdsv3_rdma_sg *scat;
	uint64_t remote_addr;
	uint32_t pos;
	uint32_t work_alloc;
	uint32_t i, j, k, idx;
	uint32_t left, count;
	uint32_t posted;
	int sent;
	ibt_status_t status;
	ibt_send_wr_t *wr;
	ibt_wr_ds_t *sge;

	RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "rdsv3_ib_conn: %p", ic);

	/* map the message the first time we see it */
	if (!op->r_mapped) {
		op->r_count = rdsv3_ib_dma_map_sg_rdma(ic->i_cm_id->device,
		    op->r_rdma_sg, op->r_nents, &op->r_sg);
		RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", "ic %p mapping op %p: %d",
		    ic, op, op->r_count);
		if (op->r_count == 0) {
			rdsv3_ib_stats_inc(s_ib_tx_sg_mapping_failure);
			RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma",
			    "fail: ic %p mapping op %p: %d",
			    ic, op, op->r_count);
			return (-ENOMEM); /* XXX ? */
		}
		op->r_mapped = 1;
	}

	/*
	 * Instead of knowing how to return a partial rdma read/write
	 * we insist that there be enough work requests to send the
	 * entire message.
	 */
	work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, op->r_count, &pos);
	if (work_alloc != op->r_count) {
		rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
		rdsv3_ib_stats_inc(s_ib_tx_ring_full);
		return (-ENOMEM);
	}

	RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "pos %u cnt %u", pos, op->r_count);
	/*
	 * take the scatter list and transpose into a list of
	 * send wr's each with a scatter list of RDSV3_IB_MAX_SGE
	 */
	scat = &op->r_rdma_sg[0];
	sent = 0;
	remote_addr = op->r_remote_addr;

	for (i = 0, k = 0; i < op->r_nents; i++) {
		left = scat[i].swr.wr_nds;
		for (idx = 0; left > 0; k++) {
			send = &ic->i_sends[pos];
			send->s_queued = jiffies;
			send->s_opcode = op->r_write ? IBT_WRC_RDMAW :
			    IBT_WRC_RDMAR;
			send->s_op = op;

			wr = &ic->i_send_wrs[k];
			wr->wr_flags = 0;
			wr->wr_id = pos | RDSV3_IB_SEND_OP;
			wr->wr_trans = IBT_RC_SRV;
			wr->wr_opcode = op->r_write ?
			    IBT_WRC_RDMAW : IBT_WRC_RDMAR;
			wr->wr.rc.rcwr.rdma.rdma_raddr = remote_addr;
			wr->wr.rc.rcwr.rdma.rdma_rkey = op->r_key;

			if (left > RDSV3_IB_MAX_SGE) {
				count = RDSV3_IB_MAX_SGE;
				left -= RDSV3_IB_MAX_SGE;
			} else {
				count = left;
				left = 0;
			}
			wr->wr_nds = count;

			for (j = 0; j < count; j++) {
				sge = &wr->wr_sgl[j];
				*sge = scat[i].swr.wr_sgl[idx];
				remote_addr += scat[i].swr.wr_sgl[idx].ds_len;
				sent += scat[i].swr.wr_sgl[idx].ds_len;
				idx++;
				RDSV3_DPRINTF5("xmit_rdma",
				    "send_wrs[%d]sgl[%d] va %llx len %x",
				    k, j, sge->ds_va, sge->ds_len);
			}
			RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma",
			    "wr[%d] %p key: %x code: %d tlen: %d",
			    k, wr, wr->wr.rc.rcwr.rdma.rdma_rkey,
			    wr->wr_opcode, sent);

			/*
			 * We want to delay signaling completions just enough
			 * to get the batching benefits but not so much that
			 * we create dead time on the wire.
			 */
			if (ic->i_unsignaled_wrs-- == 0) {
				ic->i_unsignaled_wrs =
				    rdsv3_ib_sysctl_max_unsig_wrs;
				wr->wr_flags = IBT_WR_SEND_SIGNAL;
			}

			pos = (pos + 1) % ic->i_send_ring.w_nr;
		}
	}

	status = ibt_post_send(ib_get_ibt_channel_hdl(ic->i_cm_id),
	    ic->i_send_wrs, k, &posted);
	if (status != IBT_SUCCESS) {
		RDSV3_DPRINTF2("rdsv3_ib_xmit_rdma",
		    "RDS/IB: rdma ib_post_send to %u.%u.%u.%u "
		    "returned %d", NIPQUAD(conn->c_faddr), status);
		rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
	}
	RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "Ret: %p", ic);
	return (status);
}

void
rdsv3_ib_xmit_complete(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;

	RDSV3_DPRINTF4("rdsv3_ib_xmit_complete", "conn: %p", conn);

	/*
	 * We may have a pending ACK or window update we were unable
	 * to send previously (due to flow control). Try again.
	 */
	rdsv3_ib_attempt_ack(ic);
}