/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib_recv.c
 * Oracle elects to have and use the contents of ib_recv.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/rds.h>
#include <sys/containerof.h>

#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

static struct kmem_cache *rdsv3_ib_incoming_slab;
static atomic_t	rdsv3_ib_allocation = ATOMIC_INIT(0);

void
rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic)
{
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_header *hdrp;
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ring", "ic: %p", ic);

	hdrp = ic->i_recv_hdrs;
	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
		recv->r_ibinc = NULL;
		recv->r_frag = NULL;

		/* initialize the hdr sgl permanently */
		recv->r_sge[0].ds_va = (ib_vaddr_t)(uintptr_t)hdrp++;
		recv->r_sge[0].ds_len = sizeof (struct rdsv3_header);
		recv->r_sge[0].ds_key = ic->i_mr->lkey;
	}
}

static void
rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic,
    struct rdsv3_ib_recv_work *recv)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "ic: %p, recv: %p",
	    ic, recv);

	if (recv->r_ibinc) {
		rdsv3_inc_put(&recv->r_ibinc->ii_inc);
		recv->r_ibinc = NULL;
	}

	if (recv->r_frag) {
		kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag);
		recv->r_frag = NULL;
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_one", "Return: ic: %p, recv: %p",
	    ic, recv);
}

void
rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic)
{
	uint32_t i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_clear_ring", "ic: %p", ic);

	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rdsv3_ib_recv_clear_one(ic, &ic->i_recvs[i]);
}

extern int atomic_add_unless(atomic_t *, uint_t, ulong_t);

static int
rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	ibt_mi_hdl_t mi_hdl;
	ibt_iov_attr_t iov_attr;
	ibt_iov_t iov_arr[1];

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p",
	    conn, recv);

	if (!recv->r_ibinc) {
		if (!atomic_add_unless(&rdsv3_ib_allocation, 1,
		    ic->i_max_recv_alloc)) {
			rdsv3_ib_stats_inc(s_ib_rx_alloc_limit);
			goto out;
		}
		recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab,
		    KM_NOSLEEP);
		if (recv->r_ibinc == NULL) {
			atomic_dec_32(&rdsv3_ib_allocation);
			goto out;
		}
		rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
		recv->r_ibinc->ii_ibdev = ic->rds_ibdev;
		recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool;
	}

	if (!recv->r_frag) {
		recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab,
		    KM_NOSLEEP);
		if (!recv->r_frag)
			goto out;
	}

	/* Data sge, structure copy */
	recv->r_sge[1] = recv->r_frag->f_sge;

	RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one",
	    "Return: conn: %p, recv: %p", conn, recv);

	return (0);
out:
	if (recv->r_ibinc) {
		kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc);
		atomic_dec_32(&rdsv3_ib_allocation);
		recv->r_ibinc = NULL;
	}
	return (-ENOMEM);
}
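
/*
 * A sketch (an assumption, not code from this file) of the Linux-style
 * semantics that rdsv3_ib_recv_refill_one() relies on from the
 * atomic_add_unless() declared above: atomically add 'a' to '*v' unless
 * the counter already equals the limit 'u'. The real definition lives
 * elsewhere in the rdsv3 port and may differ in detail.
 *
 *	int
 *	atomic_add_unless(atomic_t *v, uint_t a, ulong_t u)
 *	{
 *		uint_t c, old;
 *
 *		c = *(volatile uint_t *)v;
 *		while (c != u &&
 *		    (old = atomic_cas_uint((volatile uint_t *)v, c,
 *		    c + a)) != c)
 *			c = old;
 *		return (c != u);	(nonzero iff the add happened)
 *	}
 *
 * A zero return in the refill path therefore means the receive
 * allocation cap (ic->i_max_recv_alloc) has been reached.
 */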

/*
 * This tries to allocate and post unused work requests after making sure that
 * they have all the allocations they need to queue received fragments into
 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
 * pairs don't go unmatched.
 *
 * -1 is returned if posting fails due to temporary resource exhaustion.
 */
int
rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_recv_work *recv;
	unsigned int posted = 0;
	int ret = 0, avail;
	uint32_t pos, i;

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "conn: %p, prefill: %d",
	    conn, prefill);

	if (prefill || rdsv3_conn_up(conn)) {
		uint_t w_nr = ic->i_recv_ring.w_nr;

		avail = rdsv3_ib_ring_alloc(&ic->i_recv_ring, w_nr, &pos);
		if ((avail <= 0) || (pos >= w_nr)) {
			RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
			    "Argh - ring alloc returned pos=%u, avail: %d",
			    pos, avail);
			return (-EINVAL);
		}

		/* populate the WRs */
		for (i = 0; i < avail; i++) {
			recv = &ic->i_recvs[pos];
			ret = rdsv3_ib_recv_refill_one(conn, recv);
			if (ret) {
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    avail - i);
				break;
			}
			ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos;
			ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE;
			ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0];

			pos = (pos + 1) % w_nr;
		}

		if (i) {
			/* post the WRs in one shot */
			ret = ibt_post_recv(
			    ib_get_ibt_channel_hdl(ic->i_cm_id),
			    &ic->i_recv_wrs[0], i, &posted);
			RDSV3_DPRINTF3("rdsv3_ib_recv_refill",
			    "attempted: %d posted: %d WRs ret %d",
			    i, posted, ret);
			if (ret) {
				RDSV3_DPRINTF2("rdsv3_ib_recv_refill",
				    "recv post on %u.%u.%u.%u returned %d, "
				    "disconnecting and reconnecting",
				    NIPQUAD(conn->c_faddr), ret);
				rdsv3_ib_ring_unalloc(&ic->i_recv_ring,
				    i - posted);
				rdsv3_conn_drop(conn);
			}
		}
	}

	/* We're doing flow control - update the window. */
	if (ic->i_flowctl && posted)
		rdsv3_ib_advertise_credits(conn, posted);

	RDSV3_DPRINTF4("rdsv3_ib_recv_refill", "Return: conn: %p, posted: %d",
	    conn, posted);
	return (ret);
}

/*
 * Delayed free of incomings.
 */
struct rdsv3_inc_pool {
	list_t		f_list;		/* list of freed incomings */
	kmutex_t	f_lock;		/* lock protecting f_list */
	int32_t		f_listcnt;
};

void
rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool;

	if (pool) {
		list_destroy(&pool->f_list);
		kmem_free((void *) pool, sizeof (*pool));
	}
}

int
rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev)
{
	struct rdsv3_inc_pool *pool;

	pool = (struct rdsv3_inc_pool *)kmem_zalloc(sizeof (*pool),
	    KM_NOSLEEP);
	if (pool == NULL) {
		return (-ENOMEM);
	}
	list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming),
	    offsetof(struct rdsv3_ib_incoming, ii_obj));
	mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL);
	rds_ibdev->inc_pool = pool;
	return (0);
}

static void
rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_page_frag *frag;
	struct rdsv3_page_frag *pos;

	RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) {
		list_remove_node(&frag->f_item);
		kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag);
	}

	ASSERT(list_is_empty(&ibinc->ii_frags));
	kmem_cache_free(rdsv3_ib_incoming_slab, ibinc);
	atomic_dec_uint(&rdsv3_ib_allocation);
}

void
rdsv3_ib_drain_inclist(void *data)
{
	struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data;
	struct rdsv3_ib_incoming *ibinc;
	list_t *listp = &pool->f_list;
	kmutex_t *lockp = &pool->f_lock;
	int i = 0;

	for (;;) {
		mutex_enter(lockp);
		ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp);
		if (ibinc)
			pool->f_listcnt--;
		mutex_exit(lockp);
		if (!ibinc)
			break;
		i++;
		rdsv3_ib_inc_drop(ibinc);
	}
}
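
/*
 * How the pieces above and below are assumed to fit together (a sketch,
 * not code from this file): rdsv3_ib_inc_free() queues a spent incoming
 * on the per-device pool and fires an af thread; that thread is
 * presumably created at device init with rdsv3_ib_drain_inclist() as
 * its callback, along the lines of
 *
 *	rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(
 *	    rdsv3_ib_drain_inclist, (void *)rds_ibdev->inc_pool, ...);
 *
 * so the kmem_cache_free() work in rdsv3_ib_inc_drop() happens outside
 * the context that dropped the last reference.
 */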

void
rdsv3_ib_inc_free(struct rdsv3_incoming *inc)
{
	struct rdsv3_ib_incoming *ibinc;
	rdsv3_af_thr_t *af_thr;

	RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc);

	ibinc = __containerof(inc, struct rdsv3_ib_incoming, ii_inc);
	/* save af_thr in a local as ib_inc might be freed at mutex_exit */
	af_thr = ibinc->ii_ibdev->inc_soft_cq;

	mutex_enter(&ibinc->ii_pool->f_lock);
	list_insert_tail(&ibinc->ii_pool->f_list, ibinc);
	ibinc->ii_pool->f_listcnt++;
	mutex_exit(&ibinc->ii_pool->f_lock);

	rdsv3_af_thr_fire(af_thr);
}

int
rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop,
    size_t size)
{
	struct rdsv3_ib_incoming *ibinc;
	struct rdsv3_page_frag *frag;
	unsigned long to_copy;
	unsigned long frag_off = 0;
	int copied = 0;
	int ret;
	uint32_t len;

	ibinc = __containerof(inc, struct rdsv3_ib_incoming, ii_inc);
	frag = list_head(&ibinc->ii_frags);
	len = ntohl(inc->i_hdr.h_len);

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
	    "inc: %p, size: %d len: %d", inc, size, len);

	while (copied < size && copied < len) {
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}

		to_copy = min(len - copied, RDSV3_FRAG_SIZE - frag_off);
		to_copy = min(size - copied, to_copy);

		RDSV3_DPRINTF5("rdsv3_ib_inc_copy_to_user",
		    "%lu bytes to user %p from frag [%p, %u] + %lu",
		    to_copy, uiop,
		    frag->f_page, frag->f_offset, frag_off);

		ret = uiomove((caddr_t)(frag->f_page +
		    frag->f_offset + frag_off),
		    to_copy, UIO_READ, uiop);
		if (ret) {
			RDSV3_DPRINTF2("rdsv3_ib_inc_copy_to_user",
			    "uiomove (%lu) returned: %d", to_copy, ret);
			break;
		}

		frag_off += to_copy;
		copied += to_copy;
	}

	RDSV3_DPRINTF4("rdsv3_ib_inc_copy_to_user",
	    "Return: inc: %p, copied: %d", inc, copied);

	return (copied);
}
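
/*
 * A worked example of the fragment walk above (numbers are illustrative
 * assumptions only, with RDSV3_FRAG_SIZE taken as 4096): copying a
 * 6000 byte message into a 10000 byte uio takes two iterations.
 *
 *	iter 1: to_copy = min(6000 - 0, 4096 - 0) = 4096, then
 *		min(10000 - 0, 4096) = 4096; frag_off becomes 4096
 *	iter 2: frag_off == RDSV3_FRAG_SIZE, so advance to the next
 *		frag; to_copy = min(6000 - 4096, 4096) = 1904
 *
 * after which copied == len and the loop exits.
 */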

/* ic starts out kmem_zalloc()ed */
void
rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic)
{
	ibt_send_wr_t *wr = &ic->i_ack_wr;
	ibt_wr_ds_t *sge = &ic->i_ack_sge;

	RDSV3_DPRINTF4("rdsv3_ib_recv_init_ack", "ic: %p", ic);

	sge->ds_va = ic->i_ack_dma;
	sge->ds_len = sizeof (struct rdsv3_header);
	sge->ds_key = ic->i_mr->lkey;

	wr->wr_sgl = sge;
	wr->wr_nds = 1;
	wr->wr_opcode = IBT_WRC_SEND;
	wr->wr_id = RDSV3_IB_ACK_WR_ID;
	wr->wr_flags = IBT_WR_SEND_SIGNAL | IBT_WR_SEND_SOLICIT;
}

/*
 * You'd think that with reliable IB connections you wouldn't need to ack
 * messages that have been received. The problem is that IB hardware generates
 * an ack message before it has DMAed the message into memory. This creates a
 * window for message loss if the HCA is disabled for any reason between when
 * it sends the ack and before the message is DMAed and processed. This is
 * only a potential issue if another HCA is available for fail-over.
 *
 * When the remote host receives our ack they'll free the sent message from
 * their send queue. To decrease the latency of this we always send an ack
 * immediately after we've received messages.
 *
 * For simplicity, we only have one ack in flight at a time. This puts
 * pressure on senders to have deep enough send queues to absorb the latency
 * of a single ack frame being in flight. This might not be good enough.
 *
 * This is implemented by having a long-lived send_wr and sge which point to a
 * statically allocated ack frame. This ack wr does not fall under the ring
 * accounting that the tx and rx wrs do. The QP attribute specifically makes
 * room for it beyond the ring size. Send completion notices its special
 * wr_id and avoids working with the ring in that case.
 */
void
rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq,
    int ack_required)
{
	RDSV3_DPRINTF4("rdsv3_ib_set_ack", "ic: %p, seq: %lld ack: %d",
	    ic, seq, ack_required);

	mutex_enter(&ic->i_ack_lock);
	ic->i_ack_next = seq;
	if (ack_required)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	mutex_exit(&ic->i_ack_lock);
}

static uint64_t
rdsv3_ib_get_ack(struct rdsv3_ib_connection *ic)
{
	uint64_t seq;

	RDSV3_DPRINTF4("rdsv3_ib_get_ack", "ic: %p", ic);

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

	mutex_enter(&ic->i_ack_lock);
	seq = ic->i_ack_next;
	mutex_exit(&ic->i_ack_lock);

	return (seq);
}

static void
rdsv3_ib_send_ack(struct rdsv3_ib_connection *ic, unsigned int adv_credits)
{
	struct rdsv3_header *hdr = ic->i_ack;
	uint64_t seq;
	int ret;

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "ic: %p adv_credits: %d",
	    ic, adv_credits);

	seq = rdsv3_ib_get_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "send_ack: ic %p ack %llu",
	    ic, (unsigned long long) seq);
	rdsv3_message_populate_header(hdr, 0, 0, 0);
	hdr->h_ack = htonll(seq);
	hdr->h_credit = adv_credits;
	rdsv3_message_make_checksum(hdr);
	ic->i_ack_queued = jiffies;

	ret = ibt_post_send(RDSV3_QP2CHANHDL(ic->i_cm_id->qp), &ic->i_ack_wr,
	    1, NULL);
	if (ret) {
		/*
		 * Failed to send. Release the WR, and
		 * force another ACK.
		 */
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
		rdsv3_ib_stats_inc(s_ib_ack_send_failure);
		RDSV3_DPRINTF2("rdsv3_ib_send_ack", "sending ack failed\n");
		rdsv3_conn_drop(ic->conn);
	} else {
		rdsv3_ib_stats_inc(s_ib_ack_sent);
	}
	RDSV3_DPRINTF4("rdsv3_ib_send_ack", "Return: ic: %p adv_credits: %d",
	    ic, adv_credits);
}
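
/*
 * What rdsv3_ib_send_ack() above puts on the wire is a header-only
 * frame; a sketch of the relevant fields, assuming the usual rdsv3
 * header layout:
 *
 *	h_sport = 0, h_dport = 0	zero ports mark it ACK-only
 *	h_len = 0			no payload follows
 *	h_ack = htonll(seq)		cumulative sequence being acked
 *	h_credit = adv_credits		piggybacked flow-control credits
 *
 * The receive side recognizes exactly this shape in
 * rdsv3_ib_process_recv() below and short-circuits it.
 */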

/*
 * There are 3 ways of getting acknowledgements to the peer:
 *  1.	We call rdsv3_ib_attempt_ack from the recv completion handler
 *	to send an ACK-only frame.
 *	However, there can be only one such frame in the send queue
 *	at any time, so we may have to postpone it.
 *  2.	When another (data) packet is transmitted while there's
 *	an ACK in the queue, we piggyback the ACK sequence number
 *	on the data packet.
 *  3.	If the ACK WR is done sending, we get called from the
 *	send queue completion handler, and check whether there's
 *	another ACK pending (postponed because the WR was on the
 *	queue). If so, we transmit it.
 *
 * We maintain 2 variables:
 *  - i_ack_flags, which keeps track of whether the ACK WR
 *    is currently in the send queue or not (IB_ACK_IN_FLIGHT)
 *  - i_ack_next, which is the last sequence number we received
 *
 * Potentially, send queue and receive queue handlers can run concurrently.
 * It would be nice to not have to use a spinlock to synchronize things,
 * but the one problem that rules this out is that 64bit updates are
 * not atomic on all platforms. Things would be a lot simpler if
 * we had atomic64 or maybe cmpxchg64 everywhere.
 *
 * Reconnecting complicates this picture just slightly. When we
 * reconnect, we may be seeing duplicate packets. The peer
 * is retransmitting them, because it hasn't seen an ACK for
 * them. It is important that we ACK these.
 *
 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
 * this flag set *MUST* be acknowledged immediately.
 */

/*
 * When we get here, we're called from the recv queue handler.
 * Check whether we ought to transmit an ACK.
 */
void
rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic)
{
	unsigned int adv_credits;

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "ic: %p", ic);

	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
		return;

	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_delayed);
		return;
	}

	/* Can we get a send credit? */
	if (!rdsv3_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
		rdsv3_ib_stats_inc(s_ib_tx_throttle);
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		return;
	}

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	rdsv3_ib_send_ack(ic, adv_credits);

	RDSV3_DPRINTF4("rdsv3_ib_attempt_ack", "Return: ic: %p", ic);
}

/*
 * We get here from the send completion handler, when the
 * adapter tells us the ACK frame was sent.
 */
void
rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_ack_send_complete", "ic: %p", ic);
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	rdsv3_ib_attempt_ack(ic);
}

/*
 * This is called by the regular xmit code when it wants to piggyback
 * an ACK on an outgoing frame.
 */
uint64_t
rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic)
{
	RDSV3_DPRINTF4("rdsv3_ib_piggyb_ack", "ic: %p", ic);
	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags)) {
		rdsv3_ib_stats_inc(s_ib_ack_send_piggybacked);
	}
	return (rdsv3_ib_get_ack(ic));
}
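
/*
 * A minimal sketch (assumed, not from this file) of how the xmit path
 * is expected to use rdsv3_ib_piggyb_ack() when filling in the header
 * of an outgoing data frame:
 *
 *	hdr->h_ack = htonll(rdsv3_ib_piggyb_ack(ic));
 *
 * which both fetches the latest received sequence number and clears
 * IB_ACK_REQUESTED, making a separate ACK-only frame unnecessary.
 */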

/*
 * It's kind of lame that we're copying from the posted receive pages into
 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
 * them. But receiving new congestion bitmaps should be a *rare* event, so
 * hopefully we won't need to invest that complexity in making it more
 * efficient. By copying we can share a simpler core with TCP which has to
 * copy.
 */
static void
rdsv3_ib_cong_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_incoming *ibinc)
{
	struct rdsv3_cong_map *map;
	unsigned int map_off;
	unsigned int map_page;
	struct rdsv3_page_frag *frag;
	unsigned long frag_off;
	unsigned long to_copy;
	unsigned long copied;
	uint64_t uncongested = 0;
	caddr_t addr;

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "conn: %p, ibinc: %p",
	    conn, ibinc);

	/* catch completely corrupt packets */
	if (ntohl(ibinc->ii_inc.i_hdr.h_len) != RDSV3_CONG_MAP_BYTES)
		return;

	map = conn->c_fcong;
	map_page = 0;
	map_off = 0;

	frag = list_head(&ibinc->ii_frags);
	frag_off = 0;

	copied = 0;

	while (copied < RDSV3_CONG_MAP_BYTES) {
		uint64_t *src, *dst;
		unsigned int k;

		to_copy = min(RDSV3_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
		ASSERT(!(to_copy & 7)); /* Must be 64bit aligned. */

		addr = frag->f_page + frag->f_offset;

		src = (uint64_t *)(addr + frag_off);
		dst = (uint64_t *)(map->m_page_addrs[map_page] + map_off);
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %lu", src, dst, copied);
		for (k = 0; k < to_copy; k += 8) {
			/*
			 * Record ports that became uncongested, ie
			 * bits that changed from 0 to 1.
			 */
			uncongested |= ~(*src) & *dst;
			*dst++ = *src++;
		}

		copied += to_copy;
		RDSV3_DPRINTF4("rdsv3_ib_cong_recv",
		    "src: %p dst: %p copied: %lu", src, dst, copied);

		map_off += to_copy;
		if (map_off == PAGE_SIZE) {
			map_off = 0;
			map_page++;
		}

		frag_off += to_copy;
		if (frag_off == RDSV3_FRAG_SIZE) {
			frag = list_next(&ibinc->ii_frags, frag);
			frag_off = 0;
		}
	}

#if 0
	XXX
	/* the congestion map is in little endian order */
	uncongested = le64_to_cpu(uncongested);
#endif

	rdsv3_cong_map_updated(map, uncongested);

	RDSV3_DPRINTF4("rdsv3_ib_cong_recv", "Return: conn: %p, ibinc: %p",
	    conn, ibinc);
}
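
/*
 * A worked example of the chunking above (illustrative assumptions:
 * RDSV3_FRAG_SIZE == PAGE_SIZE == 4096, RDSV3_CONG_MAP_BYTES == 8192):
 * each iteration copies
 *
 *	to_copy = min(4096 - frag_off, 4096 - map_off) = 4096
 *
 * so the map is consumed in two full-page chunks and frag and map page
 * advance in lockstep. With unequal frag and page sizes the min() makes
 * the walk proceed in whichever unit runs out first, which is why
 * frag_off and map_off are tracked independently.
 */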

static void
rdsv3_ib_process_recv(struct rdsv3_connection *conn,
    struct rdsv3_ib_recv_work *recv, uint32_t data_len,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	struct rdsv3_ib_incoming *ibinc = ic->i_ibinc;
	struct rdsv3_header *ihdr, *hdr;

	/* XXX shut down the connection if port 0,0 are seen? */

	RDSV3_DPRINTF5("rdsv3_ib_process_recv",
	    "ic %p ibinc %p recv %p byte len %u", ic, ibinc, recv, data_len);

	if (data_len < sizeof (struct rdsv3_header)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv",
		    "incoming message from %u.%u.%u.%u didn't include a "
		    "header, disconnecting and reconnecting",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		return;
	}
	data_len -= sizeof (struct rdsv3_header);

	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];

	/* Validate the checksum. */
	if (!rdsv3_message_verify_checksum(ihdr)) {
		RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message "
		    "from %u.%u.%u.%u has corrupted header - "
		    "forcing a reconnect",
		    NIPQUAD(conn->c_faddr));
		rdsv3_conn_drop(conn);
		rdsv3_stats_inc(s_recv_drop_bad_checksum);
		return;
	}

	/* Process the ACK sequence which comes with every packet */
	state->ack_recv = ntohll(ihdr->h_ack);
	state->ack_recv_valid = 1;

	/* Process the credits update if there was one */
	if (ihdr->h_credit)
		rdsv3_ib_send_add_credits(conn, ihdr->h_credit);

	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
		/*
		 * This is an ACK-only packet. It gets special treatment
		 * here because historically, ACKs were rather special
		 * beasts.
		 */
		rdsv3_ib_stats_inc(s_ib_ack_received);
		return;
	}

	/*
	 * If we don't already have an inc on the connection then this
	 * fragment has a header and starts a message; copy its header
	 * into the inc and save the inc so we can hang upcoming fragments
	 * off its list.
	 */
	if (!ibinc) {
		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;

		hdr = &ibinc->ii_inc.i_hdr;
		(void) memcpy(hdr, ihdr, sizeof (*hdr));
		ic->i_recv_data_rem = ntohl(hdr->h_len);

		RDSV3_DPRINTF5("rdsv3_ib_process_recv",
		    "ic %p ibinc %p rem %u flag 0x%x", ic, ibinc,
		    ic->i_recv_data_rem, hdr->h_flags);
	} else {
		hdr = &ibinc->ii_inc.i_hdr;
		/*
		 * We can't just use memcmp here; fragments of a
		 * single message may carry different ACKs
		 */
		if (hdr->h_sequence != ihdr->h_sequence ||
		    hdr->h_len != ihdr->h_len ||
		    hdr->h_sport != ihdr->h_sport ||
		    hdr->h_dport != ihdr->h_dport) {
			RDSV3_DPRINTF2("rdsv3_ib_process_recv",
			    "fragment header mismatch; forcing reconnect");
			rdsv3_conn_drop(conn);
			return;
		}
	}

	list_insert_tail(&ibinc->ii_frags, recv->r_frag);
	recv->r_frag = NULL;

	if (ic->i_recv_data_rem > RDSV3_FRAG_SIZE)
		ic->i_recv_data_rem -= RDSV3_FRAG_SIZE;
	else {
		ic->i_recv_data_rem = 0;
		ic->i_ibinc = NULL;

		if (ibinc->ii_inc.i_hdr.h_flags == RDSV3_FLAG_CONG_BITMAP)
			rdsv3_ib_cong_recv(conn, ibinc);
		else {
			rdsv3_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
			    &ibinc->ii_inc, KM_NOSLEEP);
			state->ack_next = ntohll(hdr->h_sequence);
			state->ack_next_valid = 1;
		}

		/*
		 * Evaluate the ACK_REQUIRED flag *after* we received
		 * the complete frame, and after bumping the next_rx
		 * sequence.
		 */
		if (hdr->h_flags & RDSV3_FLAG_ACK_REQUIRED) {
			rdsv3_stats_inc(s_recv_ack_required);
			state->ack_required = 1;
		}

		rdsv3_inc_put(&ibinc->ii_inc);
	}

	RDSV3_DPRINTF4("rdsv3_ib_process_recv",
	    "Return: conn: %p recv: %p len: %d state: %p",
	    conn, recv, data_len, state);
}
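
/*
 * A sketch (an assumption about the caller, not code from this file) of
 * how the CQ poll loop is expected to consume the rdsv3_ib_ack_state
 * that rdsv3_ib_process_recv() fills in - after draining the CQ it
 * applies the accumulated state once, roughly:
 *
 *	if (state.ack_next_valid)
 *		rdsv3_ib_set_ack(ic, state.ack_next, state.ack_required);
 *	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
 *		rdsv3_send_drop_acked(conn, state.ack_recv, NULL);
 *		ic->i_ack_recv = state.ack_recv;
 *	}
 *	if (rdsv3_conn_up(conn))
 *		rdsv3_ib_attempt_ack(ic);
 *
 * which batches ACK work across all completions seen in one poll.
 */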

void
rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc,
    struct rdsv3_ib_ack_state *state)
{
	struct rdsv3_connection *conn = ic->conn;
	struct rdsv3_ib_recv_work *recv;
	struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring;

	RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler",
	    "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n",
	    (unsigned long long)wc->wc_id, wc->wc_status,
	    wc->wc_bytes_xfer, ntohl(wc->wc_immed_data));

	rdsv3_ib_stats_inc(s_ib_rx_cq_event);

	recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)];

	/*
	 * Also process recvs in connecting state because it is possible
	 * to get a recv completion _before_ the rdmacm ESTABLISHED
	 * event is processed.
	 */
	if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) {
		/* We expect errors as the qp is drained during shutdown */
		if (wc->wc_status == IBT_WC_SUCCESS) {
			rdsv3_ib_process_recv(conn, recv,
			    wc->wc_bytes_xfer, state);
		} else {
			RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler",
			    "recv completion on "
			    "%u.%u.%u.%u had status %u, "
			    "disconnecting and reconnecting\n",
			    NIPQUAD(conn->c_faddr),
			    wc->wc_status);
			rdsv3_conn_drop(conn);
		}
	}

	rdsv3_ib_ring_free(recv_ringp, 1);

	/*
	 * If we ever end up with a really empty receive ring, we're
	 * in deep trouble, as the sender will definitely see RNR
	 * timeouts.
	 */
	if (rdsv3_ib_ring_empty(recv_ringp))
		rdsv3_ib_stats_inc(s_ib_rx_ring_empty);

	if (rdsv3_ib_ring_low(recv_ringp)) {
		rdsv3_af_thr_fire(ic->i_refill_rq);
	}
}

int
rdsv3_ib_recv(struct rdsv3_connection *conn)
{
	struct rdsv3_ib_connection *ic = conn->c_transport_data;
	int ret = 0;

	RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn);

	if (rdsv3_conn_up(conn))
		rdsv3_ib_attempt_ack(ic);

	RDSV3_DPRINTF4("rdsv3_ib_recv", "Return: conn: %p", conn);

	return (ret);
}

extern int rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_inc_destructor(void *buf, void *arg);

int
rdsv3_ib_recv_init(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Enter");

	rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming",
	    sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor,
	    rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0);
	if (!rdsv3_ib_incoming_slab) {
		RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create "
		    "failed");
		return (-ENOMEM);
	}

	RDSV3_DPRINTF4("rdsv3_ib_recv_init", "Return");
	return (0);
}

void
rdsv3_ib_recv_exit(void)
{
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Enter");
	kmem_cache_destroy(rdsv3_ib_incoming_slab);
	RDSV3_DPRINTF4("rdsv3_ib_recv_exit", "Return");
}