1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 #include <sys/types.h> 25 #include <sys/stream.h> 26 #include <sys/dlpi.h> 27 #include <sys/stropts.h> 28 #include <sys/strsun.h> 29 #include <sys/sysmacros.h> 30 #include <sys/strlog.h> 31 #include <sys/ddi.h> 32 #include <sys/cmn_err.h> 33 #include <sys/socket.h> 34 #include <net/if.h> 35 #include <net/if_types.h> 36 #include <netinet/in.h> 37 #include <sys/ethernet.h> 38 #include <inet/arp.h> 39 #include <inet/ip.h> 40 #include <inet/ip6.h> 41 #include <inet/ip_ire.h> 42 #include <inet/ip_if.h> 43 #include <inet/ip_ftable.h> 44 45 #include <sys/sunddi.h> 46 #include <sys/ksynch.h> 47 48 #include <sys/rds.h> 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/sockio.h> 52 #include <sys/sysmacros.h> 53 #include <inet/common.h> 54 #include <inet/ip.h> 55 #include <net/if_types.h> 56 57 #include <sys/ib/clients/rdsv3/rdsv3.h> 58 #include <sys/ib/clients/rdsv3/rdma.h> 59 #include <sys/ib/clients/rdsv3/ib.h> 60 #include <sys/ib/clients/rdsv3/rdsv3_impl.h> 61 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 62 63 #include <sys/dls.h> 64 #include <sys/mac.h> 65 #include <sys/mac_client.h> 66 #include <sys/mac_provider.h> 67 #include <sys/mac_client_priv.h> 68 69 ddi_taskq_t *rdsv3_taskq = NULL; 70 extern kmem_cache_t *rdsv3_alloc_cache; 71 72 extern unsigned int ip_ocsum(ushort_t *address, int halfword_count, 73 unsigned int sum); 74 75 /* 76 * Check if the IP interface named by `lifrp' is RDS-capable. 77 */ 78 boolean_t 79 rdsv3_capable_interface(struct lifreq *lifrp) 80 { 81 char ifname[LIFNAMSIZ]; 82 char drv[MAXLINKNAMELEN]; 83 uint_t ppa; 84 char *cp; 85 86 RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter"); 87 88 if (lifrp->lifr_type == IFT_IB) 89 return (B_TRUE); 90 91 /* 92 * Strip off the logical interface portion before getting 93 * intimate with the name. 94 */ 95 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 96 if ((cp = strchr(ifname, ':')) != NULL) 97 *cp = '\0'; 98 99 if (strcmp("lo0", ifname) == 0) { 100 /* 101 * loopback is considered RDS-capable 102 */ 103 return (B_TRUE); 104 } 105 106 return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && 107 rdsv3_if_lookup_by_name(drv)); 108 } 109 110 int 111 rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs) 112 { 113 struct lifnum lifn; 114 struct lifconf lifc; 115 struct lifreq *lp, *rlp, lifr; 116 int rval = 0; 117 int numifs; 118 int bufsize, rbufsize; 119 void *buf, *rbuf; 120 int i, j, n, rc; 121 122 *ipaddrs = NULL; 123 *size = 0; 124 *nifs = 0; 125 126 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter"); 127 128 retry_count: 129 /* snapshot the current number of interfaces */ 130 lifn.lifn_family = PF_UNSPEC; 131 lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; 132 lifn.lifn_count = 0; 133 rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval, 134 CRED()); 135 if (rval != 0) { 136 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", 137 "ksocket_ioctl returned: %d", rval); 138 return (rval); 139 } 140 141 numifs = lifn.lifn_count; 142 if (numifs <= 0) { 143 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found"); 144 return (0); 145 } 146 147 /* allocate extra room in case more interfaces appear */ 148 numifs += 10; 149 150 /* get the interface names and ip addresses */ 151 bufsize = numifs * sizeof (struct lifreq); 152 buf = kmem_alloc(bufsize, KM_SLEEP); 153 154 lifc.lifc_family = AF_UNSPEC; 155 lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; 156 lifc.lifc_len = bufsize; 157 lifc.lifc_buf = buf; 158 rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED()); 159 if (rc != 0) { 160 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed"); 161 kmem_free(buf, bufsize); 162 return (rc); 163 } 164 /* if our extra room is used up, try again */ 165 if (bufsize <= lifc.lifc_len) { 166 kmem_free(buf, bufsize); 167 buf = NULL; 168 goto retry_count; 169 } 170 /* calc actual number of ifconfs */ 171 n = lifc.lifc_len / sizeof (struct lifreq); 172 173 /* 174 * Count the RDS interfaces 175 */ 176 for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) { 177 178 /* 179 * Copy as the SIOCGLIFFLAGS ioctl is destructive 180 */ 181 bcopy(lp, &lifr, sizeof (struct lifreq)); 182 /* 183 * fetch the flags using the socket of the correct family 184 */ 185 switch (lifr.lifr_addr.ss_family) { 186 case AF_INET: 187 rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, 188 &rval, CRED()); 189 break; 190 default: 191 continue; 192 } 193 194 if (rc != 0) continue; 195 196 /* 197 * If we got the flags, skip uninteresting 198 * interfaces based on flags 199 */ 200 if ((lifr.lifr_flags & IFF_UP) != IFF_UP) 201 continue; 202 if (lifr.lifr_flags & 203 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 204 continue; 205 if (!rdsv3_capable_interface(&lifr)) 206 continue; 207 j++; 208 } 209 210 if (j <= 0) { 211 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces"); 212 kmem_free(buf, bufsize); 213 return (rval); 214 } 215 216 numifs = j; 217 218 /* This is the buffer we pass back */ 219 rbufsize = numifs * sizeof (struct lifreq); 220 rbuf = kmem_alloc(rbufsize, KM_SLEEP); 221 rlp = (struct lifreq *)rbuf; 222 223 /* 224 * Examine the array of interfaces and filter uninteresting ones 225 */ 226 for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) { 227 228 /* 229 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive 230 */ 231 bcopy(lp, &lifr, sizeof (struct lifreq)); 232 /* 233 * fetch the flags using the socket of the correct family 234 */ 235 switch (lifr.lifr_addr.ss_family) { 236 case AF_INET: 237 rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr, 238 &rval, CRED()); 239 break; 240 default: 241 continue; 242 } 243 244 245 if (rc != 0) { 246 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", 247 "ksocket_ioctl failed" " for %s", lifr.lifr_name); 248 continue; 249 } 250 251 /* 252 * If we got the flags, skip uninteresting 253 * interfaces based on flags 254 */ 255 if ((lifr.lifr_flags & IFF_UP) != IFF_UP) 256 continue; 257 if (lifr.lifr_flags & 258 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 259 continue; 260 if (!rdsv3_capable_interface(&lifr)) 261 continue; 262 263 /* save the record */ 264 bcopy(lp, rlp, sizeof (struct lifreq)); 265 rlp++; 266 } 267 268 kmem_free(buf, bufsize); 269 270 *ipaddrs = rbuf; 271 *size = rbufsize; 272 *nifs = numifs; 273 274 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return"); 275 276 return (rval); 277 } 278 279 /* 280 * Check if the IP interface named by `ifrp' is RDS-capable. 281 */ 282 boolean_t 283 rdsv3_capable_interface_old(struct ifreq *ifrp) 284 { 285 char ifname[IFNAMSIZ]; 286 char drv[MAXLINKNAMELEN]; 287 uint_t ppa; 288 char *cp; 289 290 RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter"); 291 292 /* 293 * Strip off the logical interface portion before getting 294 * intimate with the name. 295 */ 296 (void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ); 297 if ((cp = strchr(ifname, ':')) != NULL) 298 *cp = '\0'; 299 300 RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname); 301 302 if ((strcmp("lo0", ifname) == 0) || 303 (strncmp("ibd", ifname, 3) == 0)) { 304 /* 305 * loopback and IB are considered RDS-capable 306 */ 307 return (B_TRUE); 308 } 309 310 return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS && 311 rdsv3_if_lookup_by_name(drv)); 312 } 313 314 int 315 rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs) 316 { 317 uint_t ifn; 318 struct ifconf ifc; 319 struct ifreq *lp, *rlp, ifr; 320 int rval = 0; 321 int numifs; 322 int bufsize, rbufsize; 323 void *buf, *rbuf; 324 int i, j, n, rc; 325 326 *ipaddrs = NULL; 327 *size = 0; 328 *nifs = 0; 329 330 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter"); 331 332 retry_count: 333 rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval, 334 CRED()); 335 if (rval != 0) { 336 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 337 "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval); 338 return (rval); 339 } 340 341 numifs = ifn; 342 if (numifs <= 0) { 343 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found"); 344 return (0); 345 } 346 347 /* allocate extra room in case more interfaces appear */ 348 numifs += 10; 349 350 /* get the interface names and ip addresses */ 351 bufsize = numifs * sizeof (struct ifreq); 352 buf = kmem_alloc(bufsize, KM_SLEEP); 353 354 ifc.ifc_len = bufsize; 355 ifc.ifc_buf = buf; 356 rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED()); 357 if (rc != 0) { 358 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 359 "SIOCGLIFCONF failed: %d", rc); 360 kmem_free(buf, bufsize); 361 return (rc); 362 } 363 /* if our extra room is used up, try again */ 364 if (bufsize <= ifc.ifc_len) { 365 kmem_free(buf, bufsize); 366 buf = NULL; 367 goto retry_count; 368 } 369 /* calc actual number of ifconfs */ 370 n = ifc.ifc_len / sizeof (struct ifreq); 371 372 /* 373 * Count the RDS interfaces 374 */ 375 for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) { 376 377 /* 378 * Copy as the SIOCGIFFLAGS ioctl is destructive 379 */ 380 bcopy(lp, &ifr, sizeof (struct ifreq)); 381 /* 382 * fetch the flags using the socket of the correct family 383 */ 384 switch (ifr.ifr_addr.sa_family) { 385 case AF_INET: 386 rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, 387 &rval, CRED()); 388 break; 389 default: 390 continue; 391 } 392 393 if (rc != 0) continue; 394 395 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 396 "1. ifr_name: %s, flags: %d", ifr.ifr_name, 397 (ushort_t)ifr.ifr_flags); 398 399 /* 400 * If we got the flags, skip uninteresting 401 * interfaces based on flags 402 */ 403 if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) 404 continue; 405 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 406 "2. ifr_name: %s, flags: %d", ifr.ifr_name, 407 (ushort_t)ifr.ifr_flags); 408 if (((ushort_t)ifr.ifr_flags) & 409 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 410 continue; 411 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 412 "3. ifr_name: %s, flags: %d", ifr.ifr_name, 413 (ushort_t)ifr.ifr_flags); 414 if (!rdsv3_capable_interface_old(&ifr)) 415 continue; 416 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 417 "4. ifr_name: %s, flags: %d", ifr.ifr_name, 418 (ushort_t)ifr.ifr_flags); 419 j++; 420 } 421 422 if (j <= 0) { 423 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces"); 424 kmem_free(buf, bufsize); 425 return (rval); 426 } 427 428 numifs = j; 429 430 /* This is the buffer we pass back */ 431 rbufsize = numifs * sizeof (struct ifreq); 432 rbuf = kmem_alloc(rbufsize, KM_SLEEP); 433 rlp = (struct ifreq *)rbuf; 434 435 /* 436 * Examine the array of interfaces and filter uninteresting ones 437 */ 438 for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) { 439 440 /* 441 * Copy the address as the SIOCGIFFLAGS ioctl is destructive 442 */ 443 bcopy(lp, &ifr, sizeof (struct ifreq)); 444 /* 445 * fetch the flags using the socket of the correct family 446 */ 447 switch (ifr.ifr_addr.sa_family) { 448 case AF_INET: 449 rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr, 450 &rval, CRED()); 451 break; 452 default: 453 continue; 454 } 455 456 457 if (rc != 0) { 458 RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", 459 "ksocket_ioctl failed: %d for %s", 460 rc, ifr.ifr_name); 461 continue; 462 } 463 464 /* 465 * If we got the flags, skip uninteresting 466 * interfaces based on flags 467 */ 468 if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP) 469 continue; 470 if (((ushort_t)ifr.ifr_flags) & 471 (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED)) 472 continue; 473 if (!rdsv3_capable_interface_old(&ifr)) 474 continue; 475 476 /* save the record */ 477 bcopy(lp, rlp, sizeof (struct ifreq)); 478 rlp++; 479 } 480 481 kmem_free(buf, bufsize); 482 483 *ipaddrs = rbuf; 484 *size = rbufsize; 485 *nifs = numifs; 486 487 RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return"); 488 489 return (rval); 490 } 491 492 boolean_t 493 rdsv3_isloopback(ipaddr_t addr) 494 { 495 ip_stack_t *ipst; 496 497 ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; 498 ASSERT(ipst != NULL); 499 if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) { 500 netstack_rele(ipst->ips_netstack); 501 return (B_FALSE); 502 } 503 netstack_rele(ipst->ips_netstack); 504 return (B_TRUE); 505 } 506 507 /* 508 * Work Queue Implementation 509 */ 510 511 #define RDSV3_WQ_THREAD_IDLE 0 512 #define RDSV3_WQ_THREAD_RUNNING 1 513 #define RDSV3_WQ_THREAD_FLUSHING 2 514 #define RDSV3_WQ_THREAD_EXITING 3 515 516 /* worker thread */ 517 void 518 rdsv3_worker_thread(void *arg) 519 { 520 rdsv3_workqueue_struct_t *wq = arg; 521 rdsv3_work_t *work; 522 523 RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq); 524 525 mutex_enter(&wq->wq_lock); 526 work = list_remove_head(&wq->wq_queue); 527 while (work) { 528 mutex_exit(&wq->wq_lock); 529 530 /* process work */ 531 work->func(work); 532 533 mutex_enter(&wq->wq_lock); 534 work = list_remove_head(&wq->wq_queue); 535 } 536 537 /* No more work, go home, until called again */ 538 if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) { 539 wq->wq_state = RDSV3_WQ_THREAD_IDLE; 540 } 541 mutex_exit(&wq->wq_lock); 542 543 RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq); 544 } 545 546 /* XXX */ 547 void 548 rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq) 549 { 550 RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq); 551 552 mutex_enter(&wq->wq_lock); 553 switch (wq->wq_state) { 554 case RDSV3_WQ_THREAD_IDLE: 555 /* nothing to do */ 556 ASSERT(list_is_empty(&wq->wq_queue)); 557 break; 558 559 case RDSV3_WQ_THREAD_RUNNING: 560 wq->wq_state = RDSV3_WQ_THREAD_FLUSHING; 561 /* FALLTHRU */ 562 case RDSV3_WQ_THREAD_FLUSHING: 563 /* already flushing, wait until the flushing is complete */ 564 do { 565 mutex_exit(&wq->wq_lock); 566 delay(drv_usectohz(1000000)); 567 mutex_enter(&wq->wq_lock); 568 } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); 569 break; 570 case RDSV3_WQ_THREAD_EXITING: 571 mutex_exit(&wq->wq_lock); 572 rdsv3_worker_thread(wq); 573 return; 574 } 575 mutex_exit(&wq->wq_lock); 576 577 RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq); 578 } 579 580 void 581 rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp) 582 { 583 RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp); 584 585 mutex_enter(&wq->wq_lock); 586 587 if (list_link_active(&wp->work_item)) { 588 /* This is already in the queue, ignore this call */ 589 mutex_exit(&wq->wq_lock); 590 RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp); 591 return; 592 } 593 594 switch (wq->wq_state) { 595 case RDSV3_WQ_THREAD_RUNNING: 596 list_insert_tail(&wq->wq_queue, wp); 597 mutex_exit(&wq->wq_lock); 598 break; 599 600 case RDSV3_WQ_THREAD_FLUSHING: 601 do { 602 mutex_exit(&wq->wq_lock); 603 delay(drv_usectohz(1000000)); 604 mutex_enter(&wq->wq_lock); 605 } while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING); 606 607 if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) { 608 list_insert_tail(&wq->wq_queue, wp); 609 mutex_exit(&wq->wq_lock); 610 break; 611 } 612 /* FALLTHRU */ 613 614 case RDSV3_WQ_THREAD_IDLE: 615 list_insert_tail(&wq->wq_queue, wp); 616 wq->wq_state = RDSV3_WQ_THREAD_RUNNING; 617 mutex_exit(&wq->wq_lock); 618 619 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq, 620 DDI_SLEEP); 621 break; 622 623 case RDSV3_WQ_THREAD_EXITING: 624 mutex_exit(&wq->wq_lock); 625 break; 626 } 627 628 RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp); 629 } 630 631 /* timeout handler for delayed work queuing */ 632 void 633 rdsv3_work_timeout_handler(void *arg) 634 { 635 rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg; 636 637 RDSV3_DPRINTF4("rdsv3_work_timeout_handler", 638 "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work); 639 640 mutex_enter(&dwp->lock); 641 dwp->timeid = 0; 642 mutex_exit(&dwp->lock); 643 644 mutex_enter(&dwp->wq->wq_lock); 645 dwp->wq->wq_pending--; 646 if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) { 647 mutex_exit(&dwp->wq->wq_lock); 648 return; 649 } 650 mutex_exit(&dwp->wq->wq_lock); 651 652 rdsv3_queue_work(dwp->wq, &dwp->work); 653 654 RDSV3_DPRINTF4("rdsv3_work_timeout_handler", 655 "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work); 656 } 657 658 void 659 rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq, 660 rdsv3_delayed_work_t *dwp, uint_t delay) 661 { 662 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", 663 "Enter(wq: %p, wp: %p)", wq, dwp); 664 665 if (delay == 0) { 666 rdsv3_queue_work(wq, &dwp->work); 667 return; 668 } 669 670 mutex_enter(&wq->wq_lock); 671 if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) { 672 mutex_exit(&wq->wq_lock); 673 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", 674 "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp); 675 return; 676 } 677 wq->wq_pending++; 678 mutex_exit(&wq->wq_lock); 679 680 mutex_enter(&dwp->lock); 681 if (dwp->timeid == 0) { 682 dwp->wq = wq; 683 dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp, 684 jiffies + (delay * rdsv3_one_sec_in_hz)); 685 mutex_exit(&dwp->lock); 686 } else { 687 mutex_exit(&dwp->lock); 688 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p", 689 dwp); 690 mutex_enter(&wq->wq_lock); 691 wq->wq_pending--; 692 mutex_exit(&wq->wq_lock); 693 } 694 695 RDSV3_DPRINTF4("rdsv3_queue_delayed_work", 696 "Return(wq: %p, wp: %p)", wq, dwp); 697 } 698 699 void 700 rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp) 701 { 702 RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", 703 "Enter(wq: %p, dwp: %p)", dwp->wq, dwp); 704 705 mutex_enter(&dwp->lock); 706 if (dwp->timeid != 0) { 707 (void) untimeout(dwp->timeid); 708 dwp->timeid = 0; 709 } else { 710 RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", 711 "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp); 712 mutex_exit(&dwp->lock); 713 return; 714 } 715 mutex_exit(&dwp->lock); 716 717 mutex_enter(&dwp->wq->wq_lock); 718 dwp->wq->wq_pending--; 719 mutex_exit(&dwp->wq->wq_lock); 720 721 RDSV3_DPRINTF4("rdsv3_cancel_delayed_work", 722 "Return(wq: %p, dwp: %p)", dwp->wq, dwp); 723 } 724 725 void 726 rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq) 727 { 728 RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter"); 729 730 ASSERT(wq); 731 732 mutex_enter(&wq->wq_lock); 733 wq->wq_state = RDSV3_WQ_THREAD_EXITING; 734 735 while (wq->wq_pending > 0) { 736 mutex_exit(&wq->wq_lock); 737 delay(drv_usectohz(1000000)); 738 mutex_enter(&wq->wq_lock); 739 }; 740 mutex_exit(&wq->wq_lock); 741 742 rdsv3_flush_workqueue(wq); 743 744 list_destroy(&wq->wq_queue); 745 mutex_destroy(&wq->wq_lock); 746 kmem_free(wq, sizeof (rdsv3_workqueue_struct_t)); 747 748 ASSERT(rdsv3_taskq); 749 ddi_taskq_destroy(rdsv3_taskq); 750 751 wq = NULL; 752 rdsv3_taskq = NULL; 753 754 RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return"); 755 } 756 757 /* ARGSUSED */ 758 void 759 rdsv3_rdma_init_worker(struct rdsv3_work_s *work) 760 { 761 rdsv3_rdma_init(); 762 } 763 764 #define RDSV3_NUM_TASKQ_THREADS 4 765 rdsv3_workqueue_struct_t * 766 rdsv3_create_task_workqueue(char *name) 767 { 768 rdsv3_workqueue_struct_t *wq; 769 770 RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)", 771 rdsv3_dev_info); 772 773 rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name, 774 RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0); 775 if (rdsv3_taskq == NULL) { 776 RDSV3_DPRINTF2(__FILE__, 777 "ddi_taskq_create failed for rdsv3_taskq"); 778 return (NULL); 779 } 780 781 wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP); 782 if (wq == NULL) { 783 RDSV3_DPRINTF2(__FILE__, "kmem_zalloc failed for wq"); 784 ddi_taskq_destroy(rdsv3_taskq); 785 return (NULL); 786 } 787 788 list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s), 789 offsetof(struct rdsv3_work_s, work_item)); 790 mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL); 791 wq->wq_state = RDSV3_WQ_THREAD_IDLE; 792 wq->wq_pending = 0; 793 rdsv3_one_sec_in_hz = drv_usectohz(1000000); 794 795 RDSV3_DPRINTF2("create_singlethread_workqueue", "Return"); 796 797 return (wq); 798 } 799 800 /* 801 * Implementation for struct sock 802 */ 803 804 void 805 rdsv3_sock_exit_data(struct rsock *sk) 806 { 807 struct rdsv3_sock *rs = sk->sk_protinfo; 808 809 RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); 810 811 ASSERT(rs != NULL); 812 ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD)); 813 814 rs->rs_sk = NULL; 815 816 list_destroy(&rs->rs_send_queue); 817 list_destroy(&rs->rs_notify_queue); 818 list_destroy(&rs->rs_recv_queue); 819 820 rw_destroy(&rs->rs_recv_lock); 821 mutex_destroy(&rs->rs_lock); 822 823 mutex_destroy(&rs->rs_rdma_lock); 824 avl_destroy(&rs->rs_rdma_keys); 825 826 rdsv3_exit_waitqueue(sk->sk_sleep); 827 kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t)); 828 mutex_destroy(&sk->sk_lock); 829 830 kmem_cache_free(rdsv3_alloc_cache, sk); 831 RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk); 832 } 833 834 /* XXX - figure out right values */ 835 #define RDSV3_RECV_HIWATER (256 * 1024) 836 #define RDSV3_RECV_LOWATER 128 837 #define RDSV3_XMIT_HIWATER (256 * 1024) 838 #define RDSV3_XMIT_LOWATER 1024 839 840 struct rsock * 841 rdsv3_sk_alloc() 842 { 843 struct rsock *sk; 844 845 sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP); 846 if (sk == NULL) { 847 RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed"); 848 return (NULL); 849 } 850 851 bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock)); 852 return (sk); 853 } 854 855 void 856 rdsv3_sock_init_data(struct rsock *sk) 857 { 858 sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP); 859 rdsv3_init_waitqueue(sk->sk_sleep); 860 861 mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL); 862 sk->sk_refcount = 1; 863 sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1); 864 sk->sk_sndbuf = RDSV3_XMIT_HIWATER; 865 sk->sk_rcvbuf = RDSV3_RECV_HIWATER; 866 } 867 868 /* 869 * Connection cache 870 */ 871 /* ARGSUSED */ 872 int 873 rdsv3_conn_constructor(void *buf, void *arg, int kmflags) 874 { 875 struct rdsv3_connection *conn = buf; 876 877 bzero(conn, sizeof (struct rdsv3_connection)); 878 879 conn->c_next_tx_seq = 1; 880 mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL); 881 mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL); 882 list_create(&conn->c_send_queue, sizeof (struct rdsv3_message), 883 offsetof(struct rdsv3_message, m_conn_item)); 884 list_create(&conn->c_retrans, sizeof (struct rdsv3_message), 885 offsetof(struct rdsv3_message, m_conn_item)); 886 return (0); 887 } 888 889 /* ARGSUSED */ 890 void 891 rdsv3_conn_destructor(void *buf, void *arg) 892 { 893 struct rdsv3_connection *conn = buf; 894 895 ASSERT(list_is_empty(&conn->c_send_queue)); 896 ASSERT(list_is_empty(&conn->c_retrans)); 897 list_destroy(&conn->c_send_queue); 898 list_destroy(&conn->c_retrans); 899 mutex_destroy(&conn->c_send_lock); 900 mutex_destroy(&conn->c_lock); 901 } 902 903 int 904 rdsv3_conn_compare(const void *conn1, const void *conn2) 905 { 906 uint32_be_t laddr1, faddr1, laddr2, faddr2; 907 908 laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr; 909 laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr; 910 911 if (laddr1 == laddr2) { 912 faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr; 913 faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr; 914 if (faddr1 == faddr2) 915 return (0); 916 if (faddr1 < faddr2) 917 return (-1); 918 return (1); 919 } 920 921 if (laddr1 < laddr2) 922 return (-1); 923 924 return (1); 925 } 926 927 /* loop.c */ 928 extern kmutex_t loop_conns_lock; 929 extern list_t loop_conns; 930 931 struct rdsv3_loop_connection 932 { 933 struct list_node loop_node; 934 struct rdsv3_connection *conn; 935 }; 936 937 void 938 rdsv3_loop_init(void) 939 { 940 list_create(&loop_conns, sizeof (struct rdsv3_loop_connection), 941 offsetof(struct rdsv3_loop_connection, loop_node)); 942 mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL); 943 } 944 945 /* rdma.c */ 946 /* IB Rkey is used here for comparison */ 947 int 948 rdsv3_mr_compare(const void *mr1, const void *mr2) 949 { 950 uint32_t key1 = *(uint32_t *)mr1; 951 uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key; 952 953 if (key1 < key2) 954 return (-1); 955 if (key1 > key2) 956 return (1); 957 return (0); 958 } 959 960 /* transport.c */ 961 extern list_t transports; 962 extern krwlock_t trans_sem; 963 964 void 965 rdsv3_trans_exit(void) 966 { 967 struct rdsv3_transport *trans; 968 969 RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter"); 970 971 /* currently, only IB transport */ 972 rw_enter(&trans_sem, RW_READER); 973 if (!list_is_empty(&transports)) 974 trans = list_head(&transports); 975 else 976 trans = NULL; 977 rw_exit(&trans_sem); 978 979 /* trans->exit() will remove the trans from the list */ 980 if (trans) 981 trans->exit(); 982 983 list_destroy(&transports); 984 rw_destroy(&trans_sem); 985 986 RDSV3_DPRINTF2("rdsv3_trans_exit", "Return"); 987 } 988 989 void 990 rdsv3_trans_init() 991 { 992 RDSV3_DPRINTF2("rdsv3_trans_init", "Enter"); 993 994 list_create(&transports, sizeof (struct rdsv3_transport), 995 offsetof(struct rdsv3_transport, t_item)); 996 rw_init(&trans_sem, NULL, RW_DRIVER, NULL); 997 998 RDSV3_DPRINTF2("rdsv3_trans_init", "Return"); 999 } 1000 1001 int 1002 rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size, 1003 void *payload) 1004 { 1005 struct cmsghdr *cp; 1006 char *bp; 1007 size_t cmlen; 1008 size_t cmspace; 1009 size_t bufsz; 1010 1011 RDSV3_DPRINTF4("rdsv3_put_cmsg", 1012 "Enter(msg: %p level: %d type: %d sz: %d)", 1013 msg, level, type, size); 1014 1015 if (msg == NULL || msg->msg_controllen == 0 || payload == NULL) { 1016 return (0); 1017 } 1018 /* check for first cmsg or this is another cmsg to be appended */ 1019 if (msg->msg_control == NULL) 1020 msg->msg_controllen = 0; 1021 1022 cmlen = CMSG_LEN(size); 1023 cmspace = CMSG_SPACE(size); 1024 bufsz = msg->msg_controllen + cmspace; 1025 1026 /* extend the existing cmsg to append the next cmsg */ 1027 bp = kmem_alloc(bufsz, KM_SLEEP); 1028 if (msg->msg_control) { 1029 bcopy(msg->msg_control, bp, msg->msg_controllen); 1030 kmem_free(msg->msg_control, (size_t)msg->msg_controllen); 1031 } 1032 1033 /* assign payload the proper cmsg location */ 1034 cp = (struct cmsghdr *)(bp + msg->msg_controllen); 1035 cp->cmsg_len = cmlen; 1036 cp->cmsg_level = level; 1037 cp->cmsg_type = type; 1038 1039 bcopy(payload, CMSG_DATA(cp), cmlen - 1040 (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr))); 1041 1042 msg->msg_control = bp; 1043 msg->msg_controllen = bufsz; 1044 1045 RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len); 1046 1047 return (0); 1048 } 1049 1050 /* bind.c */ 1051 extern kmutex_t rdsv3_bind_lock; 1052 extern avl_tree_t rdsv3_bind_tree; 1053 1054 /* ARGSUSED */ 1055 int 1056 rdsv3_verify_bind_address(ipaddr_t addr) 1057 { 1058 return (1); 1059 } 1060 1061 /* XXX - need to enhance to compare IP address and port */ 1062 int 1063 rdsv3_bind_node_compare(const void *a, const void *b) 1064 { 1065 uint16_be_t port = *(in_port_t *)a; 1066 struct rdsv3_sock *rs = (struct rdsv3_sock *)b; 1067 1068 RDSV3_DPRINTF5("rdsv3_bind_node_compare", "Enter (%x %x)", port, 1069 rs->rs_bound_port); 1070 1071 if (port > rs->rs_bound_port) 1072 return (+1); 1073 else if (port < rs->rs_bound_port) 1074 return (-1); 1075 1076 return (0); 1077 } 1078 1079 void 1080 rdsv3_bind_tree_init() 1081 { 1082 RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter"); 1083 1084 mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL); 1085 avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare, 1086 sizeof (struct rdsv3_sock), 1087 offsetof(struct rdsv3_sock, rs_bound_node)); 1088 1089 RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return"); 1090 } 1091 1092 void 1093 rdsv3_bind_tree_exit() 1094 { 1095 RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter"); 1096 1097 ASSERT(avl_is_empty(&rdsv3_bind_tree)); 1098 avl_destroy(&rdsv3_bind_tree); 1099 mutex_destroy(&rdsv3_bind_lock); 1100 1101 RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return"); 1102 } 1103 1104 /* checksum */ 1105 uint16_t 1106 rdsv3_ip_fast_csum(void *hdr, size_t length) 1107 { 1108 return (0xffff & 1109 (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length <<1, 0))); 1110 } 1111 1112 /* scatterlist implementation */ 1113 /* ARGSUSED */ 1114 caddr_t 1115 rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat, 1116 uint_t offset) 1117 { 1118 return (0); 1119 } 1120 1121 uint_t 1122 rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat, 1123 uint_t num) 1124 { 1125 struct rdsv3_scatterlist *s, *first; 1126 ibt_iov_t *iov; 1127 ibt_wr_ds_t *sgl; 1128 ibt_iov_attr_t iov_attr; 1129 ibt_send_wr_t swr; 1130 uint_t i; 1131 1132 RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num); 1133 1134 s = first = &scat[0]; 1135 ASSERT(first->mihdl == NULL); 1136 1137 iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP); 1138 sgl = kmem_zalloc((num * 2) * sizeof (ibt_wr_ds_t), KM_SLEEP); 1139 1140 for (i = 0; i < num; i++, s++) { 1141 iov[i].iov_addr = s->vaddr; 1142 iov[i].iov_len = s->length; 1143 } 1144 1145 iov_attr.iov_as = NULL; 1146 iov_attr.iov = iov; 1147 iov_attr.iov_buf = NULL; 1148 iov_attr.iov_list_len = num; 1149 iov_attr.iov_wr_nds = num * 2; 1150 iov_attr.iov_lso_hdr_sz = 0; 1151 iov_attr.iov_flags = IBT_IOV_SLEEP; 1152 1153 swr.wr_sgl = sgl; 1154 1155 i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev), 1156 &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl); 1157 kmem_free(iov, num * sizeof (ibt_iov_t)); 1158 if (i != IBT_SUCCESS) { 1159 RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg", 1160 "ibt_map_mem_iov returned: %d", i); 1161 return (0); 1162 } 1163 1164 s = first; 1165 for (i = 0; i < num; i++, s++, sgl++) { 1166 s->sgl = sgl; 1167 } 1168 1169 return (num); 1170 } 1171 1172 void 1173 rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat, 1174 uint_t num) 1175 { 1176 /* Zero length messages have no scatter gather entries */ 1177 if (num != 0) { 1178 ASSERT(scat->mihdl != NULL); 1179 ASSERT(scat->sgl != NULL); 1180 1181 (void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl); 1182 1183 kmem_free(scat->sgl, (num * 2) * sizeof (ibt_wr_ds_t)); 1184 scat->sgl = NULL; 1185 scat->mihdl = NULL; 1186 } 1187 } 1188 1189 int 1190 rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) 1191 { 1192 caddr_t addr; 1193 size_t size; 1194 ibt_mr_attr_t mr_attr; 1195 ibt_mr_desc_t mr_desc; 1196 ibt_mr_hdl_t mr_hdl; 1197 int ret; 1198 1199 RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev); 1200 1201 ASSERT(ic->i_mr == NULL); 1202 1203 size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) * 1204 sizeof (struct rdsv3_header); 1205 1206 addr = kmem_zalloc(size, KM_NOSLEEP); 1207 if (addr == NULL) 1208 return (-1); 1209 1210 mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr; 1211 mr_attr.mr_len = size; 1212 mr_attr.mr_as = NULL; 1213 mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE; 1214 ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd), 1215 &mr_attr, &mr_hdl, &mr_desc); 1216 if (ret != IBT_SUCCESS) { 1217 RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs", 1218 "ibt_register_mr returned: " "%d", ret); 1219 return (-1); 1220 } 1221 1222 ic->i_mr = 1223 (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr), 1224 KM_SLEEP); 1225 ic->i_mr->addr = addr; 1226 ic->i_mr->size = size; 1227 ic->i_mr->hdl = mr_hdl; 1228 ic->i_mr->lkey = mr_desc.md_lkey; 1229 1230 ic->i_send_hdrs = (struct rdsv3_header *)addr; 1231 ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr; 1232 1233 ic->i_recv_hdrs = (struct rdsv3_header *)(addr + 1234 (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); 1235 ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr + 1236 (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); 1237 ic->i_recv_tasklet_cpuid = -1; 1238 1239 ic->i_ack = (struct rdsv3_header *)(addr + 1240 ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * 1241 sizeof (struct rdsv3_header))); 1242 ic->i_ack_dma = (uint64_t)(uintptr_t)(addr + 1243 ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * 1244 sizeof (struct rdsv3_header))); 1245 1246 RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev); 1247 1248 return (0); 1249 } 1250 1251 void 1252 rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) 1253 { 1254 RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev); 1255 ASSERT(ic->i_mr != NULL); 1256 1257 ic->i_send_hdrs = NULL; 1258 ic->i_send_hdrs_dma = NULL; 1259 1260 ic->i_recv_hdrs = NULL; 1261 ic->i_recv_hdrs_dma = NULL; 1262 1263 ic->i_ack = NULL; 1264 ic->i_ack_dma = NULL; 1265 1266 (void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl); 1267 1268 kmem_free(ic->i_mr->addr, ic->i_mr->size); 1269 kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr)); 1270 1271 ic->i_mr = NULL; 1272 RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev); 1273 } 1274