1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/types.h> 58 #include <sys/stat.h> 59 #include <sys/conf.h> 60 #include <sys/ddi.h> 61 #include <sys/sunddi.h> 62 #include <sys/modctl.h> 63 #include <sys/rds.h> 64 #include <sys/stropts.h> 65 #include <sys/socket.h> 66 #include <sys/socketvar.h> 67 #include <sys/sockio.h> 68 #include <sys/sysmacros.h> 69 70 #include <inet/ip.h> 71 #include <net/if_types.h> 72 73 #include <sys/ib/clients/rdsv3/rdsv3.h> 74 #include <sys/ib/clients/rdsv3/rdma.h> 75 #include <sys/ib/clients/rdsv3/rdma_transport.h> 76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 77 78 extern void rdsv3_remove_bound(struct rdsv3_sock *rds); 79 extern int rdsv3_verify_bind_address(ipaddr_t addr); 80 81 extern ddi_taskq_t *rdsv3_taskq; 82 extern struct rdma_cm_id *rdsv3_rdma_listen_id; 83 84 /* this is just used for stats gathering :/ */ 85 kmutex_t rdsv3_sock_lock; 86 static unsigned long rdsv3_sock_count; 87 list_t rdsv3_sock_list; 88 rdsv3_wait_queue_t rdsv3_poll_waitq; 89 90 /* 91 * This is called as the final descriptor referencing this socket is closed. 92 * We have to unbind the socket so that another socket can be bound to the 93 * address it was using. 94 * 95 * We have to be careful about racing with the incoming path. sock_orphan() 96 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 97 * messages shouldn't be queued. 98 */ 99 /* ARGSUSED */ 100 static int 101 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) 102 { 103 struct rsock *sk = (struct rsock *)proto_handle; 104 struct rdsv3_sock *rs; 105 106 if (sk == NULL) 107 goto out; 108 109 rs = rdsv3_sk_to_rs(sk); 110 RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk); 111 112 rdsv3_sk_sock_orphan(sk); 113 rdsv3_cong_remove_socket(rs); 114 rdsv3_remove_bound(rs); 115 /* 116 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so 117 * that ensures the recv path has completed messing 118 * with the socket. 119 */ 120 rdsv3_clear_recv_queue(rs); 121 rdsv3_send_drop_to(rs, NULL); 122 rdsv3_rdma_drop_keys(rs); 123 (void) rdsv3_notify_queue_get(rs, NULL); 124 125 mutex_enter(&rdsv3_sock_lock); 126 list_remove_node(&rs->rs_item); 127 rdsv3_sock_count--; 128 mutex_exit(&rdsv3_sock_lock); 129 130 rdsv3_sk_sock_put(sk); 131 132 RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs); 133 out: 134 return (0); 135 } 136 137 void 138 __rdsv3_wake_sk_sleep(struct rsock *sk) 139 { 140 /* wakup anyone waiting in recvmsg */ 141 if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep) 142 rdsv3_wake_up(sk->sk_sleep); 143 } 144 145 /* 146 * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep. 147 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 148 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 149 * this seems more conservative. 150 * NB - normally, one would use sk_callback_lock for this, but we can 151 * get here from interrupts, whereas the network code grabs sk_callback_lock 152 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 153 */ 154 void 155 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs) 156 { 157 RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs); 158 159 rw_enter(&rs->rs_recv_lock, RW_READER); 160 __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs)); 161 rw_exit(&rs->rs_recv_lock); 162 } 163 164 /*ARGSUSED*/ 165 static int 166 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 167 socklen_t *addr_len, cred_t *cr) 168 { 169 struct rsock *sk = (struct rsock *)proto_handle; 170 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 171 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 172 173 RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs, 174 rs->rs_bound_port); 175 176 sin->sin_port = rs->rs_bound_port; 177 sin->sin_addr.s_addr = rs->rs_bound_addr; 178 179 sin->sin_family = AF_INET_OFFLOAD; 180 181 *addr_len = sizeof (*sin); 182 return (0); 183 } 184 185 /* 186 * RDS' poll is without a doubt the least intuitive part of the interface, 187 * as POLLIN and POLLOUT do not behave entirely as you would expect from 188 * a network protocol. 189 * 190 * POLLIN is asserted if 191 * - there is data on the receive queue. 192 * - to signal that a previously congested destination may have become 193 * uncongested 194 * - A notification has been queued to the socket (this can be a congestion 195 * update, or a RDMA completion). 196 * 197 * POLLOUT is asserted if there is room on the send queue. This does not mean 198 * however, that the next sendmsg() call will succeed. If the application tries 199 * to send to a congested destination, the system call may still fail (and 200 * return ENOBUFS). 201 */ 202 /* ARGSUSED */ 203 static short 204 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, 205 cred_t *cr) 206 { 207 struct rsock *sk = (struct rsock *)proto_handle; 208 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 209 unsigned short mask = 0; 210 211 #if 0 212 RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet); 213 #endif 214 215 rw_enter(&rs->rs_recv_lock, RW_READER); 216 if (!rs->rs_cong_monitor) { 217 /* 218 * When a congestion map was updated, we signal POLLIN for 219 * "historical" reasons. Applications can also poll for 220 * WRBAND instead. 221 */ 222 if (rdsv3_cong_updated_since(&rs->rs_cong_track)) 223 mask |= (POLLIN | POLLRDNORM | POLLWRBAND); 224 } else { 225 mutex_enter(&rs->rs_lock); 226 if (rs->rs_cong_notify) 227 mask |= (POLLIN | POLLRDNORM); 228 mutex_exit(&rs->rs_lock); 229 } 230 if (!list_is_empty(&rs->rs_recv_queue) || 231 !list_is_empty(&rs->rs_notify_queue)) 232 mask |= (POLLIN | POLLRDNORM); 233 if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) 234 mask |= (POLLOUT | POLLWRNORM); 235 rw_exit(&rs->rs_recv_lock); 236 237 #if 0 238 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); 239 #endif 240 241 return (mask); 242 } 243 244 /* ARGSUSED */ 245 static int 246 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 247 int mode, int32_t *rvalp, cred_t *cr) 248 { 249 ksocket_t so4; 250 struct lifconf lifc; 251 struct lifreq lifr, *lifrp; 252 struct ifconf ifc; 253 struct ifreq ifr; 254 int rval = 0, rc, len; 255 int numifs; 256 int bufsize; 257 void *buf; 258 259 RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd); 260 261 /* Only ipv4 for now */ 262 rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP, 263 CRED()); 264 if (rval != 0) { 265 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d", 266 rval); 267 return (rval); 268 } 269 270 switch (cmd) { 271 case SIOCGLIFNUM : 272 case SIOCGIFNUM : 273 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 274 if (rval != 0) break; 275 if (cmd == SIOCGLIFNUM) { 276 (void) ddi_copyout(&numifs, (void *)arg, 277 sizeof (int), 0); 278 } else { 279 len = 0; 280 for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs; 281 rc++, lifrp++) { 282 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) { 283 len++; 284 } 285 } 286 (void) ddi_copyout(&len, (void *)arg, 287 sizeof (int), 0); 288 } 289 kmem_free(buf, bufsize); 290 break; 291 292 case SIOCGLIFCONF : 293 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0) 294 != 0) { 295 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc"); 296 rval = EFAULT; 297 break; 298 } 299 300 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 301 if (rval != 0) { 302 RDSV3_DPRINTF2("rdsv3_ioctl", 303 "rdsv3_do_ip_ioctl failed: %d", rval); 304 break; 305 } 306 307 if ((lifc.lifc_len > 0) && (numifs > 0)) { 308 if (ddi_copyout(buf, (void *)lifc.lifc_req, 309 (lifc.lifc_len < bufsize) ? lifc.lifc_len : 310 bufsize, 0) != 0) { 311 RDSV3_DPRINTF2("rdsv3_ioctl", 312 "copyout of records failed"); 313 rval = EFAULT; 314 } 315 316 } 317 318 lifc.lifc_len = bufsize; 319 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf), 320 0) != 0) { 321 RDSV3_DPRINTF2("rdsv3_ioctl", 322 "copyout of lifconf failed"); 323 rval = EFAULT; 324 } 325 326 kmem_free(buf, bufsize); 327 break; 328 329 case SIOCGIFCONF : 330 case O_SIOCGIFCONF : 331 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0) 332 != 0) { 333 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc"); 334 rval = EFAULT; 335 break; 336 } 337 338 RDSV3_DPRINTF2("rdsv3_ioctl", 339 "O_SIOCGIFCONF: ifc_len: %d, req: %p", 340 ifc.ifc_len, ifc.ifc_req); 341 342 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs); 343 if (rval != 0) { 344 RDSV3_DPRINTF2("rdsv3_ioctl", 345 "rdsv3_do_ip_ioctl_old failed: %d", rval); 346 break; 347 } 348 349 if ((ifc.ifc_len > 0) && (numifs > 0)) { 350 if (ddi_copyout(buf, (void *)ifc.ifc_req, 351 (ifc.ifc_len < bufsize) ? ifc.ifc_len : 352 bufsize, 0) != 0) { 353 RDSV3_DPRINTF2("rdsv3_ioctl", 354 "copyout of records failed"); 355 rval = EFAULT; 356 } 357 358 } 359 360 ifc.ifc_len = bufsize; 361 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf), 362 0) != 0) { 363 RDSV3_DPRINTF2("rdsv3_ioctl", 364 "copyout of ifconf failed"); 365 rval = EFAULT; 366 } 367 368 kmem_free(buf, bufsize); 369 break; 370 371 case SIOCGLIFFLAGS : 372 case SIOCSLIFFLAGS : 373 case SIOCGLIFMTU : 374 case SIOCGLIFNETMASK : 375 case SIOCGLIFINDEX : 376 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0) 377 != 0) { 378 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr"); 379 rval = EFAULT; 380 break; 381 } 382 383 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED()); 384 if (rc != 0) { 385 RDSV3_DPRINTF2("rdsv3_ioctl", 386 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 387 rc, lifr.lifr_name, cmd); 388 break; 389 } 390 391 (void) ddi_copyout(&lifr, (void *)arg, 392 sizeof (struct lifreq), 0); 393 break; 394 395 case SIOCGIFFLAGS : 396 case SIOCSIFFLAGS : 397 case SIOCGIFMTU : 398 case SIOCGIFNETMASK : 399 case SIOCGIFINDEX : 400 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0) 401 != 0) { 402 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr"); 403 rval = EFAULT; 404 break; 405 } 406 407 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name); 408 409 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED()); 410 if (rc != 0) { 411 RDSV3_DPRINTF2("rdsv3_ioctl", 412 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 413 rc, ifr.ifr_name, cmd); 414 415 break; 416 } 417 418 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name); 419 420 (void) ddi_copyout(&ifr, (void *)arg, 421 sizeof (struct ifreq), 0); 422 break; 423 424 default: 425 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd); 426 rval = EOPNOTSUPP; 427 } 428 429 (void) ksocket_close(so4, CRED()); 430 431 RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd); 432 433 *rvalp = rval; 434 return (rval); 435 } 436 437 static int 438 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len) 439 { 440 struct sockaddr_in sin; 441 442 /* racing with another thread binding seems ok here */ 443 if (rs->rs_bound_addr == 0) 444 return (-ENOTCONN); /* XXX not a great errno */ 445 446 if (len < sizeof (struct sockaddr_in)) 447 return (-EINVAL); 448 449 if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in), 450 0) != 0) { 451 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin"); 452 return (-EFAULT); 453 } 454 455 rdsv3_send_drop_to(rs, &sin); 456 457 return (0); 458 } 459 460 static int 461 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen) 462 { 463 int value = *optval; 464 465 if (optlen < sizeof (int)) 466 return (-EINVAL); 467 *optvar = !!value; 468 return (0); 469 } 470 471 static int 472 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen) 473 { 474 int ret; 475 476 ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 477 if (ret == 0) { 478 if (rs->rs_cong_monitor) { 479 rdsv3_cong_add_socket(rs); 480 } else { 481 rdsv3_cong_remove_socket(rs); 482 rs->rs_cong_mask = 0; 483 rs->rs_cong_notify = 0; 484 } 485 } 486 return (ret); 487 } 488 489 /*ARGSUSED*/ 490 static int 491 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level, 492 int optname, const void *optval, socklen_t optlen, cred_t *cr) 493 { 494 struct rsock *sk = (struct rsock *)proto_handle; 495 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 496 int ret = 0; 497 498 RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)", 499 rs, level, optname); 500 501 switch (optname) { 502 case RDSV3_CANCEL_SENT_TO: 503 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen); 504 break; 505 case RDSV3_GET_MR: 506 ret = rdsv3_get_mr(rs, optval, optlen); 507 break; 508 case RDSV3_FREE_MR: 509 ret = rdsv3_free_mr(rs, optval, optlen); 510 break; 511 case RDSV3_RECVERR: 512 ret = rdsv3_set_bool_option(&rs->rs_recverr, 513 (char *)optval, optlen); 514 break; 515 case RDSV3_CONG_MONITOR: 516 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen); 517 break; 518 case SO_SNDBUF: 519 sk->sk_sndbuf = *(uint_t *)optval; 520 return (ret); 521 case SO_RCVBUF: 522 sk->sk_rcvbuf = *(uint_t *)optval; 523 return (ret); 524 default: 525 #if 1 526 break; 527 #else 528 ret = -ENOPROTOOPT; 529 #endif 530 } 531 out: 532 return (ret); 533 } 534 535 /* XXX */ 536 /*ARGSUSED*/ 537 static int 538 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level, 539 int optname, void *optval, socklen_t *optlen, cred_t *cr) 540 { 541 struct rsock *sk = (struct rsock *)proto_handle; 542 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 543 int ret = 0; 544 545 RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)", 546 rs, optname, *optlen); 547 548 switch (optname) { 549 case SO_SNDBUF: 550 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)", 551 sk->sk_sndbuf); 552 if (*optlen != 0) { 553 *((int *)optval) = sk->sk_sndbuf; 554 *optlen = sizeof (uint_t); 555 } 556 return (ret); 557 case SO_RCVBUF: 558 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)", 559 sk->sk_rcvbuf); 560 if (*optlen != 0) { 561 *((int *)optval) = sk->sk_rcvbuf; 562 *optlen = sizeof (uint_t); 563 } 564 return (ret); 565 case RDSV3_RECVERR: 566 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)", 567 rs->rs_recverr); 568 if (*optlen < sizeof (int)) 569 return (-EINVAL); 570 else { 571 *(int *)optval = rs->rs_recverr; 572 *optlen = sizeof (int); 573 } 574 return (0); 575 default: 576 if ((optname >= RDSV3_INFO_FIRST) && 577 (optname <= RDSV3_INFO_LAST)) { 578 return (rdsv3_info_getsockopt(sk, optname, optval, 579 optlen)); 580 } 581 RDSV3_DPRINTF2("rdsv3_getsockopt", 582 "Unknown: level: %d optname: %d", level, optname); 583 ret = -ENOPROTOOPT; 584 } 585 586 RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)", 587 rs, optname, ret); 588 return (ret); 589 } 590 591 /*ARGSUSED*/ 592 static int rdsv3_connect(sock_lower_handle_t proto_handle, 593 const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn, 594 cred_t *cr) 595 { 596 struct rsock *sk = (struct rsock *)proto_handle; 597 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 598 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 599 int ret = 0; 600 601 RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs); 602 603 mutex_enter(&sk->sk_lock); 604 605 if (addr_len != sizeof (struct sockaddr_in)) { 606 ret = -EINVAL; 607 goto out; 608 } 609 610 if (sin->sin_family != AF_INET_OFFLOAD) { 611 ret = -EAFNOSUPPORT; 612 goto out; 613 } 614 615 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 616 ret = -EDESTADDRREQ; 617 goto out; 618 } 619 620 rs->rs_conn_addr = sin->sin_addr.s_addr; 621 rs->rs_conn_port = sin->sin_port; 622 623 sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1); 624 625 RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs); 626 627 out: 628 mutex_exit(&sk->sk_lock); 629 return (ret); 630 } 631 632 /*ARGSUSED*/ 633 static int 634 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 635 { 636 struct rsock *sk = (struct rsock *)proto_handle; 637 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 638 639 RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs); 640 641 return (0); 642 } 643 644 /*ARGSUSED*/ 645 void 646 rdsv3_activate(sock_lower_handle_t proto_handle, 647 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, 648 int flags, cred_t *cr) 649 { 650 struct rsock *sk = (struct rsock *)proto_handle; 651 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 652 653 RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs); 654 655 sk->sk_upcalls = sock_upcalls; 656 sk->sk_upper_handle = sock_handle; 657 658 RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs); 659 } 660 661 662 /* ARGSUSED */ 663 int 664 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio, 665 struct nmsghdr *msg, cred_t *cr) 666 { 667 struct rsock *sk = (struct rsock *)proto_handle; 668 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 669 int ret; 670 671 RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs); 672 ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid); 673 674 RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret); 675 if (ret < 0) { 676 return (-ret); 677 } 678 679 return (0); 680 } 681 682 /* ARGSUSED */ 683 int 684 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio, 685 struct nmsghdr *msg, cred_t *cr) 686 { 687 struct rsock *sk = (struct rsock *)proto_handle; 688 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 689 int ret; 690 691 RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs); 692 ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags); 693 694 RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret); 695 696 if (ret < 0) { 697 return (-ret); 698 } 699 700 return (0); 701 } 702 703 /*ARGSUSED*/ 704 int 705 rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 706 socklen_t *addr_len, cred_t *cr) 707 { 708 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 709 struct rsock *sk = (struct rsock *)proto_handle; 710 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 711 712 RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs); 713 714 (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero)); 715 716 /* racey, don't care */ 717 if (!rs->rs_conn_addr) 718 return (-ENOTCONN); 719 720 sin->sin_port = rs->rs_conn_port; 721 sin->sin_addr.s_addr = rs->rs_conn_addr; 722 723 sin->sin_family = AF_INET_OFFLOAD; 724 725 *addr_len = sizeof (*sin); 726 return (0); 727 } 728 729 void 730 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle) 731 { 732 struct rsock *sk = (struct rsock *)proto_handle; 733 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 734 735 RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs); 736 } 737 738 #ifndef __lock_lint 739 static struct sock_downcalls_s rdsv3_sock_downcalls = { 740 .sd_close = rdsv3_release, 741 .sd_bind = rdsv3_bind, 742 .sd_connect = rdsv3_connect, 743 .sd_accept = NULL, 744 .sd_getsockname = rdsv3_getname, 745 .sd_poll = rdsv3_poll, 746 .sd_ioctl = rdsv3_ioctl, 747 .sd_listen = NULL, 748 .sd_shutdown = rdsv3_shutdown, 749 .sd_setsockopt = rdsv3_setsockopt, 750 .sd_getsockopt = rdsv3_getsockopt, 751 .sd_send_uio = rdsv3_send_uio, 752 .sd_recv_uio = rdsv3_recv_uio, 753 .sd_activate = rdsv3_activate, 754 .sd_getpeername = rdsv3_getpeername, 755 .sd_send = NULL, 756 .sd_clr_flowctrl = NULL 757 }; 758 #else 759 static struct sock_downcalls_s rdsv3_sock_downcalls = { 760 rdsv3_activate, 761 NULL, 762 rdsv3_bind, 763 NULL, 764 rdsv3_connect, 765 rdsv3_getpeername, 766 rdsv3_getname, 767 rdsv3_getsockopt, 768 rdsv3_setsockopt, 769 NULL, 770 rdsv3_send_uio, 771 rdsv3_recv_uio, 772 rdsv3_poll, 773 rdsv3_shutdown, 774 NULL, 775 rdsv3_ioctl, 776 rdsv3_release 777 }; 778 #endif 779 780 sock_lower_handle_t 781 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 782 uint_t *smodep, int *errorp, int flags, cred_t *credp) 783 { 784 struct rdsv3_sock *rs; 785 struct rsock *sk; 786 787 RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d " 788 "flags: %d", family, type, proto, flags); 789 790 sk = rdsv3_sk_alloc(); 791 if (sk == NULL) 792 return (NULL); 793 rdsv3_sock_init_data(sk); 794 795 rs = rdsv3_sk_to_rs(sk); 796 rs->rs_sk = sk; 797 mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL); 798 rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL); 799 list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message), 800 offsetof(struct rdsv3_message, m_sock_item)); 801 list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming), 802 offsetof(struct rdsv3_incoming, i_item)); 803 list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier), 804 offsetof(struct rdsv3_notifier, n_list)); 805 mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL); 806 avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, 807 sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); 808 mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); 809 rs->rs_cred = credp; 810 rs->rs_zoneid = getzoneid(); 811 crhold(credp); 812 813 mutex_enter(&rdsv3_sock_lock); 814 list_insert_tail(&rdsv3_sock_list, rs); 815 rdsv3_sock_count++; 816 /* Initialize RDMA/IB on the 1st socket if not done at attach */ 817 if (rdsv3_sock_count == 1) { 818 rdsv3_rdma_init(); 819 } 820 mutex_exit(&rdsv3_sock_lock); 821 822 *errorp = 0; 823 *smodep = SM_ATOMIC; 824 *sock_downcalls = &rdsv3_sock_downcalls; 825 826 RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs); 827 828 return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs)); 829 } 830 831 void 832 rdsv3_sock_addref(struct rdsv3_sock *rs) 833 { 834 RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs); 835 rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 836 } 837 838 void 839 rdsv3_sock_put(struct rdsv3_sock *rs) 840 { 841 RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs); 842 rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 843 } 844 845 static void 846 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len, 847 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 848 { 849 struct rdsv3_sock *rs; 850 struct rdsv3_incoming *inc; 851 unsigned int total = 0; 852 853 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)", 854 rdsv3_sk_to_rs(sock)); 855 856 len /= sizeof (struct rdsv3_info_message); 857 858 mutex_enter(&rdsv3_sock_lock); 859 860 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 861 rw_enter(&rs->rs_recv_lock, RW_READER); 862 863 /* XXX too lazy to maintain counts.. */ 864 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) { 865 total++; 866 if (total <= len) 867 rdsv3_inc_info_copy(inc, iter, inc->i_saddr, 868 rs->rs_bound_addr, 1); 869 } 870 871 rw_exit(&rs->rs_recv_lock); 872 } 873 874 mutex_exit(&rdsv3_sock_lock); 875 876 lens->nr = total; 877 lens->each = sizeof (struct rdsv3_info_message); 878 879 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)", 880 rdsv3_sk_to_rs(sock)); 881 } 882 883 static void 884 rdsv3_sock_info(struct rsock *sock, unsigned int len, 885 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 886 { 887 struct rdsv3_info_socket sinfo; 888 struct rdsv3_sock *rs; 889 unsigned long bytes; 890 891 RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)", 892 rdsv3_sk_to_rs(sock)); 893 894 len /= sizeof (struct rdsv3_info_socket); 895 896 mutex_enter(&rdsv3_sock_lock); 897 898 if ((len < rdsv3_sock_count) || (iter->addr == NULL)) 899 goto out; 900 901 bytes = sizeof (struct rdsv3_info_socket); 902 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 903 sinfo.sndbuf = rdsv3_sk_sndbuf(rs); 904 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs); 905 sinfo.bound_addr = rs->rs_bound_addr; 906 sinfo.connected_addr = rs->rs_conn_addr; 907 sinfo.bound_port = rs->rs_bound_port; 908 sinfo.connected_port = rs->rs_conn_port; 909 910 rdsv3_info_copy(iter, &sinfo, bytes); 911 } 912 913 RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)", 914 rdsv3_sk_to_rs(sock)); 915 916 out: 917 lens->nr = rdsv3_sock_count; 918 lens->each = sizeof (struct rdsv3_info_socket); 919 920 mutex_exit(&rdsv3_sock_lock); 921 } 922 923 rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL; 924 uint_t rdsv3_rdma_init_delay = 5; /* secs */ 925 extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work); 926 927 void 928 rdsv3_exit(void) 929 { 930 RDSV3_DPRINTF4("rdsv3_exit", "Enter"); 931 932 if (rdsv3_rdma_dwp) { 933 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp); 934 } 935 936 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit, 937 NULL, DDI_SLEEP); 938 while (rdsv3_rdma_listen_id != NULL) { 939 #ifndef __lock_lint 940 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit", 941 __func__, __LINE__); 942 #endif 943 delay(drv_usectohz(1000)); 944 } 945 946 rdsv3_conn_exit(); 947 rdsv3_cong_exit(); 948 rdsv3_sysctl_exit(); 949 rdsv3_threads_exit(); 950 rdsv3_stats_exit(); 951 rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 952 rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES, 953 rdsv3_sock_inc_info); 954 955 if (rdsv3_rdma_dwp) { 956 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t)); 957 rdsv3_rdma_dwp = NULL; 958 } 959 960 RDSV3_DPRINTF4("rdsv3_exit", "Return"); 961 } 962 963 /*ARGSUSED*/ 964 int 965 rdsv3_init() 966 { 967 int ret; 968 969 RDSV3_DPRINTF4("rdsv3_init", "Enter"); 970 971 rdsv3_cong_init(); 972 ret = rdsv3_conn_init(); 973 if (ret) 974 goto out; 975 ret = rdsv3_threads_init(); 976 if (ret) 977 goto out_conn; 978 ret = rdsv3_sysctl_init(); 979 if (ret) 980 goto out_threads; 981 ret = rdsv3_stats_init(); 982 if (ret) 983 goto out_sysctl; 984 985 rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 986 rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info); 987 988 /* rdsv3_rdma_init need to be called with a little delay */ 989 rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP); 990 RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker); 991 rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp, 992 rdsv3_rdma_init_delay); 993 994 RDSV3_DPRINTF4("rdsv3_init", "Return"); 995 996 goto out; 997 998 out_stats: 999 rdsv3_stats_exit(); 1000 out_sysctl: 1001 rdsv3_sysctl_exit(); 1002 out_threads: 1003 rdsv3_threads_exit(); 1004 out_conn: 1005 rdsv3_conn_exit(); 1006 rdsv3_cong_exit(); 1007 out: 1008 return (ret); 1009 } 1010