1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/types.h> 58 #include <sys/stat.h> 59 #include <sys/conf.h> 60 #include <sys/ddi.h> 61 #include <sys/sunddi.h> 62 #include <sys/modctl.h> 63 #include <sys/rds.h> 64 #include <sys/stropts.h> 65 #include <sys/socket.h> 66 #include <sys/socketvar.h> 67 #include <sys/sockio.h> 68 #include <sys/sysmacros.h> 69 70 #include <inet/ip.h> 71 #include <net/if_types.h> 72 73 #include <sys/ib/clients/rdsv3/rdsv3.h> 74 #include <sys/ib/clients/rdsv3/rdma.h> 75 #include <sys/ib/clients/rdsv3/rdma_transport.h> 76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 77 78 extern void rdsv3_remove_bound(struct rdsv3_sock *rds); 79 extern int rdsv3_verify_bind_address(ipaddr_t addr); 80 81 extern ddi_taskq_t *rdsv3_taskq; 82 extern struct rdma_cm_id *rdsv3_rdma_listen_id; 83 84 /* this is just used for stats gathering :/ */ 85 kmutex_t rdsv3_sock_lock; 86 static unsigned long rdsv3_sock_count; 87 list_t rdsv3_sock_list; 88 rdsv3_wait_queue_t rdsv3_poll_waitq; 89 90 /* 91 * This is called as the final descriptor referencing this socket is closed. 92 * We have to unbind the socket so that another socket can be bound to the 93 * address it was using. 94 * 95 * We have to be careful about racing with the incoming path. sock_orphan() 96 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 97 * messages shouldn't be queued. 98 */ 99 /* ARGSUSED */ 100 static int 101 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) 102 { 103 struct rsock *sk = (struct rsock *)proto_handle; 104 struct rdsv3_sock *rs; 105 106 if (sk == NULL) 107 goto out; 108 109 rs = rdsv3_sk_to_rs(sk); 110 RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk); 111 112 rdsv3_sk_sock_orphan(sk); 113 rdsv3_cong_remove_socket(rs); 114 rdsv3_remove_bound(rs); 115 /* 116 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so 117 * that ensures the recv path has completed messing 118 * with the socket. 119 */ 120 rdsv3_clear_recv_queue(rs); 121 rdsv3_send_drop_to(rs, NULL); 122 rdsv3_rdma_drop_keys(rs); 123 (void) rdsv3_notify_queue_get(rs, NULL); 124 125 mutex_enter(&rdsv3_sock_lock); 126 list_remove_node(&rs->rs_item); 127 rdsv3_sock_count--; 128 mutex_exit(&rdsv3_sock_lock); 129 130 while (sk->sk_refcount > 1) { 131 /* wait for 1 sec and try again */ 132 delay(drv_usectohz(1000000)); 133 } 134 135 /* this will free the rs and sk */ 136 rdsv3_sk_sock_put(sk); 137 138 RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs); 139 out: 140 return (0); 141 } 142 143 void 144 __rdsv3_wake_sk_sleep(struct rsock *sk) 145 { 146 /* wakup anyone waiting in recvmsg */ 147 if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep) 148 rdsv3_wake_up(sk->sk_sleep); 149 } 150 151 /* 152 * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep. 153 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 154 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 155 * this seems more conservative. 156 * NB - normally, one would use sk_callback_lock for this, but we can 157 * get here from interrupts, whereas the network code grabs sk_callback_lock 158 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 159 */ 160 void 161 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs) 162 { 163 RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs); 164 165 rw_enter(&rs->rs_recv_lock, RW_READER); 166 __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs)); 167 rw_exit(&rs->rs_recv_lock); 168 } 169 170 /*ARGSUSED*/ 171 static int 172 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 173 socklen_t *addr_len, cred_t *cr) 174 { 175 struct rsock *sk = (struct rsock *)proto_handle; 176 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 177 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 178 179 RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs, 180 rs->rs_bound_port); 181 182 sin->sin_port = rs->rs_bound_port; 183 sin->sin_addr.s_addr = rs->rs_bound_addr; 184 185 sin->sin_family = AF_INET_OFFLOAD; 186 187 *addr_len = sizeof (*sin); 188 return (0); 189 } 190 191 /* 192 * RDS' poll is without a doubt the least intuitive part of the interface, 193 * as POLLIN and POLLOUT do not behave entirely as you would expect from 194 * a network protocol. 195 * 196 * POLLIN is asserted if 197 * - there is data on the receive queue. 198 * - to signal that a previously congested destination may have become 199 * uncongested 200 * - A notification has been queued to the socket (this can be a congestion 201 * update, or a RDMA completion). 202 * 203 * POLLOUT is asserted if there is room on the send queue. This does not mean 204 * however, that the next sendmsg() call will succeed. If the application tries 205 * to send to a congested destination, the system call may still fail (and 206 * return ENOBUFS). 207 */ 208 /* ARGSUSED */ 209 static short 210 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, 211 cred_t *cr) 212 { 213 struct rsock *sk = (struct rsock *)proto_handle; 214 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 215 unsigned short mask = 0; 216 217 #if 0 218 RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet); 219 #endif 220 221 /* 222 * If rs_seen_congestion is on, wait until it's off. 223 * This is implemented for the following OFED code. 224 * if (rs->rs_seen_congestion) 225 * poll_wait(file, &rds_poll_waitq, wait); 226 */ 227 mutex_enter(&rdsv3_poll_waitq.waitq_mutex); 228 while (rs->rs_seen_congestion) { 229 cv_wait(&rdsv3_poll_waitq.waitq_cv, 230 &rdsv3_poll_waitq.waitq_mutex); 231 } 232 mutex_exit(&rdsv3_poll_waitq.waitq_mutex); 233 234 rw_enter(&rs->rs_recv_lock, RW_READER); 235 if (!rs->rs_cong_monitor) { 236 /* 237 * When a congestion map was updated, we signal POLLIN for 238 * "historical" reasons. Applications can also poll for 239 * WRBAND instead. 240 */ 241 if (rdsv3_cong_updated_since(&rs->rs_cong_track)) 242 mask |= (POLLIN | POLLRDNORM | POLLWRBAND); 243 } else { 244 mutex_enter(&rs->rs_lock); 245 if (rs->rs_cong_notify) 246 mask |= (POLLIN | POLLRDNORM); 247 mutex_exit(&rs->rs_lock); 248 } 249 if (!list_is_empty(&rs->rs_recv_queue) || 250 !list_is_empty(&rs->rs_notify_queue)) 251 mask |= (POLLIN | POLLRDNORM); 252 if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) 253 mask |= (POLLOUT | POLLWRNORM); 254 rw_exit(&rs->rs_recv_lock); 255 256 /* clear state any time we wake a seen-congested socket */ 257 if (mask) { 258 mutex_enter(&rdsv3_poll_waitq.waitq_mutex); 259 rs->rs_seen_congestion = 0; 260 mutex_exit(&rdsv3_poll_waitq.waitq_mutex); 261 } 262 263 #if 0 264 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); 265 #endif 266 267 return (mask); 268 } 269 270 /* ARGSUSED */ 271 static int 272 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 273 int mode, int32_t *rvalp, cred_t *cr) 274 { 275 ksocket_t so4; 276 struct lifconf lifc; 277 struct lifreq lifr, *lifrp; 278 struct ifconf ifc; 279 struct ifreq ifr; 280 int rval = 0, rc, len; 281 int numifs; 282 int bufsize; 283 void *buf; 284 285 RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd); 286 287 /* Only ipv4 for now */ 288 rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP, 289 CRED()); 290 if (rval != 0) { 291 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d", 292 rval); 293 return (rval); 294 } 295 296 switch (cmd) { 297 case SIOCGLIFNUM : 298 case SIOCGIFNUM : 299 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 300 if (rval != 0) break; 301 if (cmd == SIOCGLIFNUM) { 302 (void) ddi_copyout(&numifs, (void *)arg, 303 sizeof (int), 0); 304 } else { 305 len = 0; 306 for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs; 307 rc++, lifrp++) { 308 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) { 309 len++; 310 } 311 } 312 (void) ddi_copyout(&len, (void *)arg, 313 sizeof (int), 0); 314 } 315 kmem_free(buf, bufsize); 316 break; 317 318 case SIOCGLIFCONF : 319 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0) 320 != 0) { 321 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc"); 322 rval = EFAULT; 323 break; 324 } 325 326 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 327 if (rval != 0) { 328 RDSV3_DPRINTF2("rdsv3_ioctl", 329 "rdsv3_do_ip_ioctl failed: %d", rval); 330 break; 331 } 332 333 if ((lifc.lifc_len > 0) && (numifs > 0)) { 334 if (ddi_copyout(buf, (void *)lifc.lifc_req, 335 (lifc.lifc_len < bufsize) ? lifc.lifc_len : 336 bufsize, 0) != 0) { 337 RDSV3_DPRINTF2("rdsv3_ioctl", 338 "copyout of records failed"); 339 rval = EFAULT; 340 } 341 342 } 343 344 lifc.lifc_len = bufsize; 345 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf), 346 0) != 0) { 347 RDSV3_DPRINTF2("rdsv3_ioctl", 348 "copyout of lifconf failed"); 349 rval = EFAULT; 350 } 351 352 kmem_free(buf, bufsize); 353 break; 354 355 case SIOCGIFCONF : 356 case O_SIOCGIFCONF : 357 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0) 358 != 0) { 359 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc"); 360 rval = EFAULT; 361 break; 362 } 363 364 RDSV3_DPRINTF2("rdsv3_ioctl", 365 "O_SIOCGIFCONF: ifc_len: %d, req: %p", 366 ifc.ifc_len, ifc.ifc_req); 367 368 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs); 369 if (rval != 0) { 370 RDSV3_DPRINTF2("rdsv3_ioctl", 371 "rdsv3_do_ip_ioctl_old failed: %d", rval); 372 break; 373 } 374 375 if ((ifc.ifc_len > 0) && (numifs > 0)) { 376 if (ddi_copyout(buf, (void *)ifc.ifc_req, 377 (ifc.ifc_len < bufsize) ? ifc.ifc_len : 378 bufsize, 0) != 0) { 379 RDSV3_DPRINTF2("rdsv3_ioctl", 380 "copyout of records failed"); 381 rval = EFAULT; 382 } 383 384 } 385 386 ifc.ifc_len = bufsize; 387 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf), 388 0) != 0) { 389 RDSV3_DPRINTF2("rdsv3_ioctl", 390 "copyout of ifconf failed"); 391 rval = EFAULT; 392 } 393 394 kmem_free(buf, bufsize); 395 break; 396 397 case SIOCGLIFFLAGS : 398 case SIOCSLIFFLAGS : 399 case SIOCGLIFMTU : 400 case SIOCGLIFNETMASK : 401 case SIOCGLIFINDEX : 402 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0) 403 != 0) { 404 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr"); 405 rval = EFAULT; 406 break; 407 } 408 409 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED()); 410 if (rc != 0) { 411 RDSV3_DPRINTF2("rdsv3_ioctl", 412 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 413 rc, lifr.lifr_name, cmd); 414 break; 415 } 416 417 (void) ddi_copyout(&lifr, (void *)arg, 418 sizeof (struct lifreq), 0); 419 break; 420 421 case SIOCGIFFLAGS : 422 case SIOCSIFFLAGS : 423 case SIOCGIFMTU : 424 case SIOCGIFNETMASK : 425 case SIOCGIFINDEX : 426 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0) 427 != 0) { 428 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr"); 429 rval = EFAULT; 430 break; 431 } 432 433 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name); 434 435 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED()); 436 if (rc != 0) { 437 RDSV3_DPRINTF2("rdsv3_ioctl", 438 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 439 rc, ifr.ifr_name, cmd); 440 441 break; 442 } 443 444 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name); 445 446 (void) ddi_copyout(&ifr, (void *)arg, 447 sizeof (struct ifreq), 0); 448 break; 449 450 default: 451 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd); 452 rval = EOPNOTSUPP; 453 } 454 455 (void) ksocket_close(so4, CRED()); 456 457 RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd); 458 459 *rvalp = rval; 460 return (rval); 461 } 462 463 static int 464 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len) 465 { 466 struct sockaddr_in sin; 467 468 /* racing with another thread binding seems ok here */ 469 if (rs->rs_bound_addr == 0) 470 return (-ENOTCONN); /* XXX not a great errno */ 471 472 if (len < sizeof (struct sockaddr_in)) 473 return (-EINVAL); 474 475 if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in), 476 0) != 0) { 477 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin"); 478 return (-EFAULT); 479 } 480 481 rdsv3_send_drop_to(rs, &sin); 482 483 return (0); 484 } 485 486 static int 487 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen) 488 { 489 int value = *optval; 490 491 if (optlen < sizeof (int)) 492 return (-EINVAL); 493 *optvar = !!value; 494 return (0); 495 } 496 497 static int 498 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen) 499 { 500 int ret; 501 502 ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 503 if (ret == 0) { 504 if (rs->rs_cong_monitor) { 505 rdsv3_cong_add_socket(rs); 506 } else { 507 rdsv3_cong_remove_socket(rs); 508 rs->rs_cong_mask = 0; 509 rs->rs_cong_notify = 0; 510 } 511 } 512 return (ret); 513 } 514 515 /*ARGSUSED*/ 516 static int 517 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level, 518 int optname, const void *optval, socklen_t optlen, cred_t *cr) 519 { 520 struct rsock *sk = (struct rsock *)proto_handle; 521 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 522 int ret = 0; 523 524 RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)", 525 rs, level, optname); 526 527 switch (optname) { 528 case RDSV3_CANCEL_SENT_TO: 529 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen); 530 break; 531 case RDSV3_GET_MR: 532 ret = rdsv3_get_mr(rs, optval, optlen); 533 break; 534 case RDSV3_GET_MR_FOR_DEST: 535 ret = rdsv3_get_mr_for_dest(rs, optval, optlen); 536 break; 537 case RDSV3_FREE_MR: 538 ret = rdsv3_free_mr(rs, optval, optlen); 539 break; 540 case RDSV3_RECVERR: 541 ret = rdsv3_set_bool_option(&rs->rs_recverr, 542 (char *)optval, optlen); 543 break; 544 case RDSV3_CONG_MONITOR: 545 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen); 546 break; 547 case SO_SNDBUF: 548 sk->sk_sndbuf = *(uint_t *)optval; 549 return (ret); 550 case SO_RCVBUF: 551 sk->sk_rcvbuf = *(uint_t *)optval; 552 return (ret); 553 default: 554 #if 1 555 break; 556 #else 557 ret = -ENOPROTOOPT; 558 #endif 559 } 560 out: 561 return (ret); 562 } 563 564 /* XXX */ 565 /*ARGSUSED*/ 566 static int 567 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level, 568 int optname, void *optval, socklen_t *optlen, cred_t *cr) 569 { 570 struct rsock *sk = (struct rsock *)proto_handle; 571 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 572 int ret = 0; 573 574 RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)", 575 rs, optname, *optlen); 576 577 switch (optname) { 578 case SO_SNDBUF: 579 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)", 580 sk->sk_sndbuf); 581 if (*optlen != 0) { 582 *((int *)optval) = sk->sk_sndbuf; 583 *optlen = sizeof (uint_t); 584 } 585 return (ret); 586 case SO_RCVBUF: 587 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)", 588 sk->sk_rcvbuf); 589 if (*optlen != 0) { 590 *((int *)optval) = sk->sk_rcvbuf; 591 *optlen = sizeof (uint_t); 592 } 593 return (ret); 594 case RDSV3_RECVERR: 595 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)", 596 rs->rs_recverr); 597 if (*optlen < sizeof (int)) 598 return (-EINVAL); 599 else { 600 *(int *)optval = rs->rs_recverr; 601 *optlen = sizeof (int); 602 } 603 return (0); 604 default: 605 if ((optname >= RDSV3_INFO_FIRST) && 606 (optname <= RDSV3_INFO_LAST)) { 607 return (rdsv3_info_getsockopt(sk, optname, optval, 608 optlen)); 609 } 610 RDSV3_DPRINTF2("rdsv3_getsockopt", 611 "Unknown: level: %d optname: %d", level, optname); 612 ret = -ENOPROTOOPT; 613 } 614 615 RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)", 616 rs, optname, ret); 617 return (ret); 618 } 619 620 /*ARGSUSED*/ 621 static int rdsv3_connect(sock_lower_handle_t proto_handle, 622 const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn, 623 cred_t *cr) 624 { 625 struct rsock *sk = (struct rsock *)proto_handle; 626 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 627 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 628 int ret = 0; 629 630 RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs); 631 632 mutex_enter(&sk->sk_lock); 633 634 if (addr_len != sizeof (struct sockaddr_in)) { 635 ret = -EINVAL; 636 goto out; 637 } 638 639 if (sin->sin_family != AF_INET_OFFLOAD) { 640 ret = -EAFNOSUPPORT; 641 goto out; 642 } 643 644 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 645 ret = -EDESTADDRREQ; 646 goto out; 647 } 648 649 rs->rs_conn_addr = sin->sin_addr.s_addr; 650 rs->rs_conn_port = sin->sin_port; 651 652 sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1); 653 654 RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs); 655 656 out: 657 mutex_exit(&sk->sk_lock); 658 return (ret); 659 } 660 661 /*ARGSUSED*/ 662 static int 663 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 664 { 665 struct rsock *sk = (struct rsock *)proto_handle; 666 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 667 668 RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs); 669 670 return (0); 671 } 672 673 /*ARGSUSED*/ 674 void 675 rdsv3_activate(sock_lower_handle_t proto_handle, 676 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, 677 int flags, cred_t *cr) 678 { 679 struct rsock *sk = (struct rsock *)proto_handle; 680 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 681 682 RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs); 683 684 sk->sk_upcalls = sock_upcalls; 685 sk->sk_upper_handle = sock_handle; 686 687 RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs); 688 } 689 690 691 /* ARGSUSED */ 692 int 693 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio, 694 struct nmsghdr *msg, cred_t *cr) 695 { 696 struct rsock *sk = (struct rsock *)proto_handle; 697 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 698 int ret; 699 700 RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs); 701 ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid); 702 703 RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret); 704 if (ret < 0) { 705 return (-ret); 706 } 707 708 return (0); 709 } 710 711 /* ARGSUSED */ 712 int 713 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio, 714 struct nmsghdr *msg, cred_t *cr) 715 { 716 struct rsock *sk = (struct rsock *)proto_handle; 717 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 718 int ret; 719 720 RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs); 721 ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags); 722 723 RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret); 724 725 if (ret < 0) { 726 return (-ret); 727 } 728 729 return (0); 730 } 731 732 /*ARGSUSED*/ 733 int 734 rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 735 socklen_t *addr_len, cred_t *cr) 736 { 737 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 738 struct rsock *sk = (struct rsock *)proto_handle; 739 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 740 741 RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs); 742 743 (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero)); 744 745 /* racey, don't care */ 746 if (!rs->rs_conn_addr) 747 return (-ENOTCONN); 748 749 sin->sin_port = rs->rs_conn_port; 750 sin->sin_addr.s_addr = rs->rs_conn_addr; 751 752 sin->sin_family = AF_INET_OFFLOAD; 753 754 *addr_len = sizeof (*sin); 755 return (0); 756 } 757 758 void 759 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle) 760 { 761 struct rsock *sk = (struct rsock *)proto_handle; 762 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 763 764 RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs); 765 } 766 767 #ifndef __lock_lint 768 static struct sock_downcalls_s rdsv3_sock_downcalls = { 769 .sd_close = rdsv3_release, 770 .sd_bind = rdsv3_bind, 771 .sd_connect = rdsv3_connect, 772 .sd_accept = NULL, 773 .sd_getsockname = rdsv3_getname, 774 .sd_poll = rdsv3_poll, 775 .sd_ioctl = rdsv3_ioctl, 776 .sd_listen = NULL, 777 .sd_shutdown = rdsv3_shutdown, 778 .sd_setsockopt = rdsv3_setsockopt, 779 .sd_getsockopt = rdsv3_getsockopt, 780 .sd_send_uio = rdsv3_send_uio, 781 .sd_recv_uio = rdsv3_recv_uio, 782 .sd_activate = rdsv3_activate, 783 .sd_getpeername = rdsv3_getpeername, 784 .sd_send = NULL, 785 .sd_clr_flowctrl = NULL 786 }; 787 #else 788 static struct sock_downcalls_s rdsv3_sock_downcalls = { 789 rdsv3_activate, 790 NULL, 791 rdsv3_bind, 792 NULL, 793 rdsv3_connect, 794 rdsv3_getpeername, 795 rdsv3_getname, 796 rdsv3_getsockopt, 797 rdsv3_setsockopt, 798 NULL, 799 rdsv3_send_uio, 800 rdsv3_recv_uio, 801 rdsv3_poll, 802 rdsv3_shutdown, 803 NULL, 804 rdsv3_ioctl, 805 rdsv3_release 806 }; 807 #endif 808 809 sock_lower_handle_t 810 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 811 uint_t *smodep, int *errorp, int flags, cred_t *credp) 812 { 813 struct rdsv3_sock *rs; 814 struct rsock *sk; 815 816 RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d " 817 "flags: %d", family, type, proto, flags); 818 819 sk = rdsv3_sk_alloc(); 820 if (sk == NULL) 821 return (NULL); 822 rdsv3_sock_init_data(sk); 823 824 rs = rdsv3_sk_to_rs(sk); 825 rs->rs_sk = sk; 826 mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL); 827 rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL); 828 list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message), 829 offsetof(struct rdsv3_message, m_sock_item)); 830 list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming), 831 offsetof(struct rdsv3_incoming, i_item)); 832 list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier), 833 offsetof(struct rdsv3_notifier, n_list)); 834 mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL); 835 avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, 836 sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); 837 mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); 838 rs->rs_cred = credp; 839 rs->rs_zoneid = getzoneid(); 840 crhold(credp); 841 842 mutex_enter(&rdsv3_sock_lock); 843 list_insert_tail(&rdsv3_sock_list, rs); 844 rdsv3_sock_count++; 845 /* Initialize RDMA/IB on the 1st socket if not done at attach */ 846 if (rdsv3_sock_count == 1) { 847 rdsv3_rdma_init(); 848 } 849 mutex_exit(&rdsv3_sock_lock); 850 851 *errorp = 0; 852 *smodep = SM_ATOMIC; 853 *sock_downcalls = &rdsv3_sock_downcalls; 854 855 RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs); 856 857 return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs)); 858 } 859 860 void 861 rdsv3_sock_addref(struct rdsv3_sock *rs) 862 { 863 RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs); 864 rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 865 } 866 867 void 868 rdsv3_sock_put(struct rdsv3_sock *rs) 869 { 870 RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs); 871 rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 872 } 873 874 static void 875 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len, 876 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 877 { 878 struct rdsv3_sock *rs; 879 struct rdsv3_incoming *inc; 880 unsigned int total = 0; 881 882 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)", 883 rdsv3_sk_to_rs(sock)); 884 885 len /= sizeof (struct rdsv3_info_message); 886 887 mutex_enter(&rdsv3_sock_lock); 888 889 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 890 rw_enter(&rs->rs_recv_lock, RW_READER); 891 892 /* XXX too lazy to maintain counts.. */ 893 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) { 894 total++; 895 if (total <= len) 896 rdsv3_inc_info_copy(inc, iter, inc->i_saddr, 897 rs->rs_bound_addr, 1); 898 } 899 900 rw_exit(&rs->rs_recv_lock); 901 } 902 903 mutex_exit(&rdsv3_sock_lock); 904 905 lens->nr = total; 906 lens->each = sizeof (struct rdsv3_info_message); 907 908 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)", 909 rdsv3_sk_to_rs(sock)); 910 } 911 912 static void 913 rdsv3_sock_info(struct rsock *sock, unsigned int len, 914 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 915 { 916 struct rdsv3_info_socket sinfo; 917 struct rdsv3_sock *rs; 918 unsigned long bytes; 919 920 RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)", 921 rdsv3_sk_to_rs(sock)); 922 923 len /= sizeof (struct rdsv3_info_socket); 924 925 mutex_enter(&rdsv3_sock_lock); 926 927 if ((len < rdsv3_sock_count) || (iter->addr == NULL)) 928 goto out; 929 930 bytes = sizeof (struct rdsv3_info_socket); 931 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 932 sinfo.sndbuf = rdsv3_sk_sndbuf(rs); 933 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs); 934 sinfo.bound_addr = rs->rs_bound_addr; 935 sinfo.connected_addr = rs->rs_conn_addr; 936 sinfo.bound_port = rs->rs_bound_port; 937 sinfo.connected_port = rs->rs_conn_port; 938 939 rdsv3_info_copy(iter, &sinfo, bytes); 940 } 941 942 RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)", 943 rdsv3_sk_to_rs(sock)); 944 945 out: 946 lens->nr = rdsv3_sock_count; 947 lens->each = sizeof (struct rdsv3_info_socket); 948 949 mutex_exit(&rdsv3_sock_lock); 950 } 951 952 rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL; 953 uint_t rdsv3_rdma_init_delay = 5; /* secs */ 954 extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work); 955 956 void 957 rdsv3_exit(void) 958 { 959 RDSV3_DPRINTF4("rdsv3_exit", "Enter"); 960 961 if (rdsv3_rdma_dwp) { 962 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp); 963 } 964 965 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit, 966 NULL, DDI_SLEEP); 967 while (rdsv3_rdma_listen_id != NULL) { 968 #ifndef __lock_lint 969 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit", 970 __func__, __LINE__); 971 #endif 972 delay(drv_usectohz(1000)); 973 } 974 975 rdsv3_conn_exit(); 976 rdsv3_cong_exit(); 977 rdsv3_sysctl_exit(); 978 rdsv3_threads_exit(); 979 rdsv3_stats_exit(); 980 rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 981 rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES, 982 rdsv3_sock_inc_info); 983 984 if (rdsv3_rdma_dwp) { 985 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t)); 986 rdsv3_rdma_dwp = NULL; 987 } 988 989 RDSV3_DPRINTF4("rdsv3_exit", "Return"); 990 } 991 992 /*ARGSUSED*/ 993 int 994 rdsv3_init() 995 { 996 int ret; 997 998 RDSV3_DPRINTF4("rdsv3_init", "Enter"); 999 1000 rdsv3_cong_init(); 1001 1002 ret = rdsv3_conn_init(); 1003 if (ret) 1004 goto out; 1005 ret = rdsv3_threads_init(); 1006 if (ret) 1007 goto out_conn; 1008 ret = rdsv3_sysctl_init(); 1009 if (ret) 1010 goto out_threads; 1011 ret = rdsv3_stats_init(); 1012 if (ret) 1013 goto out_sysctl; 1014 1015 rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 1016 rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info); 1017 1018 /* rdsv3_rdma_init need to be called with a little delay */ 1019 rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP); 1020 RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker); 1021 rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp, 1022 rdsv3_rdma_init_delay); 1023 1024 RDSV3_DPRINTF4("rdsv3_init", "Return"); 1025 1026 goto out; 1027 1028 out_stats: 1029 rdsv3_stats_exit(); 1030 out_sysctl: 1031 rdsv3_sysctl_exit(); 1032 out_threads: 1033 rdsv3_threads_exit(); 1034 out_conn: 1035 rdsv3_conn_exit(); 1036 rdsv3_cong_exit(); 1037 out: 1038 return (ret); 1039 } 1040