1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/types.h> 58 #include <sys/stat.h> 59 #include <sys/conf.h> 60 #include <sys/ddi.h> 61 #include <sys/sunddi.h> 62 #include <sys/modctl.h> 63 #include <sys/rds.h> 64 #include <sys/stropts.h> 65 #include <sys/socket.h> 66 #include <sys/socketvar.h> 67 #include <sys/sockio.h> 68 #include <sys/sysmacros.h> 69 70 #include <inet/ip.h> 71 #include <net/if_types.h> 72 73 #include <sys/ib/clients/rdsv3/rdsv3.h> 74 #include <sys/ib/clients/rdsv3/rdma.h> 75 #include <sys/ib/clients/rdsv3/rdma_transport.h> 76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 77 78 extern void rdsv3_remove_bound(struct rdsv3_sock *rds); 79 extern int rdsv3_verify_bind_address(ipaddr_t addr); 80 81 extern ddi_taskq_t *rdsv3_taskq; 82 extern struct rdma_cm_id *rdsv3_rdma_listen_id; 83 84 /* this is just used for stats gathering :/ */ 85 kmutex_t rdsv3_sock_lock; 86 static unsigned long rdsv3_sock_count; 87 list_t rdsv3_sock_list; 88 rdsv3_wait_queue_t rdsv3_poll_waitq; 89 90 /* 91 * This is called as the final descriptor referencing this socket is closed. 92 * We have to unbind the socket so that another socket can be bound to the 93 * address it was using. 94 * 95 * We have to be careful about racing with the incoming path. sock_orphan() 96 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 97 * messages shouldn't be queued. 98 */ 99 /* ARGSUSED */ 100 static int 101 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) 102 { 103 struct rsock *sk = (struct rsock *)proto_handle; 104 struct rdsv3_sock *rs; 105 106 if (sk == NULL) 107 goto out; 108 109 rs = rdsv3_sk_to_rs(sk); 110 RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk); 111 112 rdsv3_sk_sock_orphan(sk); 113 rdsv3_cong_remove_socket(rs); 114 rdsv3_remove_bound(rs); 115 /* 116 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so 117 * that ensures the recv path has completed messing 118 * with the socket. 119 */ 120 rdsv3_clear_recv_queue(rs); 121 rdsv3_send_drop_to(rs, NULL); 122 rdsv3_rdma_drop_keys(rs); 123 (void) rdsv3_notify_queue_get(rs, NULL); 124 125 mutex_enter(&rdsv3_sock_lock); 126 list_remove_node(&rs->rs_item); 127 rdsv3_sock_count--; 128 mutex_exit(&rdsv3_sock_lock); 129 130 while (sk->sk_refcount > 1) { 131 /* wait for 1 sec and try again */ 132 delay(drv_usectohz(1000000)); 133 } 134 135 /* this will free the rs and sk */ 136 rdsv3_sk_sock_put(sk); 137 138 RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs); 139 out: 140 return (0); 141 } 142 143 void 144 __rdsv3_wake_sk_sleep(struct rsock *sk) 145 { 146 /* wakup anyone waiting in recvmsg */ 147 if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep) 148 rdsv3_wake_up(sk->sk_sleep); 149 } 150 151 /* 152 * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep. 153 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 154 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 155 * this seems more conservative. 156 * NB - normally, one would use sk_callback_lock for this, but we can 157 * get here from interrupts, whereas the network code grabs sk_callback_lock 158 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 159 */ 160 void 161 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs) 162 { 163 RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs); 164 165 rw_enter(&rs->rs_recv_lock, RW_READER); 166 __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs)); 167 rw_exit(&rs->rs_recv_lock); 168 } 169 170 /*ARGSUSED*/ 171 static int 172 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 173 socklen_t *addr_len, cred_t *cr) 174 { 175 struct rsock *sk = (struct rsock *)proto_handle; 176 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 177 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 178 179 RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs, 180 rs->rs_bound_port); 181 182 sin->sin_port = rs->rs_bound_port; 183 sin->sin_addr.s_addr = rs->rs_bound_addr; 184 185 sin->sin_family = AF_INET_OFFLOAD; 186 187 *addr_len = sizeof (*sin); 188 return (0); 189 } 190 191 /* 192 * RDS' poll is without a doubt the least intuitive part of the interface, 193 * as POLLIN and POLLOUT do not behave entirely as you would expect from 194 * a network protocol. 195 * 196 * POLLIN is asserted if 197 * - there is data on the receive queue. 198 * - to signal that a previously congested destination may have become 199 * uncongested 200 * - A notification has been queued to the socket (this can be a congestion 201 * update, or a RDMA completion). 202 * 203 * POLLOUT is asserted if there is room on the send queue. This does not mean 204 * however, that the next sendmsg() call will succeed. If the application tries 205 * to send to a congested destination, the system call may still fail (and 206 * return ENOBUFS). 207 */ 208 /* ARGSUSED */ 209 static short 210 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, 211 cred_t *cr) 212 { 213 struct rsock *sk = (struct rsock *)proto_handle; 214 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 215 unsigned short mask = 0; 216 217 #if 0 218 RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet); 219 #endif 220 221 /* 222 * If rs_seen_congestion is on, wait until it's off. 223 * This is implemented for the following OFED code. 224 * if (rs->rs_seen_congestion) 225 * poll_wait(file, &rds_poll_waitq, wait); 226 */ 227 mutex_enter(&rdsv3_poll_waitq.waitq_mutex); 228 while (rs->rs_seen_congestion) { 229 cv_wait(&rdsv3_poll_waitq.waitq_cv, 230 &rdsv3_poll_waitq.waitq_mutex); 231 } 232 mutex_exit(&rdsv3_poll_waitq.waitq_mutex); 233 234 rw_enter(&rs->rs_recv_lock, RW_READER); 235 if (!rs->rs_cong_monitor) { 236 /* 237 * When a congestion map was updated, we signal POLLIN for 238 * "historical" reasons. Applications can also poll for 239 * WRBAND instead. 240 */ 241 if (rdsv3_cong_updated_since(&rs->rs_cong_track)) 242 mask |= (POLLIN | POLLRDNORM | POLLWRBAND); 243 } else { 244 mutex_enter(&rs->rs_lock); 245 if (rs->rs_cong_notify) 246 mask |= (POLLIN | POLLRDNORM); 247 mutex_exit(&rs->rs_lock); 248 } 249 if (!list_is_empty(&rs->rs_recv_queue) || 250 !list_is_empty(&rs->rs_notify_queue)) 251 mask |= (POLLIN | POLLRDNORM); 252 if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) 253 mask |= (POLLOUT | POLLWRNORM); 254 rw_exit(&rs->rs_recv_lock); 255 256 /* clear state any time we wake a seen-congested socket */ 257 if (mask) { 258 mutex_enter(&rdsv3_poll_waitq.waitq_mutex); 259 rs->rs_seen_congestion = 0; 260 mutex_exit(&rdsv3_poll_waitq.waitq_mutex); 261 } 262 263 #if 0 264 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); 265 #endif 266 267 return (mask); 268 } 269 270 /* ARGSUSED */ 271 static int 272 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 273 int mode, int32_t *rvalp, cred_t *cr) 274 { 275 ksocket_t so4; 276 struct lifconf lifc; 277 struct lifreq lifr, *lifrp; 278 struct ifconf ifc; 279 struct ifreq ifr; 280 int rval = 0, rc, len; 281 int numifs; 282 int bufsize; 283 void *buf; 284 285 RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd); 286 287 /* Only ipv4 for now */ 288 rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP, 289 CRED()); 290 if (rval != 0) { 291 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d", 292 rval); 293 return (rval); 294 } 295 296 switch (cmd) { 297 case SIOCGLIFNUM : 298 case SIOCGIFNUM : 299 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 300 if (rval != 0) break; 301 if (cmd == SIOCGLIFNUM) { 302 struct lifnum lifn; 303 lifn.lifn_family = AF_INET_OFFLOAD; 304 lifn.lifn_flags = 0; 305 lifn.lifn_count = numifs; 306 (void) ddi_copyout(&lifn, (void *)arg, 307 sizeof (struct lifnum), 0); 308 } else { 309 len = 0; 310 for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs; 311 rc++, lifrp++) { 312 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) { 313 len++; 314 } 315 } 316 (void) ddi_copyout(&len, (void *)arg, 317 sizeof (int), 0); 318 } 319 kmem_free(buf, bufsize); 320 break; 321 322 case SIOCGLIFCONF : 323 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0) 324 != 0) { 325 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc"); 326 rval = EFAULT; 327 break; 328 } 329 330 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 331 if (rval != 0) { 332 RDSV3_DPRINTF2("rdsv3_ioctl", 333 "rdsv3_do_ip_ioctl failed: %d", rval); 334 break; 335 } 336 337 if ((lifc.lifc_len > 0) && (numifs > 0)) { 338 if (ddi_copyout(buf, (void *)lifc.lifc_req, 339 (lifc.lifc_len < bufsize) ? lifc.lifc_len : 340 bufsize, 0) != 0) { 341 RDSV3_DPRINTF2("rdsv3_ioctl", 342 "copyout of records failed"); 343 rval = EFAULT; 344 } 345 346 } 347 348 lifc.lifc_len = bufsize; 349 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf), 350 0) != 0) { 351 RDSV3_DPRINTF2("rdsv3_ioctl", 352 "copyout of lifconf failed"); 353 rval = EFAULT; 354 } 355 356 kmem_free(buf, bufsize); 357 break; 358 359 case SIOCGIFCONF : 360 case O_SIOCGIFCONF : 361 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0) 362 != 0) { 363 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc"); 364 rval = EFAULT; 365 break; 366 } 367 368 RDSV3_DPRINTF2("rdsv3_ioctl", 369 "O_SIOCGIFCONF: ifc_len: %d, req: %p", 370 ifc.ifc_len, ifc.ifc_req); 371 372 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs); 373 if (rval != 0) { 374 RDSV3_DPRINTF2("rdsv3_ioctl", 375 "rdsv3_do_ip_ioctl_old failed: %d", rval); 376 break; 377 } 378 379 if ((ifc.ifc_len > 0) && (numifs > 0)) { 380 if (ddi_copyout(buf, (void *)ifc.ifc_req, 381 (ifc.ifc_len < bufsize) ? ifc.ifc_len : 382 bufsize, 0) != 0) { 383 RDSV3_DPRINTF2("rdsv3_ioctl", 384 "copyout of records failed"); 385 rval = EFAULT; 386 } 387 388 } 389 390 ifc.ifc_len = bufsize; 391 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf), 392 0) != 0) { 393 RDSV3_DPRINTF2("rdsv3_ioctl", 394 "copyout of ifconf failed"); 395 rval = EFAULT; 396 } 397 398 kmem_free(buf, bufsize); 399 break; 400 401 case SIOCGLIFFLAGS : 402 case SIOCSLIFFLAGS : 403 case SIOCGLIFMTU : 404 case SIOCGLIFNETMASK : 405 case SIOCGLIFINDEX : 406 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0) 407 != 0) { 408 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr"); 409 rval = EFAULT; 410 break; 411 } 412 413 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED()); 414 if (rc != 0) { 415 RDSV3_DPRINTF2("rdsv3_ioctl", 416 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 417 rc, lifr.lifr_name, cmd); 418 break; 419 } 420 421 (void) ddi_copyout(&lifr, (void *)arg, 422 sizeof (struct lifreq), 0); 423 break; 424 425 case SIOCGIFFLAGS : 426 case SIOCSIFFLAGS : 427 case SIOCGIFMTU : 428 case SIOCGIFNETMASK : 429 case SIOCGIFINDEX : 430 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0) 431 != 0) { 432 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr"); 433 rval = EFAULT; 434 break; 435 } 436 437 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name); 438 439 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED()); 440 if (rc != 0) { 441 RDSV3_DPRINTF2("rdsv3_ioctl", 442 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 443 rc, ifr.ifr_name, cmd); 444 445 break; 446 } 447 448 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name); 449 450 (void) ddi_copyout(&ifr, (void *)arg, 451 sizeof (struct ifreq), 0); 452 break; 453 454 default: 455 if ((cmd >= RDSV3_INFO_FIRST) && 456 (cmd <= RDSV3_INFO_LAST)) { 457 return (rdsv3_info_ioctl((struct rsock *)proto_handle, 458 cmd, (char *)arg, rvalp)); 459 } 460 RDSV3_DPRINTF2("rdsv3_ioctl", "Unknown ioctl cmd: %d", cmd); 461 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd); 462 rval = EOPNOTSUPP; 463 } 464 465 (void) ksocket_close(so4, CRED()); 466 467 RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd); 468 469 *rvalp = rval; 470 return (rval); 471 } 472 473 static int 474 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len) 475 { 476 struct sockaddr_in sin; 477 478 /* racing with another thread binding seems ok here */ 479 if (rs->rs_bound_addr == 0) 480 return (-ENOTCONN); /* XXX not a great errno */ 481 482 if (len < sizeof (struct sockaddr_in)) 483 return (-EINVAL); 484 485 if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in), 486 0) != 0) { 487 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin"); 488 return (-EFAULT); 489 } 490 491 rdsv3_send_drop_to(rs, &sin); 492 493 return (0); 494 } 495 496 static int 497 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen) 498 { 499 int value = *optval; 500 501 if (optlen < sizeof (int)) 502 return (-EINVAL); 503 *optvar = !!value; 504 return (0); 505 } 506 507 static int 508 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen) 509 { 510 int ret; 511 512 ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 513 if (ret == 0) { 514 if (rs->rs_cong_monitor) { 515 rdsv3_cong_add_socket(rs); 516 } else { 517 rdsv3_cong_remove_socket(rs); 518 rs->rs_cong_mask = 0; 519 rs->rs_cong_notify = 0; 520 } 521 } 522 return (ret); 523 } 524 525 /*ARGSUSED*/ 526 static int 527 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level, 528 int optname, const void *optval, socklen_t optlen, cred_t *cr) 529 { 530 struct rsock *sk = (struct rsock *)proto_handle; 531 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 532 int ret = 0; 533 534 RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)", 535 rs, level, optname); 536 537 switch (optname) { 538 case RDSV3_CANCEL_SENT_TO: 539 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen); 540 break; 541 case RDSV3_GET_MR: 542 ret = rdsv3_get_mr(rs, optval, optlen); 543 break; 544 case RDSV3_GET_MR_FOR_DEST: 545 ret = rdsv3_get_mr_for_dest(rs, optval, optlen); 546 break; 547 case RDSV3_FREE_MR: 548 ret = rdsv3_free_mr(rs, optval, optlen); 549 break; 550 case RDSV3_RECVERR: 551 ret = rdsv3_set_bool_option(&rs->rs_recverr, 552 (char *)optval, optlen); 553 break; 554 case RDSV3_CONG_MONITOR: 555 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen); 556 break; 557 case SO_SNDBUF: 558 sk->sk_sndbuf = *(uint_t *)optval; 559 return (ret); 560 case SO_RCVBUF: 561 sk->sk_rcvbuf = *(uint_t *)optval; 562 return (ret); 563 default: 564 #if 1 565 break; 566 #else 567 ret = -ENOPROTOOPT; 568 #endif 569 } 570 out: 571 return (ret); 572 } 573 574 /* XXX */ 575 /*ARGSUSED*/ 576 static int 577 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level, 578 int optname, void *optval, socklen_t *optlen, cred_t *cr) 579 { 580 struct rsock *sk = (struct rsock *)proto_handle; 581 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 582 int ret = 0; 583 584 RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)", 585 rs, optname, *optlen); 586 587 switch (optname) { 588 case SO_SNDBUF: 589 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)", 590 sk->sk_sndbuf); 591 if (*optlen != 0) { 592 *((int *)optval) = sk->sk_sndbuf; 593 *optlen = sizeof (uint_t); 594 } 595 return (ret); 596 case SO_RCVBUF: 597 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)", 598 sk->sk_rcvbuf); 599 if (*optlen != 0) { 600 *((int *)optval) = sk->sk_rcvbuf; 601 *optlen = sizeof (uint_t); 602 } 603 return (ret); 604 case RDSV3_RECVERR: 605 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)", 606 rs->rs_recverr); 607 if (*optlen < sizeof (int)) 608 return (-EINVAL); 609 else { 610 *(int *)optval = rs->rs_recverr; 611 *optlen = sizeof (int); 612 } 613 return (0); 614 default: 615 RDSV3_DPRINTF2("rdsv3_getsockopt", 616 "Unknown: level: %d optname: %d", level, optname); 617 ret = -ENOPROTOOPT; 618 } 619 620 RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)", 621 rs, optname, ret); 622 return (ret); 623 } 624 625 /*ARGSUSED*/ 626 static int rdsv3_connect(sock_lower_handle_t proto_handle, 627 const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn, 628 cred_t *cr) 629 { 630 struct rsock *sk = (struct rsock *)proto_handle; 631 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 632 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 633 int ret = 0; 634 635 RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs); 636 637 mutex_enter(&sk->sk_lock); 638 639 if (addr_len != sizeof (struct sockaddr_in)) { 640 ret = -EINVAL; 641 goto out; 642 } 643 644 if (sin->sin_family != AF_INET_OFFLOAD) { 645 ret = -EAFNOSUPPORT; 646 goto out; 647 } 648 649 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 650 ret = -EDESTADDRREQ; 651 goto out; 652 } 653 654 rs->rs_conn_addr = sin->sin_addr.s_addr; 655 rs->rs_conn_port = sin->sin_port; 656 657 sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1); 658 659 RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs); 660 661 out: 662 mutex_exit(&sk->sk_lock); 663 return (ret); 664 } 665 666 /*ARGSUSED*/ 667 static int 668 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 669 { 670 struct rsock *sk = (struct rsock *)proto_handle; 671 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 672 673 RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs); 674 675 return (0); 676 } 677 678 /*ARGSUSED*/ 679 void 680 rdsv3_activate(sock_lower_handle_t proto_handle, 681 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, 682 int flags, cred_t *cr) 683 { 684 struct rsock *sk = (struct rsock *)proto_handle; 685 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 686 687 RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs); 688 689 sk->sk_upcalls = sock_upcalls; 690 sk->sk_upper_handle = sock_handle; 691 692 RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs); 693 } 694 695 696 /* ARGSUSED */ 697 int 698 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio, 699 struct nmsghdr *msg, cred_t *cr) 700 { 701 struct rsock *sk = (struct rsock *)proto_handle; 702 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 703 int ret; 704 705 RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs); 706 ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid); 707 708 RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret); 709 if (ret < 0) { 710 return (-ret); 711 } 712 713 return (0); 714 } 715 716 /* ARGSUSED */ 717 int 718 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio, 719 struct nmsghdr *msg, cred_t *cr) 720 { 721 struct rsock *sk = (struct rsock *)proto_handle; 722 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 723 int ret; 724 725 RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs); 726 ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags); 727 728 RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret); 729 730 if (ret < 0) { 731 return (-ret); 732 } 733 734 return (0); 735 } 736 737 /*ARGSUSED*/ 738 int 739 rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 740 socklen_t *addr_len, cred_t *cr) 741 { 742 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 743 struct rsock *sk = (struct rsock *)proto_handle; 744 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 745 746 RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs); 747 748 (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero)); 749 750 /* racey, don't care */ 751 if (!rs->rs_conn_addr) 752 return (-ENOTCONN); 753 754 sin->sin_port = rs->rs_conn_port; 755 sin->sin_addr.s_addr = rs->rs_conn_addr; 756 757 sin->sin_family = AF_INET_OFFLOAD; 758 759 *addr_len = sizeof (*sin); 760 return (0); 761 } 762 763 void 764 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle) 765 { 766 struct rsock *sk = (struct rsock *)proto_handle; 767 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 768 769 RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs); 770 } 771 772 #ifndef __lock_lint 773 static struct sock_downcalls_s rdsv3_sock_downcalls = { 774 .sd_close = rdsv3_release, 775 .sd_bind = rdsv3_bind, 776 .sd_connect = rdsv3_connect, 777 .sd_accept = NULL, 778 .sd_getsockname = rdsv3_getname, 779 .sd_poll = rdsv3_poll, 780 .sd_ioctl = rdsv3_ioctl, 781 .sd_listen = NULL, 782 .sd_shutdown = rdsv3_shutdown, 783 .sd_setsockopt = rdsv3_setsockopt, 784 .sd_getsockopt = rdsv3_getsockopt, 785 .sd_send_uio = rdsv3_send_uio, 786 .sd_recv_uio = rdsv3_recv_uio, 787 .sd_activate = rdsv3_activate, 788 .sd_getpeername = rdsv3_getpeername, 789 .sd_send = NULL, 790 .sd_clr_flowctrl = NULL 791 }; 792 #else 793 static struct sock_downcalls_s rdsv3_sock_downcalls = { 794 rdsv3_activate, 795 NULL, 796 rdsv3_bind, 797 NULL, 798 rdsv3_connect, 799 rdsv3_getpeername, 800 rdsv3_getname, 801 rdsv3_getsockopt, 802 rdsv3_setsockopt, 803 NULL, 804 rdsv3_send_uio, 805 rdsv3_recv_uio, 806 rdsv3_poll, 807 rdsv3_shutdown, 808 NULL, 809 rdsv3_ioctl, 810 rdsv3_release 811 }; 812 #endif 813 814 sock_lower_handle_t 815 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 816 uint_t *smodep, int *errorp, int flags, cred_t *credp) 817 { 818 struct rdsv3_sock *rs; 819 struct rsock *sk; 820 821 RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d " 822 "flags: %d", family, type, proto, flags); 823 824 sk = rdsv3_sk_alloc(); 825 if (sk == NULL) 826 return (NULL); 827 rdsv3_sock_init_data(sk); 828 829 rs = rdsv3_sk_to_rs(sk); 830 rs->rs_sk = sk; 831 mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL); 832 rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL); 833 list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message), 834 offsetof(struct rdsv3_message, m_sock_item)); 835 list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming), 836 offsetof(struct rdsv3_incoming, i_item)); 837 list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier), 838 offsetof(struct rdsv3_notifier, n_list)); 839 mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL); 840 avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, 841 sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); 842 mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); 843 rs->rs_cred = credp; 844 rs->rs_zoneid = getzoneid(); 845 crhold(credp); 846 847 mutex_enter(&rdsv3_sock_lock); 848 list_insert_tail(&rdsv3_sock_list, rs); 849 rdsv3_sock_count++; 850 /* Initialize RDMA/IB on the 1st socket if not done at attach */ 851 if (rdsv3_sock_count == 1) { 852 rdsv3_rdma_init(); 853 } 854 mutex_exit(&rdsv3_sock_lock); 855 856 *errorp = 0; 857 *smodep = SM_ATOMIC; 858 *sock_downcalls = &rdsv3_sock_downcalls; 859 860 RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs); 861 862 return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs)); 863 } 864 865 void 866 rdsv3_sock_addref(struct rdsv3_sock *rs) 867 { 868 RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs); 869 rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 870 } 871 872 void 873 rdsv3_sock_put(struct rdsv3_sock *rs) 874 { 875 RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs); 876 rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 877 } 878 879 static void 880 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len, 881 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 882 { 883 struct rdsv3_sock *rs; 884 struct rdsv3_incoming *inc; 885 unsigned int total = 0; 886 887 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)", 888 rdsv3_sk_to_rs(sock)); 889 890 len /= sizeof (struct rdsv3_info_message); 891 892 mutex_enter(&rdsv3_sock_lock); 893 894 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 895 rw_enter(&rs->rs_recv_lock, RW_READER); 896 897 /* XXX too lazy to maintain counts.. */ 898 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) { 899 total++; 900 if (total <= len) 901 rdsv3_inc_info_copy(inc, iter, inc->i_saddr, 902 rs->rs_bound_addr, 1); 903 } 904 905 rw_exit(&rs->rs_recv_lock); 906 } 907 908 mutex_exit(&rdsv3_sock_lock); 909 910 lens->nr = total; 911 lens->each = sizeof (struct rdsv3_info_message); 912 913 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)", 914 rdsv3_sk_to_rs(sock)); 915 } 916 917 static void 918 rdsv3_sock_info(struct rsock *sock, unsigned int len, 919 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 920 { 921 struct rdsv3_info_socket sinfo; 922 struct rdsv3_sock *rs; 923 unsigned long bytes; 924 925 RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)", 926 rdsv3_sk_to_rs(sock)); 927 928 len /= sizeof (struct rdsv3_info_socket); 929 930 mutex_enter(&rdsv3_sock_lock); 931 932 if ((len < rdsv3_sock_count) || (iter->addr == NULL)) 933 goto out; 934 935 bytes = sizeof (struct rdsv3_info_socket); 936 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 937 sinfo.sndbuf = rdsv3_sk_sndbuf(rs); 938 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs); 939 sinfo.bound_addr = rs->rs_bound_addr; 940 sinfo.connected_addr = rs->rs_conn_addr; 941 sinfo.bound_port = rs->rs_bound_port; 942 sinfo.connected_port = rs->rs_conn_port; 943 944 rdsv3_info_copy(iter, &sinfo, bytes); 945 } 946 947 RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)", 948 rdsv3_sk_to_rs(sock)); 949 950 out: 951 lens->nr = rdsv3_sock_count; 952 lens->each = sizeof (struct rdsv3_info_socket); 953 954 mutex_exit(&rdsv3_sock_lock); 955 } 956 957 rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL; 958 uint_t rdsv3_rdma_init_delay = 5; /* secs */ 959 extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work); 960 961 void 962 rdsv3_exit(void) 963 { 964 RDSV3_DPRINTF4("rdsv3_exit", "Enter"); 965 966 if (rdsv3_rdma_dwp) { 967 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp); 968 } 969 970 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit, 971 NULL, DDI_SLEEP); 972 while (rdsv3_rdma_listen_id != NULL) { 973 #ifndef __lock_lint 974 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit", 975 __func__, __LINE__); 976 #endif 977 delay(drv_usectohz(1000)); 978 } 979 980 rdsv3_conn_exit(); 981 rdsv3_cong_exit(); 982 rdsv3_sysctl_exit(); 983 rdsv3_threads_exit(); 984 rdsv3_stats_exit(); 985 rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 986 rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES, 987 rdsv3_sock_inc_info); 988 989 if (rdsv3_rdma_dwp) { 990 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t)); 991 rdsv3_rdma_dwp = NULL; 992 } 993 994 RDSV3_DPRINTF4("rdsv3_exit", "Return"); 995 } 996 997 /*ARGSUSED*/ 998 int 999 rdsv3_init() 1000 { 1001 int ret; 1002 1003 RDSV3_DPRINTF4("rdsv3_init", "Enter"); 1004 1005 rdsv3_cong_init(); 1006 1007 ret = rdsv3_conn_init(); 1008 if (ret) 1009 goto out; 1010 ret = rdsv3_threads_init(); 1011 if (ret) 1012 goto out_conn; 1013 ret = rdsv3_sysctl_init(); 1014 if (ret) 1015 goto out_threads; 1016 ret = rdsv3_stats_init(); 1017 if (ret) 1018 goto out_sysctl; 1019 1020 rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info); 1021 rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info); 1022 1023 /* rdsv3_rdma_init need to be called with a little delay */ 1024 rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP); 1025 RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker); 1026 rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp, 1027 rdsv3_rdma_init_delay); 1028 1029 RDSV3_DPRINTF4("rdsv3_init", "Return"); 1030 1031 goto out; 1032 1033 out_stats: 1034 rdsv3_stats_exit(); 1035 out_sysctl: 1036 rdsv3_sysctl_exit(); 1037 out_threads: 1038 rdsv3_threads_exit(); 1039 out_conn: 1040 rdsv3_conn_exit(); 1041 rdsv3_cong_exit(); 1042 out: 1043 return (ret); 1044 } 1045