xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c (revision c0dd49bdd68c0d758a67d56f07826f3b45cfc664)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/types.h>
58 #include <sys/stat.h>
59 #include <sys/conf.h>
60 #include <sys/ddi.h>
61 #include <sys/sunddi.h>
62 #include <sys/modctl.h>
63 #include <sys/rds.h>
64 #include <sys/stropts.h>
65 #include <sys/socket.h>
66 #include <sys/socketvar.h>
67 #include <sys/sockio.h>
68 #include <sys/sysmacros.h>
69 
70 #include <inet/ip.h>
71 #include <net/if_types.h>
72 
73 #include <sys/ib/clients/rdsv3/rdsv3.h>
74 #include <sys/ib/clients/rdsv3/rdma.h>
75 #include <sys/ib/clients/rdsv3/rdma_transport.h>
76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
77 
/* Bind-table helpers implemented outside this file. */
extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
extern int rdsv3_verify_bind_address(ipaddr_t addr);

/* Task queue and RDMA CM listener id; both are consulted by rdsv3_exit(). */
extern ddi_taskq_t	*rdsv3_taskq;
extern struct rdma_cm_id *rdsv3_rdma_listen_id;

/*
 * Global list of RDS sockets and its element count.  Entries are added in
 * rdsv3_create() and removed in rdsv3_release(); the list is walked by the
 * info callbacks (rdsv3_sock_info() and rdsv3_sock_inc_info()).  All of
 * these take rdsv3_sock_lock around their accesses to the list and count.
 */
kmutex_t rdsv3_sock_lock;
static unsigned long rdsv3_sock_count;
list_t rdsv3_sock_list;
/* Not referenced elsewhere in this file. */
rdsv3_wait_queue_t rdsv3_poll_waitq;
89 
90 /*
91  * This is called as the final descriptor referencing this socket is closed.
92  * We have to unbind the socket so that another socket can be bound to the
93  * address it was using.
94  *
95  * We have to be careful about racing with the incoming path.  sock_orphan()
96  * sets SOCK_DEAD and we use that as an indicator to the rx path that new
97  * messages shouldn't be queued.
98  */
99 /* ARGSUSED */
100 static int
101 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
102 {
103 	struct rsock *sk = (struct rsock *)proto_handle;
104 	struct rdsv3_sock *rs;
105 
106 	if (sk == NULL)
107 		goto out;
108 
109 	rs = rdsv3_sk_to_rs(sk);
110 	RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
111 
112 	rdsv3_sk_sock_orphan(sk);
113 	rdsv3_cong_remove_socket(rs);
114 	rdsv3_remove_bound(rs);
115 	/*
116 	 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so
117 	 * that ensures the recv path has completed messing
118 	 * with the socket.
119 	 */
120 	rdsv3_clear_recv_queue(rs);
121 	rdsv3_send_drop_to(rs, NULL);
122 	rdsv3_rdma_drop_keys(rs);
123 	(void) rdsv3_notify_queue_get(rs, NULL);
124 
125 	mutex_enter(&rdsv3_sock_lock);
126 	list_remove_node(&rs->rs_item);
127 	rdsv3_sock_count--;
128 	mutex_exit(&rdsv3_sock_lock);
129 
130 	rdsv3_sk_sock_put(sk);
131 
132 	RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
133 out:
134 	return (0);
135 }
136 
137 void
138 __rdsv3_wake_sk_sleep(struct rsock *sk)
139 {
140 	/* wakup anyone waiting in recvmsg */
141 	if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
142 		rdsv3_wake_up(sk->sk_sleep);
143 }
144 
145 /*
146  * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
147  * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
148  * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
149  * this seems more conservative.
150  * NB - normally, one would use sk_callback_lock for this, but we can
151  * get here from interrupts, whereas the network code grabs sk_callback_lock
152  * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
153  */
154 void
155 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs)
156 {
157 	RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs);
158 
159 	rw_enter(&rs->rs_recv_lock, RW_READER);
160 	__rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs));
161 	rw_exit(&rs->rs_recv_lock);
162 }
163 
164 /*ARGSUSED*/
165 static int
166 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
167     socklen_t *addr_len, cred_t *cr)
168 {
169 	struct rsock *sk = (struct rsock *)proto_handle;
170 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
171 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
172 
173 	RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs,
174 	    rs->rs_bound_port);
175 
176 	sin->sin_port = rs->rs_bound_port;
177 	sin->sin_addr.s_addr = rs->rs_bound_addr;
178 
179 	sin->sin_family = AF_INET_OFFLOAD;
180 
181 	*addr_len = sizeof (*sin);
182 	return (0);
183 }
184 
185 /*
186  * RDS' poll is without a doubt the least intuitive part of the interface,
187  * as POLLIN and POLLOUT do not behave entirely as you would expect from
188  * a network protocol.
189  *
190  * POLLIN is asserted if
191  *  -	there is data on the receive queue.
192  *  -	to signal that a previously congested destination may have become
193  *	uncongested
194  *  -	A notification has been queued to the socket (this can be a congestion
195  *	update, or a RDMA completion).
196  *
197  * POLLOUT is asserted if there is room on the send queue. This does not mean
198  * however, that the next sendmsg() call will succeed. If the application tries
199  * to send to a congested destination, the system call may still fail (and
200  * return ENOBUFS).
201  */
202 /* ARGSUSED */
203 static short
204 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet,
205     cred_t *cr)
206 {
207 	struct rsock	*sk = (struct rsock *)proto_handle;
208 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
209 	unsigned short mask = 0;
210 
211 #if 0
212 	RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet);
213 #endif
214 
215 	rw_enter(&rs->rs_recv_lock, RW_READER);
216 	if (!rs->rs_cong_monitor) {
217 		/*
218 		 * When a congestion map was updated, we signal POLLIN for
219 		 * "historical" reasons. Applications can also poll for
220 		 * WRBAND instead.
221 		 */
222 		if (rdsv3_cong_updated_since(&rs->rs_cong_track))
223 			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
224 	} else {
225 		mutex_enter(&rs->rs_lock);
226 		if (rs->rs_cong_notify)
227 			mask |= (POLLIN | POLLRDNORM);
228 		mutex_exit(&rs->rs_lock);
229 	}
230 	if (!list_is_empty(&rs->rs_recv_queue) ||
231 	    !list_is_empty(&rs->rs_notify_queue))
232 		mask |= (POLLIN | POLLRDNORM);
233 	if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs))
234 		mask |= (POLLOUT | POLLWRNORM);
235 	rw_exit(&rs->rs_recv_lock);
236 
237 #if 0
238 	RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask);
239 #endif
240 
241 	return (mask);
242 }
243 
244 /* ARGSUSED */
245 static int
246 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
247     int mode, int32_t *rvalp, cred_t *cr)
248 {
249 	ksocket_t	so4;
250 	struct lifconf	lifc;
251 	struct lifreq	lifr, *lifrp;
252 	struct ifconf	ifc;
253 	struct ifreq	ifr;
254 	int		rval = 0, rc, len;
255 	int		numifs;
256 	int		bufsize;
257 	void		*buf;
258 
259 	RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd);
260 
261 	/* Only ipv4 for now */
262 	rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP,
263 	    CRED());
264 	if (rval != 0) {
265 		RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d",
266 		    rval);
267 		return (rval);
268 	}
269 
270 	switch (cmd) {
271 	case SIOCGLIFNUM :
272 	case SIOCGIFNUM :
273 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
274 		if (rval != 0) break;
275 		if (cmd == SIOCGLIFNUM) {
276 			(void) ddi_copyout(&numifs, (void *)arg,
277 			    sizeof (int), 0);
278 		} else {
279 			len = 0;
280 			for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs;
281 			    rc++, lifrp++) {
282 				if (strlen(lifrp->lifr_name) <= IFNAMSIZ) {
283 					len++;
284 				}
285 			}
286 			(void) ddi_copyout(&len, (void *)arg,
287 			    sizeof (int), 0);
288 		}
289 		kmem_free(buf, bufsize);
290 		break;
291 
292 	case SIOCGLIFCONF :
293 		if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0)
294 		    != 0) {
295 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc");
296 			rval = EFAULT;
297 			break;
298 		}
299 
300 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
301 		if (rval != 0) {
302 			RDSV3_DPRINTF2("rdsv3_ioctl",
303 			    "rdsv3_do_ip_ioctl failed: %d", rval);
304 			break;
305 		}
306 
307 		if ((lifc.lifc_len > 0) && (numifs > 0)) {
308 			if (ddi_copyout(buf, (void *)lifc.lifc_req,
309 			    (lifc.lifc_len < bufsize) ? lifc.lifc_len :
310 			    bufsize, 0) != 0) {
311 				RDSV3_DPRINTF2("rdsv3_ioctl",
312 				    "copyout of records failed");
313 				rval = EFAULT;
314 			}
315 
316 		}
317 
318 		lifc.lifc_len = bufsize;
319 		if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf),
320 		    0) != 0) {
321 			RDSV3_DPRINTF2("rdsv3_ioctl",
322 			    "copyout of lifconf failed");
323 			rval = EFAULT;
324 		}
325 
326 		kmem_free(buf, bufsize);
327 		break;
328 
329 	case SIOCGIFCONF :
330 	case O_SIOCGIFCONF :
331 		if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0)
332 		    != 0) {
333 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc");
334 			rval = EFAULT;
335 			break;
336 		}
337 
338 		RDSV3_DPRINTF2("rdsv3_ioctl",
339 		    "O_SIOCGIFCONF: ifc_len: %d, req: %p",
340 		    ifc.ifc_len, ifc.ifc_req);
341 
342 		rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs);
343 		if (rval != 0) {
344 			RDSV3_DPRINTF2("rdsv3_ioctl",
345 			    "rdsv3_do_ip_ioctl_old failed: %d", rval);
346 			break;
347 		}
348 
349 		if ((ifc.ifc_len > 0) && (numifs > 0)) {
350 			if (ddi_copyout(buf, (void *)ifc.ifc_req,
351 			    (ifc.ifc_len < bufsize) ? ifc.ifc_len :
352 			    bufsize, 0) != 0) {
353 				RDSV3_DPRINTF2("rdsv3_ioctl",
354 				    "copyout of records failed");
355 				rval = EFAULT;
356 			}
357 
358 		}
359 
360 		ifc.ifc_len = bufsize;
361 		if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf),
362 		    0) != 0) {
363 			RDSV3_DPRINTF2("rdsv3_ioctl",
364 			    "copyout of ifconf failed");
365 			rval = EFAULT;
366 		}
367 
368 		kmem_free(buf, bufsize);
369 		break;
370 
371 	case SIOCGLIFFLAGS :
372 	case SIOCSLIFFLAGS :
373 	case SIOCGLIFMTU :
374 	case SIOCGLIFNETMASK :
375 	case SIOCGLIFINDEX :
376 		if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0)
377 		    != 0) {
378 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr");
379 			rval = EFAULT;
380 			break;
381 		}
382 
383 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED());
384 		if (rc != 0) {
385 			RDSV3_DPRINTF2("rdsv3_ioctl",
386 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
387 			    rc, lifr.lifr_name, cmd);
388 			break;
389 		}
390 
391 		(void) ddi_copyout(&lifr, (void *)arg,
392 		    sizeof (struct lifreq), 0);
393 		break;
394 
395 	case SIOCGIFFLAGS :
396 	case SIOCSIFFLAGS :
397 	case SIOCGIFMTU :
398 	case SIOCGIFNETMASK :
399 	case SIOCGIFINDEX :
400 		if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0)
401 		    != 0) {
402 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr");
403 			rval = EFAULT;
404 			break;
405 		}
406 
407 		RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name);
408 
409 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED());
410 		if (rc != 0) {
411 			RDSV3_DPRINTF2("rdsv3_ioctl",
412 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
413 			    rc, ifr.ifr_name, cmd);
414 
415 			break;
416 		}
417 
418 		RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name);
419 
420 		(void) ddi_copyout(&ifr, (void *)arg,
421 		    sizeof (struct ifreq), 0);
422 		break;
423 
424 	default:
425 		cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd);
426 		rval = EOPNOTSUPP;
427 	}
428 
429 	(void) ksocket_close(so4, CRED());
430 
431 	RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd);
432 
433 	*rvalp = rval;
434 	return (rval);
435 }
436 
437 static int
438 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len)
439 {
440 	struct sockaddr_in sin;
441 
442 	/* racing with another thread binding seems ok here */
443 	if (rs->rs_bound_addr == 0)
444 		return (-ENOTCONN); /* XXX not a great errno */
445 
446 	if (len < sizeof (struct sockaddr_in))
447 		return (-EINVAL);
448 
449 	if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in),
450 	    0) != 0) {
451 		RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin");
452 		return (-EFAULT);
453 	}
454 
455 	rdsv3_send_drop_to(rs, &sin);
456 
457 	return (0);
458 }
459 
/*
 * Store a boolean socket option.
 *
 * optval must point to at least sizeof (int) bytes holding an int; *optvar
 * is set to 1 if that int is non-zero and to 0 otherwise.  The whole int is
 * examined (not just its first byte), so the result does not depend on byte
 * order and values such as 256 count as true.
 *
 * Returns 0 on success, or -EINVAL (leaving *optvar untouched) when optval
 * is NULL or optlen is smaller than an int, including negative lengths.
 */
static int
rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen)
{
	int value;

	/* Validate before touching the buffer; compare as signed. */
	if (optval == NULL || optlen < (int)sizeof (int))
		return (-EINVAL);

	/* memcpy avoids any alignment assumption about optval. */
	(void) memcpy(&value, optval, sizeof (value));
	*optvar = (value != 0);
	return (0);
}
470 
471 static int
472 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen)
473 {
474 	int ret;
475 
476 	ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
477 	if (ret == 0) {
478 		if (rs->rs_cong_monitor) {
479 			rdsv3_cong_add_socket(rs);
480 		} else {
481 			rdsv3_cong_remove_socket(rs);
482 			rs->rs_cong_mask = 0;
483 			rs->rs_cong_notify = 0;
484 		}
485 	}
486 	return (ret);
487 }
488 
489 /*ARGSUSED*/
490 static int
491 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level,
492     int optname, const void *optval, socklen_t optlen, cred_t *cr)
493 {
494 	struct rsock *sk = (struct rsock *)proto_handle;
495 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
496 	int	ret = 0;
497 
498 	RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)",
499 	    rs, level, optname);
500 
501 	switch (optname) {
502 	case RDSV3_CANCEL_SENT_TO:
503 		ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen);
504 		break;
505 	case RDSV3_GET_MR:
506 		ret = rdsv3_get_mr(rs, optval, optlen);
507 		break;
508 	case RDSV3_FREE_MR:
509 		ret = rdsv3_free_mr(rs, optval, optlen);
510 		break;
511 	case RDSV3_RECVERR:
512 		ret = rdsv3_set_bool_option(&rs->rs_recverr,
513 		    (char *)optval, optlen);
514 		break;
515 	case RDSV3_CONG_MONITOR:
516 		ret = rdsv3_cong_monitor(rs, (char *)optval, optlen);
517 		break;
518 	case SO_SNDBUF:
519 		sk->sk_sndbuf = *(uint_t *)optval;
520 		return (ret);
521 	case SO_RCVBUF:
522 		sk->sk_rcvbuf = *(uint_t *)optval;
523 		return (ret);
524 	default:
525 #if 1
526 		break;
527 #else
528 		ret = -ENOPROTOOPT;
529 #endif
530 	}
531 out:
532 	return (ret);
533 }
534 
535 /* XXX */
536 /*ARGSUSED*/
537 static int
538 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level,
539     int optname, void *optval, socklen_t *optlen, cred_t *cr)
540 {
541 	struct rsock *sk = (struct rsock *)proto_handle;
542 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
543 	int ret = 0;
544 
545 	RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)",
546 	    rs, optname, *optlen);
547 
548 	switch (optname) {
549 	case SO_SNDBUF:
550 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)",
551 		    sk->sk_sndbuf);
552 		if (*optlen != 0) {
553 			*((int *)optval) = sk->sk_sndbuf;
554 			*optlen = sizeof (uint_t);
555 		}
556 		return (ret);
557 	case SO_RCVBUF:
558 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)",
559 		    sk->sk_rcvbuf);
560 		if (*optlen != 0) {
561 			*((int *)optval) = sk->sk_rcvbuf;
562 			*optlen = sizeof (uint_t);
563 		}
564 		return (ret);
565 	case RDSV3_RECVERR:
566 		RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)",
567 		    rs->rs_recverr);
568 		if (*optlen < sizeof (int))
569 			return (-EINVAL);
570 		else {
571 			*(int *)optval = rs->rs_recverr;
572 			*optlen = sizeof (int);
573 		}
574 		return (0);
575 	default:
576 		if ((optname >= RDSV3_INFO_FIRST) &&
577 		    (optname <= RDSV3_INFO_LAST)) {
578 			return (rdsv3_info_getsockopt(sk, optname, optval,
579 			    optlen));
580 		}
581 		RDSV3_DPRINTF2("rdsv3_getsockopt",
582 		    "Unknown: level: %d optname: %d", level, optname);
583 		ret = -ENOPROTOOPT;
584 	}
585 
586 	RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)",
587 	    rs, optname, ret);
588 	return (ret);
589 }
590 
591 /*ARGSUSED*/
592 static int rdsv3_connect(sock_lower_handle_t proto_handle,
593     const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn,
594     cred_t *cr)
595 {
596 	struct rsock *sk = (struct rsock *)proto_handle;
597 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
598 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
599 	int ret = 0;
600 
601 	RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs);
602 
603 	mutex_enter(&sk->sk_lock);
604 
605 	if (addr_len != sizeof (struct sockaddr_in)) {
606 		ret = -EINVAL;
607 		goto out;
608 	}
609 
610 	if (sin->sin_family != AF_INET_OFFLOAD) {
611 		ret = -EAFNOSUPPORT;
612 		goto out;
613 	}
614 
615 	if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
616 		ret = -EDESTADDRREQ;
617 		goto out;
618 	}
619 
620 	rs->rs_conn_addr = sin->sin_addr.s_addr;
621 	rs->rs_conn_port = sin->sin_port;
622 
623 	sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1);
624 
625 	RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs);
626 
627 out:
628 	mutex_exit(&sk->sk_lock);
629 	return (ret);
630 }
631 
632 /*ARGSUSED*/
633 static int
634 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
635 {
636 	struct rsock *sk = (struct rsock *)proto_handle;
637 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
638 
639 	RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs);
640 
641 	return (0);
642 }
643 
644 /*ARGSUSED*/
645 void
646 rdsv3_activate(sock_lower_handle_t proto_handle,
647     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
648     int flags, cred_t *cr)
649 {
650 	struct rsock *sk = (struct rsock *)proto_handle;
651 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
652 
653 	RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs);
654 
655 	sk->sk_upcalls = sock_upcalls;
656 	sk->sk_upper_handle = sock_handle;
657 
658 	RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs);
659 }
660 
661 
662 /* ARGSUSED */
663 int
664 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio,
665     struct nmsghdr *msg, cred_t *cr)
666 {
667 	struct rsock *sk = (struct rsock *)proto_handle;
668 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
669 	int ret;
670 
671 	RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs);
672 	ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid);
673 
674 	RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret);
675 	if (ret < 0) {
676 		return (-ret);
677 	}
678 
679 	return (0);
680 }
681 
682 /* ARGSUSED */
683 int
684 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio,
685     struct nmsghdr *msg, cred_t *cr)
686 {
687 	struct rsock *sk = (struct rsock *)proto_handle;
688 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
689 	int ret;
690 
691 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs);
692 	ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags);
693 
694 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret);
695 
696 	if (ret < 0) {
697 		return (-ret);
698 	}
699 
700 	return (0);
701 }
702 
703 /*ARGSUSED*/
704 int
705 rdsv3_getpeername(sock_lower_handle_t  proto_handle, struct sockaddr *addr,
706     socklen_t *addr_len, cred_t *cr)
707 {
708 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
709 	struct rsock *sk = (struct rsock *)proto_handle;
710 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
711 
712 	RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs);
713 
714 	(void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero));
715 
716 	/* racey, don't care */
717 	if (!rs->rs_conn_addr)
718 		return (-ENOTCONN);
719 
720 	sin->sin_port = rs->rs_conn_port;
721 	sin->sin_addr.s_addr = rs->rs_conn_addr;
722 
723 	sin->sin_family = AF_INET_OFFLOAD;
724 
725 	*addr_len = sizeof (*sin);
726 	return (0);
727 }
728 
729 void
730 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)
731 {
732 	struct rsock *sk = (struct rsock *)proto_handle;
733 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
734 
735 	RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs);
736 }
737 
738 #ifndef __lock_lint
739 static struct sock_downcalls_s rdsv3_sock_downcalls = {
740 	.sd_close =		rdsv3_release,
741 	.sd_bind =		rdsv3_bind,
742 	.sd_connect =		rdsv3_connect,
743 	.sd_accept =		NULL,
744 	.sd_getsockname =	rdsv3_getname,
745 	.sd_poll =		rdsv3_poll,
746 	.sd_ioctl =		rdsv3_ioctl,
747 	.sd_listen =		NULL,
748 	.sd_shutdown =		rdsv3_shutdown,
749 	.sd_setsockopt =	rdsv3_setsockopt,
750 	.sd_getsockopt =	rdsv3_getsockopt,
751 	.sd_send_uio =		rdsv3_send_uio,
752 	.sd_recv_uio =		rdsv3_recv_uio,
753 	.sd_activate =		rdsv3_activate,
754 	.sd_getpeername =	rdsv3_getpeername,
755 	.sd_send =		NULL,
756 	.sd_clr_flowctrl =	NULL
757 };
758 #else
759 static struct sock_downcalls_s rdsv3_sock_downcalls = {
760 	rdsv3_activate,
761 	NULL,
762 	rdsv3_bind,
763 	NULL,
764 	rdsv3_connect,
765 	rdsv3_getpeername,
766 	rdsv3_getname,
767 	rdsv3_getsockopt,
768 	rdsv3_setsockopt,
769 	NULL,
770 	rdsv3_send_uio,
771 	rdsv3_recv_uio,
772 	rdsv3_poll,
773 	rdsv3_shutdown,
774 	NULL,
775 	rdsv3_ioctl,
776 	rdsv3_release
777 };
778 #endif
779 
780 sock_lower_handle_t
781 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
782     uint_t *smodep, int *errorp, int flags, cred_t *credp)
783 {
784 	struct rdsv3_sock	*rs;
785 	struct rsock		*sk;
786 
787 	RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d "
788 	    "flags: %d", family, type, proto, flags);
789 
790 	sk = rdsv3_sk_alloc();
791 	if (sk == NULL)
792 		return (NULL);
793 	rdsv3_sock_init_data(sk);
794 
795 	rs = rdsv3_sk_to_rs(sk);
796 	rs->rs_sk = sk;
797 	mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL);
798 	rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL);
799 	list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message),
800 	    offsetof(struct rdsv3_message, m_sock_item));
801 	list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming),
802 	    offsetof(struct rdsv3_incoming, i_item));
803 	list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier),
804 	    offsetof(struct rdsv3_notifier, n_list));
805 	mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL);
806 	avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare,
807 	    sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node));
808 	mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL);
809 	rs->rs_cred = credp;
810 	rs->rs_zoneid = getzoneid();
811 	crhold(credp);
812 
813 	mutex_enter(&rdsv3_sock_lock);
814 	list_insert_tail(&rdsv3_sock_list, rs);
815 	rdsv3_sock_count++;
816 	/* Initialize RDMA/IB on the 1st socket if not done at attach */
817 	if (rdsv3_sock_count == 1) {
818 		rdsv3_rdma_init();
819 	}
820 	mutex_exit(&rdsv3_sock_lock);
821 
822 	*errorp = 0;
823 	*smodep = SM_ATOMIC;
824 	*sock_downcalls = &rdsv3_sock_downcalls;
825 
826 	RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs);
827 
828 	return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs));
829 }
830 
/* Take a hold on the rsock that backs rs. */
void
rdsv3_sock_addref(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs);
	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_hold(sk);
}
837 
/* Drop a hold on the rsock that backs rs. */
void
rdsv3_sock_put(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs);
	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_put(sk);
}
844 
845 static void
846 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len,
847     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
848 {
849 	struct rdsv3_sock *rs;
850 	struct rdsv3_incoming *inc;
851 	unsigned int total = 0;
852 
853 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)",
854 	    rdsv3_sk_to_rs(sock));
855 
856 	len /= sizeof (struct rdsv3_info_message);
857 
858 	mutex_enter(&rdsv3_sock_lock);
859 
860 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
861 		rw_enter(&rs->rs_recv_lock, RW_READER);
862 
863 		/* XXX too lazy to maintain counts.. */
864 		RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) {
865 			total++;
866 			if (total <= len)
867 				rdsv3_inc_info_copy(inc, iter, inc->i_saddr,
868 				    rs->rs_bound_addr, 1);
869 		}
870 
871 		rw_exit(&rs->rs_recv_lock);
872 	}
873 
874 	mutex_exit(&rdsv3_sock_lock);
875 
876 	lens->nr = total;
877 	lens->each = sizeof (struct rdsv3_info_message);
878 
879 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)",
880 	    rdsv3_sk_to_rs(sock));
881 }
882 
883 static void
884 rdsv3_sock_info(struct rsock *sock, unsigned int len,
885     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
886 {
887 	struct rdsv3_info_socket sinfo;
888 	struct rdsv3_sock *rs;
889 	unsigned long bytes;
890 
891 	RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)",
892 	    rdsv3_sk_to_rs(sock));
893 
894 	len /= sizeof (struct rdsv3_info_socket);
895 
896 	mutex_enter(&rdsv3_sock_lock);
897 
898 	if ((len < rdsv3_sock_count) || (iter->addr == NULL))
899 		goto out;
900 
901 	bytes = sizeof (struct rdsv3_info_socket);
902 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
903 		sinfo.sndbuf = rdsv3_sk_sndbuf(rs);
904 		sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs);
905 		sinfo.bound_addr = rs->rs_bound_addr;
906 		sinfo.connected_addr = rs->rs_conn_addr;
907 		sinfo.bound_port = rs->rs_bound_port;
908 		sinfo.connected_port = rs->rs_conn_port;
909 
910 		rdsv3_info_copy(iter, &sinfo, bytes);
911 	}
912 
913 	RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)",
914 	    rdsv3_sk_to_rs(sock));
915 
916 out:
917 	lens->nr = rdsv3_sock_count;
918 	lens->each = sizeof (struct rdsv3_info_socket);
919 
920 	mutex_exit(&rdsv3_sock_lock);
921 }
922 
/*
 * Delayed work item that runs rdsv3_rdma_init_worker().  It is allocated and
 * queued by rdsv3_init() and cancelled and freed by rdsv3_exit().
 */
rdsv3_delayed_work_t	*rdsv3_rdma_dwp = NULL;
/* Delay handed to rdsv3_queue_delayed_work() when queueing the item above. */
uint_t			rdsv3_rdma_init_delay = 5; /* secs */
extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work);
926 
927 void
928 rdsv3_exit(void)
929 {
930 	RDSV3_DPRINTF4("rdsv3_exit", "Enter");
931 
932 	if (rdsv3_rdma_dwp) {
933 		rdsv3_cancel_delayed_work(rdsv3_rdma_dwp);
934 	}
935 
936 	(void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit,
937 	    NULL, DDI_SLEEP);
938 	while (rdsv3_rdma_listen_id != NULL) {
939 #ifndef __lock_lint
940 		RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
941 		    __func__, __LINE__);
942 #endif
943 		delay(drv_usectohz(1000));
944 	}
945 
946 	rdsv3_conn_exit();
947 	rdsv3_cong_exit();
948 	rdsv3_sysctl_exit();
949 	rdsv3_threads_exit();
950 	rdsv3_stats_exit();
951 	rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
952 	rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES,
953 	    rdsv3_sock_inc_info);
954 
955 	if (rdsv3_rdma_dwp) {
956 		kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
957 		rdsv3_rdma_dwp = NULL;
958 	}
959 
960 	RDSV3_DPRINTF4("rdsv3_exit", "Return");
961 }
962 
963 /*ARGSUSED*/
964 int
965 rdsv3_init()
966 {
967 	int ret;
968 
969 	RDSV3_DPRINTF4("rdsv3_init", "Enter");
970 
971 	rdsv3_cong_init();
972 	ret = rdsv3_conn_init();
973 	if (ret)
974 		goto out;
975 	ret = rdsv3_threads_init();
976 	if (ret)
977 		goto out_conn;
978 	ret = rdsv3_sysctl_init();
979 	if (ret)
980 		goto out_threads;
981 	ret = rdsv3_stats_init();
982 	if (ret)
983 		goto out_sysctl;
984 
985 	rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
986 	rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
987 
988 	/* rdsv3_rdma_init need to be called with a little delay */
989 	rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
990 	RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
991 	rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
992 	    rdsv3_rdma_init_delay);
993 
994 	RDSV3_DPRINTF4("rdsv3_init", "Return");
995 
996 	goto out;
997 
998 out_stats:
999 	rdsv3_stats_exit();
1000 out_sysctl:
1001 	rdsv3_sysctl_exit();
1002 out_threads:
1003 	rdsv3_threads_exit();
1004 out_conn:
1005 	rdsv3_conn_exit();
1006 	rdsv3_cong_exit();
1007 out:
1008 	return (ret);
1009 }
1010