xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c (revision b27516f55237249607f754e6e42e865f12456675)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/types.h>
58 #include <sys/stat.h>
59 #include <sys/conf.h>
60 #include <sys/ddi.h>
61 #include <sys/sunddi.h>
62 #include <sys/modctl.h>
63 #include <sys/rds.h>
64 #include <sys/stropts.h>
65 #include <sys/socket.h>
66 #include <sys/socketvar.h>
67 #include <sys/sockio.h>
68 #include <sys/sysmacros.h>
69 
70 #include <inet/ip.h>
71 #include <net/if_types.h>
72 
73 #include <sys/ib/clients/rdsv3/rdsv3.h>
74 #include <sys/ib/clients/rdsv3/rdma.h>
75 #include <sys/ib/clients/rdsv3/rdma_transport.h>
76 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
77 
/* Helpers implemented outside this file. */
extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
extern int rdsv3_verify_bind_address(ipaddr_t addr);

/*
 * rdsv3_exit() dispatches rdsv3_rdma_exit() on rdsv3_taskq and then polls
 * rdsv3_rdma_listen_id until it becomes NULL.
 */
extern ddi_taskq_t	*rdsv3_taskq;
extern struct rdma_cm_id *rdsv3_rdma_listen_id;

/*
 * this is just used for stats gathering :/
 *
 * rdsv3_sock_lock is held in this file around every update and walk of
 * rdsv3_sock_list and rdsv3_sock_count.  rdsv3_poll_waitq supplies the
 * mutex/cv pair that rdsv3_poll() uses while rs_seen_congestion is set.
 */
kmutex_t rdsv3_sock_lock;
static unsigned long rdsv3_sock_count;
list_t rdsv3_sock_list;
rdsv3_wait_queue_t rdsv3_poll_waitq;
89 
90 /*
91  * This is called as the final descriptor referencing this socket is closed.
92  * We have to unbind the socket so that another socket can be bound to the
93  * address it was using.
94  *
95  * We have to be careful about racing with the incoming path.  sock_orphan()
96  * sets SOCK_DEAD and we use that as an indicator to the rx path that new
97  * messages shouldn't be queued.
98  */
99 /* ARGSUSED */
100 static int
101 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
102 {
103 	struct rsock *sk = (struct rsock *)proto_handle;
104 	struct rdsv3_sock *rs;
105 
106 	if (sk == NULL)
107 		goto out;
108 
109 	rs = rdsv3_sk_to_rs(sk);
110 	RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
111 
112 	rdsv3_sk_sock_orphan(sk);
113 	rdsv3_cong_remove_socket(rs);
114 	rdsv3_remove_bound(rs);
115 	/*
116 	 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so
117 	 * that ensures the recv path has completed messing
118 	 * with the socket.
119 	 */
120 	rdsv3_clear_recv_queue(rs);
121 	rdsv3_send_drop_to(rs, NULL);
122 	rdsv3_rdma_drop_keys(rs);
123 	(void) rdsv3_notify_queue_get(rs, NULL);
124 
125 	mutex_enter(&rdsv3_sock_lock);
126 	list_remove_node(&rs->rs_item);
127 	rdsv3_sock_count--;
128 	mutex_exit(&rdsv3_sock_lock);
129 
130 	while (sk->sk_refcount > 1) {
131 		/* wait for 1 sec and try again */
132 		delay(drv_usectohz(1000000));
133 	}
134 
135 	/* this will free the rs and sk */
136 	rdsv3_sk_sock_put(sk);
137 
138 	RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
139 out:
140 	return (0);
141 }
142 
143 void
144 __rdsv3_wake_sk_sleep(struct rsock *sk)
145 {
146 	/* wakup anyone waiting in recvmsg */
147 	if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
148 		rdsv3_wake_up(sk->sk_sleep);
149 }
150 
151 /*
152  * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
153  * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
154  * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
155  * this seems more conservative.
156  * NB - normally, one would use sk_callback_lock for this, but we can
157  * get here from interrupts, whereas the network code grabs sk_callback_lock
158  * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
159  */
160 void
161 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs)
162 {
163 	RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs);
164 
165 	rw_enter(&rs->rs_recv_lock, RW_READER);
166 	__rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs));
167 	rw_exit(&rs->rs_recv_lock);
168 }
169 
170 /*ARGSUSED*/
171 static int
172 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
173     socklen_t *addr_len, cred_t *cr)
174 {
175 	struct rsock *sk = (struct rsock *)proto_handle;
176 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
177 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
178 
179 	RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs,
180 	    rs->rs_bound_port);
181 
182 	sin->sin_port = rs->rs_bound_port;
183 	sin->sin_addr.s_addr = rs->rs_bound_addr;
184 
185 	sin->sin_family = AF_INET_OFFLOAD;
186 
187 	*addr_len = sizeof (*sin);
188 	return (0);
189 }
190 
191 /*
192  * RDS' poll is without a doubt the least intuitive part of the interface,
193  * as POLLIN and POLLOUT do not behave entirely as you would expect from
194  * a network protocol.
195  *
196  * POLLIN is asserted if
197  *  -	there is data on the receive queue.
198  *  -	to signal that a previously congested destination may have become
199  *	uncongested
200  *  -	A notification has been queued to the socket (this can be a congestion
201  *	update, or a RDMA completion).
202  *
203  * POLLOUT is asserted if there is room on the send queue. This does not mean
204  * however, that the next sendmsg() call will succeed. If the application tries
205  * to send to a congested destination, the system call may still fail (and
206  * return ENOBUFS).
207  */
208 /* ARGSUSED */
209 static short
210 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet,
211     cred_t *cr)
212 {
213 	struct rsock	*sk = (struct rsock *)proto_handle;
214 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
215 	unsigned short mask = 0;
216 
217 #if 0
218 	RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet);
219 #endif
220 
221 	/*
222 	 * If rs_seen_congestion is on, wait until it's off.
223 	 * This is implemented for the following OFED code.
224 	 * 	if (rs->rs_seen_congestion)
225 	 *		poll_wait(file, &rds_poll_waitq, wait);
226 	 */
227 	mutex_enter(&rdsv3_poll_waitq.waitq_mutex);
228 	while (rs->rs_seen_congestion) {
229 		cv_wait(&rdsv3_poll_waitq.waitq_cv,
230 		    &rdsv3_poll_waitq.waitq_mutex);
231 	}
232 	mutex_exit(&rdsv3_poll_waitq.waitq_mutex);
233 
234 	rw_enter(&rs->rs_recv_lock, RW_READER);
235 	if (!rs->rs_cong_monitor) {
236 		/*
237 		 * When a congestion map was updated, we signal POLLIN for
238 		 * "historical" reasons. Applications can also poll for
239 		 * WRBAND instead.
240 		 */
241 		if (rdsv3_cong_updated_since(&rs->rs_cong_track))
242 			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
243 	} else {
244 		mutex_enter(&rs->rs_lock);
245 		if (rs->rs_cong_notify)
246 			mask |= (POLLIN | POLLRDNORM);
247 		mutex_exit(&rs->rs_lock);
248 	}
249 	if (!list_is_empty(&rs->rs_recv_queue) ||
250 	    !list_is_empty(&rs->rs_notify_queue))
251 		mask |= (POLLIN | POLLRDNORM);
252 	if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs))
253 		mask |= (POLLOUT | POLLWRNORM);
254 	rw_exit(&rs->rs_recv_lock);
255 
256 	/* clear state any time we wake a seen-congested socket */
257 	if (mask) {
258 		mutex_enter(&rdsv3_poll_waitq.waitq_mutex);
259 		rs->rs_seen_congestion = 0;
260 		mutex_exit(&rdsv3_poll_waitq.waitq_mutex);
261 	}
262 
263 #if 0
264 	RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask);
265 #endif
266 
267 	return (mask);
268 }
269 
270 /* ARGSUSED */
271 static int
272 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
273     int mode, int32_t *rvalp, cred_t *cr)
274 {
275 	ksocket_t	so4;
276 	struct lifconf	lifc;
277 	struct lifreq	lifr, *lifrp;
278 	struct ifconf	ifc;
279 	struct ifreq	ifr;
280 	int		rval = 0, rc, len;
281 	int		numifs;
282 	int		bufsize;
283 	void		*buf;
284 
285 	RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd);
286 
287 	/* Only ipv4 for now */
288 	rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP,
289 	    CRED());
290 	if (rval != 0) {
291 		RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d",
292 		    rval);
293 		return (rval);
294 	}
295 
296 	switch (cmd) {
297 	case SIOCGLIFNUM :
298 	case SIOCGIFNUM :
299 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
300 		if (rval != 0) break;
301 		if (cmd == SIOCGLIFNUM) {
302 			struct lifnum	lifn;
303 			lifn.lifn_family = AF_INET_OFFLOAD;
304 			lifn.lifn_flags = 0;
305 			lifn.lifn_count = numifs;
306 			(void) ddi_copyout(&lifn, (void *)arg,
307 			    sizeof (struct lifnum), 0);
308 		} else {
309 			len = 0;
310 			for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs;
311 			    rc++, lifrp++) {
312 				if (strlen(lifrp->lifr_name) <= IFNAMSIZ) {
313 					len++;
314 				}
315 			}
316 			(void) ddi_copyout(&len, (void *)arg,
317 			    sizeof (int), 0);
318 		}
319 		kmem_free(buf, bufsize);
320 		break;
321 
322 	case SIOCGLIFCONF :
323 		if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0)
324 		    != 0) {
325 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc");
326 			rval = EFAULT;
327 			break;
328 		}
329 
330 		rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
331 		if (rval != 0) {
332 			RDSV3_DPRINTF2("rdsv3_ioctl",
333 			    "rdsv3_do_ip_ioctl failed: %d", rval);
334 			break;
335 		}
336 
337 		if ((lifc.lifc_len > 0) && (numifs > 0)) {
338 			if (ddi_copyout(buf, (void *)lifc.lifc_req,
339 			    (lifc.lifc_len < bufsize) ? lifc.lifc_len :
340 			    bufsize, 0) != 0) {
341 				RDSV3_DPRINTF2("rdsv3_ioctl",
342 				    "copyout of records failed");
343 				rval = EFAULT;
344 			}
345 
346 		}
347 
348 		lifc.lifc_len = bufsize;
349 		if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf),
350 		    0) != 0) {
351 			RDSV3_DPRINTF2("rdsv3_ioctl",
352 			    "copyout of lifconf failed");
353 			rval = EFAULT;
354 		}
355 
356 		kmem_free(buf, bufsize);
357 		break;
358 
359 	case SIOCGIFCONF :
360 	case O_SIOCGIFCONF :
361 		if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0)
362 		    != 0) {
363 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc");
364 			rval = EFAULT;
365 			break;
366 		}
367 
368 		RDSV3_DPRINTF2("rdsv3_ioctl",
369 		    "O_SIOCGIFCONF: ifc_len: %d, req: %p",
370 		    ifc.ifc_len, ifc.ifc_req);
371 
372 		rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs);
373 		if (rval != 0) {
374 			RDSV3_DPRINTF2("rdsv3_ioctl",
375 			    "rdsv3_do_ip_ioctl_old failed: %d", rval);
376 			break;
377 		}
378 
379 		if ((ifc.ifc_len > 0) && (numifs > 0)) {
380 			if (ddi_copyout(buf, (void *)ifc.ifc_req,
381 			    (ifc.ifc_len < bufsize) ? ifc.ifc_len :
382 			    bufsize, 0) != 0) {
383 				RDSV3_DPRINTF2("rdsv3_ioctl",
384 				    "copyout of records failed");
385 				rval = EFAULT;
386 			}
387 
388 		}
389 
390 		ifc.ifc_len = bufsize;
391 		if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf),
392 		    0) != 0) {
393 			RDSV3_DPRINTF2("rdsv3_ioctl",
394 			    "copyout of ifconf failed");
395 			rval = EFAULT;
396 		}
397 
398 		kmem_free(buf, bufsize);
399 		break;
400 
401 	case SIOCGLIFFLAGS :
402 	case SIOCSLIFFLAGS :
403 	case SIOCGLIFMTU :
404 	case SIOCGLIFNETMASK :
405 	case SIOCGLIFINDEX :
406 		if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0)
407 		    != 0) {
408 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr");
409 			rval = EFAULT;
410 			break;
411 		}
412 
413 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED());
414 		if (rc != 0) {
415 			RDSV3_DPRINTF2("rdsv3_ioctl",
416 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
417 			    rc, lifr.lifr_name, cmd);
418 			break;
419 		}
420 
421 		(void) ddi_copyout(&lifr, (void *)arg,
422 		    sizeof (struct lifreq), 0);
423 		break;
424 
425 	case SIOCGIFFLAGS :
426 	case SIOCSIFFLAGS :
427 	case SIOCGIFMTU :
428 	case SIOCGIFNETMASK :
429 	case SIOCGIFINDEX :
430 		if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0)
431 		    != 0) {
432 			RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr");
433 			rval = EFAULT;
434 			break;
435 		}
436 
437 		RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name);
438 
439 		rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED());
440 		if (rc != 0) {
441 			RDSV3_DPRINTF2("rdsv3_ioctl",
442 			    "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
443 			    rc, ifr.ifr_name, cmd);
444 
445 			break;
446 		}
447 
448 		RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name);
449 
450 		(void) ddi_copyout(&ifr, (void *)arg,
451 		    sizeof (struct ifreq), 0);
452 		break;
453 
454 	default:
455 		if ((cmd >= RDSV3_INFO_FIRST) &&
456 		    (cmd <= RDSV3_INFO_LAST)) {
457 			return (rdsv3_info_ioctl((struct rsock *)proto_handle,
458 			    cmd, (char *)arg, rvalp));
459 		}
460 		RDSV3_DPRINTF2("rdsv3_ioctl", "Unknown ioctl cmd: %d",  cmd);
461 		cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd);
462 		rval = EOPNOTSUPP;
463 	}
464 
465 	(void) ksocket_close(so4, CRED());
466 
467 	RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd);
468 
469 	*rvalp = rval;
470 	return (rval);
471 }
472 
473 static int
474 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len)
475 {
476 	struct sockaddr_in sin;
477 
478 	/* racing with another thread binding seems ok here */
479 	if (rs->rs_bound_addr == 0)
480 		return (-ENOTCONN); /* XXX not a great errno */
481 
482 	if (len < sizeof (struct sockaddr_in))
483 		return (-EINVAL);
484 
485 	if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in),
486 	    0) != 0) {
487 		RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin");
488 		return (-EFAULT);
489 	}
490 
491 	rdsv3_send_drop_to(rs, &sin);
492 
493 	return (0);
494 }
495 
496 static int
497 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen)
498 {
499 	int value = *optval;
500 
501 	if (optlen < sizeof (int))
502 		return (-EINVAL);
503 	*optvar = !!value;
504 	return (0);
505 }
506 
507 static int
508 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen)
509 {
510 	int ret;
511 
512 	ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
513 	if (ret == 0) {
514 		if (rs->rs_cong_monitor) {
515 			rdsv3_cong_add_socket(rs);
516 		} else {
517 			rdsv3_cong_remove_socket(rs);
518 			rs->rs_cong_mask = 0;
519 			rs->rs_cong_notify = 0;
520 		}
521 	}
522 	return (ret);
523 }
524 
525 /*ARGSUSED*/
526 static int
527 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level,
528     int optname, const void *optval, socklen_t optlen, cred_t *cr)
529 {
530 	struct rsock *sk = (struct rsock *)proto_handle;
531 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
532 	int	ret = 0;
533 
534 	RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)",
535 	    rs, level, optname);
536 
537 	switch (optname) {
538 	case RDSV3_CANCEL_SENT_TO:
539 		ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen);
540 		break;
541 	case RDSV3_GET_MR:
542 		ret = rdsv3_get_mr(rs, optval, optlen);
543 		break;
544 	case RDSV3_GET_MR_FOR_DEST:
545 		ret = rdsv3_get_mr_for_dest(rs, optval, optlen);
546 		break;
547 	case RDSV3_FREE_MR:
548 		ret = rdsv3_free_mr(rs, optval, optlen);
549 		break;
550 	case RDSV3_RECVERR:
551 		ret = rdsv3_set_bool_option(&rs->rs_recverr,
552 		    (char *)optval, optlen);
553 		break;
554 	case RDSV3_CONG_MONITOR:
555 		ret = rdsv3_cong_monitor(rs, (char *)optval, optlen);
556 		break;
557 	case SO_SNDBUF:
558 		sk->sk_sndbuf = *(uint_t *)optval;
559 		return (ret);
560 	case SO_RCVBUF:
561 		sk->sk_rcvbuf = *(uint_t *)optval;
562 		return (ret);
563 	default:
564 #if 1
565 		break;
566 #else
567 		ret = -ENOPROTOOPT;
568 #endif
569 	}
570 out:
571 	return (ret);
572 }
573 
574 /* XXX */
575 /*ARGSUSED*/
576 static int
577 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level,
578     int optname, void *optval, socklen_t *optlen, cred_t *cr)
579 {
580 	struct rsock *sk = (struct rsock *)proto_handle;
581 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
582 	int ret = 0;
583 
584 	RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)",
585 	    rs, optname, *optlen);
586 
587 	switch (optname) {
588 	case SO_SNDBUF:
589 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)",
590 		    sk->sk_sndbuf);
591 		if (*optlen != 0) {
592 			*((int *)optval) = sk->sk_sndbuf;
593 			*optlen = sizeof (uint_t);
594 		}
595 		return (ret);
596 	case SO_RCVBUF:
597 		RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)",
598 		    sk->sk_rcvbuf);
599 		if (*optlen != 0) {
600 			*((int *)optval) = sk->sk_rcvbuf;
601 			*optlen = sizeof (uint_t);
602 		}
603 		return (ret);
604 	case RDSV3_RECVERR:
605 		RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)",
606 		    rs->rs_recverr);
607 		if (*optlen < sizeof (int))
608 			return (-EINVAL);
609 		else {
610 			*(int *)optval = rs->rs_recverr;
611 			*optlen = sizeof (int);
612 		}
613 		return (0);
614 	default:
615 		RDSV3_DPRINTF2("rdsv3_getsockopt",
616 		    "Unknown: level: %d optname: %d", level, optname);
617 		ret = -ENOPROTOOPT;
618 	}
619 
620 	RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)",
621 	    rs, optname, ret);
622 	return (ret);
623 }
624 
625 /*ARGSUSED*/
626 static int rdsv3_connect(sock_lower_handle_t proto_handle,
627     const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn,
628     cred_t *cr)
629 {
630 	struct rsock *sk = (struct rsock *)proto_handle;
631 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
632 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
633 	int ret = 0;
634 
635 	RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs);
636 
637 	mutex_enter(&sk->sk_lock);
638 
639 	if (addr_len != sizeof (struct sockaddr_in)) {
640 		ret = -EINVAL;
641 		goto out;
642 	}
643 
644 	if (sin->sin_family != AF_INET_OFFLOAD) {
645 		ret = -EAFNOSUPPORT;
646 		goto out;
647 	}
648 
649 	if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
650 		ret = -EDESTADDRREQ;
651 		goto out;
652 	}
653 
654 	rs->rs_conn_addr = sin->sin_addr.s_addr;
655 	rs->rs_conn_port = sin->sin_port;
656 
657 	sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1);
658 
659 	RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs);
660 
661 out:
662 	mutex_exit(&sk->sk_lock);
663 	return (ret);
664 }
665 
666 /*ARGSUSED*/
667 static int
668 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
669 {
670 	struct rsock *sk = (struct rsock *)proto_handle;
671 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
672 
673 	RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs);
674 
675 	return (0);
676 }
677 
678 /*ARGSUSED*/
679 void
680 rdsv3_activate(sock_lower_handle_t proto_handle,
681     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
682     int flags, cred_t *cr)
683 {
684 	struct rsock *sk = (struct rsock *)proto_handle;
685 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
686 
687 	RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs);
688 
689 	sk->sk_upcalls = sock_upcalls;
690 	sk->sk_upper_handle = sock_handle;
691 
692 	RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs);
693 }
694 
695 
696 /* ARGSUSED */
697 int
698 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio,
699     struct nmsghdr *msg, cred_t *cr)
700 {
701 	struct rsock *sk = (struct rsock *)proto_handle;
702 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
703 	int ret;
704 
705 	RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs);
706 	ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid);
707 
708 	RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret);
709 	if (ret < 0) {
710 		return (-ret);
711 	}
712 
713 	return (0);
714 }
715 
716 /* ARGSUSED */
717 int
718 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio,
719     struct nmsghdr *msg, cred_t *cr)
720 {
721 	struct rsock *sk = (struct rsock *)proto_handle;
722 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
723 	int ret;
724 
725 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs);
726 	ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags);
727 
728 	RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret);
729 
730 	if (ret < 0) {
731 		return (-ret);
732 	}
733 
734 	return (0);
735 }
736 
737 /*ARGSUSED*/
738 int
739 rdsv3_getpeername(sock_lower_handle_t  proto_handle, struct sockaddr *addr,
740     socklen_t *addr_len, cred_t *cr)
741 {
742 	struct sockaddr_in *sin = (struct sockaddr_in *)addr;
743 	struct rsock *sk = (struct rsock *)proto_handle;
744 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
745 
746 	RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs);
747 
748 	(void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero));
749 
750 	/* racey, don't care */
751 	if (!rs->rs_conn_addr)
752 		return (-ENOTCONN);
753 
754 	sin->sin_port = rs->rs_conn_port;
755 	sin->sin_addr.s_addr = rs->rs_conn_addr;
756 
757 	sin->sin_family = AF_INET_OFFLOAD;
758 
759 	*addr_len = sizeof (*sin);
760 	return (0);
761 }
762 
763 void
764 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)
765 {
766 	struct rsock *sk = (struct rsock *)proto_handle;
767 	struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
768 
769 	RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs);
770 }
771 
772 #ifndef __lock_lint
773 static struct sock_downcalls_s rdsv3_sock_downcalls = {
774 	.sd_close =		rdsv3_release,
775 	.sd_bind =		rdsv3_bind,
776 	.sd_connect =		rdsv3_connect,
777 	.sd_accept =		NULL,
778 	.sd_getsockname =	rdsv3_getname,
779 	.sd_poll =		rdsv3_poll,
780 	.sd_ioctl =		rdsv3_ioctl,
781 	.sd_listen =		NULL,
782 	.sd_shutdown =		rdsv3_shutdown,
783 	.sd_setsockopt =	rdsv3_setsockopt,
784 	.sd_getsockopt =	rdsv3_getsockopt,
785 	.sd_send_uio =		rdsv3_send_uio,
786 	.sd_recv_uio =		rdsv3_recv_uio,
787 	.sd_activate =		rdsv3_activate,
788 	.sd_getpeername =	rdsv3_getpeername,
789 	.sd_send =		NULL,
790 	.sd_clr_flowctrl =	NULL
791 };
792 #else
793 static struct sock_downcalls_s rdsv3_sock_downcalls = {
794 	rdsv3_activate,
795 	NULL,
796 	rdsv3_bind,
797 	NULL,
798 	rdsv3_connect,
799 	rdsv3_getpeername,
800 	rdsv3_getname,
801 	rdsv3_getsockopt,
802 	rdsv3_setsockopt,
803 	NULL,
804 	rdsv3_send_uio,
805 	rdsv3_recv_uio,
806 	rdsv3_poll,
807 	rdsv3_shutdown,
808 	NULL,
809 	rdsv3_ioctl,
810 	rdsv3_release
811 };
812 #endif
813 
814 sock_lower_handle_t
815 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
816     uint_t *smodep, int *errorp, int flags, cred_t *credp)
817 {
818 	struct rdsv3_sock	*rs;
819 	struct rsock		*sk;
820 
821 	RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d "
822 	    "flags: %d", family, type, proto, flags);
823 
824 	sk = rdsv3_sk_alloc();
825 	if (sk == NULL)
826 		return (NULL);
827 	rdsv3_sock_init_data(sk);
828 
829 	rs = rdsv3_sk_to_rs(sk);
830 	rs->rs_sk = sk;
831 	mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL);
832 	rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL);
833 	list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message),
834 	    offsetof(struct rdsv3_message, m_sock_item));
835 	list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming),
836 	    offsetof(struct rdsv3_incoming, i_item));
837 	list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier),
838 	    offsetof(struct rdsv3_notifier, n_list));
839 	mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL);
840 	avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare,
841 	    sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node));
842 	mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL);
843 	rs->rs_cred = credp;
844 	rs->rs_zoneid = getzoneid();
845 	crhold(credp);
846 
847 	mutex_enter(&rdsv3_sock_lock);
848 	list_insert_tail(&rdsv3_sock_list, rs);
849 	rdsv3_sock_count++;
850 	/* Initialize RDMA/IB on the 1st socket if not done at attach */
851 	if (rdsv3_sock_count == 1) {
852 		rdsv3_rdma_init();
853 	}
854 	mutex_exit(&rdsv3_sock_lock);
855 
856 	*errorp = 0;
857 	*smodep = SM_ATOMIC;
858 	*sock_downcalls = &rdsv3_sock_downcalls;
859 
860 	RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs);
861 
862 	return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs));
863 }
864 
/* Take a reference on the rsock that backs rs. */
void
rdsv3_sock_addref(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs);

	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_hold(sk);
}
871 
/* Drop a reference on the rsock that backs rs. */
void
rdsv3_sock_put(struct rdsv3_sock *rs)
{
	struct rsock *sk;

	RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs);

	sk = rdsv3_rs_to_sk(rs);
	rdsv3_sk_sock_put(sk);
}
878 
879 static void
880 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len,
881     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
882 {
883 	struct rdsv3_sock *rs;
884 	struct rdsv3_incoming *inc;
885 	unsigned int total = 0;
886 
887 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)",
888 	    rdsv3_sk_to_rs(sock));
889 
890 	len /= sizeof (struct rdsv3_info_message);
891 
892 	mutex_enter(&rdsv3_sock_lock);
893 
894 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
895 		rw_enter(&rs->rs_recv_lock, RW_READER);
896 
897 		/* XXX too lazy to maintain counts.. */
898 		RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) {
899 			total++;
900 			if (total <= len)
901 				rdsv3_inc_info_copy(inc, iter, inc->i_saddr,
902 				    rs->rs_bound_addr, 1);
903 		}
904 
905 		rw_exit(&rs->rs_recv_lock);
906 	}
907 
908 	mutex_exit(&rdsv3_sock_lock);
909 
910 	lens->nr = total;
911 	lens->each = sizeof (struct rdsv3_info_message);
912 
913 	RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)",
914 	    rdsv3_sk_to_rs(sock));
915 }
916 
917 static void
918 rdsv3_sock_info(struct rsock *sock, unsigned int len,
919     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
920 {
921 	struct rdsv3_info_socket sinfo;
922 	struct rdsv3_sock *rs;
923 	unsigned long bytes;
924 
925 	RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)",
926 	    rdsv3_sk_to_rs(sock));
927 
928 	len /= sizeof (struct rdsv3_info_socket);
929 
930 	mutex_enter(&rdsv3_sock_lock);
931 
932 	if ((len < rdsv3_sock_count) || (iter->addr == NULL))
933 		goto out;
934 
935 	bytes = sizeof (struct rdsv3_info_socket);
936 	RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
937 		sinfo.sndbuf = rdsv3_sk_sndbuf(rs);
938 		sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs);
939 		sinfo.bound_addr = rs->rs_bound_addr;
940 		sinfo.connected_addr = rs->rs_conn_addr;
941 		sinfo.bound_port = rs->rs_bound_port;
942 		sinfo.connected_port = rs->rs_conn_port;
943 
944 		rdsv3_info_copy(iter, &sinfo, bytes);
945 	}
946 
947 	RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)",
948 	    rdsv3_sk_to_rs(sock));
949 
950 out:
951 	lens->nr = rdsv3_sock_count;
952 	lens->each = sizeof (struct rdsv3_info_socket);
953 
954 	mutex_exit(&rdsv3_sock_lock);
955 }
956 
/*
 * Deferred RDMA initialization: rdsv3_init() allocates rdsv3_rdma_dwp and
 * queues it to run rdsv3_rdma_init_worker after rdsv3_rdma_init_delay;
 * rdsv3_exit() cancels and frees it.
 */
rdsv3_delayed_work_t	*rdsv3_rdma_dwp = NULL;
uint_t			rdsv3_rdma_init_delay = 5; /* secs */
extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work);
960 
961 void
962 rdsv3_exit(void)
963 {
964 	RDSV3_DPRINTF4("rdsv3_exit", "Enter");
965 
966 	if (rdsv3_rdma_dwp) {
967 		rdsv3_cancel_delayed_work(rdsv3_rdma_dwp);
968 	}
969 
970 	(void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit,
971 	    NULL, DDI_SLEEP);
972 	while (rdsv3_rdma_listen_id != NULL) {
973 #ifndef __lock_lint
974 		RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
975 		    __func__, __LINE__);
976 #endif
977 		delay(drv_usectohz(1000));
978 	}
979 
980 	rdsv3_conn_exit();
981 	rdsv3_cong_exit();
982 	rdsv3_sysctl_exit();
983 	rdsv3_threads_exit();
984 	rdsv3_stats_exit();
985 	rdsv3_info_deregister_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
986 	rdsv3_info_deregister_func(RDSV3_INFO_RECV_MESSAGES,
987 	    rdsv3_sock_inc_info);
988 
989 	if (rdsv3_rdma_dwp) {
990 		kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
991 		rdsv3_rdma_dwp = NULL;
992 	}
993 
994 	RDSV3_DPRINTF4("rdsv3_exit", "Return");
995 }
996 
997 /*ARGSUSED*/
998 int
999 rdsv3_init()
1000 {
1001 	int ret;
1002 
1003 	RDSV3_DPRINTF4("rdsv3_init", "Enter");
1004 
1005 	rdsv3_cong_init();
1006 
1007 	ret = rdsv3_conn_init();
1008 	if (ret)
1009 		goto out;
1010 	ret = rdsv3_threads_init();
1011 	if (ret)
1012 		goto out_conn;
1013 	ret = rdsv3_sysctl_init();
1014 	if (ret)
1015 		goto out_threads;
1016 	ret = rdsv3_stats_init();
1017 	if (ret)
1018 		goto out_sysctl;
1019 
1020 	rdsv3_info_register_func(RDSV3_INFO_SOCKETS, rdsv3_sock_info);
1021 	rdsv3_info_register_func(RDSV3_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
1022 
1023 	/* rdsv3_rdma_init need to be called with a little delay */
1024 	rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
1025 	RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
1026 	rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
1027 	    rdsv3_rdma_init_delay);
1028 
1029 	RDSV3_DPRINTF4("rdsv3_init", "Return");
1030 
1031 	goto out;
1032 
1033 out_stats:
1034 	rdsv3_stats_exit();
1035 out_sysctl:
1036 	rdsv3_sysctl_exit();
1037 out_threads:
1038 	rdsv3_threads_exit();
1039 out_conn:
1040 	rdsv3_conn_exit();
1041 	rdsv3_cong_exit();
1042 out:
1043 	return (ret);
1044 }
1045