1c0dd49bdSEiji Ota /*
216e76cddSagiri  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
316e76cddSagiri  */
416e76cddSagiri 
516e76cddSagiri /*
616e76cddSagiri  * This file contains code imported from the OFED rds source file bind.c
716e76cddSagiri  * Oracle elects to have and use the contents of bind.c under and governed
816e76cddSagiri  * by the OpenIB.org BSD license (see below for full license text). However,
916e76cddSagiri  * the following notice accompanied the original version of this file:
1016e76cddSagiri  */
1116e76cddSagiri 
1216e76cddSagiri /*
1316e76cddSagiri  * Copyright (c) 2006 Oracle.  All rights reserved.
14c0dd49bdSEiji Ota  *
1516e76cddSagiri  * This software is available to you under a choice of one of two
1616e76cddSagiri  * licenses.  You may choose to be licensed under the terms of the GNU
1716e76cddSagiri  * General Public License (GPL) Version 2, available from the file
1816e76cddSagiri  * COPYING in the main directory of this source tree, or the
1916e76cddSagiri  * OpenIB.org BSD license below:
20c0dd49bdSEiji Ota  *
2116e76cddSagiri  *     Redistribution and use in source and binary forms, with or
2216e76cddSagiri  *     without modification, are permitted provided that the following
2316e76cddSagiri  *     conditions are met:
24c0dd49bdSEiji Ota  *
2516e76cddSagiri  *      - Redistributions of source code must retain the above
2616e76cddSagiri  *        copyright notice, this list of conditions and the following
2716e76cddSagiri  *        disclaimer.
2816e76cddSagiri  *
2916e76cddSagiri  *      - Redistributions in binary form must reproduce the above
3016e76cddSagiri  *        copyright notice, this list of conditions and the following
3116e76cddSagiri  *        disclaimer in the documentation and/or other materials
3216e76cddSagiri  *        provided with the distribution.
3316e76cddSagiri  *
3416e76cddSagiri  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
3516e76cddSagiri  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
3616e76cddSagiri  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
3716e76cddSagiri  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
3816e76cddSagiri  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
3916e76cddSagiri  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
4016e76cddSagiri  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
4116e76cddSagiri  * SOFTWARE.
42c0dd49bdSEiji Ota  *
43c0dd49bdSEiji Ota  */
44c0dd49bdSEiji Ota #include <sys/types.h>
45c0dd49bdSEiji Ota #include <sys/sysmacros.h>
46c0dd49bdSEiji Ota #include <sys/random.h>
47c0dd49bdSEiji Ota #include <sys/rds.h>
48c0dd49bdSEiji Ota 
49c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3.h>
50c0dd49bdSEiji Ota #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
51c0dd49bdSEiji Ota 
/*
 * Held by rdsv3_find_ip_bucket() around every lookup in, and insertion
 * into, rdsv3_bind_tree.
 */
kmutex_t	rdsv3_bind_lock;
/*
 * AVL tree of struct rdsv3_ip_bucket nodes; a node is created the first
 * time its (IP address, zone) pair is looked up.
 */
avl_tree_t	rdsv3_bind_tree;
54c0dd49bdSEiji Ota 
/*
 * Each node in the rdsv3_bind_tree is of this type.
 *
 * A bucket collects the sockets bound to one (IP address, zone) pair.
 * The tree key is the 64-bit value ((uint64_t)ip << 32) | zone.
 */
struct rdsv3_ip_bucket {
	/* local IP address, stored in host byte order */
	ipaddr_t		ip;
	/* zone id this bucket belongs to */
	zoneid_t		zone;
	/* linkage into rdsv3_bind_tree */
	avl_node_t		ip_avl_node;
	/* taken as reader for port lookups, as writer for bind/unbind */
	krwlock_t		rwlock;
	/* incremented on every bind, decremented on every unbind */
	uint_t			nsockets;
	/* bound socket per port, indexed by host-order port; NULL if free */
	struct rdsv3_sock	*port[65536];
};
66*80166370Sagiri 
67*80166370Sagiri static int
rdsv3_bind_node_compare(const void * a,const void * b)68*80166370Sagiri rdsv3_bind_node_compare(const void *a, const void *b)
69c0dd49bdSEiji Ota {
70*80166370Sagiri 	struct rdsv3_ip_bucket *bp = (struct rdsv3_ip_bucket *)b;
71*80166370Sagiri 
72*80166370Sagiri 	if (*(uint64_t *)a > (((uint64_t)bp->ip << 32) | bp->zone))
73*80166370Sagiri 		return (+1);
74*80166370Sagiri 	else if (*(uint64_t *)a < (((uint64_t)bp->ip << 32) | bp->zone))
75*80166370Sagiri 		return (-1);
76*80166370Sagiri 
77*80166370Sagiri 	return (0);
78*80166370Sagiri }
79*80166370Sagiri 
80*80166370Sagiri void
rdsv3_bind_init()81*80166370Sagiri rdsv3_bind_init()
82*80166370Sagiri {
83*80166370Sagiri 	RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter");
84*80166370Sagiri 
85*80166370Sagiri 	mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL);
86*80166370Sagiri 	avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare,
87*80166370Sagiri 	    sizeof (struct rdsv3_ip_bucket),
88*80166370Sagiri 	    offsetof(struct rdsv3_ip_bucket, ip_avl_node));
89*80166370Sagiri 
90*80166370Sagiri 	RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return");
91*80166370Sagiri }
92*80166370Sagiri 
93*80166370Sagiri /* called on detach */
94*80166370Sagiri void
rdsv3_bind_exit()95*80166370Sagiri rdsv3_bind_exit()
96*80166370Sagiri {
97*80166370Sagiri 	struct rdsv3_ip_bucket	*bucketp;
98*80166370Sagiri 	void			*cookie = NULL;
99*80166370Sagiri 
100*80166370Sagiri 	RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter");
101*80166370Sagiri 
102*80166370Sagiri 	while ((bucketp =
103*80166370Sagiri 	    avl_destroy_nodes(&rdsv3_bind_tree, &cookie)) != NULL) {
104*80166370Sagiri 		rw_destroy(&bucketp->rwlock);
105*80166370Sagiri 		kmem_free(bucketp, sizeof (struct rdsv3_ip_bucket));
106c0dd49bdSEiji Ota 	}
107c0dd49bdSEiji Ota 
108*80166370Sagiri 	avl_destroy(&rdsv3_bind_tree);
109*80166370Sagiri 	mutex_destroy(&rdsv3_bind_lock);
110*80166370Sagiri 
111*80166370Sagiri 	RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return");
112*80166370Sagiri }
113*80166370Sagiri 
114*80166370Sagiri struct rdsv3_ip_bucket *
rdsv3_find_ip_bucket(ipaddr_t ipaddr,zoneid_t zoneid)115*80166370Sagiri rdsv3_find_ip_bucket(ipaddr_t ipaddr, zoneid_t zoneid)
116*80166370Sagiri {
117*80166370Sagiri 	struct rdsv3_ip_bucket	*bucketp;
118*80166370Sagiri 	avl_index_t		where;
119*80166370Sagiri 	uint64_t		needle = ((uint64_t)ipaddr << 32) | zoneid;
120*80166370Sagiri 
121*80166370Sagiri 	mutex_enter(&rdsv3_bind_lock);
122*80166370Sagiri 	bucketp = avl_find(&rdsv3_bind_tree, &needle, &where);
123*80166370Sagiri 	if (bucketp == NULL) {
124*80166370Sagiri 		/* allocate a new bucket for this IP & zone */
125*80166370Sagiri 		bucketp =
126*80166370Sagiri 		    kmem_zalloc(sizeof (struct rdsv3_ip_bucket), KM_SLEEP);
127*80166370Sagiri 		rw_init(&bucketp->rwlock, NULL, RW_DRIVER, NULL);
128*80166370Sagiri 		bucketp->ip = ipaddr;
129*80166370Sagiri 		bucketp->zone = zoneid;
130*80166370Sagiri 		avl_insert(&rdsv3_bind_tree, bucketp, where);
131*80166370Sagiri 	}
132*80166370Sagiri 	mutex_exit(&rdsv3_bind_lock);
133*80166370Sagiri 
134*80166370Sagiri 	return (bucketp);
135c0dd49bdSEiji Ota }
136c0dd49bdSEiji Ota 
137c0dd49bdSEiji Ota /*
138c0dd49bdSEiji Ota  * Return the rdsv3_sock bound at the given local address.
139c0dd49bdSEiji Ota  *
140c0dd49bdSEiji Ota  * The rx path can race with rdsv3_release.  We notice if rdsv3_release() has
141c0dd49bdSEiji Ota  * marked this socket and don't return a rs ref to the rx path.
142c0dd49bdSEiji Ota  */
143c0dd49bdSEiji Ota struct rdsv3_sock *
rdsv3_find_bound(struct rdsv3_connection * conn,uint16_be_t port)144*80166370Sagiri rdsv3_find_bound(struct rdsv3_connection *conn, uint16_be_t port)
145c0dd49bdSEiji Ota {
146c0dd49bdSEiji Ota 	struct rdsv3_sock *rs;
147c0dd49bdSEiji Ota 
148*80166370Sagiri 	RDSV3_DPRINTF4("rdsv3_find_bound", "Enter(ip:port: %u.%u.%u.%u:%d)",
149*80166370Sagiri 	    NIPQUAD(conn->c_laddr), ntohs(port));
150c0dd49bdSEiji Ota 
151*80166370Sagiri 	rw_enter(&conn->c_bucketp->rwlock, RW_READER);
152*80166370Sagiri 	ASSERT(ntohl(conn->c_laddr) == conn->c_bucketp->ip);
153*80166370Sagiri 	rs = conn->c_bucketp->port[ntohs(port)];
154c0dd49bdSEiji Ota 	if (rs && !rdsv3_sk_sock_flag(rdsv3_rs_to_sk(rs), SOCK_DEAD))
155*80166370Sagiri 		rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));
156c0dd49bdSEiji Ota 	else
157c0dd49bdSEiji Ota 		rs = NULL;
158*80166370Sagiri 	rw_exit(&conn->c_bucketp->rwlock);
159c0dd49bdSEiji Ota 
160*80166370Sagiri 	RDSV3_DPRINTF5("rdsv3_find_bound", "returning rs %p for %u.%u.%u.%u:%d",
161*80166370Sagiri 	    rs, NIPQUAD(conn->c_laddr), ntohs(port));
162cadbfdc3SEiji Ota 
163c0dd49bdSEiji Ota 	return (rs);
164c0dd49bdSEiji Ota }
165c0dd49bdSEiji Ota 
166c0dd49bdSEiji Ota /* returns -ve errno or +ve port */
167c0dd49bdSEiji Ota static int
rdsv3_add_bound(struct rdsv3_sock * rs,uint32_be_t addr,uint16_be_t * port)168c0dd49bdSEiji Ota rdsv3_add_bound(struct rdsv3_sock *rs, uint32_be_t addr, uint16_be_t *port)
169c0dd49bdSEiji Ota {
170c0dd49bdSEiji Ota 	int ret = -EADDRINUSE;
171c0dd49bdSEiji Ota 	uint16_t rover, last;
172*80166370Sagiri 	struct rdsv3_ip_bucket *bucketp;
173c0dd49bdSEiji Ota 
174*80166370Sagiri 	RDSV3_DPRINTF4("rdsv3_add_bound", "Enter(addr:port: %x:%x)",
175*80166370Sagiri 	    ntohl(addr), ntohs(*port));
176c0dd49bdSEiji Ota 
177c0dd49bdSEiji Ota 	if (*port != 0) {
178c0dd49bdSEiji Ota 		rover = ntohs(*port);
179c0dd49bdSEiji Ota 		last = rover;
180c0dd49bdSEiji Ota 	} else {
181c0dd49bdSEiji Ota 		(void) random_get_pseudo_bytes((uint8_t *)&rover,
182c0dd49bdSEiji Ota 		    sizeof (uint16_t));
183c0dd49bdSEiji Ota 		rover = MAX(rover, 2);
184c0dd49bdSEiji Ota 		last = rover - 1;
185c0dd49bdSEiji Ota 	}
186c0dd49bdSEiji Ota 
187*80166370Sagiri 	bucketp = rdsv3_find_ip_bucket(ntohl(addr), rs->rs_zoneid);
188*80166370Sagiri 
189*80166370Sagiri 	/* leave the bind lock and get the bucket lock */
190*80166370Sagiri 	rw_enter(&bucketp->rwlock, RW_WRITER);
191c0dd49bdSEiji Ota 
192c0dd49bdSEiji Ota 	do {
193c0dd49bdSEiji Ota 		if (rover == 0)
194c0dd49bdSEiji Ota 			rover++;
1955d5562f5SEiji Ota 
196*80166370Sagiri 		if (bucketp->port[rover] == NULL) {
197c0dd49bdSEiji Ota 			*port = htons(rover);
198c0dd49bdSEiji Ota 			ret = 0;
199c0dd49bdSEiji Ota 			break;
200c0dd49bdSEiji Ota 		}
201c0dd49bdSEiji Ota 	} while (rover++ != last);
202c0dd49bdSEiji Ota 
203c0dd49bdSEiji Ota 	if (ret == 0)  {
204c0dd49bdSEiji Ota 		rs->rs_bound_addr = addr;
205c0dd49bdSEiji Ota 		rs->rs_bound_port = *port;
206*80166370Sagiri 		bucketp->port[rover] = rs;
207*80166370Sagiri 		bucketp->nsockets++;
208c0dd49bdSEiji Ota 		rdsv3_sock_addref(rs);
209c0dd49bdSEiji Ota 
210c0dd49bdSEiji Ota 		RDSV3_DPRINTF5("rdsv3_add_bound",
211*80166370Sagiri 		    "rs %p binding to %u.%u.%u.%u:%d",
212*80166370Sagiri 		    rs, NIPQUAD(addr), rover);
213c0dd49bdSEiji Ota 	}
214c0dd49bdSEiji Ota 
215*80166370Sagiri 	rw_exit(&bucketp->rwlock);
216*80166370Sagiri 
217*80166370Sagiri 	RDSV3_DPRINTF4("rdsv3_add_bound", "Return(ret: %d port: %d)",
218*80166370Sagiri 	    ret, rover);
219c0dd49bdSEiji Ota 
220c0dd49bdSEiji Ota 
221c0dd49bdSEiji Ota 	return (ret);
222c0dd49bdSEiji Ota }
223c0dd49bdSEiji Ota 
224c0dd49bdSEiji Ota void
rdsv3_remove_bound(struct rdsv3_sock * rs)225c0dd49bdSEiji Ota rdsv3_remove_bound(struct rdsv3_sock *rs)
226c0dd49bdSEiji Ota {
227c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_remove_bound", "Enter(rs: %p)", rs);
228c0dd49bdSEiji Ota 
229c0dd49bdSEiji Ota 	if (rs->rs_bound_addr) {
230*80166370Sagiri 		struct rdsv3_ip_bucket *bucketp;
231*80166370Sagiri 
232c0dd49bdSEiji Ota 		RDSV3_DPRINTF5("rdsv3_remove_bound",
233c0dd49bdSEiji Ota 		    "rs %p unbinding from %u.%u.%u.%u:%x",
234*80166370Sagiri 		    rs, NIPQUAD(htonl(rs->rs_bound_addr)), rs->rs_bound_port);
235*80166370Sagiri 
236*80166370Sagiri 		bucketp = rdsv3_find_ip_bucket(ntohl(rs->rs_bound_addr),
237*80166370Sagiri 		    rs->rs_zoneid);
238*80166370Sagiri 
239*80166370Sagiri 		rw_enter(&bucketp->rwlock, RW_WRITER);
240*80166370Sagiri 		bucketp->port[ntohs(rs->rs_bound_port)] = NULL;
241*80166370Sagiri 		bucketp->nsockets--;
242c0dd49bdSEiji Ota 		rs->rs_bound_addr = 0;
243*80166370Sagiri 		rw_exit(&bucketp->rwlock);
244c0dd49bdSEiji Ota 
245*80166370Sagiri 		rdsv3_sock_put(rs);
246*80166370Sagiri 	}
247c0dd49bdSEiji Ota 
248c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_remove_bound", "Return(rs: %p)", rs);
249c0dd49bdSEiji Ota }
250c0dd49bdSEiji Ota 
251c0dd49bdSEiji Ota /* ARGSUSED */
252c0dd49bdSEiji Ota int
rdsv3_bind(sock_lower_handle_t proto_handle,struct sockaddr * sa,socklen_t len,cred_t * cr)253c0dd49bdSEiji Ota rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
254c0dd49bdSEiji Ota     socklen_t len, cred_t *cr)
255c0dd49bdSEiji Ota {
256c0dd49bdSEiji Ota 	struct rsock	*sk = (struct rsock *)proto_handle;
257c0dd49bdSEiji Ota 	sin_t		*sin = (sin_t *)sa;
258c0dd49bdSEiji Ota 	struct rdsv3_sock	*rs = rdsv3_sk_to_rs(sk);
259c0dd49bdSEiji Ota 	int		ret;
260c0dd49bdSEiji Ota 
261c0dd49bdSEiji Ota 	if (len != sizeof (sin_t) || (sin == NULL) ||
262c0dd49bdSEiji Ota 	    !OK_32PTR((char *)sin)) {
263c0dd49bdSEiji Ota 		RDSV3_DPRINTF2("rdsv3_bind", "address to bind not specified");
264c0dd49bdSEiji Ota 		return (EINVAL);
265c0dd49bdSEiji Ota 	}
266c0dd49bdSEiji Ota 
267c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_bind", "Enter(rs: %p, addr: 0x%x, port: %x)",
268c0dd49bdSEiji Ota 	    rs, ntohl(sin->sin_addr.s_addr), htons(sin->sin_port));
269c0dd49bdSEiji Ota 
270c0dd49bdSEiji Ota 	if (sin->sin_addr.s_addr == INADDR_ANY) {
271c0dd49bdSEiji Ota 		RDSV3_DPRINTF2("rdsv3_bind", "Invalid address");
272c0dd49bdSEiji Ota 		return (EINVAL);
273c0dd49bdSEiji Ota 	}
274c0dd49bdSEiji Ota 
275c0dd49bdSEiji Ota 	/* We don't allow multiple binds */
276c0dd49bdSEiji Ota 	if (rs->rs_bound_addr) {
277c0dd49bdSEiji Ota 		RDSV3_DPRINTF2("rdsv3_bind", "Multiple binds not allowed");
278c0dd49bdSEiji Ota 		return (EINVAL);
279c0dd49bdSEiji Ota 	}
280c0dd49bdSEiji Ota 
281c0dd49bdSEiji Ota 	ret = rdsv3_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
282c0dd49bdSEiji Ota 	if (ret) {
283c0dd49bdSEiji Ota 		return (ret);
284c0dd49bdSEiji Ota 	}
285c0dd49bdSEiji Ota 
286c0dd49bdSEiji Ota 	rs->rs_transport = rdsv3_trans_get_preferred(sin->sin_addr.s_addr);
2875d5562f5SEiji Ota 	if (!rs->rs_transport) {
288c0dd49bdSEiji Ota 		rdsv3_remove_bound(rs);
289cadbfdc3SEiji Ota 		if (rdsv3_printk_ratelimit()) {
290cadbfdc3SEiji Ota 			RDSV3_DPRINTF1("rdsv3_bind",
291cadbfdc3SEiji Ota 			    "RDS: rdsv3_bind() could not find a transport.\n");
292cadbfdc3SEiji Ota 		}
293c0dd49bdSEiji Ota 		return (EADDRNOTAVAIL);
294c0dd49bdSEiji Ota 	}
295c0dd49bdSEiji Ota 
296c0dd49bdSEiji Ota 	RDSV3_DPRINTF4("rdsv3_bind", "Return: Assigned port: %x to sock: %p",
297c0dd49bdSEiji Ota 	    sin->sin_port, rs);
298c0dd49bdSEiji Ota 
299c0dd49bdSEiji Ota 	return (0);
300c0dd49bdSEiji Ota }
301