1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define _SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/policy.h>
39 #include <sys/squeue_impl.h>
40 #include <sys/squeue.h>
41 #include <sys/tsol/tnet.h>
42
43 #include <rpc/pmap_prot.h>
44
45 #include <inet/common.h>
46 #include <inet/ip.h>
47 #include <inet/tcp.h>
48 #include <inet/tcp_impl.h>
49 #include <inet/proto_set.h>
50 #include <inet/ipsec_impl.h>
51
/* Setable in /etc/system */
/*
 * If set to 0, pick ephemeral ports sequentially; otherwise pick them
 * randomly within the anonymous port range (see tcp_update_next_port()).
 */
static uint32_t tcp_random_anon_port = 1;

/* Forward declarations for local helpers. */
static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
    cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
59
60 /*
61 * Hash list insertion routine for tcp_t structures. Each hash bucket
62 * contains a list of tcp_t entries, and each entry is bound to a unique
63 * port. If there are multiple tcp_t's that are bound to the same port, then
64 * one of them will be linked into the hash bucket list, and the rest will
65 * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
67 */
void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
	tcp_t **tcpp;
	tcp_t *tcpnext;
	tcp_t *tcphash;
	conn_t *connp = tcp->tcp_connp;
	conn_t *connext;

	/* If already hashed somewhere, unlink before re-inserting. */
	if (tcp->tcp_ptpbhn != NULL) {
		ASSERT(!caller_holds_lock);
		tcp_bind_hash_remove(tcp);
	}
	tcpp = &tbf->tf_tcp;
	if (!caller_holds_lock) {
		mutex_enter(&tbf->tf_lock);
	} else {
		ASSERT(MUTEX_HELD(&tbf->tf_lock));
	}
	/*
	 * The bucket is two-dimensional: tcp_bind_hash links entries bound
	 * to distinct ports; tcp_bind_hash_port chains the entries sharing
	 * one port.  tcp_ptpbhn always points back at the pointer that
	 * references this tcp, allowing O(1) removal.
	 */
	tcphash = tcpp[0];
	tcpnext = NULL;
	if (tcphash != NULL) {
		/* Look for an entry using the same port */
		while ((tcphash = tcpp[0]) != NULL &&
		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
			tcpp = &(tcphash->tcp_bind_hash);

		/* The port was not found, just add to the end */
		if (tcphash == NULL)
			goto insert;

		/*
		 * OK, there already exists an entry bound to the
		 * same port.
		 *
		 * If the new tcp bound to the INADDR_ANY address
		 * and the first one in the list is not bound to
		 * INADDR_ANY we skip all entries until we find the
		 * first one bound to INADDR_ANY.
		 * This makes sure that applications binding to a
		 * specific address get preference over those binding to
		 * INADDR_ANY.
		 */
		tcpnext = tcphash;
		connext = tcpnext->tcp_connp;
		tcphash = NULL;
		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
			/* Skip past every specific-address entry. */
			while ((tcpnext = tcpp[0]) != NULL) {
				connext = tcpnext->tcp_connp;
				if (!V6_OR_V4_INADDR_ANY(
				    connext->conn_bound_addr_v6))
					tcpp = &(tcpnext->tcp_bind_hash_port);
				else
					break;
			}
			if (tcpnext != NULL) {
				/*
				 * Splice in ahead of the first INADDR_ANY
				 * entry; take over its cross-port link so
				 * only the head of a same-port chain ever
				 * sits on the tcp_bind_hash list.
				 */
				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
				tcphash = tcpnext->tcp_bind_hash;
				if (tcphash != NULL) {
					tcphash->tcp_ptpbhn =
					    &(tcp->tcp_bind_hash);
					tcpnext->tcp_bind_hash = NULL;
				}
			}
		} else {
			/*
			 * Become the new head of the same-port chain,
			 * inheriting the old head's cross-port link.
			 */
			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
			tcphash = tcpnext->tcp_bind_hash;
			if (tcphash != NULL) {
				tcphash->tcp_ptpbhn =
				    &(tcp->tcp_bind_hash);
				tcpnext->tcp_bind_hash = NULL;
			}
		}
	}
insert:
	tcp->tcp_bind_hash_port = tcpnext;
	tcp->tcp_bind_hash = tcphash;
	tcp->tcp_ptpbhn = tcpp;
	tcpp[0] = tcp;
	if (!caller_holds_lock)
		mutex_exit(&tbf->tf_lock);
}
151
152 /*
153 * Hash list removal routine for tcp_t structures.
154 */
void
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t *tcpnext;
	kmutex_t *lockp;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	conn_t *connp = tcp->tcp_connp;

	/* Not on any bind-hash list: nothing to do. */
	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);
	/* Re-check under the lock: a racing remove may have unlinked us. */
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			/*
			 * Another entry shares our port; promote it into
			 * our position and hand it our cross-port link.
			 */
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			/* No same-port sibling; relink cross-port successor. */
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}
196
197 /*
198 * Don't let port fall into the privileged range.
199 * Since the extra privileged ports can be arbitrary we also
200 * ensure that we exclude those from consideration.
201 * tcp_g_epriv_ports is not sorted thus we loop over it until
202 * there are no changes.
203 *
204 * Note: No locks are held when inspecting tcp_g_*epriv_ports
205 * but instead the code relies on:
206 * - the fact that the address of the array and its size never changes
207 * - the atomic assignment of the elements of the array
208 *
209 * Returns 0 if there are no more ports available.
210 *
211 * TS note: skip multilevel ports.
212 */
213 in_port_t
tcp_update_next_port(in_port_t port,const tcp_t * tcp,boolean_t random)214 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
215 {
216 int i, bump;
217 boolean_t restart = B_FALSE;
218 tcp_stack_t *tcps = tcp->tcp_tcps;
219
220 if (random && tcp_random_anon_port != 0) {
221 (void) random_get_pseudo_bytes((uint8_t *)&port,
222 sizeof (in_port_t));
223 /*
224 * Unless changed by a sys admin, the smallest anon port
225 * is 32768 and the largest anon port is 65535. It is
226 * very likely (50%) for the random port to be smaller
227 * than the smallest anon port. When that happens,
228 * add port % (anon port range) to the smallest anon
229 * port to get the random port. It should fall into the
230 * valid anon port range.
231 */
232 if ((port < tcps->tcps_smallest_anon_port) ||
233 (port > tcps->tcps_largest_anon_port)) {
234 if (tcps->tcps_smallest_anon_port ==
235 tcps->tcps_largest_anon_port) {
236 bump = 0;
237 } else {
238 bump = port % (tcps->tcps_largest_anon_port -
239 tcps->tcps_smallest_anon_port);
240 }
241 port = tcps->tcps_smallest_anon_port + bump;
242 }
243 }
244
245 retry:
246 if (port < tcps->tcps_smallest_anon_port)
247 port = (in_port_t)tcps->tcps_smallest_anon_port;
248
249 if (port > tcps->tcps_largest_anon_port) {
250 if (restart)
251 return (0);
252 restart = B_TRUE;
253 port = (in_port_t)tcps->tcps_smallest_anon_port;
254 }
255
256 if (port < tcps->tcps_smallest_nonpriv_port)
257 port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
258
259 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
260 if (port == tcps->tcps_g_epriv_ports[i]) {
261 port++;
262 /*
263 * Make sure whether the port is in the
264 * valid range.
265 */
266 goto retry;
267 }
268 }
269 if (is_system_labeled() &&
270 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
271 IPPROTO_TCP, B_TRUE)) != 0) {
272 port = i;
273 goto retry;
274 }
275 return (port);
276 }
277
278 /*
279 * Return the next anonymous port in the privileged port range for
280 * bind checking. It starts at IPPORT_RESERVED - 1 and goes
281 * downwards. This is the same behavior as documented in the userland
282 * library call rresvport(3SOCKET).
283 *
284 * TS note: skip multilevel ports.
285 */
286 static in_port_t
tcp_get_next_priv_port(const tcp_t * tcp)287 tcp_get_next_priv_port(const tcp_t *tcp)
288 {
289 static in_port_t next_priv_port = IPPORT_RESERVED - 1;
290 in_port_t nextport;
291 boolean_t restart = B_FALSE;
292 tcp_stack_t *tcps = tcp->tcp_tcps;
293 retry:
294 if (next_priv_port < tcps->tcps_min_anonpriv_port ||
295 next_priv_port >= IPPORT_RESERVED) {
296 next_priv_port = IPPORT_RESERVED - 1;
297 if (restart)
298 return (0);
299 restart = B_TRUE;
300 }
301 if (is_system_labeled() &&
302 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
303 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
304 next_priv_port = nextport;
305 goto retry;
306 }
307 return (next_priv_port--);
308 }
309
/*
 * Select and validate the local port for a bind request.  Enforces
 * privileged-port policy and, on labeled (Trusted Extensions) systems,
 * multilevel-port (MLP) policy, then calls tcp_bindi() to actually
 * allocate the port.  On success, returns 0 and passes the allocated
 * port back through *requested_port_ptr; on failure, returns a
 * negative TLI error code.
 */
static int
tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
    boolean_t bind_to_req_port_only, cred_t *cr)
{
	in_port_t mlp_port;
	mlp_type_t addrtype, mlptype;
	boolean_t user_specified;
	in_port_t allocated_port;
	in_port_t requested_port = *requested_port_ptr;
	conn_t *connp = tcp->tcp_connp;
	zone_t *zone;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	in6_addr_t v6addr = connp->conn_laddr_v6;

	zone = NULL;
	/*
	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
	 */
	ASSERT(cr != NULL);

	/*
	 * Get a valid port (within the anonymous range and should not
	 * be a privileged one) to use if the user has not given a port.
	 * If multiple threads are here, they may all start with
	 * the same initial port. But, it should be fine as long as
	 * tcp_bindi will ensure that no two threads will be assigned
	 * the same port.
	 *
	 * NOTE: XXX If a privileged process asks for an anonymous port, we
	 * still check for ports only in the range > tcp_smallest_non_priv_port,
	 * unless TCP_ANONPRIVBIND option is set.
	 */
	mlptype = mlptSingle;
	mlp_port = requested_port;
	if (requested_port == 0) {
		/* Anonymous bind: pick a starting candidate port. */
		requested_port = connp->conn_anon_priv_bind ?
		    tcp_get_next_priv_port(tcp) :
		    tcp_update_next_port(tcps->tcps_next_port_to_try,
		    tcp, B_TRUE);
		if (requested_port == 0) {
			return (-TNOADDR);
		}
		user_specified = B_FALSE;

		/*
		 * If the user went through one of the RPC interfaces to create
		 * this socket and RPC is MLP in this zone, then give them an
		 * anonymous MLP.
		 */
		if (connp->conn_anon_mlp && is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    PMAPPORT, addrtype);
			mlp_port = PMAPPORT;
		}
	} else {
		int i;
		boolean_t priv = B_FALSE;

		/*
		 * If the requested_port is in the well-known privileged range,
		 * verify that the stream was opened by a privileged user.
		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
		 * but instead the code relies on:
		 * - the fact that the address of the array and its size never
		 *   changes
		 * - the atomic assignment of the elements of the array
		 */
		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
			priv = B_TRUE;
		} else {
			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
				if (requested_port ==
				    tcps->tcps_g_epriv_ports[i]) {
					priv = B_TRUE;
					break;
				}
			}
		}
		if (priv) {
			if (secpolicy_net_privaddr(cr, requested_port,
			    IPPROTO_TCP) != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: no priv for port %d",
					    requested_port);
				}
				return (-TACCES);
			}
		}
		user_specified = B_TRUE;

		/* NOTE(review): connp already holds tcp->tcp_connp here. */
		connp = tcp->tcp_connp;
		if (is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    requested_port, addrtype);
		}
	}

	/* Multilevel port: extra privilege and zone-ownership checks. */
	if (mlptype != mlptSingle) {
		if (secpolicy_net_bindmlp(cr) != 0) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: no priv for multilevel port %d",
				    requested_port);
			}
			return (-TACCES);
		}

		/*
		 * If we're specifically binding a shared IP address and the
		 * port is MLP on shared addresses, then check to see if this
		 * zone actually owns the MLP.  Reject if not.
		 */
		if (mlptype == mlptShared && addrtype == mlptShared) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid_t mlpzone;

			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
			    htons(mlp_port));
			if (connp->conn_zoneid != mlpzone) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: attempt to bind port "
					    "%d on shared addr in zone %d "
					    "(should be %d)",
					    mlp_port, connp->conn_zoneid,
					    mlpzone);
				}
				return (-TACCES);
			}
		}

		/* Anonymous MLP: reserve the port with the TX framework. */
		if (!user_specified) {
			int err;
			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_TRUE);
			if (err != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: cannot establish anon "
					    "MLP for port %d",
					    requested_port);
				}
				return (err);
			}
			connp->conn_anon_port = B_TRUE;
		}
		connp->conn_mlp_type = mlptype;
	}

	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
	    user_specified);

	if (allocated_port == 0) {
		/* Allocation failed: undo any anonymous MLP reservation. */
		connp->conn_mlp_type = mlptSingle;
		if (connp->conn_anon_port) {
			connp->conn_anon_port = B_FALSE;
			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_FALSE);
		}
		if (bind_to_req_port_only) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: requested addr busy");
			}
			return (-TADDRBUSY);
		} else {
			/* If we are out of ports, fail the bind. */
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: out of ports?");
			}
			return (-TNOADDR);
		}
	}

	/* Pass the allocated port back */
	*requested_port_ptr = allocated_port;
	return (0);
}
516
517 /*
518 * Check the address and check/pick a local port number.
519 */
int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
    boolean_t bind_to_req_port_only)
{
	tcp_t *tcp = connp->conn_tcp;
	sin_t *sin;
	sin6_t *sin6;
	in_port_t requested_port;
	ipaddr_t v4addr;
	in6_addr_t v6addr;
	ip_laddr_t laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	zoneid_t zoneid = IPCL_ZONEID(connp);
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint_t scopeid = 0;
	int error = 0;
	ip_xmit_attr_t *ixa = connp->conn_ixa;

	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);

	/* Binding twice is a no-op; any later state is an error. */
	if (tcp->tcp_state == TCPS_BOUND) {
		return (0);
	} else if (tcp->tcp_state > TCPS_BOUND) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad state, %d", tcp->tcp_state);
		}
		return (-TOUTSTATE);
	}

	ASSERT(sa != NULL && len != 0);

	/* The sockaddr must be naturally aligned for the casts below. */
	if (!OK_32PTR((char *)sa)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address parameter, "
			    "address %p, len %d",
			    (void *)sa, len);
		}
		return (-TPROTO);
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	/* Dispatch on sockaddr size: sin_t is IPv4, sin6_t is IPv6. */
	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		requested_port = ntohs(sin->sin_port);
		v4addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
		if (v4addr != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
			    B_FALSE);
		}
		break;

	case sizeof (sin6_t):	/* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		v6addr = sin6->sin6_addr;
		requested_port = ntohs(sin6->sin6_port);
		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
			/* V4-mapped addresses are rejected on v6-only. */
			if (connp->conn_ipv6_v6only)
				return (EADDRNOTAVAIL);

			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
			if (v4addr != INADDR_ANY) {
				laddr_type = ip_laddr_verify_v4(v4addr,
				    zoneid, ipst, B_FALSE);
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
				/* Link-locals need the scope id to verify. */
				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
					scopeid = sin6->sin6_scope_id;
				laddr_type = ip_laddr_verify_v6(&v6addr,
				    zoneid, ipst, B_FALSE, scopeid);
			}
		}
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address length, %d", len);
		}
		return (EAFNOSUPPORT);
		/* return (-TBADADDR); */
	}

	/* Is the local address a valid unicast address? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	connp->conn_bound_addr_v6 = v6addr;
	if (scopeid != 0) {
		/* Pin transmit and receive to the link-local's interface. */
		ixa->ixa_flags |= IXAF_SCOPEID_SET;
		ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	connp->conn_laddr_v6 = v6addr;
	connp->conn_saddr_v6 = v6addr;

	/* Port 0 means "any port", so the only-this-port flag is moot. */
	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;

	error = tcp_bind_select_lport(tcp, &requested_port,
	    bind_to_req_port_only, cr);
	if (error != 0) {
		/* Roll back the address assignments made above. */
		connp->conn_laddr_v6 = ipv6_all_zeros;
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_bound_addr_v6 = ipv6_all_zeros;
	}
	return (error);
}
639
640 /*
641 * If the "bind_to_req_port_only" parameter is set, if the requested port
 * number is available, return it. If not, return 0.
643 *
644 * If "bind_to_req_port_only" parameter is not set and
645 * If the requested port number is available, return it. If not, return
646 * the first anonymous port we happen across. If no anonymous ports are
647 * available, return 0. addr is the requested local address, if any.
648 *
649 * In either case, when succeeding update the tcp_t to record the port number
650 * and insert it in the bind hash table.
651 *
652 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
653 * without setting SO_REUSEADDR. This is needed so that they
654 * can be viewed as two independent transport protocols.
655 */
in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	/*
	 * Lookup for free addresses is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t lport;
		tf_t *tbf;
		tcp_t *ltcp;
		conn_t *lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the head of the same-port chain for "lport", if any. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/*
		 * Walk every endpoint bound to this port; a "break" below
		 * means the port is unusable, a "continue" means this
		 * particular occupant does not conflict with us.
		 */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not. That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this. If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints. But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				/*
				 * Exclusive-bind semantics: any wildcard or
				 * equal-address overlap makes the port busy.
				 */
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			DTRACE_TCP6(state__change, void, NULL,
			    ip_xmit_attr_t *, connp->conn_ixa,
			    void, NULL, tcp_t *, tcp, void, NULL,
			    int32_t, TCPS_IDLE);

			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		/* Requested port was busy: advance to the next candidate. */
		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
949