1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
27  * Copyright 2022 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/socket.h>
35 #include <sys/xti_xtiopt.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 
39 #include <inet/cc.h>
40 #include <inet/common.h>
41 #include <netinet/ip6.h>
42 #include <inet/ip.h>
43 
44 #include <netinet/in.h>
45 #include <netinet/tcp.h>
46 #include <inet/optcom.h>
47 #include <inet/proto_set.h>
48 #include <inet/tcp_impl.h>
49 
50 static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
51 
52 /*
53  * Table of all known options handled on a TCP protocol stack.
54  *
55  * Note: This table contains options processed by both TCP and IP levels
56  *       and is the superset of options that can be performed on a TCP over IP
57  *       stack.
58  */
59 opdes_t	tcp_opt_arr[] = {
60 
61 { SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
62 	sizeof (struct linger), 0 },
63 
64 { SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65 { SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66 { SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
68 	},
69 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
70 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
71 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
72 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
73 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
74 { SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
75 { SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
76 	sizeof (struct timeval), 0 },
77 { SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
78 	sizeof (struct timeval), 0 },
79 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
80 	},
81 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
82 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
83 	0 },
84 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
85 	0 },
86 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
87 	0 },
88 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
89 	0 },
90 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
91 
92 { SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
93 
94 { SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
95 
96 { TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
97 	},
98 { TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
99 	536 },
100 
101 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
102 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
103 
104 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
105 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
106 
107 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
108 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
109 
110 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
111 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
112 
113 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
114 	0 },
115 
116 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
117 	sizeof (int), 0 },
118 
119 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
120 	},
121 
122 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
123 	sizeof (int), 0 },
124 
125 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
126 	sizeof (int), 0	},
127 
128 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
129 
130 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
131 
132 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
133 
134 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
135 	sizeof (int), 0	},
136 
137 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
138 
139 { TCP_QUICKACK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
140 
141 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
142 
143 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
144 
145 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
146 
147 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
148 
149 { TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
150 	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },
151 
152 { IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
153 	(OP_VARLEN|OP_NODEFAULT),
154 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
155 { T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
156 	(OP_VARLEN|OP_NODEFAULT),
157 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
158 
159 { IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
160 { T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
161 { IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
162 	sizeof (int), -1 /* not initialized */ },
163 { IP_RECVTOS,	IPPROTO_IP,  OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
164 
165 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
166 	sizeof (ipsec_req_t), -1 /* not initialized */ },
167 
168 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
169 	sizeof (int),	0 /* no ifindex */ },
170 
171 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
172 	sizeof (int), 0 },
173 
174 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
175 	sizeof (int), -1 /* not initialized */ },
176 
177 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
178 	sizeof (int),	0 /* no ifindex */ },
179 
180 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
181 
182 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
183 	sizeof (in_addr_t),	-1 /* not initialized  */ },
184 
185 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
186 	sizeof (int), 0 },
187 
188 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
189 	(OP_NODEFAULT|OP_VARLEN),
190 	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
191 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
192 	OP_NODEFAULT,
193 	sizeof (sin6_t), -1 /* not initialized */ },
194 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
195 	(OP_VARLEN|OP_NODEFAULT), 255*8,
196 	-1 /* not initialized */ },
197 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
198 	(OP_VARLEN|OP_NODEFAULT), 255*8,
199 	-1 /* not initialized */ },
200 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
201 	(OP_VARLEN|OP_NODEFAULT), 255*8,
202 	-1 /* not initialized */ },
203 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
204 	(OP_VARLEN|OP_NODEFAULT), 255*8,
205 	-1 /* not initialized */ },
206 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
207 	OP_NODEFAULT,
208 	sizeof (int), -1 /* not initialized */ },
209 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
210 	OP_NODEFAULT,
211 	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
212 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
213 	sizeof (int), 0 },
214 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
215 	sizeof (int), 0 },
216 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
217 	sizeof (int), 0 },
218 
219 /* Enable receipt of ancillary data */
220 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
221 	sizeof (int), 0 },
222 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
223 	sizeof (int), 0 },
224 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
225 	sizeof (int), 0 },
226 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
227 	sizeof (int), 0 },
228 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
229 	sizeof (int), 0 },
230 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
231 	sizeof (int), 0 },
232 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
233 	sizeof (int), 0 },
234 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
235 	sizeof (int), 0 },
236 
237 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
238 	sizeof (ipsec_req_t), -1 /* not initialized */ },
239 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
240 	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
241 };
242 
243 /*
244  * Table of all supported levels
245  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
246  * any supported options so we need this info separately.
247  *
248  * This is needed only for topmost tpi providers and is used only by
249  * XTI interfaces.
250  */
251 optlevel_t	tcp_valid_levels_arr[] = {
252 	XTI_GENERIC,
253 	SOL_SOCKET,
254 	IPPROTO_TCP,
255 	IPPROTO_IP,
256 	IPPROTO_IPV6
257 };
258 
259 
260 #define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
261 #define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
262 
263 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
264 
265 /*
266  * Initialize option database object for TCP
267  *
268  * This object represents database of options to search passed to
269  * {sock,tpi}optcom_req() interface routine to take care of option
270  * management and associated methods.
271  */
272 
273 optdb_obj_t tcp_opt_obj = {
274 	tcp_opt_default,	/* TCP default value function pointer */
275 	tcp_tpi_opt_get,	/* TCP get function pointer */
276 	tcp_tpi_opt_set,	/* TCP set function pointer */
277 	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
278 	tcp_opt_arr,		/* TCP option database */
279 	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
280 	tcp_valid_levels_arr	/* TCP valid level array */
281 };
282 
283 static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
284 
285 /*
286  * Some TCP options can be "set" by requesting them in the option
287  * buffer. This is needed for XTI feature test though we do not
288  * allow it in general. We interpret that this mechanism is more
289  * applicable to OSI protocols and need not be allowed in general.
290  * This routine filters out options for which it is not allowed (most)
291  * and lets through those (few) for which it is. [ The XTI interface
292  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
293  * ever implemented will have to be allowed here ].
294  */
295 static boolean_t
tcp_allow_connopt_set(int level,int name)296 tcp_allow_connopt_set(int level, int name)
297 {
298 
299 	switch (level) {
300 	case IPPROTO_TCP:
301 		switch (name) {
302 		case TCP_NODELAY:
303 			return (B_TRUE);
304 		default:
305 			return (B_FALSE);
306 		}
307 		/*NOTREACHED*/
308 	default:
309 		return (B_FALSE);
310 	}
311 	/*NOTREACHED*/
312 }
313 
314 /*
315  * This routine gets default values of certain options whose default
316  * values are maintained by protocol specific code
317  */
318 /* ARGSUSED */
319 static int
tcp_opt_default(queue_t * q,int level,int name,uchar_t * ptr)320 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
321 {
322 	int32_t	*i1 = (int32_t *)ptr;
323 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
324 
325 	switch (level) {
326 	case IPPROTO_TCP:
327 		switch (name) {
328 		case TCP_NOTIFY_THRESHOLD:
329 			*i1 = tcps->tcps_ip_notify_interval;
330 			break;
331 		case TCP_ABORT_THRESHOLD:
332 			*i1 = tcps->tcps_ip_abort_interval;
333 			break;
334 		case TCP_CONN_NOTIFY_THRESHOLD:
335 			*i1 = tcps->tcps_ip_notify_cinterval;
336 			break;
337 		case TCP_CONN_ABORT_THRESHOLD:
338 			*i1 = tcps->tcps_ip_abort_cinterval;
339 			break;
340 		default:
341 			return (-1);
342 		}
343 		break;
344 	case IPPROTO_IP:
345 		switch (name) {
346 		case IP_TTL:
347 			*i1 = tcps->tcps_ipv4_ttl;
348 			break;
349 		default:
350 			return (-1);
351 		}
352 		break;
353 	case IPPROTO_IPV6:
354 		switch (name) {
355 		case IPV6_UNICAST_HOPS:
356 			*i1 = tcps->tcps_ipv6_hoplimit;
357 			break;
358 		default:
359 			return (-1);
360 		}
361 		break;
362 	default:
363 		return (-1);
364 	}
365 	return (sizeof (int));
366 }
367 
368 /*
369  * TCP routine to get the values of options.
370  */
371 int
tcp_opt_get(conn_t * connp,int level,int name,uchar_t * ptr)372 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
373 {
374 	int		*i1 = (int *)ptr;
375 	tcp_t		*tcp = connp->conn_tcp;
376 	conn_opt_arg_t	coas;
377 	int		retval;
378 
379 	coas.coa_connp = connp;
380 	coas.coa_ixa = connp->conn_ixa;
381 	coas.coa_ipp = &connp->conn_xmit_ipp;
382 	coas.coa_ancillary = B_FALSE;
383 	coas.coa_changed = 0;
384 
385 	switch (level) {
386 	case SOL_SOCKET:
387 		switch (name) {
388 		case SO_SND_COPYAVOID:
389 			*i1 = tcp->tcp_snd_zcopy_on ?
390 			    SO_SND_COPYAVOID : 0;
391 			return (sizeof (int));
392 		case SO_ACCEPTCONN:
393 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
394 			return (sizeof (int));
395 		}
396 		break;
397 	case IPPROTO_TCP:
398 		switch (name) {
399 		case TCP_NODELAY:
400 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
401 			return (sizeof (int));
402 		case TCP_MAXSEG:
403 			*i1 = tcp->tcp_mss;
404 			return (sizeof (int));
405 		case TCP_NOTIFY_THRESHOLD:
406 			*i1 = (int)tcp->tcp_first_timer_threshold;
407 			return (sizeof (int));
408 		case TCP_ABORT_THRESHOLD:
409 			*i1 = tcp->tcp_second_timer_threshold;
410 			return (sizeof (int));
411 		case TCP_CONN_NOTIFY_THRESHOLD:
412 			*i1 = tcp->tcp_first_ctimer_threshold;
413 			return (sizeof (int));
414 		case TCP_CONN_ABORT_THRESHOLD:
415 			*i1 = tcp->tcp_second_ctimer_threshold;
416 			return (sizeof (int));
417 		case TCP_INIT_CWND:
418 			*i1 = tcp->tcp_init_cwnd;
419 			return (sizeof (int));
420 		case TCP_KEEPALIVE_THRESHOLD:
421 			*i1 = tcp->tcp_ka_interval;
422 			return (sizeof (int));
423 
424 		/*
425 		 * TCP_KEEPIDLE expects value in seconds, but
426 		 * tcp_ka_interval is in milliseconds.
427 		 */
428 		case TCP_KEEPIDLE:
429 			*i1 = tcp->tcp_ka_interval / 1000;
430 			return (sizeof (int));
431 		case TCP_KEEPCNT:
432 			*i1 = tcp->tcp_ka_cnt;
433 			return (sizeof (int));
434 
435 		/*
436 		 * TCP_KEEPINTVL expects value in seconds, but
437 		 * tcp_ka_rinterval is in milliseconds.
438 		 */
439 		case TCP_KEEPINTVL:
440 			*i1 = tcp->tcp_ka_rinterval / 1000;
441 			return (sizeof (int));
442 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
443 			*i1 = tcp->tcp_ka_abort_thres;
444 			return (sizeof (int));
445 		case TCP_CONGESTION: {
446 			size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
447 			    CC_ALGO_NAME_MAX);
448 			if (len >= CC_ALGO_NAME_MAX)
449 				return (-1);
450 			return (len + 1);
451 		}
452 		case TCP_CORK:
453 			*i1 = tcp->tcp_cork;
454 			return (sizeof (int));
455 		case TCP_QUICKACK:
456 			*i1 = tcp->tcp_quickack;
457 			return (sizeof (int));
458 		case TCP_RTO_INITIAL:
459 			*i1 = tcp->tcp_rto_initial;
460 			return (sizeof (uint32_t));
461 		case TCP_RTO_MIN:
462 			*i1 = tcp->tcp_rto_min;
463 			return (sizeof (uint32_t));
464 		case TCP_RTO_MAX:
465 			*i1 = tcp->tcp_rto_max;
466 			return (sizeof (uint32_t));
467 		case TCP_LINGER2:
468 			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
469 			return (sizeof (int));
470 		}
471 		break;
472 	case IPPROTO_IP:
473 		if (connp->conn_family != AF_INET)
474 			return (-1);
475 		switch (name) {
476 		case IP_OPTIONS:
477 		case T_IP_OPTIONS:
478 			/* Caller ensures enough space */
479 			return (ip_opt_get_user(connp, ptr));
480 		default:
481 			break;
482 		}
483 		break;
484 
485 	case IPPROTO_IPV6:
486 		/*
487 		 * IPPROTO_IPV6 options are only supported for sockets
488 		 * that are using IPv6 on the wire.
489 		 */
490 		if (connp->conn_ipversion != IPV6_VERSION) {
491 			return (-1);
492 		}
493 		switch (name) {
494 		case IPV6_PATHMTU:
495 			if (tcp->tcp_state < TCPS_ESTABLISHED)
496 				return (-1);
497 			break;
498 		}
499 		break;
500 	}
501 	mutex_enter(&connp->conn_lock);
502 	retval = conn_opt_get(&coas, level, name, ptr);
503 	mutex_exit(&connp->conn_lock);
504 	return (retval);
505 }
506 
507 /*
508  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
509  * Parameters are assumed to be verified by the caller.
510  */
511 /* ARGSUSED */
512 int
tcp_opt_set(conn_t * connp,uint_t optset_context,int level,int name,uint_t inlen,uchar_t * invalp,uint_t * outlenp,uchar_t * outvalp,void * thisdg_attrs,cred_t * cr)513 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
514     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
515     void *thisdg_attrs, cred_t *cr)
516 {
517 	tcp_t	*tcp = connp->conn_tcp;
518 	int	*i1 = (int *)invalp;
519 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
520 	boolean_t checkonly;
521 	int	reterr;
522 	tcp_stack_t	*tcps = tcp->tcp_tcps;
523 	conn_opt_arg_t	coas;
524 	uint32_t	val = *((uint32_t *)invalp);
525 
526 	coas.coa_connp = connp;
527 	coas.coa_ixa = connp->conn_ixa;
528 	coas.coa_ipp = &connp->conn_xmit_ipp;
529 	coas.coa_ancillary = B_FALSE;
530 	coas.coa_changed = 0;
531 
532 	switch (optset_context) {
533 	case SETFN_OPTCOM_CHECKONLY:
534 		checkonly = B_TRUE;
535 		/*
536 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
537 		 * inlen != 0 implies value supplied and
538 		 *	we have to "pretend" to set it.
539 		 * inlen == 0 implies that there is no
540 		 *	value part in T_CHECK request and just validation
541 		 * done elsewhere should be enough, we just return here.
542 		 */
543 		if (inlen == 0) {
544 			*outlenp = 0;
545 			return (0);
546 		}
547 		break;
548 	case SETFN_OPTCOM_NEGOTIATE:
549 		checkonly = B_FALSE;
550 		break;
551 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
552 	case SETFN_CONN_NEGOTIATE:
553 		checkonly = B_FALSE;
554 		/*
555 		 * Negotiating local and "association-related" options
556 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
557 		 * primitives is allowed by XTI, but we choose
558 		 * to not implement this style negotiation for Internet
559 		 * protocols (We interpret it is a must for OSI world but
560 		 * optional for Internet protocols) for all options.
561 		 * [ Will do only for the few options that enable test
562 		 * suites that our XTI implementation of this feature
563 		 * works for transports that do allow it ]
564 		 */
565 		if (!tcp_allow_connopt_set(level, name)) {
566 			*outlenp = 0;
567 			return (EINVAL);
568 		}
569 		break;
570 	default:
571 		/*
572 		 * We should never get here
573 		 */
574 		*outlenp = 0;
575 		return (EINVAL);
576 	}
577 
578 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
579 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
580 
581 	/*
582 	 * For TCP, we should have no ancillary data sent down
583 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
584 	 * has to be zero.
585 	 */
586 	ASSERT(thisdg_attrs == NULL);
587 
588 	/*
589 	 * For fixed length options, no sanity check
590 	 * of passed in length is done. It is assumed *_optcom_req()
591 	 * routines do the right thing.
592 	 */
593 	switch (level) {
594 	case SOL_SOCKET:
595 		switch (name) {
596 		case SO_KEEPALIVE:
597 			if (checkonly) {
598 				/* check only case */
599 				break;
600 			}
601 
602 			if (!onoff) {
603 				if (connp->conn_keepalive) {
604 					if (tcp->tcp_ka_tid != 0) {
605 						(void) TCP_TIMER_CANCEL(tcp,
606 						    tcp->tcp_ka_tid);
607 						tcp->tcp_ka_tid = 0;
608 					}
609 					connp->conn_keepalive = 0;
610 				}
611 				break;
612 			}
613 			if (!connp->conn_keepalive) {
614 				/* Crank up the keepalive timer */
615 				tcp->tcp_ka_last_intrvl = 0;
616 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
617 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
618 				connp->conn_keepalive = 1;
619 			}
620 			break;
621 		case SO_SNDBUF: {
622 			if (*i1 > tcps->tcps_max_buf) {
623 				*outlenp = 0;
624 				return (ENOBUFS);
625 			}
626 			if (checkonly)
627 				break;
628 
629 			connp->conn_sndbuf = *i1;
630 			if (tcps->tcps_snd_lowat_fraction != 0) {
631 				connp->conn_sndlowat = connp->conn_sndbuf /
632 				    tcps->tcps_snd_lowat_fraction;
633 			}
634 			(void) tcp_maxpsz_set(tcp, B_TRUE);
635 			/*
636 			 * If we are flow-controlled, recheck the condition.
637 			 * There are apps that increase SO_SNDBUF size when
638 			 * flow-controlled (EWOULDBLOCK), and expect the flow
639 			 * control condition to be lifted right away.
640 			 */
641 			mutex_enter(&tcp->tcp_non_sq_lock);
642 			if (tcp->tcp_flow_stopped &&
643 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
644 				tcp_clrqfull(tcp);
645 			}
646 			mutex_exit(&tcp->tcp_non_sq_lock);
647 			*outlenp = inlen;
648 			return (0);
649 		}
650 		case SO_RCVBUF:
651 			if (*i1 > tcps->tcps_max_buf) {
652 				*outlenp = 0;
653 				return (ENOBUFS);
654 			}
655 			/* Silently ignore zero */
656 			if (!checkonly && *i1 != 0) {
657 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
658 				(void) tcp_rwnd_set(tcp, *i1);
659 			}
660 			/*
661 			 * XXX should we return the rwnd here
662 			 * and tcp_opt_get ?
663 			 */
664 			*outlenp = inlen;
665 			return (0);
666 		case SO_SND_COPYAVOID:
667 			if (!checkonly) {
668 				if (tcp->tcp_loopback ||
669 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
670 					*outlenp = 0;
671 					return (EOPNOTSUPP);
672 				}
673 				tcp->tcp_snd_zcopy_aware = 1;
674 			}
675 			*outlenp = inlen;
676 			return (0);
677 		}
678 		break;
679 	case IPPROTO_TCP:
680 		switch (name) {
681 		case TCP_NODELAY:
682 			if (!checkonly)
683 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
684 			break;
685 		case TCP_NOTIFY_THRESHOLD:
686 			if (!checkonly)
687 				tcp->tcp_first_timer_threshold = *i1;
688 			break;
689 		case TCP_ABORT_THRESHOLD:
690 			if (!checkonly)
691 				tcp->tcp_second_timer_threshold = *i1;
692 			break;
693 		case TCP_CONN_NOTIFY_THRESHOLD:
694 			if (!checkonly)
695 				tcp->tcp_first_ctimer_threshold = *i1;
696 			break;
697 		case TCP_CONN_ABORT_THRESHOLD:
698 			if (!checkonly)
699 				tcp->tcp_second_ctimer_threshold = *i1;
700 			break;
701 		case TCP_RECVDSTADDR:
702 			if (tcp->tcp_state > TCPS_LISTEN) {
703 				*outlenp = 0;
704 				return (EOPNOTSUPP);
705 			}
706 			/* Setting done in conn_opt_set */
707 			break;
708 		case TCP_INIT_CWND:
709 			if (checkonly)
710 				break;
711 
712 			/*
713 			 * Only allow socket with network configuration
714 			 * privilege to set the initial cwnd to be larger
715 			 * than allowed by RFC 3390.
716 			 */
717 			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
718 				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
719 				    != 0) {
720 					*outlenp = 0;
721 					return (reterr);
722 				}
723 				if (val > tcp_max_init_cwnd) {
724 					*outlenp = 0;
725 					return (EINVAL);
726 				}
727 			}
728 
729 			tcp->tcp_init_cwnd = val;
730 
731 			/*
732 			 * If the socket is connected, AND no outbound data
733 			 * has been sent, reset the actual cwnd values.
734 			 */
735 			if (tcp->tcp_state == TCPS_ESTABLISHED &&
736 			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
737 				tcp->tcp_cwnd =
738 				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
739 			}
740 			break;
741 
742 		/*
743 		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
744 		 * is in milliseconds. TCP_KEEPIDLE is introduced for
745 		 * compatibility with other Unix flavors.
746 		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
747 		 * converting the input to milliseconds.
748 		 */
749 		case TCP_KEEPIDLE:
750 			*i1 *= 1000;
751 			/* FALLTHRU */
752 
753 		case TCP_KEEPALIVE_THRESHOLD:
754 			if (checkonly)
755 				break;
756 
757 			if (*i1 < tcps->tcps_keepalive_interval_low ||
758 			    *i1 > tcps->tcps_keepalive_interval_high) {
759 				*outlenp = 0;
760 				return (EINVAL);
761 			}
762 			if (*i1 != tcp->tcp_ka_interval) {
763 				tcp->tcp_ka_interval = *i1;
764 				/*
765 				 * Check if we need to restart the
766 				 * keepalive timer.
767 				 */
768 				if (tcp->tcp_ka_tid != 0) {
769 					ASSERT(connp->conn_keepalive);
770 					(void) TCP_TIMER_CANCEL(tcp,
771 					    tcp->tcp_ka_tid);
772 					tcp->tcp_ka_last_intrvl = 0;
773 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
774 					    tcp_keepalive_timer,
775 					    tcp->tcp_ka_interval);
776 				}
777 			}
778 			break;
779 
780 		/*
781 		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
782 		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
783 		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
784 		 * tcp_ka_cnt.
785 		 */
786 		case TCP_KEEPCNT:
787 			if (checkonly)
788 				break;
789 
790 			if (*i1 == 0) {
791 				return (EINVAL);
792 			} else if (tcp->tcp_ka_rinterval == 0) {
793 				/*
794 				 * When TCP_KEEPCNT is specified without first
795 				 * specifying a TCP_KEEPINTVL, we infer an
796 				 * interval based on a tunable specific to our
797 				 * stack: the tcp_keepalive_abort_interval.
798 				 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
799 				 * the unlikely event that that has been set.)
800 				 * Given the abort interval's default value of
801 				 * 480 seconds, low TCP_KEEPCNT values can
802 				 * result in intervals that exceed the default
803 				 * maximum RTO of 60 seconds.  Rather than
804 				 * fail in these cases, we (implicitly) clamp
805 				 * the interval at the maximum RTO; if the
806 				 * TCP_KEEPCNT is shortly followed by a
807 				 * TCP_KEEPINTVL (as we expect), the abort
808 				 * threshold will be recalculated correctly --
809 				 * and if a TCP_KEEPINTVL is not forthcoming,
810 				 * keep-alive will at least operate reasonably
811 				 * given the underconfigured state.
812 				 */
813 				uint32_t interval;
814 
815 				interval = tcp->tcp_ka_abort_thres / *i1;
816 
817 				if (interval < tcp->tcp_rto_min)
818 					interval = tcp->tcp_rto_min;
819 
820 				if (interval > tcp->tcp_rto_max)
821 					interval = tcp->tcp_rto_max;
822 
823 				tcp->tcp_ka_rinterval = interval;
824 			} else {
825 				if ((*i1 * tcp->tcp_ka_rinterval) <
826 				    tcps->tcps_keepalive_abort_interval_low ||
827 				    (*i1 * tcp->tcp_ka_rinterval) >
828 				    tcps->tcps_keepalive_abort_interval_high)
829 					return (EINVAL);
830 				tcp->tcp_ka_abort_thres =
831 				    (*i1 * tcp->tcp_ka_rinterval);
832 			}
833 			tcp->tcp_ka_cnt = *i1;
834 			break;
835 		case TCP_KEEPINTVL:
836 			/*
837 			 * TCP_KEEPINTVL is specified in seconds, but
838 			 * tcp_ka_rinterval is in milliseconds.
839 			 */
840 
841 			if (checkonly)
842 				break;
843 
844 			if ((*i1 * 1000) < tcp->tcp_rto_min ||
845 			    (*i1 * 1000) > tcp->tcp_rto_max)
846 				return (EINVAL);
847 
848 			if (tcp->tcp_ka_cnt == 0) {
849 				tcp->tcp_ka_cnt =
850 				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
851 			} else {
852 				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
853 				    tcps->tcps_keepalive_abort_interval_low ||
854 				    (*i1 * tcp->tcp_ka_cnt * 1000) >
855 				    tcps->tcps_keepalive_abort_interval_high)
856 					return (EINVAL);
857 				tcp->tcp_ka_abort_thres =
858 				    (*i1 * tcp->tcp_ka_cnt * 1000);
859 			}
860 			tcp->tcp_ka_rinterval = *i1 * 1000;
861 			break;
862 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
863 			if (!checkonly) {
864 				if (*i1 <
865 				    tcps->tcps_keepalive_abort_interval_low ||
866 				    *i1 >
867 				    tcps->tcps_keepalive_abort_interval_high) {
868 					*outlenp = 0;
869 					return (EINVAL);
870 				}
871 				tcp->tcp_ka_abort_thres = *i1;
872 				tcp->tcp_ka_cnt = 0;
873 				tcp->tcp_ka_rinterval = 0;
874 			}
875 			break;
876 		case TCP_CONGESTION: {
877 			struct cc_algo *algo;
878 
879 			if (checkonly) {
880 				break;
881 			}
882 
883 			/*
884 			 * Make sure the string is NUL-terminated. Some
885 			 * consumers pass only the number of characters
886 			 * in the string, and don't include the NUL
887 			 * terminator, so we set it for them.
888 			 */
889 			if (inlen < CC_ALGO_NAME_MAX) {
890 				invalp[inlen] = '\0';
891 			}
892 			invalp[CC_ALGO_NAME_MAX - 1] = '\0';
893 
894 			if ((algo = cc_load_algo((char *)invalp)) == NULL) {
895 				return (ENOENT);
896 			}
897 
898 			if (CC_ALGO(tcp)->cb_destroy != NULL) {
899 				CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
900 			}
901 
902 			CC_DATA(tcp) = NULL;
903 			CC_ALGO(tcp) = algo;
904 
905 			if (CC_ALGO(tcp)->cb_init != NULL) {
906 				VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
907 			}
908 
909 			break;
910 		}
911 		case TCP_CORK:
912 			if (!checkonly) {
913 				/*
914 				 * if tcp->tcp_cork was set and is now
915 				 * being unset, we have to make sure that
916 				 * the remaining data gets sent out. Also
917 				 * unset tcp->tcp_cork so that tcp_wput_data()
918 				 * can send data even if it is less than mss
919 				 */
920 				if (tcp->tcp_cork && onoff == 0 &&
921 				    tcp->tcp_unsent > 0) {
922 					tcp->tcp_cork = B_FALSE;
923 					tcp_wput_data(tcp, NULL, B_FALSE);
924 				}
925 				tcp->tcp_cork = onoff;
926 			}
927 			break;
928 		case TCP_QUICKACK:
929 			if (!checkonly) {
930 				tcp->tcp_quickack = onoff;
931 			}
932 			break;
933 		case TCP_RTO_INITIAL:
934 			if (checkonly || val == 0)
935 				break;
936 
937 			/*
938 			 * Sanity checks
939 			 *
940 			 * The initial RTO should be bounded by the minimum
941 			 * and maximum RTO.  And it should also be smaller
942 			 * than the connect attempt abort timeout.  Otherwise,
943 			 * the connection won't be aborted in a period
944 			 * reasonably close to that timeout.
945 			 */
946 			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
947 			    val > tcp->tcp_second_ctimer_threshold ||
948 			    val < tcps->tcps_rexmit_interval_initial_low ||
949 			    val > tcps->tcps_rexmit_interval_initial_high) {
950 				*outlenp = 0;
951 				return (EINVAL);
952 			}
953 			tcp->tcp_rto_initial = val;
954 
955 			/*
956 			 * If TCP has not sent anything, need to re-calculate
957 			 * tcp_rto.  Otherwise, this option change does not
958 			 * really affect anything.
959 			 */
960 			if (tcp->tcp_state >= TCPS_SYN_SENT)
961 				break;
962 
963 			tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
964 			tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
965 			tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
966 			    tcps->tcps_conn_grace_period);
967 			break;
968 		case TCP_RTO_MIN:
969 			if (checkonly || val == 0)
970 				break;
971 
972 			if (val < tcps->tcps_rexmit_interval_min_low ||
973 			    val > tcps->tcps_rexmit_interval_min_high ||
974 			    val > tcp->tcp_rto_max) {
975 				*outlenp = 0;
976 				return (EINVAL);
977 			}
978 			tcp->tcp_rto_min = val;
979 			if (tcp->tcp_rto < val)
980 				tcp->tcp_rto = val;
981 			break;
982 		case TCP_RTO_MAX:
983 			if (checkonly || val == 0)
984 				break;
985 
986 			/*
987 			 * Sanity checks
988 			 *
989 			 * The maximum RTO should not be larger than the
990 			 * connection abort timeout.  Otherwise, the
991 			 * connection won't be aborted in a period reasonably
992 			 * close to that timeout.
993 			 */
994 			if (val < tcps->tcps_rexmit_interval_max_low ||
995 			    val > tcps->tcps_rexmit_interval_max_high ||
996 			    val < tcp->tcp_rto_min ||
997 			    val > tcp->tcp_second_timer_threshold) {
998 				*outlenp = 0;
999 				return (EINVAL);
1000 			}
1001 			tcp->tcp_rto_max = val;
1002 			if (tcp->tcp_rto > val)
1003 				tcp->tcp_rto = val;
1004 			break;
1005 		case TCP_LINGER2:
1006 			if (checkonly || *i1 == 0)
1007 				break;
1008 
1009 			/*
1010 			 * Note that the option value's unit is second.  And
1011 			 * the value should be bigger than the private
1012 			 * parameter tcp_fin_wait_2_flush_interval's lower
1013 			 * bound and smaller than the current value of that
1014 			 * parameter.  It should be smaller than the current
1015 			 * value to avoid an app setting TCP_LINGER2 to a big
1016 			 * value, causing resource to be held up too long in
1017 			 * FIN-WAIT-2 state.
1018 			 */
1019 			if (*i1 < 0 ||
1020 			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1021 			    *i1 ||
1022 			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1023 			    *i1) {
1024 				*outlenp = 0;
1025 				return (EINVAL);
1026 			}
1027 			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1028 			break;
1029 		default:
1030 			break;
1031 		}
1032 		break;
1033 	case IPPROTO_IP:
1034 		if (connp->conn_family != AF_INET) {
1035 			*outlenp = 0;
1036 			return (EINVAL);
1037 		}
1038 		switch (name) {
1039 		case IP_SEC_OPT:
1040 			/*
1041 			 * We should not allow policy setting after
1042 			 * we start listening for connections.
1043 			 */
1044 			if (tcp->tcp_state == TCPS_LISTEN) {
1045 				return (EINVAL);
1046 			}
1047 			break;
1048 		case IP_RECVTOS:
1049 			if (!checkonly) {
1050 				/*
1051 				 * Force it to be sent up with the next msg
1052 				 * by setting it to a value which cannot
1053 				 * appear in a packet (TOS is only 8-bits)
1054 				 */
1055 				tcp->tcp_recvtos = 0xffffffffU;
1056 			}
1057 			break;
1058 		}
1059 		break;
1060 	case IPPROTO_IPV6:
1061 		/*
1062 		 * IPPROTO_IPV6 options are only supported for sockets
1063 		 * that are using IPv6 on the wire.
1064 		 */
1065 		if (connp->conn_ipversion != IPV6_VERSION) {
1066 			*outlenp = 0;
1067 			return (EINVAL);
1068 		}
1069 
1070 		switch (name) {
1071 		case IPV6_RECVPKTINFO:
1072 			if (!checkonly) {
1073 				/* Force it to be sent up with the next msg */
1074 				tcp->tcp_recvifindex = 0;
1075 			}
1076 			break;
1077 		case IPV6_RECVTCLASS:
1078 			if (!checkonly) {
1079 				/* Force it to be sent up with the next msg */
1080 				tcp->tcp_recvtclass = 0xffffffffU;
1081 			}
1082 			break;
1083 		case IPV6_RECVHOPLIMIT:
1084 			if (!checkonly) {
1085 				/* Force it to be sent up with the next msg */
1086 				tcp->tcp_recvhops = 0xffffffffU;
1087 			}
1088 			break;
1089 		case IPV6_PKTINFO:
1090 			/* This is an extra check for TCP */
1091 			if (inlen == sizeof (struct in6_pktinfo)) {
1092 				struct in6_pktinfo *pkti;
1093 
1094 				pkti = (struct in6_pktinfo *)invalp;
1095 				/*
1096 				 * RFC 3542 states that ipi6_addr must be
1097 				 * the unspecified address when setting the
1098 				 * IPV6_PKTINFO sticky socket option on a
1099 				 * TCP socket.
1100 				 */
1101 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1102 					return (EINVAL);
1103 			}
1104 			break;
1105 		case IPV6_SEC_OPT:
1106 			/*
1107 			 * We should not allow policy setting after
1108 			 * we start listening for connections.
1109 			 */
1110 			if (tcp->tcp_state == TCPS_LISTEN) {
1111 				return (EINVAL);
1112 			}
1113 			break;
1114 		}
1115 		break;
1116 	}
1117 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1118 	    checkonly, cr);
1119 	if (reterr != 0) {
1120 		*outlenp = 0;
1121 		return (reterr);
1122 	}
1123 
1124 	/*
1125 	 * Common case of OK return with outval same as inval
1126 	 */
1127 	if (invalp != outvalp) {
1128 		/* don't trust bcopy for identical src/dst */
1129 		(void) bcopy(invalp, outvalp, inlen);
1130 	}
1131 	*outlenp = inlen;
1132 
1133 	if (coas.coa_changed & COA_HEADER_CHANGED) {
		/* If we are connected we rebuild the headers */
1135 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1136 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1137 			reterr = tcp_build_hdrs(tcp);
1138 			if (reterr != 0)
1139 				return (reterr);
1140 		}
1141 	}
1142 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1143 		in6_addr_t nexthop;
1144 
1145 		/*
1146 		 * If we are connected we re-cache the information.
1147 		 * We ignore errors to preserve BSD behavior.
1148 		 * Note that we don't redo IPsec policy lookup here
1149 		 * since the final destination (or source) didn't change.
1150 		 */
1151 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1152 		    &connp->conn_faddr_v6, &nexthop);
1153 
1154 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1155 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1156 			(void) ip_attr_connect(connp, connp->conn_ixa,
1157 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1158 			    &nexthop, connp->conn_fport, NULL, NULL,
1159 			    IPDF_VERIFY_DST);
1160 		}
1161 	}
1162 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1163 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1164 	}
1165 	if (coas.coa_changed & COA_WROFF_CHANGED) {
1166 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1167 		    tcps->tcps_wroff_xtra;
1168 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1169 		    connp->conn_wroff);
1170 	}
1171 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1172 		if (IPCL_IS_NONSTR(connp))
1173 			proto_set_rx_oob_opt(connp, onoff);
1174 	}
1175 	return (0);
1176 }
1177