1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
27  * Copyright 2024 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/socket.h>
35 #include <sys/xti_xtiopt.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 
39 #include <inet/cc.h>
40 #include <inet/common.h>
41 #include <netinet/ip6.h>
42 #include <inet/ip.h>
43 
44 #include <netinet/in.h>
45 #include <netinet/tcp.h>
46 #include <inet/optcom.h>
47 #include <inet/proto_set.h>
48 #include <inet/tcp_impl.h>
49 
50 static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
51 
/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 *       and is the superset of options that can be performed on a TCP over IP
 *       stack.
 *
 * Every entry is a positional opdes_t initializer.  Judging from the values
 * used below, the columns are, in order:
 *
 *   1. option name
 *   2. option level (SOL_SOCKET, IPPROTO_TCP, IPPROTO_IP, IPPROTO_IPV6)
 *   3. an access mask (OA_R or OA_RW)
 *   4. a second access mask (OA_R or OA_RW)
 *   5. a privilege selector (OP_NP, OP_CONFIG, OP_PRIVPORT or OP_RAW)
 *   6. property flags (0, OP_DEF_FN, OP_VARLEN, OP_NODEFAULT)
 *   7. size of the option value in bytes
 *   8. default value
 *
 * NOTE(review): the opdes_t definition is not part of this file, so the
 * field names and the exact split between columns 3 and 4 should be
 * confirmed there.  Wherever the two masks differ below, column 5 names
 * something other than OP_NP, which suggests column 3 is the access granted
 * without that privilege and column 4 the access granted with it.  For
 * OP_VARLEN entries column 7 is presumably a maximum rather than an exact
 * size.
 *
 * Entries flagged OP_DEF_FN carry -1 as a placeholder in the default column;
 * their defaults are produced at run time by tcp_opt_default().
 */
opdes_t	tcp_opt_arr[] = {

/* SOL_SOCKET level */
{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_TCP level */
{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

/* The four timer thresholds take their defaults from tcp_opt_default(). */
{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0	},

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0	},

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_QUICKACK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/* Value is an algorithm name string of at most CC_ALGO_NAME_MAX bytes. */
{ TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },

/*
 * IPPROTO_IP and IPPROTO_IPV6 levels.  The IPv4 and IPv6 entries are
 * interleaved from here to the end of the table.
 */
{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
/* Default comes from tcp_opt_default(). */
{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },
{ IP_RECVTOS,	IPPROTO_IP,  OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IP_MINTTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/* Default comes from tcp_opt_default(). */
{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t),	-1 /* not initialized  */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },

{ IPV6_MINHOPCOUNT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
};
247 
/*
 * Table of all supported levels
 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 * any supported options so we need this info separately.
 *
 * This is needed only for topmost tpi providers and is used only by
 * XTI interfaces.
 *
 * XTI_GENERIC is the one level listed here for which tcp_opt_arr has no
 * entries; the other four levels each have entries in that table.
 */
optlevel_t	tcp_valid_levels_arr[] = {
	XTI_GENERIC,
	SOL_SOCKET,
	IPPROTO_TCP,
	IPPROTO_IP,
	IPPROTO_IPV6
};
263 
264 
/*
 * Entry counts for the two tables above, as stored in tcp_opt_obj.
 * (A_CNT is assumed to be the usual array-element-count macro -- it is
 * defined outside this file.)
 */
#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)

/*
 * NOTE(review): nothing in the visible part of this file assigns or reads
 * this variable.  Per the remark below it is filled in at driver load time;
 * presumably it holds the largest option size found in tcp_opt_arr --
 * confirm at the site that initializes it.
 */
uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
269 
/*
 * Option database object for TCP.
 *
 * This object describes the TCP option database.  It is what gets passed to
 * the {sock,tpi}optcom_req() interface routines, which use it to search the
 * option table and to reach the associated default/get/set methods.
 *
 * The initializer is positional; the per-line remarks give the role of each
 * member.  tcp_opt_default() is defined later in this file, whereas
 * tcp_tpi_opt_get() and tcp_tpi_opt_set() are defined elsewhere.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default,	/* TCP default value function pointer */
	tcp_tpi_opt_get,	/* TCP get function pointer */
	tcp_tpi_opt_set,	/* TCP set function pointer */
	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
	tcp_opt_arr,		/* TCP option database */
	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
	tcp_valid_levels_arr	/* TCP valid level array */
};
287 
/*
 * Ceiling for the TCP_INIT_CWND socket option.  In tcp_opt_set(), a request
 * above the RFC 3390 limit requires the network configuration privilege and
 * is additionally rejected with EINVAL if it exceeds this value.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
289 
290 /*
291  * Some TCP options can be "set" by requesting them in the option
292  * buffer. This is needed for XTI feature test though we do not
293  * allow it in general. We interpret that this mechanism is more
294  * applicable to OSI protocols and need not be allowed in general.
295  * This routine filters out options for which it is not allowed (most)
296  * and lets through those (few) for which it is. [ The XTI interface
297  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
298  * ever implemented will have to be allowed here ].
299  */
300 static boolean_t
tcp_allow_connopt_set(int level,int name)301 tcp_allow_connopt_set(int level, int name)
302 {
303 
304 	switch (level) {
305 	case IPPROTO_TCP:
306 		switch (name) {
307 		case TCP_NODELAY:
308 			return (B_TRUE);
309 		default:
310 			return (B_FALSE);
311 		}
312 		/*NOTREACHED*/
313 	default:
314 		return (B_FALSE);
315 	}
316 	/*NOTREACHED*/
317 }
318 
319 /*
320  * This routine gets default values of certain options whose default
321  * values are maintained by protocol specific code
322  */
323 /* ARGSUSED */
324 static int
tcp_opt_default(queue_t * q,int level,int name,uchar_t * ptr)325 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
326 {
327 	int32_t	*i1 = (int32_t *)ptr;
328 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
329 
330 	switch (level) {
331 	case IPPROTO_TCP:
332 		switch (name) {
333 		case TCP_NOTIFY_THRESHOLD:
334 			*i1 = tcps->tcps_ip_notify_interval;
335 			break;
336 		case TCP_ABORT_THRESHOLD:
337 			*i1 = tcps->tcps_ip_abort_interval;
338 			break;
339 		case TCP_CONN_NOTIFY_THRESHOLD:
340 			*i1 = tcps->tcps_ip_notify_cinterval;
341 			break;
342 		case TCP_CONN_ABORT_THRESHOLD:
343 			*i1 = tcps->tcps_ip_abort_cinterval;
344 			break;
345 		default:
346 			return (-1);
347 		}
348 		break;
349 	case IPPROTO_IP:
350 		switch (name) {
351 		case IP_TTL:
352 			*i1 = tcps->tcps_ipv4_ttl;
353 			break;
354 		default:
355 			return (-1);
356 		}
357 		break;
358 	case IPPROTO_IPV6:
359 		switch (name) {
360 		case IPV6_UNICAST_HOPS:
361 			*i1 = tcps->tcps_ipv6_hoplimit;
362 			break;
363 		default:
364 			return (-1);
365 		}
366 		break;
367 	default:
368 		return (-1);
369 	}
370 	return (sizeof (int));
371 }
372 
373 /*
374  * TCP routine to get the values of options.
375  */
376 int
tcp_opt_get(conn_t * connp,int level,int name,uchar_t * ptr)377 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
378 {
379 	int		*i1 = (int *)ptr;
380 	tcp_t		*tcp = connp->conn_tcp;
381 	conn_opt_arg_t	coas;
382 	int		retval;
383 
384 	coas.coa_connp = connp;
385 	coas.coa_ixa = connp->conn_ixa;
386 	coas.coa_ipp = &connp->conn_xmit_ipp;
387 	coas.coa_ancillary = B_FALSE;
388 	coas.coa_changed = 0;
389 
390 	switch (level) {
391 	case SOL_SOCKET:
392 		switch (name) {
393 		case SO_SND_COPYAVOID:
394 			*i1 = tcp->tcp_snd_zcopy_on ?
395 			    SO_SND_COPYAVOID : 0;
396 			return (sizeof (int));
397 		case SO_ACCEPTCONN:
398 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
399 			return (sizeof (int));
400 		}
401 		break;
402 	case IPPROTO_TCP:
403 		switch (name) {
404 		case TCP_NODELAY:
405 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
406 			return (sizeof (int));
407 		case TCP_MAXSEG:
408 			*i1 = tcp->tcp_mss;
409 			return (sizeof (int));
410 		case TCP_NOTIFY_THRESHOLD:
411 			*i1 = (int)tcp->tcp_first_timer_threshold;
412 			return (sizeof (int));
413 		case TCP_ABORT_THRESHOLD:
414 			*i1 = tcp->tcp_second_timer_threshold;
415 			return (sizeof (int));
416 		case TCP_CONN_NOTIFY_THRESHOLD:
417 			*i1 = tcp->tcp_first_ctimer_threshold;
418 			return (sizeof (int));
419 		case TCP_CONN_ABORT_THRESHOLD:
420 			*i1 = tcp->tcp_second_ctimer_threshold;
421 			return (sizeof (int));
422 		case TCP_INIT_CWND:
423 			*i1 = tcp->tcp_init_cwnd;
424 			return (sizeof (int));
425 		case TCP_KEEPALIVE_THRESHOLD:
426 			*i1 = tcp->tcp_ka_interval;
427 			return (sizeof (int));
428 
429 		/*
430 		 * TCP_KEEPIDLE expects value in seconds, but
431 		 * tcp_ka_interval is in milliseconds.
432 		 */
433 		case TCP_KEEPIDLE:
434 			*i1 = tcp->tcp_ka_interval / 1000;
435 			return (sizeof (int));
436 		case TCP_KEEPCNT:
437 			*i1 = tcp->tcp_ka_cnt;
438 			return (sizeof (int));
439 
440 		/*
441 		 * TCP_KEEPINTVL expects value in seconds, but
442 		 * tcp_ka_rinterval is in milliseconds.
443 		 */
444 		case TCP_KEEPINTVL:
445 			*i1 = tcp->tcp_ka_rinterval / 1000;
446 			return (sizeof (int));
447 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
448 			*i1 = tcp->tcp_ka_abort_thres;
449 			return (sizeof (int));
450 		case TCP_CONGESTION: {
451 			size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
452 			    CC_ALGO_NAME_MAX);
453 			if (len >= CC_ALGO_NAME_MAX)
454 				return (-1);
455 			return (len + 1);
456 		}
457 		case TCP_CORK:
458 			*i1 = tcp->tcp_cork;
459 			return (sizeof (int));
460 		case TCP_QUICKACK:
461 			*i1 = tcp->tcp_quickack;
462 			return (sizeof (int));
463 		case TCP_RTO_INITIAL:
464 			*i1 = tcp->tcp_rto_initial;
465 			return (sizeof (uint32_t));
466 		case TCP_RTO_MIN:
467 			*i1 = tcp->tcp_rto_min;
468 			return (sizeof (uint32_t));
469 		case TCP_RTO_MAX:
470 			*i1 = tcp->tcp_rto_max;
471 			return (sizeof (uint32_t));
472 		case TCP_LINGER2:
473 			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
474 			return (sizeof (int));
475 		}
476 		break;
477 	case IPPROTO_IP:
478 		if (connp->conn_family != AF_INET)
479 			return (-1);
480 		switch (name) {
481 		case IP_OPTIONS:
482 		case T_IP_OPTIONS:
483 			/* Caller ensures enough space */
484 			return (ip_opt_get_user(connp, ptr));
485 		default:
486 			break;
487 		}
488 		break;
489 
490 	case IPPROTO_IPV6:
491 		/*
492 		 * IPPROTO_IPV6 options are only supported for sockets
493 		 * that are using IPv6 on the wire.
494 		 */
495 		if (connp->conn_ipversion != IPV6_VERSION) {
496 			return (-1);
497 		}
498 		switch (name) {
499 		case IPV6_PATHMTU:
500 			if (tcp->tcp_state < TCPS_ESTABLISHED)
501 				return (-1);
502 			break;
503 		}
504 		break;
505 	}
506 	mutex_enter(&connp->conn_lock);
507 	retval = conn_opt_get(&coas, level, name, ptr);
508 	mutex_exit(&connp->conn_lock);
509 	return (retval);
510 }
511 
512 /*
513  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
514  * Parameters are assumed to be verified by the caller.
515  */
516 /* ARGSUSED */
517 int
tcp_opt_set(conn_t * connp,uint_t optset_context,int level,int name,uint_t inlen,uchar_t * invalp,uint_t * outlenp,uchar_t * outvalp,void * thisdg_attrs,cred_t * cr)518 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
519     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
520     void *thisdg_attrs, cred_t *cr)
521 {
522 	tcp_t	*tcp = connp->conn_tcp;
523 	int	*i1 = (int *)invalp;
524 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
525 	boolean_t checkonly;
526 	int	reterr;
527 	tcp_stack_t	*tcps = tcp->tcp_tcps;
528 	conn_opt_arg_t	coas;
529 	uint32_t	val = *((uint32_t *)invalp);
530 
531 	coas.coa_connp = connp;
532 	coas.coa_ixa = connp->conn_ixa;
533 	coas.coa_ipp = &connp->conn_xmit_ipp;
534 	coas.coa_ancillary = B_FALSE;
535 	coas.coa_changed = 0;
536 
537 	switch (optset_context) {
538 	case SETFN_OPTCOM_CHECKONLY:
539 		checkonly = B_TRUE;
540 		/*
541 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
542 		 * inlen != 0 implies value supplied and
543 		 *	we have to "pretend" to set it.
544 		 * inlen == 0 implies that there is no
545 		 *	value part in T_CHECK request and just validation
546 		 * done elsewhere should be enough, we just return here.
547 		 */
548 		if (inlen == 0) {
549 			*outlenp = 0;
550 			return (0);
551 		}
552 		break;
553 	case SETFN_OPTCOM_NEGOTIATE:
554 		checkonly = B_FALSE;
555 		break;
556 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
557 	case SETFN_CONN_NEGOTIATE:
558 		checkonly = B_FALSE;
559 		/*
560 		 * Negotiating local and "association-related" options
561 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
562 		 * primitives is allowed by XTI, but we choose
563 		 * to not implement this style negotiation for Internet
564 		 * protocols (We interpret it is a must for OSI world but
565 		 * optional for Internet protocols) for all options.
566 		 * [ Will do only for the few options that enable test
567 		 * suites that our XTI implementation of this feature
568 		 * works for transports that do allow it ]
569 		 */
570 		if (!tcp_allow_connopt_set(level, name)) {
571 			*outlenp = 0;
572 			return (EINVAL);
573 		}
574 		break;
575 	default:
576 		/*
577 		 * We should never get here
578 		 */
579 		*outlenp = 0;
580 		return (EINVAL);
581 	}
582 
583 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
584 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
585 
586 	/*
587 	 * For TCP, we should have no ancillary data sent down
588 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
589 	 * has to be zero.
590 	 */
591 	ASSERT(thisdg_attrs == NULL);
592 
593 	/*
594 	 * For fixed length options, no sanity check
595 	 * of passed in length is done. It is assumed *_optcom_req()
596 	 * routines do the right thing.
597 	 */
598 	switch (level) {
599 	case SOL_SOCKET:
600 		switch (name) {
601 		case SO_KEEPALIVE:
602 			if (checkonly) {
603 				/* check only case */
604 				break;
605 			}
606 
607 			if (!onoff) {
608 				if (connp->conn_keepalive) {
609 					if (tcp->tcp_ka_tid != 0) {
610 						(void) TCP_TIMER_CANCEL(tcp,
611 						    tcp->tcp_ka_tid);
612 						tcp->tcp_ka_tid = 0;
613 					}
614 					connp->conn_keepalive = 0;
615 				}
616 				break;
617 			}
618 			if (!connp->conn_keepalive) {
619 				/* Crank up the keepalive timer */
620 				tcp->tcp_ka_last_intrvl = 0;
621 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
622 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
623 				connp->conn_keepalive = 1;
624 			}
625 			break;
626 		case SO_SNDBUF: {
627 			if (*i1 > tcps->tcps_max_buf) {
628 				*outlenp = 0;
629 				return (ENOBUFS);
630 			}
631 			if (checkonly)
632 				break;
633 
634 			connp->conn_sndbuf = *i1;
635 			if (tcps->tcps_snd_lowat_fraction != 0) {
636 				connp->conn_sndlowat = connp->conn_sndbuf /
637 				    tcps->tcps_snd_lowat_fraction;
638 			}
639 			(void) tcp_maxpsz_set(tcp, B_TRUE);
640 			/*
641 			 * If we are flow-controlled, recheck the condition.
642 			 * There are apps that increase SO_SNDBUF size when
643 			 * flow-controlled (EWOULDBLOCK), and expect the flow
644 			 * control condition to be lifted right away.
645 			 */
646 			mutex_enter(&tcp->tcp_non_sq_lock);
647 			if (tcp->tcp_flow_stopped &&
648 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
649 				tcp_clrqfull(tcp);
650 			}
651 			mutex_exit(&tcp->tcp_non_sq_lock);
652 			*outlenp = inlen;
653 			return (0);
654 		}
655 		case SO_RCVBUF:
656 			if (*i1 > tcps->tcps_max_buf) {
657 				*outlenp = 0;
658 				return (ENOBUFS);
659 			}
660 			/* Silently ignore zero */
661 			if (!checkonly && *i1 != 0) {
662 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
663 				(void) tcp_rwnd_set(tcp, *i1);
664 			}
665 			/*
666 			 * XXX should we return the rwnd here
667 			 * and tcp_opt_get ?
668 			 */
669 			*outlenp = inlen;
670 			return (0);
671 		case SO_SND_COPYAVOID:
672 			if (!checkonly) {
673 				if (tcp->tcp_loopback ||
674 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
675 					*outlenp = 0;
676 					return (EOPNOTSUPP);
677 				}
678 				tcp->tcp_snd_zcopy_aware = 1;
679 			}
680 			*outlenp = inlen;
681 			return (0);
682 		}
683 		break;
684 	case IPPROTO_TCP:
685 		switch (name) {
686 		case TCP_NODELAY:
687 			if (!checkonly)
688 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
689 			break;
690 		case TCP_NOTIFY_THRESHOLD:
691 			if (!checkonly)
692 				tcp->tcp_first_timer_threshold = *i1;
693 			break;
694 		case TCP_ABORT_THRESHOLD:
695 			if (!checkonly)
696 				tcp->tcp_second_timer_threshold = *i1;
697 			break;
698 		case TCP_CONN_NOTIFY_THRESHOLD:
699 			if (!checkonly)
700 				tcp->tcp_first_ctimer_threshold = *i1;
701 			break;
702 		case TCP_CONN_ABORT_THRESHOLD:
703 			if (!checkonly)
704 				tcp->tcp_second_ctimer_threshold = *i1;
705 			break;
706 		case TCP_RECVDSTADDR:
707 			if (tcp->tcp_state > TCPS_LISTEN) {
708 				*outlenp = 0;
709 				return (EOPNOTSUPP);
710 			}
711 			/* Setting done in conn_opt_set */
712 			break;
713 		case TCP_INIT_CWND:
714 			if (checkonly)
715 				break;
716 
717 			/*
718 			 * Only allow socket with network configuration
719 			 * privilege to set the initial cwnd to be larger
720 			 * than allowed by RFC 3390.
721 			 */
722 			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
723 				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
724 				    != 0) {
725 					*outlenp = 0;
726 					return (reterr);
727 				}
728 				if (val > tcp_max_init_cwnd) {
729 					*outlenp = 0;
730 					return (EINVAL);
731 				}
732 			}
733 
734 			tcp->tcp_init_cwnd = val;
735 
736 			/*
737 			 * If the socket is connected, AND no outbound data
738 			 * has been sent, reset the actual cwnd values.
739 			 */
740 			if (tcp->tcp_state == TCPS_ESTABLISHED &&
741 			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
742 				tcp->tcp_cwnd =
743 				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
744 			}
745 			break;
746 
747 		/*
748 		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
749 		 * is in milliseconds. TCP_KEEPIDLE is introduced for
750 		 * compatibility with other Unix flavors.
751 		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
752 		 * converting the input to milliseconds.
753 		 */
754 		case TCP_KEEPIDLE:
755 			*i1 *= 1000;
756 			/* FALLTHRU */
757 
758 		case TCP_KEEPALIVE_THRESHOLD:
759 			if (checkonly)
760 				break;
761 
762 			if (*i1 < tcps->tcps_keepalive_interval_low ||
763 			    *i1 > tcps->tcps_keepalive_interval_high) {
764 				*outlenp = 0;
765 				return (EINVAL);
766 			}
767 			if (*i1 != tcp->tcp_ka_interval) {
768 				tcp->tcp_ka_interval = *i1;
769 				/*
770 				 * Check if we need to restart the
771 				 * keepalive timer.
772 				 */
773 				if (tcp->tcp_ka_tid != 0) {
774 					ASSERT(connp->conn_keepalive);
775 					(void) TCP_TIMER_CANCEL(tcp,
776 					    tcp->tcp_ka_tid);
777 					tcp->tcp_ka_last_intrvl = 0;
778 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
779 					    tcp_keepalive_timer,
780 					    tcp->tcp_ka_interval);
781 				}
782 			}
783 			break;
784 
785 		/*
786 		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
787 		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
788 		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
789 		 * tcp_ka_cnt.
790 		 */
791 		case TCP_KEEPCNT:
792 			if (checkonly)
793 				break;
794 
795 			if (*i1 == 0) {
796 				return (EINVAL);
797 			} else if (tcp->tcp_ka_rinterval == 0) {
798 				/*
799 				 * When TCP_KEEPCNT is specified without first
800 				 * specifying a TCP_KEEPINTVL, we infer an
801 				 * interval based on a tunable specific to our
802 				 * stack: the tcp_keepalive_abort_interval.
803 				 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
804 				 * the unlikely event that that has been set.)
805 				 * Given the abort interval's default value of
806 				 * 480 seconds, low TCP_KEEPCNT values can
807 				 * result in intervals that exceed the default
808 				 * maximum RTO of 60 seconds.  Rather than
809 				 * fail in these cases, we (implicitly) clamp
810 				 * the interval at the maximum RTO; if the
811 				 * TCP_KEEPCNT is shortly followed by a
812 				 * TCP_KEEPINTVL (as we expect), the abort
813 				 * threshold will be recalculated correctly --
814 				 * and if a TCP_KEEPINTVL is not forthcoming,
815 				 * keep-alive will at least operate reasonably
816 				 * given the underconfigured state.
817 				 */
818 				uint32_t interval;
819 
820 				interval = tcp->tcp_ka_abort_thres / *i1;
821 
822 				if (interval < tcp->tcp_rto_min)
823 					interval = tcp->tcp_rto_min;
824 
825 				if (interval > tcp->tcp_rto_max)
826 					interval = tcp->tcp_rto_max;
827 
828 				tcp->tcp_ka_rinterval = interval;
829 			} else {
830 				if ((*i1 * tcp->tcp_ka_rinterval) <
831 				    tcps->tcps_keepalive_abort_interval_low ||
832 				    (*i1 * tcp->tcp_ka_rinterval) >
833 				    tcps->tcps_keepalive_abort_interval_high)
834 					return (EINVAL);
835 				tcp->tcp_ka_abort_thres =
836 				    (*i1 * tcp->tcp_ka_rinterval);
837 			}
838 			tcp->tcp_ka_cnt = *i1;
839 			break;
840 		case TCP_KEEPINTVL:
841 			/*
842 			 * TCP_KEEPINTVL is specified in seconds, but
843 			 * tcp_ka_rinterval is in milliseconds.
844 			 */
845 
846 			if (checkonly)
847 				break;
848 
849 			if ((*i1 * 1000) < tcp->tcp_rto_min ||
850 			    (*i1 * 1000) > tcp->tcp_rto_max)
851 				return (EINVAL);
852 
853 			if (tcp->tcp_ka_cnt == 0) {
854 				tcp->tcp_ka_cnt =
855 				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
856 			} else {
857 				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
858 				    tcps->tcps_keepalive_abort_interval_low ||
859 				    (*i1 * tcp->tcp_ka_cnt * 1000) >
860 				    tcps->tcps_keepalive_abort_interval_high)
861 					return (EINVAL);
862 				tcp->tcp_ka_abort_thres =
863 				    (*i1 * tcp->tcp_ka_cnt * 1000);
864 			}
865 			tcp->tcp_ka_rinterval = *i1 * 1000;
866 			break;
867 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
868 			if (!checkonly) {
869 				if (*i1 <
870 				    tcps->tcps_keepalive_abort_interval_low ||
871 				    *i1 >
872 				    tcps->tcps_keepalive_abort_interval_high) {
873 					*outlenp = 0;
874 					return (EINVAL);
875 				}
876 				tcp->tcp_ka_abort_thres = *i1;
877 				tcp->tcp_ka_cnt = 0;
878 				tcp->tcp_ka_rinterval = 0;
879 			}
880 			break;
881 		case TCP_CONGESTION: {
882 			struct cc_algo *algo;
883 
884 			if (checkonly) {
885 				break;
886 			}
887 
888 			/*
889 			 * Make sure the string is NUL-terminated. Some
890 			 * consumers pass only the number of characters
891 			 * in the string, and don't include the NUL
892 			 * terminator, so we set it for them.
893 			 */
894 			if (inlen < CC_ALGO_NAME_MAX) {
895 				invalp[inlen] = '\0';
896 			}
897 			invalp[CC_ALGO_NAME_MAX - 1] = '\0';
898 
899 			if ((algo = cc_load_algo((char *)invalp)) == NULL) {
900 				return (ENOENT);
901 			}
902 
903 			if (CC_ALGO(tcp)->cb_destroy != NULL) {
904 				CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
905 			}
906 
907 			CC_DATA(tcp) = NULL;
908 			CC_ALGO(tcp) = algo;
909 
910 			if (CC_ALGO(tcp)->cb_init != NULL) {
911 				VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
912 			}
913 
914 			break;
915 		}
916 		case TCP_CORK:
917 			if (!checkonly) {
918 				/*
919 				 * if tcp->tcp_cork was set and is now
920 				 * being unset, we have to make sure that
921 				 * the remaining data gets sent out. Also
922 				 * unset tcp->tcp_cork so that tcp_wput_data()
923 				 * can send data even if it is less than mss
924 				 */
925 				if (tcp->tcp_cork && onoff == 0 &&
926 				    tcp->tcp_unsent > 0) {
927 					tcp->tcp_cork = B_FALSE;
928 					tcp_wput_data(tcp, NULL, B_FALSE);
929 				}
930 				tcp->tcp_cork = onoff;
931 			}
932 			break;
933 		case TCP_QUICKACK:
934 			if (!checkonly) {
935 				tcp->tcp_quickack = onoff;
936 			}
937 			break;
938 		case TCP_RTO_INITIAL:
939 			if (checkonly || val == 0)
940 				break;
941 
942 			/*
943 			 * Sanity checks
944 			 *
945 			 * The initial RTO should be bounded by the minimum
946 			 * and maximum RTO.  And it should also be smaller
947 			 * than the connect attempt abort timeout.  Otherwise,
948 			 * the connection won't be aborted in a period
949 			 * reasonably close to that timeout.
950 			 */
951 			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
952 			    val > tcp->tcp_second_ctimer_threshold ||
953 			    val < tcps->tcps_rexmit_interval_initial_low ||
954 			    val > tcps->tcps_rexmit_interval_initial_high) {
955 				*outlenp = 0;
956 				return (EINVAL);
957 			}
958 			tcp->tcp_rto_initial = val;
959 
960 			/*
961 			 * If TCP has not sent anything, need to re-calculate
962 			 * tcp_rto.  Otherwise, this option change does not
963 			 * really affect anything.
964 			 */
965 			if (tcp->tcp_state >= TCPS_SYN_SENT)
966 				break;
967 
968 			tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
969 			tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
970 			tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
971 			    tcps->tcps_conn_grace_period);
972 			break;
973 		case TCP_RTO_MIN:
974 			if (checkonly || val == 0)
975 				break;
976 
977 			if (val < tcps->tcps_rexmit_interval_min_low ||
978 			    val > tcps->tcps_rexmit_interval_min_high ||
979 			    val > tcp->tcp_rto_max) {
980 				*outlenp = 0;
981 				return (EINVAL);
982 			}
983 			tcp->tcp_rto_min = val;
984 			if (tcp->tcp_rto < val)
985 				tcp->tcp_rto = val;
986 			break;
987 		case TCP_RTO_MAX:
988 			if (checkonly || val == 0)
989 				break;
990 
991 			/*
992 			 * Sanity checks
993 			 *
994 			 * The maximum RTO should not be larger than the
995 			 * connection abort timeout.  Otherwise, the
996 			 * connection won't be aborted in a period reasonably
997 			 * close to that timeout.
998 			 */
999 			if (val < tcps->tcps_rexmit_interval_max_low ||
1000 			    val > tcps->tcps_rexmit_interval_max_high ||
1001 			    val < tcp->tcp_rto_min ||
1002 			    val > tcp->tcp_second_timer_threshold) {
1003 				*outlenp = 0;
1004 				return (EINVAL);
1005 			}
1006 			tcp->tcp_rto_max = val;
1007 			if (tcp->tcp_rto > val)
1008 				tcp->tcp_rto = val;
1009 			break;
1010 		case TCP_LINGER2:
1011 			if (checkonly || *i1 == 0)
1012 				break;
1013 
1014 			/*
1015 			 * Note that the option value is in seconds.  The
1016 			 * value must be no smaller than the private
1017 			 * parameter tcp_fin_wait_2_flush_interval's lower
1018 			 * bound and no larger than the current value of that
1019 			 * parameter.  Capping it at the current value keeps
1020 			 * an app from setting TCP_LINGER2 to a big value and
1021 			 * causing resources to be held up too long in the
1022 			 * FIN-WAIT-2 state.
1023 			 */
1024 			if (*i1 < 0 ||
1025 			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1026 			    *i1 ||
1027 			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1028 			    *i1) {
1029 				*outlenp = 0;
1030 				return (EINVAL);
1031 			}
1032 			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1033 			break;
1034 		default:
1035 			break;
1036 		}
1037 		break;
1038 	case IPPROTO_IP:
1039 		if (connp->conn_family != AF_INET) {
1040 			*outlenp = 0;
1041 			return (EINVAL);
1042 		}
1043 		switch (name) {
1044 		case IP_SEC_OPT:
1045 			/*
1046 			 * We should not allow policy setting after
1047 			 * we start listening for connections.
1048 			 */
1049 			if (tcp->tcp_state == TCPS_LISTEN) {
1050 				return (EINVAL);
1051 			}
1052 			break;
1053 		case IP_RECVTOS:
1054 			if (!checkonly) {
1055 				/*
1056 				 * Force it to be sent up with the next msg
1057 				 * by setting it to a value which cannot
1058 				 * appear in a packet (TOS is only 8 bits)
1059 				 */
1060 				tcp->tcp_recvtos = 0xffffffffU;
1061 			}
1062 			break;
1063 		}
1064 		break;
1065 	case IPPROTO_IPV6:
1066 		/*
1067 		 * IPPROTO_IPV6 options are only supported for sockets
1068 		 * that are using IPv6 on the wire.
1069 		 */
1070 		if (connp->conn_ipversion != IPV6_VERSION) {
1071 			*outlenp = 0;
1072 			return (EINVAL);
1073 		}
1074 
1075 		switch (name) {
1076 		case IPV6_RECVPKTINFO:
1077 			if (!checkonly) {
1078 				/* Force it to be sent up with the next msg */
1079 				tcp->tcp_recvifindex = 0;
1080 			}
1081 			break;
1082 		case IPV6_RECVTCLASS:
1083 			if (!checkonly) {
1084 				/* Force it to be sent up with the next msg */
1085 				tcp->tcp_recvtclass = 0xffffffffU;
1086 			}
1087 			break;
1088 		case IPV6_RECVHOPLIMIT:
1089 			if (!checkonly) {
1090 				/* Force it to be sent up with the next msg */
1091 				tcp->tcp_recvhops = 0xffffffffU;
1092 			}
1093 			break;
1094 		case IPV6_PKTINFO:
1095 			/* This is an extra check for TCP */
1096 			if (inlen == sizeof (struct in6_pktinfo)) {
1097 				struct in6_pktinfo *pkti;
1098 
1099 				pkti = (struct in6_pktinfo *)invalp;
1100 				/*
1101 				 * RFC 3542 states that ipi6_addr must be
1102 				 * the unspecified address when setting the
1103 				 * IPV6_PKTINFO sticky socket option on a
1104 				 * TCP socket.
1105 				 */
1106 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1107 					return (EINVAL);
1108 			}
1109 			break;
1110 		case IPV6_SEC_OPT:
1111 			/*
1112 			 * We should not allow policy setting after
1113 			 * we start listening for connections.
1114 			 */
1115 			if (tcp->tcp_state == TCPS_LISTEN) {
1116 				return (EINVAL);
1117 			}
1118 			break;
1119 		}
1120 		break;
1121 	}
1122 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1123 	    checkonly, cr);
1124 	if (reterr != 0) {
1125 		*outlenp = 0;
1126 		return (reterr);
1127 	}
1128 
1129 	/*
1130 	 * Common case of OK return with outval same as inval
1131 	 */
1132 	if (invalp != outvalp) {
1133 		/* don't trust bcopy for identical src/dst */
1134 		(void) bcopy(invalp, outvalp, inlen);
1135 	}
1136 	*outlenp = inlen;
1137 
1138 	if (coas.coa_changed & COA_HEADER_CHANGED) {
1139 		/* If we are connected we rebuild the headers */
1140 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1141 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1142 			reterr = tcp_build_hdrs(tcp);
1143 			if (reterr != 0)
1144 				return (reterr);
1145 		}
1146 	}
1147 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1148 		in6_addr_t nexthop;
1149 
1150 		/*
1151 		 * If we are connected we re-cache the information.
1152 		 * We ignore errors to preserve BSD behavior.
1153 		 * Note that we don't redo IPsec policy lookup here
1154 		 * since the final destination (or source) didn't change.
1155 		 */
1156 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1157 		    &connp->conn_faddr_v6, &nexthop);
1158 
1159 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1160 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1161 			(void) ip_attr_connect(connp, connp->conn_ixa,
1162 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1163 			    &nexthop, connp->conn_fport, NULL, NULL,
1164 			    IPDF_VERIFY_DST);
1165 		}
1166 	}
1167 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1168 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1169 	}
1170 	if (coas.coa_changed & COA_WROFF_CHANGED) {
1171 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1172 		    tcps->tcps_wroff_xtra;
1173 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1174 		    connp->conn_wroff);
1175 	}
1176 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1177 		if (IPCL_IS_NONSTR(connp))
1178 			proto_set_rx_oob_opt(connp, onoff);
1179 	}
1180 	return (0);
1181 }
1182