1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
27  * Copyright 2024 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/socket.h>
35 #include <sys/xti_xtiopt.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 
39 #include <inet/cc.h>
40 #include <inet/common.h>
41 #include <netinet/ip6.h>
42 #include <inet/ip.h>
43 
44 #include <netinet/in.h>
45 #include <netinet/tcp.h>
46 #include <inet/optcom.h>
47 #include <inet/proto_set.h>
48 #include <inet/tcp_impl.h>
49 
50 static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
51 
52 /*
53  * Table of all known options handled on a TCP protocol stack.
54  *
55  * Note: This table contains options processed by both TCP and IP levels
56  *       and is the superset of options that can be performed on a TCP over IP
57  *       stack.
58  */
59 opdes_t	tcp_opt_arr[] = {
60 
61 { SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
62 	sizeof (struct linger), 0 },
63 
64 { SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65 { SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66 { SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
68 	},
69 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
70 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
71 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
72 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
73 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
74 { SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
75 { SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
76 	sizeof (struct timeval), 0 },
77 { SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
78 	sizeof (struct timeval), 0 },
79 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
80 	},
81 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
82 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
83 	0 },
84 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
85 	0 },
86 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
87 	0 },
88 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
89 	0 },
90 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
91 
92 { SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
93 
94 { SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
95 
96 { TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
97 	},
98 { TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
99 	536 },
100 
101 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
102 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
103 
104 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
105 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
106 
107 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
108 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
109 
110 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
111 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
112 
113 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
114 	0 },
115 
116 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
117 	sizeof (int), 0 },
118 
119 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
120 	},
121 
122 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
123 	sizeof (int), 0 },
124 
125 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
126 	sizeof (int), 0	},
127 
128 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
129 
130 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
131 
132 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
133 
134 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
135 	sizeof (int), 0	},
136 
137 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
138 
139 { TCP_QUICKACK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
140 
141 { TCP_MD5SIG, IPPROTO_TCP, OA_W, OA_W, OP_NP, 0, sizeof (int), 0 },
142 
143 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
144 
145 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
146 
147 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
148 
149 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
150 
151 { TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
152 	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },
153 
154 { IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
155 	(OP_VARLEN|OP_NODEFAULT),
156 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
157 { T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
158 	(OP_VARLEN|OP_NODEFAULT),
159 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
160 
161 { IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
162 { T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
163 { IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
164 	sizeof (int), -1 /* not initialized */ },
165 { IP_RECVTOS,	IPPROTO_IP,  OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
166 
167 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
168 	sizeof (ipsec_req_t), -1 /* not initialized */ },
169 
170 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
171 	sizeof (int),	0 /* no ifindex */ },
172 
173 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
174 	sizeof (int), 0 },
175 
176 { IP_MINTTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
177 
178 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
179 	sizeof (int), -1 /* not initialized */ },
180 
181 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
182 	sizeof (int),	0 /* no ifindex */ },
183 
184 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
185 
186 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
187 	sizeof (in_addr_t),	-1 /* not initialized  */ },
188 
189 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
190 	sizeof (int), 0 },
191 
192 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
193 	(OP_NODEFAULT|OP_VARLEN),
194 	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
195 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
196 	OP_NODEFAULT,
197 	sizeof (sin6_t), -1 /* not initialized */ },
198 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
199 	(OP_VARLEN|OP_NODEFAULT), 255*8,
200 	-1 /* not initialized */ },
201 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
202 	(OP_VARLEN|OP_NODEFAULT), 255*8,
203 	-1 /* not initialized */ },
204 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
205 	(OP_VARLEN|OP_NODEFAULT), 255*8,
206 	-1 /* not initialized */ },
207 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
208 	(OP_VARLEN|OP_NODEFAULT), 255*8,
209 	-1 /* not initialized */ },
210 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
211 	OP_NODEFAULT,
212 	sizeof (int), -1 /* not initialized */ },
213 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
214 	OP_NODEFAULT,
215 	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
216 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
217 	sizeof (int), 0 },
218 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
219 	sizeof (int), 0 },
220 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
221 	sizeof (int), 0 },
222 
223 /* Enable receipt of ancillary data */
224 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
225 	sizeof (int), 0 },
226 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
227 	sizeof (int), 0 },
228 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
229 	sizeof (int), 0 },
230 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
231 	sizeof (int), 0 },
232 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
233 	sizeof (int), 0 },
234 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
235 	sizeof (int), 0 },
236 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
237 	sizeof (int), 0 },
238 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
239 	sizeof (int), 0 },
240 
241 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
242 	sizeof (ipsec_req_t), -1 /* not initialized */ },
243 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
244 	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
245 
246 { IPV6_MINHOPCOUNT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
247 	sizeof (int), 0 },
248 };
249 
250 /*
251  * Table of all supported levels
252  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
253  * any supported options so we need this info separately.
254  *
255  * This is needed only for topmost tpi providers and is used only by
256  * XTI interfaces.
257  */
258 optlevel_t	tcp_valid_levels_arr[] = {
259 	XTI_GENERIC,
260 	SOL_SOCKET,
261 	IPPROTO_TCP,
262 	IPPROTO_IP,
263 	IPPROTO_IPV6
264 };
265 
266 
/*
 * Entry counts of tcp_opt_arr and tcp_valid_levels_arr, as stored in
 * tcp_opt_obj.
 * NOTE(review): A_CNT is defined outside this file; presumably it yields
 * the number of elements of an array -- confirm.
 */
#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)

uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
271 
/*
 * Option database object for TCP.
 *
 * It bundles the option table, the valid-level table and the TCP
 * default/get/set handlers into the single object that is passed to the
 * {sock,tpi}optcom_req() interface routines, which take care of option
 * management and the associated methods.
 *
 * The initializer is positional, so the order of the members below must
 * follow the field order of optdb_obj_t (declared outside this file).
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default,	/* TCP default value function pointer */
	tcp_tpi_opt_get,	/* TCP get function pointer */
	tcp_tpi_opt_set,	/* TCP set function pointer */
	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
	tcp_opt_arr,		/* TCP option database */
	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
	tcp_valid_levels_arr	/* TCP valid level array */
};
289 
/*
 * Ceiling for TCP_INIT_CWND: in tcp_opt_set(), a request that is above the
 * RFC 3390 allowance and also exceeds this value is rejected with EINVAL
 * even when the caller has passed the privilege check.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
291 
292 /*
293  * Some TCP options can be "set" by requesting them in the option
294  * buffer. This is needed for XTI feature test though we do not
295  * allow it in general. We interpret that this mechanism is more
296  * applicable to OSI protocols and need not be allowed in general.
297  * This routine filters out options for which it is not allowed (most)
298  * and lets through those (few) for which it is. [ The XTI interface
299  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
300  * ever implemented will have to be allowed here ].
301  */
302 static boolean_t
tcp_allow_connopt_set(int level,int name)303 tcp_allow_connopt_set(int level, int name)
304 {
305 
306 	switch (level) {
307 	case IPPROTO_TCP:
308 		switch (name) {
309 		case TCP_NODELAY:
310 			return (B_TRUE);
311 		default:
312 			return (B_FALSE);
313 		}
314 		/*NOTREACHED*/
315 	default:
316 		return (B_FALSE);
317 	}
318 	/*NOTREACHED*/
319 }
320 
321 /*
322  * This routine gets default values of certain options whose default
323  * values are maintained by protocol specific code
324  */
325 /* ARGSUSED */
326 static int
tcp_opt_default(queue_t * q,int level,int name,uchar_t * ptr)327 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
328 {
329 	int32_t	*i1 = (int32_t *)ptr;
330 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
331 
332 	switch (level) {
333 	case IPPROTO_TCP:
334 		switch (name) {
335 		case TCP_NOTIFY_THRESHOLD:
336 			*i1 = tcps->tcps_ip_notify_interval;
337 			break;
338 		case TCP_ABORT_THRESHOLD:
339 			*i1 = tcps->tcps_ip_abort_interval;
340 			break;
341 		case TCP_CONN_NOTIFY_THRESHOLD:
342 			*i1 = tcps->tcps_ip_notify_cinterval;
343 			break;
344 		case TCP_CONN_ABORT_THRESHOLD:
345 			*i1 = tcps->tcps_ip_abort_cinterval;
346 			break;
347 		default:
348 			return (-1);
349 		}
350 		break;
351 	case IPPROTO_IP:
352 		switch (name) {
353 		case IP_TTL:
354 			*i1 = tcps->tcps_ipv4_ttl;
355 			break;
356 		default:
357 			return (-1);
358 		}
359 		break;
360 	case IPPROTO_IPV6:
361 		switch (name) {
362 		case IPV6_UNICAST_HOPS:
363 			*i1 = tcps->tcps_ipv6_hoplimit;
364 			break;
365 		default:
366 			return (-1);
367 		}
368 		break;
369 	default:
370 		return (-1);
371 	}
372 	return (sizeof (int));
373 }
374 
375 /*
376  * TCP routine to get the values of options.
377  */
378 int
tcp_opt_get(conn_t * connp,int level,int name,uchar_t * ptr)379 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
380 {
381 	int		*i1 = (int *)ptr;
382 	tcp_t		*tcp = connp->conn_tcp;
383 	conn_opt_arg_t	coas;
384 	int		retval;
385 
386 	coas.coa_connp = connp;
387 	coas.coa_ixa = connp->conn_ixa;
388 	coas.coa_ipp = &connp->conn_xmit_ipp;
389 	coas.coa_ancillary = B_FALSE;
390 	coas.coa_changed = 0;
391 
392 	switch (level) {
393 	case SOL_SOCKET:
394 		switch (name) {
395 		case SO_SND_COPYAVOID:
396 			*i1 = tcp->tcp_snd_zcopy_on ?
397 			    SO_SND_COPYAVOID : 0;
398 			return (sizeof (int));
399 		case SO_ACCEPTCONN:
400 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
401 			return (sizeof (int));
402 		}
403 		break;
404 	case IPPROTO_TCP:
405 		switch (name) {
406 		case TCP_NODELAY:
407 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
408 			return (sizeof (int));
409 		case TCP_MAXSEG:
410 			*i1 = tcp->tcp_mss;
411 			return (sizeof (int));
412 		case TCP_NOTIFY_THRESHOLD:
413 			*i1 = (int)tcp->tcp_first_timer_threshold;
414 			return (sizeof (int));
415 		case TCP_ABORT_THRESHOLD:
416 			*i1 = tcp->tcp_second_timer_threshold;
417 			return (sizeof (int));
418 		case TCP_CONN_NOTIFY_THRESHOLD:
419 			*i1 = tcp->tcp_first_ctimer_threshold;
420 			return (sizeof (int));
421 		case TCP_CONN_ABORT_THRESHOLD:
422 			*i1 = tcp->tcp_second_ctimer_threshold;
423 			return (sizeof (int));
424 		case TCP_INIT_CWND:
425 			*i1 = tcp->tcp_init_cwnd;
426 			return (sizeof (int));
427 		case TCP_KEEPALIVE_THRESHOLD:
428 			*i1 = tcp->tcp_ka_interval;
429 			return (sizeof (int));
430 
431 		/*
432 		 * TCP_KEEPIDLE expects value in seconds, but
433 		 * tcp_ka_interval is in milliseconds.
434 		 */
435 		case TCP_KEEPIDLE:
436 			*i1 = tcp->tcp_ka_interval / 1000;
437 			return (sizeof (int));
438 		case TCP_KEEPCNT:
439 			*i1 = tcp->tcp_ka_cnt;
440 			return (sizeof (int));
441 
442 		/*
443 		 * TCP_KEEPINTVL expects value in seconds, but
444 		 * tcp_ka_rinterval is in milliseconds.
445 		 */
446 		case TCP_KEEPINTVL:
447 			*i1 = tcp->tcp_ka_rinterval / 1000;
448 			return (sizeof (int));
449 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
450 			*i1 = tcp->tcp_ka_abort_thres;
451 			return (sizeof (int));
452 		case TCP_CONGESTION: {
453 			size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
454 			    CC_ALGO_NAME_MAX);
455 			if (len >= CC_ALGO_NAME_MAX)
456 				return (-1);
457 			return (len + 1);
458 		}
459 		case TCP_CORK:
460 			*i1 = tcp->tcp_cork;
461 			return (sizeof (int));
462 		case TCP_QUICKACK:
463 			*i1 = tcp->tcp_quickack;
464 			return (sizeof (int));
465 		case TCP_MD5SIG:
466 			*i1 = tcp->tcp_md5sig;
467 			return (sizeof (int));
468 		case TCP_RTO_INITIAL:
469 			*i1 = tcp->tcp_rto_initial;
470 			return (sizeof (uint32_t));
471 		case TCP_RTO_MIN:
472 			*i1 = tcp->tcp_rto_min;
473 			return (sizeof (uint32_t));
474 		case TCP_RTO_MAX:
475 			*i1 = tcp->tcp_rto_max;
476 			return (sizeof (uint32_t));
477 		case TCP_LINGER2:
478 			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
479 			return (sizeof (int));
480 		}
481 		break;
482 	case IPPROTO_IP:
483 		if (connp->conn_family != AF_INET)
484 			return (-1);
485 		switch (name) {
486 		case IP_OPTIONS:
487 		case T_IP_OPTIONS:
488 			/* Caller ensures enough space */
489 			return (ip_opt_get_user(connp, ptr));
490 		default:
491 			break;
492 		}
493 		break;
494 
495 	case IPPROTO_IPV6:
496 		/*
497 		 * IPPROTO_IPV6 options are only supported for sockets
498 		 * that are using IPv6 on the wire.
499 		 */
500 		if (connp->conn_ipversion != IPV6_VERSION) {
501 			return (-1);
502 		}
503 		switch (name) {
504 		case IPV6_PATHMTU:
505 			if (tcp->tcp_state < TCPS_ESTABLISHED)
506 				return (-1);
507 			break;
508 		}
509 		break;
510 	}
511 	mutex_enter(&connp->conn_lock);
512 	retval = conn_opt_get(&coas, level, name, ptr);
513 	mutex_exit(&connp->conn_lock);
514 	return (retval);
515 }
516 
517 /*
518  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
519  * Parameters are assumed to be verified by the caller.
520  */
521 /* ARGSUSED */
522 int
tcp_opt_set(conn_t * connp,uint_t optset_context,int level,int name,uint_t inlen,uchar_t * invalp,uint_t * outlenp,uchar_t * outvalp,void * thisdg_attrs,cred_t * cr)523 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
524     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
525     void *thisdg_attrs, cred_t *cr)
526 {
527 	tcp_t	*tcp = connp->conn_tcp;
528 	int	*i1 = (int *)invalp;
529 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
530 	boolean_t checkonly;
531 	int	reterr;
532 	tcp_stack_t	*tcps = tcp->tcp_tcps;
533 	conn_opt_arg_t	coas;
534 	uint32_t	val = *((uint32_t *)invalp);
535 
536 	coas.coa_connp = connp;
537 	coas.coa_ixa = connp->conn_ixa;
538 	coas.coa_ipp = &connp->conn_xmit_ipp;
539 	coas.coa_ancillary = B_FALSE;
540 	coas.coa_changed = 0;
541 
542 	switch (optset_context) {
543 	case SETFN_OPTCOM_CHECKONLY:
544 		checkonly = B_TRUE;
545 		/*
546 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
547 		 * inlen != 0 implies value supplied and
548 		 *	we have to "pretend" to set it.
549 		 * inlen == 0 implies that there is no
550 		 *	value part in T_CHECK request and just validation
551 		 * done elsewhere should be enough, we just return here.
552 		 */
553 		if (inlen == 0) {
554 			*outlenp = 0;
555 			return (0);
556 		}
557 		break;
558 	case SETFN_OPTCOM_NEGOTIATE:
559 		checkonly = B_FALSE;
560 		break;
561 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
562 	case SETFN_CONN_NEGOTIATE:
563 		checkonly = B_FALSE;
564 		/*
565 		 * Negotiating local and "association-related" options
566 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
567 		 * primitives is allowed by XTI, but we choose
568 		 * to not implement this style negotiation for Internet
569 		 * protocols (We interpret it is a must for OSI world but
570 		 * optional for Internet protocols) for all options.
571 		 * [ Will do only for the few options that enable test
572 		 * suites that our XTI implementation of this feature
573 		 * works for transports that do allow it ]
574 		 */
575 		if (!tcp_allow_connopt_set(level, name)) {
576 			*outlenp = 0;
577 			return (EINVAL);
578 		}
579 		break;
580 	default:
581 		/*
582 		 * We should never get here
583 		 */
584 		*outlenp = 0;
585 		return (EINVAL);
586 	}
587 
588 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
589 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
590 
591 	/*
592 	 * For TCP, we should have no ancillary data sent down
593 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
594 	 * has to be zero.
595 	 */
596 	ASSERT(thisdg_attrs == NULL);
597 
598 	/*
599 	 * For fixed length options, no sanity check
600 	 * of passed in length is done. It is assumed *_optcom_req()
601 	 * routines do the right thing.
602 	 */
603 	switch (level) {
604 	case SOL_SOCKET:
605 		switch (name) {
606 		case SO_KEEPALIVE:
607 			if (checkonly) {
608 				/* check only case */
609 				break;
610 			}
611 
612 			if (!onoff) {
613 				if (connp->conn_keepalive) {
614 					if (tcp->tcp_ka_tid != 0) {
615 						(void) TCP_TIMER_CANCEL(tcp,
616 						    tcp->tcp_ka_tid);
617 						tcp->tcp_ka_tid = 0;
618 					}
619 					connp->conn_keepalive = 0;
620 				}
621 				break;
622 			}
623 			if (!connp->conn_keepalive) {
624 				/* Crank up the keepalive timer */
625 				tcp->tcp_ka_last_intrvl = 0;
626 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
627 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
628 				connp->conn_keepalive = 1;
629 			}
630 			break;
631 		case SO_SNDBUF: {
632 			if (*i1 > tcps->tcps_max_buf) {
633 				*outlenp = 0;
634 				return (ENOBUFS);
635 			}
636 			if (checkonly)
637 				break;
638 
639 			connp->conn_sndbuf = *i1;
640 			if (tcps->tcps_snd_lowat_fraction != 0) {
641 				connp->conn_sndlowat = connp->conn_sndbuf /
642 				    tcps->tcps_snd_lowat_fraction;
643 			}
644 			(void) tcp_maxpsz_set(tcp, B_TRUE);
645 			/*
646 			 * If we are flow-controlled, recheck the condition.
647 			 * There are apps that increase SO_SNDBUF size when
648 			 * flow-controlled (EWOULDBLOCK), and expect the flow
649 			 * control condition to be lifted right away.
650 			 */
651 			mutex_enter(&tcp->tcp_non_sq_lock);
652 			if (tcp->tcp_flow_stopped &&
653 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
654 				tcp_clrqfull(tcp);
655 			}
656 			mutex_exit(&tcp->tcp_non_sq_lock);
657 			*outlenp = inlen;
658 			return (0);
659 		}
660 		case SO_RCVBUF:
661 			if (*i1 > tcps->tcps_max_buf) {
662 				*outlenp = 0;
663 				return (ENOBUFS);
664 			}
665 			/* Silently ignore zero */
666 			if (!checkonly && *i1 != 0) {
667 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
668 				(void) tcp_rwnd_set(tcp, *i1);
669 			}
670 			/*
671 			 * XXX should we return the rwnd here
672 			 * and tcp_opt_get ?
673 			 */
674 			*outlenp = inlen;
675 			return (0);
676 		case SO_SND_COPYAVOID:
677 			if (!checkonly) {
678 				if (tcp->tcp_loopback ||
679 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
680 					*outlenp = 0;
681 					return (EOPNOTSUPP);
682 				}
683 				tcp->tcp_snd_zcopy_aware = 1;
684 			}
685 			*outlenp = inlen;
686 			return (0);
687 		}
688 		break;
689 	case IPPROTO_TCP:
690 		switch (name) {
691 		case TCP_NODELAY:
692 			if (!checkonly)
693 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
694 			break;
695 		case TCP_NOTIFY_THRESHOLD:
696 			if (!checkonly)
697 				tcp->tcp_first_timer_threshold = *i1;
698 			break;
699 		case TCP_ABORT_THRESHOLD:
700 			if (!checkonly)
701 				tcp->tcp_second_timer_threshold = *i1;
702 			break;
703 		case TCP_CONN_NOTIFY_THRESHOLD:
704 			if (!checkonly)
705 				tcp->tcp_first_ctimer_threshold = *i1;
706 			break;
707 		case TCP_CONN_ABORT_THRESHOLD:
708 			if (!checkonly)
709 				tcp->tcp_second_ctimer_threshold = *i1;
710 			break;
711 		case TCP_RECVDSTADDR:
712 			if (tcp->tcp_state > TCPS_LISTEN) {
713 				*outlenp = 0;
714 				return (EOPNOTSUPP);
715 			}
716 			/* Setting done in conn_opt_set */
717 			break;
718 		case TCP_INIT_CWND:
719 			if (checkonly)
720 				break;
721 
722 			/*
723 			 * Only allow socket with network configuration
724 			 * privilege to set the initial cwnd to be larger
725 			 * than allowed by RFC 3390.
726 			 */
727 			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
728 				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
729 				    != 0) {
730 					*outlenp = 0;
731 					return (reterr);
732 				}
733 				if (val > tcp_max_init_cwnd) {
734 					*outlenp = 0;
735 					return (EINVAL);
736 				}
737 			}
738 
739 			tcp->tcp_init_cwnd = val;
740 
741 			/*
742 			 * If the socket is connected, AND no outbound data
743 			 * has been sent, reset the actual cwnd values.
744 			 */
745 			if (tcp->tcp_state == TCPS_ESTABLISHED &&
746 			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
747 				tcp->tcp_cwnd =
748 				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
749 			}
750 			break;
751 
752 		/*
753 		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
754 		 * is in milliseconds. TCP_KEEPIDLE is introduced for
755 		 * compatibility with other Unix flavors.
756 		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
757 		 * converting the input to milliseconds.
758 		 */
759 		case TCP_KEEPIDLE:
760 			*i1 *= 1000;
761 			/* FALLTHRU */
762 
763 		case TCP_KEEPALIVE_THRESHOLD:
764 			if (checkonly)
765 				break;
766 
767 			if (*i1 < tcps->tcps_keepalive_interval_low ||
768 			    *i1 > tcps->tcps_keepalive_interval_high) {
769 				*outlenp = 0;
770 				return (EINVAL);
771 			}
772 			if (*i1 != tcp->tcp_ka_interval) {
773 				tcp->tcp_ka_interval = *i1;
774 				/*
775 				 * Check if we need to restart the
776 				 * keepalive timer.
777 				 */
778 				if (tcp->tcp_ka_tid != 0) {
779 					ASSERT(connp->conn_keepalive);
780 					(void) TCP_TIMER_CANCEL(tcp,
781 					    tcp->tcp_ka_tid);
782 					tcp->tcp_ka_last_intrvl = 0;
783 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
784 					    tcp_keepalive_timer,
785 					    tcp->tcp_ka_interval);
786 				}
787 			}
788 			break;
789 
790 		/*
791 		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
792 		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
793 		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
794 		 * tcp_ka_cnt.
795 		 */
796 		case TCP_KEEPCNT:
797 			if (checkonly)
798 				break;
799 
800 			if (*i1 == 0) {
801 				return (EINVAL);
802 			} else if (tcp->tcp_ka_rinterval == 0) {
803 				/*
804 				 * When TCP_KEEPCNT is specified without first
805 				 * specifying a TCP_KEEPINTVL, we infer an
806 				 * interval based on a tunable specific to our
807 				 * stack: the tcp_keepalive_abort_interval.
808 				 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
809 				 * the unlikely event that that has been set.)
810 				 * Given the abort interval's default value of
811 				 * 480 seconds, low TCP_KEEPCNT values can
812 				 * result in intervals that exceed the default
813 				 * maximum RTO of 60 seconds.  Rather than
814 				 * fail in these cases, we (implicitly) clamp
815 				 * the interval at the maximum RTO; if the
816 				 * TCP_KEEPCNT is shortly followed by a
817 				 * TCP_KEEPINTVL (as we expect), the abort
818 				 * threshold will be recalculated correctly --
819 				 * and if a TCP_KEEPINTVL is not forthcoming,
820 				 * keep-alive will at least operate reasonably
821 				 * given the underconfigured state.
822 				 */
823 				uint32_t interval;
824 
825 				interval = tcp->tcp_ka_abort_thres / *i1;
826 
827 				if (interval < tcp->tcp_rto_min)
828 					interval = tcp->tcp_rto_min;
829 
830 				if (interval > tcp->tcp_rto_max)
831 					interval = tcp->tcp_rto_max;
832 
833 				tcp->tcp_ka_rinterval = interval;
834 			} else {
835 				if ((*i1 * tcp->tcp_ka_rinterval) <
836 				    tcps->tcps_keepalive_abort_interval_low ||
837 				    (*i1 * tcp->tcp_ka_rinterval) >
838 				    tcps->tcps_keepalive_abort_interval_high)
839 					return (EINVAL);
840 				tcp->tcp_ka_abort_thres =
841 				    (*i1 * tcp->tcp_ka_rinterval);
842 			}
843 			tcp->tcp_ka_cnt = *i1;
844 			break;
845 		case TCP_KEEPINTVL:
846 			/*
847 			 * TCP_KEEPINTVL is specified in seconds, but
848 			 * tcp_ka_rinterval is in milliseconds.
849 			 */
850 
851 			if (checkonly)
852 				break;
853 
854 			if ((*i1 * 1000) < tcp->tcp_rto_min ||
855 			    (*i1 * 1000) > tcp->tcp_rto_max)
856 				return (EINVAL);
857 
858 			if (tcp->tcp_ka_cnt == 0) {
859 				tcp->tcp_ka_cnt =
860 				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
861 			} else {
862 				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
863 				    tcps->tcps_keepalive_abort_interval_low ||
864 				    (*i1 * tcp->tcp_ka_cnt * 1000) >
865 				    tcps->tcps_keepalive_abort_interval_high)
866 					return (EINVAL);
867 				tcp->tcp_ka_abort_thres =
868 				    (*i1 * tcp->tcp_ka_cnt * 1000);
869 			}
870 			tcp->tcp_ka_rinterval = *i1 * 1000;
871 			break;
872 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
873 			if (!checkonly) {
874 				if (*i1 <
875 				    tcps->tcps_keepalive_abort_interval_low ||
876 				    *i1 >
877 				    tcps->tcps_keepalive_abort_interval_high) {
878 					*outlenp = 0;
879 					return (EINVAL);
880 				}
881 				tcp->tcp_ka_abort_thres = *i1;
882 				tcp->tcp_ka_cnt = 0;
883 				tcp->tcp_ka_rinterval = 0;
884 			}
885 			break;
886 		case TCP_CONGESTION: {
887 			struct cc_algo *algo;
888 
889 			if (checkonly) {
890 				break;
891 			}
892 
893 			/*
894 			 * Make sure the string is NUL-terminated. Some
895 			 * consumers pass only the number of characters
896 			 * in the string, and don't include the NUL
897 			 * terminator, so we set it for them.
898 			 */
899 			if (inlen < CC_ALGO_NAME_MAX) {
900 				invalp[inlen] = '\0';
901 			}
902 			invalp[CC_ALGO_NAME_MAX - 1] = '\0';
903 
904 			if ((algo = cc_load_algo((char *)invalp)) == NULL) {
905 				return (ENOENT);
906 			}
907 
908 			if (CC_ALGO(tcp)->cb_destroy != NULL) {
909 				CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
910 			}
911 
912 			CC_DATA(tcp) = NULL;
913 			CC_ALGO(tcp) = algo;
914 
915 			if (CC_ALGO(tcp)->cb_init != NULL) {
916 				VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
917 			}
918 
919 			break;
920 		}
921 		case TCP_CORK:
922 			if (!checkonly) {
923 				/*
924 				 * if tcp->tcp_cork was set and is now
925 				 * being unset, we have to make sure that
926 				 * the remaining data gets sent out. Also
927 				 * unset tcp->tcp_cork so that tcp_wput_data()
928 				 * can send data even if it is less than mss
929 				 */
930 				if (tcp->tcp_cork && onoff == 0 &&
931 				    tcp->tcp_unsent > 0) {
932 					tcp->tcp_cork = B_FALSE;
933 					tcp_wput_data(tcp, NULL, B_FALSE);
934 				}
935 				tcp->tcp_cork = onoff;
936 			}
937 			break;
938 		case TCP_QUICKACK:
939 			if (!checkonly) {
940 				tcp->tcp_quickack = onoff;
941 			}
942 			break;
943 		case TCP_MD5SIG:
944 			if (!checkonly) {
945 				tcp->tcp_md5sig = onoff;
946 			}
947 			break;
948 		case TCP_RTO_INITIAL:
949 			if (checkonly || val == 0)
950 				break;
951 
952 			/*
953 			 * Sanity checks
954 			 *
955 			 * The initial RTO should be bounded by the minimum
956 			 * and maximum RTO.  And it should also be smaller
957 			 * than the connect attempt abort timeout.  Otherwise,
958 			 * the connection won't be aborted in a period
959 			 * reasonably close to that timeout.
960 			 */
961 			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
962 			    val > tcp->tcp_second_ctimer_threshold ||
963 			    val < tcps->tcps_rexmit_interval_initial_low ||
964 			    val > tcps->tcps_rexmit_interval_initial_high) {
965 				*outlenp = 0;
966 				return (EINVAL);
967 			}
968 			tcp->tcp_rto_initial = val;
969 
970 			/*
971 			 * If TCP has not sent anything, need to re-calculate
972 			 * tcp_rto.  Otherwise, this option change does not
973 			 * really affect anything.
974 			 */
975 			if (tcp->tcp_state >= TCPS_SYN_SENT)
976 				break;
977 
978 			tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
979 			tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
980 			tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
981 			    tcps->tcps_conn_grace_period);
982 			break;
983 		case TCP_RTO_MIN:
984 			if (checkonly || val == 0)
985 				break;
986 
987 			if (val < tcps->tcps_rexmit_interval_min_low ||
988 			    val > tcps->tcps_rexmit_interval_min_high ||
989 			    val > tcp->tcp_rto_max) {
990 				*outlenp = 0;
991 				return (EINVAL);
992 			}
993 			tcp->tcp_rto_min = val;
994 			if (tcp->tcp_rto < val)
995 				tcp->tcp_rto = val;
996 			break;
997 		case TCP_RTO_MAX:
998 			if (checkonly || val == 0)
999 				break;
1000 
1001 			/*
1002 			 * Sanity checks
1003 			 *
1004 			 * The maximum RTO should not be larger than the
1005 			 * connection abort timeout.  Otherwise, the
1006 			 * connection won't be aborted in a period reasonably
1007 			 * close to that timeout.
1008 			 */
1009 			if (val < tcps->tcps_rexmit_interval_max_low ||
1010 			    val > tcps->tcps_rexmit_interval_max_high ||
1011 			    val < tcp->tcp_rto_min ||
1012 			    val > tcp->tcp_second_timer_threshold) {
1013 				*outlenp = 0;
1014 				return (EINVAL);
1015 			}
1016 			tcp->tcp_rto_max = val;
1017 			if (tcp->tcp_rto > val)
1018 				tcp->tcp_rto = val;
1019 			break;
1020 		case TCP_LINGER2:
1021 			if (checkonly || *i1 == 0)
1022 				break;
1023 
1024 			/*
1025 			 * Note that the option value's unit is second.  And
1026 			 * the value should be bigger than the private
1027 			 * parameter tcp_fin_wait_2_flush_interval's lower
1028 			 * bound and smaller than the current value of that
1029 			 * parameter.  It should be smaller than the current
1030 			 * value to avoid an app setting TCP_LINGER2 to a big
1031 			 * value, causing resource to be held up too long in
1032 			 * FIN-WAIT-2 state.
1033 			 */
1034 			if (*i1 < 0 ||
1035 			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1036 			    *i1 ||
1037 			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1038 			    *i1) {
1039 				*outlenp = 0;
1040 				return (EINVAL);
1041 			}
1042 			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1043 			break;
1044 		default:
1045 			break;
1046 		}
1047 		break;
1048 	case IPPROTO_IP:
1049 		if (connp->conn_family != AF_INET) {
1050 			*outlenp = 0;
1051 			return (EINVAL);
1052 		}
1053 		switch (name) {
1054 		case IP_SEC_OPT:
1055 			/*
1056 			 * We should not allow policy setting after
1057 			 * we start listening for connections.
1058 			 */
1059 			if (tcp->tcp_state == TCPS_LISTEN) {
1060 				return (EINVAL);
1061 			}
1062 			break;
1063 		case IP_RECVTOS:
1064 			if (!checkonly) {
1065 				/*
1066 				 * Force it to be sent up with the next msg
1067 				 * by setting it to a value which cannot
1068 				 * appear in a packet (TOS is only 8-bits)
1069 				 */
1070 				tcp->tcp_recvtos = 0xffffffffU;
1071 			}
1072 			break;
1073 		}
1074 		break;
1075 	case IPPROTO_IPV6:
1076 		/*
1077 		 * IPPROTO_IPV6 options are only supported for sockets
1078 		 * that are using IPv6 on the wire.
1079 		 */
1080 		if (connp->conn_ipversion != IPV6_VERSION) {
1081 			*outlenp = 0;
1082 			return (EINVAL);
1083 		}
1084 
1085 		switch (name) {
1086 		case IPV6_RECVPKTINFO:
1087 			if (!checkonly) {
1088 				/* Force it to be sent up with the next msg */
1089 				tcp->tcp_recvifindex = 0;
1090 			}
1091 			break;
1092 		case IPV6_RECVTCLASS:
1093 			if (!checkonly) {
1094 				/* Force it to be sent up with the next msg */
1095 				tcp->tcp_recvtclass = 0xffffffffU;
1096 			}
1097 			break;
1098 		case IPV6_RECVHOPLIMIT:
1099 			if (!checkonly) {
1100 				/* Force it to be sent up with the next msg */
1101 				tcp->tcp_recvhops = 0xffffffffU;
1102 			}
1103 			break;
1104 		case IPV6_PKTINFO:
1105 			/* This is an extra check for TCP */
1106 			if (inlen == sizeof (struct in6_pktinfo)) {
1107 				struct in6_pktinfo *pkti;
1108 
1109 				pkti = (struct in6_pktinfo *)invalp;
1110 				/*
1111 				 * RFC 3542 states that ipi6_addr must be
1112 				 * the unspecified address when setting the
1113 				 * IPV6_PKTINFO sticky socket option on a
1114 				 * TCP socket.
1115 				 */
1116 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1117 					return (EINVAL);
1118 			}
1119 			break;
1120 		case IPV6_SEC_OPT:
1121 			/*
1122 			 * We should not allow policy setting after
1123 			 * we start listening for connections.
1124 			 */
1125 			if (tcp->tcp_state == TCPS_LISTEN) {
1126 				return (EINVAL);
1127 			}
1128 			break;
1129 		}
1130 		break;
1131 	}
1132 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1133 	    checkonly, cr);
1134 	if (reterr != 0) {
1135 		*outlenp = 0;
1136 		return (reterr);
1137 	}
1138 
1139 	/*
1140 	 * Common case of OK return with outval same as inval
1141 	 */
1142 	if (invalp != outvalp) {
1143 		/* don't trust bcopy for identical src/dst */
1144 		(void) bcopy(invalp, outvalp, inlen);
1145 	}
1146 	*outlenp = inlen;
1147 
1148 	if (coas.coa_changed & COA_HEADER_CHANGED) {
		/* If we are connected we rebuild the headers */
1150 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1151 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1152 			reterr = tcp_build_hdrs(tcp);
1153 			if (reterr != 0)
1154 				return (reterr);
1155 		}
1156 	}
1157 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1158 		in6_addr_t nexthop;
1159 
1160 		/*
1161 		 * If we are connected we re-cache the information.
1162 		 * We ignore errors to preserve BSD behavior.
1163 		 * Note that we don't redo IPsec policy lookup here
1164 		 * since the final destination (or source) didn't change.
1165 		 */
1166 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1167 		    &connp->conn_faddr_v6, &nexthop);
1168 
1169 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1170 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1171 			(void) ip_attr_connect(connp, connp->conn_ixa,
1172 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1173 			    &nexthop, connp->conn_fport, NULL, NULL,
1174 			    IPDF_VERIFY_DST);
1175 		}
1176 	}
1177 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1178 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1179 	}
1180 	if (coas.coa_changed & COA_WROFF_CHANGED) {
1181 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1182 		    tcps->tcps_wroff_xtra;
1183 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1184 		    connp->conn_wroff);
1185 	}
1186 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1187 		if (IPCL_IS_NONSTR(connp))
1188 			proto_set_rx_oob_opt(connp, onoff);
1189 	}
1190 	return (0);
1191 }
1192