1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28#include <sys/types.h>
29#include <sys/stream.h>
30#define	_SUN_TPI_VERSION 2
31#include <sys/tihdr.h>
32#include <sys/socket.h>
33#include <sys/xti_xtiopt.h>
34#include <sys/xti_inet.h>
35#include <sys/policy.h>
36
37#include <inet/cc.h>
38#include <inet/common.h>
39#include <netinet/ip6.h>
40#include <inet/ip.h>
41
42#include <netinet/in.h>
43#include <netinet/tcp.h>
44#include <inet/optcom.h>
45#include <inet/proto_set.h>
46#include <inet/tcp_impl.h>
47
48static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
49
50/*
51 * Table of all known options handled on a TCP protocol stack.
52 *
53 * Note: This table contains options processed by both TCP and IP levels
54 *       and is the superset of options that can be performed on a TCP over IP
55 *       stack.
56 */
57opdes_t	tcp_opt_arr[] = {
58
59{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
60	sizeof (struct linger), 0 },
61
62{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
63{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
66	},
67{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
68{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
69{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
70{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
71{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
72{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
73{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
74	sizeof (struct timeval), 0 },
75{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
76	sizeof (struct timeval), 0 },
77{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
78	},
79{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
80{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
81	0 },
82{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
83	0 },
84{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
85	0 },
86{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
87	0 },
88{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
89
90{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
91
92{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
93
94{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
95	},
96{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
97	536 },
98
99{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
100	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
101
102{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
103	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
104
105{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
106	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
107
108{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
109	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
110
111{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
112	0 },
113
114{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
115	sizeof (int), 0 },
116
117{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
118	},
119
120{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
121	sizeof (int), 0 },
122
123{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
124	sizeof (int), 0	},
125
126{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
127
128{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
129
130{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
131
132{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
133	sizeof (int), 0	},
134
135{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
136
137{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
138
139{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
140
141{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
142
143{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
144
145{ TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
146	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },
147
148{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
149	(OP_VARLEN|OP_NODEFAULT),
150	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
151{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
152	(OP_VARLEN|OP_NODEFAULT),
153	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
154
155{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
156{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
157{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
158	sizeof (int), -1 /* not initialized */ },
159
160{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
161	sizeof (ipsec_req_t), -1 /* not initialized */ },
162
163{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
164	sizeof (int),	0 /* no ifindex */ },
165
166{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
167	sizeof (int), 0 },
168
169{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
170	sizeof (int), -1 /* not initialized */ },
171
172{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
173	sizeof (int),	0 /* no ifindex */ },
174
175{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
176
177{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
178	sizeof (in_addr_t),	-1 /* not initialized  */ },
179
180{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
181	sizeof (int), 0 },
182
183{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
184	(OP_NODEFAULT|OP_VARLEN),
185	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
186{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
187	OP_NODEFAULT,
188	sizeof (sin6_t), -1 /* not initialized */ },
189{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
190	(OP_VARLEN|OP_NODEFAULT), 255*8,
191	-1 /* not initialized */ },
192{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
193	(OP_VARLEN|OP_NODEFAULT), 255*8,
194	-1 /* not initialized */ },
195{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
196	(OP_VARLEN|OP_NODEFAULT), 255*8,
197	-1 /* not initialized */ },
198{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
199	(OP_VARLEN|OP_NODEFAULT), 255*8,
200	-1 /* not initialized */ },
201{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
202	OP_NODEFAULT,
203	sizeof (int), -1 /* not initialized */ },
204{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
205	OP_NODEFAULT,
206	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
207{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
208	sizeof (int), 0 },
209{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
210	sizeof (int), 0 },
211{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212	sizeof (int), 0 },
213
214/* Enable receipt of ancillary data */
215{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
216	sizeof (int), 0 },
217{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
218	sizeof (int), 0 },
219{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220	sizeof (int), 0 },
221{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
222	sizeof (int), 0 },
223{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
224	sizeof (int), 0 },
225{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
226	sizeof (int), 0 },
227{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
228	sizeof (int), 0 },
229{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
230	sizeof (int), 0 },
231
232{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
233	sizeof (ipsec_req_t), -1 /* not initialized */ },
234{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
235	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
236};
237
238/*
239 * Table of all supported levels
240 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
241 * any supported options so we need this info separately.
242 *
243 * This is needed only for topmost tpi providers and is used only by
244 * XTI interfaces.
245 */
246optlevel_t	tcp_valid_levels_arr[] = {
247	XTI_GENERIC,
248	SOL_SOCKET,
249	IPPROTO_TCP,
250	IPPROTO_IP,
251	IPPROTO_IPV6
252};
253
254
255#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
256#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
257
258uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
259
260/*
261 * Initialize option database object for TCP
262 *
263 * This object represents database of options to search passed to
264 * {sock,tpi}optcom_req() interface routine to take care of option
265 * management and associated methods.
266 */
267
268optdb_obj_t tcp_opt_obj = {
269	tcp_opt_default,	/* TCP default value function pointer */
270	tcp_tpi_opt_get,	/* TCP get function pointer */
271	tcp_tpi_opt_set,	/* TCP set function pointer */
272	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
273	tcp_opt_arr,		/* TCP option database */
274	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
275	tcp_valid_levels_arr	/* TCP valid level array */
276};
277
278static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
279
280/*
281 * Some TCP options can be "set" by requesting them in the option
282 * buffer. This is needed for XTI feature test though we do not
283 * allow it in general. We interpret that this mechanism is more
284 * applicable to OSI protocols and need not be allowed in general.
285 * This routine filters out options for which it is not allowed (most)
286 * and lets through those (few) for which it is. [ The XTI interface
287 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
288 * ever implemented will have to be allowed here ].
289 */
290static boolean_t
291tcp_allow_connopt_set(int level, int name)
292{
293
294	switch (level) {
295	case IPPROTO_TCP:
296		switch (name) {
297		case TCP_NODELAY:
298			return (B_TRUE);
299		default:
300			return (B_FALSE);
301		}
302		/*NOTREACHED*/
303	default:
304		return (B_FALSE);
305	}
306	/*NOTREACHED*/
307}
308
309/*
310 * This routine gets default values of certain options whose default
311 * values are maintained by protocol specific code
312 */
313/* ARGSUSED */
314static int
315tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
316{
317	int32_t	*i1 = (int32_t *)ptr;
318	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
319
320	switch (level) {
321	case IPPROTO_TCP:
322		switch (name) {
323		case TCP_NOTIFY_THRESHOLD:
324			*i1 = tcps->tcps_ip_notify_interval;
325			break;
326		case TCP_ABORT_THRESHOLD:
327			*i1 = tcps->tcps_ip_abort_interval;
328			break;
329		case TCP_CONN_NOTIFY_THRESHOLD:
330			*i1 = tcps->tcps_ip_notify_cinterval;
331			break;
332		case TCP_CONN_ABORT_THRESHOLD:
333			*i1 = tcps->tcps_ip_abort_cinterval;
334			break;
335		default:
336			return (-1);
337		}
338		break;
339	case IPPROTO_IP:
340		switch (name) {
341		case IP_TTL:
342			*i1 = tcps->tcps_ipv4_ttl;
343			break;
344		default:
345			return (-1);
346		}
347		break;
348	case IPPROTO_IPV6:
349		switch (name) {
350		case IPV6_UNICAST_HOPS:
351			*i1 = tcps->tcps_ipv6_hoplimit;
352			break;
353		default:
354			return (-1);
355		}
356		break;
357	default:
358		return (-1);
359	}
360	return (sizeof (int));
361}
362
363/*
364 * TCP routine to get the values of options.
365 */
366int
367tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
368{
369	int		*i1 = (int *)ptr;
370	tcp_t		*tcp = connp->conn_tcp;
371	conn_opt_arg_t	coas;
372	int		retval;
373
374	coas.coa_connp = connp;
375	coas.coa_ixa = connp->conn_ixa;
376	coas.coa_ipp = &connp->conn_xmit_ipp;
377	coas.coa_ancillary = B_FALSE;
378	coas.coa_changed = 0;
379
380	switch (level) {
381	case SOL_SOCKET:
382		switch (name) {
383		case SO_SND_COPYAVOID:
384			*i1 = tcp->tcp_snd_zcopy_on ?
385			    SO_SND_COPYAVOID : 0;
386			return (sizeof (int));
387		case SO_ACCEPTCONN:
388			*i1 = (tcp->tcp_state == TCPS_LISTEN);
389			return (sizeof (int));
390		}
391		break;
392	case IPPROTO_TCP:
393		switch (name) {
394		case TCP_NODELAY:
395			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
396			return (sizeof (int));
397		case TCP_MAXSEG:
398			*i1 = tcp->tcp_mss;
399			return (sizeof (int));
400		case TCP_NOTIFY_THRESHOLD:
401			*i1 = (int)tcp->tcp_first_timer_threshold;
402			return (sizeof (int));
403		case TCP_ABORT_THRESHOLD:
404			*i1 = tcp->tcp_second_timer_threshold;
405			return (sizeof (int));
406		case TCP_CONN_NOTIFY_THRESHOLD:
407			*i1 = tcp->tcp_first_ctimer_threshold;
408			return (sizeof (int));
409		case TCP_CONN_ABORT_THRESHOLD:
410			*i1 = tcp->tcp_second_ctimer_threshold;
411			return (sizeof (int));
412		case TCP_INIT_CWND:
413			*i1 = tcp->tcp_init_cwnd;
414			return (sizeof (int));
415		case TCP_KEEPALIVE_THRESHOLD:
416			*i1 = tcp->tcp_ka_interval;
417			return (sizeof (int));
418
419		/*
420		 * TCP_KEEPIDLE expects value in seconds, but
421		 * tcp_ka_interval is in milliseconds.
422		 */
423		case TCP_KEEPIDLE:
424			*i1 = tcp->tcp_ka_interval / 1000;
425			return (sizeof (int));
426		case TCP_KEEPCNT:
427			*i1 = tcp->tcp_ka_cnt;
428			return (sizeof (int));
429
430		/*
431		 * TCP_KEEPINTVL expects value in seconds, but
432		 * tcp_ka_rinterval is in milliseconds.
433		 */
434		case TCP_KEEPINTVL:
435			*i1 = tcp->tcp_ka_rinterval / 1000;
436			return (sizeof (int));
437		case TCP_KEEPALIVE_ABORT_THRESHOLD:
438			*i1 = tcp->tcp_ka_abort_thres;
439			return (sizeof (int));
440		case TCP_CONGESTION: {
441			size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
442			    CC_ALGO_NAME_MAX);
443			if (len >= CC_ALGO_NAME_MAX)
444				return (-1);
445			return (len + 1);
446		}
447		case TCP_CORK:
448			*i1 = tcp->tcp_cork;
449			return (sizeof (int));
450		case TCP_RTO_INITIAL:
451			*i1 = tcp->tcp_rto_initial;
452			return (sizeof (uint32_t));
453		case TCP_RTO_MIN:
454			*i1 = tcp->tcp_rto_min;
455			return (sizeof (uint32_t));
456		case TCP_RTO_MAX:
457			*i1 = tcp->tcp_rto_max;
458			return (sizeof (uint32_t));
459		case TCP_LINGER2:
460			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
461			return (sizeof (int));
462		}
463		break;
464	case IPPROTO_IP:
465		if (connp->conn_family != AF_INET)
466			return (-1);
467		switch (name) {
468		case IP_OPTIONS:
469		case T_IP_OPTIONS:
470			/* Caller ensures enough space */
471			return (ip_opt_get_user(connp, ptr));
472		default:
473			break;
474		}
475		break;
476
477	case IPPROTO_IPV6:
478		/*
479		 * IPPROTO_IPV6 options are only supported for sockets
480		 * that are using IPv6 on the wire.
481		 */
482		if (connp->conn_ipversion != IPV6_VERSION) {
483			return (-1);
484		}
485		switch (name) {
486		case IPV6_PATHMTU:
487			if (tcp->tcp_state < TCPS_ESTABLISHED)
488				return (-1);
489			break;
490		}
491		break;
492	}
493	mutex_enter(&connp->conn_lock);
494	retval = conn_opt_get(&coas, level, name, ptr);
495	mutex_exit(&connp->conn_lock);
496	return (retval);
497}
498
499/*
500 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
501 * Parameters are assumed to be verified by the caller.
502 */
503/* ARGSUSED */
504int
505tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
506    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
507    void *thisdg_attrs, cred_t *cr)
508{
509	tcp_t	*tcp = connp->conn_tcp;
510	int	*i1 = (int *)invalp;
511	boolean_t onoff = (*i1 == 0) ? 0 : 1;
512	boolean_t checkonly;
513	int	reterr;
514	tcp_stack_t	*tcps = tcp->tcp_tcps;
515	conn_opt_arg_t	coas;
516	uint32_t	val = *((uint32_t *)invalp);
517
518	coas.coa_connp = connp;
519	coas.coa_ixa = connp->conn_ixa;
520	coas.coa_ipp = &connp->conn_xmit_ipp;
521	coas.coa_ancillary = B_FALSE;
522	coas.coa_changed = 0;
523
524	switch (optset_context) {
525	case SETFN_OPTCOM_CHECKONLY:
526		checkonly = B_TRUE;
527		/*
528		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
529		 * inlen != 0 implies value supplied and
530		 * 	we have to "pretend" to set it.
531		 * inlen == 0 implies that there is no
532		 * 	value part in T_CHECK request and just validation
533		 * done elsewhere should be enough, we just return here.
534		 */
535		if (inlen == 0) {
536			*outlenp = 0;
537			return (0);
538		}
539		break;
540	case SETFN_OPTCOM_NEGOTIATE:
541		checkonly = B_FALSE;
542		break;
543	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
544	case SETFN_CONN_NEGOTIATE:
545		checkonly = B_FALSE;
546		/*
547		 * Negotiating local and "association-related" options
548		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
549		 * primitives is allowed by XTI, but we choose
550		 * to not implement this style negotiation for Internet
551		 * protocols (We interpret it is a must for OSI world but
552		 * optional for Internet protocols) for all options.
553		 * [ Will do only for the few options that enable test
554		 * suites that our XTI implementation of this feature
555		 * works for transports that do allow it ]
556		 */
557		if (!tcp_allow_connopt_set(level, name)) {
558			*outlenp = 0;
559			return (EINVAL);
560		}
561		break;
562	default:
563		/*
564		 * We should never get here
565		 */
566		*outlenp = 0;
567		return (EINVAL);
568	}
569
570	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
571	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
572
573	/*
574	 * For TCP, we should have no ancillary data sent down
575	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
576	 * has to be zero.
577	 */
578	ASSERT(thisdg_attrs == NULL);
579
580	/*
581	 * For fixed length options, no sanity check
582	 * of passed in length is done. It is assumed *_optcom_req()
583	 * routines do the right thing.
584	 */
585	switch (level) {
586	case SOL_SOCKET:
587		switch (name) {
588		case SO_KEEPALIVE:
589			if (checkonly) {
590				/* check only case */
591				break;
592			}
593
594			if (!onoff) {
595				if (connp->conn_keepalive) {
596					if (tcp->tcp_ka_tid != 0) {
597						(void) TCP_TIMER_CANCEL(tcp,
598						    tcp->tcp_ka_tid);
599						tcp->tcp_ka_tid = 0;
600					}
601					connp->conn_keepalive = 0;
602				}
603				break;
604			}
605			if (!connp->conn_keepalive) {
606				/* Crank up the keepalive timer */
607				tcp->tcp_ka_last_intrvl = 0;
608				tcp->tcp_ka_tid = TCP_TIMER(tcp,
609				    tcp_keepalive_timer, tcp->tcp_ka_interval);
610				connp->conn_keepalive = 1;
611			}
612			break;
613		case SO_SNDBUF: {
614			if (*i1 > tcps->tcps_max_buf) {
615				*outlenp = 0;
616				return (ENOBUFS);
617			}
618			if (checkonly)
619				break;
620
621			connp->conn_sndbuf = *i1;
622			if (tcps->tcps_snd_lowat_fraction != 0) {
623				connp->conn_sndlowat = connp->conn_sndbuf /
624				    tcps->tcps_snd_lowat_fraction;
625			}
626			(void) tcp_maxpsz_set(tcp, B_TRUE);
627			/*
628			 * If we are flow-controlled, recheck the condition.
629			 * There are apps that increase SO_SNDBUF size when
630			 * flow-controlled (EWOULDBLOCK), and expect the flow
631			 * control condition to be lifted right away.
632			 */
633			mutex_enter(&tcp->tcp_non_sq_lock);
634			if (tcp->tcp_flow_stopped &&
635			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
636				tcp_clrqfull(tcp);
637			}
638			mutex_exit(&tcp->tcp_non_sq_lock);
639			*outlenp = inlen;
640			return (0);
641		}
642		case SO_RCVBUF:
643			if (*i1 > tcps->tcps_max_buf) {
644				*outlenp = 0;
645				return (ENOBUFS);
646			}
647			/* Silently ignore zero */
648			if (!checkonly && *i1 != 0) {
649				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
650				(void) tcp_rwnd_set(tcp, *i1);
651			}
652			/*
653			 * XXX should we return the rwnd here
654			 * and tcp_opt_get ?
655			 */
656			*outlenp = inlen;
657			return (0);
658		case SO_SND_COPYAVOID:
659			if (!checkonly) {
660				if (tcp->tcp_loopback ||
661				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
662					*outlenp = 0;
663					return (EOPNOTSUPP);
664				}
665				tcp->tcp_snd_zcopy_aware = 1;
666			}
667			*outlenp = inlen;
668			return (0);
669		}
670		break;
671	case IPPROTO_TCP:
672		switch (name) {
673		case TCP_NODELAY:
674			if (!checkonly)
675				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
676			break;
677		case TCP_NOTIFY_THRESHOLD:
678			if (!checkonly)
679				tcp->tcp_first_timer_threshold = *i1;
680			break;
681		case TCP_ABORT_THRESHOLD:
682			if (!checkonly)
683				tcp->tcp_second_timer_threshold = *i1;
684			break;
685		case TCP_CONN_NOTIFY_THRESHOLD:
686			if (!checkonly)
687				tcp->tcp_first_ctimer_threshold = *i1;
688			break;
689		case TCP_CONN_ABORT_THRESHOLD:
690			if (!checkonly)
691				tcp->tcp_second_ctimer_threshold = *i1;
692			break;
693		case TCP_RECVDSTADDR:
694			if (tcp->tcp_state > TCPS_LISTEN) {
695				*outlenp = 0;
696				return (EOPNOTSUPP);
697			}
698			/* Setting done in conn_opt_set */
699			break;
700		case TCP_INIT_CWND:
701			if (checkonly)
702				break;
703
704			/*
705			 * Only allow socket with network configuration
706			 * privilege to set the initial cwnd to be larger
707			 * than allowed by RFC 3390.
708			 */
709			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
710				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
711				    != 0) {
712					*outlenp = 0;
713					return (reterr);
714				}
715				if (val > tcp_max_init_cwnd) {
716					*outlenp = 0;
717					return (EINVAL);
718				}
719			}
720
721			tcp->tcp_init_cwnd = val;
722
723			/*
724			 * If the socket is connected, AND no outbound data
725			 * has been sent, reset the actual cwnd values.
726			 */
727			if (tcp->tcp_state == TCPS_ESTABLISHED &&
728			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
729				tcp->tcp_cwnd =
730				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
731			}
732			break;
733
734		/*
735		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
736		 * is in milliseconds. TCP_KEEPIDLE is introduced for
737		 * compatibility with other Unix flavors.
738		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
739		 * converting the input to milliseconds.
740		 */
741		case TCP_KEEPIDLE:
742			*i1 *= 1000;
743			/* FALLTHRU */
744
745		case TCP_KEEPALIVE_THRESHOLD:
746			if (checkonly)
747				break;
748
749			if (*i1 < tcps->tcps_keepalive_interval_low ||
750			    *i1 > tcps->tcps_keepalive_interval_high) {
751				*outlenp = 0;
752				return (EINVAL);
753			}
754			if (*i1 != tcp->tcp_ka_interval) {
755				tcp->tcp_ka_interval = *i1;
756				/*
757				 * Check if we need to restart the
758				 * keepalive timer.
759				 */
760				if (tcp->tcp_ka_tid != 0) {
761					ASSERT(connp->conn_keepalive);
762					(void) TCP_TIMER_CANCEL(tcp,
763					    tcp->tcp_ka_tid);
764					tcp->tcp_ka_last_intrvl = 0;
765					tcp->tcp_ka_tid = TCP_TIMER(tcp,
766					    tcp_keepalive_timer,
767					    tcp->tcp_ka_interval);
768				}
769			}
770			break;
771
772		/*
773		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
774		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
775		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
776		 * tcp_ka_cnt.
777		 */
778		case TCP_KEEPCNT:
779			if (checkonly)
780				break;
781
782			if (*i1 == 0) {
783				return (EINVAL);
784			} else if (tcp->tcp_ka_rinterval == 0) {
785				/*
786				 * When TCP_KEEPCNT is specified without first
787				 * specifying a TCP_KEEPINTVL, we infer an
788				 * interval based on a tunable specific to our
789				 * stack: the tcp_keepalive_abort_interval.
790				 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
791				 * the unlikely event that that has been set.)
792				 * Given the abort interval's default value of
793				 * 480 seconds, low TCP_KEEPCNT values can
794				 * result in intervals that exceed the default
795				 * maximum RTO of 60 seconds.  Rather than
796				 * fail in these cases, we (implicitly) clamp
797				 * the interval at the maximum RTO; if the
798				 * TCP_KEEPCNT is shortly followed by a
799				 * TCP_KEEPINTVL (as we expect), the abort
800				 * threshold will be recalculated correctly --
801				 * and if a TCP_KEEPINTVL is not forthcoming,
802				 * keep-alive will at least operate reasonably
803				 * given the underconfigured state.
804				 */
805				uint32_t interval;
806
807				interval = tcp->tcp_ka_abort_thres / *i1;
808
809				if (interval < tcp->tcp_rto_min)
810					interval = tcp->tcp_rto_min;
811
812				if (interval > tcp->tcp_rto_max)
813					interval = tcp->tcp_rto_max;
814
815				tcp->tcp_ka_rinterval = interval;
816			} else {
817				if ((*i1 * tcp->tcp_ka_rinterval) <
818				    tcps->tcps_keepalive_abort_interval_low ||
819				    (*i1 * tcp->tcp_ka_rinterval) >
820				    tcps->tcps_keepalive_abort_interval_high)
821					return (EINVAL);
822				tcp->tcp_ka_abort_thres =
823				    (*i1 * tcp->tcp_ka_rinterval);
824			}
825			tcp->tcp_ka_cnt = *i1;
826			break;
827		case TCP_KEEPINTVL:
828			/*
829			 * TCP_KEEPINTVL is specified in seconds, but
830			 * tcp_ka_rinterval is in milliseconds.
831			 */
832
833			if (checkonly)
834				break;
835
836			if ((*i1 * 1000) < tcp->tcp_rto_min ||
837			    (*i1 * 1000) > tcp->tcp_rto_max)
838				return (EINVAL);
839
840			if (tcp->tcp_ka_cnt == 0) {
841				tcp->tcp_ka_cnt =
842				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
843			} else {
844				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
845				    tcps->tcps_keepalive_abort_interval_low ||
846				    (*i1 * tcp->tcp_ka_cnt * 1000) >
847				    tcps->tcps_keepalive_abort_interval_high)
848					return (EINVAL);
849				tcp->tcp_ka_abort_thres =
850				    (*i1 * tcp->tcp_ka_cnt * 1000);
851			}
852			tcp->tcp_ka_rinterval = *i1 * 1000;
853			break;
854		case TCP_KEEPALIVE_ABORT_THRESHOLD:
855			if (!checkonly) {
856				if (*i1 <
857				    tcps->tcps_keepalive_abort_interval_low ||
858				    *i1 >
859				    tcps->tcps_keepalive_abort_interval_high) {
860					*outlenp = 0;
861					return (EINVAL);
862				}
863				tcp->tcp_ka_abort_thres = *i1;
864				tcp->tcp_ka_cnt = 0;
865				tcp->tcp_ka_rinterval = 0;
866			}
867			break;
868		case TCP_CONGESTION: {
869			struct cc_algo *algo;
870
871			if (checkonly) {
872				break;
873			}
874
875			/*
876			 * Make sure the string is NUL-terminated. Some
877			 * consumers pass only the number of characters
878			 * in the string, and don't include the NUL
879			 * terminator, so we set it for them.
880			 */
881			if (inlen < CC_ALGO_NAME_MAX) {
882				invalp[inlen] = '\0';
883			}
884			invalp[CC_ALGO_NAME_MAX - 1] = '\0';
885
886			if ((algo = cc_load_algo((char *)invalp)) == NULL) {
887				return (ENOENT);
888			}
889
890			if (CC_ALGO(tcp)->cb_destroy != NULL) {
891				CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
892			}
893
894			CC_DATA(tcp) = NULL;
895			CC_ALGO(tcp) = algo;
896
897			if (CC_ALGO(tcp)->cb_init != NULL) {
898				VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
899			}
900
901			break;
902		}
903		case TCP_CORK:
904			if (!checkonly) {
905				/*
906				 * if tcp->tcp_cork was set and is now
907				 * being unset, we have to make sure that
908				 * the remaining data gets sent out. Also
909				 * unset tcp->tcp_cork so that tcp_wput_data()
910				 * can send data even if it is less than mss
911				 */
912				if (tcp->tcp_cork && onoff == 0 &&
913				    tcp->tcp_unsent > 0) {
914					tcp->tcp_cork = B_FALSE;
915					tcp_wput_data(tcp, NULL, B_FALSE);
916				}
917				tcp->tcp_cork = onoff;
918			}
919			break;
920		case TCP_RTO_INITIAL:
921			if (checkonly || val == 0)
922				break;
923
924			/*
925			 * Sanity checks
926			 *
927			 * The initial RTO should be bounded by the minimum
928			 * and maximum RTO.  And it should also be smaller
929			 * than the connect attempt abort timeout.  Otherwise,
930			 * the connection won't be aborted in a period
931			 * reasonably close to that timeout.
932			 */
933			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
934			    val > tcp->tcp_second_ctimer_threshold ||
935			    val < tcps->tcps_rexmit_interval_initial_low ||
936			    val > tcps->tcps_rexmit_interval_initial_high) {
937				*outlenp = 0;
938				return (EINVAL);
939			}
940			tcp->tcp_rto_initial = val;
941
942			/*
943			 * If TCP has not sent anything, need to re-calculate
944			 * tcp_rto.  Otherwise, this option change does not
945			 * really affect anything.
946			 */
947			if (tcp->tcp_state >= TCPS_SYN_SENT)
948				break;
949
950			tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
951			tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
952			tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
953			    tcps->tcps_conn_grace_period);
954			break;
955		case TCP_RTO_MIN:
956			if (checkonly || val == 0)
957				break;
958
959			if (val < tcps->tcps_rexmit_interval_min_low ||
960			    val > tcps->tcps_rexmit_interval_min_high ||
961			    val > tcp->tcp_rto_max) {
962				*outlenp = 0;
963				return (EINVAL);
964			}
965			tcp->tcp_rto_min = val;
966			if (tcp->tcp_rto < val)
967				tcp->tcp_rto = val;
968			break;
969		case TCP_RTO_MAX:
970			if (checkonly || val == 0)
971				break;
972
973			/*
974			 * Sanity checks
975			 *
976			 * The maximum RTO should not be larger than the
977			 * connection abort timeout.  Otherwise, the
978			 * connection won't be aborted in a period reasonably
979			 * close to that timeout.
980			 */
981			if (val < tcps->tcps_rexmit_interval_max_low ||
982			    val > tcps->tcps_rexmit_interval_max_high ||
983			    val < tcp->tcp_rto_min ||
984			    val > tcp->tcp_second_timer_threshold) {
985				*outlenp = 0;
986				return (EINVAL);
987			}
988			tcp->tcp_rto_max = val;
989			if (tcp->tcp_rto > val)
990				tcp->tcp_rto = val;
991			break;
992		case TCP_LINGER2:
993			if (checkonly || *i1 == 0)
994				break;
995
996			/*
997			 * Note that the option value's unit is second.  And
998			 * the value should be bigger than the private
999			 * parameter tcp_fin_wait_2_flush_interval's lower
1000			 * bound and smaller than the current value of that
1001			 * parameter.  It should be smaller than the current
1002			 * value to avoid an app setting TCP_LINGER2 to a big
1003			 * value, causing resource to be held up too long in
1004			 * FIN-WAIT-2 state.
1005			 */
1006			if (*i1 < 0 ||
1007			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1008			    *i1 ||
1009			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1010			    *i1) {
1011				*outlenp = 0;
1012				return (EINVAL);
1013			}
1014			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1015			break;
1016		default:
1017			break;
1018		}
1019		break;
1020	case IPPROTO_IP:
1021		if (connp->conn_family != AF_INET) {
1022			*outlenp = 0;
1023			return (EINVAL);
1024		}
1025		switch (name) {
1026		case IP_SEC_OPT:
1027			/*
1028			 * We should not allow policy setting after
1029			 * we start listening for connections.
1030			 */
1031			if (tcp->tcp_state == TCPS_LISTEN) {
1032				return (EINVAL);
1033			}
1034			break;
1035		}
1036		break;
1037	case IPPROTO_IPV6:
1038		/*
1039		 * IPPROTO_IPV6 options are only supported for sockets
1040		 * that are using IPv6 on the wire.
1041		 */
1042		if (connp->conn_ipversion != IPV6_VERSION) {
1043			*outlenp = 0;
1044			return (EINVAL);
1045		}
1046
1047		switch (name) {
1048		case IPV6_RECVPKTINFO:
1049			if (!checkonly) {
1050				/* Force it to be sent up with the next msg */
1051				tcp->tcp_recvifindex = 0;
1052			}
1053			break;
1054		case IPV6_RECVTCLASS:
1055			if (!checkonly) {
1056				/* Force it to be sent up with the next msg */
1057				tcp->tcp_recvtclass = 0xffffffffU;
1058			}
1059			break;
1060		case IPV6_RECVHOPLIMIT:
1061			if (!checkonly) {
1062				/* Force it to be sent up with the next msg */
1063				tcp->tcp_recvhops = 0xffffffffU;
1064			}
1065			break;
1066		case IPV6_PKTINFO:
1067			/* This is an extra check for TCP */
1068			if (inlen == sizeof (struct in6_pktinfo)) {
1069				struct in6_pktinfo *pkti;
1070
1071				pkti = (struct in6_pktinfo *)invalp;
1072				/*
1073				 * RFC 3542 states that ipi6_addr must be
1074				 * the unspecified address when setting the
1075				 * IPV6_PKTINFO sticky socket option on a
1076				 * TCP socket.
1077				 */
1078				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1079					return (EINVAL);
1080			}
1081			break;
1082		case IPV6_SEC_OPT:
1083			/*
1084			 * We should not allow policy setting after
1085			 * we start listening for connections.
1086			 */
1087			if (tcp->tcp_state == TCPS_LISTEN) {
1088				return (EINVAL);
1089			}
1090			break;
1091		}
1092		break;
1093	}
1094	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1095	    checkonly, cr);
1096	if (reterr != 0) {
1097		*outlenp = 0;
1098		return (reterr);
1099	}
1100
1101	/*
1102	 * Common case of OK return with outval same as inval
1103	 */
1104	if (invalp != outvalp) {
1105		/* don't trust bcopy for identical src/dst */
1106		(void) bcopy(invalp, outvalp, inlen);
1107	}
1108	*outlenp = inlen;
1109
1110	if (coas.coa_changed & COA_HEADER_CHANGED) {
1111		/* If we are connected we rebuilt the headers */
1112		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1113		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1114			reterr = tcp_build_hdrs(tcp);
1115			if (reterr != 0)
1116				return (reterr);
1117		}
1118	}
1119	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1120		in6_addr_t nexthop;
1121
1122		/*
1123		 * If we are connected we re-cache the information.
1124		 * We ignore errors to preserve BSD behavior.
1125		 * Note that we don't redo IPsec policy lookup here
1126		 * since the final destination (or source) didn't change.
1127		 */
1128		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1129		    &connp->conn_faddr_v6, &nexthop);
1130
1131		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1132		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1133			(void) ip_attr_connect(connp, connp->conn_ixa,
1134			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1135			    &nexthop, connp->conn_fport, NULL, NULL,
1136			    IPDF_VERIFY_DST);
1137		}
1138	}
1139	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1140		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1141	}
1142	if (coas.coa_changed & COA_WROFF_CHANGED) {
1143		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1144		    tcps->tcps_wroff_xtra;
1145		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1146		    connp->conn_wroff);
1147	}
1148	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1149		if (IPCL_IS_NONSTR(connp))
1150			proto_set_rx_oob_opt(connp, onoff);
1151	}
1152	return (0);
1153}
1154