1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright 2019 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
27 */
28
29#include <sys/types.h>
30#include <sys/stream.h>
31#define	_SUN_TPI_VERSION 2
32#include <sys/tihdr.h>
33#include <sys/socket.h>
34#include <sys/xti_xtiopt.h>
35#include <sys/xti_inet.h>
36#include <sys/policy.h>
37
38#include <inet/cc.h>
39#include <inet/common.h>
40#include <netinet/ip6.h>
41#include <inet/ip.h>
42
43#include <netinet/in.h>
44#include <netinet/tcp.h>
45#include <inet/optcom.h>
46#include <inet/proto_set.h>
47#include <inet/tcp_impl.h>
48
49static int	tcp_opt_default(queue_t *, int, int, uchar_t *);
50
51/*
52 * Table of all known options handled on a TCP protocol stack.
53 *
54 * Note: This table contains options processed by both TCP and IP levels
55 *       and is the superset of options that can be performed on a TCP over IP
56 *       stack.
57 */
58opdes_t	tcp_opt_arr[] = {
59
60{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
61	sizeof (struct linger), 0 },
62
63{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
67	},
68{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
69{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
70{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
71{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
72{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
73{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
74{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
75	sizeof (struct timeval), 0 },
76{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
77	sizeof (struct timeval), 0 },
78{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
79	},
80{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
81{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
82	0 },
83{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
84	0 },
85{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
86	0 },
87{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
88	0 },
89{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
90
91{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
92
93{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
94
95{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
96	},
97{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
98	536 },
99
100{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
101	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
102
103{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
104	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
105
106{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
107	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
108
109{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
110	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
111
112{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
113	0 },
114
115{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
116	sizeof (int), 0 },
117
118{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
119	},
120
121{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
122	sizeof (int), 0 },
123
124{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
125	sizeof (int), 0	},
126
127{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
128
129{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
130
131{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
132
133{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
134	sizeof (int), 0	},
135
136{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
137
138{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
139
140{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
141
142{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
143
144{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
145
146{ TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
147	OP_VARLEN, CC_ALGO_NAME_MAX, 0 },
148
149{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
150	(OP_VARLEN|OP_NODEFAULT),
151	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
152{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
153	(OP_VARLEN|OP_NODEFAULT),
154	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
155
156{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
157{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
158{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
159	sizeof (int), -1 /* not initialized */ },
160{ IP_RECVTOS,	IPPROTO_IP,  OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
161
162{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
163	sizeof (ipsec_req_t), -1 /* not initialized */ },
164
165{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
166	sizeof (int),	0 /* no ifindex */ },
167
168{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
169	sizeof (int), 0 },
170
171{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
172	sizeof (int), -1 /* not initialized */ },
173
174{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
175	sizeof (int),	0 /* no ifindex */ },
176
177{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
178
179{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
180	sizeof (in_addr_t),	-1 /* not initialized  */ },
181
182{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
183	sizeof (int), 0 },
184
185{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
186	(OP_NODEFAULT|OP_VARLEN),
187	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
188{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
189	OP_NODEFAULT,
190	sizeof (sin6_t), -1 /* not initialized */ },
191{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
192	(OP_VARLEN|OP_NODEFAULT), 255*8,
193	-1 /* not initialized */ },
194{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
195	(OP_VARLEN|OP_NODEFAULT), 255*8,
196	-1 /* not initialized */ },
197{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
198	(OP_VARLEN|OP_NODEFAULT), 255*8,
199	-1 /* not initialized */ },
200{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
201	(OP_VARLEN|OP_NODEFAULT), 255*8,
202	-1 /* not initialized */ },
203{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
204	OP_NODEFAULT,
205	sizeof (int), -1 /* not initialized */ },
206{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
207	OP_NODEFAULT,
208	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
209{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
210	sizeof (int), 0 },
211{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
212	sizeof (int), 0 },
213{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
214	sizeof (int), 0 },
215
216/* Enable receipt of ancillary data */
217{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
218	sizeof (int), 0 },
219{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220	sizeof (int), 0 },
221{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
222	sizeof (int), 0 },
223{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
224	sizeof (int), 0 },
225{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
226	sizeof (int), 0 },
227{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
228	sizeof (int), 0 },
229{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
230	sizeof (int), 0 },
231{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
232	sizeof (int), 0 },
233
234{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
235	sizeof (ipsec_req_t), -1 /* not initialized */ },
236{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
237	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
238};
239
240/*
241 * Table of all supported levels
242 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
243 * any supported options so we need this info separately.
244 *
245 * This is needed only for topmost tpi providers and is used only by
246 * XTI interfaces.
247 */
248optlevel_t	tcp_valid_levels_arr[] = {
249	XTI_GENERIC,
250	SOL_SOCKET,
251	IPPROTO_TCP,
252	IPPROTO_IP,
253	IPPROTO_IPV6
254};
255
256
257#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
258#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
259
260uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
261
262/*
263 * Initialize option database object for TCP
264 *
265 * This object represents database of options to search passed to
266 * {sock,tpi}optcom_req() interface routine to take care of option
267 * management and associated methods.
268 */
269
270optdb_obj_t tcp_opt_obj = {
271	tcp_opt_default,	/* TCP default value function pointer */
272	tcp_tpi_opt_get,	/* TCP get function pointer */
273	tcp_tpi_opt_set,	/* TCP set function pointer */
274	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
275	tcp_opt_arr,		/* TCP option database */
276	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
277	tcp_valid_levels_arr	/* TCP valid level array */
278};
279
280static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
281
282/*
283 * Some TCP options can be "set" by requesting them in the option
284 * buffer. This is needed for XTI feature test though we do not
285 * allow it in general. We interpret that this mechanism is more
286 * applicable to OSI protocols and need not be allowed in general.
287 * This routine filters out options for which it is not allowed (most)
288 * and lets through those (few) for which it is. [ The XTI interface
289 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
290 * ever implemented will have to be allowed here ].
291 */
292static boolean_t
293tcp_allow_connopt_set(int level, int name)
294{
295
296	switch (level) {
297	case IPPROTO_TCP:
298		switch (name) {
299		case TCP_NODELAY:
300			return (B_TRUE);
301		default:
302			return (B_FALSE);
303		}
304		/*NOTREACHED*/
305	default:
306		return (B_FALSE);
307	}
308	/*NOTREACHED*/
309}
310
311/*
312 * This routine gets default values of certain options whose default
313 * values are maintained by protocol specific code
314 */
315/* ARGSUSED */
316static int
317tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
318{
319	int32_t	*i1 = (int32_t *)ptr;
320	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
321
322	switch (level) {
323	case IPPROTO_TCP:
324		switch (name) {
325		case TCP_NOTIFY_THRESHOLD:
326			*i1 = tcps->tcps_ip_notify_interval;
327			break;
328		case TCP_ABORT_THRESHOLD:
329			*i1 = tcps->tcps_ip_abort_interval;
330			break;
331		case TCP_CONN_NOTIFY_THRESHOLD:
332			*i1 = tcps->tcps_ip_notify_cinterval;
333			break;
334		case TCP_CONN_ABORT_THRESHOLD:
335			*i1 = tcps->tcps_ip_abort_cinterval;
336			break;
337		default:
338			return (-1);
339		}
340		break;
341	case IPPROTO_IP:
342		switch (name) {
343		case IP_TTL:
344			*i1 = tcps->tcps_ipv4_ttl;
345			break;
346		default:
347			return (-1);
348		}
349		break;
350	case IPPROTO_IPV6:
351		switch (name) {
352		case IPV6_UNICAST_HOPS:
353			*i1 = tcps->tcps_ipv6_hoplimit;
354			break;
355		default:
356			return (-1);
357		}
358		break;
359	default:
360		return (-1);
361	}
362	return (sizeof (int));
363}
364
365/*
366 * TCP routine to get the values of options.
367 */
368int
369tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
370{
371	int		*i1 = (int *)ptr;
372	tcp_t		*tcp = connp->conn_tcp;
373	conn_opt_arg_t	coas;
374	int		retval;
375
376	coas.coa_connp = connp;
377	coas.coa_ixa = connp->conn_ixa;
378	coas.coa_ipp = &connp->conn_xmit_ipp;
379	coas.coa_ancillary = B_FALSE;
380	coas.coa_changed = 0;
381
382	switch (level) {
383	case SOL_SOCKET:
384		switch (name) {
385		case SO_SND_COPYAVOID:
386			*i1 = tcp->tcp_snd_zcopy_on ?
387			    SO_SND_COPYAVOID : 0;
388			return (sizeof (int));
389		case SO_ACCEPTCONN:
390			*i1 = (tcp->tcp_state == TCPS_LISTEN);
391			return (sizeof (int));
392		}
393		break;
394	case IPPROTO_TCP:
395		switch (name) {
396		case TCP_NODELAY:
397			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
398			return (sizeof (int));
399		case TCP_MAXSEG:
400			*i1 = tcp->tcp_mss;
401			return (sizeof (int));
402		case TCP_NOTIFY_THRESHOLD:
403			*i1 = (int)tcp->tcp_first_timer_threshold;
404			return (sizeof (int));
405		case TCP_ABORT_THRESHOLD:
406			*i1 = tcp->tcp_second_timer_threshold;
407			return (sizeof (int));
408		case TCP_CONN_NOTIFY_THRESHOLD:
409			*i1 = tcp->tcp_first_ctimer_threshold;
410			return (sizeof (int));
411		case TCP_CONN_ABORT_THRESHOLD:
412			*i1 = tcp->tcp_second_ctimer_threshold;
413			return (sizeof (int));
414		case TCP_INIT_CWND:
415			*i1 = tcp->tcp_init_cwnd;
416			return (sizeof (int));
417		case TCP_KEEPALIVE_THRESHOLD:
418			*i1 = tcp->tcp_ka_interval;
419			return (sizeof (int));
420
421		/*
422		 * TCP_KEEPIDLE expects value in seconds, but
423		 * tcp_ka_interval is in milliseconds.
424		 */
425		case TCP_KEEPIDLE:
426			*i1 = tcp->tcp_ka_interval / 1000;
427			return (sizeof (int));
428		case TCP_KEEPCNT:
429			*i1 = tcp->tcp_ka_cnt;
430			return (sizeof (int));
431
432		/*
433		 * TCP_KEEPINTVL expects value in seconds, but
434		 * tcp_ka_rinterval is in milliseconds.
435		 */
436		case TCP_KEEPINTVL:
437			*i1 = tcp->tcp_ka_rinterval / 1000;
438			return (sizeof (int));
439		case TCP_KEEPALIVE_ABORT_THRESHOLD:
440			*i1 = tcp->tcp_ka_abort_thres;
441			return (sizeof (int));
442		case TCP_CONGESTION: {
443			size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
444			    CC_ALGO_NAME_MAX);
445			if (len >= CC_ALGO_NAME_MAX)
446				return (-1);
447			return (len + 1);
448		}
449		case TCP_CORK:
450			*i1 = tcp->tcp_cork;
451			return (sizeof (int));
452		case TCP_RTO_INITIAL:
453			*i1 = tcp->tcp_rto_initial;
454			return (sizeof (uint32_t));
455		case TCP_RTO_MIN:
456			*i1 = tcp->tcp_rto_min;
457			return (sizeof (uint32_t));
458		case TCP_RTO_MAX:
459			*i1 = tcp->tcp_rto_max;
460			return (sizeof (uint32_t));
461		case TCP_LINGER2:
462			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
463			return (sizeof (int));
464		}
465		break;
466	case IPPROTO_IP:
467		if (connp->conn_family != AF_INET)
468			return (-1);
469		switch (name) {
470		case IP_OPTIONS:
471		case T_IP_OPTIONS:
472			/* Caller ensures enough space */
473			return (ip_opt_get_user(connp, ptr));
474		default:
475			break;
476		}
477		break;
478
479	case IPPROTO_IPV6:
480		/*
481		 * IPPROTO_IPV6 options are only supported for sockets
482		 * that are using IPv6 on the wire.
483		 */
484		if (connp->conn_ipversion != IPV6_VERSION) {
485			return (-1);
486		}
487		switch (name) {
488		case IPV6_PATHMTU:
489			if (tcp->tcp_state < TCPS_ESTABLISHED)
490				return (-1);
491			break;
492		}
493		break;
494	}
495	mutex_enter(&connp->conn_lock);
496	retval = conn_opt_get(&coas, level, name, ptr);
497	mutex_exit(&connp->conn_lock);
498	return (retval);
499}
500
501/*
502 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
503 * Parameters are assumed to be verified by the caller.
504 */
505/* ARGSUSED */
506int
507tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
508    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
509    void *thisdg_attrs, cred_t *cr)
510{
511	tcp_t	*tcp = connp->conn_tcp;
512	int	*i1 = (int *)invalp;
513	boolean_t onoff = (*i1 == 0) ? 0 : 1;
514	boolean_t checkonly;
515	int	reterr;
516	tcp_stack_t	*tcps = tcp->tcp_tcps;
517	conn_opt_arg_t	coas;
518	uint32_t	val = *((uint32_t *)invalp);
519
520	coas.coa_connp = connp;
521	coas.coa_ixa = connp->conn_ixa;
522	coas.coa_ipp = &connp->conn_xmit_ipp;
523	coas.coa_ancillary = B_FALSE;
524	coas.coa_changed = 0;
525
526	switch (optset_context) {
527	case SETFN_OPTCOM_CHECKONLY:
528		checkonly = B_TRUE;
529		/*
530		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
531		 * inlen != 0 implies value supplied and
532		 *	we have to "pretend" to set it.
533		 * inlen == 0 implies that there is no
534		 *	value part in T_CHECK request and just validation
535		 * done elsewhere should be enough, we just return here.
536		 */
537		if (inlen == 0) {
538			*outlenp = 0;
539			return (0);
540		}
541		break;
542	case SETFN_OPTCOM_NEGOTIATE:
543		checkonly = B_FALSE;
544		break;
545	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
546	case SETFN_CONN_NEGOTIATE:
547		checkonly = B_FALSE;
548		/*
549		 * Negotiating local and "association-related" options
550		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
551		 * primitives is allowed by XTI, but we choose
552		 * to not implement this style negotiation for Internet
553		 * protocols (We interpret it is a must for OSI world but
554		 * optional for Internet protocols) for all options.
555		 * [ Will do only for the few options that enable test
556		 * suites that our XTI implementation of this feature
557		 * works for transports that do allow it ]
558		 */
559		if (!tcp_allow_connopt_set(level, name)) {
560			*outlenp = 0;
561			return (EINVAL);
562		}
563		break;
564	default:
565		/*
566		 * We should never get here
567		 */
568		*outlenp = 0;
569		return (EINVAL);
570	}
571
572	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
573	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
574
575	/*
576	 * For TCP, we should have no ancillary data sent down
577	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
578	 * has to be zero.
579	 */
580	ASSERT(thisdg_attrs == NULL);
581
582	/*
583	 * For fixed length options, no sanity check
584	 * of passed in length is done. It is assumed *_optcom_req()
585	 * routines do the right thing.
586	 */
587	switch (level) {
588	case SOL_SOCKET:
589		switch (name) {
590		case SO_KEEPALIVE:
591			if (checkonly) {
592				/* check only case */
593				break;
594			}
595
596			if (!onoff) {
597				if (connp->conn_keepalive) {
598					if (tcp->tcp_ka_tid != 0) {
599						(void) TCP_TIMER_CANCEL(tcp,
600						    tcp->tcp_ka_tid);
601						tcp->tcp_ka_tid = 0;
602					}
603					connp->conn_keepalive = 0;
604				}
605				break;
606			}
607			if (!connp->conn_keepalive) {
608				/* Crank up the keepalive timer */
609				tcp->tcp_ka_last_intrvl = 0;
610				tcp->tcp_ka_tid = TCP_TIMER(tcp,
611				    tcp_keepalive_timer, tcp->tcp_ka_interval);
612				connp->conn_keepalive = 1;
613			}
614			break;
615		case SO_SNDBUF: {
616			if (*i1 > tcps->tcps_max_buf) {
617				*outlenp = 0;
618				return (ENOBUFS);
619			}
620			if (checkonly)
621				break;
622
623			connp->conn_sndbuf = *i1;
624			if (tcps->tcps_snd_lowat_fraction != 0) {
625				connp->conn_sndlowat = connp->conn_sndbuf /
626				    tcps->tcps_snd_lowat_fraction;
627			}
628			(void) tcp_maxpsz_set(tcp, B_TRUE);
629			/*
630			 * If we are flow-controlled, recheck the condition.
631			 * There are apps that increase SO_SNDBUF size when
632			 * flow-controlled (EWOULDBLOCK), and expect the flow
633			 * control condition to be lifted right away.
634			 */
635			mutex_enter(&tcp->tcp_non_sq_lock);
636			if (tcp->tcp_flow_stopped &&
637			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
638				tcp_clrqfull(tcp);
639			}
640			mutex_exit(&tcp->tcp_non_sq_lock);
641			*outlenp = inlen;
642			return (0);
643		}
644		case SO_RCVBUF:
645			if (*i1 > tcps->tcps_max_buf) {
646				*outlenp = 0;
647				return (ENOBUFS);
648			}
649			/* Silently ignore zero */
650			if (!checkonly && *i1 != 0) {
651				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
652				(void) tcp_rwnd_set(tcp, *i1);
653			}
654			/*
655			 * XXX should we return the rwnd here
656			 * and tcp_opt_get ?
657			 */
658			*outlenp = inlen;
659			return (0);
660		case SO_SND_COPYAVOID:
661			if (!checkonly) {
662				if (tcp->tcp_loopback ||
663				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
664					*outlenp = 0;
665					return (EOPNOTSUPP);
666				}
667				tcp->tcp_snd_zcopy_aware = 1;
668			}
669			*outlenp = inlen;
670			return (0);
671		}
672		break;
673	case IPPROTO_TCP:
674		switch (name) {
675		case TCP_NODELAY:
676			if (!checkonly)
677				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
678			break;
679		case TCP_NOTIFY_THRESHOLD:
680			if (!checkonly)
681				tcp->tcp_first_timer_threshold = *i1;
682			break;
683		case TCP_ABORT_THRESHOLD:
684			if (!checkonly)
685				tcp->tcp_second_timer_threshold = *i1;
686			break;
687		case TCP_CONN_NOTIFY_THRESHOLD:
688			if (!checkonly)
689				tcp->tcp_first_ctimer_threshold = *i1;
690			break;
691		case TCP_CONN_ABORT_THRESHOLD:
692			if (!checkonly)
693				tcp->tcp_second_ctimer_threshold = *i1;
694			break;
695		case TCP_RECVDSTADDR:
696			if (tcp->tcp_state > TCPS_LISTEN) {
697				*outlenp = 0;
698				return (EOPNOTSUPP);
699			}
700			/* Setting done in conn_opt_set */
701			break;
702		case TCP_INIT_CWND:
703			if (checkonly)
704				break;
705
706			/*
707			 * Only allow socket with network configuration
708			 * privilege to set the initial cwnd to be larger
709			 * than allowed by RFC 3390.
710			 */
711			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
712				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
713				    != 0) {
714					*outlenp = 0;
715					return (reterr);
716				}
717				if (val > tcp_max_init_cwnd) {
718					*outlenp = 0;
719					return (EINVAL);
720				}
721			}
722
723			tcp->tcp_init_cwnd = val;
724
725			/*
726			 * If the socket is connected, AND no outbound data
727			 * has been sent, reset the actual cwnd values.
728			 */
729			if (tcp->tcp_state == TCPS_ESTABLISHED &&
730			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
731				tcp->tcp_cwnd =
732				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
733			}
734			break;
735
736		/*
737		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
738		 * is in milliseconds. TCP_KEEPIDLE is introduced for
739		 * compatibility with other Unix flavors.
740		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
741		 * converting the input to milliseconds.
742		 */
743		case TCP_KEEPIDLE:
744			*i1 *= 1000;
745			/* FALLTHRU */
746
747		case TCP_KEEPALIVE_THRESHOLD:
748			if (checkonly)
749				break;
750
751			if (*i1 < tcps->tcps_keepalive_interval_low ||
752			    *i1 > tcps->tcps_keepalive_interval_high) {
753				*outlenp = 0;
754				return (EINVAL);
755			}
756			if (*i1 != tcp->tcp_ka_interval) {
757				tcp->tcp_ka_interval = *i1;
758				/*
759				 * Check if we need to restart the
760				 * keepalive timer.
761				 */
762				if (tcp->tcp_ka_tid != 0) {
763					ASSERT(connp->conn_keepalive);
764					(void) TCP_TIMER_CANCEL(tcp,
765					    tcp->tcp_ka_tid);
766					tcp->tcp_ka_last_intrvl = 0;
767					tcp->tcp_ka_tid = TCP_TIMER(tcp,
768					    tcp_keepalive_timer,
769					    tcp->tcp_ka_interval);
770				}
771			}
772			break;
773
774		/*
775		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
776		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
777		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
778		 * tcp_ka_cnt.
779		 */
780		case TCP_KEEPCNT:
781			if (checkonly)
782				break;
783
784			if (*i1 == 0) {
785				return (EINVAL);
786			} else if (tcp->tcp_ka_rinterval == 0) {
787				/*
788				 * When TCP_KEEPCNT is specified without first
789				 * specifying a TCP_KEEPINTVL, we infer an
790				 * interval based on a tunable specific to our
791				 * stack: the tcp_keepalive_abort_interval.
792				 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
793				 * the unlikely event that that has been set.)
794				 * Given the abort interval's default value of
795				 * 480 seconds, low TCP_KEEPCNT values can
796				 * result in intervals that exceed the default
797				 * maximum RTO of 60 seconds.  Rather than
798				 * fail in these cases, we (implicitly) clamp
799				 * the interval at the maximum RTO; if the
800				 * TCP_KEEPCNT is shortly followed by a
801				 * TCP_KEEPINTVL (as we expect), the abort
802				 * threshold will be recalculated correctly --
803				 * and if a TCP_KEEPINTVL is not forthcoming,
804				 * keep-alive will at least operate reasonably
805				 * given the underconfigured state.
806				 */
807				uint32_t interval;
808
809				interval = tcp->tcp_ka_abort_thres / *i1;
810
811				if (interval < tcp->tcp_rto_min)
812					interval = tcp->tcp_rto_min;
813
814				if (interval > tcp->tcp_rto_max)
815					interval = tcp->tcp_rto_max;
816
817				tcp->tcp_ka_rinterval = interval;
818			} else {
819				if ((*i1 * tcp->tcp_ka_rinterval) <
820				    tcps->tcps_keepalive_abort_interval_low ||
821				    (*i1 * tcp->tcp_ka_rinterval) >
822				    tcps->tcps_keepalive_abort_interval_high)
823					return (EINVAL);
824				tcp->tcp_ka_abort_thres =
825				    (*i1 * tcp->tcp_ka_rinterval);
826			}
827			tcp->tcp_ka_cnt = *i1;
828			break;
829		case TCP_KEEPINTVL:
830			/*
831			 * TCP_KEEPINTVL is specified in seconds, but
832			 * tcp_ka_rinterval is in milliseconds.
833			 */
834
835			if (checkonly)
836				break;
837
838			if ((*i1 * 1000) < tcp->tcp_rto_min ||
839			    (*i1 * 1000) > tcp->tcp_rto_max)
840				return (EINVAL);
841
842			if (tcp->tcp_ka_cnt == 0) {
843				tcp->tcp_ka_cnt =
844				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
845			} else {
846				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
847				    tcps->tcps_keepalive_abort_interval_low ||
848				    (*i1 * tcp->tcp_ka_cnt * 1000) >
849				    tcps->tcps_keepalive_abort_interval_high)
850					return (EINVAL);
851				tcp->tcp_ka_abort_thres =
852				    (*i1 * tcp->tcp_ka_cnt * 1000);
853			}
854			tcp->tcp_ka_rinterval = *i1 * 1000;
855			break;
856		case TCP_KEEPALIVE_ABORT_THRESHOLD:
857			if (!checkonly) {
858				if (*i1 <
859				    tcps->tcps_keepalive_abort_interval_low ||
860				    *i1 >
861				    tcps->tcps_keepalive_abort_interval_high) {
862					*outlenp = 0;
863					return (EINVAL);
864				}
865				tcp->tcp_ka_abort_thres = *i1;
866				tcp->tcp_ka_cnt = 0;
867				tcp->tcp_ka_rinterval = 0;
868			}
869			break;
870		case TCP_CONGESTION: {
871			struct cc_algo *algo;
872
873			if (checkonly) {
874				break;
875			}
876
877			/*
878			 * Make sure the string is NUL-terminated. Some
879			 * consumers pass only the number of characters
880			 * in the string, and don't include the NUL
881			 * terminator, so we set it for them.
882			 */
883			if (inlen < CC_ALGO_NAME_MAX) {
884				invalp[inlen] = '\0';
885			}
886			invalp[CC_ALGO_NAME_MAX - 1] = '\0';
887
888			if ((algo = cc_load_algo((char *)invalp)) == NULL) {
889				return (ENOENT);
890			}
891
892			if (CC_ALGO(tcp)->cb_destroy != NULL) {
893				CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
894			}
895
896			CC_DATA(tcp) = NULL;
897			CC_ALGO(tcp) = algo;
898
899			if (CC_ALGO(tcp)->cb_init != NULL) {
900				VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
901			}
902
903			break;
904		}
905		case TCP_CORK:
906			if (!checkonly) {
907				/*
908				 * if tcp->tcp_cork was set and is now
909				 * being unset, we have to make sure that
910				 * the remaining data gets sent out. Also
911				 * unset tcp->tcp_cork so that tcp_wput_data()
912				 * can send data even if it is less than mss
913				 */
914				if (tcp->tcp_cork && onoff == 0 &&
915				    tcp->tcp_unsent > 0) {
916					tcp->tcp_cork = B_FALSE;
917					tcp_wput_data(tcp, NULL, B_FALSE);
918				}
919				tcp->tcp_cork = onoff;
920			}
921			break;
922		case TCP_RTO_INITIAL:
923			if (checkonly || val == 0)
924				break;
925
926			/*
927			 * Sanity checks
928			 *
929			 * The initial RTO should be bounded by the minimum
930			 * and maximum RTO.  And it should also be smaller
931			 * than the connect attempt abort timeout.  Otherwise,
932			 * the connection won't be aborted in a period
933			 * reasonably close to that timeout.
934			 */
935			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
936			    val > tcp->tcp_second_ctimer_threshold ||
937			    val < tcps->tcps_rexmit_interval_initial_low ||
938			    val > tcps->tcps_rexmit_interval_initial_high) {
939				*outlenp = 0;
940				return (EINVAL);
941			}
942			tcp->tcp_rto_initial = val;
943
944			/*
945			 * If TCP has not sent anything, need to re-calculate
946			 * tcp_rto.  Otherwise, this option change does not
947			 * really affect anything.
948			 */
949			if (tcp->tcp_state >= TCPS_SYN_SENT)
950				break;
951
952			tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
953			tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
954			tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
955			    tcps->tcps_conn_grace_period);
956			break;
957		case TCP_RTO_MIN:
958			if (checkonly || val == 0)
959				break;
960
961			if (val < tcps->tcps_rexmit_interval_min_low ||
962			    val > tcps->tcps_rexmit_interval_min_high ||
963			    val > tcp->tcp_rto_max) {
964				*outlenp = 0;
965				return (EINVAL);
966			}
967			tcp->tcp_rto_min = val;
968			if (tcp->tcp_rto < val)
969				tcp->tcp_rto = val;
970			break;
971		case TCP_RTO_MAX:
972			if (checkonly || val == 0)
973				break;
974
975			/*
976			 * Sanity checks
977			 *
978			 * The maximum RTO should not be larger than the
979			 * connection abort timeout.  Otherwise, the
980			 * connection won't be aborted in a period reasonably
981			 * close to that timeout.
982			 */
983			if (val < tcps->tcps_rexmit_interval_max_low ||
984			    val > tcps->tcps_rexmit_interval_max_high ||
985			    val < tcp->tcp_rto_min ||
986			    val > tcp->tcp_second_timer_threshold) {
987				*outlenp = 0;
988				return (EINVAL);
989			}
990			tcp->tcp_rto_max = val;
991			if (tcp->tcp_rto > val)
992				tcp->tcp_rto = val;
993			break;
994		case TCP_LINGER2:
995			if (checkonly || *i1 == 0)
996				break;
997
998			/*
999			 * Note that the option value's unit is second.  And
1000			 * the value should be bigger than the private
1001			 * parameter tcp_fin_wait_2_flush_interval's lower
1002			 * bound and smaller than the current value of that
1003			 * parameter.  It should be smaller than the current
1004			 * value to avoid an app setting TCP_LINGER2 to a big
1005			 * value, causing resource to be held up too long in
1006			 * FIN-WAIT-2 state.
1007			 */
1008			if (*i1 < 0 ||
1009			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
1010			    *i1 ||
1011			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
1012			    *i1) {
1013				*outlenp = 0;
1014				return (EINVAL);
1015			}
1016			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
1017			break;
1018		default:
1019			break;
1020		}
1021		break;
1022	case IPPROTO_IP:
1023		if (connp->conn_family != AF_INET) {
1024			*outlenp = 0;
1025			return (EINVAL);
1026		}
1027		switch (name) {
1028		case IP_SEC_OPT:
1029			/*
1030			 * We should not allow policy setting after
1031			 * we start listening for connections.
1032			 */
1033			if (tcp->tcp_state == TCPS_LISTEN) {
1034				return (EINVAL);
1035			}
1036			break;
1037		case IP_RECVTOS:
1038			if (!checkonly) {
1039				/*
1040				 * Force it to be sent up with the next msg
1041				 * by setting it to a value which cannot
1042				 * appear in a packet (TOS is only 8-bits)
1043				 */
1044				tcp->tcp_recvtos = 0xffffffffU;
1045			}
1046			break;
1047		}
1048		break;
1049	case IPPROTO_IPV6:
1050		/*
1051		 * IPPROTO_IPV6 options are only supported for sockets
1052		 * that are using IPv6 on the wire.
1053		 */
1054		if (connp->conn_ipversion != IPV6_VERSION) {
1055			*outlenp = 0;
1056			return (EINVAL);
1057		}
1058
1059		switch (name) {
1060		case IPV6_RECVPKTINFO:
1061			if (!checkonly) {
1062				/* Force it to be sent up with the next msg */
1063				tcp->tcp_recvifindex = 0;
1064			}
1065			break;
1066		case IPV6_RECVTCLASS:
1067			if (!checkonly) {
1068				/* Force it to be sent up with the next msg */
1069				tcp->tcp_recvtclass = 0xffffffffU;
1070			}
1071			break;
1072		case IPV6_RECVHOPLIMIT:
1073			if (!checkonly) {
1074				/* Force it to be sent up with the next msg */
1075				tcp->tcp_recvhops = 0xffffffffU;
1076			}
1077			break;
1078		case IPV6_PKTINFO:
1079			/* This is an extra check for TCP */
1080			if (inlen == sizeof (struct in6_pktinfo)) {
1081				struct in6_pktinfo *pkti;
1082
1083				pkti = (struct in6_pktinfo *)invalp;
1084				/*
1085				 * RFC 3542 states that ipi6_addr must be
1086				 * the unspecified address when setting the
1087				 * IPV6_PKTINFO sticky socket option on a
1088				 * TCP socket.
1089				 */
1090				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1091					return (EINVAL);
1092			}
1093			break;
1094		case IPV6_SEC_OPT:
1095			/*
1096			 * We should not allow policy setting after
1097			 * we start listening for connections.
1098			 */
1099			if (tcp->tcp_state == TCPS_LISTEN) {
1100				return (EINVAL);
1101			}
1102			break;
1103		}
1104		break;
1105	}
1106	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1107	    checkonly, cr);
1108	if (reterr != 0) {
1109		*outlenp = 0;
1110		return (reterr);
1111	}
1112
1113	/*
1114	 * Common case of OK return with outval same as inval
1115	 */
1116	if (invalp != outvalp) {
1117		/* don't trust bcopy for identical src/dst */
1118		(void) bcopy(invalp, outvalp, inlen);
1119	}
1120	*outlenp = inlen;
1121
1122	if (coas.coa_changed & COA_HEADER_CHANGED) {
1123		/* If we are connected we rebuild the headers */
1124		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1125		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1126			reterr = tcp_build_hdrs(tcp);
1127			if (reterr != 0)
1128				return (reterr);
1129		}
1130	}
1131	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1132		in6_addr_t nexthop;
1133
1134		/*
1135		 * If we are connected we re-cache the information.
1136		 * We ignore errors to preserve BSD behavior.
1137		 * Note that we don't redo IPsec policy lookup here
1138		 * since the final destination (or source) didn't change.
1139		 */
1140		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1141		    &connp->conn_faddr_v6, &nexthop);
1142
1143		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1144		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1145			(void) ip_attr_connect(connp, connp->conn_ixa,
1146			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1147			    &nexthop, connp->conn_fport, NULL, NULL,
1148			    IPDF_VERIFY_DST);
1149		}
1150	}
1151	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1152		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1153	}
1154	if (coas.coa_changed & COA_WROFF_CHANGED) {
1155		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1156		    tcps->tcps_wroff_xtra;
1157		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1158		    connp->conn_wroff);
1159	}
1160	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1161		if (IPCL_IS_NONSTR(connp))
1162			proto_set_rx_oob_opt(connp, onoff);
1163	}
1164	return (0);
1165}
1166