1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26/* This file contains all TCP kernel socket related functions. */
27
28#include <sys/types.h>
29#include <sys/strlog.h>
30#include <sys/policy.h>
31#include <sys/sockio.h>
32#include <sys/strsubr.h>
33#include <sys/strsun.h>
34#include <sys/squeue_impl.h>
35#include <sys/squeue.h>
36#define	_SUN_TPI_VERSION 2
37#include <sys/tihdr.h>
38#include <sys/timod.h>
39#include <sys/tpicommon.h>
40#include <sys/socketvar.h>
41
42#include <inet/common.h>
43#include <inet/proto_set.h>
44#include <inet/ip.h>
45#include <inet/tcp.h>
46#include <inet/tcp_impl.h>
47
48static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
49		    sock_upcalls_t *, int, cred_t *);
50static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
51		    sock_upper_handle_t, cred_t *);
52static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
53		    socklen_t, cred_t *);
54static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
55static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
56		    socklen_t, sock_connid_t *, cred_t *);
57static int	tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
58		    socklen_t *, cred_t *);
59static int	tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
60		    socklen_t *, cred_t *);
61static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
62		    socklen_t *, cred_t *);
63static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
64		    socklen_t, cred_t *);
65static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
66		    cred_t *);
67static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
68static void	tcp_clr_flowctrl(sock_lower_handle_t);
69static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
70		    cred_t *);
71static int	tcp_close(sock_lower_handle_t, int, cred_t *);
72
73sock_downcalls_t sock_tcp_downcalls = {
74	tcp_activate,
75	tcp_accept,
76	tcp_bind,
77	tcp_listen,
78	tcp_connect,
79	tcp_getpeername,
80	tcp_getsockname,
81	tcp_getsockopt,
82	tcp_setsockopt,
83	tcp_sendmsg,
84	NULL,
85	NULL,
86	NULL,
87	tcp_shutdown,
88	tcp_clr_flowctrl,
89	tcp_ioctl,
90	tcp_close,
91};
92
93/* ARGSUSED */
94static void
95tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
96    sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
97{
98	conn_t *connp = (conn_t *)proto_handle;
99	struct sock_proto_props sopp;
100	extern struct module_info tcp_rinfo;
101
102	ASSERT(connp->conn_upper_handle == NULL);
103
104	/* All Solaris components should pass a cred for this operation. */
105	ASSERT(cr != NULL);
106
107	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
108	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
109	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
110
111	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
112	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
113	sopp.sopp_maxpsz = INFPSZ;
114	sopp.sopp_maxblk = INFPSZ;
115	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
116	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
117	sopp.sopp_maxaddrlen = sizeof (sin6_t);
118	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
119	    tcp_rinfo.mi_minpsz;
120
121	connp->conn_upcalls = sock_upcalls;
122	connp->conn_upper_handle = sock_handle;
123
124	ASSERT(connp->conn_rcvbuf != 0 &&
125	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
126	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
127}
128
129/*ARGSUSED*/
130static int
131tcp_accept(sock_lower_handle_t lproto_handle,
132    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
133    cred_t *cr)
134{
135	conn_t *lconnp, *econnp;
136	tcp_t *listener, *eager;
137
138	/*
139	 * KSSL can move a socket from one listener to another, in which
140	 * case `lproto_handle' points to the new listener. To ensure that
141	 * the original listener is used the information is obtained from
142	 * the eager.
143	 */
144	econnp = (conn_t *)eproto_handle;
145	eager = econnp->conn_tcp;
146	ASSERT(IPCL_IS_NONSTR(econnp));
147	ASSERT(eager->tcp_listener != NULL);
148	listener = eager->tcp_listener;
149	lconnp = (conn_t *)listener->tcp_connp;
150	ASSERT(listener->tcp_state == TCPS_LISTEN);
151	ASSERT(lconnp->conn_upper_handle != NULL);
152
153	/*
154	 * It is possible for the accept thread to race with the thread that
155	 * made the su_newconn upcall in tcp_newconn_notify. Both
156	 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
157	 * and conn_upcalls be set before returning, so they both write to
158	 * them. However, we're guaranteed that the value written is the same
159	 * for both threads.
160	 */
161	ASSERT(econnp->conn_upper_handle == NULL ||
162	    econnp->conn_upper_handle == sock_handle);
163	ASSERT(econnp->conn_upcalls == NULL ||
164	    econnp->conn_upcalls == lconnp->conn_upcalls);
165	econnp->conn_upper_handle = sock_handle;
166	econnp->conn_upcalls = lconnp->conn_upcalls;
167
168	ASSERT(econnp->conn_netstack ==
169	    listener->tcp_connp->conn_netstack);
170	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
171
172	/*
173	 * We should have a minimum of 2 references on the conn at this
174	 * point. One for TCP and one for the newconn notification
175	 * (which is now taken over by IP). In the normal case we would
176	 * also have another reference (making a total of 3) for the conn
177	 * being in the classifier hash list. However the eager could have
178	 * received an RST subsequently and tcp_closei_local could have
179	 * removed the eager from the classifier hash list, hence we can't
180	 * assert that reference.
181	 */
182	ASSERT(econnp->conn_ref >= 2);
183
184	mutex_enter(&listener->tcp_eager_lock);
185	/*
186	 * Non-STREAMS listeners never defer the notification of new
187	 * connections.
188	 */
189	ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
190	tcp_eager_unlink(eager);
191	mutex_exit(&listener->tcp_eager_lock);
192	CONN_DEC_REF(listener->tcp_connp);
193
194	return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
195}
196
197static int
198tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
199    socklen_t len, cred_t *cr)
200{
201	int 		error;
202	conn_t		*connp = (conn_t *)proto_handle;
203
204	/* All Solaris components should pass a cred for this operation. */
205	ASSERT(cr != NULL);
206	ASSERT(connp->conn_upper_handle != NULL);
207
208	error = squeue_synch_enter(connp, NULL);
209	if (error != 0) {
210		/* failed to enter */
211		return (ENOSR);
212	}
213
214	/* binding to a NULL address really means unbind */
215	if (sa == NULL) {
216		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
217			error = tcp_do_unbind(connp);
218		else
219			error = EINVAL;
220	} else {
221		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
222	}
223
224	squeue_synch_exit(connp);
225
226	if (error < 0) {
227		if (error == -TOUTSTATE)
228			error = EINVAL;
229		else
230			error = proto_tlitosyserr(-error);
231	}
232
233	return (error);
234}
235
236/* ARGSUSED */
237static int
238tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
239{
240	conn_t	*connp = (conn_t *)proto_handle;
241	tcp_t	*tcp = connp->conn_tcp;
242	int 	error;
243
244	ASSERT(connp->conn_upper_handle != NULL);
245
246	/* All Solaris components should pass a cred for this operation. */
247	ASSERT(cr != NULL);
248
249	error = squeue_synch_enter(connp, NULL);
250	if (error != 0) {
251		/* failed to enter */
252		return (ENOBUFS);
253	}
254
255	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
256	if (error == 0) {
257		/*
258		 * sockfs needs to know what's the maximum number of socket
259		 * that can be queued on the listener.
260		 */
261		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
262		    SOCK_OPCTL_ENAB_ACCEPT,
263		    (uintptr_t)(tcp->tcp_conn_req_max +
264		    tcp->tcp_tcps->tcps_conn_req_max_q0));
265	} else if (error < 0) {
266		if (error == -TOUTSTATE)
267			error = EINVAL;
268		else
269			error = proto_tlitosyserr(-error);
270	}
271	squeue_synch_exit(connp);
272	return (error);
273}
274
275static int
276tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
277    socklen_t len, sock_connid_t *id, cred_t *cr)
278{
279	conn_t		*connp = (conn_t *)proto_handle;
280	int		error;
281
282	ASSERT(connp->conn_upper_handle != NULL);
283
284	/* All Solaris components should pass a cred for this operation. */
285	ASSERT(cr != NULL);
286
287	error = proto_verify_ip_addr(connp->conn_family, sa, len);
288	if (error != 0) {
289		return (error);
290	}
291
292	error = squeue_synch_enter(connp, NULL);
293	if (error != 0) {
294		/* failed to enter */
295		return (ENOSR);
296	}
297
298	/*
299	 * TCP supports quick connect, so no need to do an implicit bind
300	 */
301	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
302	if (error == 0) {
303		*id = connp->conn_tcp->tcp_connid;
304	} else if (error < 0) {
305		if (error == -TOUTSTATE) {
306			switch (connp->conn_tcp->tcp_state) {
307			case TCPS_SYN_SENT:
308				error = EALREADY;
309				break;
310			case TCPS_ESTABLISHED:
311				error = EISCONN;
312				break;
313			case TCPS_LISTEN:
314				error = EOPNOTSUPP;
315				break;
316			default:
317				error = EINVAL;
318				break;
319			}
320		} else {
321			error = proto_tlitosyserr(-error);
322		}
323	}
324
325	if (connp->conn_tcp->tcp_loopback) {
326		struct sock_proto_props sopp;
327
328		sopp.sopp_flags = SOCKOPT_LOOPBACK;
329		sopp.sopp_loopback = B_TRUE;
330
331		(*connp->conn_upcalls->su_set_proto_props)(
332		    connp->conn_upper_handle, &sopp);
333	}
334done:
335	squeue_synch_exit(connp);
336
337	return ((error == 0) ? EINPROGRESS : error);
338}
339
340/* ARGSUSED3 */
341static int
342tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
343    socklen_t *addrlenp, cred_t *cr)
344{
345	conn_t	*connp = (conn_t *)proto_handle;
346	tcp_t	*tcp = connp->conn_tcp;
347
348	/* All Solaris components should pass a cred for this operation. */
349	ASSERT(cr != NULL);
350
351	ASSERT(tcp != NULL);
352	if (tcp->tcp_state < TCPS_SYN_RCVD)
353		return (ENOTCONN);
354
355	return (conn_getpeername(connp, addr, addrlenp));
356}
357
358/* ARGSUSED3 */
359static int
360tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
361    socklen_t *addrlenp, cred_t *cr)
362{
363	conn_t	*connp = (conn_t *)proto_handle;
364
365	/* All Solaris components should pass a cred for this operation. */
366	ASSERT(cr != NULL);
367
368	return (conn_getsockname(connp, addr, addrlenp));
369}
370
371/* returns UNIX error, the optlen is a value-result arg */
372static int
373tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
374    void *optvalp, socklen_t *optlen, cred_t *cr)
375{
376	conn_t		*connp = (conn_t *)proto_handle;
377	int		error;
378	t_uscalar_t	max_optbuf_len;
379	void		*optvalp_buf;
380	int		len;
381
382	ASSERT(connp->conn_upper_handle != NULL);
383
384	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
385	    tcp_opt_obj.odb_opt_des_arr,
386	    tcp_opt_obj.odb_opt_arr_cnt,
387	    B_FALSE, B_TRUE, cr);
388	if (error != 0) {
389		if (error < 0) {
390			error = proto_tlitosyserr(-error);
391		}
392		return (error);
393	}
394
395	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
396
397	error = squeue_synch_enter(connp, NULL);
398	if (error == ENOMEM) {
399		kmem_free(optvalp_buf, max_optbuf_len);
400		return (ENOMEM);
401	}
402
403	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
404	squeue_synch_exit(connp);
405
406	if (len == -1) {
407		kmem_free(optvalp_buf, max_optbuf_len);
408		return (EINVAL);
409	}
410
411	/*
412	 * update optlen and copy option value
413	 */
414	t_uscalar_t size = MIN(len, *optlen);
415
416	bcopy(optvalp_buf, optvalp, size);
417	bcopy(&size, optlen, sizeof (size));
418
419	kmem_free(optvalp_buf, max_optbuf_len);
420	return (0);
421}
422
423static int
424tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
425    const void *optvalp, socklen_t optlen, cred_t *cr)
426{
427	conn_t		*connp = (conn_t *)proto_handle;
428	int		error;
429
430	ASSERT(connp->conn_upper_handle != NULL);
431	/*
432	 * Entering the squeue synchronously can result in a context switch,
433	 * which can cause a rather sever performance degradation. So we try to
434	 * handle whatever options we can without entering the squeue.
435	 */
436	if (level == IPPROTO_TCP) {
437		switch (option_name) {
438		case TCP_NODELAY:
439			if (optlen != sizeof (int32_t))
440				return (EINVAL);
441			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
442			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
443			    connp->conn_tcp->tcp_mss;
444			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
445			return (0);
446		default:
447			break;
448		}
449	}
450
451	error = squeue_synch_enter(connp, NULL);
452	if (error == ENOMEM) {
453		return (ENOMEM);
454	}
455
456	error = proto_opt_check(level, option_name, optlen, NULL,
457	    tcp_opt_obj.odb_opt_des_arr,
458	    tcp_opt_obj.odb_opt_arr_cnt,
459	    B_TRUE, B_FALSE, cr);
460
461	if (error != 0) {
462		if (error < 0) {
463			error = proto_tlitosyserr(-error);
464		}
465		squeue_synch_exit(connp);
466		return (error);
467	}
468
469	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
470	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
471	    NULL, cr);
472	squeue_synch_exit(connp);
473
474	ASSERT(error >= 0);
475
476	return (error);
477}
478
479/* ARGSUSED */
480static int
481tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
482    cred_t *cr)
483{
484	tcp_t		*tcp;
485	uint32_t	msize;
486	conn_t *connp = (conn_t *)proto_handle;
487	int32_t		tcpstate;
488
489	/* All Solaris components should pass a cred for this operation. */
490	ASSERT(cr != NULL);
491
492	ASSERT(connp->conn_ref >= 2);
493	ASSERT(connp->conn_upper_handle != NULL);
494
495	if (msg->msg_controllen != 0) {
496		freemsg(mp);
497		return (EOPNOTSUPP);
498	}
499
500	switch (DB_TYPE(mp)) {
501	case M_DATA:
502		tcp = connp->conn_tcp;
503		ASSERT(tcp != NULL);
504
505		tcpstate = tcp->tcp_state;
506		if (tcpstate < TCPS_ESTABLISHED) {
507			freemsg(mp);
508			/*
509			 * We return ENOTCONN if the endpoint is trying to
510			 * connect or has never been connected, and EPIPE if it
511			 * has been disconnected. The connection id helps us
512			 * distinguish between the last two cases.
513			 */
514			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
515			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
516		} else if (tcpstate > TCPS_CLOSE_WAIT) {
517			freemsg(mp);
518			return (EPIPE);
519		}
520
521		msize = msgdsize(mp);
522
523		mutex_enter(&tcp->tcp_non_sq_lock);
524		tcp->tcp_squeue_bytes += msize;
525		/*
526		 * Squeue Flow Control
527		 */
528		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
529			tcp_setqfull(tcp);
530		}
531		mutex_exit(&tcp->tcp_non_sq_lock);
532
533		/*
534		 * The application may pass in an address in the msghdr, but
535		 * we ignore the address on connection-oriented sockets.
536		 * Just like BSD this code does not generate an error for
537		 * TCP (a CONNREQUIRED socket) when sending to an address
538		 * passed in with sendto/sendmsg. Instead the data is
539		 * delivered on the connection as if no address had been
540		 * supplied.
541		 */
542		CONN_INC_REF(connp);
543
544		if (msg->msg_flags & MSG_OOB) {
545			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
546			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
547		} else {
548			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
549			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
550		}
551
552		return (0);
553
554	default:
555		ASSERT(0);
556	}
557
558	freemsg(mp);
559	return (0);
560}
561
562/* ARGSUSED */
563static int
564tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
565{
566	conn_t  *connp = (conn_t *)proto_handle;
567	tcp_t   *tcp = connp->conn_tcp;
568
569	ASSERT(connp->conn_upper_handle != NULL);
570
571	/* All Solaris components should pass a cred for this operation. */
572	ASSERT(cr != NULL);
573
574	/*
575	 * X/Open requires that we check the connected state.
576	 */
577	if (tcp->tcp_state < TCPS_SYN_SENT)
578		return (ENOTCONN);
579
580	/* shutdown the send side */
581	if (how != SHUT_RD) {
582		mblk_t *bp;
583
584		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
585		CONN_INC_REF(connp);
586		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
587		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
588
589		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
590		    SOCK_OPCTL_SHUT_SEND, 0);
591	}
592
593	/* shutdown the recv side */
594	if (how != SHUT_WR)
595		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
596		    SOCK_OPCTL_SHUT_RECV, 0);
597
598	return (0);
599}
600
601static void
602tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
603{
604	conn_t  *connp = (conn_t *)proto_handle;
605	tcp_t	*tcp = connp->conn_tcp;
606	mblk_t *mp;
607	int error;
608
609	ASSERT(connp->conn_upper_handle != NULL);
610
611	/*
612	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
613	 * is currently running.
614	 */
615	mutex_enter(&tcp->tcp_rsrv_mp_lock);
616	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
617		mutex_exit(&tcp->tcp_rsrv_mp_lock);
618		return;
619	}
620	tcp->tcp_rsrv_mp = NULL;
621	mutex_exit(&tcp->tcp_rsrv_mp_lock);
622
623	error = squeue_synch_enter(connp, mp);
624	ASSERT(error == 0);
625
626	mutex_enter(&tcp->tcp_rsrv_mp_lock);
627	tcp->tcp_rsrv_mp = mp;
628	mutex_exit(&tcp->tcp_rsrv_mp_lock);
629
630	if (tcp->tcp_fused) {
631		tcp_fuse_backenable(tcp);
632	} else {
633		tcp->tcp_rwnd = connp->conn_rcvbuf;
634		/*
635		 * Send back a window update immediately if TCP is above
636		 * ESTABLISHED state and the increase of the rcv window
637		 * that the other side knows is at least 1 MSS after flow
638		 * control is lifted.
639		 */
640		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
641		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
642			tcp_xmit_ctl(NULL, tcp,
643			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
644			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
645		}
646	}
647
648	squeue_synch_exit(connp);
649}
650
651/* ARGSUSED */
652static int
653tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
654    int mode, int32_t *rvalp, cred_t *cr)
655{
656	conn_t  	*connp = (conn_t *)proto_handle;
657	int		error;
658
659	ASSERT(connp->conn_upper_handle != NULL);
660
661	/* All Solaris components should pass a cred for this operation. */
662	ASSERT(cr != NULL);
663
664	/*
665	 * If we don't have a helper stream then create one.
666	 * ip_create_helper_stream takes care of locking the conn_t,
667	 * so this check for NULL is just a performance optimization.
668	 */
669	if (connp->conn_helper_info == NULL) {
670		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
671
672		/*
673		 * Create a helper stream for non-STREAMS socket.
674		 */
675		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
676		if (error != 0) {
677			ip0dbg(("tcp_ioctl: create of IP helper stream "
678			    "failed %d\n", error));
679			return (error);
680		}
681	}
682
683	switch (cmd) {
684		case ND_SET:
685		case ND_GET:
686		case _SIOCSOCKFALLBACK:
687		case TCP_IOC_ABORT_CONN:
688		case TI_GETPEERNAME:
689		case TI_GETMYNAME:
690			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
691			    cmd));
692			error = EINVAL;
693			break;
694		default:
695			/*
696			 * If the conn is not closing, pass on to IP using
697			 * helper stream. Bump the ioctlref to prevent tcp_close
698			 * from closing the rq/wq out from underneath the ioctl
699			 * if it ends up queued or aborted/interrupted.
700			 */
701			mutex_enter(&connp->conn_lock);
702			if (connp->conn_state_flags & (CONN_CLOSING)) {
703				mutex_exit(&connp->conn_lock);
704				error = EINVAL;
705				break;
706			}
707			CONN_INC_IOCTLREF_LOCKED(connp);
708			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
709			    cmd, arg, mode, cr, rvalp);
710			CONN_DEC_IOCTLREF(connp);
711			break;
712	}
713	return (error);
714}
715
716/* ARGSUSED */
717static int
718tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
719{
720	conn_t *connp = (conn_t *)proto_handle;
721
722	ASSERT(connp->conn_upper_handle != NULL);
723
724	/* All Solaris components should pass a cred for this operation. */
725	ASSERT(cr != NULL);
726
727	tcp_close_common(connp, flags);
728
729	ip_free_helper_stream(connp);
730
731	/*
732	 * Drop IP's reference on the conn. This is the last reference
733	 * on the connp if the state was less than established. If the
734	 * connection has gone into timewait state, then we will have
735	 * one ref for the TCP and one more ref (total of two) for the
736	 * classifier connected hash list (a timewait connections stays
737	 * in connected hash till closed).
738	 *
739	 * We can't assert the references because there might be other
740	 * transient reference places because of some walkers or queued
741	 * packets in squeue for the timewait state.
742	 */
743	CONN_DEC_REF(connp);
744
745	/*
746	 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
747	 * freeing the socket.
748	 */
749	return (EINPROGRESS);
750}
751
752/* ARGSUSED */
753sock_lower_handle_t
754tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
755    uint_t *smodep, int *errorp, int flags, cred_t *credp)
756{
757	conn_t		*connp;
758	boolean_t	isv6 = family == AF_INET6;
759
760	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
761	    (proto != 0 && proto != IPPROTO_TCP)) {
762		*errorp = EPROTONOSUPPORT;
763		return (NULL);
764	}
765
766	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
767	if (connp == NULL) {
768		return (NULL);
769	}
770
771	/*
772	 * Put the ref for TCP. Ref for IP was already put
773	 * by ipcl_conn_create. Also make the conn_t globally
774	 * visible to walkers
775	 */
776	mutex_enter(&connp->conn_lock);
777	CONN_INC_REF_LOCKED(connp);
778	ASSERT(connp->conn_ref == 2);
779	connp->conn_state_flags &= ~CONN_INCIPIENT;
780
781	connp->conn_flags |= IPCL_NONSTR;
782	mutex_exit(&connp->conn_lock);
783
784	ASSERT(errorp != NULL);
785	*errorp = 0;
786	*sock_downcalls = &sock_tcp_downcalls;
787	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
788	    SM_SENDFILESUPP;
789
790	return ((sock_lower_handle_t)connp);
791}
792
793/*
794 * tcp_fallback
795 *
796 * A direct socket is falling back to using STREAMS. The queue
797 * that is being passed down was created using tcp_open() with
798 * the SO_FALLBACK flag set. As a result, the queue is not
799 * associated with a conn, and the q_ptrs instead contain the
800 * dev and minor area that should be used.
801 *
802 * The 'issocket' flag indicates whether the FireEngine
803 * optimizations should be used. The common case would be that
804 * optimizations are enabled, and they might be subsequently
805 * disabled using the _SIOCSOCKFALLBACK ioctl.
806 */
807
808/*
809 * An active connection is falling back to TPI. Gather all the information
810 * required by the STREAM head and TPI sonode and send it up.
811 */
812static void
813tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
814    boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
815    sock_quiesce_arg_t *arg)
816{
817	conn_t			*connp = tcp->tcp_connp;
818	struct stroptions	*stropt;
819	struct T_capability_ack tca;
820	struct sockaddr_in6	laddr, faddr;
821	socklen_t 		laddrlen, faddrlen;
822	short			opts;
823	int			error;
824	mblk_t			*mp, *mpnext;
825
826	connp->conn_dev = (dev_t)RD(q)->q_ptr;
827	connp->conn_minor_arena = WR(q)->q_ptr;
828
829	RD(q)->q_ptr = WR(q)->q_ptr = connp;
830
831	connp->conn_rq = RD(q);
832	connp->conn_wq = WR(q);
833
834	WR(q)->q_qinfo = &tcp_sock_winit;
835
836	if (!issocket)
837		tcp_use_pure_tpi(tcp);
838
839	/*
840	 * free the helper stream
841	 */
842	ip_free_helper_stream(connp);
843
844	/*
845	 * Notify the STREAM head about options
846	 */
847	DB_TYPE(stropt_mp) = M_SETOPTS;
848	stropt = (struct stroptions *)stropt_mp->b_rptr;
849	stropt_mp->b_wptr += sizeof (struct stroptions);
850	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
851
852	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
853	    tcp->tcp_tcps->tcps_wroff_xtra);
854	if (tcp->tcp_snd_sack_ok)
855		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
856	stropt->so_hiwat = connp->conn_rcvbuf;
857	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
858
859	putnext(RD(q), stropt_mp);
860
861	/*
862	 * Collect the information needed to sync with the sonode
863	 */
864	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
865
866	laddrlen = faddrlen = sizeof (sin6_t);
867	(void) tcp_getsockname((sock_lower_handle_t)connp,
868	    (struct sockaddr *)&laddr, &laddrlen, CRED());
869	error = tcp_getpeername((sock_lower_handle_t)connp,
870	    (struct sockaddr *)&faddr, &faddrlen, CRED());
871	if (error != 0)
872		faddrlen = 0;
873
874	opts = 0;
875	if (connp->conn_oobinline)
876		opts |= SO_OOBINLINE;
877	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
878		opts |= SO_DONTROUTE;
879
880	/*
881	 * Notify the socket that the protocol is now quiescent,
882	 * and it's therefore safe move data from the socket
883	 * to the stream head.
884	 */
885	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
886	    (struct sockaddr *)&laddr, laddrlen,
887	    (struct sockaddr *)&faddr, faddrlen, opts);
888
889	while (mp != NULL) {
890		mpnext = mp->b_next;
891		tcp->tcp_rcv_list = mp->b_next;
892		mp->b_next = NULL;
893		putnext(q, mp);
894		mp = mpnext;
895	}
896	ASSERT(tcp->tcp_rcv_last_head == NULL);
897	ASSERT(tcp->tcp_rcv_last_tail == NULL);
898	ASSERT(tcp->tcp_rcv_cnt == 0);
899
900	/*
901	 * All eagers in q0 are marked as being non-STREAM, so they will
902	 * make su_newconn upcalls when the handshake completes, which
903	 * will fail (resulting in the conn being closed). So we just blow
904	 * off everything in q0 instead of waiting for the inevitable.
905	 */
906	if (tcp->tcp_conn_req_cnt_q0 != 0)
907		tcp_eager_cleanup(tcp, B_TRUE);
908}
909
910/*
911 * An eager is falling back to TPI. All we have to do is send
912 * up a T_CONN_IND.
913 */
914static void
915tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
916    so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
917{
918	conn_t *connp = eager->tcp_connp;
919	tcp_t *listener = eager->tcp_listener;
920	mblk_t *mp;
921
922	ASSERT(listener != NULL);
923
924	/*
925	 * Notify the socket that the protocol is now quiescent,
926	 * and it's therefore safe move data from the socket
927	 * to tcp's rcv queue.
928	 */
929	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
930	    NULL, 0, 0);
931
932	if (mp != NULL) {
933		ASSERT(eager->tcp_rcv_cnt == 0);
934
935		eager->tcp_rcv_list = mp;
936		eager->tcp_rcv_cnt = msgdsize(mp);
937		while (mp->b_next != NULL) {
938			mp = mp->b_next;
939			eager->tcp_rcv_cnt += msgdsize(mp);
940		}
941		eager->tcp_rcv_last_head = mp;
942		while (mp->b_cont)
943			mp = mp->b_cont;
944		eager->tcp_rcv_last_tail = mp;
945		if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
946			eager->tcp_rwnd = 0;
947		else
948			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
949	}
950
951	if (!issocket)
952		eager->tcp_issocket = B_FALSE;
953	/*
954	 * The stream for this eager does not yet exist, so mark it as
955	 * being detached.
956	 */
957	eager->tcp_detached = B_TRUE;
958	eager->tcp_hard_binding = B_TRUE;
959	connp->conn_rq = listener->tcp_connp->conn_rq;
960	connp->conn_wq = listener->tcp_connp->conn_wq;
961
962	/* Send up the connection indication */
963	mp = eager->tcp_conn.tcp_eager_conn_ind;
964	ASSERT(mp != NULL);
965	eager->tcp_conn.tcp_eager_conn_ind = NULL;
966
967	/*
968	 * TLI/XTI applications will get confused by
969	 * sending eager as an option since it violates
970	 * the option semantics. So remove the eager as
971	 * option since TLI/XTI app doesn't need it anyway.
972	 */
973	if (!issocket) {
974		struct T_conn_ind *conn_ind;
975
976		conn_ind = (struct T_conn_ind *)mp->b_rptr;
977		conn_ind->OPT_length = 0;
978		conn_ind->OPT_offset = 0;
979	}
980
981	/*
982	 * Sockfs guarantees that the listener will not be closed
983	 * during fallback. So we can safely use the listener's queue.
984	 */
985	putnext(listener->tcp_connp->conn_rq, mp);
986}
987
988
989int
990tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
991    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
992    sock_quiesce_arg_t *arg)
993{
994	tcp_t			*tcp;
995	conn_t 			*connp = (conn_t *)proto_handle;
996	int			error;
997	mblk_t			*stropt_mp;
998	mblk_t			*ordrel_mp;
999
1000	tcp = connp->conn_tcp;
1001
1002	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1003	    NULL);
1004
1005	/* Pre-allocate the T_ordrel_ind mblk. */
1006	ASSERT(tcp->tcp_ordrel_mp == NULL);
1007	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1008	    STR_NOSIG, NULL);
1009	ordrel_mp->b_datap->db_type = M_PROTO;
1010	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1011	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1012
1013	/*
1014	 * Enter the squeue so that no new packets can come in
1015	 */
1016	error = squeue_synch_enter(connp, NULL);
1017	if (error != 0) {
1018		/* failed to enter, free all the pre-allocated messages. */
1019		freeb(stropt_mp);
1020		freeb(ordrel_mp);
1021		return (ENOMEM);
1022	}
1023
1024	/*
1025	 * Both endpoints must be of the same type (either STREAMS or
1026	 * non-STREAMS) for fusion to be enabled. So if we are fused,
1027	 * we have to unfuse.
1028	 */
1029	if (tcp->tcp_fused)
1030		tcp_unfuse(tcp);
1031
1032	if (tcp->tcp_listener != NULL) {
1033		/* The eager will deal with opts when accept() is called */
1034		freeb(stropt_mp);
1035		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1036	} else {
1037		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1038		    quiesced_cb, arg);
1039	}
1040
1041	/*
1042	 * No longer a direct socket
1043	 *
1044	 * Note that we intentionally leave the upper_handle and upcalls
1045	 * intact, since eagers may still be using them.
1046	 */
1047	connp->conn_flags &= ~IPCL_NONSTR;
1048	tcp->tcp_ordrel_mp = ordrel_mp;
1049
1050	/*
1051	 * There should be atleast two ref's (IP + TCP)
1052	 */
1053	ASSERT(connp->conn_ref >= 2);
1054	squeue_synch_exit(connp);
1055
1056	return (0);
1057}
1058
1059/*
1060 * Notifies a non-STREAMS based listener about a new connection. This
1061 * function is executed on the *eager*'s squeue once the 3 way handshake
1062 * has completed. Note that the behavior differs from STREAMS, where the
1063 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1064 * squeue.
1065 *
1066 * Returns B_TRUE if the notification succeeded and an upper handle was
1067 * obtained. `tcp' should be closed on failure.
1068 */
1069boolean_t
1070tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1071{
1072	tcp_t *listener = tcp->tcp_listener;
1073	conn_t *lconnp = listener->tcp_connp;
1074	conn_t *econnp = tcp->tcp_connp;
1075	tcp_t *tail;
1076	ipaddr_t *addr_cache;
1077	sock_upper_handle_t upper;
1078	struct sock_proto_props sopp;
1079
1080	mutex_enter(&listener->tcp_eager_lock);
1081	/*
1082	 * Take the eager out, if it is in the list of droppable eagers
1083	 * as we are here because the 3W handshake is over.
1084	 */
1085	MAKE_UNDROPPABLE(tcp);
1086	/*
1087	 * The eager already has an extra ref put in tcp_input_data
1088	 * so that it stays till accept comes back even though it
1089	 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1090	 */
1091	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1092	listener->tcp_conn_req_cnt_q0--;
1093	listener->tcp_conn_req_cnt_q++;
1094
1095	/* Move from SYN_RCVD to ESTABLISHED list  */
1096	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1097	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1098	tcp->tcp_eager_prev_q0 = NULL;
1099	tcp->tcp_eager_next_q0 = NULL;
1100
1101	/*
1102	 * Insert at end of the queue because connections are accepted
1103	 * in chronological order. Leaving the older connections at front
1104	 * of the queue helps reducing search time.
1105	 */
1106	tail = listener->tcp_eager_last_q;
1107	if (tail != NULL)
1108		tail->tcp_eager_next_q = tcp;
1109	else
1110		listener->tcp_eager_next_q = tcp;
1111	listener->tcp_eager_last_q = tcp;
1112	tcp->tcp_eager_next_q = NULL;
1113
1114	/* we have timed out before */
1115	if (tcp->tcp_syn_rcvd_timeout != 0) {
1116		tcp->tcp_syn_rcvd_timeout = 0;
1117		listener->tcp_syn_rcvd_timeout--;
1118		if (listener->tcp_syn_defense &&
1119		    listener->tcp_syn_rcvd_timeout <=
1120		    (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1121		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1122		    listener->tcp_last_rcv_lbolt)) {
1123			/*
1124			 * Turn off the defense mode if we
1125			 * believe the SYN attack is over.
1126			 */
1127			listener->tcp_syn_defense = B_FALSE;
1128			if (listener->tcp_ip_addr_cache) {
1129				kmem_free((void *)listener->tcp_ip_addr_cache,
1130				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1131				listener->tcp_ip_addr_cache = NULL;
1132			}
1133		}
1134	}
1135	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1136	if (addr_cache != NULL) {
1137		/*
1138		 * We have finished a 3-way handshake with this
1139		 * remote host. This proves the IP addr is good.
1140		 * Cache it!
1141		 */
1142		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1143		    tcp->tcp_connp->conn_faddr_v4;
1144	}
1145	mutex_exit(&listener->tcp_eager_lock);
1146
1147	/*
1148	 * Notify the ULP about the newconn. It is guaranteed that no
1149	 * tcp_accept() call will be made for the eager if the
1150	 * notification fails.
1151	 */
1152	if ((upper = (*lconnp->conn_upcalls->su_newconn)
1153	    (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1154	    &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1155	    &econnp->conn_upcalls)) == NULL) {
1156		return (B_FALSE);
1157	}
1158	econnp->conn_upper_handle = upper;
1159
1160	tcp->tcp_detached = B_FALSE;
1161	tcp->tcp_hard_binding = B_FALSE;
1162	tcp->tcp_tconnind_started = B_TRUE;
1163
1164	if (econnp->conn_keepalive) {
1165		tcp->tcp_ka_last_intrvl = 0;
1166		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1167		    tcp->tcp_ka_interval);
1168	}
1169
1170	/* Update the necessary parameters */
1171	tcp_get_proto_props(tcp, &sopp);
1172
1173	(*econnp->conn_upcalls->su_set_proto_props)
1174	    (econnp->conn_upper_handle, &sopp);
1175
1176	return (B_TRUE);
1177}
1178