1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
/* This file contains all TCP TLI/TPI related functions */
27
28#include <sys/types.h>
29#include <sys/stream.h>
30#include <sys/strsun.h>
31#include <sys/strsubr.h>
32#include <sys/stropts.h>
33#include <sys/strlog.h>
34#define	_SUN_TPI_VERSION 2
35#include <sys/tihdr.h>
36#include <sys/suntpi.h>
37#include <sys/xti_inet.h>
38#include <sys/squeue_impl.h>
39#include <sys/squeue.h>
40
41#include <inet/common.h>
42#include <inet/ip.h>
43#include <inet/tcp.h>
44#include <inet/tcp_impl.h>
45#include <inet/proto_set.h>
46
47static void	tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
48static int	tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
49
50void
51tcp_use_pure_tpi(tcp_t *tcp)
52{
53	conn_t		*connp = tcp->tcp_connp;
54
55#ifdef	_ILP32
56	tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
57#else
58	tcp->tcp_acceptor_id = connp->conn_dev;
59#endif
60	/*
61	 * Insert this socket into the acceptor hash.
62	 * We might need it for T_CONN_RES message
63	 */
64	tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
65
66	tcp->tcp_issocket = B_FALSE;
67	TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
68}
69
70/* Shorthand to generate and send TPI error acks to our client */
71void
72tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
73{
74	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
75		putnext(tcp->tcp_connp->conn_rq, mp);
76}
77
78/* Shorthand to generate and send TPI error acks to our client */
79void
80tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
81    int t_error, int sys_error)
82{
83	struct T_error_ack	*teackp;
84
85	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
86	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
87		teackp = (struct T_error_ack *)mp->b_rptr;
88		teackp->ERROR_prim = primitive;
89		teackp->TLI_error = t_error;
90		teackp->UNIX_error = sys_error;
91		putnext(tcp->tcp_connp->conn_rq, mp);
92	}
93}
94
95/*
96 * TCP routine to get the values of options.
97 */
98int
99tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
100{
101	return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
102}
103
104/* ARGSUSED */
105int
106tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
107    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
108    void *thisdg_attrs, cred_t *cr)
109{
110	conn_t	*connp =  Q_TO_CONN(q);
111
112	return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
113	    outlenp, outvalp, thisdg_attrs, cr));
114}
115
116static int
117tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
118    int *t_errorp, int *sys_errorp)
119{
120	int error;
121	int is_absreq_failure;
122	t_scalar_t *opt_lenp;
123	t_scalar_t opt_offset;
124	int prim_type;
125	struct T_conn_req *tcreqp;
126	struct T_conn_res *tcresp;
127	cred_t *cr;
128
129	/*
130	 * All Solaris components should pass a db_credp
131	 * for this TPI message, hence we ASSERT.
132	 * But in case there is some other M_PROTO that looks
133	 * like a TPI message sent by some other kernel
134	 * component, we check and return an error.
135	 */
136	cr = msg_getcred(mp, NULL);
137	ASSERT(cr != NULL);
138	if (cr == NULL)
139		return (-1);
140
141	prim_type = ((union T_primitives *)mp->b_rptr)->type;
142	ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
143	    prim_type == T_CONN_RES);
144
145	switch (prim_type) {
146	case T_CONN_REQ:
147		tcreqp = (struct T_conn_req *)mp->b_rptr;
148		opt_offset = tcreqp->OPT_offset;
149		opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
150		break;
151	case O_T_CONN_RES:
152	case T_CONN_RES:
153		tcresp = (struct T_conn_res *)mp->b_rptr;
154		opt_offset = tcresp->OPT_offset;
155		opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
156		break;
157	default:
158		opt_lenp = 0;
159		opt_offset = 0;
160		break;
161	}
162
163	*t_errorp = 0;
164	*sys_errorp = 0;
165	*do_disconnectp = 0;
166
167	error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
168	    opt_offset, cr, &tcp_opt_obj,
169	    NULL, &is_absreq_failure);
170
171	switch (error) {
172	case  0:		/* no error */
173		ASSERT(is_absreq_failure == 0);
174		return (0);
175	case ENOPROTOOPT:
176		*t_errorp = TBADOPT;
177		break;
178	case EACCES:
179		*t_errorp = TACCES;
180		break;
181	default:
182		*t_errorp = TSYSERR; *sys_errorp = error;
183		break;
184	}
185	if (is_absreq_failure != 0) {
186		/*
187		 * The connection request should get the local ack
188		 * T_OK_ACK and then a T_DISCON_IND.
189		 */
190		*do_disconnectp = 1;
191	}
192	return (-1);
193}
194
195void
196tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
197{
198	int	error;
199	conn_t	*connp = tcp->tcp_connp;
200	struct sockaddr	*sa;
201	mblk_t  *mp1;
202	struct T_bind_req *tbr;
203	int	backlog;
204	socklen_t	len;
205	sin_t	*sin;
206	sin6_t	*sin6;
207	cred_t		*cr;
208
209	/*
210	 * All Solaris components should pass a db_credp
211	 * for this TPI message, hence we ASSERT.
212	 * But in case there is some other M_PROTO that looks
213	 * like a TPI message sent by some other kernel
214	 * component, we check and return an error.
215	 */
216	cr = msg_getcred(mp, NULL);
217	ASSERT(cr != NULL);
218	if (cr == NULL) {
219		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
220		return;
221	}
222
223	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
224	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
225		if (connp->conn_debug) {
226			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
227			    "tcp_tpi_bind: bad req, len %u",
228			    (uint_t)(mp->b_wptr - mp->b_rptr));
229		}
230		tcp_err_ack(tcp, mp, TPROTO, 0);
231		return;
232	}
233	/* Make sure the largest address fits */
234	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
235	if (mp1 == NULL) {
236		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
237		return;
238	}
239	mp = mp1;
240	tbr = (struct T_bind_req *)mp->b_rptr;
241
242	backlog = tbr->CONIND_number;
243	len = tbr->ADDR_length;
244
245	switch (len) {
246	case 0:		/* request for a generic port */
247		tbr->ADDR_offset = sizeof (struct T_bind_req);
248		if (connp->conn_family == AF_INET) {
249			tbr->ADDR_length = sizeof (sin_t);
250			sin = (sin_t *)&tbr[1];
251			*sin = sin_null;
252			sin->sin_family = AF_INET;
253			sa = (struct sockaddr *)sin;
254			len = sizeof (sin_t);
255			mp->b_wptr = (uchar_t *)&sin[1];
256		} else {
257			ASSERT(connp->conn_family == AF_INET6);
258			tbr->ADDR_length = sizeof (sin6_t);
259			sin6 = (sin6_t *)&tbr[1];
260			*sin6 = sin6_null;
261			sin6->sin6_family = AF_INET6;
262			sa = (struct sockaddr *)sin6;
263			len = sizeof (sin6_t);
264			mp->b_wptr = (uchar_t *)&sin6[1];
265		}
266		break;
267
268	case sizeof (sin_t):    /* Complete IPv4 address */
269		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
270		    sizeof (sin_t));
271		break;
272
273	case sizeof (sin6_t): /* Complete IPv6 address */
274		sa = (struct sockaddr *)mi_offset_param(mp,
275		    tbr->ADDR_offset, sizeof (sin6_t));
276		break;
277
278	default:
279		if (connp->conn_debug) {
280			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
281			    "tcp_tpi_bind: bad address length, %d",
282			    tbr->ADDR_length);
283		}
284		tcp_err_ack(tcp, mp, TBADADDR, 0);
285		return;
286	}
287
288	if (backlog > 0) {
289		error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
290		    tbr->PRIM_type != O_T_BIND_REQ);
291	} else {
292		error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
293		    tbr->PRIM_type != O_T_BIND_REQ);
294	}
295done:
296	if (error > 0) {
297		tcp_err_ack(tcp, mp, TSYSERR, error);
298	} else if (error < 0) {
299		tcp_err_ack(tcp, mp, -error, 0);
300	} else {
301		/*
302		 * Update port information as sockfs/tpi needs it for checking
303		 */
304		if (connp->conn_family == AF_INET) {
305			sin = (sin_t *)sa;
306			sin->sin_port = connp->conn_lport;
307		} else {
308			sin6 = (sin6_t *)sa;
309			sin6->sin6_port = connp->conn_lport;
310		}
311		mp->b_datap->db_type = M_PCPROTO;
312		tbr->PRIM_type = T_BIND_ACK;
313		putnext(connp->conn_rq, mp);
314	}
315}
316
317/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
318void
319tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
320{
321	conn_t *connp = tcp->tcp_connp;
322	int error;
323
324	error = tcp_do_unbind(connp);
325	if (error > 0) {
326		tcp_err_ack(tcp, mp, TSYSERR, error);
327	} else if (error < 0) {
328		tcp_err_ack(tcp, mp, -error, 0);
329	} else {
330		/* Send M_FLUSH according to TPI */
331		(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
332
333		mp = mi_tpi_ok_ack_alloc(mp);
334		if (mp != NULL)
335			putnext(connp->conn_rq, mp);
336	}
337}
338
339/* ARGSUSED */
340int
341tcp_tpi_close(queue_t *q, int flags, cred_t *credp __unused)
342{
343	conn_t		*connp;
344
345	ASSERT(WR(q)->q_next == NULL);
346
347	if (flags & SO_FALLBACK) {
348		/*
349		 * stream is being closed while in fallback
350		 * simply free the resources that were allocated
351		 */
352		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
353		qprocsoff(q);
354		goto done;
355	}
356
357	connp = Q_TO_CONN(q);
358	/*
359	 * We are being closed as /dev/tcp or /dev/tcp6.
360	 */
361	tcp_close_common(connp, flags);
362
363	qprocsoff(q);
364	inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
365
366	/*
367	 * Drop IP's reference on the conn. This is the last reference
368	 * on the connp if the state was less than established. If the
369	 * connection has gone into timewait state, then we will have
370	 * one ref for the TCP and one more ref (total of two) for the
371	 * classifier connected hash list (a timewait connections stays
372	 * in connected hash till closed).
373	 *
374	 * We can't assert the references because there might be other
375	 * transient reference places because of some walkers or queued
376	 * packets in squeue for the timewait state.
377	 */
378	CONN_DEC_REF(connp);
379done:
380	q->q_ptr = WR(q)->q_ptr = NULL;
381	return (0);
382}
383
384/* ARGSUSED */
385int
386tcp_tpi_close_accept(queue_t *q, int flags __unused, cred_t *credp __unused)
387{
388	vmem_t	*minor_arena;
389	dev_t	conn_dev;
390	extern struct qinit tcp_acceptor_winit;
391
392	ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
393
394	/*
395	 * We had opened an acceptor STREAM for sockfs which is
396	 * now being closed due to some error.
397	 */
398	qprocsoff(q);
399
400	minor_arena = (vmem_t *)WR(q)->q_ptr;
401	conn_dev = (dev_t)RD(q)->q_ptr;
402	ASSERT(minor_arena != NULL);
403	ASSERT(conn_dev != 0);
404	inet_minor_free(minor_arena, conn_dev);
405	q->q_ptr = WR(q)->q_ptr = NULL;
406	return (0);
407}
408
409/*
410 * Put a connection confirmation message upstream built from the
411 * address/flowid information with the conn and iph. Report our success or
412 * failure.
413 */
414boolean_t
415tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
416    mblk_t **defermp, ip_recv_attr_t *ira)
417{
418	sin_t	sin;
419	sin6_t	sin6;
420	mblk_t	*mp;
421	char	*optp = NULL;
422	int	optlen = 0;
423	conn_t	*connp = tcp->tcp_connp;
424
425	if (defermp != NULL)
426		*defermp = NULL;
427
428	if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
429		/*
430		 * Return in T_CONN_CON results of option negotiation through
431		 * the T_CONN_REQ. Note: If there is an real end-to-end option
432		 * negotiation, then what is received from remote end needs
433		 * to be taken into account but there is no such thing (yet?)
434		 * in our TCP/IP.
435		 * Note: We do not use mi_offset_param() here as
436		 * tcp_opts_conn_req contents do not directly come from
437		 * an application and are either generated in kernel or
438		 * from user input that was already verified.
439		 */
440		mp = tcp->tcp_conn.tcp_opts_conn_req;
441		optp = (char *)(mp->b_rptr +
442		    ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
443		optlen = (int)
444		    ((struct T_conn_req *)mp->b_rptr)->OPT_length;
445	}
446
447	if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
448
449		/* packet is IPv4 */
450		if (connp->conn_family == AF_INET) {
451			sin = sin_null;
452			sin.sin_addr.s_addr = connp->conn_faddr_v4;
453			sin.sin_port = connp->conn_fport;
454			sin.sin_family = AF_INET;
455			mp = mi_tpi_conn_con(NULL, (char *)&sin,
456			    (int)sizeof (sin_t), optp, optlen);
457		} else {
458			sin6 = sin6_null;
459			sin6.sin6_addr = connp->conn_faddr_v6;
460			sin6.sin6_port = connp->conn_fport;
461			sin6.sin6_family = AF_INET6;
462			mp = mi_tpi_conn_con(NULL, (char *)&sin6,
463			    (int)sizeof (sin6_t), optp, optlen);
464
465		}
466	} else {
467		ip6_t	*ip6h = (ip6_t *)iphdr;
468
469		ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
470		ASSERT(connp->conn_family == AF_INET6);
471		sin6 = sin6_null;
472		sin6.sin6_addr = connp->conn_faddr_v6;
473		sin6.sin6_port = connp->conn_fport;
474		sin6.sin6_family = AF_INET6;
475		sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
476		mp = mi_tpi_conn_con(NULL, (char *)&sin6,
477		    (int)sizeof (sin6_t), optp, optlen);
478	}
479
480	if (!mp)
481		return (B_FALSE);
482
483	mblk_copycred(mp, idmp);
484
485	if (defermp == NULL) {
486		conn_t *connp = tcp->tcp_connp;
487		if (IPCL_IS_NONSTR(connp)) {
488			(*connp->conn_upcalls->su_connected)
489			    (connp->conn_upper_handle, tcp->tcp_connid,
490			    ira->ira_cred, ira->ira_cpid);
491			freemsg(mp);
492		} else {
493			if (ira->ira_cred != NULL) {
494				/* So that getpeerucred works for TPI sockfs */
495				mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
496			}
497			putnext(connp->conn_rq, mp);
498		}
499	} else {
500		*defermp = mp;
501	}
502
503	if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
504		tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
505	return (B_TRUE);
506}
507
508/*
509 * Successful connect request processing begins when our client passes
510 * a T_CONN_REQ message into tcp_wput(), which performs function calls into
511 * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
512 *
513 * After various error checks are completed, tcp_tpi_connect() lays
514 * the target address and port into the composite header template.
515 * Then we ask IP for information, including a source address if we didn't
516 * already have one. Finally we prepare to send the SYN packet, and then
517 * send up the T_OK_ACK reply message.
518 */
519void
520tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
521{
522	sin_t		*sin;
523	struct T_conn_req	*tcr;
524	struct sockaddr	*sa;
525	socklen_t	len;
526	int		error;
527	cred_t		*cr;
528	pid_t		cpid;
529	conn_t		*connp = tcp->tcp_connp;
530	queue_t		*q = connp->conn_wq;
531
532	/*
533	 * All Solaris components should pass a db_credp
534	 * for this TPI message, hence we ASSERT.
535	 * But in case there is some other M_PROTO that looks
536	 * like a TPI message sent by some other kernel
537	 * component, we check and return an error.
538	 */
539	cr = msg_getcred(mp, &cpid);
540	ASSERT(cr != NULL);
541	if (cr == NULL) {
542		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
543		return;
544	}
545
546	tcr = (struct T_conn_req *)mp->b_rptr;
547
548	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
549	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
550		tcp_err_ack(tcp, mp, TPROTO, 0);
551		return;
552	}
553
554	/*
555	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
556	 * will always have that to send up.  Otherwise, we need to do
557	 * special handling in case the allocation fails at that time.
558	 * If the end point is TPI, the tcp_t can be reused and the
559	 * tcp_ordrel_mp may be allocated already.
560	 */
561	if (tcp->tcp_ordrel_mp == NULL) {
562		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
563			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
564			return;
565		}
566	}
567
568	/*
569	 * Determine packet type based on type of address passed in
570	 * the request should contain an IPv4 or IPv6 address.
571	 * Make sure that address family matches the type of
572	 * family of the address passed down.
573	 */
574	switch (tcr->DEST_length) {
575	default:
576		tcp_err_ack(tcp, mp, TBADADDR, 0);
577		return;
578
579	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
580		/*
581		 * XXX: The check for valid DEST_length was not there
582		 * in earlier releases and some buggy
583		 * TLI apps (e.g Sybase) got away with not feeding
584		 * in sin_zero part of address.
585		 * We allow that bug to keep those buggy apps humming.
586		 * Test suites require the check on DEST_length.
587		 * We construct a new mblk with valid DEST_length
588		 * free the original so the rest of the code does
589		 * not have to keep track of this special shorter
590		 * length address case.
591		 */
592		mblk_t *nmp;
593		struct T_conn_req *ntcr;
594		sin_t *nsin;
595
596		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
597		    tcr->OPT_length, BPRI_HI);
598		if (nmp == NULL) {
599			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
600			return;
601		}
602		ntcr = (struct T_conn_req *)nmp->b_rptr;
603		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
604		ntcr->PRIM_type = T_CONN_REQ;
605		ntcr->DEST_length = sizeof (sin_t);
606		ntcr->DEST_offset = sizeof (struct T_conn_req);
607
608		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
609		*nsin = sin_null;
610		/* Get pointer to shorter address to copy from original mp */
611		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
612		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
613		if (sin == NULL || !OK_32PTR((char *)sin)) {
614			freemsg(nmp);
615			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
616			return;
617		}
618		nsin->sin_family = sin->sin_family;
619		nsin->sin_port = sin->sin_port;
620		nsin->sin_addr = sin->sin_addr;
621		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
622		nmp->b_wptr = (uchar_t *)&nsin[1];
623		if (tcr->OPT_length != 0) {
624			ntcr->OPT_length = tcr->OPT_length;
625			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
626			bcopy((uchar_t *)tcr + tcr->OPT_offset,
627			    (uchar_t *)ntcr + ntcr->OPT_offset,
628			    tcr->OPT_length);
629			nmp->b_wptr += tcr->OPT_length;
630		}
631		freemsg(mp);	/* original mp freed */
632		mp = nmp;	/* re-initialize original variables */
633		tcr = ntcr;
634	}
635	/* FALLTHRU */
636
637	case sizeof (sin_t):
638		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
639		    sizeof (sin_t));
640		len = sizeof (sin_t);
641		break;
642
643	case sizeof (sin6_t):
644		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
645		    sizeof (sin6_t));
646		len = sizeof (sin6_t);
647		break;
648	}
649
650	error = proto_verify_ip_addr(connp->conn_family, sa, len);
651	if (error != 0) {
652		tcp_err_ack(tcp, mp, TSYSERR, error);
653		return;
654	}
655
656	/*
657	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
658	 * should key on their sequence number and cut them loose.
659	 */
660
661	/*
662	 * If options passed in, feed it for verification and handling
663	 */
664	if (tcr->OPT_length != 0) {
665		mblk_t	*ok_mp;
666		mblk_t	*discon_mp;
667		mblk_t  *conn_opts_mp;
668		int t_error, sys_error, do_disconnect;
669
670		conn_opts_mp = NULL;
671
672		if (tcp_conprim_opt_process(tcp, mp,
673		    &do_disconnect, &t_error, &sys_error) < 0) {
674			if (do_disconnect) {
675				ASSERT(t_error == 0 && sys_error == 0);
676				discon_mp = mi_tpi_discon_ind(NULL,
677				    ECONNREFUSED, 0);
678				if (!discon_mp) {
679					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
680					    TSYSERR, ENOMEM);
681					return;
682				}
683				ok_mp = mi_tpi_ok_ack_alloc(mp);
684				if (!ok_mp) {
685					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
686					    TSYSERR, ENOMEM);
687					return;
688				}
689				qreply(q, ok_mp);
690				qreply(q, discon_mp); /* no flush! */
691			} else {
692				ASSERT(t_error != 0);
693				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
694				    sys_error);
695			}
696			return;
697		}
698		/*
699		 * Success in setting options, the mp option buffer represented
700		 * by OPT_length/offset has been potentially modified and
701		 * contains results of option processing. We copy it in
702		 * another mp to save it for potentially influencing returning
703		 * it in T_CONN_CONN.
704		 */
705		if (tcr->OPT_length != 0) { /* there are resulting options */
706			conn_opts_mp = copyb(mp);
707			if (!conn_opts_mp) {
708				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
709				    TSYSERR, ENOMEM);
710				return;
711			}
712			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
713			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
714			/*
715			 * Note:
716			 * These resulting option negotiation can include any
717			 * end-to-end negotiation options but there no such
718			 * thing (yet?) in our TCP/IP.
719			 */
720		}
721	}
722
723	/* call the non-TPI version */
724	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
725	if (error < 0) {
726		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
727	} else if (error > 0) {
728		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
729	} else {
730		mp = mi_tpi_ok_ack_alloc(mp);
731	}
732
733	/*
734	 * Note: Code below is the "failure" case
735	 */
736	/* return error ack and blow away saved option results if any */
737connect_failed:
738	if (mp != NULL)
739		putnext(connp->conn_rq, mp);
740	else {
741		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
742		    TSYSERR, ENOMEM);
743	}
744}
745
746/* Return the TPI/TLI equivalent of our current tcp_state */
747static int
748tcp_tpistate(tcp_t *tcp)
749{
750	switch (tcp->tcp_state) {
751	case TCPS_IDLE:
752		return (TS_UNBND);
753	case TCPS_LISTEN:
754		/*
755		 * Return whether there are outstanding T_CONN_IND waiting
756		 * for the matching T_CONN_RES. Therefore don't count q0.
757		 */
758		if (tcp->tcp_conn_req_cnt_q > 0)
759			return (TS_WRES_CIND);
760		else
761			return (TS_IDLE);
762	case TCPS_BOUND:
763		return (TS_IDLE);
764	case TCPS_SYN_SENT:
765		return (TS_WCON_CREQ);
766	case TCPS_SYN_RCVD:
767		/*
768		 * Note: assumption: this has to the active open SYN_RCVD.
769		 * The passive instance is detached in SYN_RCVD stage of
770		 * incoming connection processing so we cannot get request
771		 * for T_info_ack on it.
772		 */
773		return (TS_WACK_CRES);
774	case TCPS_ESTABLISHED:
775		return (TS_DATA_XFER);
776	case TCPS_CLOSE_WAIT:
777		return (TS_WREQ_ORDREL);
778	case TCPS_FIN_WAIT_1:
779		return (TS_WIND_ORDREL);
780	case TCPS_FIN_WAIT_2:
781		return (TS_WIND_ORDREL);
782
783	case TCPS_CLOSING:
784	case TCPS_LAST_ACK:
785	case TCPS_TIME_WAIT:
786	case TCPS_CLOSED:
787		/*
788		 * Following TS_WACK_DREQ7 is a rendition of "not
789		 * yet TS_IDLE" TPI state. There is no best match to any
790		 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
791		 * choose a value chosen that will map to TLI/XTI level
792		 * state of TSTATECHNG (state is process of changing) which
793		 * captures what this dummy state represents.
794		 */
795		return (TS_WACK_DREQ7);
796	default:
797		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
798		    tcp->tcp_state, tcp_display(tcp, NULL,
799		    DISP_PORT_ONLY));
800		return (TS_UNBND);
801	}
802}
803
804static void
805tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
806{
807	tcp_stack_t	*tcps = tcp->tcp_tcps;
808	conn_t		*connp = tcp->tcp_connp;
809	extern struct T_info_ack tcp_g_t_info_ack;
810	extern struct T_info_ack tcp_g_t_info_ack_v6;
811
812	if (connp->conn_family == AF_INET6)
813		*tia = tcp_g_t_info_ack_v6;
814	else
815		*tia = tcp_g_t_info_ack;
816	tia->CURRENT_state = tcp_tpistate(tcp);
817	tia->OPT_size = tcp_max_optsize;
818	if (tcp->tcp_mss == 0) {
819		/* Not yet set - tcp_open does not set mss */
820		if (connp->conn_ipversion == IPV4_VERSION)
821			tia->TIDU_size = tcps->tcps_mss_def_ipv4;
822		else
823			tia->TIDU_size = tcps->tcps_mss_def_ipv6;
824	} else {
825		tia->TIDU_size = tcp->tcp_mss;
826	}
827	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
828}
829
830void
831tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
832    t_uscalar_t cap_bits1)
833{
834	tcap->CAP_bits1 = 0;
835
836	if (cap_bits1 & TC1_INFO) {
837		tcp_copy_info(&tcap->INFO_ack, tcp);
838		tcap->CAP_bits1 |= TC1_INFO;
839	}
840
841	if (cap_bits1 & TC1_ACCEPTOR_ID) {
842		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
843		tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
844	}
845
846}
847
848/*
849 * This routine responds to T_CAPABILITY_REQ messages.  It is called by
850 * tcp_wput.  Much of the T_CAPABILITY_ACK information is copied from
851 * tcp_g_t_info_ack.  The current state of the stream is copied from
852 * tcp_state.
853 */
854void
855tcp_capability_req(tcp_t *tcp, mblk_t *mp)
856{
857	t_uscalar_t		cap_bits1;
858	struct T_capability_ack	*tcap;
859
860	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
861		freemsg(mp);
862		return;
863	}
864
865	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
866
867	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
868	    mp->b_datap->db_type, T_CAPABILITY_ACK);
869	if (mp == NULL)
870		return;
871
872	tcap = (struct T_capability_ack *)mp->b_rptr;
873	tcp_do_capability_ack(tcp, tcap, cap_bits1);
874
875	putnext(tcp->tcp_connp->conn_rq, mp);
876}
877
878/*
879 * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
880 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
881 * The current state of the stream is copied from tcp_state.
882 */
883void
884tcp_info_req(tcp_t *tcp, mblk_t *mp)
885{
886	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
887	    T_INFO_ACK);
888	if (!mp) {
889		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
890		return;
891	}
892	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
893	putnext(tcp->tcp_connp->conn_rq, mp);
894}
895
896/* Respond to the TPI addr request */
897void
898tcp_addr_req(tcp_t *tcp, mblk_t *mp)
899{
900	struct sockaddr *sa;
901	mblk_t	*ackmp;
902	struct T_addr_ack *taa;
903	conn_t	*connp = tcp->tcp_connp;
904	uint_t	addrlen;
905
906	/* Make it large enough for worst case */
907	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
908	    2 * sizeof (sin6_t), 1);
909	if (ackmp == NULL) {
910		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
911		return;
912	}
913
914	taa = (struct T_addr_ack *)ackmp->b_rptr;
915
916	bzero(taa, sizeof (struct T_addr_ack));
917	ackmp->b_wptr = (uchar_t *)&taa[1];
918
919	taa->PRIM_type = T_ADDR_ACK;
920	ackmp->b_datap->db_type = M_PCPROTO;
921
922	if (connp->conn_family == AF_INET)
923		addrlen = sizeof (sin_t);
924	else
925		addrlen = sizeof (sin6_t);
926
927	/*
928	 * Note: Following code assumes 32 bit alignment of basic
929	 * data structures like sin_t and struct T_addr_ack.
930	 */
931	if (tcp->tcp_state >= TCPS_BOUND) {
932		/*
933		 * Fill in local address first
934		 */
935		taa->LOCADDR_offset = sizeof (*taa);
936		taa->LOCADDR_length = addrlen;
937		sa = (struct sockaddr *)&taa[1];
938		(void) conn_getsockname(connp, sa, &addrlen);
939		ackmp->b_wptr += addrlen;
940	}
941	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
942		/*
943		 * Fill in Remote address
944		 */
945		taa->REMADDR_length = addrlen;
946		/* assumed 32-bit alignment */
947		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
948		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
949		(void) conn_getpeername(connp, sa, &addrlen);
950		ackmp->b_wptr += addrlen;
951	}
952	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
953	putnext(tcp->tcp_connp->conn_rq, ackmp);
954}
955
/*
 * Swap information between the eager and acceptor for a TLI/XTI client.
 * The sockfs accept is done on the acceptor stream and control goes
 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
 * called. In either case, both the eager and listener are in their own
 * perimeter (squeue) and the code has to deal with potential race.
 *
 * See the block comment on top of tcp_accept() and tcp_tli_accept().
 *
 * On return the eager has been unlinked from the listener, is attached
 * to the acceptor's queues and has taken over the acceptor's acceptor id,
 * device number, minor arena, credentials, pid, zone and MAC-mode
 * settings.  The acceptor is marked detached and one reference on its
 * conn is dropped; one reference is added to the eager's conn.
 */
static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
	conn_t	*econnp, *aconnp;

	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
	ASSERT(!TCP_IS_SOCKET(acceptor));
	ASSERT(!TCP_IS_SOCKET(eager));
	ASSERT(!TCP_IS_SOCKET(listener));

	/*
	 * Trusted Extensions may need to use a security label that is
	 * different from the acceptor's label on MLP and MAC-Exempt
	 * sockets. If this is the case, the required security label
	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
	 * acceptor stream refer to econnp we automatically get that label.
	 */

	acceptor->tcp_detached = B_TRUE;
	/*
	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
	 * the acceptor id.
	 */
	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;

	/* remove eager from listen list... */
	mutex_enter(&listener->tcp_eager_lock);
	tcp_eager_unlink(eager);
	ASSERT(eager->tcp_eager_next_q == NULL &&
	    eager->tcp_eager_last_q == NULL);
	ASSERT(eager->tcp_eager_next_q0 == NULL &&
	    eager->tcp_eager_prev_q0 == NULL);
	mutex_exit(&listener->tcp_eager_lock);

	/*
	 * Point the eager's conn at the acceptor's queue pair and make the
	 * queues point back at the eager's conn.
	 */
	econnp = eager->tcp_connp;
	aconnp = acceptor->tcp_connp;
	econnp->conn_rq = aconnp->conn_rq;
	econnp->conn_wq = aconnp->conn_wq;
	econnp->conn_rq->q_ptr = econnp;
	econnp->conn_wq->q_ptr = econnp;

	/*
	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
	 * which might be a different squeue from our peer TCP instance.
	 * For TCP Fusion, the peer expects that whenever tcp_detached is
	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
	 * above reach global visibility prior to the clearing of tcp_detached.
	 */
	membar_producer();
	eager->tcp_detached = B_FALSE;

	ASSERT(eager->tcp_ack_tid == 0);

	/* The eager takes over the acceptor's minor number and arena. */
	econnp->conn_dev = aconnp->conn_dev;
	econnp->conn_minor_arena = aconnp->conn_minor_arena;

	ASSERT(econnp->conn_minor_arena != NULL);
	/*
	 * Replace the eager's credential with the acceptor's.  The pointer
	 * is moved rather than copied: the acceptor's conn_cred is cleared
	 * below, so the eager ends up with the acceptor's cred pointer.
	 */
	if (econnp->conn_cred != NULL)
		crfree(econnp->conn_cred);
	econnp->conn_cred = aconnp->conn_cred;
	ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
	econnp->conn_ixa->ixa_cred = econnp->conn_cred;
	aconnp->conn_cred = NULL;
	econnp->conn_cpid = aconnp->conn_cpid;
	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);

	/* Zone identity follows the acceptor as well. */
	econnp->conn_zoneid = aconnp->conn_zoneid;
	econnp->conn_allzones = aconnp->conn_allzones;
	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;

	/* Move the MAC mode over and reset the acceptor's to the default. */
	econnp->conn_mac_mode = aconnp->conn_mac_mode;
	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;

	/* Do the IPC initialization */
	CONN_INC_REF(econnp);

	/* Done with old IPC. Drop its ref on its connp */
	CONN_DEC_REF(aconnp);
}
1048
1049/*
1050 * This runs at the tail end of accept processing on the squeue of the
1051 * new connection.
1052 */
1053/* ARGSUSED */
1054static void
1055tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1056{
1057	conn_t			*connp = (conn_t *)arg;
1058	tcp_t			*tcp = connp->conn_tcp;
1059	queue_t			*q = connp->conn_rq;
1060	tcp_stack_t		*tcps = tcp->tcp_tcps;
1061	struct stroptions	*stropt;
1062	struct sock_proto_props sopp;
1063
1064	/* Should never be called for non-STREAMS sockets */
1065	ASSERT(!IPCL_IS_NONSTR(connp));
1066
1067	/* We should just receive a single mblk that fits a T_discon_ind */
1068	ASSERT(mp->b_cont == NULL);
1069
1070	/*
1071	 * Drop the eager's ref on the listener, that was placed when
1072	 * this eager began life in tcp_input_listener.
1073	 */
1074	CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
1075
1076	tcp->tcp_detached = B_FALSE;
1077
1078	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
1079		/*
1080		 * Someone blewoff the eager before we could finish
1081		 * the accept.
1082		 *
1083		 * The only reason eager exists it because we put in
1084		 * a ref on it when conn ind went up. We need to send
1085		 * a disconnect indication up while the last reference
1086		 * on the eager will be dropped by the squeue when we
1087		 * return.
1088		 */
1089		ASSERT(tcp->tcp_listener == NULL);
1090		if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
1091			struct	T_discon_ind	*tdi;
1092
1093			(void) putnextctl1(q, M_FLUSH, FLUSHRW);
1094			/*
1095			 * Let us reuse the incoming mblk to avoid
1096			 * memory allocation failure problems. We know
1097			 * that the size of the incoming mblk i.e.
1098			 * stroptions is greater than sizeof
1099			 * T_discon_ind.
1100			 */
1101			ASSERT(DB_REF(mp) == 1);
1102			ASSERT(MBLKSIZE(mp) >=
1103			    sizeof (struct T_discon_ind));
1104
1105			DB_TYPE(mp) = M_PROTO;
1106			((union T_primitives *)mp->b_rptr)->type =
1107			    T_DISCON_IND;
1108			tdi = (struct T_discon_ind *)mp->b_rptr;
1109			if (tcp->tcp_issocket) {
1110				tdi->DISCON_reason = ECONNREFUSED;
1111				tdi->SEQ_number = 0;
1112			} else {
1113				tdi->DISCON_reason = ENOPROTOOPT;
1114				tdi->SEQ_number =
1115				    tcp->tcp_conn_req_seqnum;
1116			}
1117			mp->b_wptr = mp->b_rptr +
1118			    sizeof (struct T_discon_ind);
1119			putnext(q, mp);
1120		}
1121		tcp->tcp_hard_binding = B_FALSE;
1122		return;
1123	}
1124
1125	/*
1126	 * This is the first time we run on the correct
1127	 * queue after tcp_accept. So fix all the q parameters
1128	 * here.
1129	 *
1130	 * Let us reuse the incoming mblk to avoid
1131	 * memory allocation failure problems. We know
1132	 * that the size of the incoming mblk is at least
1133	 * stroptions
1134	 */
1135	tcp_get_proto_props(tcp, &sopp);
1136
1137	ASSERT(DB_REF(mp) == 1);
1138	ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
1139
1140	DB_TYPE(mp) = M_SETOPTS;
1141	stropt = (struct stroptions *)mp->b_rptr;
1142	mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
1143	stropt = (struct stroptions *)mp->b_rptr;
1144	ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
1145	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
1146	stropt->so_hiwat = sopp.sopp_rxhiwat;
1147	stropt->so_wroff = sopp.sopp_wroff;
1148	stropt->so_maxblk = sopp.sopp_maxblk;
1149
1150	/* Send the options up */
1151	putnext(q, mp);
1152
1153	/*
1154	 * Pass up any data and/or a fin that has been received.
1155	 *
1156	 * Adjust receive window in case it had decreased
1157	 * (because there is data <=> tcp_rcv_list != NULL)
1158	 * while the connection was detached. Note that
1159	 * in case the eager was flow-controlled, w/o this
1160	 * code, the rwnd may never open up again!
1161	 */
1162	if (tcp->tcp_rcv_list != NULL) {
1163		/* We drain directly in case of fused tcp loopback */
1164
1165		if (!tcp->tcp_fused && canputnext(q)) {
1166			tcp->tcp_rwnd = connp->conn_rcvbuf;
1167			if (tcp->tcp_state >= TCPS_ESTABLISHED &&
1168			    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
1169				tcp_xmit_ctl(NULL,
1170				    tcp, (tcp->tcp_swnd == 0) ?
1171				    tcp->tcp_suna : tcp->tcp_snxt,
1172				    tcp->tcp_rnxt, TH_ACK);
1173			}
1174		}
1175
1176		(void) tcp_rcv_drain(tcp);
1177
1178		/*
1179		 * For fused tcp loopback, back-enable peer endpoint
1180		 * if it's currently flow-controlled.
1181		 */
1182		if (tcp->tcp_fused) {
1183			tcp_t *peer_tcp = tcp->tcp_loopback_peer;
1184
1185			ASSERT(peer_tcp != NULL);
1186			ASSERT(peer_tcp->tcp_fused);
1187
1188			mutex_enter(&peer_tcp->tcp_non_sq_lock);
1189			if (peer_tcp->tcp_flow_stopped) {
1190				tcp_clrqfull(peer_tcp);
1191				TCP_STAT(tcps, tcp_fusion_backenabled);
1192			}
1193			mutex_exit(&peer_tcp->tcp_non_sq_lock);
1194		}
1195	}
1196	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
1197	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
1198		tcp->tcp_ordrel_done = B_TRUE;
1199		mp = tcp->tcp_ordrel_mp;
1200		tcp->tcp_ordrel_mp = NULL;
1201		putnext(q, mp);
1202	}
1203	tcp->tcp_hard_binding = B_FALSE;
1204
1205	if (connp->conn_keepalive) {
1206		tcp->tcp_ka_last_intrvl = 0;
1207		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1208		    tcp->tcp_ka_interval);
1209	}
1210
1211	/*
1212	 * At this point, eager is fully established and will
1213	 * have the following references -
1214	 *
1215	 * 2 references for connection to exist (1 for TCP and 1 for IP).
1216	 * 1 reference for the squeue which will be dropped by the squeue as
1217	 *	soon as this function returns.
1218	 * There will be 1 additonal reference for being in classifier
1219	 *	hash list provided something bad hasn't happened.
1220	 */
1221	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
1222	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
1223}
1224
1225/*
1226 * Pull a deferred connection indication off of the listener. The caller
1227 * must verify that there is a deferred conn ind under eager_lock before
1228 * calling this function.
1229 */
1230static mblk_t *
1231tcp_get_def_conn_ind(tcp_t *listener)
1232{
1233	tcp_t *tail;
1234	tcp_t *tcp;
1235	mblk_t *conn_ind;
1236
1237	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
1238	ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0);
1239
1240	tcp = listener->tcp_eager_prev_q0;
1241	/*
1242	 * listener->tcp_eager_prev_q0 points to the TAIL of the
1243	 * deferred T_conn_ind queue. We need to get to the head
1244	 * of the queue in order to send up T_conn_ind the same
1245	 * order as how the 3WHS is completed.
1246	 */
1247	while (tcp != listener) {
1248		if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
1249			break;
1250		else
1251			tcp = tcp->tcp_eager_prev_q0;
1252	}
1253
1254	conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
1255	tcp->tcp_conn.tcp_eager_conn_ind = NULL;
1256	/* Move from q0 to q */
1257	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1258	listener->tcp_conn_req_cnt_q0--;
1259	listener->tcp_conn_req_cnt_q++;
1260	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1261	    tcp->tcp_eager_prev_q0;
1262	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1263	    tcp->tcp_eager_next_q0;
1264	tcp->tcp_eager_prev_q0 = NULL;
1265	tcp->tcp_eager_next_q0 = NULL;
1266	tcp->tcp_conn_def_q0 = B_FALSE;
1267
1268	/* Make sure the tcp isn't in the list of droppables */
1269	ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
1270	    tcp->tcp_eager_prev_drop_q0 == NULL);
1271
1272	/*
1273	 * Insert at end of the queue because sockfs sends
1274	 * down T_CONN_RES in chronological order. Leaving
1275	 * the older conn indications at front of the queue
1276	 * helps reducing search time.
1277	 */
1278	tail = listener->tcp_eager_last_q;
1279	if (tail != NULL) {
1280		tail->tcp_eager_next_q = tcp;
1281	} else {
1282		listener->tcp_eager_next_q = tcp;
1283	}
1284	listener->tcp_eager_last_q = tcp;
1285	tcp->tcp_eager_next_q = NULL;
1286
1287	return (conn_ind);
1288}
1289
1290
1291/*
1292 * Reply to a clients T_CONN_RES TPI message. This function
1293 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1294 * on the acceptor STREAM and processed in tcp_accept_common().
1295 * Read the block comment on top of tcp_input_listener().
1296 */
1297void
1298tcp_tli_accept(tcp_t *listener, mblk_t *mp)
1299{
1300	tcp_t		*acceptor;
1301	tcp_t		*eager;
1302	struct T_conn_res	*tcr;
1303	t_uscalar_t	acceptor_id;
1304	t_scalar_t	seqnum;
1305	mblk_t		*discon_mp = NULL;
1306	mblk_t		*ok_mp;
1307	mblk_t		*mp1;
1308	tcp_stack_t	*tcps = listener->tcp_tcps;
1309	conn_t		*econnp;
1310
1311	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1312		tcp_err_ack(listener, mp, TPROTO, 0);
1313		return;
1314	}
1315	tcr = (struct T_conn_res *)mp->b_rptr;
1316
1317	/*
1318	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1319	 * read side queue of the streams device underneath us i.e. the
1320	 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1321	 * look it up in the queue_hash.  Under LP64 it sends down the
1322	 * minor_t of the accepting endpoint.
1323	 *
1324	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1325	 * fanout hash lock is held.
1326	 * This prevents any thread from entering the acceptor queue from
1327	 * below (since it has not been hard bound yet i.e. any inbound
1328	 * packets will arrive on the listener conn_t and
1329	 * go through the classifier).
1330	 * The CONN_INC_REF will prevent the acceptor from closing.
1331	 *
1332	 * XXX It is still possible for a tli application to send down data
1333	 * on the accepting stream while another thread calls t_accept.
1334	 * This should not be a problem for well-behaved applications since
1335	 * the T_OK_ACK is sent after the queue swapping is completed.
1336	 *
1337	 * If the accepting fd is the same as the listening fd, avoid
1338	 * queue hash lookup since that will return an eager listener in a
1339	 * already established state.
1340	 */
1341	acceptor_id = tcr->ACCEPTOR_id;
1342	mutex_enter(&listener->tcp_eager_lock);
1343	if (listener->tcp_acceptor_id == acceptor_id) {
1344		eager = listener->tcp_eager_next_q;
1345		/* only count how many T_CONN_INDs so don't count q0 */
1346		if ((listener->tcp_conn_req_cnt_q != 1) ||
1347		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
1348			mutex_exit(&listener->tcp_eager_lock);
1349			tcp_err_ack(listener, mp, TBADF, 0);
1350			return;
1351		}
1352		if (listener->tcp_conn_req_cnt_q0 != 0) {
1353			/* Throw away all the eagers on q0. */
1354			tcp_eager_cleanup(listener, 1);
1355		}
1356		if (listener->tcp_syn_defense) {
1357			listener->tcp_syn_defense = B_FALSE;
1358			if (listener->tcp_ip_addr_cache != NULL) {
1359				kmem_free(listener->tcp_ip_addr_cache,
1360				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1361				listener->tcp_ip_addr_cache = NULL;
1362			}
1363		}
1364		/*
1365		 * Transfer tcp_conn_req_max to the eager so that when
1366		 * a disconnect occurs we can revert the endpoint to the
1367		 * listen state.
1368		 */
1369		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
1370		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
1371		/*
1372		 * Get a reference on the acceptor just like the
1373		 * tcp_acceptor_hash_lookup below.
1374		 */
1375		acceptor = listener;
1376		CONN_INC_REF(acceptor->tcp_connp);
1377	} else {
1378		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
1379		if (acceptor == NULL) {
1380			if (listener->tcp_connp->conn_debug) {
1381				(void) strlog(TCP_MOD_ID, 0, 1,
1382				    SL_ERROR|SL_TRACE,
1383				    "tcp_accept: did not find acceptor 0x%x\n",
1384				    acceptor_id);
1385			}
1386			mutex_exit(&listener->tcp_eager_lock);
1387			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
1388			return;
1389		}
1390		/*
1391		 * Verify acceptor state. The acceptable states for an acceptor
1392		 * include TCPS_IDLE and TCPS_BOUND.
1393		 */
1394		switch (acceptor->tcp_state) {
1395		case TCPS_IDLE:
1396			/* FALLTHRU */
1397		case TCPS_BOUND:
1398			break;
1399		default:
1400			CONN_DEC_REF(acceptor->tcp_connp);
1401			mutex_exit(&listener->tcp_eager_lock);
1402			tcp_err_ack(listener, mp, TOUTSTATE, 0);
1403			return;
1404		}
1405	}
1406
1407	/* The listener must be in TCPS_LISTEN */
1408	if (listener->tcp_state != TCPS_LISTEN) {
1409		CONN_DEC_REF(acceptor->tcp_connp);
1410		mutex_exit(&listener->tcp_eager_lock);
1411		tcp_err_ack(listener, mp, TOUTSTATE, 0);
1412		return;
1413	}
1414
1415	/*
1416	 * Rendezvous with an eager connection request packet hanging off
1417	 * 'tcp' that has the 'seqnum' tag.  We tagged the detached open
1418	 * tcp structure when the connection packet arrived in
1419	 * tcp_input_listener().
1420	 */
1421	seqnum = tcr->SEQ_number;
1422	eager = listener;
1423	do {
1424		eager = eager->tcp_eager_next_q;
1425		if (eager == NULL) {
1426			CONN_DEC_REF(acceptor->tcp_connp);
1427			mutex_exit(&listener->tcp_eager_lock);
1428			tcp_err_ack(listener, mp, TBADSEQ, 0);
1429			return;
1430		}
1431	} while (eager->tcp_conn_req_seqnum != seqnum);
1432	mutex_exit(&listener->tcp_eager_lock);
1433
1434	/*
1435	 * At this point, both acceptor and listener have 2 ref
1436	 * that they begin with. Acceptor has one additional ref
1437	 * we placed in lookup while listener has 3 additional
1438	 * ref for being behind the squeue (tcp_accept() is
1439	 * done on listener's squeue); being in classifier hash;
1440	 * and eager's ref on listener.
1441	 */
1442	ASSERT(listener->tcp_connp->conn_ref >= 5);
1443	ASSERT(acceptor->tcp_connp->conn_ref >= 3);
1444
1445	/*
1446	 * The eager at this point is set in its own squeue and
1447	 * could easily have been killed (tcp_accept_finish will
1448	 * deal with that) because of a TH_RST so we can only
1449	 * ASSERT for a single ref.
1450	 */
1451	ASSERT(eager->tcp_connp->conn_ref >= 1);
1452
1453	/*
1454	 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1455	 * use it if something failed.
1456	 */
1457	discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1458	    sizeof (struct stroptions)), BPRI_HI);
1459	if (discon_mp == NULL) {
1460		CONN_DEC_REF(acceptor->tcp_connp);
1461		CONN_DEC_REF(eager->tcp_connp);
1462		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1463		return;
1464	}
1465
1466	econnp = eager->tcp_connp;
1467
1468	/* Hold a copy of mp, in case reallocb fails */
1469	if ((mp1 = copymsg(mp)) == NULL) {
1470		CONN_DEC_REF(acceptor->tcp_connp);
1471		CONN_DEC_REF(eager->tcp_connp);
1472		freemsg(discon_mp);
1473		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
1474		return;
1475	}
1476
1477	tcr = (struct T_conn_res *)mp1->b_rptr;
1478
1479	/*
1480	 * This is an expanded version of mi_tpi_ok_ack_alloc()
1481	 * which allocates a larger mblk and appends the new
1482	 * local address to the ok_ack.  The address is copied by
1483	 * soaccept() for getsockname().
1484	 */
1485	{
1486		int extra;
1487
1488		extra = (econnp->conn_family == AF_INET) ?
1489		    sizeof (sin_t) : sizeof (sin6_t);
1490
1491		/*
1492		 * Try to re-use mp, if possible.  Otherwise, allocate
1493		 * an mblk and return it as ok_mp.  In any case, mp
1494		 * is no longer usable upon return.
1495		 */
1496		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
1497			CONN_DEC_REF(acceptor->tcp_connp);
1498			CONN_DEC_REF(eager->tcp_connp);
1499			freemsg(discon_mp);
1500			/* Original mp has been freed by now, so use mp1 */
1501			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
1502			return;
1503		}
1504
1505		mp = NULL;	/* We should never use mp after this point */
1506
1507		switch (extra) {
1508		case sizeof (sin_t): {
1509			sin_t *sin = (sin_t *)ok_mp->b_wptr;
1510
1511			ok_mp->b_wptr += extra;
1512			sin->sin_family = AF_INET;
1513			sin->sin_port = econnp->conn_lport;
1514			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1515			break;
1516		}
1517		case sizeof (sin6_t): {
1518			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
1519
1520			ok_mp->b_wptr += extra;
1521			sin6->sin6_family = AF_INET6;
1522			sin6->sin6_port = econnp->conn_lport;
1523			sin6->sin6_addr = econnp->conn_laddr_v6;
1524			sin6->sin6_flowinfo = econnp->conn_flowinfo;
1525			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1526			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1527				sin6->sin6_scope_id =
1528				    econnp->conn_ixa->ixa_scopeid;
1529			} else {
1530				sin6->sin6_scope_id = 0;
1531			}
1532			sin6->__sin6_src_id = 0;
1533			break;
1534		}
1535		default:
1536			break;
1537		}
1538		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
1539	}
1540
1541	/*
1542	 * If there are no options we know that the T_CONN_RES will
1543	 * succeed. However, we can't send the T_OK_ACK upstream until
1544	 * the tcp_accept_swap is done since it would be dangerous to
1545	 * let the application start using the new fd prior to the swap.
1546	 */
1547	tcp_accept_swap(listener, acceptor, eager);
1548
1549	/*
1550	 * tcp_accept_swap unlinks eager from listener but does not drop
1551	 * the eager's reference on the listener.
1552	 */
1553	ASSERT(eager->tcp_listener == NULL);
1554	ASSERT(listener->tcp_connp->conn_ref >= 5);
1555
1556	/*
1557	 * The eager is now associated with its own queue. Insert in
1558	 * the hash so that the connection can be reused for a future
1559	 * T_CONN_RES.
1560	 */
1561	tcp_acceptor_hash_insert(acceptor_id, eager);
1562
1563	/*
1564	 * We now do the processing of options with T_CONN_RES.
1565	 * We delay till now since we wanted to have queue to pass to
1566	 * option processing routines that points back to the right
1567	 * instance structure which does not happen until after
1568	 * tcp_accept_swap().
1569	 *
1570	 * Note:
1571	 * The sanity of the logic here assumes that whatever options
1572	 * are appropriate to inherit from listner=>eager are done
1573	 * before this point, and whatever were to be overridden (or not)
1574	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
1575	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
1576	 *   before its ACCEPTOR_id comes down in T_CONN_RES ]
1577	 * This may not be true at this point in time but can be fixed
1578	 * independently. This option processing code starts with
1579	 * the instantiated acceptor instance and the final queue at
1580	 * this point.
1581	 */
1582
1583	if (tcr->OPT_length != 0) {
1584		/* Options to process */
1585		int t_error = 0;
1586		int sys_error = 0;
1587		int do_disconnect = 0;
1588
1589		if (tcp_conprim_opt_process(eager, mp1,
1590		    &do_disconnect, &t_error, &sys_error) < 0) {
1591			eager->tcp_accept_error = 1;
1592			if (do_disconnect) {
1593				/*
1594				 * An option failed which does not allow
1595				 * connection to be accepted.
1596				 *
1597				 * We allow T_CONN_RES to succeed and
1598				 * put a T_DISCON_IND on the eager queue.
1599				 */
1600				ASSERT(t_error == 0 && sys_error == 0);
1601				eager->tcp_send_discon_ind = 1;
1602			} else {
1603				ASSERT(t_error != 0);
1604				freemsg(ok_mp);
1605				/*
1606				 * Original mp was either freed or set
1607				 * to ok_mp above, so use mp1 instead.
1608				 */
1609				tcp_err_ack(listener, mp1, t_error, sys_error);
1610				goto finish;
1611			}
1612		}
1613		/*
1614		 * Most likely success in setting options (except if
1615		 * eager->tcp_send_discon_ind set).
1616		 * mp1 option buffer represented by OPT_length/offset
1617		 * potentially modified and contains results of setting
1618		 * options at this point
1619		 */
1620	}
1621
1622	/* We no longer need mp1, since all options processing has passed */
1623	freemsg(mp1);
1624
1625	putnext(listener->tcp_connp->conn_rq, ok_mp);
1626
1627	mutex_enter(&listener->tcp_eager_lock);
1628	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1629		mblk_t	*conn_ind;
1630
1631		/*
1632		 * This path should not be executed if listener and
1633		 * acceptor streams are the same.
1634		 */
1635		ASSERT(listener != acceptor);
1636		conn_ind = tcp_get_def_conn_ind(listener);
1637		mutex_exit(&listener->tcp_eager_lock);
1638		putnext(listener->tcp_connp->conn_rq, conn_ind);
1639	} else {
1640		mutex_exit(&listener->tcp_eager_lock);
1641	}
1642
1643	/*
1644	 * Done with the acceptor - free it
1645	 *
1646	 * Note: from this point on, no access to listener should be made
1647	 * as listener can be equal to acceptor.
1648	 */
1649finish:
1650	ASSERT(acceptor->tcp_detached);
1651	acceptor->tcp_connp->conn_rq = NULL;
1652	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
1653	acceptor->tcp_connp->conn_wq = NULL;
1654	(void) tcp_clean_death(acceptor, 0);
1655	CONN_DEC_REF(acceptor->tcp_connp);
1656
1657	/*
1658	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
1659	 *
1660	 * It will update the setting for sockfs/stream head and also take
1661	 * care of any data that arrived before accept() wad called.
1662	 * In case we already received a FIN then tcp_accept_finish will send up
1663	 * the ordrel. It will also send up a window update if the window
1664	 * has opened up.
1665	 */
1666
1667	/*
1668	 * XXX: we currently have a problem if XTI application closes the
1669	 * acceptor stream in between. This problem exists in on10-gate also
1670	 * and is well know but nothing can be done short of major rewrite
1671	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
1672	 * eager same squeue as listener (we can distinguish non socket
1673	 * listeners at the time of handling a SYN in tcp_input_listener)
1674	 * and do most of the work that tcp_accept_finish does here itself
1675	 * and then get behind the acceptor squeue to access the acceptor
1676	 * queue.
1677	 */
1678	/*
1679	 * We already have a ref on tcp so no need to do one before squeue_enter
1680	 */
1681	SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
1682	    tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
1683	    SQTAG_TCP_ACCEPT_FINISH);
1684}
1685
1686
1687/*
1688 * This is the STREAMS entry point for T_CONN_RES coming down on
1689 * Acceptor STREAM when  sockfs listener does accept processing.
1690 * Read the block comment on top of tcp_input_listener().
1691 */
1692int
1693tcp_tpi_accept(queue_t *q, mblk_t *mp)
1694{
1695	queue_t *rq = RD(q);
1696	struct T_conn_res *conn_res;
1697	tcp_t *eager;
1698	tcp_t *listener;
1699	struct T_ok_ack *ok;
1700	t_scalar_t PRIM_type;
1701	mblk_t *discon_mp;
1702	conn_t *econnp;
1703	cred_t *cr;
1704
1705	ASSERT(DB_TYPE(mp) == M_PROTO);
1706
1707	/*
1708	 * All Solaris components should pass a db_credp
1709	 * for this TPI message, hence we ASSERT.
1710	 * But in case there is some other M_PROTO that looks
1711	 * like a TPI message sent by some other kernel
1712	 * component, we check and return an error.
1713	 */
1714	cr = msg_getcred(mp, NULL);
1715	ASSERT(cr != NULL);
1716	if (cr == NULL) {
1717		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
1718		if (mp != NULL)
1719			putnext(rq, mp);
1720		return (0);
1721	}
1722	conn_res = (struct T_conn_res *)mp->b_rptr;
1723	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
1724	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
1725		mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1726		if (mp != NULL)
1727			putnext(rq, mp);
1728		return (0);
1729	}
1730	switch (conn_res->PRIM_type) {
1731	case O_T_CONN_RES:
1732	case T_CONN_RES:
1733		/*
1734		 * We pass up an err ack if allocb fails. This will
1735		 * cause sockfs to issue a T_DISCON_REQ which will cause
1736		 * tcp_eager_blowoff to be called. sockfs will then call
1737		 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
1738		 * we need to do the allocb up here because we have to
1739		 * make sure rq->q_qinfo->qi_qclose still points to the
1740		 * correct function (tcp_tpi_close_accept) in case allocb
1741		 * fails.
1742		 */
1743		bcopy(mp->b_rptr + conn_res->OPT_offset,
1744		    &eager, conn_res->OPT_length);
1745		PRIM_type = conn_res->PRIM_type;
1746		mp->b_datap->db_type = M_PCPROTO;
1747		mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
1748		ok = (struct T_ok_ack *)mp->b_rptr;
1749		ok->PRIM_type = T_OK_ACK;
1750		ok->CORRECT_prim = PRIM_type;
1751		econnp = eager->tcp_connp;
1752		econnp->conn_dev = (dev_t)RD(q)->q_ptr;
1753		econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
1754		econnp->conn_rq = rq;
1755		econnp->conn_wq = q;
1756		rq->q_ptr = econnp;
1757		rq->q_qinfo = &tcp_rinitv4;	/* No open - same as rinitv6 */
1758		q->q_ptr = econnp;
1759		q->q_qinfo = &tcp_winit;
1760		listener = eager->tcp_listener;
1761
1762		/*
1763		 * Pre allocate the discon_ind mblk also. tcp_accept_finish will
1764		 * use it if something failed.
1765		 */
1766		discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
1767		    sizeof (struct stroptions)), BPRI_HI);
1768
1769		if (discon_mp == NULL) {
1770			mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
1771			if (mp != NULL)
1772				putnext(rq, mp);
1773			return (0);
1774		}
1775
1776		eager->tcp_issocket = B_TRUE;
1777
1778		ASSERT(econnp->conn_netstack ==
1779		    listener->tcp_connp->conn_netstack);
1780		ASSERT(eager->tcp_tcps == listener->tcp_tcps);
1781
1782		/* Put the ref for IP */
1783		CONN_INC_REF(econnp);
1784
1785		/*
1786		 * We should have minimum of 3 references on the conn
1787		 * at this point. One each for TCP and IP and one for
1788		 * the T_conn_ind that was sent up when the 3-way handshake
1789		 * completed. In the normal case we would also have another
1790		 * reference (making a total of 4) for the conn being in the
1791		 * classifier hash list. However the eager could have received
1792		 * an RST subsequently and tcp_closei_local could have removed
1793		 * the eager from the classifier hash list, hence we can't
1794		 * assert that reference.
1795		 */
1796		ASSERT(econnp->conn_ref >= 3);
1797
1798		mutex_enter(&listener->tcp_eager_lock);
1799		if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
1800			mblk_t *conn_ind = tcp_get_def_conn_ind(listener);
1801
1802			/* Need to get inside the listener perimeter */
1803			CONN_INC_REF(listener->tcp_connp);
1804			SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
1805			    conn_ind, tcp_send_pending, listener->tcp_connp,
1806			    NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING);
1807		}
1808		tcp_eager_unlink(eager);
1809		mutex_exit(&listener->tcp_eager_lock);
1810
1811		/*
1812		 * At this point, the eager is detached from the listener
1813		 * but we still have an extra refs on eager (apart from the
1814		 * usual tcp references). The ref was placed in tcp_input_data
1815		 * before sending the conn_ind in tcp_send_conn_ind.
1816		 * The ref will be dropped in tcp_accept_finish().
1817		 */
1818		SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
1819		    econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
1820
1821		/*
1822		 * Send the new local address also up to sockfs. There
1823		 * should already be enough space in the mp that came
1824		 * down from soaccept().
1825		 */
1826		if (econnp->conn_family == AF_INET) {
1827			sin_t *sin;
1828
1829			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1830			    (sizeof (struct T_ok_ack) + sizeof (sin_t)));
1831			sin = (sin_t *)mp->b_wptr;
1832			mp->b_wptr += sizeof (sin_t);
1833			sin->sin_family = AF_INET;
1834			sin->sin_port = econnp->conn_lport;
1835			sin->sin_addr.s_addr = econnp->conn_laddr_v4;
1836		} else {
1837			sin6_t *sin6;
1838
1839			ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
1840			    sizeof (struct T_ok_ack) + sizeof (sin6_t));
1841			sin6 = (sin6_t *)mp->b_wptr;
1842			mp->b_wptr += sizeof (sin6_t);
1843			sin6->sin6_family = AF_INET6;
1844			sin6->sin6_port = econnp->conn_lport;
1845			sin6->sin6_addr = econnp->conn_laddr_v6;
1846			if (econnp->conn_ipversion == IPV4_VERSION)
1847				sin6->sin6_flowinfo = 0;
1848			else
1849				sin6->sin6_flowinfo = econnp->conn_flowinfo;
1850			if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
1851			    (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
1852				sin6->sin6_scope_id =
1853				    econnp->conn_ixa->ixa_scopeid;
1854			} else {
1855				sin6->sin6_scope_id = 0;
1856			}
1857			sin6->__sin6_src_id = 0;
1858		}
1859
1860		putnext(rq, mp);
1861		break;
1862	default:
1863		mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
1864		if (mp != NULL)
1865			putnext(rq, mp);
1866		break;
1867	}
1868	return (0);
1869}
1870
1871/*
1872 * The function called through squeue to get behind listener's perimeter to
1873 * send a deferred conn_ind.
1874 */
1875/* ARGSUSED */
1876void
1877tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
1878{
1879	conn_t	*lconnp = (conn_t *)arg;
1880	tcp_t *listener = lconnp->conn_tcp;
1881	struct T_conn_ind *conn_ind;
1882	tcp_t *tcp;
1883
1884	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1885	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1886	    conn_ind->OPT_length);
1887
1888	if (listener->tcp_state != TCPS_LISTEN) {
1889		/*
1890		 * If listener has closed, it would have caused a
1891		 * a cleanup/blowoff to happen for the eager, so
1892		 * we don't need to do anything more.
1893		 */
1894		freemsg(mp);
1895		return;
1896	}
1897
1898	putnext(lconnp->conn_rq, mp);
1899}
1900
/*
 * Sends the T_CONN_IND to the listener. Once the 3-way handshake is
 * done and a T_CONN_IND needs to be sent, the caller invokes this
 * function via the squeue to get inside the listener's perimeter.
 * As an optimization, the caller can call this function directly
 * if the listener's perimeter is the same as the eager's.
 */
1908/* ARGSUSED */
1909void
1910tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
1911{
1912	conn_t			*lconnp = (conn_t *)arg;
1913	tcp_t			*listener = lconnp->conn_tcp;
1914	tcp_t			*tcp;
1915	struct T_conn_ind	*conn_ind;
1916	ipaddr_t		*addr_cache;
1917	boolean_t		need_send_conn_ind = B_FALSE;
1918	tcp_stack_t		*tcps = listener->tcp_tcps;
1919
1920	/* retrieve the eager */
1921	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1922	ASSERT(conn_ind->OPT_offset != 0 &&
1923	    conn_ind->OPT_length == sizeof (intptr_t));
1924	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
1925	    conn_ind->OPT_length);
1926
1927	/*
1928	 * TLI/XTI applications will get confused by
1929	 * sending eager as an option since it violates
1930	 * the option semantics. So remove the eager as
1931	 * option since TLI/XTI app doesn't need it anyway.
1932	 */
1933	if (!TCP_IS_SOCKET(listener)) {
1934		conn_ind->OPT_length = 0;
1935		conn_ind->OPT_offset = 0;
1936	}
1937	if (listener->tcp_state != TCPS_LISTEN) {
1938		/*
1939		 * If listener has closed, it would have caused a
1940		 * a cleanup/blowoff to happen for the eager. We
1941		 * just need to return.
1942		 */
1943		freemsg(mp);
1944		return;
1945	}
1946
1947
1948	/*
1949	 * if the conn_req_q is full defer passing up the
1950	 * T_CONN_IND until space is availabe after t_accept()
1951	 * processing
1952	 */
1953	mutex_enter(&listener->tcp_eager_lock);
1954
1955	/*
1956	 * Take the eager out, if it is in the list of droppable eagers
1957	 * as we are here because the 3W handshake is over.
1958	 */
1959	MAKE_UNDROPPABLE(tcp);
1960
1961	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
1962		tcp_t *tail;
1963
1964		/*
1965		 * The eager already has an extra ref put in tcp_input_data
1966		 * so that it stays till accept comes back even though it
1967		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1968		 */
1969		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1970		listener->tcp_conn_req_cnt_q0--;
1971		listener->tcp_conn_req_cnt_q++;
1972
1973		/* Move from SYN_RCVD to ESTABLISHED list  */
1974		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
1975		    tcp->tcp_eager_prev_q0;
1976		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
1977		    tcp->tcp_eager_next_q0;
1978		tcp->tcp_eager_prev_q0 = NULL;
1979		tcp->tcp_eager_next_q0 = NULL;
1980
1981		/*
1982		 * Insert at end of the queue because sockfs
1983		 * sends down T_CONN_RES in chronological
1984		 * order. Leaving the older conn indications
1985		 * at front of the queue helps reducing search
1986		 * time.
1987		 */
1988		tail = listener->tcp_eager_last_q;
1989		if (tail != NULL)
1990			tail->tcp_eager_next_q = tcp;
1991		else
1992			listener->tcp_eager_next_q = tcp;
1993		listener->tcp_eager_last_q = tcp;
1994		tcp->tcp_eager_next_q = NULL;
1995		/*
1996		 * Delay sending up the T_conn_ind until we are
1997		 * done with the eager. Once we have have sent up
1998		 * the T_conn_ind, the accept can potentially complete
1999		 * any time and release the refhold we have on the eager.
2000		 */
2001		need_send_conn_ind = B_TRUE;
2002	} else {
2003		/*
2004		 * Defer connection on q0 and set deferred
2005		 * connection bit true
2006		 */
2007		tcp->tcp_conn_def_q0 = B_TRUE;
2008
2009		/* take tcp out of q0 ... */
2010		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2011		    tcp->tcp_eager_next_q0;
2012		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2013		    tcp->tcp_eager_prev_q0;
2014
2015		/* ... and place it at the end of q0 */
2016		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
2017		tcp->tcp_eager_next_q0 = listener;
2018		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
2019		listener->tcp_eager_prev_q0 = tcp;
2020		tcp->tcp_conn.tcp_eager_conn_ind = mp;
2021	}
2022
2023	/* we have timed out before */
2024	if (tcp->tcp_syn_rcvd_timeout != 0) {
2025		tcp->tcp_syn_rcvd_timeout = 0;
2026		listener->tcp_syn_rcvd_timeout--;
2027		if (listener->tcp_syn_defense &&
2028		    listener->tcp_syn_rcvd_timeout <=
2029		    (tcps->tcps_conn_req_max_q0 >> 5) &&
2030		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
2031		    listener->tcp_last_rcv_lbolt)) {
2032			/*
2033			 * Turn off the defense mode if we
2034			 * believe the SYN attack is over.
2035			 */
2036			listener->tcp_syn_defense = B_FALSE;
2037			if (listener->tcp_ip_addr_cache) {
2038				kmem_free((void *)listener->tcp_ip_addr_cache,
2039				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2040				listener->tcp_ip_addr_cache = NULL;
2041			}
2042		}
2043	}
2044	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
2045	if (addr_cache != NULL) {
2046		/*
2047		 * We have finished a 3-way handshake with this
2048		 * remote host. This proves the IP addr is good.
2049		 * Cache it!
2050		 */
2051		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
2052		    tcp->tcp_connp->conn_faddr_v4;
2053	}
2054	mutex_exit(&listener->tcp_eager_lock);
2055	if (need_send_conn_ind)
2056		putnext(lconnp->conn_rq, mp);
2057}
2058