/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2022 Oxide Computer Company
 */

/* This file contains all TCP input processing functions. */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/tsol/tnet.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>
#include <inet/proto_set.h>
#include <inet/ipsec_impl.h>

/*
 * RFC7323-recommended phrasing of TSTAMP option, for easier parsing
 */

#ifdef _BIG_ENDIAN
#define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
    (TCPOPT_TSTAMP << 8) | 10)
#else
#define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
    (TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif
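
/*
 * Illustrative worked example (assuming the standard option kinds
 * TCPOPT_NOP == 1 and TCPOPT_TSTAMP == 8): the four option bytes on the
 * wire are 01 01 08 0a (NOP, NOP, kind 8, length 10), so the 32-bit word
 * compared against TCPOPT_NOP_NOP_TSTAMP reads as 0x0101080a on a
 * big-endian machine and as 0x0a080101 on a little-endian one; the two
 * #define arms above encode exactly those two values.
 */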

/*
 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days.
 */
#define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz))
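
/*
 * Worked arithmetic for the macro above: 24 days * 24 hours * 60 minutes *
 * 60 seconds is 2,073,600 seconds; with a (hypothetical) hz of 100 ticks
 * per second that is 207,360,000 ticks.
 */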

/*
 * Since tcp_listener is not cleared atomically with tcp_detached
 * being cleared we need this extra bit to tell a detached connection
 * apart from one that is in the process of being accepted.
 */
#define TCP_IS_DETACHED_NONEAGER(tcp) \
    (TCP_IS_DETACHED(tcp) && \
    (!(tcp)->tcp_hard_binding))

/*
 * Steps to do when a tcp_t moves to TIME-WAIT state.
 *
 * This connection is done, we don't need to account for it. Decrement
 * the listener connection counter if needed.
 *
 * Decrement the connection counter of the stack. Note that this counter
 * is per CPU. So the total number of connections in a stack is the sum of all
 * of them. Since there is no lock for handling all of them exclusively, the
 * resulting sum is only an approximation.
 *
 * Unconditionally clear the exclusive binding bit so this TIME-WAIT
 * connection won't interfere with new ones.
 *
 * Start the TIME-WAIT timer. If the upper layer has not closed the
 * connection, the timer is handled within the context of this tcp_t. When
 * the timer fires, tcp_clean_death() is called. If the upper layer closes
 * the connection during this period, tcp_time_wait_append() will be called
 * to add this tcp_t to the global TIME-WAIT list. Note that this means that
 * the actual wait time in TIME-WAIT state will be longer than the
 * tcps_time_wait_interval since the period before the upper layer closes
 * the connection is not accounted for when tcp_time_wait_append() is
 * called.
 *
 * If the upper layer has already closed the connection, call
 * tcp_time_wait_append() directly.
 */
#define SET_TIME_WAIT(tcps, tcp, connp) \
{ \
    (tcp)->tcp_state = TCPS_TIME_WAIT; \
    if ((tcp)->tcp_listen_cnt != NULL) \
        TCP_DECR_LISTEN_CNT(tcp); \
    atomic_dec_64( \
        (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt); \
    (connp)->conn_exclbind = 0; \
    if (!TCP_IS_DETACHED(tcp)) { \
        TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \
    } else { \
        tcp_time_wait_append(tcp); \
        TCP_DBGSTAT(tcps, tcp_rput_time_wait); \
    } \
}

/*
 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
 * than tcp_drop_ack_unsent_cnt ACKs which acknowledge unsent data, TCP
 * will not respond with an ACK. RFC 793 requires that TCP respond with
 * an ACK for such a bogus ACK. By not following the RFC, we prevent
 * TCP from getting into an ACK storm if somehow an attacker successfully
 * spoofs an acceptable segment to our peer; or when our peer is "confused."
 */
static uint32_t tcp_drop_ack_unsent_cnt = 10;

/*
 * To protect TCP against an attacker using a small window and requesting
 * a large amount of data (a DoS attack by consuming memory), TCP checks the
 * window advertised in the last ACK of the 3-way handshake. TCP uses
 * the tcp_mss (the size of one packet) value for comparison. The window
 * should be larger than tcp_mss. But while a sane TCP should advertise
 * a receive window larger than or equal to 4*MSS to avoid stop-and-go
 * traffic, not all TCP stacks do that. This is especially true when
 * tcp_mss is a big value.
 *
 * To work around this issue, an additional fixed value for comparison
 * is also used. If the advertised window is smaller than both tcp_mss
 * and tcp_init_wnd_chk, the ACK is considered invalid. So for a large
 * tcp_mss value (say, 8K), a window larger than tcp_init_wnd_chk but
 * smaller than 8K is considered to be OK.
 */
static uint32_t tcp_init_wnd_chk = 4096;
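
/*
 * A minimal sketch of the check described above (illustrative only; the
 * names seg_wnd and tcp are placeholders for wherever the final handshake
 * ACK is actually validated):
 *
 *	if (seg_wnd < tcp->tcp_mss && seg_wnd < tcp_init_wnd_chk)
 *		... reject the ACK as invalid ...
 *
 * With tcp_mss == 8192, a window of 4096 passes (it is not smaller than
 * tcp_init_wnd_chk); a window of 1024 fails both comparisons.
 */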

/* Process ICMP source quench message or not. */
static boolean_t tcp_icmp_source_quench = B_FALSE;

static boolean_t tcp_outbound_squeue_switch = B_FALSE;

static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *,
    ip_recv_attr_t *);
static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *,
    ip_recv_attr_t *);
static boolean_t tcp_drop_q0(tcp_t *);
static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
    ip_recv_attr_t *);
static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
static void tcp_process_options(tcp_t *, tcpha_t *);
static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);

/*
 * CC wrapper hook functions
 */
static void
cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
    uint16_t type)
{
    uint32_t old_cwnd = tcp->tcp_cwnd;

    tcp->tcp_ccv.bytes_this_ack = bytes_acked;
    if (tcp->tcp_cwnd <= tcp->tcp_swnd)
        tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
    else
        tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;

    if (type == CC_ACK) {
        if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
            if (tcp->tcp_ccv.flags & CCF_RTO)
                tcp->tcp_ccv.flags &= ~CCF_RTO;

            tcp->tcp_ccv.t_bytes_acked +=
                min(tcp->tcp_ccv.bytes_this_ack,
                tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
            if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
                tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
                tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
            }
        } else {
            tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
            tcp->tcp_ccv.t_bytes_acked = 0;
        }
    }

    if (CC_ALGO(tcp)->ack_received != NULL) {
        /*
         * The FreeBSD code where this originated had a comment "Find
         * a way to live without this" in several places where curack
         * got set. If they eventually dump curack from the cc
         * variables, we'll need to adapt our code.
         */
        tcp->tcp_ccv.curack = seg_ack;
        CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
    }

    DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
        uint32_t, tcp->tcp_cwnd);
}

void
cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
{
    uint32_t old_cwnd = tcp->tcp_cwnd;
    uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;

    switch (type) {
    case CC_NDUPACK:
        if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
            tcp->tcp_rexmit_max = tcp->tcp_snxt;
            if (tcp->tcp_ecn_ok) {
                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
                tcp->tcp_cwr = B_TRUE;
                tcp->tcp_ecn_cwr_sent = B_FALSE;
            }
        }
        break;
    case CC_ECN:
        if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
            tcp->tcp_rexmit_max = tcp->tcp_snxt;
            if (tcp->tcp_ecn_ok) {
                tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
                tcp->tcp_cwr = B_TRUE;
                tcp->tcp_ecn_cwr_sent = B_FALSE;
            }
        }
        break;
    case CC_RTO:
        tcp->tcp_ccv.flags |= CCF_RTO;
        tcp->tcp_dupack_cnt = 0;
        tcp->tcp_ccv.t_bytes_acked = 0;
        /*
         * Give up on fast recovery and congestion recovery if we were
         * attempting either.
         */
        EXIT_RECOVERY(tcp->tcp_ccv.flags);
        if (CC_ALGO(tcp)->cong_signal == NULL) {
            /*
             * RFC5681 Section 3.1
             * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
             */
            tcp->tcp_cwnd_ssthresh = max(
                (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
                2) * tcp->tcp_mss;
            tcp->tcp_cwnd = tcp->tcp_mss;
        }
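
        /*
         * Worked example of the computation above (numbers are
         * illustrative): with FlightSize = tcp_snxt - tcp_suna = 20000
         * bytes and tcp_mss = 1460, 20000 / 2 / 1460 = 6 (integer
         * division), so ssthresh becomes max(6, 2) * 1460 = 8760 bytes
         * and cwnd restarts at one MSS (1460).
         */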

        if (tcp->tcp_ecn_ok) {
            tcp->tcp_cwr = B_TRUE;
            tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
            tcp->tcp_ecn_cwr_sent = B_FALSE;
        }
        break;
    }

    if (CC_ALGO(tcp)->cong_signal != NULL) {
        tcp->tcp_ccv.curack = seg_ack;
        CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
    }

    DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
        uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
        uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
}

static void
cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
{
    uint32_t old_cwnd = tcp->tcp_cwnd;

    if (CC_ALGO(tcp)->post_recovery != NULL) {
        tcp->tcp_ccv.curack = seg_ack;
        CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
    }
    tcp->tcp_ccv.t_bytes_acked = 0;

    DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
        uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
}

/*
 * Set the MSS associated with a particular tcp based on its current value,
 * and a new one passed in. Observe minimums and maximums, and reset other
 * state variables that we want to view as multiples of MSS.
 *
 * The value of MSS could be either increased or decreased.
 */
void
tcp_mss_set(tcp_t *tcp, uint32_t mss)
{
    uint32_t mss_max;
    tcp_stack_t *tcps = tcp->tcp_tcps;
    conn_t *connp = tcp->tcp_connp;

    if (connp->conn_ipversion == IPV4_VERSION)
        mss_max = tcps->tcps_mss_max_ipv4;
    else
        mss_max = tcps->tcps_mss_max_ipv6;

    if (mss < tcps->tcps_mss_min)
        mss = tcps->tcps_mss_min;
    if (mss > mss_max)
        mss = mss_max;
    /*
     * Unless naglim has been set by our client to
     * a non-mss value, force naglim to track mss.
     * This can help to aggregate small writes.
     */
    if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
        tcp->tcp_naglim = mss;
    /*
     * TCP should be able to buffer at least 4 MSS of data for obvious
     * performance reasons.
     */
    if ((mss << 2) > connp->conn_sndbuf)
        connp->conn_sndbuf = mss << 2;

    /*
     * Set the send lowater to at least twice the MSS.
     */
    if ((mss << 1) > connp->conn_sndlowat)
        connp->conn_sndlowat = mss << 1;

    /*
     * Update tcp_cwnd according to the new value of MSS. Keep the
     * previous ratio to preserve the transmit rate.
     */
    tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
    tcp->tcp_cwnd_cnt = 0;

    tcp->tcp_mss = mss;
    (void) tcp_maxpsz_set(tcp, B_TRUE);
}
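
/*
 * Worked example of the cwnd update in tcp_mss_set() (illustrative
 * numbers): if tcp_cwnd was 8 * 1460 = 11680 bytes and the MSS changes
 * from 1460 to 1200, the new cwnd is (11680 / 1460) * 1200 = 8 * 1200 =
 * 9600 bytes, i.e. the window keeps the same number of segments.
 */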

/*
 * Extract option values from a tcp header. We put any found values into the
 * tcpopt struct and return a bitmask saying which options were found.
 */
int
tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
{
    uchar_t *endp;
    int len;
    uint32_t mss;
    uchar_t *up = (uchar_t *)tcpha;
    int found = 0;
    int32_t sack_len;
    tcp_seq sack_begin, sack_end;
    tcp_t *tcp;

    endp = up + TCP_HDR_LENGTH(tcpha);
    up += TCP_MIN_HEADER_LENGTH;
    /*
     * If timestamp option is aligned as recommended in RFC 7323 Appendix
     * A, and is the only option, return quickly.
     */
    if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH +
        TCPOPT_REAL_TS_LEN &&
        OK_32PTR(up) &&
        *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
        tcpopt->tcp_opt_ts_val = ABE32_TO_U32((up+4));
        tcpopt->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));

        return (TCP_OPT_TSTAMP_PRESENT);
    }
    while (up < endp) {
        len = endp - up;
        switch (*up) {
        case TCPOPT_EOL:
            break;

        case TCPOPT_NOP:
            up++;
            continue;

        case TCPOPT_MAXSEG:
            if (len < TCPOPT_MAXSEG_LEN ||
                up[1] != TCPOPT_MAXSEG_LEN)
                break;

            mss = BE16_TO_U16(up+2);
            /* Caller must handle tcp_mss_min and tcp_mss_max_* */
            tcpopt->tcp_opt_mss = mss;
            found |= TCP_OPT_MSS_PRESENT;

            up += TCPOPT_MAXSEG_LEN;
            continue;

        case TCPOPT_WSCALE:
            if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
                break;

            if (up[2] > TCP_MAX_WINSHIFT)
                tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
            else
                tcpopt->tcp_opt_wscale = up[2];
            found |= TCP_OPT_WSCALE_PRESENT;

            up += TCPOPT_WS_LEN;
            continue;

        case TCPOPT_SACK_PERMITTED:
            if (len < TCPOPT_SACK_OK_LEN ||
                up[1] != TCPOPT_SACK_OK_LEN)
                break;
            found |= TCP_OPT_SACK_OK_PRESENT;
            up += TCPOPT_SACK_OK_LEN;
            continue;

        case TCPOPT_SACK:
            if (len <= 2 || up[1] <= 2 || len < up[1])
                break;

            /* If TCP is not interested in SACK blks... */
            if ((tcp = tcpopt->tcp) == NULL) {
                up += up[1];
                continue;
            }
            sack_len = up[1] - TCPOPT_HEADER_LEN;
            up += TCPOPT_HEADER_LEN;

            /*
             * If the list is empty, allocate one and assume
             * nothing is sack'ed.
             */
            if (tcp->tcp_notsack_list == NULL) {
                tcp_notsack_update(&(tcp->tcp_notsack_list),
                    tcp->tcp_suna, tcp->tcp_snxt,
                    &(tcp->tcp_num_notsack_blk),
                    &(tcp->tcp_cnt_notsack_list));

                /*
                 * Make sure tcp_notsack_list is not NULL.
                 * This happens when kmem_alloc(KM_NOSLEEP)
                 * returns NULL.
                 */
                if (tcp->tcp_notsack_list == NULL) {
                    up += sack_len;
                    continue;
                }
                tcp->tcp_fack = tcp->tcp_suna;
            }

            while (sack_len > 0) {
                if (up + 8 > endp) {
                    up = endp;
                    break;
                }
                sack_begin = BE32_TO_U32(up);
                up += 4;
                sack_end = BE32_TO_U32(up);
                up += 4;
                sack_len -= 8;
                /*
                 * Bounds checking. Make sure the SACK
                 * info is within tcp_suna and tcp_snxt.
                 * If this SACK blk is out of bound, ignore
                 * it but continue to parse the following
                 * blks.
                 */
                if (SEQ_LEQ(sack_end, sack_begin) ||
                    SEQ_LT(sack_begin, tcp->tcp_suna) ||
                    SEQ_GT(sack_end, tcp->tcp_snxt)) {
                    continue;
                }
                tcp_notsack_insert(&(tcp->tcp_notsack_list),
                    sack_begin, sack_end,
                    &(tcp->tcp_num_notsack_blk),
                    &(tcp->tcp_cnt_notsack_list));
                if (SEQ_GT(sack_end, tcp->tcp_fack)) {
                    tcp->tcp_fack = sack_end;
                }
            }
            found |= TCP_OPT_SACK_PRESENT;
            continue;

        case TCPOPT_TSTAMP:
            if (len < TCPOPT_TSTAMP_LEN ||
                up[1] != TCPOPT_TSTAMP_LEN)
                break;

            tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
            tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);

            found |= TCP_OPT_TSTAMP_PRESENT;

            up += TCPOPT_TSTAMP_LEN;
            continue;

        default:
            if (len <= 1 || len < (int)up[1] || up[1] == 0)
                break;
            up += up[1];
            continue;
        }
        break;
    }
    return (found);
}
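
/*
 * Illustrative caller sketch (this mirrors what tcp_process_options()
 * below actually does; "opts" is a placeholder name):
 *
 *	tcp_opt_t opts;
 *
 *	opts.tcp = NULL;	(or a tcp_t * to collect SACK blocks)
 *	if (tcp_parse_options(tcpha, &opts) & TCP_OPT_MSS_PRESENT)
 *		... clamp opts.tcp_opt_mss to [tcps_mss_min, mss_max] ...
 */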

/*
 * Process all TCP options in the SYN segment. Note that this function should
 * be called after tcp_set_destination() is called so that the necessary info
 * from IRE is already set in the tcp structure.
 *
 * This function sets up the correct tcp_mss value according to the
 * MSS option value and our header size. It also sets up the window scale
 * and timestamp values, and initializes SACK info blocks. But it does not
 * change the receive window size after setting the tcp_mss value. The
 * caller should make the appropriate change.
 */
static void
tcp_process_options(tcp_t *tcp, tcpha_t *tcpha)
{
    int options;
    tcp_opt_t tcpopt;
    uint32_t mss_max;
    char *tmp_tcph;
    tcp_stack_t *tcps = tcp->tcp_tcps;
    conn_t *connp = tcp->tcp_connp;

    tcpopt.tcp = NULL;
    options = tcp_parse_options(tcpha, &tcpopt);

    /*
     * Process the MSS option. Note that the MSS option value does not
     * account for IP or TCP options. This means that it is equal to
     * MTU - minimum IP+TCP header size, which is 40 bytes for IPv4 and
     * 60 bytes for IPv6.
     */
    if (!(options & TCP_OPT_MSS_PRESENT)) {
        if (connp->conn_ipversion == IPV4_VERSION)
            tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4;
        else
            tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6;
    } else {
        if (connp->conn_ipversion == IPV4_VERSION)
            mss_max = tcps->tcps_mss_max_ipv4;
        else
            mss_max = tcps->tcps_mss_max_ipv6;
        if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min)
            tcpopt.tcp_opt_mss = tcps->tcps_mss_min;
        else if (tcpopt.tcp_opt_mss > mss_max)
            tcpopt.tcp_opt_mss = mss_max;
    }

    /* Process the Window Scale option. */
    if (options & TCP_OPT_WSCALE_PRESENT) {
        tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale;
        tcp->tcp_snd_ws_ok = B_TRUE;
    } else {
        tcp->tcp_snd_ws = B_FALSE;
        tcp->tcp_snd_ws_ok = B_FALSE;
        tcp->tcp_rcv_ws = B_FALSE;
    }

    /* Process the Timestamp option. */
    if ((options & TCP_OPT_TSTAMP_PRESENT) &&
        (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) {
        tmp_tcph = (char *)tcp->tcp_tcpha;

        tcp->tcp_snd_ts_ok = B_TRUE;
        tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
        tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
        ASSERT(OK_32PTR(tmp_tcph));
        ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);

        /* Fill in our template header with basic timestamp option. */
        tmp_tcph += connp->conn_ht_ulp_len;
        tmp_tcph[0] = TCPOPT_NOP;
        tmp_tcph[1] = TCPOPT_NOP;
        tmp_tcph[2] = TCPOPT_TSTAMP;
        tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
        connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN;
        connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN;
        tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4);
    } else {
        tcp->tcp_snd_ts_ok = B_FALSE;
    }

    /*
     * Process SACK options. If SACK is enabled for this connection,
     * then allocate the SACK info structure. Note the following places
     * where tcp_snd_sack_ok is set to B_TRUE:
     *
     * For an active connection: in tcp_set_destination() called in
     * tcp_connect().
     *
     * For a passive connection: in tcp_set_destination() called in
     * tcp_input_listener().
     *
     * That's the reason why the extra TCP_IS_DETACHED() check is there.
     * That check makes sure that if we did not send a SACK OK option,
     * we will not enable SACK for this connection even though the other
     * side sends us the SACK OK option. For an active connection, the
     * SACK info structure has already been allocated, so we need to free
     * it if SACK is disabled.
     */
    if ((options & TCP_OPT_SACK_OK_PRESENT) &&
        (tcp->tcp_snd_sack_ok ||
        (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) {
        ASSERT(tcp->tcp_num_sack_blk == 0);
        ASSERT(tcp->tcp_notsack_list == NULL);

        tcp->tcp_snd_sack_ok = B_TRUE;
        if (tcp->tcp_snd_ts_ok) {
            tcp->tcp_max_sack_blk = 3;
        } else {
            tcp->tcp_max_sack_blk = 4;
        }
    } else if (tcp->tcp_snd_sack_ok) {
        /*
         * Reset tcp_snd_sack_ok to B_FALSE so that
         * no SACK info will be used for this
         * connection. This assumes that SACK usage
         * permission is negotiated. This may need
         * to be changed once this is clarified.
         */
        ASSERT(tcp->tcp_num_sack_blk == 0);
        ASSERT(tcp->tcp_notsack_list == NULL);
        tcp->tcp_snd_sack_ok = B_FALSE;
    }

    /*
     * Now we know the exact TCP/IP header length, subtract
     * that from tcp_mss to get our side's MSS.
     */
    tcp->tcp_mss -= connp->conn_ht_iphc_len;

    /*
     * Here we assume that the other side's header size will be equal to
     * our header size. We calculate the real MSS accordingly. We also
     * need to take into account any additional overhead IPsec puts in.
     *
     * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
     */
    tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len +
        tcp->tcp_ipsec_overhead -
        ((connp->conn_ipversion == IPV4_VERSION ?
        IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
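
    /*
     * Worked example (illustrative numbers, IPv4 with the timestamp
     * option negotiated): conn_ht_iphc_len = 20 (IP) + 20 (TCP) +
     * 12 (TCPOPT_REAL_TS_LEN) = 52 bytes, no IPsec overhead, and a
     * peer MSS option of 1460 gives 1460 - (52 - (20 + 20)) = 1448.
     */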

    /*
     * Set MSS to the smaller one of both ends of the connection.
     * We should not have called tcp_mss_set() before, but our
     * side of the MSS should have been set to a proper value
     * by tcp_set_destination(). tcp_mss_set() will also set up the
     * STREAM head parameters properly.
     *
     * If we have a larger-than-16-bit window but the other side
     * didn't want to do window scale, tcp_rwnd_set() will take
     * care of that.
     */
    tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));

    /*
     * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
     * updated properly.
     */
    TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);

    if (tcp->tcp_cc_algo->conn_init != NULL)
        tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
}

/*
 * Add a new piece to the tcp reassembly queue. If the gap at the beginning
 * is filled, return as much as we can. The message passed in may be
 * multi-part, chained using b_cont. "start" is the starting sequence
 * number for this piece.
 */
static mblk_t *
tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
{
    uint32_t end, bytes;
    mblk_t *mp1;
    mblk_t *mp2;
    mblk_t *next_mp;
    uint32_t u1;
    tcp_stack_t *tcps = tcp->tcp_tcps;

    /* Walk through all the new pieces. */
    do {
        ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
            (uintptr_t)INT_MAX);
        end = start + (int)(mp->b_wptr - mp->b_rptr);
        next_mp = mp->b_cont;
        if (start == end) {
            /* Empty. Blast it. */
            freeb(mp);
            continue;
        }
        bytes = end - start;
        mp->b_cont = NULL;
        TCP_REASS_SET_SEQ(mp, start);
        TCP_REASS_SET_END(mp, end);
        mp1 = tcp->tcp_reass_tail;
        if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
            if (mp1 != NULL) {
                /*
                 * New stuff is beyond the tail; link it on the
                 * end.
                 */
                mp1->b_cont = mp;
            } else {
                tcp->tcp_reass_head = mp;
            }
            tcp->tcp_reass_tail = mp;
            TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
            TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
            tcp->tcp_cs.tcp_in_data_unorder_segs++;
            tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
            continue;
        }
        mp1 = tcp->tcp_reass_head;
        u1 = TCP_REASS_SEQ(mp1);
        /* New stuff at the front? */
        if (SEQ_LT(start, u1)) {
            /* Yes... Check for overlap. */
            mp->b_cont = mp1;
            tcp->tcp_reass_head = mp;
            tcp_reass_elim_overlap(tcp, mp);
            continue;
        }
        /*
         * The new piece fits somewhere between the head and tail.
         * We find our slot, where mp1 precedes us and mp2 trails.
         */
        for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
            u1 = TCP_REASS_SEQ(mp2);
            if (SEQ_LEQ(start, u1))
                break;
        }
        /* Link ourselves in */
        mp->b_cont = mp2;
        mp1->b_cont = mp;

        /* Trim overlap with following mblk(s) first */
        tcp_reass_elim_overlap(tcp, mp);

        /* Trim overlap with preceding mblk */
        tcp_reass_elim_overlap(tcp, mp1);

    } while (start = end, mp = next_mp);
    mp1 = tcp->tcp_reass_head;
    /* Anything ready to go? */
    if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt)
        return (NULL);
    /* Eat what we can off the queue */
    for (;;) {
        mp = mp1->b_cont;
        end = TCP_REASS_END(mp1);
        TCP_REASS_SET_SEQ(mp1, 0);
        TCP_REASS_SET_END(mp1, 0);
        if (!mp) {
            tcp->tcp_reass_tail = NULL;
            break;
        }
        if (end != TCP_REASS_SEQ(mp)) {
            mp1->b_cont = NULL;
            break;
        }
        mp1 = mp;
    }
    mp1 = tcp->tcp_reass_head;
    tcp->tcp_reass_head = mp;
    return (mp1);
}
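
/*
 * Illustrative walk-through of tcp_reass() (sequence numbers are made
 * up): with tcp_rnxt == 100, segments arriving as [200,300) and then
 * [300,400) are queued and NULL is returned each time; when [100,200)
 * finally arrives, the head of the queue matches tcp_rnxt and the whole
 * chain covering [100,400) is returned for delivery.
 */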

/* Eliminate any overlap that mp may have over later mblks */
static void
tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp)
{
    uint32_t end;
    mblk_t *mp1;
    uint32_t u1;
    tcp_stack_t *tcps = tcp->tcp_tcps;

    end = TCP_REASS_END(mp);
    while ((mp1 = mp->b_cont) != NULL) {
        u1 = TCP_REASS_SEQ(mp1);
        if (!SEQ_GT(end, u1))
            break;
        if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) {
            mp->b_wptr -= end - u1;
            TCP_REASS_SET_END(mp, u1);
            TCPS_BUMP_MIB(tcps, tcpInDataPartDupSegs);
            TCPS_UPDATE_MIB(tcps, tcpInDataPartDupBytes,
                end - u1);
            break;
        }
        mp->b_cont = mp1->b_cont;
        TCP_REASS_SET_SEQ(mp1, 0);
        TCP_REASS_SET_END(mp1, 0);
        freeb(mp1);
        TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
        TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, end - u1);
    }
    if (!mp1)
        tcp->tcp_reass_tail = mp;
}

/*
 * This function does the PAWS protection check, per RFC 7323 section 5.
 * It requires that the timestamp options have already been processed into
 * tcpoptp. Returns B_TRUE if the segment passes the PAWS test, else
 * returns B_FALSE.
 */
boolean_t
tcp_paws_check(tcp_t *tcp, const tcp_opt_t *tcpoptp)
{
    if (TSTMP_LT(tcpoptp->tcp_opt_ts_val,
        tcp->tcp_ts_recent)) {
        if (LBOLT_FASTPATH64 <
            (tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
            /* This segment is not acceptable. */
            return (B_FALSE);
        } else {
            /*
             * Connection has been idle for
             * too long. Reset the timestamp.
             */
            tcp->tcp_ts_recent =
                tcpoptp->tcp_opt_ts_val;
        }
    }
    return (B_TRUE);
}
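
/*
 * Illustrative example of the check above (made-up values): with
 * tcp_ts_recent == 2000, a segment carrying ts_val == 1500 makes
 * tcp_paws_check() return B_FALSE as long as the connection has been
 * idle for less than PAWS_TIMEOUT; once more than 24 days have passed
 * since the last receive, tcp_ts_recent is simply resynchronized to
 * the new ts_val and the segment passes.
 */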

/*
 * Defense for the SYN attack -
 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
 *    one from the list of droppable eagers. This list is a subset of q0.
 *    See comments before the definition of MAKE_DROPPABLE().
 * 2. Don't drop a SYN request before its first timeout. This gives every
 *    request at least until the first timeout to complete its 3-way
 *    handshake.
 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
 *    requests currently on the queue have timed out. This will be used
 *    as an indicator of whether an attack is under way, so that appropriate
 *    actions can be taken. (It's incremented in tcp_timer() and decremented
 *    either when the eager goes into ESTABLISHED, or gets freed up.)
 * 4. The current thresholds are: # of timeouts > q0len/4 => SYN alert on;
 *    # of timeouts drops back to <= q0len/32 => SYN alert off.
 */
static boolean_t
tcp_drop_q0(tcp_t *tcp)
{
    tcp_t *eager;
    mblk_t *mp;
    tcp_stack_t *tcps = tcp->tcp_tcps;

    ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
    ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);

    /* Pick the oldest eager from the list of droppable eagers */
    eager = tcp->tcp_eager_prev_drop_q0;

    /* If the list is empty, return B_FALSE */
    if (eager == tcp) {
        return (B_FALSE);
    }

    /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
    if ((mp = allocb(0, BPRI_HI)) == NULL)
        return (B_FALSE);

    /*
     * Take this eager out from the list of droppable eagers since we are
     * going to drop it.
     */
    MAKE_UNDROPPABLE(eager);

    if (tcp->tcp_connp->conn_debug) {
        (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
            "tcp_drop_q0: listen half-open queue (max=%d) overflow"
            " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
            tcp->tcp_conn_req_cnt_q0,
            tcp_display(tcp, NULL, DISP_PORT_ONLY));
    }

    TCPS_BUMP_MIB(tcps, tcpHalfOpenDrop);

    /* Put a reference on the conn as we are enqueueing it in the squeue */
    CONN_INC_REF(eager->tcp_connp);

    SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
        tcp_clean_death_wrapper, eager->tcp_connp, NULL,
        SQ_FILL, SQTAG_TCP_DROP_Q0);

    return (B_TRUE);
}

/*
 * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6
 */
static mblk_t *
tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *ira)
{
    tcp_t *ltcp = lconnp->conn_tcp;
    tcp_t *tcp = connp->conn_tcp;
    mblk_t *tpi_mp;
    ipha_t *ipha;
    ip6_t *ip6h;
    sin6_t sin6;
    uint_t ifindex = ira->ira_ruifindex;
    tcp_stack_t *tcps = tcp->tcp_tcps;

    if (ira->ira_flags & IRAF_IS_IPV4) {
        ipha = (ipha_t *)mp->b_rptr;

        connp->conn_ipversion = IPV4_VERSION;
        IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
        IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
        connp->conn_saddr_v6 = connp->conn_laddr_v6;

        sin6 = sin6_null;
        sin6.sin6_addr = connp->conn_faddr_v6;
        sin6.sin6_port = connp->conn_fport;
        sin6.sin6_family = AF_INET6;
        sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
            IPCL_ZONEID(lconnp), tcps->tcps_netstack);

        if (connp->conn_recv_ancillary.crb_recvdstaddr) {
            sin6_t sin6d;

            sin6d = sin6_null;
            sin6d.sin6_addr = connp->conn_laddr_v6;
            sin6d.sin6_port = connp->conn_lport;
            sin6d.sin6_family = AF_INET6;
            tpi_mp = mi_tpi_extconn_ind(NULL,
                (char *)&sin6d, sizeof (sin6_t),
                (char *)&tcp,
                (t_scalar_t)sizeof (intptr_t),
                (char *)&sin6d, sizeof (sin6_t),
                (t_scalar_t)ltcp->tcp_conn_req_seqnum);
        } else {
            tpi_mp = mi_tpi_conn_ind(NULL,
                (char *)&sin6, sizeof (sin6_t),
                (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
                (t_scalar_t)ltcp->tcp_conn_req_seqnum);
        }
    } else {
        ip6h = (ip6_t *)mp->b_rptr;

        connp->conn_ipversion = IPV6_VERSION;
        connp->conn_laddr_v6 = ip6h->ip6_dst;
        connp->conn_faddr_v6 = ip6h->ip6_src;
        connp->conn_saddr_v6 = connp->conn_laddr_v6;

        sin6 = sin6_null;
        sin6.sin6_addr = connp->conn_faddr_v6;
        sin6.sin6_port = connp->conn_fport;
        sin6.sin6_family = AF_INET6;
        sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
        sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
            IPCL_ZONEID(lconnp), tcps->tcps_netstack);

        if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
            /* Pass up the scope_id of the remote addr */
            sin6.sin6_scope_id = ifindex;
        } else {
            sin6.sin6_scope_id = 0;
        }
        if (connp->conn_recv_ancillary.crb_recvdstaddr) {
            sin6_t sin6d;

            sin6d = sin6_null;
            sin6d.sin6_addr = connp->conn_laddr_v6;
            sin6d.sin6_port = connp->conn_lport;
            sin6d.sin6_family = AF_INET6;
            if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6))
                sin6d.sin6_scope_id = ifindex;

            tpi_mp = mi_tpi_extconn_ind(NULL,
                (char *)&sin6d, sizeof (sin6_t),
                (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
                (char *)&sin6d, sizeof (sin6_t),
                (t_scalar_t)ltcp->tcp_conn_req_seqnum);
        } else {
            tpi_mp = mi_tpi_conn_ind(NULL,
                (char *)&sin6, sizeof (sin6_t),
                (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
                (t_scalar_t)ltcp->tcp_conn_req_seqnum);
        }
    }

    tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
    return (tpi_mp);
}

/* Handle a SYN on an AF_INET socket */
static mblk_t *
tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *ira)
{
    tcp_t *ltcp = lconnp->conn_tcp;
    tcp_t *tcp = connp->conn_tcp;
    sin_t sin;
    mblk_t *tpi_mp = NULL;
    tcp_stack_t *tcps = tcp->tcp_tcps;
    ipha_t *ipha;

    ASSERT(ira->ira_flags & IRAF_IS_IPV4);
    ipha = (ipha_t *)mp->b_rptr;

    connp->conn_ipversion = IPV4_VERSION;
    IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
    IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
    connp->conn_saddr_v6 = connp->conn_laddr_v6;

    sin = sin_null;
    sin.sin_addr.s_addr = connp->conn_faddr_v4;
    sin.sin_port = connp->conn_fport;
    sin.sin_family = AF_INET;
    if (lconnp->conn_recv_ancillary.crb_recvdstaddr) {
        sin_t sind;

        sind = sin_null;
        sind.sin_addr.s_addr = connp->conn_laddr_v4;
        sind.sin_port = connp->conn_lport;
        sind.sin_family = AF_INET;
        tpi_mp = mi_tpi_extconn_ind(NULL,
            (char *)&sind, sizeof (sin_t), (char *)&tcp,
            (t_scalar_t)sizeof (intptr_t), (char *)&sind,
            sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
    } else {
        tpi_mp = mi_tpi_conn_ind(NULL,
            (char *)&sin, sizeof (sin_t),
            (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
            (t_scalar_t)ltcp->tcp_conn_req_seqnum);
    }

    tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
    return (tpi_mp);
}

/*
 * Called via squeue to get on to the eager's perimeter. It sends a
 * TH_RST if the eager is in the fanout table. The listener wants the
 * eager to disappear either by means of tcp_eager_blowoff() or
 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
 * called (via squeue) if the eager cannot be inserted in the
 * fanout table in tcp_input_listener().
 */
/* ARGSUSED */
void
tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
    conn_t *econnp = (conn_t *)arg;
    tcp_t *eager = econnp->conn_tcp;
    tcp_t *listener = eager->tcp_listener;

    /*
     * We could be called because the listener is closing. Since
     * the eager was using the listener's queues, we avoid
     * using them from now on.
     */
    ASSERT(eager->tcp_detached);
    econnp->conn_rq = NULL;
    econnp->conn_wq = NULL;

    /*
     * An eager's conn_fanout will be NULL if it's a duplicate
     * of an existing 4-tuple in the conn fanout table.
     * We don't want to send an RST out in such a case.
     */
    if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
        tcp_xmit_ctl("tcp_eager_kill, can't wait",
            eager, eager->tcp_snxt, 0, TH_RST);
    }

    /* We are here because the listener wants this eager gone */
    if (listener != NULL) {
        mutex_enter(&listener->tcp_eager_lock);
        tcp_eager_unlink(eager);
        if (eager->tcp_tconnind_started) {
            /*
             * The eager has sent a conn_ind up to the
             * listener but the listener decides to close
             * instead. We need to drop the extra ref
             * placed on the eager in tcp_input_data() before
             * sending the conn_ind to the listener.
             */
            CONN_DEC_REF(econnp);
        }
        mutex_exit(&listener->tcp_eager_lock);
        CONN_DEC_REF(listener->tcp_connp);
    }

    if (eager->tcp_state != TCPS_CLOSED)
        tcp_close_detached(eager);
}

/*
 * Reset any eager connection hanging off this listener marked
 * with 'seqnum' and then reclaim its resources.
 */
boolean_t
tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
{
    tcp_t *eager;
    mblk_t *mp;

    eager = listener;
    mutex_enter(&listener->tcp_eager_lock);
    do {
        eager = eager->tcp_eager_next_q;
        if (eager == NULL) {
            mutex_exit(&listener->tcp_eager_lock);
            return (B_FALSE);
        }
    } while (eager->tcp_conn_req_seqnum != seqnum);

    if (eager->tcp_closemp_used) {
        mutex_exit(&listener->tcp_eager_lock);
        return (B_TRUE);
    }
    eager->tcp_closemp_used = B_TRUE;
    TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
    CONN_INC_REF(eager->tcp_connp);
    mutex_exit(&listener->tcp_eager_lock);
    mp = &eager->tcp_closemp;
    SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
        eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
    return (B_TRUE);
}

/*
 * Reset any eager connection hanging off this listener
 * and then reclaim its resources.
 */
void
tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
{
    tcp_t *eager;
    mblk_t *mp;
    tcp_stack_t *tcps = listener->tcp_tcps;

    ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));

    if (!q0_only) {
        /* First clean up q */
        TCP_STAT(tcps, tcp_eager_blowoff_q);
        eager = listener->tcp_eager_next_q;
        while (eager != NULL) {
            if (!eager->tcp_closemp_used) {
                eager->tcp_closemp_used = B_TRUE;
                TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
                CONN_INC_REF(eager->tcp_connp);
                mp = &eager->tcp_closemp;
                SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
                    tcp_eager_kill, eager->tcp_connp, NULL,
                    SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
            }
            eager = eager->tcp_eager_next_q;
        }
    }
    /* Then clean up q0 */
    TCP_STAT(tcps, tcp_eager_blowoff_q0);
    eager = listener->tcp_eager_next_q0;
    while (eager != listener) {
        if (!eager->tcp_closemp_used) {
            eager->tcp_closemp_used = B_TRUE;
            TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
            CONN_INC_REF(eager->tcp_connp);
            mp = &eager->tcp_closemp;
            SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
                tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL,
                SQTAG_TCP_EAGER_CLEANUP_Q0);
        }
        eager = eager->tcp_eager_next_q0;
    }
}

/*
 * If we are an eager connection hanging off a listener that hasn't
 * formally accepted the connection yet, get off its list and blow off
 * any data that we have accumulated.
 */
void
tcp_eager_unlink(tcp_t *tcp)
{
    tcp_t *listener = tcp->tcp_listener;

    ASSERT(listener != NULL);
    ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
    if (tcp->tcp_eager_next_q0 != NULL) {
        ASSERT(tcp->tcp_eager_prev_q0 != NULL);

        /* Remove the eager tcp from q0 */
        tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
            tcp->tcp_eager_prev_q0;
        tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
            tcp->tcp_eager_next_q0;
        ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
        listener->tcp_conn_req_cnt_q0--;

        tcp->tcp_eager_next_q0 = NULL;
        tcp->tcp_eager_prev_q0 = NULL;

        /*
         * Take the eager out, if it is in the list of droppable
         * eagers.
         */
        MAKE_UNDROPPABLE(tcp);

        if (tcp->tcp_syn_rcvd_timeout != 0) {
            /* we have timed out before */
            ASSERT(listener->tcp_syn_rcvd_timeout > 0);
            listener->tcp_syn_rcvd_timeout--;
        }
    } else {
        tcp_t **tcpp = &listener->tcp_eager_next_q;
        tcp_t *prev = NULL;

        for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
            if (tcpp[0] == tcp) {
                if (listener->tcp_eager_last_q == tcp) {
                    /*
                     * If we are unlinking the last
                     * element on the list, adjust
                     * tail pointer. Set tail pointer
                     * to nil when list is empty.
                     */
                    ASSERT(tcp->tcp_eager_next_q == NULL);
                    if (listener->tcp_eager_last_q ==
                        listener->tcp_eager_next_q) {
                        listener->tcp_eager_last_q =
                            NULL;
                    } else {
                        /*
                         * We won't get here if there
                         * is only one eager in the
                         * list.
                         */
                        ASSERT(prev != NULL);
                        listener->tcp_eager_last_q =
                            prev;
                    }
                }
                tcpp[0] = tcp->tcp_eager_next_q;
                tcp->tcp_eager_next_q = NULL;
                tcp->tcp_eager_last_q = NULL;
                ASSERT(listener->tcp_conn_req_cnt_q > 0);
                listener->tcp_conn_req_cnt_q--;
                break;
            }
            prev = tcpp[0];
        }
    }
    tcp->tcp_listener = NULL;
}

/* BEGIN CSTYLED */
/*
 *
 * The sockfs ACCEPT path:
 * =======================
 *
 * The eager is now established in its own perimeter as soon as the SYN is
 * received in tcp_input_listener(). When sockfs receives the conn_ind, it
 * completes the accept processing on the acceptor STREAM. The sending
 * of the conn_ind part is common for both a sockfs listener and a TLI/XTI
 * listener, but a TLI/XTI listener completes the accept processing
 * on the listener perimeter.
 *
 * Common control flow for 3 way handshake:
 * ----------------------------------------
 *
 * incoming SYN (listener perimeter)    -> tcp_input_listener()
 *
 * incoming SYN-ACK-ACK (eager perim)   -> tcp_input_data()
 * send T_CONN_IND (listener perim)     -> tcp_send_conn_ind()
 *
 * Sockfs ACCEPT Path:
 * -------------------
 *
 * open acceptor stream (tcp_open allocates tcp_tli_accept()
 * as STREAM entry point)
 *
 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept()
 *
 * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager
 * association (we are not behind the eager's squeue, but sockfs is
 * protecting us and no one knows about this stream yet). The STREAMS entry
 * point q->q_info is changed to point at tcp_wput().
 *
 * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to
 * the listener (done on the listener's perimeter).
 *
 * tcp_tli_accept() calls tcp_accept_finish() on the eager's perimeter to
 * finish the accept.
 *
 * TLI/XTI client ACCEPT path:
 * ---------------------------
 *
 * soaccept() sends T_CONN_RES on the listener STREAM.
 *
 * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send
 * a M_SETOPS mblk to the eager perimeter to finish the accept
 * (tcp_accept_finish()).
 *
 * Locks:
 * ======
 *
 * listener->tcp_eager_lock protects the listener's tcp_eager_next_q0
 * and tcp_eager_next_q.
 *
 * Referencing:
 * ============
 *
 * 1) We start out in tcp_input_listener by the eager placing a ref on the
 * listener and the listener adding the eager to listener->tcp_eager_next_q0.
 *
 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to the listener.
 * Before doing so we place a ref on the eager. This ref is finally dropped
 * at the end of tcp_accept_finish() while unwinding from the squeue, i.e.
 * the reference is dropped by the squeue framework.
 *
 * 3) The ref on the listener placed in 1 above is dropped in
 * tcp_accept_finish().
 *
 * The reference must be released by the same entity that added the
 * reference. In the above scheme, the eager is the entity that adds and
 * releases the references. Note that tcp_accept_finish executes in the
 * squeue of the eager (albeit after it is attached to the acceptor stream).
 * Though 1. executes in the listener's squeue, the eager is nascent at this
 * point and the reference can be considered to have been added on behalf
 * of the eager.
 *
 * Eager getting a Reset or listener closing:
 * ==========================================
 *
 * Once the listener and eager are linked, the listener never does the
 * unlink. If the listener needs to close, tcp_eager_cleanup() is called,
 * which queues a message on all eager perimeters. The eager then does the
 * unlink, clears any pointers to the listener's queue and drops the
 * reference to the listener. The listener waits in tcp_close outside the
 * squeue until its refcount has dropped to 1. This ensures that the
 * listener has waited for all eagers to clear their association with the
 * listener.
 *
 * Similarly, if the eager decides to go away, it can unlink itself and
 * close. When the T_CONN_RES comes down, we check if the eager has closed.
 * Note that the reference to the eager is still valid because of the extra
 * ref we put in tcp_send_conn_ind.
 *
 * The listener can always locate the eager under the protection
 * of the listener->tcp_eager_lock, and then do a refhold
 * on the eager during the accept processing.
 *
 * The acceptor stream accesses the eager in the accept processing
 * based on the ref placed on the eager before sending T_conn_ind.
 * The only entity that can negate this refhold is a listener close,
 * which is mutually exclusive with an active acceptor stream.
 *
 * Eager's reference on the listener
 * ===================================
 *
 * If the accept happens (even on a closed eager) the eager drops its
 * reference on the listener at the start of tcp_accept_finish. If the
 * eager is killed due to an incoming RST before the T_conn_ind is sent up,
 * the reference is dropped in tcp_closei_local. If the listener closes,
 * the reference is dropped in tcp_eager_kill. In all cases the reference
 * is dropped while executing in the eager's context (squeue).
 */
/* END CSTYLED */

/* Process the SYN packet, mp, directed at the listener 'tcp' */

/*
 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
 * tcp_input_data will not see any packets for listeners since the listener
 * has conn_recv set to tcp_input_listener.
 */
/* ARGSUSED */
static void
tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
    tcpha_t *tcpha;
    uint32_t seg_seq;
    tcp_t *eager;
    int err;
    conn_t *econnp = NULL;
    squeue_t *new_sqp;
    mblk_t *mp1;
    uint_t ip_hdr_len;
    conn_t *lconnp = (conn_t *)arg;
    tcp_t *listener = lconnp->conn_tcp;
    tcp_stack_t *tcps = listener->tcp_tcps;
    ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
    uint_t flags;
    mblk_t *tpi_mp;
    uint_t ifindex = ira->ira_ruifindex;
    boolean_t tlc_set = B_FALSE;

    ip_hdr_len = ira->ira_ip_hdr_length;
    tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
    flags = (unsigned int)tcpha->tha_flags & 0xFF;

    DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, lconnp->conn_ixa,
        __dtrace_tcp_void_ip_t *, mp->b_rptr, tcp_t *, listener,
        __dtrace_tcp_tcph_t *, tcpha);

    if (!(flags & TH_SYN)) {
        if ((flags & TH_RST) || (flags & TH_URG)) {
            freemsg(mp);
            return;
        }
        if (flags & TH_ACK) {
            /* Note this executes in listener's squeue */
            tcp_xmit_listeners_reset(mp, ira, ipst, lconnp);
            return;
        }

        freemsg(mp);
        return;
    }

    if (listener->tcp_state != TCPS_LISTEN)
        goto error2;

    ASSERT(IPCL_IS_BOUND(lconnp));

    mutex_enter(&listener->tcp_eager_lock);

    /*
     * If the system is under memory pressure, we need to do our part
     * to relieve the pressure. So we only accept new requests if there
     * is nothing waiting to be accepted or waiting to complete the 3-way
     * handshake. This means that busy listeners will not get too many
     * new requests which they cannot handle in time, while non-busy
     * listeners keep functioning properly.
     */
    if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 ||
        listener->tcp_conn_req_cnt_q0 > 0)) {
        mutex_exit(&listener->tcp_eager_lock);
        TCP_STAT(tcps, tcp_listen_mem_drop);
        goto error2;
    }

    if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) {
        mutex_exit(&listener->tcp_eager_lock);
        TCP_STAT(tcps, tcp_listendrop);
        TCPS_BUMP_MIB(tcps, tcpListenDrop);
        if (lconnp->conn_debug) {
            (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
                "tcp_input_listener: listen backlog (max=%d) "
                "overflow (%d pending) on %s",
                listener->tcp_conn_req_max,
                listener->tcp_conn_req_cnt_q,
                tcp_display(listener, NULL, DISP_PORT_ONLY));
        }
        goto error2;
    }

    if (listener->tcp_conn_req_cnt_q0 >=
        listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
        /*
         * Q0 is full. Drop a pending half-open req from the queue
         * to make room for the new SYN req. Also mark the time we
         * drop a SYN.
         *
         * A more aggressive defense against SYN attacks would
         * be to set the "tcp_syn_defense" flag now.
         */
        TCP_STAT(tcps, tcp_listendropq0);
        listener->tcp_last_rcv_lbolt = ddi_get_lbolt64();
        if (!tcp_drop_q0(listener)) {
            mutex_exit(&listener->tcp_eager_lock);
            TCPS_BUMP_MIB(tcps, tcpListenDropQ0);
            if (lconnp->conn_debug) {
                (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
                    "tcp_input_listener: listen half-open "
                    "queue (max=%d) full (%d pending) on %s",
                    tcps->tcps_conn_req_max_q0,
                    listener->tcp_conn_req_cnt_q0,
                    tcp_display(listener, NULL,
                    DISP_PORT_ONLY));
            }
            goto error2;
        }
    }

    /*
     * Enforce the limit set on the number of connections per listener.
     * Note that tlc_cnt starts at 1, so we need to add 1 to tlc_max
     * for the comparison.
     */
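    /*
     * Worked example (illustrative numbers): with tlc_max == 1000 and
     * 1000 connections already admitted, tlc_cnt is 1001 (the counter
     * starts at 1); the next SYN makes atomic_inc_32_nv() return
     * 1002 > tlc_max + 1, so that attempt is dropped and the counter
     * is decremented again below.
     */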
1506 if (listener->tcp_listen_cnt != NULL) {
1507 tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt;
1508 int64_t now;
1509
1510 if (atomic_inc_32_nv(&tlc->tlc_cnt) > tlc->tlc_max + 1) {
1511 mutex_exit(&listener->tcp_eager_lock);
1512 now = ddi_get_lbolt64();
1513 atomic_dec_32(&tlc->tlc_cnt);
1514 TCP_STAT(tcps, tcp_listen_cnt_drop);
1515 tlc->tlc_drop++;
1516 if (now - tlc->tlc_report_time >
1517 MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) {
1518 zcmn_err(lconnp->conn_zoneid, CE_WARN,
1519 "Listener (port %d) connection max (%u) "
1520 "reached: %u attempts dropped total\n",
1521 ntohs(listener->tcp_connp->conn_lport),
1522 tlc->tlc_max, tlc->tlc_drop);
1523 tlc->tlc_report_time = now;
1524 }
1525 goto error2;
1526 }
1527 tlc_set = B_TRUE;
1528 }
1529
1530 mutex_exit(&listener->tcp_eager_lock);
1531
1532 /*
1533 * IP sets ira_sqp to either the senders conn_sqp (for loopback)
1534 * or based on the ring (for packets from GLD). Otherwise it is
1535 * set based on lbolt i.e., a somewhat random number.
1536 */
1537 ASSERT(ira->ira_sqp != NULL);
1538 new_sqp = ira->ira_sqp;
1539
1540 econnp = tcp_get_conn(arg2, tcps);
1541 if (econnp == NULL)
1542 goto error2;
1543
1544 ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
1545 econnp->conn_sqp = new_sqp;
1546 econnp->conn_initial_sqp = new_sqp;
1547 econnp->conn_ixa->ixa_sqp = new_sqp;
1548
1549 econnp->conn_fport = tcpha->tha_lport;
1550 econnp->conn_lport = tcpha->tha_fport;
1551
1552 err = conn_inherit_parent(lconnp, econnp);
1553 if (err != 0)
1554 goto error3;
1555
1556 /* We already know the laddr of the new connection is ours */
1557 econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation;
1558
1559 ASSERT(OK_32PTR(mp->b_rptr));
1560 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
1561 IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
1562
1563 if (lconnp->conn_family == AF_INET) {
1564 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
1565 tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira);
1566 } else {
1567 tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira);
1568 }
1569
1570 if (tpi_mp == NULL)
1571 goto error3;
1572
1573 eager = econnp->conn_tcp;
1574 eager->tcp_detached = B_TRUE;
1575 SOCK_CONNID_INIT(eager->tcp_connid);
1576
1577 /*
1578 * Initialize the eager's tcp_t and inherit some parameters from
1579 * the listener.
1580 */
1581 tcp_init_values(eager, listener);
1582
1583 ASSERT((econnp->conn_ixa->ixa_flags &
1584 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
1585 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) ==
1586 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
1587 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO));
1588
1589 if (!tcps->tcps_dev_flow_ctl)
1590 econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
1591
1592 /* Prepare for diffing against previous packets */
1593 eager->tcp_recvifindex = 0;
1594 eager->tcp_recvhops = 0xffffffffU;
1595
1596 if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) {
1597 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) ||
1598 IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) {
1599 econnp->conn_incoming_ifindex = ifindex;
1600 econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
1601 econnp->conn_ixa->ixa_scopeid = ifindex;
1602 }
1603 }
1604
1605 if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) ==
1606 (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) &&
1607 tcps->tcps_rev_src_routes) {
1608 ipha_t *ipha = (ipha_t *)mp->b_rptr;
1609 ip_pkt_t *ipp = &econnp->conn_xmit_ipp;
1610
1611 /* Source routing option copyover (reverse it) */
1612 err = ip_find_hdr_v4(ipha, ipp, B_TRUE);
1613 if (err != 0) {
1614 freemsg(tpi_mp);
1615 goto error3;
1616 }
1617 ip_pkt_source_route_reverse_v4(ipp);
1618 }
1619
1620 ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL);
1621 ASSERT(!eager->tcp_tconnind_started);
1622 /*
1623 * If the SYN came with a credential, it's a loopback packet or a
1624 * labeled packet; attach the credential to the TPI message.
1625 */
1626 if (ira->ira_cred != NULL)
1627 mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid);
1628
1629 eager->tcp_conn.tcp_eager_conn_ind = tpi_mp;
1630 ASSERT(eager->tcp_ordrel_mp == NULL);
1631
1632 /* Inherit the listener's non-STREAMS flag */
1633 if (IPCL_IS_NONSTR(lconnp)) {
1634 econnp->conn_flags |= IPCL_NONSTR;
1635 /* All non-STREAMS tcp_ts are sockets */
1636 eager->tcp_issocket = B_TRUE;
1637 } else {
1638 /*
1639 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that
1640 * at close time, we will always have that to send up.
1641 * Otherwise, we need to do special handling in case the
1642 * allocation fails at that time.
1643 */
1644 if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
1645 goto error3;
1646 }
1647 /*
1648 * Now that the IP addresses and ports are setup in econnp we
1649 * can do the IPsec policy work.
1650 */
1651 if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1652 if (lconnp->conn_policy != NULL) {
1653 /*
1654 * Inherit the policy from the listener; use
1655 * actions from ira
1656 */
1657 if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) {
1658 CONN_DEC_REF(econnp);
1659 freemsg(mp);
1660 goto error3;
1661 }
1662 }
1663 }
1664
1665 /*
1666 * tcp_set_destination() may set tcp_rwnd according to the route
1667 * metrics. If it does not, the eager's receive window will be set
1668 * to the listener's receive window later in this function.
1669 */
1670 eager->tcp_rwnd = 0;
1671
1672 if (is_system_labeled()) {
1673 ip_xmit_attr_t *ixa = econnp->conn_ixa;
1674
1675 ASSERT(ira->ira_tsl != NULL);
1676 /* Discard any old label */
1677 if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1678 ASSERT(ixa->ixa_tsl != NULL);
1679 label_rele(ixa->ixa_tsl);
1680 ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1681 ixa->ixa_tsl = NULL;
1682 }
1683 if ((lconnp->conn_mlp_type != mlptSingle ||
1684 lconnp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1685 ira->ira_tsl != NULL) {
1686 /*
1687 * If this is an MLP connection or a MAC-Exempt
1688 * connection with an unlabeled node, packets are to be
1689 * exchanged using the security label of the received
1690 * SYN packet instead of the server application's label.
1691 * tsol_check_dest called from ip_set_destination
1692 * might later update TSF_UNLABELED by replacing
1693 * ixa_tsl with a new label.
1694 */
1695 label_hold(ira->ira_tsl);
1696 ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
1697 DTRACE_PROBE2(mlp_syn_accept, conn_t *,
1698 econnp, ts_label_t *, ixa->ixa_tsl)
1699 } else {
1700 ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
1701 DTRACE_PROBE2(syn_accept, conn_t *,
1702 econnp, ts_label_t *, ixa->ixa_tsl)
1703 }
1704 /*
1705 * conn_connect() called from tcp_set_destination will verify
1706 * the destination is allowed to receive packets at the
1707 * security label of the SYN-ACK we are generating. As part of
1708 * that, tsol_check_dest() may create a new effective label for
1709 * this connection.
1710 * Finally conn_connect() will call conn_update_label.
1711 * All that remains for TCP to do is to call
1712 * conn_build_hdr_template which is done as part of
1713 * tcp_set_destination.
1714 */
1715 }
1716
1717 /*
1718 * Since we will clear tcp_listener before we clear tcp_detached
1719 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
1720 * so we can tell a TCP_IS_DETACHED_NONEAGER apart.
1721 */
1722 eager->tcp_hard_binding = B_TRUE;
1723
1724 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
1725 TCP_BIND_HASH(econnp->conn_lport)], eager, 0);
1726
1727 CL_INET_CONNECT(econnp, B_FALSE, err);
1728 if (err != 0) {
1729 tcp_bind_hash_remove(eager);
1730 goto error3;
1731 }
1732
1733 SOCK_CONNID_BUMP(eager->tcp_connid);
1734
1735 /*
1736 * Adapt our mss, ttl, ... based on the remote address.
1737 */
1738
1739 if (tcp_set_destination(eager) != 0) {
1740 TCPS_BUMP_MIB(tcps, tcpAttemptFails);
1741 /* Undo the bind_hash_insert */
1742 tcp_bind_hash_remove(eager);
1743 goto error3;
1744 }
1745
1746 /* Process all TCP options. */
1747 tcp_process_options(eager, tcpha);
1748
1749 /* Is the other end ECN capable? */
1750 if (tcps->tcps_ecn_permitted >= 1 &&
1751 (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
1752 eager->tcp_ecn_ok = B_TRUE;
1753 }
1754
1755 /*
1756 * The listener's conn_rcvbuf should be the default window size or a
1757 * window size changed via SO_RCVBUF option. First round up the
1758 * eager's tcp_rwnd to the nearest MSS. Then find out the window
1759 * scale option value if needed. Call tcp_rwnd_set() to finish the
1760 * setting.
1761 *
1762	 * Note that if there is an rpipe metric associated with the remote host,
1763	 * we should not inherit the receive window size from the listener.
1764 */
1765 eager->tcp_rwnd = MSS_ROUNDUP(
1766 (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf :
1767 eager->tcp_rwnd), eager->tcp_mss);
1768 if (eager->tcp_snd_ws_ok)
1769 tcp_set_ws_value(eager);
1770 /*
1771 * Note that this is the only place tcp_rwnd_set() is called for
1772 * accepting a connection. We need to call it here instead of
1773 * after the 3-way handshake because we need to tell the other
1774 * side our rwnd in the SYN-ACK segment.
1775 */
1776 (void) tcp_rwnd_set(eager, eager->tcp_rwnd);
1777
1778 ASSERT(eager->tcp_connp->conn_rcvbuf != 0 &&
1779 eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd);
1780
1781 ASSERT(econnp->conn_rcvbuf != 0 &&
1782 econnp->conn_rcvbuf == eager->tcp_rwnd);
1783
1784 /* Put a ref on the listener for the eager. */
1785 CONN_INC_REF(lconnp);
1786 mutex_enter(&listener->tcp_eager_lock);
1787 listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
1788 eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0;
1789 listener->tcp_eager_next_q0 = eager;
1790 eager->tcp_eager_prev_q0 = listener;
1791
1792 /* Set tcp_listener before adding it to tcp_conn_fanout */
1793 eager->tcp_listener = listener;
1794 eager->tcp_saved_listener = listener;
1795
1796 /*
1797 * Set tcp_listen_cnt so that when the connection is done, the counter
1798 * is decremented.
1799 */
1800 eager->tcp_listen_cnt = listener->tcp_listen_cnt;
1801
1802 /*
1803 * Tag this detached tcp vector for later retrieval
1804 * by our listener client in tcp_accept().
1805 */
1806 eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum;
1807 listener->tcp_conn_req_cnt_q0++;
1808 if (++listener->tcp_conn_req_seqnum == -1) {
1809 /*
1810 * -1 is "special" and defined in TPI as something
1811 * that should never be used in T_CONN_IND
1812 */
1813 ++listener->tcp_conn_req_seqnum;
1814 }
1815 mutex_exit(&listener->tcp_eager_lock);
1816
1817 if (listener->tcp_syn_defense) {
1818 /* Don't drop the SYN that comes from a good IP source */
1819 ipaddr_t *addr_cache;
1820
1821 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1822 if (addr_cache != NULL && econnp->conn_faddr_v4 ==
1823 addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) {
1824 eager->tcp_dontdrop = B_TRUE;
1825 }
1826 }
1827
1828 /*
1829 * We need to insert the eager in its own perimeter but as soon
1830 * as we do that, we expose the eager to the classifier and
1831 * should not touch any field outside the eager's perimeter.
1832 * So do all the work necessary before inserting the eager
1833 * in its own perimeter. Be optimistic that conn_connect()
1834 * will succeed but undo everything if it fails.
1835 */
1836 seg_seq = ntohl(tcpha->tha_seq);
1837 eager->tcp_irs = seg_seq;
1838 eager->tcp_rack = seg_seq;
1839 eager->tcp_rnxt = seg_seq + 1;
1840 eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt);
1841 TCPS_BUMP_MIB(tcps, tcpPassiveOpens);
1842 eager->tcp_state = TCPS_SYN_RCVD;
1843 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
1844 econnp->conn_ixa, void, NULL, tcp_t *, eager, void, NULL,
1845 int32_t, TCPS_LISTEN);
1846
1847 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
1848 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
1849 if (mp1 == NULL) {
1850 /*
1851	 * Increment the ref count as we are going to
1852	 * enqueue an mp in the squeue.
1853 */
1854 CONN_INC_REF(econnp);
1855 goto error;
1856 }
1857
1858 /*
1859 * We need to start the rto timer. In normal case, we start
1860 * the timer after sending the packet on the wire (or at
1861 * least believing that packet was sent by waiting for
1862 * conn_ip_output() to return). Since this is the first packet
1863 * being sent on the wire for the eager, our initial tcp_rto
1864 * is at least tcp_rexmit_interval_min which is a fairly
1865 * large value to allow the algorithm to adjust slowly to large
1866 * fluctuations of RTT during first few transmissions.
1867 *
1868	 * Starting the timer first and then sending the packet in this
1869	 * case shouldn't make much difference: tcp_rexmit_interval_min
1870	 * is on the order of several hundred milliseconds, while starting
1871	 * the timer first and then sending the packet differs by only a
1872	 * few microseconds.
1873 *
1874 * Without this optimization, we are forced to hold the fanout
1875 * lock across the ipcl_bind_insert() and sending the packet
1876 * so that we don't race against an incoming packet (maybe RST)
1877 * for this eager.
1878 *
1879 * It is necessary to acquire an extra reference on the eager
1880 * at this point and hold it until after tcp_send_data() to
1881 * ensure against an eager close race.
1882 */
1883
1884 CONN_INC_REF(econnp);
1885
1886 TCP_TIMER_RESTART(eager, eager->tcp_rto);
1887
1888 /*
1889 * Insert the eager in its own perimeter now. We are ready to deal
1890 * with any packets on eager.
1891 */
1892 if (ipcl_conn_insert(econnp) != 0)
1893 goto error;
1894
1895 ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp);
1896 freemsg(mp);
1897 /*
1898 * Send the SYN-ACK. Use the right squeue so that conn_ixa is
1899 * only used by one thread at a time.
1900 */
1901 if (econnp->conn_sqp == lconnp->conn_sqp) {
1902 DTRACE_TCP5(send, mblk_t *, NULL, ip_xmit_attr_t *,
1903 econnp->conn_ixa, __dtrace_tcp_void_ip_t *, mp1->b_rptr,
1904 tcp_t *, eager, __dtrace_tcp_tcph_t *,
1905 &mp1->b_rptr[econnp->conn_ixa->ixa_ip_hdr_length]);
1906 (void) conn_ip_output(mp1, econnp->conn_ixa);
1907 CONN_DEC_REF(econnp);
1908 } else {
1909 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack,
1910 econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK);
1911 }
1912 return;
1913 error:
1914 freemsg(mp1);
1915 eager->tcp_closemp_used = B_TRUE;
1916 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
1917 mp1 = &eager->tcp_closemp;
1918 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
1919 econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
1920
1921 /*
1922	 * If a connection already exists, send the mp to that connection so
1923	 * that it can be appropriately dealt with.
1924 */
1925 ipst = tcps->tcps_netstack->netstack_ip;
1926
1927 if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
1928 if (!IPCL_IS_CONNECTED(econnp)) {
1929 /*
1930	 * Something bad happened. ipcl_conn_insert()
1931	 * failed because a connection already existed
1932	 * in the connected hash but we can't find it
1933	 * anymore (someone blew it away). Just
1934	 * free this message and hopefully the remote
1935	 * will retransmit, at which time the SYN can be
1936	 * treated as a new connection or dealt with
1937	 * by a TH_RST if a connection already exists.
1938 */
1939 CONN_DEC_REF(econnp);
1940 freemsg(mp);
1941 } else {
1942 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
1943 econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
1944 }
1945 } else {
1946 /* Nobody wants this packet */
1947 freemsg(mp);
1948 }
1949 return;
1950 error3:
1951 CONN_DEC_REF(econnp);
1952 error2:
1953 freemsg(mp);
1954 if (tlc_set)
1955 atomic_dec_32(&listener->tcp_listen_cnt->tlc_cnt);
1956 }
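/*
 * Illustrative sketch (not part of this file's code): the SYN-defense
 * address cache consulted above is assumed to behave like a small
 * direct-mapped table indexed by a hash of the peer's IPv4 address; a
 * hit means the peer recently behaved well and its SYNs should not be
 * dropped. A minimal stand-alone model, with the table size and hash
 * function chosen here purely for illustration:
 */
#if 0
#include <stdint.h>

#define	ADDR_CACHE_SIZE	256	/* hypothetical size */

static uint32_t addr_cache[ADDR_CACHE_SIZE];

static unsigned
addr_cache_hash(uint32_t faddr)
{
	/* hypothetical hash: fold the address into the index range */
	return ((faddr ^ (faddr >> 8) ^ (faddr >> 16)) % ADDR_CACHE_SIZE);
}

/* Remember a peer that behaved well. */
static void
addr_cache_remember(uint32_t faddr)
{
	addr_cache[addr_cache_hash(faddr)] = faddr;
}

/* A hit means this peer's SYNs are not dropped under SYN defense. */
static int
addr_cache_hit(uint32_t faddr)
{
	return (addr_cache[addr_cache_hash(faddr)] == faddr);
}
#endif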
1957
1958 /*
1959	 * In the ideal case of vertical partitioning on a NUMA architecture,
1960	 * it's beneficial to have the listener and all the incoming connections
1961	 * tied to the same squeue. The other constraint is that incoming
1962	 * connections should be tied to the squeue attached to the interrupted
1963	 * CPU for obvious locality reasons, so this leaves the listener to
1964	 * be tied to the same squeue. Our only problem is that when the listener
1965	 * is binding, the CPU that will get interrupted by the NIC whose
1966	 * IP address the listener is binding to is not yet known. So
1967	 * the code below allows us to change that binding at the time the
1968	 * CPU is interrupted, by virtue of the incoming connection's squeue.
1969	 *
1970	 * This is useful only in the case of a listener bound to a specific IP
1971	 * address. Other kinds of listeners get bound the
1972	 * very first time and there is no attempt to rebind them.
1973 */
1974 void
1975 tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
1976 ip_recv_attr_t *ira)
1977 {
1978 conn_t *connp = (conn_t *)arg;
1979 squeue_t *sqp = (squeue_t *)arg2;
1980 squeue_t *new_sqp;
1981 uint32_t conn_flags;
1982
1983 /*
1984	 * IP sets ira_sqp to either the sender's conn_sqp (for loopback)
1985 * or based on the ring (for packets from GLD). Otherwise it is
1986 * set based on lbolt i.e., a somewhat random number.
1987 */
1988 ASSERT(ira->ira_sqp != NULL);
1989 new_sqp = ira->ira_sqp;
1990
1991 if (connp->conn_fanout == NULL)
1992 goto done;
1993
1994 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
1995 mutex_enter(&connp->conn_fanout->connf_lock);
1996 mutex_enter(&connp->conn_lock);
1997 /*
1998 * No one from read or write side can access us now
1999 * except for already queued packets on this squeue.
2000 * But since we haven't changed the squeue yet, they
2001 * can't execute. If they are processed after we have
2002 * changed the squeue, they are sent back to the
2003 * correct squeue down below.
2004	 * But a listener close can race with processing of
2005	 * incoming SYN. If incoming SYN processing changes
2006	 * the squeue then the listener close which is waiting
2007	 * to enter the squeue would operate on the wrong
2008	 * squeue. Hence we don't change the squeue here unless
2009	 * the refcount is exactly the minimum refcount. The
2010	 * minimum refcount of 4 is counted as follows: 1 each for
2011	 * TCP and IP, 1 for being in the classifier hash, and
2012	 * 1 for the mblk being processed.
2013 */
2014
2015 if (connp->conn_ref != 4 ||
2016 connp->conn_tcp->tcp_state != TCPS_LISTEN) {
2017 mutex_exit(&connp->conn_lock);
2018 mutex_exit(&connp->conn_fanout->connf_lock);
2019 goto done;
2020 }
2021 if (connp->conn_sqp != new_sqp) {
2022 while (connp->conn_sqp != new_sqp)
2023 (void) atomic_cas_ptr(&connp->conn_sqp, sqp,
2024 new_sqp);
2025 /* No special MT issues for outbound ixa_sqp hint */
2026 connp->conn_ixa->ixa_sqp = new_sqp;
2027 }
2028
2029 do {
2030 conn_flags = connp->conn_flags;
2031 conn_flags |= IPCL_FULLY_BOUND;
2032 (void) atomic_cas_32(&connp->conn_flags,
2033 connp->conn_flags, conn_flags);
2034 } while (!(connp->conn_flags & IPCL_FULLY_BOUND));
2035
2036 mutex_exit(&connp->conn_fanout->connf_lock);
2037 mutex_exit(&connp->conn_lock);
2038
2039 /*
2040 * Assume we have picked a good squeue for the listener. Make
2041 * subsequent SYNs not try to change the squeue.
2042 */
2043 connp->conn_recv = tcp_input_listener;
2044 }
2045
2046 done:
2047 if (connp->conn_sqp != sqp) {
2048 CONN_INC_REF(connp);
2049 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
2050 ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
2051 } else {
2052 tcp_input_listener(connp, mp, sqp, ira);
2053 }
2054 }
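/*
 * Illustrative sketch (not part of this file's code): the conn_sqp
 * switch above uses a compare-and-swap loop so that the pointer always
 * moves atomically from the old squeue to the new one, no matter how
 * many threads race. A stand-alone model using C11 atomics (all names
 * hypothetical):
 */
#if 0
#include <stdatomic.h>

struct sq;	/* opaque stand-in for squeue_t */

static _Atomic(struct sq *) conn_sqp;

static void
migrate_sqp(struct sq *old_sqp, struct sq *new_sqp)
{
	struct sq *cur = old_sqp;

	while (cur != new_sqp) {
		/* On failure, cur is reloaded with the observed value. */
		if (atomic_compare_exchange_strong(&conn_sqp, &cur, new_sqp))
			break;	/* we installed new_sqp */
	}
}
#endif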
2055
2056 /*
2057 * Send up all messages queued on tcp_rcv_list.
2058 */
2059 uint_t
2060 tcp_rcv_drain(tcp_t *tcp)
2061 {
2062 mblk_t *mp;
2063 uint_t ret = 0;
2064 #ifdef DEBUG
2065 uint_t cnt = 0;
2066 #endif
2067 queue_t *q = tcp->tcp_connp->conn_rq;
2068
2069 /* Can't drain on an eager connection */
2070 if (tcp->tcp_listener != NULL)
2071 return (ret);
2072
2073 /* Can't be a non-STREAMS connection */
2074 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
2075
2076 /* No need for the push timer now. */
2077 if (tcp->tcp_push_tid != 0) {
2078 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
2079 tcp->tcp_push_tid = 0;
2080 }
2081
2082 /*
2083 * Handle two cases here: we are currently fused or we were
2084 * previously fused and have some urgent data to be delivered
2085 * upstream. The latter happens because we either ran out of
2086 * memory or were detached and therefore sending the SIGURG was
2087 * deferred until this point. In either case we pass control
2088 * over to tcp_fuse_rcv_drain() since it may need to complete
2089 * some work.
2090 */
2091 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) {
2092 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL :
2093 &tcp->tcp_fused_sigurg_mp))
2094 return (ret);
2095 }
2096
2097 while ((mp = tcp->tcp_rcv_list) != NULL) {
2098 tcp->tcp_rcv_list = mp->b_next;
2099 mp->b_next = NULL;
2100 #ifdef DEBUG
2101 cnt += msgdsize(mp);
2102 #endif
2103 putnext(q, mp);
2104 }
2105 #ifdef DEBUG
2106 ASSERT(cnt == tcp->tcp_rcv_cnt);
2107 #endif
2108 tcp->tcp_rcv_last_head = NULL;
2109 tcp->tcp_rcv_last_tail = NULL;
2110 tcp->tcp_rcv_cnt = 0;
2111
2112 if (canputnext(q))
2113 return (tcp_rwnd_reopen(tcp));
2114
2115 return (ret);
2116 }
2117
2118 /*
2119 * Queue data on tcp_rcv_list which is a b_next chain.
2120 * tcp_rcv_last_head/tail is the last element of this chain.
2121 * Each element of the chain is a b_cont chain.
2122 *
2123 * M_DATA messages are added to the current element.
2124 * Other messages are added as new (b_next) elements.
2125 */
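/*
 * A sketch of the resulting structure (illustrative only):
 *
 *   tcp_rcv_list --> mblk --b_cont--> mblk
 *                     |
 *                   b_next
 *                     v
 *                    mblk --b_cont--> mblk --b_cont--> mblk
 *                     ^                                 ^
 *             tcp_rcv_last_head                 tcp_rcv_last_tail
 */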
2126 void
2127 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr)
2128 {
2129 ASSERT(seg_len == msgdsize(mp));
2130 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL);
2131
2132 if (is_system_labeled()) {
2133 ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL);
2134 /*
2135 * Provide for protocols above TCP such as RPC. NOPID leaves
2136 * db_cpid unchanged.
2137 * The cred could have already been set.
2138 */
2139 if (cr != NULL)
2140 mblk_setcred(mp, cr, NOPID);
2141 }
2142
2143 if (tcp->tcp_rcv_list == NULL) {
2144 ASSERT(tcp->tcp_rcv_last_head == NULL);
2145 tcp->tcp_rcv_list = mp;
2146 tcp->tcp_rcv_last_head = mp;
2147 } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) {
2148 tcp->tcp_rcv_last_tail->b_cont = mp;
2149 } else {
2150 tcp->tcp_rcv_last_head->b_next = mp;
2151 tcp->tcp_rcv_last_head = mp;
2152 }
2153
2154 while (mp->b_cont)
2155 mp = mp->b_cont;
2156
2157 tcp->tcp_rcv_last_tail = mp;
2158 tcp->tcp_rcv_cnt += seg_len;
2159 tcp->tcp_rwnd -= seg_len;
2160 }
2161
2162 /* Generate an ACK-only (no data) segment for a TCP endpoint */
2163 mblk_t *
2164 tcp_ack_mp(tcp_t *tcp)
2165 {
2166 uint32_t seq_no;
2167 tcp_stack_t *tcps = tcp->tcp_tcps;
2168 conn_t *connp = tcp->tcp_connp;
2169
2170 /*
2171	 * There are a few cases to be considered while setting the sequence no.
2172	 * Essentially, we can come here while processing an unacceptable pkt
2173	 * in the TCPS_SYN_RCVD state, in which case we set the sequence number
2174	 * to snxt (per RFC 793); note that swnd wouldn't have been set yet.
2175	 * If we are here for a zero window probe, stick with suna. In all
2176	 * other cases, we check if suna + swnd encompasses snxt and, if so, set
2177	 * the sequence number to snxt. If snxt falls outside the
2178	 * window (the receiver probably shrank its window), we go with
2179	 * suna + swnd; otherwise the sequence no would be unacceptable to the
2180	 * receiver.
2181 */
2182 if (tcp->tcp_zero_win_probe) {
2183 seq_no = tcp->tcp_suna;
2184 } else if (tcp->tcp_state == TCPS_SYN_RCVD) {
2185 ASSERT(tcp->tcp_swnd == 0);
2186 seq_no = tcp->tcp_snxt;
2187 } else {
2188 seq_no = SEQ_GT(tcp->tcp_snxt,
2189 (tcp->tcp_suna + tcp->tcp_swnd)) ?
2190 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt;
2191 }
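	/*
	 * Worked example (illustrative): with snxt = 1500, suna = 1000 and
	 * swnd = 400, suna + swnd = 1400 is less than snxt, i.e. the
	 * receiver shrank its window, so the ACK carries seq 1400 rather
	 * than the unacceptable 1500.
	 */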
2192
2193 if (tcp->tcp_valid_bits) {
2194 /*
2195 * For the complex case where we have to send some
2196 * controls (FIN or SYN), let tcp_xmit_mp do it.
2197 */
2198 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE,
2199 NULL, B_FALSE));
2200 } else {
2201 /* Generate a simple ACK */
2202 int data_length;
2203 uchar_t *rptr;
2204 tcpha_t *tcpha;
2205 mblk_t *mp1;
2206 int32_t total_hdr_len;
2207 int32_t tcp_hdr_len;
2208 int32_t num_sack_blk = 0;
2209 int32_t sack_opt_len;
2210 ip_xmit_attr_t *ixa = connp->conn_ixa;
2211
2212 /*
2213 * Allocate space for TCP + IP headers
2214 * and link-level header
2215 */
2216 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
2217 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
2218 tcp->tcp_num_sack_blk);
2219 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
2220 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
2221 total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len;
2222 tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len;
2223 } else {
2224 total_hdr_len = connp->conn_ht_iphc_len;
2225 tcp_hdr_len = connp->conn_ht_ulp_len;
2226 }
2227 mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED);
2228 if (mp1 == NULL)
2229 return (NULL);
2230
2231 /* Update the latest receive window size in TCP header. */
2232 tcp->tcp_tcpha->tha_win =
2233 htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
2234 /* copy in prototype TCP + IP header */
2235 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
2236 mp1->b_rptr = rptr;
2237 mp1->b_wptr = rptr + total_hdr_len;
2238 bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
2239
2240 tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
2241
2242 /* Set the TCP sequence number. */
2243 tcpha->tha_seq = htonl(seq_no);
2244
2245 /* Set up the TCP flag field. */
2246 tcpha->tha_flags = (uchar_t)TH_ACK;
2247 if (tcp->tcp_ecn_echo_on)
2248 tcpha->tha_flags |= TH_ECE;
2249
2250 tcp->tcp_rack = tcp->tcp_rnxt;
2251 tcp->tcp_rack_cnt = 0;
2252
2253 /* fill in timestamp option if in use */
2254 if (tcp->tcp_snd_ts_ok) {
2255 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
2256
2257 U32_TO_BE32(llbolt,
2258 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
2259 U32_TO_BE32(tcp->tcp_ts_recent,
2260 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
2261 }
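		/*
		 * The two stores above assume the prototype header already
		 * carries a pre-formatted timestamp option, laid out per
		 * RFC 7323 as NOP NOP kind=8 len=10, so TSval sits at offset
		 * TCP_MIN_HEADER_LENGTH+4 and TSecr at TCP_MIN_HEADER_LENGTH+8.
		 */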
2262
2263 /* Fill in SACK options */
2264 if (num_sack_blk > 0) {
2265 uchar_t *wptr = (uchar_t *)tcpha +
2266 connp->conn_ht_ulp_len;
2267 sack_blk_t *tmp;
2268 int32_t i;
2269
2270 wptr[0] = TCPOPT_NOP;
2271 wptr[1] = TCPOPT_NOP;
2272 wptr[2] = TCPOPT_SACK;
2273 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
2274 sizeof (sack_blk_t);
2275 wptr += TCPOPT_REAL_SACK_LEN;
2276
2277 tmp = tcp->tcp_sack_list;
2278 for (i = 0; i < num_sack_blk; i++) {
2279 U32_TO_BE32(tmp[i].begin, wptr);
2280 wptr += sizeof (tcp_seq);
2281 U32_TO_BE32(tmp[i].end, wptr);
2282 wptr += sizeof (tcp_seq);
2283 }
2284 tcpha->tha_offset_and_reserved +=
2285 ((num_sack_blk * 2 + 1) << 4);
2286 }
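		/*
		 * Wire layout of the option bytes built above (illustrative,
		 * for two SACK blocks): 01 01 05 12 <begin1> <end1> <begin2>
		 * <end2>, i.e. NOP NOP kind=5 len=18 followed by four 32-bit
		 * sequence numbers.
		 */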
2287
2288 ixa->ixa_pktlen = total_hdr_len;
2289
2290 if (ixa->ixa_flags & IXAF_IS_IPV4) {
2291 ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len);
2292 } else {
2293 ip6_t *ip6 = (ip6_t *)rptr;
2294
2295 ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
2296 }
2297
2298 /*
2299 * Prime pump for checksum calculation in IP. Include the
2300 * adjustment for a source route if any.
2301 */
2302 data_length = tcp_hdr_len + connp->conn_sum;
2303 data_length = (data_length >> 16) + (data_length & 0xFFFF);
2304 tcpha->tha_sum = htons(data_length);
2305
2306 if (tcp->tcp_ip_forward_progress) {
2307 tcp->tcp_ip_forward_progress = B_FALSE;
2308 connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
2309 } else {
2310 connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
2311 }
2312 return (mp1);
2313 }
2314 }
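/*
 * Illustrative sketch (not part of this file's code): the "prime pump"
 * arithmetic above seeds the ones-complement checksum with the ULP
 * length plus the precomputed pseudo-header sum (conn_sum), folding
 * any carry out of the low 16 bits back in. A stand-alone model:
 */
#if 0
#include <stdint.h>

static uint16_t
csum_seed(uint32_t ulp_len, uint32_t pseudo_sum)
{
	uint32_t sum = ulp_len + pseudo_sum;

	/* Fold the carries back into the low 16 bits. */
	sum = (sum >> 16) + (sum & 0xFFFF);
	sum = (sum >> 16) + (sum & 0xFFFF);	/* at most one more carry */
	return ((uint16_t)sum);
}
#endif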
2315
2316 /*
2317 * Dummy socket upcalls for if/when the conn_t gets detached from a
2318 * direct-callback sonode via a user-driven close(). Easy to catch with
2319 * DTrace FBT, and should be mostly harmless.
2320 */
2321
2322 /* ARGSUSED */
2323 static sock_upper_handle_t
2324 tcp_dummy_newconn(sock_upper_handle_t x, sock_lower_handle_t y,
2325 sock_downcalls_t *z, cred_t *cr, pid_t pid, sock_upcalls_t **ignored)
2326 {
2327 ASSERT(0); /* Panic in debug, otherwise ignore. */
2328 return (NULL);
2329 }
2330
2331 /* ARGSUSED */
2332 static void
2333 tcp_dummy_connected(sock_upper_handle_t x, sock_connid_t y, cred_t *cr,
2334 pid_t pid)
2335 {
2336 ASSERT(x == NULL);
2337 /* Normally we'd crhold(cr) and attach it to socket state. */
2338 /* LINTED */
2339 }
2340
2341 /* ARGSUSED */
2342 static int
2343 tcp_dummy_disconnected(sock_upper_handle_t x, sock_connid_t y, int blah)
2344 {
2345 ASSERT(0); /* Panic in debug, otherwise ignore. */
2346 return (-1);
2347 }
2348
2349 /* ARGSUSED */
2350 static void
2351 tcp_dummy_opctl(sock_upper_handle_t x, sock_opctl_action_t y, uintptr_t blah)
2352 {
2353 ASSERT(x == NULL);
2354 /* We really want this one to be a harmless NOP for now. */
2355 /* LINTED */
2356 }
2357
2358 /* ARGSUSED */
2359 static ssize_t
2360 tcp_dummy_recv(sock_upper_handle_t x, mblk_t *mp, size_t len, int flags,
2361 int *error, boolean_t *push)
2362 {
2363 ASSERT(x == NULL);
2364
2365 /*
2366 * Consume the message, set ESHUTDOWN, and return an error.
2367 * Nobody's home!
2368 */
2369 freemsg(mp);
2370 *error = ESHUTDOWN;
2371 return (-1);
2372 }
2373
2374 /* ARGSUSED */
2375 static void
2376 tcp_dummy_set_proto_props(sock_upper_handle_t x, struct sock_proto_props *y)
2377 {
2378 ASSERT(0); /* Panic in debug, otherwise ignore. */
2379 }
2380
2381 /* ARGSUSED */
2382 static void
2383 tcp_dummy_txq_full(sock_upper_handle_t x, boolean_t y)
2384 {
2385 ASSERT(0); /* Panic in debug, otherwise ignore. */
2386 }
2387
2388 /* ARGSUSED */
2389 static void
2390 tcp_dummy_signal_oob(sock_upper_handle_t x, ssize_t len)
2391 {
2392 ASSERT(x == NULL);
2393 /* Otherwise, this would signal socket state about OOB data. */
2394 }
2395
2396 /* ARGSUSED */
2397 static void
2398 tcp_dummy_set_error(sock_upper_handle_t x, int err)
2399 {
2400 ASSERT(0); /* Panic in debug, otherwise ignore. */
2401 }
2402
2403 /* ARGSUSED */
2404 static void
2405 tcp_dummy_onearg(sock_upper_handle_t x)
2406 {
2407 ASSERT(0); /* Panic in debug, otherwise ignore. */
2408 }
2409
2410 static sock_upcalls_t tcp_dummy_upcalls = {
2411 tcp_dummy_newconn,
2412 tcp_dummy_connected,
2413 tcp_dummy_disconnected,
2414 tcp_dummy_opctl,
2415 tcp_dummy_recv,
2416 tcp_dummy_set_proto_props,
2417 tcp_dummy_txq_full,
2418 tcp_dummy_signal_oob,
2419 tcp_dummy_onearg,
2420 tcp_dummy_set_error,
2421 tcp_dummy_onearg
2422 };
2423
2424 /*
2425	 * Handle M_DATA messages from IP. It's called directly from IP via
2426 * squeue for received IP packets.
2427 *
2428 * The first argument is always the connp/tcp to which the mp belongs.
2429 * There are no exceptions to this rule. The caller has already put
2430 * a reference on this connp/tcp and once tcp_input_data() returns,
2431 * the squeue will do the refrele.
2432 *
2433	 * TH_SYN segments for the listener go directly to tcp_input_listener via
2434	 * squeue. ICMP errors go directly to tcp_icmp_input().
2435 *
2436 * sqp: NULL = recursive, sqp != NULL means called from squeue
2437 */
2438 void
2439 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2440 {
2441 int32_t bytes_acked;
2442 int32_t gap;
2443 mblk_t *mp1;
2444 uint_t flags;
2445 uint32_t new_swnd = 0;
2446 uchar_t *iphdr;
2447 uchar_t *rptr;
2448 int32_t rgap;
2449 uint32_t seg_ack;
2450 int seg_len;
2451 uint_t ip_hdr_len;
2452 uint32_t seg_seq;
2453 tcpha_t *tcpha;
2454 int urp;
2455 tcp_opt_t tcpopt;
2456 ip_pkt_t ipp;
2457 boolean_t ofo_seg = B_FALSE; /* Out of order segment */
2458 uint32_t cwnd;
2459 int mss;
2460 conn_t *connp = (conn_t *)arg;
2461 squeue_t *sqp = (squeue_t *)arg2;
2462 tcp_t *tcp = connp->conn_tcp;
2463 tcp_stack_t *tcps = tcp->tcp_tcps;
2464 sock_upcalls_t *sockupcalls;
2465
2466 /*
2467 * RST from fused tcp loopback peer should trigger an unfuse.
2468 */
2469 if (tcp->tcp_fused) {
2470 TCP_STAT(tcps, tcp_fusion_aborted);
2471 tcp_unfuse(tcp);
2472 }
2473
2474 mss = 0;
2475 iphdr = mp->b_rptr;
2476 rptr = mp->b_rptr;
2477 ASSERT(OK_32PTR(rptr));
2478
2479 ip_hdr_len = ira->ira_ip_hdr_length;
2480 if (connp->conn_recv_ancillary.crb_all != 0) {
2481 /*
2482 * Record packet information in the ip_pkt_t
2483 */
2484 ipp.ipp_fields = 0;
2485 if (ira->ira_flags & IRAF_IS_IPV4) {
2486 (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp,
2487 B_FALSE);
2488 } else {
2489 uint8_t nexthdrp;
2490
2491 /*
2492 * IPv6 packets can only be received by applications
2493 * that are prepared to receive IPv6 addresses.
2494 * The IP fanout must ensure this.
2495 */
2496 ASSERT(connp->conn_family == AF_INET6);
2497
2498 (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp,
2499 &nexthdrp);
2500 ASSERT(nexthdrp == IPPROTO_TCP);
2501
2502 /* Could have caused a pullup? */
2503 iphdr = mp->b_rptr;
2504 rptr = mp->b_rptr;
2505 }
2506 }
2507 ASSERT(DB_TYPE(mp) == M_DATA);
2508 ASSERT(mp->b_next == NULL);
2509
2510 tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2511 seg_seq = ntohl(tcpha->tha_seq);
2512 seg_ack = ntohl(tcpha->tha_ack);
2513 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
2514 seg_len = (int)(mp->b_wptr - rptr) -
2515 (ip_hdr_len + TCP_HDR_LENGTH(tcpha));
2516 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) {
2517 do {
2518 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
2519 (uintptr_t)INT_MAX);
2520 seg_len += (int)(mp1->b_wptr - mp1->b_rptr);
2521 } while ((mp1 = mp1->b_cont) != NULL &&
2522 mp1->b_datap->db_type == M_DATA);
2523 }
2524
2525 DTRACE_TCP5(receive, mblk_t *, NULL, ip_xmit_attr_t *, connp->conn_ixa,
2526 __dtrace_tcp_void_ip_t *, iphdr, tcp_t *, tcp,
2527 __dtrace_tcp_tcph_t *, tcpha);
2528
2529 if (tcp->tcp_state == TCPS_TIME_WAIT) {
2530 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
2531 seg_len, tcpha, ira);
2532 return;
2533 }
2534
2535 if (sqp != NULL) {
2536 /*
2537	 * This is the correct place to update tcp_last_recv_time. Note
2538	 * that it is also updated for tcp structures that belong to
2539	 * the global and listener queues, which do not really need updating.
2540 * But that should not cause any harm. And it is updated for
2541 * all kinds of incoming segments, not only for data segments.
2542 */
2543 tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2544 }
2545
2546 flags = (unsigned int)tcpha->tha_flags & 0xFF;
2547
2548 TCPS_BUMP_MIB(tcps, tcpHCInSegs);
2549 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2550
2551 if ((flags & TH_URG) && sqp != NULL) {
2552 /*
2553 * TCP can't handle urgent pointers that arrive before
2554 * the connection has been accept()ed since it can't
2555 * buffer OOB data. Discard segment if this happens.
2556 *
2557 * We can't just rely on a non-null tcp_listener to indicate
2558 * that the accept() has completed since unlinking of the
2559 * eager and completion of the accept are not atomic.
2560 * tcp_detached, when it is not set (B_FALSE) indicates
2561 * that the accept() has completed.
2562 *
2563 * Nor can it reassemble urgent pointers, so discard
2564 * if it's not the next segment expected.
2565 *
2566 * Otherwise, collapse chain into one mblk (discard if
2567 * that fails). This makes sure the headers, retransmitted
2568 * data, and new data all are in the same mblk.
2569 */
2570 ASSERT(mp != NULL);
2571 if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
2572 freemsg(mp);
2573 return;
2574 }
2575 /* Update pointers into message */
2576 iphdr = rptr = mp->b_rptr;
2577 tcpha = (tcpha_t *)&rptr[ip_hdr_len];
2578 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
2579 /*
2580 * Since we can't handle any data with this urgent
2581 * pointer that is out of sequence, we expunge
2582 * the data. This allows us to still register
2583 * the urgent mark and generate the M_PCSIG,
2584 * which we can do.
2585 */
2586 mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
2587 seg_len = 0;
2588 }
2589 }
2590
2591 sockupcalls = connp->conn_upcalls;
2592 /* A conn_t may have belonged to a now-closed socket. Be careful. */
2593 if (sockupcalls == NULL)
2594 sockupcalls = &tcp_dummy_upcalls;
2595
2596 switch (tcp->tcp_state) {
2597 case TCPS_SYN_SENT:
2598 if (connp->conn_final_sqp == NULL &&
2599 tcp_outbound_squeue_switch && sqp != NULL) {
2600 ASSERT(connp->conn_initial_sqp == connp->conn_sqp);
2601 connp->conn_final_sqp = sqp;
2602 if (connp->conn_final_sqp != connp->conn_sqp) {
2603 DTRACE_PROBE1(conn__final__sqp__switch,
2604 conn_t *, connp);
2605 CONN_INC_REF(connp);
2606 SQUEUE_SWITCH(connp, connp->conn_final_sqp);
2607 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2608 tcp_input_data, connp, ira, ip_squeue_flag,
2609 SQTAG_CONNECT_FINISH);
2610 return;
2611 }
2612 DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp);
2613 }
2614 if (flags & TH_ACK) {
2615 /*
2616 * Note that our stack cannot send data before a
2617 * connection is established, therefore the
2618 * following check is valid. Otherwise, it has
2619 * to be changed.
2620 */
2621 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
2622 SEQ_GT(seg_ack, tcp->tcp_snxt)) {
2623 freemsg(mp);
2624 if (flags & TH_RST)
2625 return;
2626 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
2627 tcp, seg_ack, 0, TH_RST);
2628 return;
2629 }
2630 ASSERT(tcp->tcp_suna + 1 == seg_ack);
2631 }
2632 if (flags & TH_RST) {
2633 if (flags & TH_ACK) {
2634 DTRACE_TCP5(connect__refused, mblk_t *, NULL,
2635 ip_xmit_attr_t *, connp->conn_ixa,
2636 void_ip_t *, iphdr, tcp_t *, tcp,
2637 tcph_t *, tcpha);
2638 (void) tcp_clean_death(tcp, ECONNREFUSED);
2639 }
2640 freemsg(mp);
2641 return;
2642 }
2643 if (!(flags & TH_SYN)) {
2644 freemsg(mp);
2645 return;
2646 }
2647
2648 /* Process all TCP options. */
2649 tcp_process_options(tcp, tcpha);
2650 /*
2651	 * The following changes our rwnd to be a multiple of the
2652	 * MIN(peer MSS, our MSS) for performance reasons.
2653 */
2654 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf,
2655 tcp->tcp_mss));
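		/*
		 * Worked example (illustrative): with conn_rcvbuf = 65536
		 * and an effective MSS of 1460, MSS_ROUNDUP yields
		 * 45 * 1460 = 65700, so the advertised window is always a
		 * whole number of segments.
		 */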
2656
2657 /* Is the other end ECN capable? */
2658 if (tcp->tcp_ecn_ok) {
2659 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) {
2660 tcp->tcp_ecn_ok = B_FALSE;
2661 }
2662 }
2663 /*
2664 * Clear ECN flags because it may interfere with later
2665 * processing.
2666 */
2667 flags &= ~(TH_ECE|TH_CWR);
2668
2669 tcp->tcp_irs = seg_seq;
2670 tcp->tcp_rack = seg_seq;
2671 tcp->tcp_rnxt = seg_seq + 1;
2672 tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt);
2673 if (!TCP_IS_DETACHED(tcp)) {
2674 /* Allocate room for SACK options if needed. */
2675 connp->conn_wroff = connp->conn_ht_iphc_len;
2676 if (tcp->tcp_snd_sack_ok)
2677 connp->conn_wroff += TCPOPT_MAX_SACK_LEN;
2678 if (!tcp->tcp_loopback)
2679 connp->conn_wroff += tcps->tcps_wroff_xtra;
2680
2681 (void) proto_set_tx_wroff(connp->conn_rq, connp,
2682 connp->conn_wroff);
2683 }
2684 if (flags & TH_ACK) {
2685 /*
2686 * If we can't get the confirmation upstream, pretend
2687 * we didn't even see this one.
2688 *
2689	 * XXX: how can we pretend we didn't see it if we
2690	 * have updated rnxt et al.?
2691 *
2692 * For loopback we defer sending up the T_CONN_CON
2693 * until after some checks below.
2694 */
2695 mp1 = NULL;
2696 /*
2697 * tcp_sendmsg() checks tcp_state without entering
2698 * the squeue so tcp_state should be updated before
2699 * sending up connection confirmation. Probe the
2700 * state change below when we are sure the connection
2701 * confirmation has been sent.
2702 */
2703 tcp->tcp_state = TCPS_ESTABLISHED;
2704 if (!tcp_conn_con(tcp, iphdr, mp,
2705 tcp->tcp_loopback ? &mp1 : NULL, ira)) {
2706 tcp->tcp_state = TCPS_SYN_SENT;
2707 freemsg(mp);
2708 return;
2709 }
2710 TCPS_CONN_INC(tcps);
2711 /* SYN was acked - making progress */
2712 tcp->tcp_ip_forward_progress = B_TRUE;
2713
2714 /* One for the SYN */
2715 tcp->tcp_suna = tcp->tcp_iss + 1;
2716 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
2717
2718 /*
2719 * If SYN was retransmitted, need to reset all
2720 * retransmission info. This is because this
2721 * segment will be treated as a dup ACK.
2722 */
2723 if (tcp->tcp_rexmit) {
2724 tcp->tcp_rexmit = B_FALSE;
2725 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
2726 tcp->tcp_rexmit_max = tcp->tcp_snxt;
2727 tcp->tcp_ms_we_have_waited = 0;
2728
2729 /*
2730 * Set tcp_cwnd back to 1 MSS, per
2731 * recommendation from
2732 * draft-floyd-incr-init-win-01.txt,
2733 * Increasing TCP's Initial Window.
2734 */
2735 DTRACE_PROBE3(cwnd__retransmitted__syn,
2736 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
2737 uint32_t, tcp->tcp_mss);
2738 tcp->tcp_cwnd = tcp->tcp_mss;
2739 }
2740
2741 tcp->tcp_swl1 = seg_seq;
2742 tcp->tcp_swl2 = seg_ack;
2743
2744 new_swnd = ntohs(tcpha->tha_win);
2745 tcp->tcp_swnd = new_swnd;
2746 if (new_swnd > tcp->tcp_max_swnd)
2747 tcp->tcp_max_swnd = new_swnd;
2748
2749 /*
2750 * Always send the three-way handshake ack immediately
2751 * in order to make the connection complete as soon as
2752 * possible on the accepting host.
2753 */
2754 flags |= TH_ACK_NEEDED;
2755
2756 /*
2757 * Trace connect-established here.
2758 */
2759 DTRACE_TCP5(connect__established, mblk_t *, NULL,
2760 ip_xmit_attr_t *, tcp->tcp_connp->conn_ixa,
2761 void_ip_t *, iphdr, tcp_t *, tcp, tcph_t *, tcpha);
2762
2763 /* Trace change from SYN_SENT -> ESTABLISHED here */
2764 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
2765 connp->conn_ixa, void, NULL, tcp_t *, tcp,
2766 void, NULL, int32_t, TCPS_SYN_SENT);
2767
2768 /*
2769 * Special case for loopback. At this point we have
2770 * received SYN-ACK from the remote endpoint. In
2771 * order to ensure that both endpoints reach the
2772 * fused state prior to any data exchange, the final
2773 * ACK needs to be sent before we indicate T_CONN_CON
2774 * to the module upstream.
2775 */
2776 if (tcp->tcp_loopback) {
2777 mblk_t *ack_mp;
2778
2779 ASSERT(!tcp->tcp_unfusable);
2780 ASSERT(mp1 != NULL);
2781 /*
2782 * For loopback, we always get a pure SYN-ACK
2783 * and only need to send back the final ACK
2784 * with no data (this is because the other
2785 * tcp is ours and we don't do T/TCP). This
2786 * final ACK triggers the passive side to
2787 * perform fusion in ESTABLISHED state.
2788 */
2789 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2790 if (tcp->tcp_ack_tid != 0) {
2791 (void) TCP_TIMER_CANCEL(tcp,
2792 tcp->tcp_ack_tid);
2793 tcp->tcp_ack_tid = 0;
2794 }
2795 tcp_send_data(tcp, ack_mp);
2796 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2797 TCPS_BUMP_MIB(tcps, tcpOutAck);
2798
2799 if (!IPCL_IS_NONSTR(connp)) {
2800 /* Send up T_CONN_CON */
2801 if (ira->ira_cred != NULL) {
2802 mblk_setcred(mp1,
2803 ira->ira_cred,
2804 ira->ira_cpid);
2805 }
2806 putnext(connp->conn_rq, mp1);
2807 } else {
2808 (*sockupcalls->su_connected)
2809 (connp->conn_upper_handle,
2810 tcp->tcp_connid,
2811 ira->ira_cred,
2812 ira->ira_cpid);
2813 freemsg(mp1);
2814 }
2815
2816 freemsg(mp);
2817 return;
2818 }
2819 /*
2820 * Forget fusion; we need to handle more
2821 * complex cases below. Send the deferred
2822 * T_CONN_CON message upstream and proceed
2823 * as usual. Mark this tcp as not capable
2824 * of fusion.
2825 */
2826 TCP_STAT(tcps, tcp_fusion_unfusable);
2827 tcp->tcp_unfusable = B_TRUE;
2828 if (!IPCL_IS_NONSTR(connp)) {
2829 if (ira->ira_cred != NULL) {
2830 mblk_setcred(mp1, ira->ira_cred,
2831 ira->ira_cpid);
2832 }
2833 putnext(connp->conn_rq, mp1);
2834 } else {
2835 (*sockupcalls->su_connected)
2836 (connp->conn_upper_handle,
2837 tcp->tcp_connid, ira->ira_cred,
2838 ira->ira_cpid);
2839 freemsg(mp1);
2840 }
2841 }
2842
2843 /*
2844 * Check to see if there is data to be sent. If
2845 * yes, set the transmit flag. Then check to see
2846 * if received data processing needs to be done.
2847 * If not, go straight to xmit_check. This short
2848 * cut is OK as we don't support T/TCP.
2849 */
2850 if (tcp->tcp_unsent)
2851 flags |= TH_XMIT_NEEDED;
2852
2853 if (seg_len == 0 && !(flags & TH_URG)) {
2854 freemsg(mp);
2855 goto xmit_check;
2856 }
2857
2858 flags &= ~TH_SYN;
2859 seg_seq++;
2860 break;
2861 }
2862 tcp->tcp_state = TCPS_SYN_RCVD;
2863 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
2864 connp->conn_ixa, void_ip_t *, NULL, tcp_t *, tcp,
2865 tcph_t *, NULL, int32_t, TCPS_SYN_SENT);
2866 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
2867 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
2868 if (mp1 != NULL) {
2869 tcp_send_data(tcp, mp1);
2870 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
2871 }
2872 freemsg(mp);
2873 return;
2874 case TCPS_SYN_RCVD:
2875 if (flags & TH_ACK) {
2876 uint32_t pinit_wnd;
2877
2878 /*
2879	 * In this state, a SYN|ACK packet is either bogus,
2880	 * because the other side must be ACKing our SYN, which
2881	 * indicates it has seen the ACK for its SYN and
2882	 * shouldn't retransmit it, or we're crossing SYNs
2883	 * on an active open.
2884 */
2885 if ((flags & TH_SYN) && !tcp->tcp_active_open) {
2886 freemsg(mp);
2887 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn",
2888 tcp, seg_ack, 0, TH_RST);
2889 return;
2890 }
2891 /*
2892 * NOTE: RFC 793 pg. 72 says this should be
2893 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt
2894 * but that would mean we have an ack that ignored
2895 * our SYN.
2896 */
2897 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) ||
2898 SEQ_GT(seg_ack, tcp->tcp_snxt)) {
2899 freemsg(mp);
2900 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack",
2901 tcp, seg_ack, 0, TH_RST);
2902 return;
2903 }
2904 /*
2905 * No sane TCP stack will send such a small window
2906 * without receiving any data. Just drop this invalid
2907 * ACK. We also shorten the abort timeout in case
2908 * this is an attack.
2909 */
2910 pinit_wnd = ntohs(tcpha->tha_win) << tcp->tcp_snd_ws;
2911 if (pinit_wnd < tcp->tcp_mss &&
2912 pinit_wnd < tcp_init_wnd_chk) {
2913 freemsg(mp);
2914 TCP_STAT(tcps, tcp_zwin_ack_syn);
2915 tcp->tcp_second_ctimer_threshold =
2916 tcp_early_abort * SECONDS;
2917 return;
2918 }
2919 }
2920 break;
2921 case TCPS_LISTEN:
2922 /*
2923	 * Only a TLI listener can come through this path when an
2924	 * acceptor is going back to being a listener and a packet
2925	 * for the acceptor hits the classifier. For a socket
2926	 * listener, this can never happen because a listener
2927	 * can never accept a connection on itself and hence a
2928	 * socket acceptor cannot go back to being a listener.
2929 */
2930 ASSERT(!TCP_IS_SOCKET(tcp));
2931 /*FALLTHRU*/
2932 case TCPS_CLOSED:
2933 case TCPS_BOUND: {
2934 conn_t *new_connp;
2935 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
2936
2937 /*
2938 * Don't accept any input on a closed tcp as this TCP logically
2939 * does not exist on the system. Don't proceed further with
2940 * this TCP. For instance, this packet could trigger another
2941 * close of this tcp which would be disastrous for tcp_refcnt.
2942 * tcp_close_detached / tcp_clean_death / tcp_closei_local must
2943 * be called at most once on a TCP. In this case we need to
2944 * refeed the packet into the classifier and figure out where
2945 * the packet should go.
2946 */
2947 new_connp = ipcl_classify(mp, ira, ipst);
2948 if (new_connp != NULL) {
2949 /* Drops ref on new_connp */
2950 tcp_reinput(new_connp, mp, ira, ipst);
2951 return;
2952 }
2953 /* We failed to classify. For now just drop the packet */
2954 freemsg(mp);
2955 return;
2956 }
2957 case TCPS_IDLE:
2958 /*
2959 * Handle the case where the tcp_clean_death() has happened
2960 * on a connection (application hasn't closed yet) but a packet
2961 * was already queued on squeue before tcp_clean_death()
2962	 * was processed. Calling tcp_clean_death() twice on the same
2963	 * connection can result in weird behavior.
2964 */
2965 freemsg(mp);
2966 return;
2967 default:
2968 break;
2969 }
2970
2971 /*
2972 * Already on the correct queue/perimeter.
2973 * If this is a detached connection and not an eager
2974 * connection hanging off a listener then new data
2975 * (past the FIN) will cause a reset.
2976 * We do a special check here where it
2977 * is out of the main line, rather than check
2978 * if we are detached every time we see new
2979 * data down below.
2980 */
2981 if (TCP_IS_DETACHED_NONEAGER(tcp) &&
2982 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) {
2983 TCPS_BUMP_MIB(tcps, tcpInClosed);
2984 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2985 freemsg(mp);
2986 tcp_xmit_ctl("new data when detached", tcp,
2987 tcp->tcp_snxt, 0, TH_RST);
2988 (void) tcp_clean_death(tcp, EPROTO);
2989 return;
2990 }
2991
2992 mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
2993 urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION;
2994 new_swnd = ntohs(tcpha->tha_win) <<
2995 ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
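	/*
	 * Worked example (illustrative): a raw tha_win of 0x8000 (32768)
	 * with a negotiated send window scale of 7 yields
	 * 32768 << 7 = 4194304 bytes. The shift is skipped on SYN segments,
	 * whose window field is never scaled (RFC 7323).
	 */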
2996
2997 /*
2998 * We are interested in two TCP options: timestamps (if negotiated) and
2999 * SACK (if negotiated). Skip option parsing if neither is negotiated.
3000 */
3001 if (tcp->tcp_snd_ts_ok || tcp->tcp_snd_sack_ok) {
3002 int options;
3003 if (tcp->tcp_snd_sack_ok)
3004 tcpopt.tcp = tcp;
3005 else
3006 tcpopt.tcp = NULL;
3007 options = tcp_parse_options(tcpha, &tcpopt);
3008 /*
3009 * RST segments must not be subject to PAWS and are not
3010 * required to have timestamps.
3011 * We do not drop keepalive segments without
3012 * timestamps, to maintain compatibility with legacy TCP stacks.
3013 */
3014 boolean_t keepalive = (seg_len == 0 || seg_len == 1) &&
3015 (seg_seq + 1 == tcp->tcp_rnxt);
3016 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) && !keepalive) {
3017 /*
3018	 * Per RFC 7323 section 3.2, silently drop non-RST
3019	 * segments without the expected TSopt. This is a 'SHOULD'
3020	 * requirement.
3021	 * We accept keepalives without TSopt to maintain
3022	 * interoperability with TCP implementations that omit
3023	 * the TSopt on these. Keepalive data is discarded, so
3024	 * there is no risk of corrupting data by accepting these.
3025 */
3026 if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
3027 /*
3028 * Leave a breadcrumb for people to detect this
3029 * behavior.
3030 */
3031 DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
3032 freemsg(mp);
3033 return;
3034 }
3035
3036 if (!tcp_paws_check(tcp, &tcpopt)) {
3037 /*
3038 * This segment is not acceptable.
3039 * Drop it and send back an ACK.
3040 */
3041 freemsg(mp);
3042 flags |= TH_ACK_NEEDED;
3043 goto ack_check;
3044 }
3045 }
3046 }
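	/*
	 * The test applied by tcp_paws_check() is assumed to follow RFC 7323
	 * PAWS: using serial-number arithmetic, reject the segment when its
	 * TSval is strictly older than tcp_ts_recent, illustratively when
	 * (int32_t)(ts_val - tcp_ts_recent) < 0, unless the recorded
	 * timestamp is more than 24 days stale.
	 */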
3047 try_again:;
3048 mss = tcp->tcp_mss;
3049 gap = seg_seq - tcp->tcp_rnxt;
3050 rgap = tcp->tcp_rwnd - (gap + seg_len);
3051 /*
3052 * gap is the amount of sequence space between what we expect to see
3053 * and what we got for seg_seq. A positive value for gap means
3054 * something got lost. A negative value means we got some old stuff.
3055 */
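	/*
	 * Worked example (illustrative): with rnxt = 1000, rwnd = 5000,
	 * seg_seq = 900 and seg_len = 300, gap = -100 (the first 100 bytes
	 * are old duplicates) and rgap = 5000 - (-100 + 300) = 4800, so the
	 * 200 remaining new bytes fit comfortably inside the window.
	 */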
3056 if (gap < 0) {
3057 /* Old stuff present. Is the SYN in there? */
3058 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) &&
3059 (seg_len != 0)) {
3060 flags &= ~TH_SYN;
3061 seg_seq++;
3062 urp--;
3063 /* Recompute the gaps after noting the SYN. */
3064 goto try_again;
3065 }
3066 TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
3067 TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
3068 (seg_len > -gap ? -gap : seg_len));
3069 /* Remove the old stuff from seg_len. */
3070 seg_len += gap;
3071 /*
3072 * Anything left?
3073 * Make sure to check for unack'd FIN when rest of data
3074 * has been previously ack'd.
3075 */
3076 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
3077 /*
3078 * Resets are only valid if they lie within our offered
3079 * window. If the RST bit is set, we just ignore this
3080 * segment.
3081 */
3082 if (flags & TH_RST) {
3083 freemsg(mp);
3084 return;
3085 }
3086
3087 /*
3088	 * The arrival of duplicate data packets indicates that we
3089	 * may have postponed an ack for too long, or the other
3090	 * side's RTT estimate is out of shape. Start acking
3091	 * more often.
3092 */
3093 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) &&
3094 tcp->tcp_rack_cnt >= 1 &&
3095 tcp->tcp_rack_abs_max > 2) {
3096 tcp->tcp_rack_abs_max--;
3097 }
3098 tcp->tcp_rack_cur_max = 1;
3099
3100 /*
3101 * This segment is "unacceptable". None of its
3102	 * sequence space lies within our advertised window.
3103 *
3104 * Adjust seg_len to the original value for tracing.
3105 */
3106 seg_len -= gap;
3107 if (connp->conn_debug) {
3108 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
3109 "tcp_rput: unacceptable, gap %d, rgap %d, "
3110 "flags 0x%x, seg_seq %u, seg_ack %u, "
3111 "seg_len %d, rnxt %u, snxt %u, %s",
3112 gap, rgap, flags, seg_seq, seg_ack,
3113 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt,
3114 tcp_display(tcp, NULL,
3115 DISP_ADDR_AND_PORT));
3116 }
3117
3118 /*
3119 * Arrange to send an ACK in response to the
3120 * unacceptable segment per RFC 793 page 69. There
3121	 * is only one small difference between ours and the
3122	 * acceptability test in the RFC - we accept an ACK-only
3123	 * packet with SEG.SEQ = RCV.NXT+RCV.WND, and no ACK
3124	 * will be generated.
3125 *
3126 * Note that we have to ACK an ACK-only packet at least
3127 * for stacks that send 0-length keep-alives with
3128 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122,
3129 * section 4.2.3.6. As long as we don't ever generate
3130 * an unacceptable packet in response to an incoming
3131 * packet that is unacceptable, it should not cause
3132 * "ACK wars".
3133 */
3134 flags |= TH_ACK_NEEDED;
3135
3136 /*
3137 * Continue processing this segment in order to use the
3138 * ACK information it contains, but skip all other
3139 * sequence-number processing. Processing the ACK
3140 * information is necessary in order to
3141 * re-synchronize connections that may have lost
3142 * synchronization.
3143 *
3144 * We clear seg_len and flag fields related to
3145 * sequence number processing as they are not
3146 * to be trusted for an unacceptable segment.
3147 */
3148 seg_len = 0;
3149 flags &= ~(TH_SYN | TH_FIN | TH_URG);
3150 goto process_ack;
3151 }
3152
3153 /* Fix seg_seq, and chew the gap off the front. */
3154 seg_seq = tcp->tcp_rnxt;
3155 urp += gap;
3156 do {
3157 mblk_t *mp2;
3158 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
3159 (uintptr_t)UINT_MAX);
3160 gap += (uint_t)(mp->b_wptr - mp->b_rptr);
3161 if (gap > 0) {
3162 mp->b_rptr = mp->b_wptr - gap;
3163 break;
3164 }
3165 mp2 = mp;
3166 mp = mp->b_cont;
3167 freeb(mp2);
3168 } while (gap < 0);
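		/*
		 * Worked example (illustrative): with gap = -100 and a
		 * leading mblk of 60 bytes, gap rises to -40 and the whole
		 * mblk is freed; if the next mblk holds 150 bytes, gap
		 * becomes +110 and its b_rptr is advanced so that only the
		 * 110 new bytes remain.
		 */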
3169 /*
3170 * If the urgent data has already been acknowledged, we
3171 * should ignore TH_URG below
3172 */
3173 if (urp < 0)
3174 flags &= ~TH_URG;
3175 }
3176 /*
3177	 * rgap is the amount of sequence space received beyond our window.
3178	 * A negative value means -rgap bytes lie outside the window.
3179 */
3180 if (rgap < 0) {
3181 mblk_t *mp2;
3182
3183 if (tcp->tcp_rwnd == 0) {
3184 TCPS_BUMP_MIB(tcps, tcpInWinProbe);
3185 tcp->tcp_cs.tcp_in_zwnd_probes++;
3186 } else {
3187 TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3188 TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3189 }
3190
3191 /*
3192 * seg_len does not include the FIN, so if more than
3193 * just the FIN is out of window, we act like we don't
3194 * see it. (If just the FIN is out of window, rgap
3195 * will be zero and we will go ahead and acknowledge
3196 * the FIN.)
3197 */
3198 flags &= ~TH_FIN;
3199
3200 /* Fix seg_len and make sure there is something left. */
3201 seg_len += rgap;
3202 if (seg_len <= 0) {
3203 /*
3204 * Resets are only valid if they lie within our offered
3205 * window. If the RST bit is set, we just ignore this
3206 * segment.
3207 */
3208 if (flags & TH_RST) {
3209 freemsg(mp);
3210 return;
3211 }
3212
3213 /* Per RFC 793, we need to send back an ACK. */
3214 flags |= TH_ACK_NEEDED;
3215
3216 /*
3217	 * Send SIGURG as soon as possible, i.e., even
3218	 * if the TH_URG was delivered in a window probe
3219 * packet (which will be unacceptable).
3220 *
3221 * We generate a signal if none has been generated
3222 * for this connection or if this is a new urgent
3223 * byte. Also send a zero-length "unmarked" message
3224 * to inform SIOCATMARK that this is not the mark.
3225 *
3226 * tcp_urp_last_valid is cleared when the T_exdata_ind
3227 * is sent up. This plus the check for old data
3228 * (gap >= 0) handles the wraparound of the sequence
3229 * number space without having to always track the
3230 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks
3231 * this max in its rcv_up variable).
3232 *
3233 * This prevents duplicate SIGURGS due to a "late"
3234 * zero-window probe when the T_EXDATA_IND has already
3235 * been sent up.
3236 */
3237 if ((flags & TH_URG) &&
3238 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq,
3239 tcp->tcp_urp_last))) {
3240 if (IPCL_IS_NONSTR(connp)) {
3241 if (!TCP_IS_DETACHED(tcp)) {
3242 (*sockupcalls->su_signal_oob)
3243 (connp->conn_upper_handle,
3244 urp);
3245 }
3246 } else {
3247 mp1 = allocb(0, BPRI_MED);
3248 if (mp1 == NULL) {
3249 freemsg(mp);
3250 return;
3251 }
3252 if (!TCP_IS_DETACHED(tcp) &&
3253 !putnextctl1(connp->conn_rq,
3254 M_PCSIG, SIGURG)) {
3255 /* Try again on the rexmit. */
3256 freemsg(mp1);
3257 freemsg(mp);
3258 return;
3259 }
3260 /*
3261 * If the next byte would be the mark
3262 * then mark with MARKNEXT else mark
3263 * with NOTMARKNEXT.
3264 */
3265 if (gap == 0 && urp == 0)
3266 mp1->b_flag |= MSGMARKNEXT;
3267 else
3268 mp1->b_flag |= MSGNOTMARKNEXT;
3269 freemsg(tcp->tcp_urp_mark_mp);
3270 tcp->tcp_urp_mark_mp = mp1;
3271 flags |= TH_SEND_URP_MARK;
3272 }
3273 tcp->tcp_urp_last_valid = B_TRUE;
3274 tcp->tcp_urp_last = urp + seg_seq;
3275 }
3276 /*
3277 * If this is a zero window probe, continue to
3278 * process the ACK part. But we need to set seg_len
3279 * to 0 to avoid data processing. Otherwise just
3280 * drop the segment and send back an ACK.
3281 */
3282 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) {
3283 flags &= ~(TH_SYN | TH_URG);
3284 seg_len = 0;
3285 goto process_ack;
3286 } else {
3287 freemsg(mp);
3288 goto ack_check;
3289 }
3290 }
3291 /* Pitch out of window stuff off the end. */
3292 rgap = seg_len;
3293 mp2 = mp;
3294 do {
3295 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
3296 (uintptr_t)INT_MAX);
3297 rgap -= (int)(mp2->b_wptr - mp2->b_rptr);
3298 if (rgap < 0) {
3299 mp2->b_wptr += rgap;
3300 if ((mp1 = mp2->b_cont) != NULL) {
3301 mp2->b_cont = NULL;
3302 freemsg(mp1);
3303 }
3304 break;
3305 }
3306 } while ((mp2 = mp2->b_cont) != NULL);
3307 }
3308 ok:;
3309 /*
3310 * TCP should check ECN info for segments inside the window only.
3311 * Therefore the check should be done here.
3312 */
3313 if (tcp->tcp_ecn_ok) {
3314 if (flags & TH_CWR) {
3315 tcp->tcp_ecn_echo_on = B_FALSE;
3316 }
3317 /*
3318 * Note that both ECN_CE and CWR can be set in the
3319 * same segment. In this case, we once again turn
3320 * on ECN_ECHO.
3321 */
3322 if (connp->conn_ipversion == IPV4_VERSION) {
3323 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service;
3324
3325 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) {
3326 tcp->tcp_ecn_echo_on = B_TRUE;
3327 }
3328 } else {
3329 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf;
3330
3331 if ((vcf & htonl(IPH_ECN_CE << 20)) ==
3332 htonl(IPH_ECN_CE << 20)) {
3333 tcp->tcp_ecn_echo_on = B_TRUE;
3334 }
3335 }
3336 }
3337
3338 /*
3339 * Check whether we can update tcp_ts_recent. This test is from RFC
3340 * 7323, section 5.3.
3341 */
3342 if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
3343 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
3344 SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
3345 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
3346 tcp->tcp_last_rcv_lbolt = LBOLT_FASTPATH64;
3347 }
3348
3349 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) {
3350 /*
3351 * FIN in an out of order segment. We record this in
3352 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq.
3353 * Clear the FIN so that any check on FIN flag will fail.
3354 * Remember that FIN also counts in the sequence number
3355	 * space. So we need to ack out of order FIN-only segments.
3356 */
3357 if (flags & TH_FIN) {
3358 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID;
3359 tcp->tcp_ofo_fin_seq = seg_seq + seg_len;
3360 flags &= ~TH_FIN;
3361 flags |= TH_ACK_NEEDED;
3362 }
3363 if (seg_len > 0) {
3364 /* Fill in the SACK blk list. */
3365 if (tcp->tcp_snd_sack_ok) {
3366 tcp_sack_insert(tcp->tcp_sack_list,
3367 seg_seq, seg_seq + seg_len,
3368 &(tcp->tcp_num_sack_blk));
3369 }
3370
3371 /*
3372 * Attempt reassembly and see if we have something
3373 * ready to go.
3374 */
3375 mp = tcp_reass(tcp, mp, seg_seq);
3376 /* Always ack out of order packets */
3377 flags |= TH_ACK_NEEDED | TH_PUSH;
3378 if (mp) {
3379 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
3380 (uintptr_t)INT_MAX);
3381 seg_len = mp->b_cont ? msgdsize(mp) :
3382 (int)(mp->b_wptr - mp->b_rptr);
3383 seg_seq = tcp->tcp_rnxt;
3384 /*
3385	 * If a gap is filled and the seq num and len
3386	 * of the gap match those of a previously
3387	 * received FIN, put the FIN flag back in.
3388 */
3389 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3390 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3391 flags |= TH_FIN;
3392 tcp->tcp_valid_bits &=
3393 ~TCP_OFO_FIN_VALID;
3394 }
3395 if (tcp->tcp_reass_tid != 0) {
3396 (void) TCP_TIMER_CANCEL(tcp,
3397 tcp->tcp_reass_tid);
3398 /*
3399 * Restart the timer if there is still
3400 * data in the reassembly queue.
3401 */
3402 if (tcp->tcp_reass_head != NULL) {
3403 tcp->tcp_reass_tid = TCP_TIMER(
3404 tcp, tcp_reass_timer,
3405 tcps->tcps_reass_timeout);
3406 } else {
3407 tcp->tcp_reass_tid = 0;
3408 }
3409 }
3410 } else {
3411 /*
3412 * Keep going even with NULL mp.
3413 * There may be a useful ACK or something else
3414 * we don't want to miss.
3415 *
3416 * But TCP should not perform fast retransmit
3417 * because of the ack number. TCP uses
3418 * seg_len == 0 to determine if it is a pure
3419 * ACK. And this is not a pure ACK.
3420 */
3421 seg_len = 0;
3422 ofo_seg = B_TRUE;
3423
3424 if (tcps->tcps_reass_timeout != 0 &&
3425 tcp->tcp_reass_tid == 0) {
3426 tcp->tcp_reass_tid = TCP_TIMER(tcp,
3427 tcp_reass_timer,
3428 tcps->tcps_reass_timeout);
3429 }
3430 }
3431 }
3432 } else if (seg_len > 0) {
3433 TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3434 TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
3435 tcp->tcp_cs.tcp_in_data_inorder_segs++;
3436 tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
3437
3438 /*
3439 * If an out of order FIN was received before, and the seq
3440	 * num and len of the new segment match those of the FIN,
3441 * put the FIN flag back in.
3442 */
3443 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3444 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3445 flags |= TH_FIN;
3446 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3447 }
3448 }
3449 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) {
3450 if (flags & TH_RST) {
3451 freemsg(mp);
3452 switch (tcp->tcp_state) {
3453 case TCPS_SYN_RCVD:
3454 (void) tcp_clean_death(tcp, ECONNREFUSED);
3455 break;
3456 case TCPS_ESTABLISHED:
3457 case TCPS_FIN_WAIT_1:
3458 case TCPS_FIN_WAIT_2:
3459 case TCPS_CLOSE_WAIT:
3460 (void) tcp_clean_death(tcp, ECONNRESET);
3461 break;
3462 case TCPS_CLOSING:
3463 case TCPS_LAST_ACK:
3464 (void) tcp_clean_death(tcp, 0);
3465 break;
3466 default:
3467 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3468 (void) tcp_clean_death(tcp, ENXIO);
3469 break;
3470 }
3471 return;
3472 }
3473 if (flags & TH_SYN) {
3474 /*
3475 * See RFC 793, Page 71
3476 *
3477	 * The seq number must be in the window, as it should
3478	 * have been "fixed" above. If it is outside the window, it
3479	 * would already have been rejected. Note that we allow seg_seq to be
3480	 * rnxt + rwnd because we want to accept 0 window probes.
3481 */
3482 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) &&
3483 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd));
3484 freemsg(mp);
3485 /*
3486 * If the ACK flag is not set, just use our snxt as the
3487 * seq number of the RST segment.
3488 */
3489 if (!(flags & TH_ACK)) {
3490 seg_ack = tcp->tcp_snxt;
3491 }
3492 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
3493 TH_RST|TH_ACK);
3494 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
3495 (void) tcp_clean_death(tcp, ECONNRESET);
3496 return;
3497 }
3498 /*
3499 * urp could be -1 when the urp field in the packet is 0
3500 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
3501 * byte was at seg_seq - 1, in which case we ignore the urgent flag.
3502 */
3503 if ((flags & TH_URG) && urp >= 0) {
3504 if (!tcp->tcp_urp_last_valid ||
3505 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
3506 /*
3507 * Non-STREAMS sockets handle the urgent data a little
3508 * differently from STREAMS based sockets. There is no
3509 * need to mark any mblks with the MSG{NOT,}MARKNEXT
3510 * flags to keep SIOCATMARK happy. Instead a
3511 * su_signal_oob upcall is made to update the mark.
3512 * Nor does a T_EXDATA_IND mblk need to be
3513 * prepended to the urgent data. The urgent data is
3514 * delivered using the su_recv upcall, where we set
3515 * the MSG_OOB flag to indicate that it is urg data.
3516 *
3517 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED
3518 * are used by non-STREAMS sockets.
3519 */
3520 if (IPCL_IS_NONSTR(connp)) {
3521 if (!TCP_IS_DETACHED(tcp)) {
3522 (*sockupcalls->su_signal_oob)
3523 (connp->conn_upper_handle, urp);
3524 }
3525 } else {
3526 /*
3527 * If we haven't generated the signal yet for
3528 * this urgent pointer value, do it now. Also,
3529 * send up a zero-length M_DATA indicating
3530 * whether or not this is the mark. The latter
3531 * is not needed when a T_EXDATA_IND is sent up.
3532 * However, if there are allocation failures
3533 * this code relies on the sender retransmitting
3534 * and the socket code for determining the mark
3535 * should not block waiting for the peer to
3536 * transmit. Thus, for simplicity we always
3537 * send up the mark indication.
3538 */
3539 mp1 = allocb(0, BPRI_MED);
3540 if (mp1 == NULL) {
3541 freemsg(mp);
3542 return;
3543 }
3544 if (!TCP_IS_DETACHED(tcp) &&
3545 !putnextctl1(connp->conn_rq, M_PCSIG,
3546 SIGURG)) {
3547 /* Try again on the rexmit. */
3548 freemsg(mp1);
3549 freemsg(mp);
3550 return;
3551 }
3552 /*
3553 * Mark with NOTMARKNEXT for now.
3554 * The code below will change this to MARKNEXT
3555 * if we are at the mark.
3556 *
3557 * If there are allocation failures (e.g. in
3558 * dupmsg below) the next time tcp_input_data
3559 * sees the urgent segment it will send up the
3560 * MSGMARKNEXT message.
3561 */
3562 mp1->b_flag |= MSGNOTMARKNEXT;
3563 freemsg(tcp->tcp_urp_mark_mp);
3564 tcp->tcp_urp_mark_mp = mp1;
3565 flags |= TH_SEND_URP_MARK;
3566 #ifdef DEBUG
3567 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
3568 "tcp_rput: sent M_PCSIG 2 seq %x urp %x "
3569 "last %x, %s",
3570 seg_seq, urp, tcp->tcp_urp_last,
3571 tcp_display(tcp, NULL, DISP_PORT_ONLY));
3572 #endif /* DEBUG */
3573 }
3574 tcp->tcp_urp_last_valid = B_TRUE;
3575 tcp->tcp_urp_last = urp + seg_seq;
3576 } else if (tcp->tcp_urp_mark_mp != NULL) {
3577 /*
3578 * An allocation failure prevented the previous
3579 * tcp_input_data from sending up the allocated
3580 * MSG*MARKNEXT message - send it up this time
3581 * around.
3582 */
3583 flags |= TH_SEND_URP_MARK;
3584 }
3585
3586 /*
3587 * If the urgent byte is in this segment, make sure that it is
3588 * all by itself. This makes it much easier to deal with the
3589 * possibility of an allocation failure on the T_exdata_ind.
3590 * Note that seg_len is the number of bytes in the segment, and
3591 * urp is the offset into the segment of the urgent byte.
3592 * urp < seg_len means that the urgent byte is in this segment.
3593 */
3594 if (urp < seg_len) {
3595 if (seg_len != 1) {
3596 uint32_t tmp_rnxt;
3597 /*
3598 * Break it up and feed it back in.
3599 * Re-attach the IP header.
3600 */
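/*
 * For illustration: with seg_len == 5 and urp == 2, the
 * first pass below feeds back the two bytes before the
 * urgent byte; the second pass feeds back bytes 0-2, of
 * which the leading two are trimmed as old data on
 * re-entry, leaving just the urgent byte; the final call
 * delivers the remainder. The urgent byte thus always
 * re-enters tcp_input_data() as a one byte segment, which
 * the seg_len == 1 case below handles.
 */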
3601 mp->b_rptr = iphdr;
3602 if (urp > 0) {
3603 /*
3604 * There is stuff before the urgent
3605 * byte.
3606 */
3607 mp1 = dupmsg(mp);
3608 if (!mp1) {
3609 /*
3610 * Trim from urgent byte on.
3611 * The rest will come back.
3612 */
3613 (void) adjmsg(mp,
3614 urp - seg_len);
3615 tcp_input_data(connp,
3616 mp, NULL, ira);
3617 return;
3618 }
3619 (void) adjmsg(mp1, urp - seg_len);
3620 /* Feed this piece back in. */
3621 tmp_rnxt = tcp->tcp_rnxt;
3622 tcp_input_data(connp, mp1, NULL, ira);
3623 /*
3624 * If the data passed back in was not
3625 * processed (ie: bad ACK) sending
3626 * the remainder back in will cause a
3627 * loop. In this case, drop the
3628 * packet and let the sender try
3629 * sending a good packet.
3630 */
3631 if (tmp_rnxt == tcp->tcp_rnxt) {
3632 freemsg(mp);
3633 return;
3634 }
3635 }
3636 if (urp != seg_len - 1) {
3637 uint32_t tmp_rnxt;
3638 /*
3639 * There is stuff after the urgent
3640 * byte.
3641 */
3642 mp1 = dupmsg(mp);
3643 if (!mp1) {
3644 /*
3645 * Trim everything beyond the
3646 * urgent byte. The rest will
3647 * come back.
3648 */
3649 (void) adjmsg(mp,
3650 urp + 1 - seg_len);
3651 tcp_input_data(connp,
3652 mp, NULL, ira);
3653 return;
3654 }
3655 (void) adjmsg(mp1, urp + 1 - seg_len);
3656 tmp_rnxt = tcp->tcp_rnxt;
3657 tcp_input_data(connp, mp1, NULL, ira);
3658 /*
3659 * If the data passed back in was not
3660 * processed (ie: bad ACK) sending
3661 * the remainder back in will cause a
3662 * loop. In this case, drop the
3663 * packet and let the sender try
3664 * sending a good packet.
3665 */
3666 if (tmp_rnxt == tcp->tcp_rnxt) {
3667 freemsg(mp);
3668 return;
3669 }
3670 }
3671 tcp_input_data(connp, mp, NULL, ira);
3672 return;
3673 }
3674 /*
3675 * This segment contains only the urgent byte. We
3676 * have to allocate the T_exdata_ind, if we can.
3677 */
3678 if (IPCL_IS_NONSTR(connp)) {
3679 int error;
3680
3681 (*sockupcalls->su_recv)
3682 (connp->conn_upper_handle, mp, seg_len,
3683 MSG_OOB, &error, NULL);
3684 /*
3685 * We should never be in middle of a
3686 * fallback, the squeue guarantees that.
3687 */
3688 ASSERT(error != EOPNOTSUPP);
3689 mp = NULL;
3690 goto update_ack;
3691 } else if (!tcp->tcp_urp_mp) {
3692 struct T_exdata_ind *tei;
3693 mp1 = allocb(sizeof (struct T_exdata_ind),
3694 BPRI_MED);
3695 if (!mp1) {
3696 /*
3697 * Sigh... It'll be back.
3698 * Generate any MSG*MARK message now.
3699 */
3700 freemsg(mp);
3701 seg_len = 0;
3702 if (flags & TH_SEND_URP_MARK) {
3705 ASSERT(tcp->tcp_urp_mark_mp);
3706 tcp->tcp_urp_mark_mp->b_flag &=
3707 ~MSGNOTMARKNEXT;
3708 tcp->tcp_urp_mark_mp->b_flag |=
3709 MSGMARKNEXT;
3710 }
3711 goto ack_check;
3712 }
3713 mp1->b_datap->db_type = M_PROTO;
3714 tei = (struct T_exdata_ind *)mp1->b_rptr;
3715 tei->PRIM_type = T_EXDATA_IND;
3716 tei->MORE_flag = 0;
3717 mp1->b_wptr = (uchar_t *)&tei[1];
3718 tcp->tcp_urp_mp = mp1;
3719 #ifdef DEBUG
3720 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
3721 "tcp_rput: allocated exdata_ind %s",
3722 tcp_display(tcp, NULL,
3723 DISP_PORT_ONLY));
3724 #endif /* DEBUG */
3725 /*
3726 * There is no need to send a separate MSG*MARK
3727 * message since the T_EXDATA_IND will be sent
3728 * now.
3729 */
3730 flags &= ~TH_SEND_URP_MARK;
3731 freemsg(tcp->tcp_urp_mark_mp);
3732 tcp->tcp_urp_mark_mp = NULL;
3733 }
3734 /*
3735 * Now we are all set. On the next putnext upstream,
3736 * tcp_urp_mp will be non-NULL and will get prepended
3737 * to what has to be this piece containing the urgent
3738 * byte. If for any reason we abort this segment below,
3739 * if it comes back, we will have this ready, or it
3740 * will get blown off in close.
3741 */
3742 } else if (urp == seg_len) {
3743 /*
3744 * The urgent byte is the next byte after this sequence
3745 * number. If this endpoint is non-STREAMS, then there
3746 * is nothing to do here since the socket has already
3747 * been notified about the urg pointer by the
3748 * su_signal_oob call above.
3749 *
3750 * In case of STREAMS, some more work might be needed.
3751 * If there is data it is marked with MSGMARKNEXT
3752 * and any tcp_urp_mark_mp is discarded since it is not
3753 * needed. Otherwise, if the code above just allocated
3754 * a zero-length tcp_urp_mark_mp message, that message
3755 * is tagged with MSGMARKNEXT. Sending up these
3756 * MSGMARKNEXT messages makes SIOCATMARK work correctly
3757 * even though the T_EXDATA_IND will not be sent up
3758 * until the urgent byte arrives.
3759 */
3760 if (!IPCL_IS_NONSTR(tcp->tcp_connp)) {
3761 if (seg_len != 0) {
3762 flags |= TH_MARKNEXT_NEEDED;
3763 freemsg(tcp->tcp_urp_mark_mp);
3764 tcp->tcp_urp_mark_mp = NULL;
3765 flags &= ~TH_SEND_URP_MARK;
3766 } else if (tcp->tcp_urp_mark_mp != NULL) {
3767 flags |= TH_SEND_URP_MARK;
3768 tcp->tcp_urp_mark_mp->b_flag &=
3769 ~MSGNOTMARKNEXT;
3770 tcp->tcp_urp_mark_mp->b_flag |=
3771 MSGMARKNEXT;
3772 }
3773 }
3774 #ifdef DEBUG
3775 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
3776 "tcp_rput: AT MARK, len %d, flags 0x%x, %s",
3777 seg_len, flags,
3778 tcp_display(tcp, NULL, DISP_PORT_ONLY));
3779 #endif /* DEBUG */
3780 }
3781 #ifdef DEBUG
3782 else {
3783 /* Data left until we hit mark */
3784 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
3785 "tcp_rput: URP %d bytes left, %s",
3786 urp - seg_len, tcp_display(tcp, NULL,
3787 DISP_PORT_ONLY));
3788 }
3789 #endif /* DEBUG */
3790 }
3791
3792 process_ack:
3793 if (!(flags & TH_ACK)) {
3794 freemsg(mp);
3795 goto xmit_check;
3796 }
3797 }
3798 bytes_acked = (int)(seg_ack - tcp->tcp_suna);
3799
3800 if (bytes_acked > 0)
3801 tcp->tcp_ip_forward_progress = B_TRUE;
3802 if (tcp->tcp_state == TCPS_SYN_RCVD) {
3803 /*
3804 * tcp_sendmsg() checks tcp_state without entering
3805 * the squeue so tcp_state should be updated before
3806 * sending up a connection confirmation or a new
3807 * connection indication.
3808 */
3809 tcp->tcp_state = TCPS_ESTABLISHED;
3810
3811 /*
3812 * We are seeing the final ACK in the three-way
3813 * handshake of an actively opened connection,
3814 * so we must send up a T_CONN_CON.
3815 */
3816 if (tcp->tcp_active_open) {
3817 if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
3818 freemsg(mp);
3819 tcp->tcp_state = TCPS_SYN_RCVD;
3820 return;
3821 }
3822 /*
3823 * Don't fuse the loopback endpoints for
3824 * simultaneous active opens.
3825 */
3826 if (tcp->tcp_loopback) {
3827 TCP_STAT(tcps, tcp_fusion_unfusable);
3828 tcp->tcp_unfusable = B_TRUE;
3829 }
3830 /*
3831 * For simultaneous active open, trace receipt of final
3832 * ACK as tcp:::connect-established.
3833 */
3834 DTRACE_TCP5(connect__established, mblk_t *, NULL,
3835 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3836 iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3837 } else if (IPCL_IS_NONSTR(connp)) {
3838 /*
3839 * 3-way handshake has completed, so notify socket
3840 * of the new connection.
3841 *
3842 * Being here means the eager is fine now, but it
3843 * can receive a TH_RST at any point between now and
3844 * when the accept completes, and then disappear. We
3845 * must ensure that the reference to the eager stays
3846 * valid after we leave the eager's perimeter, so we
3847 * take an extra refhold.
3848 */
3849 CONN_INC_REF(connp);
3850
3851 if (!tcp_newconn_notify(tcp, ira)) {
3852 /*
3853 * The state-change probe for SYN_RCVD ->
3854 * ESTABLISHED has not fired yet. We reset
3855 * the state to SYN_RCVD so that future
3856 * state-change probes report correct state
3857 * transitions.
3858 */
3859 tcp->tcp_state = TCPS_SYN_RCVD;
3860 freemsg(mp);
3861 /* notification did not go up, so drop ref */
3862 CONN_DEC_REF(connp);
3863 /* ... and close the eager */
3864 ASSERT(TCP_IS_DETACHED(tcp));
3865 (void) tcp_close_detached(tcp);
3866 return;
3867 }
3868 /*
3869 * tcp_newconn_notify() changes conn_upcalls and
3870 * connp->conn_upper_handle. Fix things now, in case
3871 * there's data attached to this ack.
3872 */
3873 if (connp->conn_upcalls != NULL)
3874 sockupcalls = connp->conn_upcalls;
3875 /*
3876 * For passive open, trace receipt of final ACK as
3877 * tcp:::accept-established.
3878 */
3879 DTRACE_TCP5(accept__established, mblk_t *, NULL,
3880 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3881 iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3882 } else {
3883 /*
3884 * 3-way handshake complete - this is a STREAMS based
3885 * socket, so pass up the T_CONN_IND.
3886 */
3887 tcp_t *listener = tcp->tcp_listener;
3888 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind;
3889
3890 tcp->tcp_tconnind_started = B_TRUE;
3891 tcp->tcp_conn.tcp_eager_conn_ind = NULL;
3892 ASSERT(mp != NULL);
3893 /*
3894 * Being here means the eager is fine now, but it
3895 * can receive a TH_RST at any point between now and
3896 * when the accept completes, and then disappear. We
3897 * must ensure that the reference to the eager stays
3898 * valid after we leave the eager's perimeter, so we
3899 * take an extra refhold.
3900 */
3901 CONN_INC_REF(connp);
3902
3903 /*
3904 * The listener also exists because of the refhold
3905 * done in tcp_input_listener. It's possible that it
3906 * might have closed. We will check that once we
3907 * get inside the listener's context.
3908 */
3909 CONN_INC_REF(listener->tcp_connp);
3910 if (listener->tcp_connp->conn_sqp ==
3911 connp->conn_sqp) {
3912 /*
3913 * We optimize by not calling an SQUEUE_ENTER
3914 * on the listener since we know that the
3915 * listener and eager squeues are the same.
3916 * We are able to make this check safely only
3917 * because neither the eager nor the listener
3918 * can change its squeue. Only an active connect
3919 * can change its squeue.
3920 */
3921 tcp_send_conn_ind(listener->tcp_connp, mp,
3922 listener->tcp_connp->conn_sqp);
3923 CONN_DEC_REF(listener->tcp_connp);
3924 } else if (!tcp->tcp_loopback) {
3925 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
3926 mp, tcp_send_conn_ind,
3927 listener->tcp_connp, NULL, SQ_FILL,
3928 SQTAG_TCP_CONN_IND);
3929 } else {
3930 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
3931 mp, tcp_send_conn_ind,
3932 listener->tcp_connp, NULL, SQ_NODRAIN,
3933 SQTAG_TCP_CONN_IND);
3934 }
3935 /*
3936 * For passive open, trace receipt of final ACK as
3937 * tcp:::accept-established.
3938 */
3939 DTRACE_TCP5(accept__established, mblk_t *, NULL,
3940 ip_xmit_attr_t *, connp->conn_ixa, void_ip_t *,
3941 iphdr, tcp_t *, tcp, tcph_t *, tcpha);
3942 }
3943 TCPS_CONN_INC(tcps);
3944
3945 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
3946 bytes_acked--;
3947 /* SYN was acked - making progress */
3948 tcp->tcp_ip_forward_progress = B_TRUE;
3949
3950 /*
3951 * If SYN was retransmitted, need to reset all
3952 * retransmission info as this segment will be
3953 * treated as a dup ACK.
3954 */
3955 if (tcp->tcp_rexmit) {
3956 tcp->tcp_rexmit = B_FALSE;
3957 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
3958 tcp->tcp_rexmit_max = tcp->tcp_snxt;
3959 tcp->tcp_ms_we_have_waited = 0;
3960 DTRACE_PROBE3(cwnd__retransmitted__syn,
3961 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
3962 uint32_t, tcp->tcp_mss);
3963 tcp->tcp_cwnd = mss;
3964 }
3965
3966 /*
3967 * We set the send window to zero here.
3968 * This is needed if there is data to be
3969 * processed already on the queue.
3970 * Later (at the swnd_update label), when the
3971 * "new_swnd > tcp_swnd" condition is satisfied,
3972 * the XMIT_NEEDED flag is set in the current
3973 * (SYN_RCVD) state. This ensures tcp_wput_data() is
3974 * called if there is already data on the queue in
3975 * this state.
3976 */
3977 tcp->tcp_swnd = 0;
3978
3979 if (new_swnd > tcp->tcp_max_swnd)
3980 tcp->tcp_max_swnd = new_swnd;
3981 tcp->tcp_swl1 = seg_seq;
3982 tcp->tcp_swl2 = seg_ack;
3983 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
3984
3985 /* Trace change from SYN_RCVD -> ESTABLISHED here */
3986 DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
3987 connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
3988 int32_t, TCPS_SYN_RCVD);
3989
3990 /* Fuse when both sides are in ESTABLISHED state */
3991 if (tcp->tcp_loopback && do_tcp_fusion)
3992 tcp_fuse(tcp, iphdr, tcpha);
3993
3994 }
3995 /* This code mostly follows 4.4BSD-Lite2. */
3996 if (bytes_acked < 0)
3997 goto est;
3998
3999 /*
4000 * If TCP is ECN capable and the congestion experience bit is
4001 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
4002 * done once per window (or more loosely, per RTT).
4003 */
4004 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
4005 tcp->tcp_cwr = B_FALSE;
4006 if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
4007 cc_cong_signal(tcp, seg_ack, CC_ECN);
4008 /*
4009 * If the cwnd is 0, use the timer to clock out
4010 * new segments. This is required by the ECN spec.
4011 */
4012 if (tcp->tcp_cwnd == 0)
4013 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4014 tcp->tcp_cwr = B_TRUE;
4015 /*
4016 * This marks the end of the current window of in
4017 * flight data. That is why we don't use
4018 * tcp_suna + tcp_swnd. Only data in flight can
4019 * provide ECN info.
4020 */
4021 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4022 }
4023
4024 mp1 = tcp->tcp_xmit_head;
4025 if (bytes_acked == 0) {
4026 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) {
4027 int dupack_cnt;
4028
4029 TCPS_BUMP_MIB(tcps, tcpInDupAck);
4030 /*
4031 * Fast retransmit. When we have seen exactly three
4032 * identical ACKs while we have unacked data
4033 * outstanding we take it as a hint that our peer
4034 * dropped something.
4035 *
4036 * If TCP is retransmitting, don't do fast retransmit.
4037 */
4038 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt &&
4039 !tcp->tcp_rexmit) {
4040 /* Do Limited Transmit */
4041 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
4042 tcps->tcps_dupack_fast_retransmit) {
4043 cc_ack_received(tcp, seg_ack,
4044 bytes_acked, CC_DUPACK);
4045 /*
4046 * RFC 3042
4047 *
4048 * What we need to do is temporarily
4049 * increase tcp_cwnd so that new
4050 * data can be sent if it is allowed
4051 * by the receive window (tcp_rwnd).
4052 * tcp_wput_data() will take care of
4053 * the rest.
4054 *
4055 * If the connection is SACK capable,
4056 * only do limited xmit when there
4057 * is SACK info.
4058 *
4059 * Note how tcp_cwnd is incremented.
4060 * The first dup ACK will increase
4061 * it by 1 MSS. The second dup ACK
4062 * will increase it by 2 MSS. This
4063 * means that only 1 new segment will
4064 * be sent for each dup ACK.
4065 */
4066 if (tcp->tcp_unsent > 0 &&
4067 (!tcp->tcp_snd_sack_ok ||
4068 (tcp->tcp_snd_sack_ok &&
4069 tcp->tcp_notsack_list != NULL))) {
4070 tcp->tcp_cwnd += mss <<
4071 (tcp->tcp_dupack_cnt - 1);
4072 flags |= TH_LIMIT_XMIT;
4073 }
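/*
 * For illustration, with an mss of 1460: the
 * first dup ACK adds 1460 to tcp_cwnd and the
 * second adds 2920. The TH_LIMIT_XMIT handling
 * at xmit_check subtracts the same amount once
 * the new data is sent, so each dup ACK nets at
 * most one extra segment, per RFC 3042.
 */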
4074 } else if (dupack_cnt ==
4075 tcps->tcps_dupack_fast_retransmit) {
4076
4077 /*
4078 * If we have reduced tcp_ssthresh
4079 * because of ECN, do not reduce it again
4080 * unless it is already one window of data
4081 * away. After one window of data, tcp_cwr
4082 * should then be cleared. Note that
4083 * for non ECN capable connection, tcp_cwr
4084 * should always be false.
4085 *
4086 * Adjust cwnd since the duplicate
4087 * ack indicates that a packet was
4088 * dropped (due to congestion.)
4089 */
4090 if (!tcp->tcp_cwr) {
4091 cc_cong_signal(tcp, seg_ack,
4092 CC_NDUPACK);
4093 cc_ack_received(tcp, seg_ack,
4094 bytes_acked, CC_DUPACK);
4095 }
4096 if (tcp->tcp_ecn_ok) {
4097 tcp->tcp_cwr = B_TRUE;
4098 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4099 tcp->tcp_ecn_cwr_sent = B_FALSE;
4100 }
4101
4102 /*
4103 * We do Hoe's algorithm. Refer to her
4104 * paper "Improving the Start-up Behavior
4105 * of a Congestion Control Scheme for TCP,"
4106 * which appeared in SIGCOMM '96.
4107 *
4108 * Save highest seq no we have sent so far.
4109 * Be careful about the invisible FIN byte.
4110 */
4111 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4112 (tcp->tcp_unsent == 0)) {
4113 tcp->tcp_rexmit_max = tcp->tcp_fss;
4114 } else {
4115 tcp->tcp_rexmit_max = tcp->tcp_snxt;
4116 }
4117
4118 /*
4119 * For SACK:
4120 * Calculate tcp_pipe, which is the
4121 * estimated number of bytes in
4122 * network.
4123 *
4124 * tcp_fack is the highest sack'ed seq num
4125 * TCP has received.
4126 *
4127 * tcp_pipe is explained in Fall and Floyd's paper
4128 * "Simulations of Tahoe, Reno and SACK TCP". tcp_fack is
4129 * explained in Mathis and Mahdavi's
4130 * "Forward Acknowledgment: Refining TCP
4131 * Congestion Control" in SIGCOMM '96.
4132 */
4133 if (tcp->tcp_snd_sack_ok) {
4134 if (tcp->tcp_notsack_list != NULL) {
4135 tcp->tcp_pipe = tcp->tcp_snxt -
4136 tcp->tcp_fack;
4137 tcp->tcp_sack_snxt = seg_ack;
4138 flags |= TH_NEED_SACK_REXMIT;
4139 } else {
4140 /*
4141 * Always initialize tcp_pipe
4142 * even though we don't have
4143 * any SACK info. If later
4144 * we get SACK info and
4145 * tcp_pipe is not initialized,
4146 * funny things will happen.
4147 */
4148 tcp->tcp_pipe =
4149 tcp->tcp_cwnd_ssthresh;
4150 }
4151 } else {
4152 flags |= TH_REXMIT_NEEDED;
4153 } /* tcp_snd_sack_ok */
4154
4155 } else {
4156 cc_ack_received(tcp, seg_ack,
4157 bytes_acked, CC_DUPACK);
4158 /*
4159 * Here we perform congestion
4160 * avoidance, but NOT slow start.
4161 * This is known as the Fast
4162 * Recovery Algorithm.
4163 */
4164 if (tcp->tcp_snd_sack_ok &&
4165 tcp->tcp_notsack_list != NULL) {
4166 flags |= TH_NEED_SACK_REXMIT;
4167 tcp->tcp_pipe -= mss;
4168 if (tcp->tcp_pipe < 0)
4169 tcp->tcp_pipe = 0;
4170 } else {
4171 /*
4172 * We know that one more packet has
4173 * left the pipe thus we can update
4174 * cwnd.
4175 */
4176 cwnd = tcp->tcp_cwnd + mss;
4177 if (cwnd > tcp->tcp_cwnd_max)
4178 cwnd = tcp->tcp_cwnd_max;
4179 DTRACE_PROBE3(cwnd__fast__recovery,
4180 tcp_t *, tcp,
4181 uint32_t, tcp->tcp_cwnd,
4182 uint32_t, cwnd);
4183 tcp->tcp_cwnd = cwnd;
4184 if (tcp->tcp_unsent > 0)
4185 flags |= TH_XMIT_NEEDED;
4186 }
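/*
 * For illustration: each additional dup ACK
 * past the threshold signals that one more
 * segment has left the network, so cwnd is
 * inflated by one mss (capped at tcp_cwnd_max
 * above) to clock new data out until the hole
 * is acked and recovery ends.
 */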
4187 }
4188 }
4189 } else if (tcp->tcp_zero_win_probe) {
4190 /*
4191 * If the window has opened, need to arrange
4192 * to send additional data.
4193 */
4194 if (new_swnd != 0) {
4195 /* tcp_suna != tcp_snxt */
4196 /* Packet contains a window update */
4197 TCPS_BUMP_MIB(tcps, tcpInWinUpdate);
4198 tcp->tcp_zero_win_probe = 0;
4199 tcp->tcp_timer_backoff = 0;
4200 tcp->tcp_ms_we_have_waited = 0;
4201
4202 /*
4203 * Transmit starting with tcp_suna since
4204 * the one byte probe is not ack'ed.
4205 * If TCP has sent more than one identical
4206 * probe, tcp_rexmit will be set. That means
4207 * tcp_ss_rexmit() will send out the one
4208 * byte along with new data. Otherwise,
4209 * fake the retransmission.
4210 */
4211 flags |= TH_XMIT_NEEDED;
4212 if (!tcp->tcp_rexmit) {
4213 tcp->tcp_rexmit = B_TRUE;
4214 tcp->tcp_dupack_cnt = 0;
4215 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
4216 tcp->tcp_rexmit_max = tcp->tcp_suna + 1;
4217 }
4218 }
4219 }
4220 goto swnd_update;
4221 }
4222
4223 /*
4224 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73.
4225 * If the ACK value acks something that we have not yet sent, it might
4226 * be an old duplicate segment. Send an ACK to re-synchronize the
4227 * other side.
4228 * Note: reset in response to unacceptable ACK in SYN_RECEIVE
4229 * state is handled above, so we can always just drop the segment and
4230 * send an ACK here.
4231 *
4232 * In the case where the peer shrinks the window, we see the new window
4233 * update, but all the data sent previously is queued up by the peer.
4234 * To account for this, in tcp_process_shrunk_swnd(), the sequence
4235 * number, which was already sent, and within window, is recorded.
4236 * tcp_snxt is then updated.
4237 *
4238 * If the window has previously shrunk, and an ACK for data not yet
4239 * sent according to tcp_snxt is received, it may still be valid. If
4240 * the ACK is for data within the window at the time the window was
4241 * shrunk, then the ACK is acceptable. In this case tcp_snxt is set to
4242 * the sequence number ACK'ed.
4243 *
4244 * If the ACK covers all the data sent at the time the window was
4245 * shrunk, we can now set tcp_is_wnd_shrnk to B_FALSE.
4246 *
4247 * Should we send ACKs in response to ACK only segments?
4248 */
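/*
 * For reference, the SEQ_* comparisons used here are wrap-safe
 * modular comparisons of 32-bit sequence numbers, essentially
 * SEQ_GT(a, b) == ((int32_t)((a) - (b)) > 0), so an ACK just
 * past a wrapped tcp_snxt still compares as "greater".
 */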
4249
4250 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) {
4251 if ((tcp->tcp_is_wnd_shrnk) &&
4252 (SEQ_LEQ(seg_ack, tcp->tcp_snxt_shrunk))) {
4253 uint32_t data_acked_ahead_snxt;
4254
4255 data_acked_ahead_snxt = seg_ack - tcp->tcp_snxt;
4256 tcp_update_xmit_tail(tcp, seg_ack);
4257 tcp->tcp_unsent -= data_acked_ahead_snxt;
4258 } else {
4259 TCPS_BUMP_MIB(tcps, tcpInAckUnsent);
4260 /* drop the received segment */
4261 freemsg(mp);
4262
4263 /*
4264 * Send back an ACK. If tcp_drop_ack_unsent_cnt is
4265 * greater than 0, check if the number of such
4266 * bogus ACKs is greater than that count. If yes,
4267 * don't send back any ACK. This prevents TCP from
4268 * getting into an ACK storm if somehow an attacker
4269 * successfully spoofs an acceptable segment to our
4270 * peer. If this continues (count > 2 X threshold),
4271 * we should abort this connection.
4272 */
4273 if (tcp_drop_ack_unsent_cnt > 0 &&
4274 ++tcp->tcp_in_ack_unsent >
4275 tcp_drop_ack_unsent_cnt) {
4276 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4277 if (tcp->tcp_in_ack_unsent > 2 *
4278 tcp_drop_ack_unsent_cnt) {
4279 (void) tcp_clean_death(tcp, EPROTO);
4280 }
4281 return;
4282 }
4283 mp = tcp_ack_mp(tcp);
4284 if (mp != NULL) {
4285 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4286 TCPS_BUMP_MIB(tcps, tcpOutAck);
4287 tcp_send_data(tcp, mp);
4288 }
4289 return;
4290 }
4291 } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4292 tcp->tcp_snxt_shrunk)) {
4293 tcp->tcp_is_wnd_shrnk = B_FALSE;
4294 }
4295
4296 /*
4297 * TCP gets a new ACK, update the notsack'ed list to delete those
4298 * blocks that are covered by this ACK.
4299 */
4300 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
4301 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
4302 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
4303 }
4304
4305 /*
4306 * If we got an ACK after fast retransmit, check to see
4307 * if it is a partial ACK. If it is not and the congestion
4308 * window was inflated to account for the other side's
4309 * cached packets, retract it. If it is, do Hoe's algorithm.
4310 */
4311 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
4312 ASSERT(tcp->tcp_rexmit == B_FALSE);
4313 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
4314 tcp->tcp_dupack_cnt = 0;
4315
4316 cc_post_recovery(tcp, seg_ack);
4317
4318 tcp->tcp_rexmit_max = seg_ack;
4319
4320 /*
4321 * Remove all notsack info to avoid confusion with
4322 * the next fast retransmit/recovery phase.
4323 */
4324 if (tcp->tcp_snd_sack_ok) {
4325 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list,
4326 tcp);
4327 }
4328 } else {
4329 if (tcp->tcp_snd_sack_ok &&
4330 tcp->tcp_notsack_list != NULL) {
4331 flags |= TH_NEED_SACK_REXMIT;
4332 tcp->tcp_pipe -= mss;
4333 if (tcp->tcp_pipe < 0)
4334 tcp->tcp_pipe = 0;
4335 } else {
4336 /*
4337 * Hoe's algorithm:
4338 *
4339 * Retransmit the unack'ed segment and
4340 * restart fast recovery. Note that we
4341 * need to scale back tcp_cwnd to the
4342 * original value when we started fast
4343 * recovery. This is to prevent overly
4344 * aggressive behaviour in sending new
4345 * segments.
4346 */
4347 cwnd = tcp->tcp_cwnd_ssthresh +
4348 tcps->tcps_dupack_fast_retransmit * mss;
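/*
 * E.g. with tcps_dupack_fast_retransmit of 3
 * and an mss of 1460, a tcp_cwnd_ssthresh of
 * 8760 yields a cwnd of 13140: the saved
 * ssthresh plus one mss for each of the three
 * dup-ACKed segments known to have left the
 * network.
 */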
4349 DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
4350 tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
4351 uint32_t, cwnd);
4352 tcp->tcp_cwnd = cwnd;
4353 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
4354 flags |= TH_REXMIT_NEEDED;
4355 }
4356 }
4357 } else {
4358 tcp->tcp_dupack_cnt = 0;
4359 if (tcp->tcp_rexmit) {
4360 /*
4361 * TCP is retransmitting. If the ACK acks all
4362 * outstanding data, update tcp_rexmit_max and
4363 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
4364 * to the correct value.
4365 *
4366 * Note that SEQ_LEQ() is used. This is to avoid
4367 * unnecessary fast retransmit caused by dup ACKs
4368 * received when TCP does slow start retransmission
4369 * after a time out. During this phase, TCP may
4370 * send out segments which are already received.
4371 * This causes dup ACKs to be sent back.
4372 */
4373 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) {
4374 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) {
4375 tcp->tcp_rexmit_nxt = seg_ack;
4376 }
4377 if (seg_ack != tcp->tcp_rexmit_max) {
4378 flags |= TH_XMIT_NEEDED;
4379 }
4380 } else {
4381 tcp->tcp_rexmit = B_FALSE;
4382 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
4383 }
4384 tcp->tcp_ms_we_have_waited = 0;
4385 }
4386 }
4387
4388 TCPS_BUMP_MIB(tcps, tcpInAckSegs);
4389 TCPS_UPDATE_MIB(tcps, tcpInAckBytes, bytes_acked);
4390 tcp->tcp_suna = seg_ack;
4391 if (tcp->tcp_zero_win_probe != 0) {
4392 tcp->tcp_zero_win_probe = 0;
4393 tcp->tcp_timer_backoff = 0;
4394 }
4395
4396 /*
4397 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
4398 * Note that it cannot be the SYN being ack'ed. The code flow
4399 * will not reach here.
4400 */
4401 if (mp1 == NULL) {
4402 goto fin_acked;
4403 }
4404
4405 /*
4406 * Update the congestion window.
4407 *
4408 * If TCP is not ECN capable or TCP is ECN capable but the
4409 * congestion experience bit is not set, increase the tcp_cwnd as
4410 * usual.
4411 */
4412 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
4413 if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
4414 EXIT_RECOVERY(tcp->tcp_ccv.flags);
4415 }
4416 cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
4417 }
4418
4419 /* See if the latest urgent data has been acknowledged */
4420 if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4421 SEQ_GT(seg_ack, tcp->tcp_urg))
4422 tcp->tcp_valid_bits &= ~TCP_URG_VALID;
4423
4424 /*
4425 * Update the RTT estimates. Note that we don't use the TCP
4426 * timestamp option to calculate RTT even if one is present. This is
4427 * because the timestamp option's resolution (CPU tick) is
4428 * too coarse to measure modern datacenter networks' microsecond
4429 * latencies. The timestamp field's resolution is limited by its
4430 * 4-byte width (see RFC1323), and since we always store a
4431 * high-resolution nanosecond precision timestamp along with the data,
4432 * there is no point to ever using the timestamp option.
4433 */
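/*
 * Note the transmit queue conventions visible here: b_prev of
 * an mblk holds the gethrtime() value recorded when its data
 * was sent, and b_next holds the sequence number that seg_ack
 * must pass for that timing sample to be usable. A fresh
 * timestamp is taken (and b_next reset) further below once all
 * the bytes timed by the old stamp have been acked.
 */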
4434 if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
4435 /*
4436 * An ACK sequence we haven't seen before, so get the RTT
4437 * and update the RTO. But first check if the timestamp is
4438 * valid to use.
4439 */
4440 if ((mp1->b_next != NULL) &&
4441 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
4442 tcp_set_rto(tcp, gethrtime() -
4443 (hrtime_t)(intptr_t)mp1->b_prev);
4444 } else {
4445 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4446 }
4447
4448 /* Remember the last sequence to be ACKed */
4449 tcp->tcp_csuna = seg_ack;
4450 if (tcp->tcp_set_timer == 1) {
4451 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4452 tcp->tcp_set_timer = 0;
4453 }
4454 } else {
4455 TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
4456 }
4457
4458 /* Eat acknowledged bytes off the xmit queue. */
4459 for (;;) {
4460 mblk_t *mp2;
4461 uchar_t *wptr;
4462
4463 wptr = mp1->b_wptr;
4464 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
4465 bytes_acked -= (int)(wptr - mp1->b_rptr);
4466 if (bytes_acked < 0) {
4467 mp1->b_rptr = wptr + bytes_acked;
4468 /*
4469 * Set a new timestamp if all the bytes timed by the
4470 * old timestamp have been ack'ed.
4471 */
4472 if (SEQ_GT(seg_ack,
4473 (uint32_t)(uintptr_t)(mp1->b_next))) {
4474 mp1->b_prev =
4475 (mblk_t *)(intptr_t)gethrtime();
4476 mp1->b_next = NULL;
4477 }
4478 break;
4479 }
4480 mp1->b_next = NULL;
4481 mp1->b_prev = NULL;
4482 mp2 = mp1;
4483 mp1 = mp1->b_cont;
4484
4485 /*
4486 * This notification is required for some zero-copy
4487 * clients to maintain a copy semantic. After the data
4488 * is ack'ed, client is safe to modify or reuse the buffer.
4489 */
4490 if (tcp->tcp_snd_zcopy_aware &&
4491 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
4492 tcp_zcopy_notify(tcp);
4493 freeb(mp2);
4494 if (bytes_acked == 0) {
4495 if (mp1 == NULL) {
4496 /* Everything is ack'ed, clear the tail. */
4497 tcp->tcp_xmit_tail = NULL;
4498 /*
4499 * Cancel the timer unless we are still
4500 * waiting for an ACK for the FIN packet.
4501 */
4502 if (tcp->tcp_timer_tid != 0 &&
4503 tcp->tcp_snxt == tcp->tcp_suna) {
4504 (void) TCP_TIMER_CANCEL(tcp,
4505 tcp->tcp_timer_tid);
4506 tcp->tcp_timer_tid = 0;
4507 }
4508 goto pre_swnd_update;
4509 }
4510 if (mp2 != tcp->tcp_xmit_tail)
4511 break;
4512 tcp->tcp_xmit_tail = mp1;
4513 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
4514 (uintptr_t)INT_MAX);
4515 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
4516 mp1->b_rptr);
4517 break;
4518 }
4519 if (mp1 == NULL) {
4520 /*
4521 * More was acked but there is nothing more
4522 * outstanding. This means that the FIN was
4523 * just acked or that we're talking to a clown.
4524 */
4525 fin_acked:
4526 ASSERT(tcp->tcp_fin_sent);
4527 tcp->tcp_xmit_tail = NULL;
4528 if (tcp->tcp_fin_sent) {
4529 /* FIN was acked - making progress */
4530 if (!tcp->tcp_fin_acked)
4531 tcp->tcp_ip_forward_progress = B_TRUE;
4532 tcp->tcp_fin_acked = B_TRUE;
4533 if (tcp->tcp_linger_tid != 0 &&
4534 TCP_TIMER_CANCEL(tcp,
4535 tcp->tcp_linger_tid) >= 0) {
4536 tcp_stop_lingering(tcp);
4537 freemsg(mp);
4538 mp = NULL;
4539 }
4540 } else {
4541 /*
4542 * We should never get here because
4543 * we have already checked that the
4544 * number of bytes ack'ed should be
4545 * smaller than or equal to what we
4546 * have sent so far (it is the
4547 * acceptability check of the ACK).
4548 * We can only get here if the send
4549 * queue is corrupted.
4550 *
4551 * Terminate the connection and
4552 * panic the system. It is better
4553 * for us to panic instead of
4554 * continuing, to avoid further disaster.
4555 */
4556 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
4557 tcp->tcp_rnxt, TH_RST|TH_ACK);
4558 panic("Memory corruption "
4559 "detected for connection %s.",
4560 tcp_display(tcp, NULL,
4561 DISP_ADDR_AND_PORT));
4562 /*NOTREACHED*/
4563 }
4564 goto pre_swnd_update;
4565 }
4566 ASSERT(mp2 != tcp->tcp_xmit_tail);
4567 }
4568 if (tcp->tcp_unsent) {
4569 flags |= TH_XMIT_NEEDED;
4570 }
4571 pre_swnd_update:
4572 tcp->tcp_xmit_head = mp1;
4573 swnd_update:
4574 /*
4575 * The following check is different from most other implementations.
4576 * For bi-directional transfer, when segments are dropped, the
4577 * "normal" check will not accept a window update in those
4578 * retransmitted segments. Failing to do that, TCP may send out
4579 * segments which are outside the receiver's window. As TCP accepts
4580 * the ack in those retransmitted segments, if the window update in
4581 * the same segment is not accepted, TCP will incorrectly calculate
4582 * that it can send more segments. This can create a deadlock
4583 * with the receiver if its window becomes zero.
4584 */
4585 if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
4586 SEQ_LT(tcp->tcp_swl1, seg_seq) ||
4587 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
4588 /*
4589 * The criteria for update is:
4590 *
4591 * 1. the segment acknowledges some data. Or
4592 * 2. the segment is new, i.e. it has a higher seq num. Or
4593 * 3. the segment is not old and the advertised window is
4594 * larger than the previous advertised window.
4595 */
4596 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
4597 flags |= TH_XMIT_NEEDED;
4598 tcp->tcp_swnd = new_swnd;
4599 if (new_swnd > tcp->tcp_max_swnd)
4600 tcp->tcp_max_swnd = new_swnd;
4601 tcp->tcp_swl1 = seg_seq;
4602 tcp->tcp_swl2 = seg_ack;
4603 }
4604 est:
4605 if (tcp->tcp_state > TCPS_ESTABLISHED) {
4606
4607 switch (tcp->tcp_state) {
4608 case TCPS_FIN_WAIT_1:
4609 if (tcp->tcp_fin_acked) {
4610 tcp->tcp_state = TCPS_FIN_WAIT_2;
4611 DTRACE_TCP6(state__change, void, NULL,
4612 ip_xmit_attr_t *, connp->conn_ixa,
4613 void, NULL, tcp_t *, tcp, void, NULL,
4614 int32_t, TCPS_FIN_WAIT_1);
4615 /*
4616 * We implement the non-standard BSD/SunOS
4617 * FIN_WAIT_2 flushing algorithm.
4618 * If there is no user attached to this
4619 * TCP endpoint, then this TCP struct
4620 * could hang around forever in FIN_WAIT_2
4621 * state if the peer forgets to send us
4622 * a FIN. To prevent this, we wait only
4623 * 2*MSL (a convenient time value) for
4624 * the FIN to arrive. If it doesn't show up,
4625 * we flush the TCP endpoint. This algorithm,
4626 * though a violation of RFC-793, has worked
4627 * for over 10 years in BSD systems.
4628 * Note: SunOS 4.x waits 675 seconds before
4629 * flushing the FIN_WAIT_2 connection.
4630 */
4631 TCP_TIMER_RESTART(tcp,
4632 tcp->tcp_fin_wait_2_flush_interval);
4633 }
4634 break;
4635 case TCPS_FIN_WAIT_2:
4636 break; /* Shutdown hook? */
4637 case TCPS_LAST_ACK:
4638 freemsg(mp);
4639 if (tcp->tcp_fin_acked) {
4640 (void) tcp_clean_death(tcp, 0);
4641 return;
4642 }
4643 goto xmit_check;
4644 case TCPS_CLOSING:
4645 if (tcp->tcp_fin_acked) {
4646 SET_TIME_WAIT(tcps, tcp, connp);
4647 DTRACE_TCP6(state__change, void, NULL,
4648 ip_xmit_attr_t *, connp->conn_ixa, void,
4649 NULL, tcp_t *, tcp, void, NULL, int32_t,
4650 TCPS_CLOSING);
4651 }
4652 /*FALLTHRU*/
4653 case TCPS_CLOSE_WAIT:
4654 freemsg(mp);
4655 goto xmit_check;
4656 default:
4657 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT);
4658 break;
4659 }
4660 }
4661 if (flags & TH_FIN) {
4662 /* Make sure we ack the fin */
4663 flags |= TH_ACK_NEEDED;
4664 if (!tcp->tcp_fin_rcvd) {
4665 tcp->tcp_fin_rcvd = B_TRUE;
4666 tcp->tcp_rnxt++;
4667 tcpha = tcp->tcp_tcpha;
4668 tcpha->tha_ack = htonl(tcp->tcp_rnxt);
4669
4670 /*
4671 * Generate the ordrel_ind at the end unless the
4672 * conn is detached or it is a STREAMS based eager.
4673 * In the eager case we defer the notification until
4674 * tcp_accept_finish has run.
4675 */
4676 if (!TCP_IS_DETACHED(tcp) && (IPCL_IS_NONSTR(connp) ||
4677 (tcp->tcp_listener == NULL &&
4678 !tcp->tcp_hard_binding)))
4679 flags |= TH_ORDREL_NEEDED;
4680 switch (tcp->tcp_state) {
4681 case TCPS_SYN_RCVD:
4682 tcp->tcp_state = TCPS_CLOSE_WAIT;
4683 DTRACE_TCP6(state__change, void, NULL,
4684 ip_xmit_attr_t *, connp->conn_ixa,
4685 void, NULL, tcp_t *, tcp, void, NULL,
4686 int32_t, TCPS_SYN_RCVD);
4687 /* Keepalive? */
4688 break;
4689 case TCPS_ESTABLISHED:
4690 tcp->tcp_state = TCPS_CLOSE_WAIT;
4691 DTRACE_TCP6(state__change, void, NULL,
4692 ip_xmit_attr_t *, connp->conn_ixa,
4693 void, NULL, tcp_t *, tcp, void, NULL,
4694 int32_t, TCPS_ESTABLISHED);
4695 /* Keepalive? */
4696 break;
4697 case TCPS_FIN_WAIT_1:
4698 if (!tcp->tcp_fin_acked) {
4699 tcp->tcp_state = TCPS_CLOSING;
4700 DTRACE_TCP6(state__change, void, NULL,
4701 ip_xmit_attr_t *, connp->conn_ixa,
4702 void, NULL, tcp_t *, tcp, void,
4703 NULL, int32_t, TCPS_FIN_WAIT_1);
4704 break;
4705 }
4706 /* FALLTHRU */
4707 case TCPS_FIN_WAIT_2:
4708 SET_TIME_WAIT(tcps, tcp, connp);
4709 DTRACE_TCP6(state__change, void, NULL,
4710 ip_xmit_attr_t *, connp->conn_ixa, void,
4711 NULL, tcp_t *, tcp, void, NULL, int32_t,
4712 TCPS_FIN_WAIT_2);
4713 if (seg_len) {
4714 /*
4715 * implies data piggybacked on FIN.
4716 * break to handle data.
4717 */
4718 break;
4719 }
4720 freemsg(mp);
4721 goto ack_check;
4722 }
4723 }
4724 }
4725 if (mp == NULL)
4726 goto xmit_check;
4727 if (seg_len == 0) {
4728 freemsg(mp);
4729 goto xmit_check;
4730 }
4731 if (mp->b_rptr == mp->b_wptr) {
4732 /*
4733 * The header has been consumed, so we remove the
4734 * zero-length mblk here.
4735 */
4736 mp1 = mp;
4737 mp = mp->b_cont;
4738 freeb(mp1);
4739 }
4740 update_ack:
4741 tcpha = tcp->tcp_tcpha;
4742 tcp->tcp_rack_cnt++;
4743 {
4744 uint32_t cur_max;
4745
4746 cur_max = tcp->tcp_rack_cur_max;
4747 if (tcp->tcp_rack_cnt >= cur_max) {
4748 /*
4749 * We have more unacked data than we should - send
4750 * an ACK now.
4751 */
4752 flags |= TH_ACK_NEEDED;
4753 cur_max++;
4754 if (cur_max > tcp->tcp_rack_abs_max)
4755 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max;
4756 else
4757 tcp->tcp_rack_cur_max = cur_max;
4758 } else if (tcp->tcp_quickack) {
4759 /* The application asked that we ACK each packet. */
4760 flags |= TH_ACK_NEEDED;
4761 } else if (TCP_IS_DETACHED(tcp)) {
4762 /* We don't have an ACK timer for detached TCP. */
4763 flags |= TH_ACK_NEEDED;
4764 } else if (seg_len < mss) {
4765 /*
4766 * If we get a segment that is less than an mss, and we
4767 * already have unacknowledged data, and the amount
4768 * unacknowledged is not a multiple of mss, then we
4769 * better generate an ACK now. Otherwise, this may be
4770 * the tail piece of a transaction, and we would rather
4771 * wait for the response.
4772 */
4773 uint32_t udif;
4774 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <=
4775 (uintptr_t)INT_MAX);
4776 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack);
4777 if (udif && (udif % mss))
4778 flags |= TH_ACK_NEEDED;
4779 else
4780 flags |= TH_ACK_TIMER_NEEDED;
4781 } else {
4782 /* Start delayed ack timer */
4783 flags |= TH_ACK_TIMER_NEEDED;
4784 }
4785 }
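/*
 * For illustration, with an mss of 1460: 2000 unacked bytes
 * give udif % mss == 540, so an ACK is sent immediately;
 * 2920 unacked bytes (2 * mss) give a zero remainder, so the
 * delayed ACK timer runs and the ACK can ride on the local
 * response instead.
 */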
4786 tcp->tcp_rnxt += seg_len;
4787 tcpha->tha_ack = htonl(tcp->tcp_rnxt);
4788
4789 if (mp == NULL)
4790 goto xmit_check;
4791
4792 /* Update SACK list */
4793 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
4794 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
4795 &(tcp->tcp_num_sack_blk));
4796 }
4797
4798 if (tcp->tcp_urp_mp) {
4799 tcp->tcp_urp_mp->b_cont = mp;
4800 mp = tcp->tcp_urp_mp;
4801 tcp->tcp_urp_mp = NULL;
4802 /* Ready for a new signal. */
4803 tcp->tcp_urp_last_valid = B_FALSE;
4804 #ifdef DEBUG
4805 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
4806 "tcp_rput: sending exdata_ind %s",
4807 tcp_display(tcp, NULL, DISP_PORT_ONLY));
4808 #endif /* DEBUG */
4809 }
4810
4811 /*
4812 * Check for ancillary data changes compared to last segment.
4813 */
4814 if (connp->conn_recv_ancillary.crb_all != 0) {
4815 mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira);
4816 if (mp == NULL)
4817 return;
4818 }
4819
4820 if (IPCL_IS_NONSTR(connp)) {
4821 /*
4822 * Non-STREAMS socket
4823 */
4824 boolean_t push = flags & (TH_PUSH|TH_FIN);
4825 int error;
4826
4827 if ((*sockupcalls->su_recv)(connp->conn_upper_handle,
4828 mp, seg_len, 0, &error, &push) <= 0) {
4829 /*
4830 * We should never be in middle of a
4831 * fallback, the squeue guarantees that.
4832 */
4833 ASSERT(error != EOPNOTSUPP);
4834 if (error == ENOSPC)
4835 tcp->tcp_rwnd -= seg_len;
4836 } else if (push) {
4837 /* PUSH bit set and sockfs is not flow controlled */
4838 flags |= tcp_rwnd_reopen(tcp);
4839 }
4840 } else if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
4841 /*
4842 * Side queue inbound data until the accept happens.
4843 * tcp_accept/tcp_rput drains this when the accept happens.
4844 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or
4845 * T_EXDATA_IND) it is queued on b_next.
4846 * XXX Make urgent data use this. Requires:
4847 * Removing tcp_listener check for TH_URG
4848 * Making M_PCPROTO and MARK messages skip the eager case
4849 */
4850
4851 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
4852 } else {
4853 /* Active STREAMS socket */
4854 if (mp->b_datap->db_type != M_DATA ||
4855 (flags & TH_MARKNEXT_NEEDED)) {
4856 if (tcp->tcp_rcv_list != NULL) {
4857 flags |= tcp_rcv_drain(tcp);
4858 }
4859 ASSERT(tcp->tcp_rcv_list == NULL ||
4860 tcp->tcp_fused_sigurg);
4861
4862 if (flags & TH_MARKNEXT_NEEDED) {
4863 #ifdef DEBUG
4864 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
4865 "tcp_rput: sending MSGMARKNEXT %s",
4866 tcp_display(tcp, NULL,
4867 DISP_PORT_ONLY));
4868 #endif /* DEBUG */
4869 mp->b_flag |= MSGMARKNEXT;
4870 flags &= ~TH_MARKNEXT_NEEDED;
4871 }
4872
4873 if (is_system_labeled())
4874 tcp_setcred_data(mp, ira);
4875
4876 putnext(connp->conn_rq, mp);
4877 if (!canputnext(connp->conn_rq))
4878 tcp->tcp_rwnd -= seg_len;
4879 } else if ((flags & (TH_PUSH|TH_FIN)) ||
4880 tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) {
4881 if (tcp->tcp_rcv_list != NULL) {
4882 /*
4883 * Enqueue the new segment first and then
4884 * call tcp_rcv_drain() to send all data
4885 * up. The other way to do this is to
4886 * send all queued data up and then call
4887 * putnext() to send the new segment up.
4888 * This way can remove the else part later
4889 * on.
4890 *
4891 * We don't do this to avoid one more call to
4892 * canputnext() as tcp_rcv_drain() needs to
4893 * call canputnext().
4894 */
4895 tcp_rcv_enqueue(tcp, mp, seg_len,
4896 ira->ira_cred);
4897 flags |= tcp_rcv_drain(tcp);
4898 } else {
4899 if (is_system_labeled())
4900 tcp_setcred_data(mp, ira);
4901
4902 putnext(connp->conn_rq, mp);
4903 if (!canputnext(connp->conn_rq))
4904 tcp->tcp_rwnd -= seg_len;
4905 }
4906 } else {
4907 /*
4908 * Enqueue all packets when processing an mblk
4909 * from the co queue and also enqueue normal packets.
4910 */
4911 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
4912 }
4913 /*
4914 * Make sure the timer is running if we have data waiting
4915 * for a push bit. This provides resiliency against
4916 * implementations that do not correctly generate push bits.
4917 */
4918 if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) {
4919 /*
4920 * The connection may be closed at this point, so don't
4921 * do anything for a detached tcp.
4922 */
4923 if (!TCP_IS_DETACHED(tcp))
4924 tcp->tcp_push_tid = TCP_TIMER(tcp,
4925 tcp_push_timer,
4926 tcps->tcps_push_timer_interval);
4927 }
4928 }
4929
4930 xmit_check:
4931 /* Is there anything left to do? */
4932 ASSERT(!(flags & TH_MARKNEXT_NEEDED));
4933 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED|
4934 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
4935 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
4936 goto done;
4937
4938 /* Any transmit work to do and a non-zero window? */
4939 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4940 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4941 if (flags & TH_REXMIT_NEEDED) {
4942 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4943
4944 TCPS_BUMP_MIB(tcps, tcpOutFastRetrans);
4945 if (snd_size > mss)
4946 snd_size = mss;
4947 if (snd_size > tcp->tcp_swnd)
4948 snd_size = tcp->tcp_swnd;
4949 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4950 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4951 B_TRUE);
4952
4953 if (mp1 != NULL) {
4954 tcp->tcp_xmit_head->b_prev =
4955 (mblk_t *)(intptr_t)gethrtime();
4956 tcp->tcp_csuna = tcp->tcp_snxt;
4957 TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4958 TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4959 snd_size);
4960 tcp->tcp_cs.tcp_out_retrans_segs++;
4961 tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
4962 tcp_send_data(tcp, mp1);
4963 }
4964 }
4965 if (flags & TH_NEED_SACK_REXMIT) {
4966 tcp_sack_rexmit(tcp, &flags);
4967 }
4968 /*
4969 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4970 * out new segment. Note that tcp_rexmit should not be
4971 * set, otherwise TH_LIMIT_XMIT should not be set.
4972 */
4973 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4974 if (!tcp->tcp_rexmit) {
4975 tcp_wput_data(tcp, NULL, B_FALSE);
4976 } else {
4977 tcp_ss_rexmit(tcp);
4978 }
4979 }
4980 /*
4981 * Adjust tcp_cwnd back to normal value after sending
4982 * new data segments.
4983 */
4984 if (flags & TH_LIMIT_XMIT) {
4985 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4986 /*
4987 * This will restart the timer. Restarting the
4988 * timer is used to avoid a timeout before the
4989 * limited transmitted segment's ACK gets back.
4990 */
4991 if (tcp->tcp_xmit_head != NULL) {
4992 tcp->tcp_xmit_head->b_prev =
4993 (mblk_t *)(intptr_t)gethrtime();
4994 }
4995 }
4996
4997 /* Anything more to do? */
4998 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED|
4999 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
5000 goto done;
5001 }
5002 ack_check:
5003 if (flags & TH_SEND_URP_MARK) {
5004 ASSERT(tcp->tcp_urp_mark_mp);
5005 ASSERT(!IPCL_IS_NONSTR(connp));
5006 /*
5007 * Send up any queued data and then send the mark message
5008 */
5009 if (tcp->tcp_rcv_list != NULL) {
5010 flags |= tcp_rcv_drain(tcp);
5011
5012 }
5013 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
5014 mp1 = tcp->tcp_urp_mark_mp;
5015 tcp->tcp_urp_mark_mp = NULL;
5016 if (is_system_labeled())
5017 tcp_setcred_data(mp1, ira);
5018
5019 putnext(connp->conn_rq, mp1);
5020 #ifdef DEBUG
5021 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
5022 "tcp_rput: sending zero-length %s %s",
5023 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
5024 "MSGNOTMARKNEXT"),
5025 tcp_display(tcp, NULL, DISP_PORT_ONLY));
5026 #endif /* DEBUG */
5027 flags &= ~TH_SEND_URP_MARK;
5028 }
5029 if (flags & TH_ACK_NEEDED) {
5030 /*
5031 * Time to send an ack for some reason.
5032 */
5033 mp1 = tcp_ack_mp(tcp);
5034
5035 if (mp1 != NULL) {
5036 tcp_send_data(tcp, mp1);
5037 TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
5038 TCPS_BUMP_MIB(tcps, tcpOutAck);
5039 }
5040 if (tcp->tcp_ack_tid != 0) {
5041 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
5042 tcp->tcp_ack_tid = 0;
5043 }
5044 }
5045 if (flags & TH_ACK_TIMER_NEEDED) {
5046 /*
5047 * Arrange for deferred ACK or push wait timeout.
5048 * Start timer if it is not already running.
5049 */
5050 if (tcp->tcp_ack_tid == 0) {
5051 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer,
5052 tcp->tcp_localnet ?
5053 tcps->tcps_local_dack_interval :
5054 tcps->tcps_deferred_ack_interval);
5055 }
5056 }
5057 if (flags & TH_ORDREL_NEEDED) {
5058 /*
5059 * Notify upper layer about an orderly release. If this is
5060 * a non-STREAMS socket, then just make an upcall. For STREAMS
5061 * we send up an ordrel_ind, unless this is an eager, in which
5062 * case the ordrel will be sent when tcp_accept_finish runs.
5063 * Note that for non-STREAMS we make an upcall even if it is an
5064 * eager, because we have an upper handle to send it to.
5065 */
5066 ASSERT(IPCL_IS_NONSTR(connp) || tcp->tcp_listener == NULL);
5067 ASSERT(!tcp->tcp_detached);
5068
5069 if (IPCL_IS_NONSTR(connp)) {
5070 ASSERT(tcp->tcp_ordrel_mp == NULL);
5071 tcp->tcp_ordrel_done = B_TRUE;
5072 (*sockupcalls->su_opctl)(connp->conn_upper_handle,
5073 SOCK_OPCTL_SHUT_RECV, 0);
5074 goto done;
5075 }
5076
5077 if (tcp->tcp_rcv_list != NULL) {
5078 /*
5079 * Push any mblk(s) enqueued from co processing.
5080 */
5081 flags |= tcp_rcv_drain(tcp);
5082 }
5083 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
5084
5085 mp1 = tcp->tcp_ordrel_mp;
5086 tcp->tcp_ordrel_mp = NULL;
5087 tcp->tcp_ordrel_done = B_TRUE;
5088 putnext(connp->conn_rq, mp1);
5089 }
5090 done:
5091 ASSERT(!(flags & TH_MARKNEXT_NEEDED));
5092 }
5093
5094 /*
5095 * Attach ancillary data to a received TCP segment for the
5096 * ancillary pieces requested by the application that are
5097 * different from what they were in the previous data segment.
5098 *
5099 * Save the "current" values once memory allocation is ok so that
5100 * when memory allocation fails we can just wait for the next data segment.
5101 */
5102 static mblk_t *
5103 tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
5104 ip_recv_attr_t *ira)
5105 {
5106 struct T_optdata_ind *todi;
5107 int optlen;
5108 uchar_t *optptr;
5109 struct T_opthdr *toh;
5110 crb_t addflag; /* Which pieces to add */
5111 mblk_t *mp1;
5112 conn_t *connp = tcp->tcp_connp;
5113
5114 optlen = 0;
5115 addflag.crb_all = 0;
5116
5117 /* If app asked for TOS and it has changed ... */
5118 if (connp->conn_recv_ancillary.crb_recvtos &&
5119 ipp->ipp_type_of_service != tcp->tcp_recvtos &&
5120 (ira->ira_flags & IRAF_IS_IPV4)) {
5121 optlen += sizeof (struct T_opthdr) +
5122 P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE);
5123 addflag.crb_recvtos = 1;
5124 }
5125 /* If app asked for pktinfo and the index has changed ... */
5126 if (connp->conn_recv_ancillary.crb_ip_recvpktinfo &&
5127 ira->ira_ruifindex != tcp->tcp_recvifindex) {
5128 optlen += sizeof (struct T_opthdr) +
5129 sizeof (struct in6_pktinfo);
5130 addflag.crb_ip_recvpktinfo = 1;
5131 }
5132 /* If app asked for hoplimit and it has changed ... */
5133 if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit &&
5134 ipp->ipp_hoplimit != tcp->tcp_recvhops) {
5135 optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
5136 addflag.crb_ipv6_recvhoplimit = 1;
5137 }
5138 /* If app asked for tclass and it has changed ... */
5139 if (connp->conn_recv_ancillary.crb_ipv6_recvtclass &&
5140 ipp->ipp_tclass != tcp->tcp_recvtclass) {
5141 optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
5142 addflag.crb_ipv6_recvtclass = 1;
5143 }
5144
5145 /*
5146 * If app asked for hop-by-hop headers and it has changed ...
5147 * For security labels, note that (1) security labels can't change on
5148 * a connected socket at all, (2) we're connected to at most one peer,
5149 * (3) if anything changes, then it must be some other extra option.
5150 */
5151 if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts &&
5152 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen,
5153 (ipp->ipp_fields & IPPF_HOPOPTS),
5154 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) {
5155 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen;
5156 addflag.crb_ipv6_recvhopopts = 1;
5157 if (!ip_allocbuf((void **)&tcp->tcp_hopopts,
5158 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS),
5159 ipp->ipp_hopopts, ipp->ipp_hopoptslen))
5160 return (mp);
5161 }
5162 /* If app asked for dst headers before routing headers ... */
5163 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts &&
5164 ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen,
5165 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
5166 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) {
5167 optlen += sizeof (struct T_opthdr) +
5168 ipp->ipp_rthdrdstoptslen;
5169 addflag.crb_ipv6_recvrthdrdstopts = 1;
5170 if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts,
5171 &tcp->tcp_rthdrdstoptslen,
5172 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
5173 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen))
5174 return (mp);
5175 }
5176 /* If app asked for routing headers and it has changed ... */
5177 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr &&
5178 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen,
5179 (ipp->ipp_fields & IPPF_RTHDR),
5180 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) {
5181 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen;
5182 addflag.crb_ipv6_recvrthdr = 1;
5183 if (!ip_allocbuf((void **)&tcp->tcp_rthdr,
5184 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR),
5185 ipp->ipp_rthdr, ipp->ipp_rthdrlen))
5186 return (mp);
5187 }
5188 /* If app asked for dest headers and it has changed ... */
5189 if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts ||
5190 connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) &&
5191 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen,
5192 (ipp->ipp_fields & IPPF_DSTOPTS),
5193 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) {
5194 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen;
5195 addflag.crb_ipv6_recvdstopts = 1;
5196 if (!ip_allocbuf((void **)&tcp->tcp_dstopts,
5197 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS),
5198 ipp->ipp_dstopts, ipp->ipp_dstoptslen))
5199 return (mp);
5200 }
5201
5202 if (optlen == 0) {
5203 /* Nothing to add */
5204 return (mp);
5205 }
5206 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED);
5207 if (mp1 == NULL) {
5208 /*
5209 * Defer sending ancillary data until the next TCP segment
5210 * arrives.
5211 */
5212 return (mp);
5213 }
5214 mp1->b_cont = mp;
5215 mp = mp1;
5216 mp->b_wptr += sizeof (*todi) + optlen;
5217 mp->b_datap->db_type = M_PROTO;
5218 todi = (struct T_optdata_ind *)mp->b_rptr;
5219 todi->PRIM_type = T_OPTDATA_IND;
5220 todi->DATA_flag = 1; /* MORE data */
5221 todi->OPT_length = optlen;
5222 todi->OPT_offset = sizeof (*todi);
5223 optptr = (uchar_t *)&todi[1];
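/*
 * The message built here is laid out as:
 *
 *	M_PROTO mblk: [ T_optdata_ind | T_opthdr + value ... ]
 *	b_cont ---> original data mblk
 *
 * with one T_opthdr entry appended at optptr below for each
 * ancillary item that changed.
 */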
5224
5225 /* If app asked for TOS and it has changed ... */
5226 if (addflag.crb_recvtos) {
5227 toh = (struct T_opthdr *)optptr;
5228 toh->level = IPPROTO_IP;
5229 toh->name = IP_RECVTOS;
5230 toh->len = sizeof (*toh) +
5231 P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE);
5232 toh->status = 0;
5233 optptr += sizeof (*toh);
5234 *(uint8_t *)optptr = ipp->ipp_type_of_service;
5235 optptr = (uchar_t *)toh + toh->len;
5236 ASSERT(__TPI_TOPT_ISALIGNED(optptr));
5237 /* Save as "last" value */
5238 tcp->tcp_recvtos = ipp->ipp_type_of_service;
5239 }
5240
5241 /*
5242 * If app asked for pktinfo and the index has changed ...
5243 * Note that the local address never changes for the connection.
5244 */
5245 if (addflag.crb_ip_recvpktinfo) {
5246 struct in6_pktinfo *pkti;
5247 uint_t ifindex;
5248
5249 ifindex = ira->ira_ruifindex;
5250 toh = (struct T_opthdr *)optptr;
5251 toh->level = IPPROTO_IPV6;
5252 toh->name = IPV6_PKTINFO;
5253 toh->len = sizeof (*toh) + sizeof (*pkti);
5254 toh->status = 0;
5255 optptr += sizeof (*toh);
5256 pkti = (struct in6_pktinfo *)optptr;
5257 pkti->ipi6_addr = connp->conn_laddr_v6;
5258 pkti->ipi6_ifindex = ifindex;
5259 optptr += sizeof (*pkti);
5260 ASSERT(OK_32PTR(optptr));
5261 /* Save as "last" value */
5262 tcp->tcp_recvifindex = ifindex;
5263 }
5264 /* If app asked for hoplimit and it has changed ... */
5265 if (addflag.crb_ipv6_recvhoplimit) {
5266 toh = (struct T_opthdr *)optptr;
5267 toh->level = IPPROTO_IPV6;
5268 toh->name = IPV6_HOPLIMIT;
5269 toh->len = sizeof (*toh) + sizeof (uint_t);
5270 toh->status = 0;
5271 optptr += sizeof (*toh);
5272 *(uint_t *)optptr = ipp->ipp_hoplimit;
5273 optptr += sizeof (uint_t);
5274 ASSERT(OK_32PTR(optptr));
5275 /* Save as "last" value */
5276 tcp->tcp_recvhops = ipp->ipp_hoplimit;
5277 }
5278 /* If app asked for tclass and it has changed ... */
5279 if (addflag.crb_ipv6_recvtclass) {
5280 toh = (struct T_opthdr *)optptr;
5281 toh->level = IPPROTO_IPV6;
5282 toh->name = IPV6_TCLASS;
5283 toh->len = sizeof (*toh) + sizeof (uint_t);
5284 toh->status = 0;
5285 optptr += sizeof (*toh);
5286 *(uint_t *)optptr = ipp->ipp_tclass;
5287 optptr += sizeof (uint_t);
5288 ASSERT(OK_32PTR(optptr));
5289 /* Save as "last" value */
5290 tcp->tcp_recvtclass = ipp->ipp_tclass;
5291 }
5292 if (addflag.crb_ipv6_recvhopopts) {
5293 toh = (struct T_opthdr *)optptr;
5294 toh->level = IPPROTO_IPV6;
5295 toh->name = IPV6_HOPOPTS;
5296 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen;
5297 toh->status = 0;
5298 optptr += sizeof (*toh);
5299 bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen);
5300 optptr += ipp->ipp_hopoptslen;
5301 ASSERT(OK_32PTR(optptr));
5302 /* Save as last value */
5303 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen,
5304 (ipp->ipp_fields & IPPF_HOPOPTS),
5305 ipp->ipp_hopopts, ipp->ipp_hopoptslen);
5306 }
5307 if (addflag.crb_ipv6_recvrthdrdstopts) {
5308 toh = (struct T_opthdr *)optptr;
5309 toh->level = IPPROTO_IPV6;
5310 toh->name = IPV6_RTHDRDSTOPTS;
5311 toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen;
5312 toh->status = 0;
5313 optptr += sizeof (*toh);
5314 bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen);
5315 optptr += ipp->ipp_rthdrdstoptslen;
5316 ASSERT(OK_32PTR(optptr));
5317 /* Save as last value */
5318 ip_savebuf((void **)&tcp->tcp_rthdrdstopts,
5319 &tcp->tcp_rthdrdstoptslen,
5320 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
5321 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
5322 }
5323 if (addflag.crb_ipv6_recvrthdr) {
5324 toh = (struct T_opthdr *)optptr;
5325 toh->level = IPPROTO_IPV6;
5326 toh->name = IPV6_RTHDR;
5327 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen;
5328 toh->status = 0;
5329 optptr += sizeof (*toh);
5330 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen);
5331 optptr += ipp->ipp_rthdrlen;
5332 ASSERT(OK_32PTR(optptr));
5333 /* Save as last value */
5334 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen,
5335 (ipp->ipp_fields & IPPF_RTHDR),
5336 ipp->ipp_rthdr, ipp->ipp_rthdrlen);
5337 }
5338 if (addflag.crb_ipv6_recvdstopts) {
5339 toh = (struct T_opthdr *)optptr;
5340 toh->level = IPPROTO_IPV6;
5341 toh->name = IPV6_DSTOPTS;
5342 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen;
5343 toh->status = 0;
5344 optptr += sizeof (*toh);
5345 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen);
5346 optptr += ipp->ipp_dstoptslen;
5347 ASSERT(OK_32PTR(optptr));
5348 /* Save as last value */
5349 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen,
5350 (ipp->ipp_fields & IPPF_DSTOPTS),
5351 ipp->ipp_dstopts, ipp->ipp_dstoptslen);
5352 }
5353 ASSERT(optptr == mp->b_wptr);
5354 return (mp);
5355 }
5356
/*
 * The minimum value of the smoothed mean deviation (stored as 4 * mdev)
 * used in the RTO calculation, in nanoseconds.
 */
#define	TCP_SD_MIN	400000000

/*
 * Set RTO for this connection based on a new round-trip time measurement.
 * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
 * in SIGCOMM '88.  The variable names are the same as those in Appendix A.2
 * of that paper.
 *
 * m = new measurement
 * sa = smoothed RTT average (8 * average estimates).
 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
 */
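/*
 * Worked example (hypothetical values, shown in ms for readability; the
 * code operates in nanoseconds): suppose sa = 800 (SRTT = 100 ms),
 * sv = 40 (mdev = 10 ms), and a new measurement m = 120 ms arrives.
 * Then Error = m - sa/8 = 20 ms, so sa becomes 820 (SRTT = 102.5 ms) and
 * sv becomes 40 - 40/4 + |20| = 50 (mdev = 12.5 ms).  An RTO based on
 * SRTT + 4 * mdev would then be 102.5 + 50 = 152.5 ms, before any extra
 * terms or clamping applied by tcp_calculate_rto().
 */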
static void
tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
{
	hrtime_t	m = rtt;
	hrtime_t	sa = tcp->tcp_rtt_sa;
	hrtime_t	sv = tcp->tcp_rtt_sd;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	TCPS_BUMP_MIB(tcps, tcpRttUpdate);
	tcp->tcp_rtt_update++;
	tcp->tcp_rtt_sum += m;
	tcp->tcp_rtt_cnt++;

	/* A nonzero tcp_rtt_sa means we already have an estimate to update. */
	if (sa != 0) {
		/*
		 * Update average estimator (see section 2.3 of RFC6298):
		 *	SRTT = 7/8 SRTT + 1/8 rtt
		 *
		 * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
		 *	tcp_rtt_sa = 7 * SRTT + rtt
		 *	tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
		 *	tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
		 *	tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
		 *	tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
		 *	tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
		 *
		 * (rtt - tcp_rtt_sa / 8) is simply the difference
		 * between the new rtt measurement and the existing smoothed
		 * RTT average.  This is referred to as "Error" in subsequent
		 * calculations.
		 */

		/* m is now Error. */
		m -= sa >> 3;
		if ((sa += m) <= 0) {
			/*
			 * Don't allow the smoothed average to be negative.
			 * We use 0 to denote reinitialization of the
			 * variables.
			 */
			sa = 1;
		}

		/*
		 * Update deviation estimator:
		 *	mdev = 3/4 mdev + 1/4 abs(Error)
		 *
		 * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
		 *	tcp_rtt_sd = 3 * mdev + abs(Error)
		 *	tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
		 *	tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
		 *	tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
		 */
		if (m < 0)
			m = -m;
		m -= sv >> 2;
		sv += m;
	} else {
		/*
		 * This follows BSD's implementation.  So the reinitialized
		 * RTO is 3 * m.  We cannot go less than 2 because if the
		 * link is bandwidth dominated, doubling the window size
		 * during slow start means doubling the RTT.  We want to be
		 * more conservative when we reinitialize our estimates.  3
		 * is just a convenient number.
		 */
		sa = m << 3;
		sv = m << 1;
	}
	if (sv < TCP_SD_MIN) {
		/*
		 * Since a receiver doesn't delay its ACKs during a long run of
		 * segments, sa may not have captured the effect of delayed ACK
		 * timeouts on the RTT.  To make sure we always account for the
		 * possible delay (and avoid the unnecessary retransmission),
		 * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
		 * 200ms on older SunOS/BSD systems and modern Windows systems
		 * (as of 2019).  This means that the minimum possible mean
		 * deviation is 100 ms.
		 */
		sv = TCP_SD_MIN;
	}
	tcp->tcp_rtt_sa = sa;
	tcp->tcp_rtt_sd = sv;

	tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);

	/* Now, we can reset tcp_timer_backoff to use the new RTO... */
	tcp->tcp_timer_backoff = 0;
}

/*
 * On a labeled system we have some protocols above TCP, such as RPC, which
 * appear to assume that every mblk in a chain has a db_credp.
 */
static void
tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
{
	ASSERT(is_system_labeled());
	ASSERT(ira->ira_cred != NULL);

	while (mp != NULL) {
		mblk_setcred(mp, ira->ira_cred, NOPID);
		mp = mp->b_cont;
	}
}

uint_t
tcp_rwnd_reopen(tcp_t *tcp)
{
	uint_t	ret = 0;
	uint_t	thwin;
	conn_t	*connp = tcp->tcp_connp;

	/* Learn the latest rwnd information that we sent to the other side. */
	thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win))
	    << tcp->tcp_rcv_ws;
	/* This is peer's calculated send window (our receive window). */
	thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
	/*
	 * Increase the receive window to max.  But we need to do receiver
	 * SWS avoidance.  This means that we need to check that the increase
	 * of the receive window is at least 1 MSS.
	 */
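	/*
	 * For example (hypothetical numbers): with an MSS of 1460 bytes,
	 * conn_rcvbuf of 64 KB, and thwin of 63 KB, the window would grow by
	 * less than 1 MSS, so we keep the old advertisement rather than
	 * invite silly window syndrome.
	 */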
	if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) {
		/*
		 * If the window that the other side knows is less than the
		 * maximum number of deferred-ACK segments, send an update
		 * immediately.
		 */
		if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
			TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutWinUpdate);
			ret = TH_ACK_NEEDED;
		}
		tcp->tcp_rwnd = connp->conn_rcvbuf;
	}
	return (ret);
}

/*
 * Handle a packet that has been reclassified by TCP.
 * This function drops the ref on connp that the caller had.
 */
void
tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;

	if (connp->conn_incoming_ifindex != 0 &&
	    connp->conn_incoming_ifindex != ira->ira_ruifindex) {
		freemsg(mp);
		CONN_DEC_REF(connp);
		return;
	}

	if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
	    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
		ip6_t *ip6h;
		ipha_t *ipha;

		if (ira->ira_flags & IRAF_IS_IPV4) {
			ipha = (ipha_t *)mp->b_rptr;
			ip6h = NULL;
		} else {
			ipha = NULL;
			ip6h = (ip6_t *)mp->b_rptr;
		}
		mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira);
		if (mp == NULL) {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
			/* Note that mp is NULL */
			ip_drop_input("ipIfStatsInDiscards", mp, NULL);
			CONN_DEC_REF(connp);
			return;
		}
	}

	if (IPCL_IS_TCP(connp)) {
		/*
		 * Do not drain; certain use cases can blow
		 * the stack.
		 */
		SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
		    connp->conn_recv, connp, ira,
		    SQ_NODRAIN, SQTAG_IP_TCP_INPUT);
	} else {
		/* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
		(connp->conn_recv)(connp, mp, NULL, ira);
		CONN_DEC_REF(connp);
	}
}

/* ARGSUSED */
static void
tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;
	queue_t	*q = connp->conn_rq;

	ASSERT(!IPCL_IS_NONSTR(connp));
	mutex_enter(&tcp->tcp_rsrv_mp_lock);
	tcp->tcp_rsrv_mp = mp;
	mutex_exit(&tcp->tcp_rsrv_mp_lock);

	if (TCP_IS_DETACHED(tcp) || q == NULL) {
		return;
	}

	if (tcp->tcp_fused) {
		tcp_fuse_backenable(tcp);
		return;
	}

	if (canputnext(q)) {
		/* Not flow-controlled, open rwnd */
		tcp->tcp_rwnd = connp->conn_rcvbuf;

		/*
		 * Send back a window update immediately if TCP is above
		 * ESTABLISHED state and the increase of the rcv window
		 * that the other side knows is at least 1 MSS after flow
		 * control is lifted.
		 */
		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
			tcp_xmit_ctl(NULL, tcp,
			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
		}
	}
}

/*
 * The read side service routine is called mostly when we get back-enabled as a
 * result of flow control relief.  Since we don't actually queue anything in
 * TCP, we have no data to send out of here.  What we do is reopen the receive
 * window, and send out a window update.
 */
int
tcp_rsrv(queue_t *q)
{
	conn_t	*connp = Q_TO_CONN(q);
	tcp_t	*tcp = connp->conn_tcp;
	mblk_t	*mp;

	/* No code does a putq on the read side */
	ASSERT(q->q_first == NULL);

	/*
	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already
	 * been run.  So just return.
	 */
	mutex_enter(&tcp->tcp_rsrv_mp_lock);
	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
		mutex_exit(&tcp->tcp_rsrv_mp_lock);
		return (0);
	}
	tcp->tcp_rsrv_mp = NULL;
	mutex_exit(&tcp->tcp_rsrv_mp_lock);
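
	/*
	 * Hand the preallocated tcp_rsrv_mp over to the squeue;
	 * tcp_rsrv_input() stores it back into tcp_rsrv_mp under
	 * tcp_rsrv_mp_lock, so the next back-enable can reuse it without
	 * allocating.
	 */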
	CONN_INC_REF(connp);
	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp,
	    NULL, SQ_PROCESS, SQTAG_TCP_RSRV);
	return (0);
}

/* At minimum we need 8 bytes in the TCP header for the lookup */
#define	ICMP_MIN_TCP_HDR	8
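/*
 * Eight bytes cover the TCP source and destination ports plus the sequence
 * number, which is all the connection lookup and the sequence checks below
 * require; RFC 792 only guarantees that the first 8 bytes of the offending
 * datagram's payload are echoed in an ICMP error.
 */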

/*
 * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
 * passed up by IP.  The message is always received on the correct tcp_t.
 * Assumes that IP has pulled up everything up to and including the ICMP header.
 */
/* ARGSUSED2 */
void
tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
	conn_t		*connp = (conn_t *)arg1;
	icmph_t		*icmph;
	ipha_t		*ipha;
	int		iph_hdr_length;
	tcpha_t		*tcpha;
	uint32_t	seg_seq;
	tcp_t		*tcp = connp->conn_tcp;

	/* Assume IP provides aligned packets */
	ASSERT(OK_32PTR(mp->b_rptr));
	ASSERT((MBLKL(mp) >= sizeof (ipha_t)));

	/*
	 * It's possible we have a closed, but not yet destroyed, TCP
	 * connection.  Several fields (e.g. conn_ixa->ixa_ire) are invalid
	 * in the closed state, so don't take any chances and drop the packet.
	 */
	if (tcp->tcp_state == TCPS_CLOSED) {
		freemsg(mp);
		return;
	}

	/*
	 * Verify the IP version.  Anything other than an IPv4 or IPv6 packet
	 * is sent upstream.  ICMPv6 is handled in tcp_icmp_error_ipv6.
	 */
	if (!(ira->ira_flags & IRAF_IS_IPV4)) {
		tcp_icmp_error_ipv6(tcp, mp, ira);
		return;
	}

	/* Skip past the outer IP and ICMP headers */
	iph_hdr_length = ira->ira_ip_hdr_length;
	icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
	/*
	 * If we don't have the correct outer IP header length,
	 * or if we don't have a complete inner IP header, drop it.
	 */
	if (iph_hdr_length < sizeof (ipha_t) ||
	    (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
noticmpv4:
		freemsg(mp);
		return;
	}
	ipha = (ipha_t *)&icmph[1];

	/* Skip past the inner IP and find the ULP header */
	iph_hdr_length = IPH_HDR_LENGTH(ipha);
	tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
	/*
	 * If we don't have the correct inner IP header length, or if the ULP
	 * is not IPPROTO_TCP, or if we don't have at least ICMP_MIN_TCP_HDR
	 * bytes of TCP header, drop it.
	 */
	if (iph_hdr_length < sizeof (ipha_t) ||
	    ipha->ipha_protocol != IPPROTO_TCP ||
	    (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
		goto noticmpv4;
	}

	seg_seq = ntohl(tcpha->tha_seq);
	switch (icmph->icmph_type) {
	case ICMP_DEST_UNREACHABLE:
		switch (icmph->icmph_code) {
		case ICMP_FRAGMENTATION_NEEDED:
			/*
			 * Update Path MTU, then try to send something out.
			 */
			tcp_update_pmtu(tcp, B_TRUE);
			tcp_rexmit_after_error(tcp);
			break;
		case ICMP_PORT_UNREACHABLE:
		case ICMP_PROTOCOL_UNREACHABLE:
			switch (tcp->tcp_state) {
			case TCPS_SYN_SENT:
			case TCPS_SYN_RCVD:
				/*
				 * ICMP can snipe away incipient
				 * TCP connections as long as
				 * seq number is same as initial
				 * send seq number.
				 */
				if (seg_seq == tcp->tcp_iss) {
					(void) tcp_clean_death(tcp,
					    ECONNREFUSED);
				}
				break;
			}
			break;
		case ICMP_HOST_UNREACHABLE:
		case ICMP_NET_UNREACHABLE:
			/* Record the error in case we finally time out. */
			if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
				tcp->tcp_client_errno = EHOSTUNREACH;
			else
				tcp->tcp_client_errno = ENETUNREACH;
			if (tcp->tcp_state == TCPS_SYN_RCVD) {
				if (tcp->tcp_listener != NULL &&
				    tcp->tcp_listener->tcp_syn_defense) {
					/*
					 * Ditch the half-open connection if we
					 * suspect a SYN attack is under way.
					 */
					(void) tcp_clean_death(tcp,
					    tcp->tcp_client_errno);
				}
			}
			break;
		default:
			break;
		}
		break;
	case ICMP_SOURCE_QUENCH: {
		/*
		 * Use a global boolean to control
		 * whether TCP should respond to ICMP_SOURCE_QUENCH.
		 * The default is false.
		 */
		if (tcp_icmp_source_quench) {
			/*
			 * Reduce the sending rate as if we got a
			 * retransmit timeout.
			 */
			uint32_t npkt;

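			/*
			 * Set ssthresh to half the data currently in flight
			 * (in whole segments, but at least 2 MSS), then drop
			 * cwnd to one MSS so the connection re-enters slow
			 * start.
			 */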
			npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
			    tcp->tcp_mss;
			tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;

			DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
			    uint32_t, tcp->tcp_cwnd,
			    uint32_t, tcp->tcp_mss);
			tcp->tcp_cwnd = tcp->tcp_mss;
			tcp->tcp_cwnd_cnt = 0;
		}
		break;
	}
	}
	freemsg(mp);
}

/*
 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
 * error messages passed up by IP.
 * Assumes that IP has pulled up all the extension headers as well
 * as the ICMPv6 header.
 */
static void
tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
{
	icmp6_t		*icmp6;
	ip6_t		*ip6h;
	uint16_t	iph_hdr_length = ira->ira_ip_hdr_length;
	tcpha_t		*tcpha;
	uint8_t		*nexthdrp;
	uint32_t	seg_seq;

	/*
	 * Verify that we have a complete IP header.
	 */
	ASSERT((MBLKL(mp) >= sizeof (ip6_t)));

	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
	ip6h = (ip6_t *)&icmp6[1];
	/*
	 * Verify that we have a complete ICMP and inner IP header.
	 */
	if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
noticmpv6:
		freemsg(mp);
		return;
	}

	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
		goto noticmpv6;
	tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
	/*
	 * Validate the inner header.  If the ULP is not IPPROTO_TCP, or if we
	 * don't have at least ICMP_MIN_TCP_HDR bytes of TCP header, drop the
	 * packet.
	 */
	if ((*nexthdrp != IPPROTO_TCP) ||
	    ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
		goto noticmpv6;
	}

	seg_seq = ntohl(tcpha->tha_seq);
	switch (icmp6->icmp6_type) {
	case ICMP6_PACKET_TOO_BIG:
		/*
		 * Update Path MTU, then try to send something out.
		 */
		tcp_update_pmtu(tcp, B_TRUE);
		tcp_rexmit_after_error(tcp);
		break;
	case ICMP6_DST_UNREACH:
		switch (icmp6->icmp6_code) {
		case ICMP6_DST_UNREACH_NOPORT:
			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
			    (seg_seq == tcp->tcp_iss)) {
				(void) tcp_clean_death(tcp, ECONNREFUSED);
			}
			break;
		case ICMP6_DST_UNREACH_ADMIN:
		case ICMP6_DST_UNREACH_NOROUTE:
		case ICMP6_DST_UNREACH_BEYONDSCOPE:
		case ICMP6_DST_UNREACH_ADDR:
			/* Record the error in case we finally time out. */
			tcp->tcp_client_errno = EHOSTUNREACH;
			if (((tcp->tcp_state == TCPS_SYN_SENT) ||
			    (tcp->tcp_state == TCPS_SYN_RCVD)) &&
			    (seg_seq == tcp->tcp_iss)) {
				if (tcp->tcp_listener != NULL &&
				    tcp->tcp_listener->tcp_syn_defense) {
					/*
					 * Ditch the half-open connection if we
					 * suspect a SYN attack is under way.
					 */
					(void) tcp_clean_death(tcp,
					    tcp->tcp_client_errno);
				}
			}
			break;
		default:
			break;
		}
		break;
	case ICMP6_PARAM_PROB:
		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
		    (uchar_t *)nexthdrp) {
			if (tcp->tcp_state == TCPS_SYN_SENT ||
			    tcp->tcp_state == TCPS_SYN_RCVD) {
				(void) tcp_clean_death(tcp, ECONNREFUSED);
			}
			break;
		}
		break;

	case ICMP6_TIME_EXCEEDED:
	default:
		break;
	}
	freemsg(mp);
}

/*
 * CALLED OUTSIDE OF SQUEUE!  It cannot follow any pointers that tcp might
 * change.  But it can refer to fields like tcp_suna and tcp_snxt.
 *
 * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
 * error messages received by IP.  The message is always received on the
 * correct tcp_t.
 */
/* ARGSUSED */
boolean_t
tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
    ip_recv_attr_t *ira)
{
	tcpha_t		*tcpha = (tcpha_t *)arg2;
	uint32_t	seq = ntohl(tcpha->tha_seq);
	tcp_t		*tcp = connp->conn_tcp;

	/*
	 * The TCP sequence number contained in the payload of the ICMP error
	 * message should be within the range SND.UNA <= SEG.SEQ < SND.NXT.
	 * Otherwise, the message is either a stale ICMP error, or an attack
	 * from the network.  Fail the verification.
	 */
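	/*
	 * For example (hypothetical values): with SND.UNA = 1000 and
	 * SND.NXT = 5000, only an ICMP error quoting a sequence number in
	 * 1000..4999 passes this check.
	 */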
	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
		return (B_FALSE);

	/* For "too big" we also check the ignore flag */
	if (ira->ira_flags & IRAF_IS_IPV4) {
		ASSERT(icmph != NULL);
		if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
		    icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
		    tcp->tcp_tcps->tcps_ignore_path_mtu)
			return (B_FALSE);
	} else {
		ASSERT(icmp6 != NULL);
		if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
		    tcp->tcp_tcps->tcps_ignore_path_mtu)
			return (B_FALSE);
	}
	return (B_TRUE);
}