17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5ee4701baSericheng  * Common Development and Distribution License (the "License").
6ee4701baSericheng  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
2266cd0f60SKacheong Poon  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
2378a2e113SAndy Fiddaman  * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
24*041297c2SDan McDonald  * Copyright 2022 Joyent, Inc.
257c478bd9Sstevel@tonic-gate  */
267c478bd9Sstevel@tonic-gate 
277c478bd9Sstevel@tonic-gate /*
287c478bd9Sstevel@tonic-gate  * IP PACKET CLASSIFIER
297c478bd9Sstevel@tonic-gate  *
307c478bd9Sstevel@tonic-gate  * The IP packet classifier provides mapping between IP packets and persistent
317c478bd9Sstevel@tonic-gate  * connection state for connection-oriented protocols. It also provides
327c478bd9Sstevel@tonic-gate  * interface for managing connection states.
337c478bd9Sstevel@tonic-gate  *
347c478bd9Sstevel@tonic-gate  * The connection state is kept in conn_t data structure and contains, among
357c478bd9Sstevel@tonic-gate  * other things:
367c478bd9Sstevel@tonic-gate  *
377c478bd9Sstevel@tonic-gate  *	o local/remote address and ports
387c478bd9Sstevel@tonic-gate  *	o Transport protocol
397c478bd9Sstevel@tonic-gate  *	o squeue for the connection (for TCP only)
407c478bd9Sstevel@tonic-gate  *	o reference counter
417c478bd9Sstevel@tonic-gate  *	o Connection state
427c478bd9Sstevel@tonic-gate  *	o hash table linkage
437c478bd9Sstevel@tonic-gate  *	o interface/ire information
447c478bd9Sstevel@tonic-gate  *	o credentials
457c478bd9Sstevel@tonic-gate  *	o ipsec policy
467c478bd9Sstevel@tonic-gate  *	o send and receive functions.
477c478bd9Sstevel@tonic-gate  *	o mutex lock.
487c478bd9Sstevel@tonic-gate  *
497c478bd9Sstevel@tonic-gate  * Connections use a reference counting scheme. They are freed when the
507c478bd9Sstevel@tonic-gate  * reference counter drops to zero. A reference is incremented when connection
517c478bd9Sstevel@tonic-gate  * is placed in a list or table, when incoming packet for the connection arrives
527c478bd9Sstevel@tonic-gate  * and when connection is processed via squeue (squeue processing may be
537c478bd9Sstevel@tonic-gate  * asynchronous and the reference protects the connection from being destroyed
547c478bd9Sstevel@tonic-gate  * before its processing is finished).
557c478bd9Sstevel@tonic-gate  *
 * conn_recv is used to pass up packets to the ULP.
 * For TCP, conn_recv changes over time: it is tcp_input_listener_unbound
 * initially for a listener, and changes to tcp_input_listener once the
 * listener has picked a good squeue. For other cases it is set to
 * tcp_input_data.
60bd670b35SErik Nordmark  *
61bd670b35SErik Nordmark  * conn_recvicmp is used to pass up ICMP errors to the ULP.
627c478bd9Sstevel@tonic-gate  *
637c478bd9Sstevel@tonic-gate  * Classifier uses several hash tables:
647c478bd9Sstevel@tonic-gate  *
6578a2e113SAndy Fiddaman  *	ipcl_conn_fanout:	contains all TCP connections in CONNECTED state
667c478bd9Sstevel@tonic-gate  *	ipcl_bind_fanout:	contains all connections in BOUND state
677c478bd9Sstevel@tonic-gate  *	ipcl_proto_fanout:	IPv4 protocol fanout
687c478bd9Sstevel@tonic-gate  *	ipcl_proto_fanout_v6:	IPv6 protocol fanout
697c478bd9Sstevel@tonic-gate  *	ipcl_udp_fanout:	contains all UDP connections
702b24ab6bSSebastien Roy  *	ipcl_iptun_fanout:	contains all IP tunnel connections
717c478bd9Sstevel@tonic-gate  *	ipcl_globalhash_fanout:	contains all connections
727c478bd9Sstevel@tonic-gate  *
737c478bd9Sstevel@tonic-gate  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
747c478bd9Sstevel@tonic-gate  * which need to view all existing connections.
757c478bd9Sstevel@tonic-gate  *
767c478bd9Sstevel@tonic-gate  * All tables are protected by per-bucket locks. When both per-bucket lock and
777c478bd9Sstevel@tonic-gate  * connection lock need to be held, the per-bucket lock should be acquired
787c478bd9Sstevel@tonic-gate  * first, followed by the connection lock.
797c478bd9Sstevel@tonic-gate  *
807c478bd9Sstevel@tonic-gate  * All functions doing search in one of these tables increment a reference
817c478bd9Sstevel@tonic-gate  * counter on the connection found (if any). This reference should be dropped
827c478bd9Sstevel@tonic-gate  * when the caller has finished processing the connection.
837c478bd9Sstevel@tonic-gate  *
847c478bd9Sstevel@tonic-gate  *
857c478bd9Sstevel@tonic-gate  * INTERFACES:
867c478bd9Sstevel@tonic-gate  * ===========
877c478bd9Sstevel@tonic-gate  *
887c478bd9Sstevel@tonic-gate  * Connection Lookup:
897c478bd9Sstevel@tonic-gate  * ------------------
907c478bd9Sstevel@tonic-gate  *
91bd670b35SErik Nordmark  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
92bd670b35SErik Nordmark  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
937c478bd9Sstevel@tonic-gate  *
947c478bd9Sstevel@tonic-gate  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
957c478bd9Sstevel@tonic-gate  * it can't find any associated connection. If the connection is found, its
967c478bd9Sstevel@tonic-gate  * reference counter is incremented.
977c478bd9Sstevel@tonic-gate  *
987c478bd9Sstevel@tonic-gate  *	mp:	mblock, containing packet header. The full header should fit
997c478bd9Sstevel@tonic-gate  *		into a single mblock. It should also contain at least full IP
1007c478bd9Sstevel@tonic-gate  *		and TCP or UDP header.
1017c478bd9Sstevel@tonic-gate  *
1027c478bd9Sstevel@tonic-gate  *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
1037c478bd9Sstevel@tonic-gate  *
1047c478bd9Sstevel@tonic-gate  *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
1057c478bd9Sstevel@tonic-gate  *		 the packet.
1067c478bd9Sstevel@tonic-gate  *
10778a2e113SAndy Fiddaman  *	ira->ira_zoneid: The zone in which the returned connection must be; the
108bd670b35SErik Nordmark  *		zoneid corresponding to the ire_zoneid on the IRE located for
109bd670b35SErik Nordmark  *		the packet's destination address.
110bd670b35SErik Nordmark  *
111bd670b35SErik Nordmark  *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
112bd670b35SErik Nordmark  *		IRAF_TX_SHARED_ADDR flags
1137c478bd9Sstevel@tonic-gate  *
1147c478bd9Sstevel@tonic-gate  *	For TCP connections, the lookup order is as follows:
1157c478bd9Sstevel@tonic-gate  *		5-tuple {src, dst, protocol, local port, remote port}
1167c478bd9Sstevel@tonic-gate  *			lookup in ipcl_conn_fanout table.
1177c478bd9Sstevel@tonic-gate  *		3-tuple {dst, remote port, protocol} lookup in
1187c478bd9Sstevel@tonic-gate  *			ipcl_bind_fanout table.
1197c478bd9Sstevel@tonic-gate  *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients; that case is handled in IP itself.
1247c478bd9Sstevel@tonic-gate  *
12545916cd2Sjpk  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
12645916cd2Sjpk  * determine which actual zone gets the segment.  This is used only in a
12745916cd2Sjpk  * labeled environment.  The matching rules are:
12845916cd2Sjpk  *
12945916cd2Sjpk  *	- If it's not a multilevel port, then the label on the packet selects
13045916cd2Sjpk  *	  the zone.  Unlabeled packets are delivered to the global zone.
13145916cd2Sjpk  *
13245916cd2Sjpk  *	- If it's a multilevel port, then only the zone registered to receive
13345916cd2Sjpk  *	  packets on that port matches.
13445916cd2Sjpk  *
13545916cd2Sjpk  * Also, in a labeled environment, packet labels need to be checked.  For fully
13645916cd2Sjpk  * bound TCP connections, we can assume that the packet label was checked
13745916cd2Sjpk  * during connection establishment, and doesn't need to be checked on each
13845916cd2Sjpk  * packet.  For others, though, we need to check for strict equality or, for
13945916cd2Sjpk  * multilevel ports, membership in the range or set.  This part currently does
14045916cd2Sjpk  * a tnrh lookup on each packet, but could be optimized to use cached results
14145916cd2Sjpk  * if that were necessary.  (SCTP doesn't come through here, but if it did,
14245916cd2Sjpk  * we would apply the same rules as TCP.)
14345916cd2Sjpk  *
14445916cd2Sjpk  * An implication of the above is that fully-bound TCP sockets must always use
14545916cd2Sjpk  * distinct 4-tuples; they can't be discriminated by label alone.
14645916cd2Sjpk  *
14745916cd2Sjpk  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
14845916cd2Sjpk  * as there's no connection set-up handshake and no shared state.
14945916cd2Sjpk  *
15045916cd2Sjpk  * Labels on looped-back packets within a single zone do not need to be
15145916cd2Sjpk  * checked, as all processes in the same zone have the same label.
15245916cd2Sjpk  *
15345916cd2Sjpk  * Finally, for unlabeled packets received by a labeled system, special rules
15445916cd2Sjpk  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
15545916cd2Sjpk  * socket in the zone whose label matches the default label of the sender, if
15645916cd2Sjpk  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
15745916cd2Sjpk  * receiver's label must dominate the sender's default label.
15845916cd2Sjpk  *
159bd670b35SErik Nordmark  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
160f4b3ec61Sdh  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
161f4b3ec61Sdh  *					 ip_stack);
1627c478bd9Sstevel@tonic-gate  *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The addresses
 *	and ports are read from the IP and TCP headers respectively.
1667c478bd9Sstevel@tonic-gate  *
167f4b3ec61Sdh  * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
168f4b3ec61Sdh  *					 zoneid, ip_stack);
169f4b3ec61Sdh  * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
170f4b3ec61Sdh  *					 zoneid, ip_stack);
1717c478bd9Sstevel@tonic-gate  *
17278a2e113SAndy Fiddaman  *	Lookup routine to find a listener with the tuple {lport, laddr,
17378a2e113SAndy Fiddaman  *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
17478a2e113SAndy Fiddaman  *	parameter interface index is also compared.
1757c478bd9Sstevel@tonic-gate  *
176f4b3ec61Sdh  * void ipcl_walk(func, arg, ip_stack)
1777c478bd9Sstevel@tonic-gate  *
17878a2e113SAndy Fiddaman  *	Apply 'func' to every connection available. The 'func' is called as
1797c478bd9Sstevel@tonic-gate  *	(*func)(connp, arg). The walk is non-atomic so connections may be
1807c478bd9Sstevel@tonic-gate  *	created and destroyed during the walk. The CONN_CONDEMNED and
1817c478bd9Sstevel@tonic-gate  *	CONN_INCIPIENT flags ensure that connections which are newly created
1827c478bd9Sstevel@tonic-gate  *	or being destroyed are not selected by the walker.
1837c478bd9Sstevel@tonic-gate  *
1847c478bd9Sstevel@tonic-gate  * Table Updates
1857c478bd9Sstevel@tonic-gate  * -------------
1867c478bd9Sstevel@tonic-gate  *
187bd670b35SErik Nordmark  * int ipcl_conn_insert(connp);
188bd670b35SErik Nordmark  * int ipcl_conn_insert_v4(connp);
189bd670b35SErik Nordmark  * int ipcl_conn_insert_v6(connp);
1907c478bd9Sstevel@tonic-gate  *
1917c478bd9Sstevel@tonic-gate  *	Insert 'connp' in the ipcl_conn_fanout.
19278a2e113SAndy Fiddaman  *	Arguments :
1937c478bd9Sstevel@tonic-gate  *		connp		conn_t to be inserted
1947c478bd9Sstevel@tonic-gate  *
1957c478bd9Sstevel@tonic-gate  *	Return value :
1967c478bd9Sstevel@tonic-gate  *		0		if connp was inserted
1977c478bd9Sstevel@tonic-gate  *		EADDRINUSE	if the connection with the same tuple
1987c478bd9Sstevel@tonic-gate  *				already exists.
1997c478bd9Sstevel@tonic-gate  *
200bd670b35SErik Nordmark  * int ipcl_bind_insert(connp);
201bd670b35SErik Nordmark  * int ipcl_bind_insert_v4(connp);
202bd670b35SErik Nordmark  * int ipcl_bind_insert_v6(connp);
2037c478bd9Sstevel@tonic-gate  *
20478a2e113SAndy Fiddaman  *	Insert 'connp' in ipcl_bind_fanout.
20578a2e113SAndy Fiddaman  *	Arguments :
20678a2e113SAndy Fiddaman  *		connp		conn_t to be inserted
2077c478bd9Sstevel@tonic-gate  *
2087c478bd9Sstevel@tonic-gate  *
2097c478bd9Sstevel@tonic-gate  * void ipcl_hash_remove(connp);
2107c478bd9Sstevel@tonic-gate  *
21178a2e113SAndy Fiddaman  *	Removes the 'connp' from the connection fanout table.
2127c478bd9Sstevel@tonic-gate  *
2137c478bd9Sstevel@tonic-gate  * Connection Creation/Destruction
2147c478bd9Sstevel@tonic-gate  * -------------------------------
2157c478bd9Sstevel@tonic-gate  *
216f4b3ec61Sdh  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
2177c478bd9Sstevel@tonic-gate  *
21878a2e113SAndy Fiddaman  *	Creates a new conn based on the type flag, inserts it into
21978a2e113SAndy Fiddaman  *	globalhash table.
2207c478bd9Sstevel@tonic-gate  *
2217c478bd9Sstevel@tonic-gate  *	type:	This flag determines the type of conn_t which needs to be
222fc80c0dfSnordmark  *		created i.e., which kmem_cache it comes from.
2237c478bd9Sstevel@tonic-gate  *		IPCL_TCPCONN	indicates a TCP connection
224fc80c0dfSnordmark  *		IPCL_SCTPCONN	indicates a SCTP connection
225fc80c0dfSnordmark  *		IPCL_UDPCONN	indicates a UDP conn_t.
226fc80c0dfSnordmark  *		IPCL_RAWIPCONN	indicates a RAWIP/ICMP conn_t.
227fc80c0dfSnordmark  *		IPCL_RTSCONN	indicates a RTS conn_t.
228fc80c0dfSnordmark  *		IPCL_IPCCONN	indicates all other connections.
2297c478bd9Sstevel@tonic-gate  *
2307c478bd9Sstevel@tonic-gate  * void ipcl_conn_destroy(connp)
2317c478bd9Sstevel@tonic-gate  *
23278a2e113SAndy Fiddaman  *	Destroys the connection state, removes it from the global
23378a2e113SAndy Fiddaman  *	connection hash table and frees its memory.
2347c478bd9Sstevel@tonic-gate  */
2357c478bd9Sstevel@tonic-gate 
2367c478bd9Sstevel@tonic-gate #include <sys/types.h>
2377c478bd9Sstevel@tonic-gate #include <sys/stream.h>
2387c478bd9Sstevel@tonic-gate #include <sys/stropts.h>
2397c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
2407c478bd9Sstevel@tonic-gate #include <sys/strsubr.h>
2417c478bd9Sstevel@tonic-gate #include <sys/strsun.h>
2427c478bd9Sstevel@tonic-gate #define	_SUN_TPI_VERSION 2
2437c478bd9Sstevel@tonic-gate #include <sys/ddi.h>
2447c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
2457c478bd9Sstevel@tonic-gate #include <sys/debug.h>
2467c478bd9Sstevel@tonic-gate 
2477c478bd9Sstevel@tonic-gate #include <sys/systm.h>
2487c478bd9Sstevel@tonic-gate #include <sys/param.h>
2497c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
2507c478bd9Sstevel@tonic-gate #include <sys/isa_defs.h>
2517c478bd9Sstevel@tonic-gate #include <inet/common.h>
2527c478bd9Sstevel@tonic-gate #include <netinet/ip6.h>
2537c478bd9Sstevel@tonic-gate #include <netinet/icmp6.h>
2547c478bd9Sstevel@tonic-gate 
2557c478bd9Sstevel@tonic-gate #include <inet/ip.h>
256bd670b35SErik Nordmark #include <inet/ip_if.h>
257bd670b35SErik Nordmark #include <inet/ip_ire.h>
2587c478bd9Sstevel@tonic-gate #include <inet/ip6.h>
2597c478bd9Sstevel@tonic-gate #include <inet/ip_ndp.h>
2600f1702c5SYu Xiangning #include <inet/ip_impl.h>
261ff550d0eSmasputra #include <inet/udp_impl.h>
2627c478bd9Sstevel@tonic-gate #include <inet/sctp_ip.h>
263f4b3ec61Sdh #include <inet/sctp/sctp_impl.h>
264fc80c0dfSnordmark #include <inet/rawip_impl.h>
265fc80c0dfSnordmark #include <inet/rts_impl.h>
2662b24ab6bSSebastien Roy #include <inet/iptun/iptun_impl.h>
2677c478bd9Sstevel@tonic-gate 
2687c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
2697c478bd9Sstevel@tonic-gate 
2707c478bd9Sstevel@tonic-gate #include <inet/ipclassifier.h>
2710f1702c5SYu Xiangning #include <inet/tcp.h>
2727c478bd9Sstevel@tonic-gate #include <inet/ipsec_impl.h>
2737c478bd9Sstevel@tonic-gate 
27445916cd2Sjpk #include <sys/tsol/tnet.h>
2750f1702c5SYu Xiangning #include <sys/sockio.h>
27645916cd2Sjpk 
277f4b3ec61Sdh /* Old value for compatibility. Setable in /etc/system */
2787c478bd9Sstevel@tonic-gate uint_t tcp_conn_hash_size = 0;
2797c478bd9Sstevel@tonic-gate 
280f4b3ec61Sdh /* New value. Zero means choose automatically.  Setable in /etc/system */
2817c478bd9Sstevel@tonic-gate uint_t ipcl_conn_hash_size = 0;
2827c478bd9Sstevel@tonic-gate uint_t ipcl_conn_hash_memfactor = 8192;
2837c478bd9Sstevel@tonic-gate uint_t ipcl_conn_hash_maxsize = 82500;
2847c478bd9Sstevel@tonic-gate 
2857c478bd9Sstevel@tonic-gate /* bind/udp fanout table size */
2867c478bd9Sstevel@tonic-gate uint_t ipcl_bind_fanout_size = 512;
287ee4701baSericheng uint_t ipcl_udp_fanout_size = 16384;
2887c478bd9Sstevel@tonic-gate 
2897c478bd9Sstevel@tonic-gate /* Raw socket fanout size.  Must be a power of 2. */
2907c478bd9Sstevel@tonic-gate uint_t ipcl_raw_fanout_size = 256;
2917c478bd9Sstevel@tonic-gate 
2922b24ab6bSSebastien Roy /*
2932b24ab6bSSebastien Roy  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
2942b24ab6bSSebastien Roy  * expect that most large deployments would have hundreds of tunnels, and
2952b24ab6bSSebastien Roy  * thousands in the extreme case.
2962b24ab6bSSebastien Roy  */
2972b24ab6bSSebastien Roy uint_t ipcl_iptun_fanout_size = 6143;
2982b24ab6bSSebastien Roy 
/*
 * Power of 2^N Primes useful for hashing for N of 0-28,
 * these primes are the nearest prime <= 2^N - 2^(N-2).
 *
 * The entry at index N corresponds to 2^N. Indices 0-2 hold 0 because no
 * prime is supplied for them, and the final 0 marks the end of the table;
 * ipcl_init() treats a 0 entry as "out of range".
 */

#define	P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,	\
		6143, 12281, 24571, 49139, 98299, 196597, 393209,	\
		786431, 1572853, 3145721, 6291449, 12582893, 25165813,	\
		50331599, 100663291, 201326557, 0}
3087c478bd9Sstevel@tonic-gate 
3097c478bd9Sstevel@tonic-gate /*
310fc80c0dfSnordmark  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
311fc80c0dfSnordmark  * are aligned on cache lines.
3127c478bd9Sstevel@tonic-gate  */
313fc80c0dfSnordmark typedef union itc_s {
314fc80c0dfSnordmark 	conn_t	itc_conn;
315fc80c0dfSnordmark 	char	itcu_filler[CACHE_ALIGN(conn_s)];
3167c478bd9Sstevel@tonic-gate } itc_t;
3177c478bd9Sstevel@tonic-gate 
318fc80c0dfSnordmark struct kmem_cache  *tcp_conn_cache;
319fc80c0dfSnordmark struct kmem_cache  *ip_conn_cache;
3207c478bd9Sstevel@tonic-gate extern struct kmem_cache  *sctp_conn_cache;
321fc80c0dfSnordmark struct kmem_cache  *udp_conn_cache;
322fc80c0dfSnordmark struct kmem_cache  *rawip_conn_cache;
323fc80c0dfSnordmark struct kmem_cache  *rts_conn_cache;
3247c478bd9Sstevel@tonic-gate 
3257c478bd9Sstevel@tonic-gate extern void	tcp_timermp_free(tcp_t *);
3267c478bd9Sstevel@tonic-gate extern mblk_t	*tcp_timermp_alloc(int);
3277c478bd9Sstevel@tonic-gate 
328fc80c0dfSnordmark static int	ip_conn_constructor(void *, void *, int);
329fc80c0dfSnordmark static void	ip_conn_destructor(void *, void *);
330fc80c0dfSnordmark 
331fc80c0dfSnordmark static int	tcp_conn_constructor(void *, void *, int);
332fc80c0dfSnordmark static void	tcp_conn_destructor(void *, void *);
333fc80c0dfSnordmark 
334fc80c0dfSnordmark static int	udp_conn_constructor(void *, void *, int);
335fc80c0dfSnordmark static void	udp_conn_destructor(void *, void *);
336fc80c0dfSnordmark 
337fc80c0dfSnordmark static int	rawip_conn_constructor(void *, void *, int);
338fc80c0dfSnordmark static void	rawip_conn_destructor(void *, void *);
339fc80c0dfSnordmark 
340fc80c0dfSnordmark static int	rts_conn_constructor(void *, void *, int);
341fc80c0dfSnordmark static void	rts_conn_destructor(void *, void *);
3427c478bd9Sstevel@tonic-gate 
3437c478bd9Sstevel@tonic-gate /*
344f4b3ec61Sdh  * Global (for all stack instances) init routine
3457c478bd9Sstevel@tonic-gate  */
3467c478bd9Sstevel@tonic-gate void
ipcl_g_init(void)347f4b3ec61Sdh ipcl_g_init(void)
3487c478bd9Sstevel@tonic-gate {
349fc80c0dfSnordmark 	ip_conn_cache = kmem_cache_create("ip_conn_cache",
3507c478bd9Sstevel@tonic-gate 	    sizeof (conn_t), CACHE_ALIGN_SIZE,
351fc80c0dfSnordmark 	    ip_conn_constructor, ip_conn_destructor,
352fc80c0dfSnordmark 	    NULL, NULL, NULL, 0);
353fc80c0dfSnordmark 
354fc80c0dfSnordmark 	tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
355fc80c0dfSnordmark 	    sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
356fc80c0dfSnordmark 	    tcp_conn_constructor, tcp_conn_destructor,
35793fcb0b9SKacheong Poon 	    tcp_conn_reclaim, NULL, NULL, 0);
358fc80c0dfSnordmark 
359fc80c0dfSnordmark 	udp_conn_cache = kmem_cache_create("udp_conn_cache",
360fc80c0dfSnordmark 	    sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
361fc80c0dfSnordmark 	    udp_conn_constructor, udp_conn_destructor,
362fc80c0dfSnordmark 	    NULL, NULL, NULL, 0);
3637c478bd9Sstevel@tonic-gate 
364fc80c0dfSnordmark 	rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
365fc80c0dfSnordmark 	    sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
366fc80c0dfSnordmark 	    rawip_conn_constructor, rawip_conn_destructor,
367fc80c0dfSnordmark 	    NULL, NULL, NULL, 0);
368fc80c0dfSnordmark 
369fc80c0dfSnordmark 	rts_conn_cache = kmem_cache_create("rts_conn_cache",
370fc80c0dfSnordmark 	    sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
371fc80c0dfSnordmark 	    rts_conn_constructor, rts_conn_destructor,
3727c478bd9Sstevel@tonic-gate 	    NULL, NULL, NULL, 0);
373f4b3ec61Sdh }
374f4b3ec61Sdh 
/*
 * ipclassifier initialization routine, sets up hash tables.
 */
378f4b3ec61Sdh void
ipcl_init(ip_stack_t * ipst)379f4b3ec61Sdh ipcl_init(ip_stack_t *ipst)
380f4b3ec61Sdh {
381f4b3ec61Sdh 	int i;
382f4b3ec61Sdh 	int sizes[] = P2Ps();
3837c478bd9Sstevel@tonic-gate 
3847c478bd9Sstevel@tonic-gate 	/*
385f4b3ec61Sdh 	 * Calculate size of conn fanout table from /etc/system settings
3867c478bd9Sstevel@tonic-gate 	 */
3877c478bd9Sstevel@tonic-gate 	if (ipcl_conn_hash_size != 0) {
388f4b3ec61Sdh 		ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
3897c478bd9Sstevel@tonic-gate 	} else if (tcp_conn_hash_size != 0) {
390f4b3ec61Sdh 		ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
3917c478bd9Sstevel@tonic-gate 	} else {
3927c478bd9Sstevel@tonic-gate 		extern pgcnt_t freemem;
3937c478bd9Sstevel@tonic-gate 
394f4b3ec61Sdh 		ipst->ips_ipcl_conn_fanout_size =
3957c478bd9Sstevel@tonic-gate 		    (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
3967c478bd9Sstevel@tonic-gate 
397f4b3ec61Sdh 		if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
398f4b3ec61Sdh 			ipst->ips_ipcl_conn_fanout_size =
399f4b3ec61Sdh 			    ipcl_conn_hash_maxsize;
400f4b3ec61Sdh 		}
4017c478bd9Sstevel@tonic-gate 	}
4027c478bd9Sstevel@tonic-gate 
4037c478bd9Sstevel@tonic-gate 	for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
404f4b3ec61Sdh 		if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
4057c478bd9Sstevel@tonic-gate 			break;
4067c478bd9Sstevel@tonic-gate 		}
4077c478bd9Sstevel@tonic-gate 	}
408f4b3ec61Sdh 	if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
4097c478bd9Sstevel@tonic-gate 		/* Out of range, use the 2^16 value */
410f4b3ec61Sdh 		ipst->ips_ipcl_conn_fanout_size = sizes[16];
4117c478bd9Sstevel@tonic-gate 	}
4127c478bd9Sstevel@tonic-gate 
413f4b3ec61Sdh 	/* Take values from /etc/system */
414f4b3ec61Sdh 	ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
415f4b3ec61Sdh 	ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
416f4b3ec61Sdh 	ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
4172b24ab6bSSebastien Roy 	ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
418f4b3ec61Sdh 
419f4b3ec61Sdh 	ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
420f4b3ec61Sdh 
421f4b3ec61Sdh 	ipst->ips_ipcl_conn_fanout = kmem_zalloc(
422f4b3ec61Sdh 	    ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
423f4b3ec61Sdh 
424f4b3ec61Sdh 	for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
425f4b3ec61Sdh 		mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
4267c478bd9Sstevel@tonic-gate 		    MUTEX_DEFAULT, NULL);
4277c478bd9Sstevel@tonic-gate 	}
4287c478bd9Sstevel@tonic-gate 
429f4b3ec61Sdh 	ipst->ips_ipcl_bind_fanout = kmem_zalloc(
430f4b3ec61Sdh 	    ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
4317c478bd9Sstevel@tonic-gate 
432f4b3ec61Sdh 	for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
433f4b3ec61Sdh 		mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
4347c478bd9Sstevel@tonic-gate 		    MUTEX_DEFAULT, NULL);
4357c478bd9Sstevel@tonic-gate 	}
4367c478bd9Sstevel@tonic-gate 
437