1a50ffc2imp/*-
24736ccfpfg * SPDX-License-Identifier: BSD-3-Clause
34736ccfpfg *
48fb65cergrimes * Copyright (c) 1982, 1986, 1990, 1993
56fc2ee1rwatson *	The Regents of the University of California.
656d9139rwatson * Copyright (c) 2010-2011 Juniper Networks, Inc.
76fc2ee1rwatson * All rights reserved.
88fb65cergrimes *
956d9139rwatson * Portions of this software were developed by Robert N. M. Watson under
1056d9139rwatson * contract to Juniper Networks, Inc.
1156d9139rwatson *
128fb65cergrimes * Redistribution and use in source and binary forms, with or without
138fb65cergrimes * modification, are permitted provided that the following conditions
148fb65cergrimes * are met:
158fb65cergrimes * 1. Redistributions of source code must retain the above copyright
168fb65cergrimes *    notice, this list of conditions and the following disclaimer.
178fb65cergrimes * 2. Redistributions in binary form must reproduce the above copyright
188fb65cergrimes *    notice, this list of conditions and the following disclaimer in the
198fb65cergrimes *    documentation and/or other materials provided with the distribution.
207e6cabdimp * 3. Neither the name of the University nor the names of its contributors
218fb65cergrimes *    may be used to endorse or promote products derived from this software
228fb65cergrimes *    without specific prior written permission.
238fb65cergrimes *
248fb65cergrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
258fb65cergrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
268fb65cergrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
278fb65cergrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
288fb65cergrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
298fb65cergrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
308fb65cergrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
318fb65cergrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
328fb65cergrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
338fb65cergrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
348fb65cergrimes * SUCH DAMAGE.
358fb65cergrimes *
368fb65cergrimes *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
373b842d3peter * $FreeBSD$
388fb65cergrimes */
398fb65cergrimes
408197ce5paul#ifndef _NETINET_IN_PCB_H_
418197ce5paul#define _NETINET_IN_PCB_H_
428197ce5paul
43fafc2e7bde#include <sys/queue.h>
4499ec598mmacy#include <sys/epoch.h>
4558f6726bde#include <sys/_lock.h>
4658f6726bde#include <sys/_mutex.h>
47ca47fccrwatson#include <sys/_rwlock.h>
48c3d5404gnn#include <net/route.h>
49fafc2e7bde
50ca47fccrwatson#ifdef _KERNEL
5145c14b9bz#include <sys/lock.h>
52ca47fccrwatson#include <sys/rwlock.h>
5357ca458rwatson#include <net/vnet.h>
541fdd3bcrwatson#include <vm/uma.h>
55ca47fccrwatson#endif
561cbc14bmmacy#include <sys/ck.h>
57ca47fccrwatson
/*
 * struct inpcb is the common protocol control block structure used in most
 * IP transport protocols.
 *
 * Pointers to local and foreign host table entries, local and foreign socket
 * numbers, and pointers up (to a socket structure) and down (to a
 * protocol-specific control block) are stored here.
 */

/*
 * List head types.  inpcbhead heads lists of struct inpcb (used below for
 * the global list, the hash buckets and the per-port lists); inpcbporthead
 * heads lists of struct inpcbport (the port hash buckets); inpcblbgrouphead
 * heads lists of struct inpcblbgroup.
 */
CK_LIST_HEAD(inpcbhead, inpcb);
CK_LIST_HEAD(inpcbporthead, inpcbport);
CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);

/* Generation count type; see inp_gencnt and xig_gen. */
typedef	uint64_t	inp_gen_t;
70919fdebdg
/*
 * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
 * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
 * the following structure.
 *
 * The three pad words place ia46_addr4 over the last 32 bits of an IPv6
 * address when this structure is overlaid with struct in6_addr (see
 * union in_dependaddr).
 */
struct in_addr_4in6 {
	u_int32_t	ia46_pad32[3];
	struct	in_addr	ia46_addr4;
};

/*
 * Address storage shared by IPv4 and IPv6: either a full IPv6 address or an
 * IPv4 address held in the trailing word.
 */
union in_dependaddr {
	struct in_addr_4in6 id46_addr;
	struct in6_addr	id6_addr;
};

/*
 * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
 * some extra padding to accomplish this.
 * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
 * lport, faddr to generate hash, so these fields shouldn't be moved.
 */
struct in_endpoints {
	u_int16_t	ie_fport;		/* foreign port */
	u_int16_t	ie_lport;		/* local port */
	/* protocol dependent part, local and foreign addr */
	union in_dependaddr ie_dependfaddr;	/* foreign host table entry */
	union in_dependaddr ie_dependladdr;	/* local host table entry */
	/* Accessors selecting the IPv4 or IPv6 view of each address. */
#define	ie_faddr	ie_dependfaddr.id46_addr.ia46_addr4
#define	ie_laddr	ie_dependladdr.id46_addr.ia46_addr4
#define	ie6_faddr	ie_dependfaddr.id6_addr
#define	ie6_laddr	ie_dependladdr.id6_addr
	u_int32_t	ie6_zoneid;		/* scope zone id */
};

/*
 * Connection information: flags, length and FIB number followed by the
 * endpoint tuple.  The 4 bytes ahead of inc_ie are the padding referred to
 * in the alignment note on struct in_endpoints.
 *
 * XXX The defines for inc_* are hacks and should be changed to direct
 * references.
 */
struct in_conninfo {
	u_int8_t	inc_flags;	/* INC_* flags, see below */
	u_int8_t	inc_len;
	u_int16_t	inc_fibnum;	/* XXX was pad, 16 bits is plenty */
	/* protocol dependent part */
	struct	in_endpoints inc_ie;
};

/*
 * Flags for inc_flags.
 */
#define	INC_ISIPV6	0x01
#define	INC_IPV6MINMTU	0x02

/* Shorthand for reaching the endpoint fields through a struct in_conninfo. */
#define	inc_fport	inc_ie.ie_fport
#define	inc_lport	inc_ie.ie_lport
#define	inc_faddr	inc_ie.ie_faddr
#define	inc_laddr	inc_ie.ie_laddr
#define	inc6_faddr	inc_ie.ie6_faddr
#define	inc6_laddr	inc_ie.ie6_laddr
#define	inc6_zoneid	inc_ie.ie6_zoneid
130a3c1c9fjlemon
#if defined(_KERNEL) || defined(_WANT_INPCB)
/*
 * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
 * IPv6 sockets.  In the case of TCP and UDP, further per-connection state is
 * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
 * are static after creation or protected by a per-inpcb rwlock, inp_lock.  A
 * few fields are protected by multiple locks as indicated in the locking notes
 * below.  For these fields, all of the listed locks must be write-locked for
 * any modifications.  However, these fields can be safely read while any one of
 * the listed locks are read-locked.  This model can permit greater concurrency
 * for read operations.  For example, connections can be looked up while only
 * holding a read lock on the global pcblist lock.  This is important for
 * performance when attempting to find the connection for a packet given its IP
 * and port tuple.
 *
 * One noteworthy exception is that the global pcbinfo lock follows a different
 * set of rules in relation to the inp_list field.  Rather than being
 * write-locked for modifications and read-locked for list iterations, it must
 * be read-locked during modifications and write-locked during list iterations.
 * This ensures that the relatively rare global list iterations safely walk a
 * stable snapshot of connections while allowing more common list modifications
 * to safely grab the pcblist lock just while adding or removing a connection
 * from the global list.
 *
 * Key:
 * (b) - Protected by the hpts lock.
 * (c) - Constant after initialization
 * (e) - Protected by the net_epoch_preempt epoch
 * (g) - Protected by the pcbgroup lock
 * (i) - Protected by the inpcb lock
 * (p) - Protected by the pcbinfo lock for the inpcb
 * (l) - Protected by the pcblist lock for the inpcb
 * (h) - Protected by the pcbhash lock for the inpcb
 * (s) - Protected by another subsystem's locks
 * (x) - Undefined locking
 *
 * Notes on the tcp_hpts:
 *
 * First Hpts lock order is
 * 1) INP_WLOCK()
 * 2) HPTS_LOCK() i.e. hpts->pmtx
 *
 * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
 * You may check the inp->inp_in_hpts flag without the hpts lock.
 * The hpts is the only one that will clear this flag holding
 * only the hpts lock. This means that in your tcp_output()
 * routine when you test for the inp_in_hpts flag to be 1
 * it may be transitioning to 0 (by the hpts).
 * That's ok since that will just mean an extra call to tcp_output
 * that most likely will find the call you executed
 * (when the mis-match occurred) will have put the TCB back
 * on the hpts and it will return. If your
 * call did not add the inp back to the hpts then you will either
 * over-send or the cwnd will block you from sending more.
 *
 * Note you should also be holding the INP_WLOCK() when you
 * call the remove from the hpts as well. Though usually
 * you are either doing this from a timer, where you need and have
 * the INP_WLOCK() or from destroying your TCB where again
 * you should already have the INP_WLOCK().
 *
 * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
 * inp_input_cpu_set fields are controlled completely by
 * the hpts. Do not ever set these. The inp_hpts_cpu_set
 * and inp_input_cpu_set fields indicate if the hpts has
 * setup the respective cpu field. It is advised if this
 * field is 0, to enqueue the packet with the appropriate
 * hpts_immediate() call. If the _set field is 1, then
 * you may compare the inp_*_cpu field to the curcpu and
 * may want to again insert onto the hpts if these fields
 * are not equal (i.e. you are not on the expected CPU).
 *
 * A note on inp_hpts_calls and inp_input_calls, these
 * flags are set when the hpts calls either the output
 * or do_segment routines respectively. If the routine
 * being called wants to use this, then it needs to
 * clear the flag before returning. The hpts will not
 * clear the flag. The flags can be used to tell if
 * the hpts is the function calling the respective
 * routine.
 *
 * A few other notes:
 *
 * When a read lock is held, stability of the field is guaranteed; to write
 * to a field, a write lock must generally be held.
 *
 * netinet/netinet6-layer code should not assume that the inp_socket pointer
 * is safe to dereference without inp_lock being held, even for protocols
 * other than TCP (where the inpcb persists during TIMEWAIT even after the
 * socket has been freed), or there may be close(2)-related races.
 *
 * The inp_vflag field is overloaded, and would otherwise ideally be (c).
 *
 * TODO:  Currently only the TCP stack is leveraging the global pcbinfo lock
 * read-lock usage during modification, this model can be applied to other
 * protocols (especially SCTP).
 */
/* Only pointers to these types are stored below, so forward-declare them. */
struct icmp6_filter;
struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
	/* Cache line #1 (amd64) */
	CK_LIST_ENTRY(inpcb) inp_hash;	/* [w](h/i) [r](e/i)  hash list */
	CK_LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
	struct rwlock	inp_lock;
	/* Cache line #2 (amd64) */
	/*
	 * inp_start_zero names the first field of the trailing region and
	 * inp_zero_size is the byte count from that field to the end of the
	 * structure.
	 * NOTE(review): presumably this is the region zeroed when an inpcb is
	 * allocated -- confirm against the allocator.
	 */
#define	inp_start_zero	inp_hpts
#define	inp_zero_size	(sizeof(struct inpcb) - \
			    offsetof(struct inpcb, inp_start_zero))
	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */

	uint32_t inp_hpts_request;	/* Current hpts request, zero if
					 * fits in the pacing window (i&b). */
	/*
	 * Note the next fields are protected by a
	 * different lock (hpts-lock). This means that
	 * they must correspond in size to the smallest
	 * protectable bit field (uint8_t on x86, and
	 * other platforms potentially uint32_t?). Also
	 * since CPU switches can occur at different times the two
	 * fields can *not* be collapsed into a single bit field.
	 */
#if defined(__amd64__) || defined(__i386__)
	volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
	volatile uint8_t inp_in_input; /* on input hpts (lock b) */
#else
	volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
	volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
	volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
	u_int	inp_refcount;		/* (i) refcount */
	int	inp_flags;		/* (i) generic IP/datagram flags */
	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
	volatile uint16_t  inp_input_cpu; /* Lock (i) */
	volatile uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
			 inp_input_cpu_set : 1,	/* on input hpts (i) */
			 inp_hpts_calls :1,	/* (i) from output hpts */
			 inp_input_calls :1,	/* (i) from input hpts */
			 inp_spare_bits2 : 4;
	uint8_t inp_numa_domain;	/* numa domain */
	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
	struct	socket *inp_socket;	/* (i) back pointer to socket */
	uint32_t 	 inp_hptsslot;	/* Hpts wheel slot this tcb is Lock(i&b) */
	uint32_t         inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
	TAILQ_ENTRY(inpcb) inp_input;	/* pacing in  queue next lock(b) */
	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
	struct	inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
	CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
	u_int32_t inp_flow;		/* (i) IPv6 flow information */
	u_char	inp_vflag;		/* (i) IP version flag (v4/v6) */
	u_char	inp_ip_ttl;		/* (i) time to live proto */
	u_char	inp_ip_p;		/* (c) protocol proto */
	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
	uint32_t inp_flowid;		/* (x) flow id / queue id */
	struct m_snd_tag *inp_snd_tag;	/* (i) send tag for outgoing mbufs */
	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */

	/* Local and foreign ports, local and foreign addr. */
	struct	in_conninfo inp_inc;	/* (i) connection tuple (ports, addrs) */

	/* MAC and IPSEC policy information. */
	struct	label *inp_label;	/* (i) MAC label */
	struct	inpcbpolicy *inp_sp;    /* (s) for IPSEC */

	/*
	 * Protocol-dependent part; options.  The IPv4 and IPv6 groups are
	 * two separate anonymous structs, not a union, so both sets of
	 * fields occupy their own storage.
	 */
	struct {
		u_char	inp_ip_tos;		/* (i) type of service proto */
		struct mbuf		*inp_options;	/* (i) IP options */
		struct ip_moptions	*inp_moptions;	/* (i) mcast options */
	};
	struct {
		/* (i) IP options */
		struct mbuf		*in6p_options;
		/* (i) IP6 options for outgoing packets */
		struct ip6_pktopts	*in6p_outputopts;
		/* (i) IP multicast options */
		struct ip6_moptions	*in6p_moptions;
		/* (i) ICMPv6 code type filter */
		struct icmp6_filter	*in6p_icmp6filt;
		/* (i) IPV6_CHECKSUM setsockopt */
		int	in6p_cksum;
		short	in6p_hops;
	};
	CK_LIST_ENTRY(inpcb) inp_portlist;	/* (i/h) */
	struct	inpcbport *inp_phd;	/* (i/h) head of this list */
	inp_gen_t	inp_gencnt;	/* (c) generation count */
	void		*spare_ptr;	/* Spare pointer. */
	rt_gen_t	inp_rt_cookie;	/* generation for route entry */
	union {				/* cached L3 information */
		struct route inp_route;
		struct route_in6 inp_route6;
	};
	CK_LIST_ENTRY(inpcb) inp_list;	/* (p/l) list for all PCBs for proto */
	                                /* (e[r]) for list iteration */
	                                /* (p[w]/l) for addition/removal */
	struct epoch_context inp_epoch_ctx;
};
#endif	/* _KERNEL || _WANT_INPCB */
3313a5c9aaglebius
/*
 * Shorthand for reaching the connection tuple through a struct that has an
 * inp_inc member (struct inpcb, struct xinpcb).  Each name expands to the
 * matching inc_* accessor of struct in_conninfo.
 */
#define	inp_fport	inp_inc.inc_fport
#define	inp_lport	inp_inc.inc_lport
#define	inp_faddr	inp_inc.inc_faddr
#define	inp_laddr	inp_inc.inc_laddr

/* IPv6 views of the same tuple. */
#define	in6p_faddr	inp_inc.inc6_faddr
#define	in6p_laddr	inp_inc.inc6_laddr
#define	in6p_zoneid	inp_inc.inc6_zoneid

/* Reaches ipi_vnet through the inpcb's inp_pcbinfo pointer. */
#define	inp_vnet	inp_pcbinfo->ipi_vnet
34239b6dc8zec
/*
 * inp_gen_t is an unsigned 64-bit counter, so the generation count can take
 * roughly 1.8e19 distinct values.  We would have to create several hundred
 * billion connections per second for this number to roll over in a year.
 * This seems sufficiently unlikely that we simply don't concern ourselves
 * with that possibility.
 */
3497262ff6dg
/*
 * Interface exported to userland by various protocols which use inpcbs.  Hack
 * alert -- only define if struct xsocket is in scope.
 * Fields prefixed with "xi_" are unique to this structure, and the rest
 * match fields in the struct inpcb, to ease coding and porting.
 *
 * Legend:
 * (s) - used by userland utilities in src
 * (p) - used by utilities in ports
 * (3) - is known to be used by third party software not in ports
 * (n) - no known usage
 */
#ifdef _SYS_SOCKETVAR_H_
/*
 * Exported per-PCB record.  Field order, widths and the spare/pad members
 * are part of the userland-visible layout and must not be rearranged.
 */
struct xinpcb {
	ksize_t		xi_len;			/* length of this structure */
	struct xsocket	xi_socket;		/* (s,p) */
	struct in_conninfo inp_inc;		/* (s,p) */
	uint64_t	inp_gencnt;		/* (s,p) */
	kvaddr_t	inp_ppcb;		/* (s) netstat(1) */
	int64_t		inp_spare64[4];
	uint32_t	inp_flow;		/* (s) */
	uint32_t	inp_flowid;		/* (s) */
	uint32_t	inp_flowtype;		/* (s) */
	int32_t		inp_flags;		/* (s,p) */
	int32_t		inp_flags2;		/* (s) */
	int32_t		inp_rss_listen_bucket;	/* (n) */
	int32_t		in6p_cksum;		/* (n) */
	int32_t		inp_spare32[4];
	uint16_t	in6p_hops;		/* (n) */
	uint8_t		inp_ip_tos;		/* (n) */
	int8_t		pad8;
	uint8_t		inp_vflag;		/* (s,p) */
	uint8_t		inp_ip_ttl;		/* (n) */
	uint8_t		inp_ip_p;		/* (n) */
	uint8_t		inp_ip_minttl;		/* (n) */
	int8_t		inp_spare8[4];
} __aligned(8);

/*
 * Exported list header: PCB count plus the inpcb and socket generation
 * counts sampled at the same time.
 */
struct xinpgen {
	ksize_t	xig_len;	/* length of this structure */
	u_int		xig_count;	/* number of PCBs at this time */
	uint32_t	_xig_spare32;
	inp_gen_t	xig_gen;	/* generation count at this time */
	so_gen_t	xig_sogen;	/* socket generation count this time */
	uint64_t	_xig_spare64[4];
} __aligned(8);
#ifdef	_KERNEL
/*
 * Kernel-only prototype; by its name and signature it fills a struct xinpcb
 * from a (const) struct inpcb.  The definition is not in this header.
 */
void	in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
#endif
#endif /* _SYS_SOCKETVAR_H_ */
400bbc4497wollman
4017262ff6dgstruct inpcbport {
4026e4e86fmmacy	struct epoch_context phd_epoch_ctx;
4031cbc14bmmacy	CK_LIST_ENTRY(inpcbport) phd_hash;
4047262ff6dg	struct inpcbhead phd_pcblist;
4057262ff6dg	u_short phd_port;
4068fb65cergrimes};
4078fb65cergrimes
40899ec598mmacystruct in_pcblist {
40999ec598mmacy	int il_count;
41099ec598mmacy	struct epoch_context il_epoch_ctx;
41199ec598mmacy	struct inpcbinfo *il_pcbinfo;
41299ec598mmacy	struct inpcb *il_inp_list[0];
41399ec598mmacy};
41499ec598mmacy
/*-
 * Global data structure for each high-level protocol (UDP, TCP, ...) in both
 * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
 *
 * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
 * ipi_list_lock:
 *  - ipi_lock covering the global pcb list stability during loop iteration,
 *  - ipi_hash_lock covering the hashed lookup tables,
 *  - ipi_list_lock covering mutable global fields (such as the global
 *    pcb list)
 *
 * The lock order is:
 *
 *    ipi_lock (before)
 *        inpcb locks (before)
 *            ipi_list locks (before)
 *                {ipi_hash_lock, pcbgroup locks}
 *
 * Locking key:
 *
 * (c) Constant or nearly constant after initialisation
 * (e) Protected by the net_epoch_preempt epoch
 * (g) Locked by ipi_lock
 * (l) Locked by ipi_list_lock
 * (h) Read using either net_epoch_preempt or inpcb lock; write requires
 *     both ipi_hash_lock and inpcb lock
 * (p) Protected by one or more pcbgroup locks
 * (x) Synchronisation properties poorly defined
 */
443c27ef03rwatsonstruct inpcbinfo {
444c27ef03rwatson	/*
4457979378mmacy	 * Global lock protecting inpcb list modification
446c27ef03rwatson	 */
4477979378mmacy	struct mtx		 ipi_lock;
448c27ef03rwatson
449c27ef03rwatson	/*
45056d9139rwatson	 * Global list of inpcbs on the protocol.
451c27ef03rwatson	 */
4527979378mmacy	struct inpcbhead	*ipi_listhead;		/* [r](e) [w](g/l) */
45367927a7jch	u_int			 ipi_count;		/* (l) */
454c27ef03rwatson
455c27ef03rwatson	/*
45656d9139rwatson	 * Generation count -- incremented each time a connection is allocated
45756d9139rwatson	 * or freed.
458c27ef03rwatson	 */
45967927a7jch	u_quad_t		 ipi_gencnt;		/* (l) */
460c27ef03rwatson
461c27ef03rwatson	/*
462c27ef03rwatson	 * Fields associated with port lookup and allocation.
463c27ef03rwatson	 */
46456d9139rwatson	u_short			 ipi_lastport;		/* (x) */
46556d9139rwatson	u_short			 ipi_lastlow;		/* (x) */
46656d9139rwatson	u_short			 ipi_lasthi;		/* (x) */
467c27ef03rwatson
468c27ef03rwatson	/*
469c27ef03rwatson	 * UMA zone from which inpcbs are allocated for this protocol.
470c27ef03rwatson	 */
47156d9139rwatson	struct	uma_zone	*ipi_zone;		/* (c) */
472c27ef03rwatson
473c27ef03rwatson	/*
4746e29aearwatson	 * Connection groups associated with this protocol.  These fields are
4756e29aearwatson	 * constant, but pcbgroup structures themselves are protected by
4766e29aearwatson	 * per-pcbgroup locks.
4776e29aearwatson	 */
4786e29aearwatson	struct inpcbgroup	*ipi_pcbgroups;		/* (c) */
4796e29aearwatson	u_int			 ipi_npcbgroups;	/* (c) */
4806e29aearwatson	u_int			 ipi_hashfields;	/* (c) */
4816e29aearwatson
4826e29aearwatson	/*
4837979378mmacy	 * Global lock protecting modification non-pcbgroup hash lookup tables.
484fdfdadbrwatson	 */
4857979378mmacy	struct mtx		 ipi_hash_lock;
486fdfdadbrwatson
487fdfdadbrwatson	/*
48856d9139rwatson	 * Global hash of inpcbs, hashed by local and foreign addresses and
48956d9139rwatson	 * port numbers.
490c27ef03rwatson	 */
491fdfdadbrwatson	struct inpcbhead	*ipi_hashbase;		/* (h) */
492fdfdadbrwatson	u_long			 ipi_hashmask;		/* (h) */
49356d9139rwatson
49456d9139rwatson	/*
49556d9139rwatson	 * Global hash of inpcbs, hashed by only local port number.
49656d9139rwatson	 */
497fdfdadbrwatson	struct inpcbporthead	*ipi_porthashbase;	/* (h) */
498fdfdadbrwatson	u_long			 ipi_porthashmask;	/* (h) */
49912b5f9ckmacy
50012b5f9ckmacy	/*
5016e29aearwatson	 * List of wildcard inpcbs for use with pcbgroups.  In the past, was
502