xref: /illumos-gate/usr/src/uts/common/inet/ilb/ilb_nat.c (revision 1a5e258f)
1dbed73cbSSangeeta Misra /*
2dbed73cbSSangeeta Misra  * CDDL HEADER START
3dbed73cbSSangeeta Misra  *
4dbed73cbSSangeeta Misra  * The contents of this file are subject to the terms of the
5dbed73cbSSangeeta Misra  * Common Development and Distribution License (the "License").
6dbed73cbSSangeeta Misra  * You may not use this file except in compliance with the License.
7dbed73cbSSangeeta Misra  *
8dbed73cbSSangeeta Misra  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9dbed73cbSSangeeta Misra  * or http://www.opensolaris.org/os/licensing.
10dbed73cbSSangeeta Misra  * See the License for the specific language governing permissions
11dbed73cbSSangeeta Misra  * and limitations under the License.
12dbed73cbSSangeeta Misra  *
13dbed73cbSSangeeta Misra  * When distributing Covered Code, include this CDDL HEADER in each
14dbed73cbSSangeeta Misra  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15dbed73cbSSangeeta Misra  * If applicable, add the following below this CDDL HEADER, with the
16dbed73cbSSangeeta Misra  * fields enclosed by brackets "[]" replaced with your own identifying
17dbed73cbSSangeeta Misra  * information: Portions Copyright [yyyy] [name of copyright owner]
18dbed73cbSSangeeta Misra  *
19dbed73cbSSangeeta Misra  * CDDL HEADER END
20dbed73cbSSangeeta Misra  */
21dbed73cbSSangeeta Misra 
22dbed73cbSSangeeta Misra /*
2347b75f87SKacheong Poon  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24dbed73cbSSangeeta Misra  * Use is subject to license terms.
25dbed73cbSSangeeta Misra  */
26dbed73cbSSangeeta Misra 
27dbed73cbSSangeeta Misra #include <sys/types.h>
28dbed73cbSSangeeta Misra #include <sys/cmn_err.h>
29dbed73cbSSangeeta Misra #include <sys/crc32.h>
30dbed73cbSSangeeta Misra #include <netinet/in.h>
31dbed73cbSSangeeta Misra #include <inet/ip.h>
32dbed73cbSSangeeta Misra #include <inet/ip6.h>
33dbed73cbSSangeeta Misra #include <inet/tcp.h>
34dbed73cbSSangeeta Misra #include <inet/udp_impl.h>
35dbed73cbSSangeeta Misra #include <inet/ilb.h>
36dbed73cbSSangeeta Misra 
37dbed73cbSSangeeta Misra #include "ilb_impl.h"
38dbed73cbSSangeeta Misra #include "ilb_stack.h"
39dbed73cbSSangeeta Misra #include "ilb_nat.h"
40dbed73cbSSangeeta Misra 
/*
 * NAT source entry garbage collection timeout, in seconds.  The actual
 * timeout value passed to timeout() adds a jitter which is smaller than
 * ILB_NAT_SRC_TIMEOUT_JITTER, so the period varies slightly from one run of
 * the collector to the next.
 */
#define	ILB_NAT_SRC_TIMEOUT		30
#define	ILB_NAT_SRC_TIMEOUT_JITTER	5
47dbed73cbSSangeeta Misra 
/*
 * Compute the NAT source hash table bucket index for a (NAT source address,
 * back end server address) pair and store it in hash.  key1 and key2 are
 * assumed to point to uint32_t values; they are run through CRC32 one after
 * the other and the result is reduced modulo hash_size.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and stays correct when used in an unbraced if/else.
 */
#define	ILB_NAT_SRC_HASH(hash, key1, key2, hash_size)			\
do {									\
	CRC32((hash), (key1), sizeof (uint32_t), -1U, crc32_table);	\
	CRC32((hash), (key2), sizeof (uint32_t), (hash), crc32_table);	\
	(hash) %= (hash_size);						\
} while (0)
55dbed73cbSSangeeta Misra 
/*
 * NAT source port space instance number.  It is atomically incremented each
 * time a new NAT source entry is created, and the new value is used as the
 * numeric suffix of that entry's vmem arena name ("ilb_ns_%u").
 */
static uint32_t ilb_nat_src_instance = 0;
58dbed73cbSSangeeta Misra 
59dbed73cbSSangeeta Misra static void
incr_addr(in6_addr_t * a)60dbed73cbSSangeeta Misra incr_addr(in6_addr_t *a)
61dbed73cbSSangeeta Misra {
62dbed73cbSSangeeta Misra 	uint32_t i;
63dbed73cbSSangeeta Misra 
64dbed73cbSSangeeta Misra 	i = ntohl(a->s6_addr32[3]);
65dbed73cbSSangeeta Misra 	if (IN6_IS_ADDR_V4MAPPED(a)) {
66dbed73cbSSangeeta Misra 		a->s6_addr32[3] = htonl(++i);
67dbed73cbSSangeeta Misra 		ASSERT(i != 0);
68dbed73cbSSangeeta Misra 		return;
69dbed73cbSSangeeta Misra 	}
70dbed73cbSSangeeta Misra 
71dbed73cbSSangeeta Misra 	if (++i != 0) {
72dbed73cbSSangeeta Misra 		a->s6_addr32[3] = htonl(i);
73dbed73cbSSangeeta Misra 		return;
74dbed73cbSSangeeta Misra 	}
75dbed73cbSSangeeta Misra 	a->s6_addr32[3] = 0;
76dbed73cbSSangeeta Misra 	i = ntohl(a->s6_addr[2]);
77dbed73cbSSangeeta Misra 	if (++i != 0) {
78dbed73cbSSangeeta Misra 		a->s6_addr32[2] = htonl(i);
79dbed73cbSSangeeta Misra 		return;
80dbed73cbSSangeeta Misra 	}
81dbed73cbSSangeeta Misra 	a->s6_addr32[2] = 0;
82dbed73cbSSangeeta Misra 	i = ntohl(a->s6_addr[1]);
83dbed73cbSSangeeta Misra 	if (++i != 0) {
84dbed73cbSSangeeta Misra 		a->s6_addr32[1] = htonl(i);
85dbed73cbSSangeeta Misra 		return;
86dbed73cbSSangeeta Misra 	}
87dbed73cbSSangeeta Misra 	a->s6_addr32[1] = 0;
88dbed73cbSSangeeta Misra 	i = ntohl(a->s6_addr[0]);
89dbed73cbSSangeeta Misra 	a->s6_addr[0] = htonl(++i);
90dbed73cbSSangeeta Misra 	ASSERT(i != 0);
91dbed73cbSSangeeta Misra }
92dbed73cbSSangeeta Misra 
93dbed73cbSSangeeta Misra /*
94dbed73cbSSangeeta Misra  * When ILB does full NAT, it first picks one source address from the rule's
95dbed73cbSSangeeta Misra  * specified NAT source address list (currently done in round robin fashion).
96dbed73cbSSangeeta Misra  * Then it needs to allocate a port.  This source port must make the tuple
97dbed73cbSSangeeta Misra  * (source address:source port:destination address:destination port)
98dbed73cbSSangeeta Misra  * unique.  The destination part of the tuple is determined by the back
99dbed73cbSSangeeta Misra  * end server, and could not be changed.
100dbed73cbSSangeeta Misra  *
101dbed73cbSSangeeta Misra  * To handle the above source port number allocation, ILB sets up a table
102dbed73cbSSangeeta Misra  * of entries identified by source address:back end server address:server port
103dbed73cbSSangeeta Misra  * tuple.  This table is used by all rules for NAT source port allocation.
104dbed73cbSSangeeta Misra  * Each tuple has an associated vmem arena used for managing the NAT source
105dbed73cbSSangeeta Misra  * port space between the source address and back end server address/port.
106dbed73cbSSangeeta Misra  * Each back end server (ilb_server_t) has an array of pointers (iser_nat_src)
107dbed73cbSSangeeta Misra  * to the different entries in this table for NAT source port allocation.
108dbed73cbSSangeeta Misra  * When ILB needs to allocate a NAT source address and port to talk to a back
109dbed73cbSSangeeta Misra  * end server, it picks a source address  and uses the array pointer to get
110dbed73cbSSangeeta Misra  * to an entry.  Then it calls vmem_alloc() on the associated vmem arena to
111dbed73cbSSangeeta Misra  * find an unused port.
112dbed73cbSSangeeta Misra  *
113dbed73cbSSangeeta Misra  * When a back end server is added, ILB sets up the aforementioned array.
114dbed73cbSSangeeta Misra  * For each source address specified in the rule, ILB checks if there is any
115dbed73cbSSangeeta Misra  * existing entry which matches this source address:back end server address:
116dbed73cbSSangeeta Misra  * port tuple.  The server port is either a specific port or 0 (meaning wild
117dbed73cbSSangeeta Misra  * card port).  Normally, a back end server uses the same port as in the rule.
118dbed73cbSSangeeta Misra  * If a back end server is used to serve two different rules, there will be
 * two different ports.  Source port allocation for these two rules does not
 * conflict, hence we can use two vmem arenas (two different entries in the
 * table).  But if a server uses a port range in one rule, we will treat it as
 * a wild card port.  A wild card port matches any port.  If this server
 * is used to serve more than one rule and those rules use the same set of
124dbed73cbSSangeeta Misra  * NAT source addresses, this means that they must share the same set of vmem
125dbed73cbSSangeeta Misra  * arenas (source port spaces).  We do this for simplicity reason.  If not,
126dbed73cbSSangeeta Misra  * we need to partition the port range so that we can identify different forms
127dbed73cbSSangeeta Misra  * of source port number collision.
128dbed73cbSSangeeta Misra  */
129dbed73cbSSangeeta Misra 
130dbed73cbSSangeeta Misra /*
131dbed73cbSSangeeta Misra  * NAT source address initialization routine.
132dbed73cbSSangeeta Misra  */
133dbed73cbSSangeeta Misra void
ilb_nat_src_init(ilb_stack_t * ilbs)134dbed73cbSSangeeta Misra ilb_nat_src_init(ilb_stack_t *ilbs)
135dbed73cbSSangeeta Misra {
136dbed73cbSSangeeta Misra 	int i;
137dbed73cbSSangeeta Misra 
138dbed73cbSSangeeta Misra 	ilbs->ilbs_nat_src = kmem_zalloc(sizeof (ilb_nat_src_hash_t) *
139dbed73cbSSangeeta Misra 	    ilbs->ilbs_nat_src_hash_size, KM_SLEEP);
140dbed73cbSSangeeta Misra 	for (i = 0; i < ilbs->ilbs_nat_src_hash_size; i++) {
141dbed73cbSSangeeta Misra 		list_create(&ilbs->ilbs_nat_src[i].nsh_head,
142dbed73cbSSangeeta Misra 		    sizeof (ilb_nat_src_entry_t),
143dbed73cbSSangeeta Misra 		    offsetof(ilb_nat_src_entry_t, nse_link));
144dbed73cbSSangeeta Misra 		mutex_init(&ilbs->ilbs_nat_src[i].nsh_lock, NULL,
145dbed73cbSSangeeta Misra 		    MUTEX_DEFAULT, NULL);
146dbed73cbSSangeeta Misra 	}
14747b75f87SKacheong Poon 	ilbs->ilbs_nat_src_tid = timeout(ilb_nat_src_timer, ilbs,
14847b75f87SKacheong Poon 	    SEC_TO_TICK(ILB_NAT_SRC_TIMEOUT +
14947b75f87SKacheong Poon 	    gethrtime() % ILB_NAT_SRC_TIMEOUT_JITTER));
150dbed73cbSSangeeta Misra }
151dbed73cbSSangeeta Misra 
152dbed73cbSSangeeta Misra /*
153dbed73cbSSangeeta Misra  * NAT source address clean up routine.
154dbed73cbSSangeeta Misra  */
155dbed73cbSSangeeta Misra void
ilb_nat_src_fini(ilb_stack_t * ilbs)156dbed73cbSSangeeta Misra ilb_nat_src_fini(ilb_stack_t *ilbs)
157dbed73cbSSangeeta Misra {
158dbed73cbSSangeeta Misra 	ilb_nat_src_entry_t *cur;
159dbed73cbSSangeeta Misra 	timeout_id_t tid;
160dbed73cbSSangeeta Misra 	int i;
161dbed73cbSSangeeta Misra 
162dbed73cbSSangeeta Misra 	/*
163dbed73cbSSangeeta Misra 	 * By setting ilbs_nat_src_tid to 0, the timer handler will not
164dbed73cbSSangeeta Misra 	 * restart the timer.
165dbed73cbSSangeeta Misra 	 */
166dbed73cbSSangeeta Misra 	mutex_enter(&ilbs->ilbs_nat_src_lock);
167dbed73cbSSangeeta Misra 	tid = ilbs->ilbs_nat_src_tid;
168dbed73cbSSangeeta Misra 	ilbs->ilbs_nat_src_tid = 0;
169dbed73cbSSangeeta Misra 	mutex_exit(&ilbs->ilbs_nat_src_lock);
170dbed73cbSSangeeta Misra 	if (tid != 0)
171dbed73cbSSangeeta Misra 		(void) untimeout(tid);
172dbed73cbSSangeeta Misra 
173dbed73cbSSangeeta Misra 	mutex_destroy(&ilbs->ilbs_nat_src_lock);
174dbed73cbSSangeeta Misra 
175dbed73cbSSangeeta Misra 	for (i = 0; i < ilbs->ilbs_nat_src_hash_size; i++) {
176dbed73cbSSangeeta Misra 		while ((cur = list_remove_head(&ilbs->ilbs_nat_src[i].nsh_head))
177dbed73cbSSangeeta Misra 		    != NULL) {
178dbed73cbSSangeeta Misra 			vmem_destroy(cur->nse_port_arena);
179dbed73cbSSangeeta Misra 			kmem_free(cur, sizeof (ilb_nat_src_entry_t));
180dbed73cbSSangeeta Misra 		}
181dbed73cbSSangeeta Misra 		mutex_destroy(&ilbs->ilbs_nat_src[i].nsh_lock);
182dbed73cbSSangeeta Misra 	}
183dbed73cbSSangeeta Misra 
184dbed73cbSSangeeta Misra 	kmem_free(ilbs->ilbs_nat_src, sizeof (ilb_nat_src_hash_t) *
185dbed73cbSSangeeta Misra 	    ilbs->ilbs_nat_src_hash_size);
186dbed73cbSSangeeta Misra 	ilbs->ilbs_nat_src = NULL;
187dbed73cbSSangeeta Misra }
188dbed73cbSSangeeta Misra 
/*
 * An arena name is "ilb_ns" + "_xxxxxxxxxx": 6 + 11 characters plus the
 * terminating NUL.
 */
#define	ARENA_NAMESZ	18
/* Base of the identifier range handed to vmem_create() for each arena. */
#define	NAT_PORT_START	4096
/*
 * Size of that range.  The expression is parenthesized so that the macro
 * keeps its value when it is used inside a larger expression.
 */
#define	NAT_PORT_SIZE	(65535 - NAT_PORT_START)
193dbed73cbSSangeeta Misra 
194dbed73cbSSangeeta Misra /*
195dbed73cbSSangeeta Misra  * Check if the NAT source and back end server pair ilb_nat_src_entry_t
196dbed73cbSSangeeta Misra  * exists.  If it does, increment the refcnt and return it.  If not, create
197dbed73cbSSangeeta Misra  * one and return it.
198dbed73cbSSangeeta Misra  */
199dbed73cbSSangeeta Misra static ilb_nat_src_entry_t *
ilb_find_nat_src(ilb_stack_t * ilbs,const in6_addr_t * nat_src,const in6_addr_t * serv_addr,in_port_t port)200dbed73cbSSangeeta Misra ilb_find_nat_src(ilb_stack_t *ilbs, const in6_addr_t *nat_src,
201dbed73cbSSangeeta Misra     const in6_addr_t *serv_addr, in_port_t port)
202dbed73cbSSangeeta Misra {
203dbed73cbSSangeeta Misra 	ilb_nat_src_entry_t *tmp;
204dbed73cbSSangeeta Misra 	uint32_t idx;
205dbed73cbSSangeeta Misra 	char arena_name[ARENA_NAMESZ];
206dbed73cbSSangeeta Misra 	list_t *head;
207dbed73cbSSangeeta Misra 
208dbed73cbSSangeeta Misra 	ILB_NAT_SRC_HASH(idx, &nat_src->s6_addr32[3], &serv_addr->s6_addr32[3],
209dbed73cbSSangeeta Misra 	    ilbs->ilbs_nat_src_hash_size);
210dbed73cbSSangeeta Misra 	mutex_enter(&ilbs->ilbs_nat_src[idx].nsh_lock);
211dbed73cbSSangeeta Misra 	head = &ilbs->ilbs_nat_src[idx].nsh_head;
212dbed73cbSSangeeta Misra 	for (tmp = list_head(head); tmp != NULL; tmp = list_next(head, tmp)) {
213dbed73cbSSangeeta Misra 		if (IN6_ARE_ADDR_EQUAL(&tmp->nse_src_addr, nat_src) &&
214dbed73cbSSangeeta Misra 		    IN6_ARE_ADDR_EQUAL(&tmp->nse_serv_addr, serv_addr) &&
215dbed73cbSSangeeta Misra 		    (port == tmp->nse_port || port == 0 ||
216dbed73cbSSangeeta Misra 		    tmp->nse_port == 0)) {
217dbed73cbSSangeeta Misra 			break;
218dbed73cbSSangeeta Misra 		}
219dbed73cbSSangeeta Misra 	}
220dbed73cbSSangeeta Misra 	/* Found one, return it. */
221dbed73cbSSangeeta Misra 	if (tmp != NULL) {
222dbed73cbSSangeeta Misra 		tmp->nse_refcnt++;
223dbed73cbSSangeeta Misra 		mutex_exit(&ilbs->ilbs_nat_src[idx].nsh_lock);
224dbed73cbSSangeeta Misra 		return (tmp);
225dbed73cbSSangeeta Misra 	}
226dbed73cbSSangeeta Misra 
227dbed73cbSSangeeta Misra 	tmp = kmem_alloc(sizeof (ilb_nat_src_entry_t), KM_NOSLEEP);
228dbed73cbSSangeeta Misra 	if (tmp == NULL) {
229dbed73cbSSangeeta Misra 		mutex_exit(&ilbs->ilbs_nat_src[idx].nsh_lock);
230dbed73cbSSangeeta Misra 		return (NULL);
231dbed73cbSSangeeta Misra 	}
232dbed73cbSSangeeta Misra 	tmp->nse_src_addr = *nat_src;
233dbed73cbSSangeeta Misra 	tmp->nse_serv_addr = *serv_addr;
234dbed73cbSSangeeta Misra 	tmp->nse_port = port;
235dbed73cbSSangeeta Misra 	tmp->nse_nsh_lock = &ilbs->ilbs_nat_src[idx].nsh_lock;
236dbed73cbSSangeeta Misra 	tmp->nse_refcnt = 1;
237dbed73cbSSangeeta Misra 
238dbed73cbSSangeeta Misra 	(void) snprintf(arena_name, ARENA_NAMESZ, "ilb_ns_%u",
239*1a5e258fSJosef 'Jeff' Sipek 	    atomic_inc_32_nv(&ilb_nat_src_instance));
240dbed73cbSSangeeta Misra 	if ((tmp->nse_port_arena = vmem_create(arena_name,
241dbed73cbSSangeeta Misra 	    (void *)NAT_PORT_START, NAT_PORT_SIZE, 1, NULL, NULL, NULL, 1,
242dbed73cbSSangeeta Misra 	    VM_SLEEP | VMC_IDENTIFIER)) == NULL) {
243dbed73cbSSangeeta Misra 		kmem_free(tmp, sizeof (*tmp));
244dbed73cbSSangeeta Misra 		return (NULL);
245dbed73cbSSangeeta Misra 	}
246dbed73cbSSangeeta Misra 
247dbed73cbSSangeeta Misra 	list_insert_tail(head, tmp);
248dbed73cbSSangeeta Misra 	mutex_exit(&ilbs->ilbs_nat_src[idx].nsh_lock);
249dbed73cbSSangeeta Misra 
250dbed73cbSSangeeta Misra 	return (tmp);
251dbed73cbSSangeeta Misra }
252dbed73cbSSangeeta Misra 
253dbed73cbSSangeeta Misra /*
254dbed73cbSSangeeta Misra  * Create ilb_nat_src_t struct for a ilb_server_t struct.
255dbed73cbSSangeeta Misra  */
256dbed73cbSSangeeta Misra int
ilb_create_nat_src(ilb_stack_t * ilbs,ilb_nat_src_t ** nat_src,const in6_addr_t * srv_addr,in_port_t port,const in6_addr_t * start,int num)257dbed73cbSSangeeta Misra ilb_create_nat_src(ilb_stack_t *ilbs, ilb_nat_src_t **nat_src,
258dbed73cbSSangeeta Misra     const in6_addr_t *srv_addr, in_port_t port, const in6_addr_t *start,
259dbed73cbSSangeeta Misra     int num)
260dbed73cbSSangeeta Misra {
261dbed73cbSSangeeta Misra 	ilb_nat_src_t *src;
262dbed73cbSSangeeta Misra 	in6_addr_t cur_addr;
263dbed73cbSSangeeta Misra 	int i;
264dbed73cbSSangeeta Misra 
265dbed73cbSSangeeta Misra 	if ((src = kmem_zalloc(sizeof (ilb_nat_src_t), KM_NOSLEEP)) == NULL) {
266dbed73cbSSangeeta Misra 		*nat_src = NULL;
267dbed73cbSSangeeta Misra 		return (ENOMEM);
268dbed73cbSSangeeta Misra 	}
269dbed73cbSSangeeta Misra 	cur_addr = *start;
270dbed73cbSSangeeta Misra 	for (i = 0; i < num && i < ILB_MAX_NAT_SRC; i++) {
271dbed73cbSSangeeta Misra 		src->src_list[i] = ilb_find_nat_src(ilbs, &cur_addr, srv_addr,
272dbed73cbSSangeeta Misra 		    port);
273dbed73cbSSangeeta Misra 		if (src->src_list[i] == NULL) {
274dbed73cbSSangeeta Misra 			ilb_destroy_nat_src(&src);
275dbed73cbSSangeeta Misra 			*nat_src = NULL;
276dbed73cbSSangeeta Misra 			return (ENOMEM);
277dbed73cbSSangeeta Misra 		}
278dbed73cbSSangeeta Misra 		incr_addr(&cur_addr);
279dbed73cbSSangeeta Misra 		/*
280dbed73cbSSangeeta Misra 		 * Increment num_src here so that we can call
281dbed73cbSSangeeta Misra 		 * ilb_destroy_nat_src() when we need to do cleanup.
282dbed73cbSSangeeta Misra 		 */
283dbed73cbSSangeeta Misra 		src->num_src++;
284dbed73cbSSangeeta Misra 	}
285dbed73cbSSangeeta Misra 	*nat_src = src;
286dbed73cbSSangeeta Misra 	return (0);
287dbed73cbSSangeeta Misra }
288dbed73cbSSangeeta Misra 
289dbed73cbSSangeeta Misra /*
290dbed73cbSSangeeta Misra  * Timer routine for garbage collecting unneeded NAT source entry.  We
291dbed73cbSSangeeta Misra  * don't use a taskq for this since the table should be relatively small
292dbed73cbSSangeeta Misra  * and should be OK for a timer to handle.
293dbed73cbSSangeeta Misra  */
294dbed73cbSSangeeta Misra void
ilb_nat_src_timer(void * arg)295dbed73cbSSangeeta Misra ilb_nat_src_timer(void *arg)
296dbed73cbSSangeeta Misra {
297dbed73cbSSangeeta Misra 	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
298dbed73cbSSangeeta Misra 	ilb_nat_src_entry_t *cur, *tmp;
299dbed73cbSSangeeta Misra 	list_t *head;
300dbed73cbSSangeeta Misra 	int i;
301dbed73cbSSangeeta Misra 
302dbed73cbSSangeeta Misra 	for (i = 0; i < ilbs->ilbs_nat_src_hash_size; i++) {
303dbed73cbSSangeeta Misra 		mutex_enter(&ilbs->ilbs_nat_src[i].nsh_lock);
304dbed73cbSSangeeta Misra 		head = &ilbs->ilbs_nat_src[i].nsh_head;
305dbed73cbSSangeeta Misra 		cur = list_head(head);
306dbed73cbSSangeeta Misra 		while (cur != NULL) {
307dbed73cbSSangeeta Misra 			/*
308dbed73cbSSangeeta Misra 			 * When a server is removed, it will release its
309dbed73cbSSangeeta Misra 			 * reference on an entry.  But there may still be
310dbed73cbSSangeeta Misra 			 * conn using some ports.  So check the size also.
311dbed73cbSSangeeta Misra 			 */
312dbed73cbSSangeeta Misra 			if (cur->nse_refcnt != 0 ||
313dbed73cbSSangeeta Misra 			    vmem_size(cur->nse_port_arena, VMEM_ALLOC) != 0) {
314dbed73cbSSangeeta Misra 				cur = list_next(head, cur);
315dbed73cbSSangeeta Misra 				continue;
316dbed73cbSSangeeta Misra 			}
317dbed73cbSSangeeta Misra 			tmp = cur;
318dbed73cbSSangeeta Misra 			cur = list_next(head, cur);
319dbed73cbSSangeeta Misra 			list_remove(head, tmp);
320dbed73cbSSangeeta Misra 			vmem_destroy(tmp->nse_port_arena);
321dbed73cbSSangeeta Misra 			kmem_free(tmp, sizeof (ilb_nat_src_entry_t));
322dbed73cbSSangeeta Misra 		}
323dbed73cbSSangeeta Misra 		mutex_exit(&ilbs->ilbs_nat_src[i].nsh_lock);
324dbed73cbSSangeeta Misra 	}
325dbed73cbSSangeeta Misra 
326dbed73cbSSangeeta Misra 	mutex_enter(&ilbs->ilbs_nat_src_lock);
327dbed73cbSSangeeta Misra 	if (ilbs->ilbs_nat_src_tid == 0) {
328dbed73cbSSangeeta Misra 		mutex_exit(&ilbs->ilbs_nat_src_lock);
329dbed73cbSSangeeta Misra 	} else {
330dbed73cbSSangeeta Misra 		ilbs->ilbs_nat_src_tid = timeout(ilb_nat_src_timer, ilbs,
331dbed73cbSSangeeta Misra 		    SEC_TO_TICK(ILB_NAT_SRC_TIMEOUT +
332dbed73cbSSangeeta Misra 		    gethrtime() % ILB_NAT_SRC_TIMEOUT_JITTER));
333dbed73cbSSangeeta Misra 		mutex_exit(&ilbs->ilbs_nat_src_lock);
334dbed73cbSSangeeta Misra 	}
335dbed73cbSSangeeta Misra }
336dbed73cbSSangeeta Misra 
337dbed73cbSSangeeta Misra /*
338dbed73cbSSangeeta Misra  * Destroy a given ilb_nat_src_t struct.  It will also release the reference
339dbed73cbSSangeeta Misra  * hold on all its ilb_nat_src_entry_t.
340dbed73cbSSangeeta Misra  */
341dbed73cbSSangeeta Misra void
ilb_destroy_nat_src(ilb_nat_src_t ** nat_src)342dbed73cbSSangeeta Misra ilb_destroy_nat_src(ilb_nat_src_t **nat_src)
343dbed73cbSSangeeta Misra {
344dbed73cbSSangeeta Misra 	int i, size;
345dbed73cbSSangeeta Misra 	ilb_nat_src_t *src;
346dbed73cbSSangeeta Misra 	ilb_nat_src_entry_t *entry;
347dbed73cbSSangeeta Misra 
348dbed73cbSSangeeta Misra 	src = *nat_src;
349dbed73cbSSangeeta Misra 	if (src == NULL)
350dbed73cbSSangeeta Misra 		return;
351dbed73cbSSangeeta Misra 	size = src->num_src;
352dbed73cbSSangeeta Misra 	/*
353dbed73cbSSangeeta Misra 	 * Set each entry to be condemned and the garbarge collector will
354dbed73cbSSangeeta Misra 	 * clean them up.
355dbed73cbSSangeeta Misra 	 */
356dbed73cbSSangeeta Misra 	for (i = 0; i < size; i++) {
357dbed73cbSSangeeta Misra 		entry = src->src_list[i];
358dbed73cbSSangeeta Misra 		mutex_enter(entry->nse_nsh_lock);
359dbed73cbSSangeeta Misra 		entry->nse_refcnt--;
360dbed73cbSSangeeta Misra 		mutex_exit(entry->nse_nsh_lock);
361dbed73cbSSangeeta Misra 	}
362dbed73cbSSangeeta Misra 	kmem_free(src, sizeof (ilb_nat_src_t));
363dbed73cbSSangeeta Misra 	*nat_src = NULL;
364dbed73cbSSangeeta Misra }
365dbed73cbSSangeeta Misra 
366dbed73cbSSangeeta Misra /*
367dbed73cbSSangeeta Misra  * Given a backend server address and its ilb_nat_src_t, allocate a source
368dbed73cbSSangeeta Misra  * address and port for NAT usage.
369dbed73cbSSangeeta Misra  */
370dbed73cbSSangeeta Misra ilb_nat_src_entry_t *
ilb_alloc_nat_addr(ilb_nat_src_t * src,in6_addr_t * addr,in_port_t * port,uint16_t * nat_src_idx)371dbed73cbSSangeeta Misra ilb_alloc_nat_addr(ilb_nat_src_t *src, in6_addr_t *addr, in_port_t *port,
372dbed73cbSSangeeta Misra     uint16_t *nat_src_idx)
373dbed73cbSSangeeta Misra {
374dbed73cbSSangeeta Misra 	int i, try, size;
375dbed73cbSSangeeta Misra 	in_port_t p;
376dbed73cbSSangeeta Misra 
377dbed73cbSSangeeta Misra 	size = src->num_src;
378dbed73cbSSangeeta Misra 	/* Increment of cur does not need to be atomic.  It is just a hint. */
379dbed73cbSSangeeta Misra 	if (nat_src_idx == NULL)
380dbed73cbSSangeeta Misra 		i = (++src->cur) % size;
381dbed73cbSSangeeta Misra 	else
382dbed73cbSSangeeta Misra 		i = *nat_src_idx;
383dbed73cbSSangeeta Misra 
384dbed73cbSSangeeta Misra 	for (try = 0; try < size; try++) {
385dbed73cbSSangeeta Misra 		p = (in_port_t)(uintptr_t)vmem_alloc(
386dbed73cbSSangeeta Misra 		    src->src_list[i]->nse_port_arena, 1, VM_NOSLEEP);
387dbed73cbSSangeeta Misra 		if (p != 0)
388dbed73cbSSangeeta Misra 			break;
389dbed73cbSSangeeta Misra 		/*
390dbed73cbSSangeeta Misra 		 * If an index is given and we cannot allocate a port using
391dbed73cbSSangeeta Misra 		 * that entry, return NULL.
392dbed73cbSSangeeta Misra 		 */
393dbed73cbSSangeeta Misra 		if (nat_src_idx != NULL)
394dbed73cbSSangeeta Misra 			return (NULL);
395dbed73cbSSangeeta Misra 		i = (i + 1) % size;
396dbed73cbSSangeeta Misra 	}
397dbed73cbSSangeeta Misra 	if (try == size)
398dbed73cbSSangeeta Misra 		return (NULL);
399dbed73cbSSangeeta Misra 	*addr = src->src_list[i]->nse_src_addr;
400dbed73cbSSangeeta Misra 	*port = htons(p);
401dbed73cbSSangeeta Misra 	return (src->src_list[i]);
402dbed73cbSSangeeta Misra }
403dbed73cbSSangeeta Misra 
/*
 * Apply a pre-calculated adjustment to a 16-bit one's complement checksum
 * after NAT.  The stored checksum is complemented, the adjustment is added,
 * carries above bit 15 are folded back in until none remain, and the
 * complement of the result is written back through chksum.
 */
static void
adj_cksum(uint16_t *chksum, uint32_t adj_sum)
{
	uint32_t acc = adj_sum + (uint16_t)~(*chksum);

	for (; (acc >> 16) != 0; )
		acc = (acc >> 16) + (acc & 0xffff);

	*chksum = (uint16_t)~acc;
}
416dbed73cbSSangeeta Misra 
417dbed73cbSSangeeta Misra /* Do full NAT (replace both source and desination info) on a packet. */
418dbed73cbSSangeeta Misra void
ilb_full_nat(int l3,void * iph,int l4,void * tph,ilb_nat_info_t * info,uint32_t adj_ip_sum,uint32_t adj_tp_sum,boolean_t c2s)419dbed73cbSSangeeta Misra ilb_full_nat(int l3, void *iph, int l4, void *tph, ilb_nat_info_t *info,
420dbed73cbSSangeeta Misra     uint32_t adj_ip_sum, uint32_t adj_tp_sum, boolean_t c2s)
421dbed73cbSSangeeta Misra {
422dbed73cbSSangeeta Misra 	in_port_t *orig_sport, *orig_dport;
423dbed73cbSSangeeta Misra 	uint16_t *tp_cksum;
424dbed73cbSSangeeta Misra 
425dbed73cbSSangeeta Misra 	switch (l4) {
426dbed73cbSSangeeta Misra 	case IPPROTO_TCP:
427dbed73cbSSangeeta Misra 		orig_sport = &((tcpha_t *)tph)->tha_lport;
428dbed73cbSSangeeta Misra 		orig_dport = &((tcpha_t *)tph)->tha_fport;
429dbed73cbSSangeeta Misra 		tp_cksum = &((tcpha_t *)tph)->tha_sum;
430dbed73cbSSangeeta Misra 		break;
431dbed73cbSSangeeta Misra 	case IPPROTO_UDP:
432dbed73cbSSangeeta Misra 		orig_sport = &((udpha_t *)tph)->uha_src_port;
433dbed73cbSSangeeta Misra 		orig_dport = &((udpha_t *)tph)->uha_dst_port;
434dbed73cbSSangeeta Misra 		tp_cksum = &((udpha_t *)tph)->uha_checksum;
435dbed73cbSSangeeta Misra 		break;
436dbed73cbSSangeeta Misra 	default:
437dbed73cbSSangeeta Misra 		ASSERT(0);
438dbed73cbSSangeeta Misra 		return;
439dbed73cbSSangeeta Misra 	}
440dbed73cbSSangeeta Misra 
441dbed73cbSSangeeta Misra 	switch (l3) {
442dbed73cbSSangeeta Misra 	case IPPROTO_IP: {
443dbed73cbSSangeeta Misra 		ipha_t *ipha;
444dbed73cbSSangeeta Misra 
445dbed73cbSSangeeta Misra 		ipha = iph;
446dbed73cbSSangeeta Misra 		if (c2s) {
447dbed73cbSSangeeta Misra 			IN6_V4MAPPED_TO_IPADDR(&info->nat_src,
448dbed73cbSSangeeta Misra 			    ipha->ipha_src);
449dbed73cbSSangeeta Misra 			IN6_V4MAPPED_TO_IPADDR(&info->nat_dst,
450dbed73cbSSangeeta Misra 			    ipha->ipha_dst);
451dbed73cbSSangeeta Misra 			*orig_sport = info->nat_sport;
452dbed73cbSSangeeta Misra 			*orig_dport = info->nat_dport;
453dbed73cbSSangeeta Misra 		} else {
454dbed73cbSSangeeta Misra 			IN6_V4MAPPED_TO_IPADDR(&info->vip, ipha->ipha_src);
455dbed73cbSSangeeta Misra 			IN6_V4MAPPED_TO_IPADDR(&info->src, ipha->ipha_dst);
456dbed73cbSSangeeta Misra 			*orig_sport = info->dport;
457dbed73cbSSangeeta Misra 			*orig_dport = info->sport;
458dbed73cbSSangeeta Misra 		}
459dbed73cbSSangeeta Misra 		adj_cksum(&ipha->ipha_hdr_checksum, adj_ip_sum);
460dbed73cbSSangeeta Misra 		adj_cksum(tp_cksum, adj_tp_sum);
461dbed73cbSSangeeta Misra 		break;
462dbed73cbSSangeeta Misra 	}
463dbed73cbSSangeeta Misra 	case IPPROTO_IPV6: {
464dbed73cbSSangeeta Misra 		ip6_t *ip6h;
465dbed73cbSSangeeta Misra 
466dbed73cbSSangeeta Misra 		ip6h = iph;
467dbed73cbSSangeeta Misra 		if (c2s) {
468dbed73cbSSangeeta Misra 			ip6h->ip6_src = info->nat_src;
469dbed73cbSSangeeta Misra 			ip6h->ip6_dst = info->nat_dst;
470dbed73cbSSangeeta Misra 			*orig_sport = info->nat_sport;
471dbed73cbSSangeeta Misra 			*orig_dport = info->nat_dport;
472dbed73cbSSangeeta Misra 		} else {
473dbed73cbSSangeeta Misra 			ip6h->ip6_src = info->vip;
474dbed73cbSSangeeta Misra 			ip6h->ip6_dst = info->src;
475dbed73cbSSangeeta Misra 			*orig_sport = info->dport;
476dbed73cbSSangeeta Misra 			*orig_dport = info->sport;
477dbed73cbSSangeeta Misra 		}
478dbed73cbSSangeeta Misra 		/* No checksum for IPv6 header */
479dbed73cbSSangeeta Misra 		adj_cksum(tp_cksum, adj_tp_sum);
480dbed73cbSSangeeta Misra 		break;
481dbed73cbSSangeeta Misra 	}
482dbed73cbSSangeeta Misra 	default:
483dbed73cbSSangeeta Misra 		ASSERT(0);
484dbed73cbSSangeeta Misra 		break;
485dbed73cbSSangeeta Misra 	}
486dbed73cbSSangeeta Misra }
487dbed73cbSSangeeta Misra 
488dbed73cbSSangeeta Misra /* Do half NAT (only replace the destination info) on a packet. */
489dbed73cbSSangeeta Misra void
ilb_half_nat(int l3,void * iph,int l4,void * tph,ilb_nat_info_t * info,uint32_t adj_ip_sum,uint32_t adj_tp_sum,boolean_t c2s)490dbed73cbSSangeeta Misra ilb_half_nat(int l3, void *iph, int l4, void *tph, ilb_nat_info_t *info,
491dbed73cbSSangeeta Misra     uint32_t adj_ip_sum, uint32_t adj_tp_sum, boolean_t c2s)
492dbed73cbSSangeeta Misra {
493dbed73cbSSangeeta Misra 	in_port_t *orig_port;
494dbed73cbSSangeeta Misra 	uint16_t *tp_cksum;
495dbed73cbSSangeeta Misra 
496dbed73cbSSangeeta Misra 	switch (l4) {
497dbed73cbSSangeeta Misra 	case IPPROTO_TCP:
498dbed73cbSSangeeta Misra 		if (c2s)
499dbed73cbSSangeeta Misra 			orig_port = &((tcpha_t *)tph)->tha_fport;
500dbed73cbSSangeeta Misra 		else
501dbed73cbSSangeeta Misra 			orig_port = &((tcpha_t *)tph)->tha_lport;
502dbed73cbSSangeeta Misra 		tp_cksum = &((tcpha_t *)tph)->tha_sum;
503dbed73cbSSangeeta Misra 		break;
504dbed73cbSSangeeta Misra 	case IPPROTO_UDP:
505dbed73cbSSangeeta Misra 		if (c2s)
506dbed73cbSSangeeta Misra 			orig_port = &((udpha_t *)tph)->uha_dst_port;
507dbed73cbSSangeeta Misra 		else
508dbed73cbSSangeeta Misra 			orig_port = &((udpha_t *)tph)->uha_src_port;
509dbed73cbSSangeeta Misra 		tp_cksum = &((udpha_t *)tph)->uha_checksum;
510dbed73cbSSangeeta Misra 		break;
511dbed73cbSSangeeta Misra 	default:
512dbed73cbSSangeeta Misra 		ASSERT(0);
513dbed73cbSSangeeta Misra 		return;
514dbed73cbSSangeeta Misra 	}
515dbed73cbSSangeeta Misra 
516dbed73cbSSangeeta Misra 	switch (l3) {
517dbed73cbSSangeeta Misra 	case IPPROTO_IP: {
518dbed73cbSSangeeta Misra 		ipha_t *ipha;
519dbed73cbSSangeeta Misra 
520dbed73cbSSangeeta Misra 		ipha = iph;
521dbed73cbSSangeeta Misra 		if (c2s) {
522dbed73cbSSangeeta Misra 			IN6_V4MAPPED_TO_IPADDR(&info->nat_dst,
523dbed73cbSSangeeta Misra 			    ipha->ipha_dst);
524dbed73cbSSangeeta Misra 			*orig_port = info->nat_dport;
525dbed73cbSSangeeta Misra 		} else {
526dbed73cbSSangeeta Misra 			IN6_V4MAPPED_TO_IPADDR(&info->vip, ipha->ipha_src);
527dbed73cbSSangeeta Misra 			*orig_port = info->dport;
528dbed73cbSSangeeta Misra 		}
529dbed73cbSSangeeta Misra 		adj_cksum(&ipha->ipha_hdr_checksum, adj_ip_sum);
530dbed73cbSSangeeta Misra 		adj_cksum(tp_cksum, adj_tp_sum);
531dbed73cbSSangeeta Misra 		break;
532dbed73cbSSangeeta Misra 	}
533dbed73cbSSangeeta Misra 	case IPPROTO_IPV6: {
534dbed73cbSSangeeta Misra 		ip6_t *ip6h;
535dbed73cbSSangeeta Misra 
536dbed73cbSSangeeta Misra 		ip6h = iph;
537dbed73cbSSangeeta Misra 		if (c2s) {
538dbed73cbSSangeeta Misra 			ip6h->ip6_dst = info->nat_dst;
539dbed73cbSSangeeta Misra 			*orig_port = info->nat_dport;
540dbed73cbSSangeeta Misra 		} else {
541dbed73cbSSangeeta Misra 			ip6h->ip6_src = info->vip;
542dbed73cbSSangeeta Misra 			*orig_port = info->dport;
543dbed73cbSSangeeta Misra 		}
544dbed73cbSSangeeta Misra 		/* No checksum for IPv6 header */
545dbed73cbSSangeeta Misra 		adj_cksum(tp_cksum, adj_tp_sum);
546dbed73cbSSangeeta Misra 		break;
547dbed73cbSSangeeta Misra 	}
548dbed73cbSSangeeta Misra 	default:
549dbed73cbSSangeeta Misra 		ASSERT(0);
550dbed73cbSSangeeta Misra 		break;
551dbed73cbSSangeeta Misra 	}
552dbed73cbSSangeeta Misra }
553dbed73cbSSangeeta Misra 
554dbed73cbSSangeeta Misra /* Calculate the IPv6 pseudo checksum, used for ICMPv6 NAT. */
555dbed73cbSSangeeta Misra uint32_t
ilb_pseudo_sum_v6(ip6_t * ip6h,uint8_t nxt_hdr)556dbed73cbSSangeeta Misra ilb_pseudo_sum_v6(ip6_t *ip6h, uint8_t nxt_hdr)
557dbed73cbSSangeeta Misra {
558dbed73cbSSangeeta Misra 	uint32_t sum;
559dbed73cbSSangeeta Misra 	uint16_t *cur;
560dbed73cbSSangeeta Misra 
561dbed73cbSSangeeta Misra 	cur = (uint16_t *)&ip6h->ip6_src;
562dbed73cbSSangeeta Misra 	sum = cur[0] + cur[1] + cur[2] + cur[3] + cur[4] + cur[5] + cur[6] +
563dbed73cbSSangeeta Misra 	    cur[7] + cur[8] + cur[9] + cur[10] + cur[11] + cur[12] + cur[13] +
564dbed73cbSSangeeta Misra 	    cur[14] + cur[15] + htons(nxt_hdr);
565dbed73cbSSangeeta Misra 	return ((sum & 0xffff) + (sum >> 16));
566dbed73cbSSangeeta Misra }
567dbed73cbSSangeeta Misra 
/*
 * Do NAT on an ICMPv4 packet.
 *
 * Two IP headers are rewritten: out_iph and in_iph (presumably the outer
 * header of the ICMP message and the IP header embedded in its payload --
 * confirm against the caller), plus the ports that sport/dport point to.
 * The inner header is rewritten in the opposite direction from the outer
 * one: the outer destination and the inner source both become nat_dst, and
 * with full NAT the outer source and the inner destination both become
 * nat_src.
 *
 * sum is the pre-calculated adjustment for the outer IP header checksum.
 * The ICMP checksum is recomputed over the message rather than adjusted.
 */
void
ilb_nat_icmpv4(mblk_t *mp, ipha_t *out_iph, icmph_t *icmph, ipha_t *in_iph,
    in_port_t *sport, in_port_t *dport, ilb_nat_info_t *info, uint32_t sum,
    boolean_t full_nat)
{
	/* The source side is only rewritten when doing full NAT. */
	if (full_nat) {
		IN6_V4MAPPED_TO_IPADDR(&info->nat_src, out_iph->ipha_src);
		IN6_V4MAPPED_TO_IPADDR(&info->nat_src, in_iph->ipha_dst);
		*dport = info->nat_sport;
	}
	/* The destination side is rewritten for both half and full NAT. */
	IN6_V4MAPPED_TO_IPADDR(&info->nat_dst, out_iph->ipha_dst);
	/* Only the outer IP header checksum is adjusted here. */
	adj_cksum(&out_iph->ipha_hdr_checksum, sum);
	IN6_V4MAPPED_TO_IPADDR(&info->nat_dst, in_iph->ipha_src);
	*sport = info->nat_dport;

	/*
	 * Zero the checksum field first, then recompute it with IP_CSUM,
	 * passing the outer IP header length as the starting offset.
	 */
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(out_iph), 0);
}
587dbed73cbSSangeeta Misra 
/*
 * Do NAT on an ICMPv6 packet.
 *
 * Two IPv6 headers are rewritten: out_ip6h and in_ip6h (presumably the
 * outer header of the ICMPv6 message and the IPv6 header embedded in its
 * payload -- confirm against the caller), plus the ports that sport/dport
 * point to.  The inner header is rewritten in the opposite direction from
 * the outer one: the outer destination and the inner source both become
 * nat_dst, and with full NAT the outer source and the inner destination
 * both become nat_src.
 *
 * The ICMPv6 checksum is recomputed over the message rather than adjusted.
 */
void
ilb_nat_icmpv6(mblk_t *mp, ip6_t *out_ip6h, icmp6_t *icmp6h, ip6_t *in_ip6h,
    in_port_t *sport, in_port_t *dport, ilb_nat_info_t *info,
    boolean_t full_nat)
{
	int hdr_len;

	/* The source side is only rewritten when doing full NAT. */
	if (full_nat) {
		out_ip6h->ip6_src = info->nat_src;
		in_ip6h->ip6_dst = info->nat_src;
		*dport = info->nat_sport;
	}
	/* The destination side is rewritten for both half and full NAT. */
	out_ip6h->ip6_dst = info->nat_dst;
	in_ip6h->ip6_src = info->nat_dst;
	*sport = info->nat_dport;

	/*
	 * Seed the checksum field with the payload length before summing;
	 * presumably this is how the length part of the pseudo header gets
	 * included, since ilb_pseudo_sum_v6() does not add it.
	 */
	icmp6h->icmp6_cksum = out_ip6h->ip6_plen;
	/* Offset of the ICMPv6 header from the start of the outer header. */
	hdr_len = (char *)icmp6h - (char *)out_ip6h;
	icmp6h->icmp6_cksum = IP_CSUM(mp, hdr_len,
	    ilb_pseudo_sum_v6(out_ip6h, IPPROTO_ICMPV6));
}
610