1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 * Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 * Copyright (c) 2018, Joyent, Inc.
27 */
28
29/*
30 * IPsec Security Policy Database.
31 *
32 * This module maintains the SPD and provides routines used by ip and ip6
33 * to apply IPsec policy to inbound and outbound datagrams.
34 */
35
36#include <sys/types.h>
37#include <sys/stream.h>
38#include <sys/stropts.h>
39#include <sys/sysmacros.h>
40#include <sys/strsubr.h>
41#include <sys/strsun.h>
42#include <sys/strlog.h>
43#include <sys/strsun.h>
44#include <sys/cmn_err.h>
45#include <sys/zone.h>
46
47#include <sys/systm.h>
48#include <sys/param.h>
49#include <sys/kmem.h>
50#include <sys/ddi.h>
51
52#include <sys/crypto/api.h>
53
54#include <inet/common.h>
55#include <inet/mi.h>
56
57#include <netinet/ip6.h>
58#include <netinet/icmp6.h>
59#include <netinet/udp.h>
60
61#include <inet/ip.h>
62#include <inet/ip6.h>
63
64#include <net/pfkeyv2.h>
65#include <net/pfpolicy.h>
66#include <inet/sadb.h>
67#include <inet/ipsec_impl.h>
68
69#include <inet/ip_impl.h>	/* For IP_MOD_ID */
70
71#include <inet/ipsecah.h>
72#include <inet/ipsecesp.h>
73#include <inet/ipdrop.h>
74#include <inet/ipclassifier.h>
75#include <inet/iptun.h>
76#include <inet/iptun/iptun_impl.h>
77
78static void ipsec_update_present_flags(ipsec_stack_t *);
79static ipsec_act_t *ipsec_act_wildcard_expand(ipsec_act_t *, uint_t *,
80    netstack_t *);
81static mblk_t *ipsec_check_ipsecin_policy(mblk_t *, ipsec_policy_t *,
82    ipha_t *, ip6_t *, uint64_t, ip_recv_attr_t *, netstack_t *);
83static void ipsec_action_free_table(ipsec_action_t *);
84static void ipsec_action_reclaim(void *);
85static void ipsec_action_reclaim_stack(ipsec_stack_t *);
86static void ipsid_init(netstack_t *);
87static void ipsid_fini(netstack_t *);
88
89/* sel_flags values for ipsec_init_inbound_sel(). */
90#define	SEL_NONE	0x0000
91#define	SEL_PORT_POLICY	0x0001
92#define	SEL_IS_ICMP	0x0002
93#define	SEL_TUNNEL_MODE	0x0004
94#define	SEL_POST_FRAG	0x0008
95
96/* Return values for ipsec_init_inbound_sel(). */
97typedef enum { SELRET_NOMEM, SELRET_BADPKT, SELRET_SUCCESS, SELRET_TUNFRAG}
98    selret_t;
99
100static selret_t ipsec_init_inbound_sel(ipsec_selector_t *, mblk_t *,
101    ipha_t *, ip6_t *, uint8_t);
102
103static boolean_t ipsec_check_ipsecin_action(ip_recv_attr_t *, mblk_t *,
104    struct ipsec_action_s *, ipha_t *ipha, ip6_t *ip6h, const char **,
105    kstat_named_t **, netstack_t *);
106static void ipsec_unregister_prov_update(void);
107static void ipsec_prov_update_callback_stack(uint32_t, void *, netstack_t *);
108static boolean_t ipsec_compare_action(ipsec_policy_t *, ipsec_policy_t *);
109static uint32_t selector_hash(ipsec_selector_t *, ipsec_policy_root_t *);
110static boolean_t ipsec_kstat_init(ipsec_stack_t *);
111static void ipsec_kstat_destroy(ipsec_stack_t *);
112static int ipsec_free_tables(ipsec_stack_t *);
113static int tunnel_compare(const void *, const void *);
114static void ipsec_freemsg_chain(mblk_t *);
115static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *,
116    struct kstat_named *, ipdropper_t *);
117static boolean_t ipsec_kstat_init(ipsec_stack_t *);
118static void ipsec_kstat_destroy(ipsec_stack_t *);
119static int ipsec_free_tables(ipsec_stack_t *);
120static int tunnel_compare(const void *, const void *);
121static void ipsec_freemsg_chain(mblk_t *);
122
/*
 * The selector hash table is sized once per stack instance, in
 * ipsec_stack_init().  We default to 251 buckets, which is the largest
 * prime number under 255.
 */

#define	IPSEC_SPDHASH_DEFAULT 251

/* SPD hash-size tunable per tunnel. */
#define	TUN_SPDHASH_DEFAULT 5

/*
 * Preferred hash sizes.  ipsec_stack_init() substitutes the matching
 * *_DEFAULT value above when one of these is left at zero.
 */
uint32_t ipsec_spd_hashsize;
uint32_t tun_spd_hashsize;

/*
 * All-ones hash value.
 * NOTE(review): presumably returned by selector_hash() for selectors that
 * cannot be hashed -- confirm against its definition.
 */
#define	IPSEC_SEL_NOHASH ((uint32_t)(~0))
137
/*
 * Handle global across all stack instances
 */
static crypto_notify_handle_t prov_update_handle = NULL;

/*
 * kmem caches for actions, selectors and policy entries.  They are created
 * in ipsec_policy_g_init() and destroyed in ipsec_policy_g_destroy(), so
 * they are shared by every stack instance.
 */
static kmem_cache_t *ipsec_action_cache;
static kmem_cache_t *ipsec_sel_cache;
static kmem_cache_t *ipsec_pol_cache;

/* Frag cache prototypes */
static void ipsec_fragcache_clean(ipsec_fragcache_t *, ipsec_stack_t *);
static ipsec_fragcache_entry_t *fragcache_delentry(int,
    ipsec_fragcache_entry_t *, ipsec_fragcache_t *, ipsec_stack_t *);
boolean_t ipsec_fragcache_init(ipsec_fragcache_t *);
void ipsec_fragcache_uninit(ipsec_fragcache_t *, ipsec_stack_t *ipss);
mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *,
    int, ipsec_stack_t *);

/*
 * Global integer knobs, both initially zero.
 * NOTE(review): their consumers are not visible in this part of the file;
 * confirm their meaning before changing the defaults.
 */
int ipsec_hdr_pullup_needed = 0;
int ipsec_weird_null_inbound_policy = 0;
158
/*
 * Round x down/up to a multiple of align using integer division, so align
 * need not be a power of two.
 */
#define	ALGBITS_ROUND_DOWN(x, align)	(((x)/(align))*(align))
#define	ALGBITS_ROUND_UP(x, align)	ALGBITS_ROUND_DOWN((x)+(align)-1, align)

/*
 * Inbound traffic should have matching identities for both SA's.
 * A NULL SA on either side is treated as a match.
 */

#define	SA_IDS_MATCH(sa1, sa2)						\
	(((sa1) == NULL) || ((sa2) == NULL) ||				\
	(((sa1)->ipsa_src_cid == (sa2)->ipsa_src_cid) &&		\
	    (((sa1)->ipsa_dst_cid == (sa2)->ipsa_dst_cid))))

/*
 * IPv6 Fragments
 *
 * The argument is parenthesized so that any struct-valued expression (for
 * instance a dereferenced pointer) can be passed, not just a plain name.
 */
#define	IS_V6_FRAGMENT(ipp)	((ipp).ipp_fields & IPPF_FRAGHDR)
175
/*
 * Policy failure messages.
 *
 * Indexed by the "type" argument of ipsec_log_policy_failure().  Each format
 * consumes four string arguments, in this order: the reporting function's
 * name, "secure" or "not secure", the source address and the destination
 * address.
 */
static char *ipsec_policy_failure_msgs[] = {

	/* IPSEC_POLICY_NOT_NEEDED */
	"%s: Dropping the datagram because the incoming packet "
	"is %s, but the recipient expects clear; Source %s, "
	"Destination %s.\n",

	/* IPSEC_POLICY_MISMATCH */
	"%s: Policy Failure for the incoming packet (%s); Source %s, "
	"Destination %s.\n",

	/* IPSEC_POLICY_AUTH_NOT_NEEDED */
	"%s: Authentication present while not expected in the "
	"incoming %s packet; Source %s, Destination %s.\n",

	/* IPSEC_POLICY_ENCR_NOT_NEEDED */
	"%s: Encryption present while not expected in the "
	"incoming %s packet; Source %s, Destination %s.\n",

	/* IPSEC_POLICY_SE_NOT_NEEDED */
	"%s: Self-Encapsulation present while not expected in the "
	"incoming %s packet; Source %s, Destination %s.\n",
};
202
203/*
204 * General overviews:
205 *
206 * Locking:
207 *
208 *	All of the system policy structures are protected by a single
209 *	rwlock.  These structures are threaded in a
210 *	fairly complex fashion and are not expected to change on a
211 *	regular basis, so this should not cause scaling/contention
212 *	problems.  As a result, policy checks should (hopefully) be MT-hot.
213 *
214 * Allocation policy:
215 *
 *	We use custom kmem cache types for the various
 *	bits & pieces of the policy data structures.  Policy
 *	allocations use KM_NOSLEEP instead of KM_SLEEP.  The
 *	policy table is of potentially unbounded size, so we don't
 *	want to provide a way to hog all system memory with policy
 *	entries.
222 */
223
224/* Convenient functions for freeing or dropping a b_next linked mblk chain */
225
226/* Free all messages in an mblk chain */
227static void
228ipsec_freemsg_chain(mblk_t *mp)
229{
230	mblk_t *mpnext;
231	while (mp != NULL) {
232		ASSERT(mp->b_prev == NULL);
233		mpnext = mp->b_next;
234		mp->b_next = NULL;
235		freemsg(mp);
236		mp = mpnext;
237	}
238}
239
240/*
241 * ip_drop all messages in an mblk chain
242 * Can handle a b_next chain of ip_recv_attr_t mblks, or just a b_next chain
243 * of data.
244 */
245static void
246ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *ill,
247    struct kstat_named *counter, ipdropper_t *who_called)
248{
249	mblk_t *mpnext;
250	while (mp != NULL) {
251		ASSERT(mp->b_prev == NULL);
252		mpnext = mp->b_next;
253		mp->b_next = NULL;
254		if (ip_recv_attr_is_mblk(mp))
255			mp = ip_recv_attr_free_mblk(mp);
256		ip_drop_packet(mp, inbound, ill, counter, who_called);
257		mp = mpnext;
258	}
259}
260
261/*
262 * AVL tree comparison function.
263 * the in-kernel avl assumes unique keys for all objects.
264 * Since sometimes policy will duplicate rules, we may insert
265 * multiple rules with the same rule id, so we need a tie-breaker.
266 */
267static int
268ipsec_policy_cmpbyid(const void *a, const void *b)
269{
270	const ipsec_policy_t *ipa, *ipb;
271	uint64_t idxa, idxb;
272
273	ipa = (const ipsec_policy_t *)a;
274	ipb = (const ipsec_policy_t *)b;
275	idxa = ipa->ipsp_index;
276	idxb = ipb->ipsp_index;
277
278	if (idxa < idxb)
279		return (-1);
280	if (idxa > idxb)
281		return (1);
282	/*
283	 * Tie-breaker #1: All installed policy rules have a non-NULL
284	 * ipsl_sel (selector set), so an entry with a NULL ipsp_sel is not
285	 * actually in-tree but rather a template node being used in
286	 * an avl_find query; see ipsec_policy_delete().  This gives us
287	 * a placeholder in the ordering just before the first entry with
288	 * a key >= the one we're looking for, so we can walk forward from
289	 * that point to get the remaining entries with the same id.
290	 */
291	if ((ipa->ipsp_sel == NULL) && (ipb->ipsp_sel != NULL))
292		return (-1);
293	if ((ipb->ipsp_sel == NULL) && (ipa->ipsp_sel != NULL))
294		return (1);
295	/*
296	 * At most one of the arguments to the comparison should have a
297	 * NULL selector pointer; if not, the tree is broken.
298	 */
299	ASSERT(ipa->ipsp_sel != NULL);
300	ASSERT(ipb->ipsp_sel != NULL);
301	/*
302	 * Tie-breaker #2: use the virtual address of the policy node
303	 * to arbitrarily break ties.  Since we use the new tree node in
304	 * the avl_find() in ipsec_insert_always, the new node will be
305	 * inserted into the tree in the right place in the sequence.
306	 */
307	if (ipa < ipb)
308		return (-1);
309	if (ipa > ipb)
310		return (1);
311	return (0);
312}
313
314/*
315 * Free what ipsec_alloc_table allocated.
316 */
317void
318ipsec_polhead_free_table(ipsec_policy_head_t *iph)
319{
320	int dir;
321	int i;
322
323	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
324		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
325
326		if (ipr->ipr_hash == NULL)
327			continue;
328
329		for (i = 0; i < ipr->ipr_nchains; i++) {
330			ASSERT(ipr->ipr_hash[i].hash_head == NULL);
331		}
332		kmem_free(ipr->ipr_hash, ipr->ipr_nchains *
333		    sizeof (ipsec_policy_hash_t));
334		ipr->ipr_hash = NULL;
335	}
336}
337
338void
339ipsec_polhead_destroy(ipsec_policy_head_t *iph)
340{
341	int dir;
342
343	avl_destroy(&iph->iph_rulebyid);
344	rw_destroy(&iph->iph_lock);
345
346	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
347		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
348		int chain;
349
350		for (chain = 0; chain < ipr->ipr_nchains; chain++)
351			mutex_destroy(&(ipr->ipr_hash[chain].hash_lock));
352
353	}
354	ipsec_polhead_free_table(iph);
355}
356
357/*
358 * Free the IPsec stack instance.
359 */
360/* ARGSUSED */
361static void
362ipsec_stack_fini(netstackid_t stackid, void *arg)
363{
364	ipsec_stack_t	*ipss = (ipsec_stack_t *)arg;
365	void *cookie;
366	ipsec_tun_pol_t *node;
367	netstack_t	*ns = ipss->ipsec_netstack;
368	int		i;
369	ipsec_algtype_t	algtype;
370
371	ipsec_loader_destroy(ipss);
372
373	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER);
374	/*
375	 * It's possible we can just ASSERT() the tree is empty.  After all,
376	 * we aren't called until IP is ready to unload (and presumably all
377	 * tunnels have been unplumbed).  But we'll play it safe for now, the
378	 * loop will just exit immediately if it's empty.
379	 */
380	cookie = NULL;
381	while ((node = (ipsec_tun_pol_t *)
382	    avl_destroy_nodes(&ipss->ipsec_tunnel_policies,
383	    &cookie)) != NULL) {
384		ITP_REFRELE(node, ns);
385	}
386	avl_destroy(&ipss->ipsec_tunnel_policies);
387	rw_exit(&ipss->ipsec_tunnel_policy_lock);
388	rw_destroy(&ipss->ipsec_tunnel_policy_lock);
389
390	ipsec_config_flush(ns);
391
392	ipsec_kstat_destroy(ipss);
393
394	ip_drop_unregister(&ipss->ipsec_dropper);
395
396	ip_drop_unregister(&ipss->ipsec_spd_dropper);
397	ip_drop_destroy(ipss);
398	/*
399	 * Globals start with ref == 1 to prevent IPPH_REFRELE() from
400	 * attempting to free them, hence they should have 1 now.
401	 */
402	ipsec_polhead_destroy(&ipss->ipsec_system_policy);
403	ASSERT(ipss->ipsec_system_policy.iph_refs == 1);
404	ipsec_polhead_destroy(&ipss->ipsec_inactive_policy);
405	ASSERT(ipss->ipsec_inactive_policy.iph_refs == 1);
406
407	for (i = 0; i < IPSEC_ACTION_HASH_SIZE; i++) {
408		ipsec_action_free_table(ipss->ipsec_action_hash[i].hash_head);
409		ipss->ipsec_action_hash[i].hash_head = NULL;
410		mutex_destroy(&(ipss->ipsec_action_hash[i].hash_lock));
411	}
412
413	for (i = 0; i < ipss->ipsec_spd_hashsize; i++) {
414		ASSERT(ipss->ipsec_sel_hash[i].hash_head == NULL);
415		mutex_destroy(&(ipss->ipsec_sel_hash[i].hash_lock));
416	}
417
418	rw_enter(&ipss->ipsec_alg_lock, RW_WRITER);
419	for (algtype = 0; algtype < IPSEC_NALGTYPES; algtype ++) {
420		for (i = 0; i < IPSEC_MAX_ALGS; i++) {
421			if (ipss->ipsec_alglists[algtype][i] != NULL)
422				ipsec_alg_unreg(algtype, i, ns);
423		}
424	}
425	rw_exit(&ipss->ipsec_alg_lock);
426	rw_destroy(&ipss->ipsec_alg_lock);
427
428	ipsid_gc(ns);
429	ipsid_fini(ns);
430
431	(void) ipsec_free_tables(ipss);
432	kmem_free(ipss, sizeof (*ipss));
433}
434
435void
436ipsec_policy_g_destroy(void)
437{
438	kmem_cache_destroy(ipsec_action_cache);
439	kmem_cache_destroy(ipsec_sel_cache);
440	kmem_cache_destroy(ipsec_pol_cache);
441
442	ipsec_unregister_prov_update();
443
444	netstack_unregister(NS_IPSEC);
445}
446
447
448/*
449 * Free what ipsec_alloc_tables allocated.
450 * Called when table allocation fails to free the table.
451 */
452static int
453ipsec_free_tables(ipsec_stack_t *ipss)
454{
455	int i;
456
457	if (ipss->ipsec_sel_hash != NULL) {
458		for (i = 0; i < ipss->ipsec_spd_hashsize; i++) {
459			ASSERT(ipss->ipsec_sel_hash[i].hash_head == NULL);
460		}
461		kmem_free(ipss->ipsec_sel_hash, ipss->ipsec_spd_hashsize *
462		    sizeof (*ipss->ipsec_sel_hash));
463		ipss->ipsec_sel_hash = NULL;
464		ipss->ipsec_spd_hashsize = 0;
465	}
466	ipsec_polhead_free_table(&ipss->ipsec_system_policy);
467	ipsec_polhead_free_table(&ipss->ipsec_inactive_policy);
468
469	return (ENOMEM);
470}
471
472/*
473 * Attempt to allocate the tables in a single policy head.
474 * Return nonzero on failure after cleaning up any work in progress.
475 */
476int
477ipsec_alloc_table(ipsec_policy_head_t *iph, int nchains, int kmflag,
478    boolean_t global_cleanup, netstack_t *ns)
479{
480	int dir;
481
482	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
483		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
484
485		ipr->ipr_nchains = nchains;
486		ipr->ipr_hash = kmem_zalloc(nchains *
487		    sizeof (ipsec_policy_hash_t), kmflag);
488		if (ipr->ipr_hash == NULL)
489			return (global_cleanup ?
490			    ipsec_free_tables(ns->netstack_ipsec) :
491			    ENOMEM);
492	}
493	return (0);
494}
495
496/*
497 * Attempt to allocate the various tables.  Return nonzero on failure
498 * after cleaning up any work in progress.
499 */
500static int
501ipsec_alloc_tables(int kmflag, netstack_t *ns)
502{
503	int error;
504	ipsec_stack_t	*ipss = ns->netstack_ipsec;
505
506	error = ipsec_alloc_table(&ipss->ipsec_system_policy,
507	    ipss->ipsec_spd_hashsize, kmflag, B_TRUE, ns);
508	if (error != 0)
509		return (error);
510
511	error = ipsec_alloc_table(&ipss->ipsec_inactive_policy,
512	    ipss->ipsec_spd_hashsize, kmflag, B_TRUE, ns);
513	if (error != 0)
514		return (error);
515
516	ipss->ipsec_sel_hash = kmem_zalloc(ipss->ipsec_spd_hashsize *
517	    sizeof (*ipss->ipsec_sel_hash), kmflag);
518
519	if (ipss->ipsec_sel_hash == NULL)
520		return (ipsec_free_tables(ipss));
521
522	return (0);
523}
524
525/*
526 * After table allocation, initialize a policy head.
527 */
528void
529ipsec_polhead_init(ipsec_policy_head_t *iph, int nchains)
530{
531	int dir, chain;
532
533	rw_init(&iph->iph_lock, NULL, RW_DEFAULT, NULL);
534	avl_create(&iph->iph_rulebyid, ipsec_policy_cmpbyid,
535	    sizeof (ipsec_policy_t), offsetof(ipsec_policy_t, ipsp_byid));
536
537	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
538		ipsec_policy_root_t *ipr = &iph->iph_root[dir];
539		ipr->ipr_nchains = nchains;
540
541		for (chain = 0; chain < nchains; chain++) {
542			mutex_init(&(ipr->ipr_hash[chain].hash_lock),
543			    NULL, MUTEX_DEFAULT, NULL);
544		}
545	}
546}
547
548static boolean_t
549ipsec_kstat_init(ipsec_stack_t *ipss)
550{
551	ipss->ipsec_ksp = kstat_create_netstack("ip", 0, "ipsec_stat", "net",
552	    KSTAT_TYPE_NAMED, sizeof (ipsec_kstats_t) / sizeof (kstat_named_t),
553	    KSTAT_FLAG_PERSISTENT, ipss->ipsec_netstack->netstack_stackid);
554
555	if (ipss->ipsec_ksp == NULL || ipss->ipsec_ksp->ks_data == NULL)
556		return (B_FALSE);
557
558	ipss->ipsec_kstats = ipss->ipsec_ksp->ks_data;
559
560#define	KI(x) kstat_named_init(&ipss->ipsec_kstats->x, #x, KSTAT_DATA_UINT64)
561	KI(esp_stat_in_requests);
562	KI(esp_stat_in_discards);
563	KI(esp_stat_lookup_failure);
564	KI(ah_stat_in_requests);
565	KI(ah_stat_in_discards);
566	KI(ah_stat_lookup_failure);
567	KI(sadb_acquire_maxpackets);
568	KI(sadb_acquire_qhiwater);
569#undef KI
570
571	kstat_install(ipss->ipsec_ksp);
572	return (B_TRUE);
573}
574
575static void
576ipsec_kstat_destroy(ipsec_stack_t *ipss)
577{
578	kstat_delete_netstack(ipss->ipsec_ksp,
579	    ipss->ipsec_netstack->netstack_stackid);
580	ipss->ipsec_kstats = NULL;
581
582}
583
584/*
585 * Initialize the IPsec stack instance.
586 */
587/* ARGSUSED */
588static void *
589ipsec_stack_init(netstackid_t stackid, netstack_t *ns)
590{
591	ipsec_stack_t	*ipss;
592	int i;
593
594	ipss = (ipsec_stack_t *)kmem_zalloc(sizeof (*ipss), KM_SLEEP);
595	ipss->ipsec_netstack = ns;
596
597	/*
598	 * FIXME: netstack_ipsec is used by some of the routines we call
599	 * below, but it isn't set until this routine returns.
600	 * Either we introduce optional xxx_stack_alloc() functions
601	 * that will be called by the netstack framework before xxx_stack_init,
602	 * or we switch spd.c and sadb.c to operate on ipsec_stack_t
603	 * (latter has some include file order issues for sadb.h, but makes
604	 * sense if we merge some of the ipsec related stack_t's together.
605	 */
606	ns->netstack_ipsec = ipss;
607
608	/*
609	 * Make two attempts to allocate policy hash tables; try it at
610	 * the "preferred" size (may be set in /etc/system) first,
611	 * then fall back to the default size.
612	 */
613	ipss->ipsec_spd_hashsize = (ipsec_spd_hashsize == 0) ?
614	    IPSEC_SPDHASH_DEFAULT : ipsec_spd_hashsize;
615
616	if (ipsec_alloc_tables(KM_NOSLEEP, ns) != 0) {
617		cmn_err(CE_WARN,
618		    "Unable to allocate %d entry IPsec policy hash table",
619		    ipss->ipsec_spd_hashsize);
620		ipss->ipsec_spd_hashsize = IPSEC_SPDHASH_DEFAULT;
621		cmn_err(CE_WARN, "Falling back to %d entries",
622		    ipss->ipsec_spd_hashsize);
623		(void) ipsec_alloc_tables(KM_SLEEP, ns);
624	}
625
626	/* Just set a default for tunnels. */
627	ipss->ipsec_tun_spd_hashsize = (tun_spd_hashsize == 0) ?
628	    TUN_SPDHASH_DEFAULT : tun_spd_hashsize;
629
630	ipsid_init(ns);
631	/*
632	 * Globals need ref == 1 to prevent IPPH_REFRELE() from attempting
633	 * to free them.
634	 */
635	ipss->ipsec_system_policy.iph_refs = 1;
636	ipss->ipsec_inactive_policy.iph_refs = 1;
637	ipsec_polhead_init(&ipss->ipsec_system_policy,
638	    ipss->ipsec_spd_hashsize);
639	ipsec_polhead_init(&ipss->ipsec_inactive_policy,
640	    ipss->ipsec_spd_hashsize);
641	rw_init(&ipss->ipsec_tunnel_policy_lock, NULL, RW_DEFAULT, NULL);
642	avl_create(&ipss->ipsec_tunnel_policies, tunnel_compare,
643	    sizeof (ipsec_tun_pol_t), 0);
644
645	ipss->ipsec_next_policy_index = 1;
646
647	rw_init(&ipss->ipsec_system_policy.iph_lock, NULL, RW_DEFAULT, NULL);
648	rw_init(&ipss->ipsec_inactive_policy.iph_lock, NULL, RW_DEFAULT, NULL);
649
650	for (i = 0; i < IPSEC_ACTION_HASH_SIZE; i++)
651		mutex_init(&(ipss->ipsec_action_hash[i].hash_lock),
652		    NULL, MUTEX_DEFAULT, NULL);
653
654	for (i = 0; i < ipss->ipsec_spd_hashsize; i++)
655		mutex_init(&(ipss->ipsec_sel_hash[i].hash_lock),
656		    NULL, MUTEX_DEFAULT, NULL);
657
658	rw_init(&ipss->ipsec_alg_lock, NULL, RW_DEFAULT, NULL);
659	for (i = 0; i < IPSEC_NALGTYPES; i++) {
660		ipss->ipsec_nalgs[i] = 0;
661	}
662
663	ip_drop_init(ipss);
664	ip_drop_register(&ipss->ipsec_spd_dropper, "IPsec SPD");
665
666	/* IP's IPsec code calls the packet dropper */
667	ip_drop_register(&ipss->ipsec_dropper, "IP IPsec processing");
668
669	(void) ipsec_kstat_init(ipss);
670
671	ipsec_loader_init(ipss);
672	ipsec_loader_start(ipss);
673
674	return (ipss);
675}
676
677/* Global across all stack instances */
678void
679ipsec_policy_g_init(void)
680{
681	ipsec_action_cache = kmem_cache_create("ipsec_actions",
682	    sizeof (ipsec_action_t), _POINTER_ALIGNMENT, NULL, NULL,
683	    ipsec_action_reclaim, NULL, NULL, 0);
684	ipsec_sel_cache = kmem_cache_create("ipsec_selectors",
685	    sizeof (ipsec_sel_t), _POINTER_ALIGNMENT, NULL, NULL,
686	    NULL, NULL, NULL, 0);
687	ipsec_pol_cache = kmem_cache_create("ipsec_policy",
688	    sizeof (ipsec_policy_t), _POINTER_ALIGNMENT, NULL, NULL,
689	    NULL, NULL, NULL, 0);
690
691	/*
692	 * We want to be informed each time a stack is created or
693	 * destroyed in the kernel, so we can maintain the
694	 * set of ipsec_stack_t's.
695	 */
696	netstack_register(NS_IPSEC, ipsec_stack_init, NULL, ipsec_stack_fini);
697}
698
699/*
700 * Sort algorithm lists.
701 *
702 * I may need to split this based on
703 * authentication/encryption, and I may wish to have an administrator
704 * configure this list.  Hold on to some NDD variables...
705 *
706 * XXX For now, sort on minimum key size (GAG!).  While minimum key size is
707 * not the ideal metric, it's the only quantifiable measure available.
708 * We need a better metric for sorting algorithms by preference.
709 */
710static void
711alg_insert_sortlist(enum ipsec_algtype at, uint8_t algid, netstack_t *ns)
712{
713	ipsec_stack_t	*ipss = ns->netstack_ipsec;
714	ipsec_alginfo_t *ai = ipss->ipsec_alglists[at][algid];
715	uint8_t holder, swap;
716	uint_t i;
717	uint_t count = ipss->ipsec_nalgs[at];
718	ASSERT(ai != NULL);
719	ASSERT(algid == ai->alg_id);
720
721	ASSERT(RW_WRITE_HELD(&ipss->ipsec_alg_lock));
722
723	holder = algid;
724
725	for (i = 0; i < count - 1; i++) {
726		ipsec_alginfo_t *alt;
727
728		alt = ipss->ipsec_alglists[at][ipss->ipsec_sortlist[at][i]];
729		/*
730		 * If you want to give precedence to newly added algs,
731		 * add the = in the > comparison.
732		 */
733		if ((holder != algid) || (ai->alg_minbits > alt->alg_minbits)) {
734			/* Swap sortlist[i] and holder. */
735			swap = ipss->ipsec_sortlist[at][i];
736			ipss->ipsec_sortlist[at][i] = holder;
737			holder = swap;
738			ai = alt;
739		} /* Else just continue. */
740	}
741
742	/* Store holder in last slot. */
743	ipss->ipsec_sortlist[at][i] = holder;
744}
745
746/*
747 * Remove an algorithm from a sorted algorithm list.
748 * This should be considerably easier, even with complex sorting.
749 */
750static void
751alg_remove_sortlist(enum ipsec_algtype at, uint8_t algid, netstack_t *ns)
752{
753	boolean_t copyback = B_FALSE;
754	int i;
755	ipsec_stack_t	*ipss = ns->netstack_ipsec;
756	int newcount = ipss->ipsec_nalgs[at];
757
758	ASSERT(RW_WRITE_HELD(&ipss->ipsec_alg_lock));
759
760	for (i = 0; i <= newcount; i++) {
761		if (copyback) {
762			ipss->ipsec_sortlist[at][i-1] =
763			    ipss->ipsec_sortlist[at][i];
764		} else if (ipss->ipsec_sortlist[at][i] == algid) {
765			copyback = B_TRUE;
766		}
767	}
768}
769
770/*
771 * Add the specified algorithm to the algorithm tables.
772 * Must be called while holding the algorithm table writer lock.
773 */
774void
775ipsec_alg_reg(ipsec_algtype_t algtype, ipsec_alginfo_t *alg, netstack_t *ns)
776{
777	ipsec_stack_t	*ipss = ns->netstack_ipsec;
778
779	ASSERT(RW_WRITE_HELD(&ipss->ipsec_alg_lock));
780
781	ASSERT(ipss->ipsec_alglists[algtype][alg->alg_id] == NULL);
782	ipsec_alg_fix_min_max(alg, algtype, ns);
783	ipss->ipsec_alglists[algtype][alg->alg_id] = alg;
784
785	ipss->ipsec_nalgs[algtype]++;
786	alg_insert_sortlist(algtype, alg->alg_id, ns);
787}
788
789/*
790 * Remove the specified algorithm from the algorithm tables.
791 * Must be called while holding the algorithm table writer lock.
792 */
793void
794ipsec_alg_unreg(ipsec_algtype_t algtype, uint8_t algid, netstack_t *ns)
795{
796	ipsec_stack_t	*ipss = ns->netstack_ipsec;
797
798	ASSERT(RW_WRITE_HELD(&ipss->ipsec_alg_lock));
799
800	ASSERT(ipss->ipsec_alglists[algtype][algid] != NULL);
801	ipsec_alg_free(ipss->ipsec_alglists[algtype][algid]);
802	ipss->ipsec_alglists[algtype][algid] = NULL;
803
804	ipss->ipsec_nalgs[algtype]--;
805	alg_remove_sortlist(algtype, algid, ns);
806}
807
808/*
809 * Hooks for spdsock to get a grip on system policy.
810 */
811
812ipsec_policy_head_t *
813ipsec_system_policy(netstack_t *ns)
814{
815	ipsec_stack_t	*ipss = ns->netstack_ipsec;
816	ipsec_policy_head_t *h = &ipss->ipsec_system_policy;
817
818	IPPH_REFHOLD(h);
819	return (h);
820}
821
822ipsec_policy_head_t *
823ipsec_inactive_policy(netstack_t *ns)
824{
825	ipsec_stack_t	*ipss = ns->netstack_ipsec;
826	ipsec_policy_head_t *h = &ipss->ipsec_inactive_policy;
827
828	IPPH_REFHOLD(h);
829	return (h);
830}
831
832/*
833 * Lock inactive policy, then active policy, then exchange policy root
834 * pointers.
835 */
836void
837ipsec_swap_policy(ipsec_policy_head_t *active, ipsec_policy_head_t *inactive,
838    netstack_t *ns)
839{
840	int af, dir;
841	avl_tree_t r1, r2;
842
843	rw_enter(&inactive->iph_lock, RW_WRITER);
844	rw_enter(&active->iph_lock, RW_WRITER);
845
846	r1 = active->iph_rulebyid;
847	r2 = inactive->iph_rulebyid;
848	active->iph_rulebyid = r2;
849	inactive->iph_rulebyid = r1;
850
851	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
852		ipsec_policy_hash_t *h1, *h2;
853
854		h1 = active->iph_root[dir].ipr_hash;
855		h2 = inactive->iph_root[dir].ipr_hash;
856		active->iph_root[dir].ipr_hash = h2;
857		inactive->iph_root[dir].ipr_hash = h1;
858
859		for (af = 0; af < IPSEC_NAF; af++) {
860			ipsec_policy_t *t1, *t2;
861
862			t1 = active->iph_root[dir].ipr_nonhash[af];
863			t2 = inactive->iph_root[dir].ipr_nonhash[af];
864			active->iph_root[dir].ipr_nonhash[af] = t2;
865			inactive->iph_root[dir].ipr_nonhash[af] = t1;
866			if (t1 != NULL) {
867				t1->ipsp_hash.hash_pp =
868				    &(inactive->iph_root[dir].ipr_nonhash[af]);
869			}
870			if (t2 != NULL) {
871				t2->ipsp_hash.hash_pp =
872				    &(active->iph_root[dir].ipr_nonhash[af]);
873			}
874
875		}
876	}
877	active->iph_gen++;
878	inactive->iph_gen++;
879	ipsec_update_present_flags(ns->netstack_ipsec);
880	rw_exit(&active->iph_lock);
881	rw_exit(&inactive->iph_lock);
882}
883
884/*
885 * Swap global policy primary/secondary.
886 */
887void
888ipsec_swap_global_policy(netstack_t *ns)
889{
890	ipsec_stack_t	*ipss = ns->netstack_ipsec;
891
892	ipsec_swap_policy(&ipss->ipsec_system_policy,
893	    &ipss->ipsec_inactive_policy, ns);
894}
895
896/*
897 * Clone one policy rule..
898 */
899static ipsec_policy_t *
900ipsec_copy_policy(const ipsec_policy_t *src)
901{
902	ipsec_policy_t *dst = kmem_cache_alloc(ipsec_pol_cache, KM_NOSLEEP);
903
904	if (dst == NULL)
905		return (NULL);
906
907	/*
908	 * Adjust refcounts of cloned state.
909	 */
910	IPACT_REFHOLD(src->ipsp_act);
911	src->ipsp_sel->ipsl_refs++;
912
913	HASH_NULL(dst, ipsp_hash);
914	dst->ipsp_netstack = src->ipsp_netstack;
915	dst->ipsp_refs = 1;
916	dst->ipsp_sel = src->ipsp_sel;
917	dst->ipsp_act = src->ipsp_act;
918	dst->ipsp_prio = src->ipsp_prio;
919	dst->ipsp_index = src->ipsp_index;
920
921	return (dst);
922}
923
924void
925ipsec_insert_always(avl_tree_t *tree, void *new_node)
926{
927	void *node;
928	avl_index_t where;
929
930	node = avl_find(tree, new_node, &where);
931	ASSERT(node == NULL);
932	avl_insert(tree, new_node, where);
933}
934
935
936static int
937ipsec_copy_chain(ipsec_policy_head_t *dph, ipsec_policy_t *src,
938    ipsec_policy_t **dstp)
939{
940	for (; src != NULL; src = src->ipsp_hash.hash_next) {
941		ipsec_policy_t *dst = ipsec_copy_policy(src);
942		if (dst == NULL)
943			return (ENOMEM);
944
945		HASHLIST_INSERT(dst, ipsp_hash, *dstp);
946		ipsec_insert_always(&dph->iph_rulebyid, dst);
947	}
948	return (0);
949}
950
951
952
953/*
954 * Make one policy head look exactly like another.
955 *
956 * As with ipsec_swap_policy, we lock the destination policy head first, then
957 * the source policy head. Note that we only need to read-lock the source
958 * policy head as we are not changing it.
959 */
960int
961ipsec_copy_polhead(ipsec_policy_head_t *sph, ipsec_policy_head_t *dph,
962    netstack_t *ns)
963{
964	int af, dir, chain, nchains;
965
966	rw_enter(&dph->iph_lock, RW_WRITER);
967
968	ipsec_polhead_flush(dph, ns);
969
970	rw_enter(&sph->iph_lock, RW_READER);
971
972	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
973		ipsec_policy_root_t *dpr = &dph->iph_root[dir];
974		ipsec_policy_root_t *spr = &sph->iph_root[dir];
975		nchains = dpr->ipr_nchains;
976
977		ASSERT(dpr->ipr_nchains == spr->ipr_nchains);
978
979		for (af = 0; af < IPSEC_NAF; af++) {
980			if (ipsec_copy_chain(dph, spr->ipr_nonhash[af],
981			    &dpr->ipr_nonhash[af]))
982				goto abort_copy;
983		}
984
985		for (chain = 0; chain < nchains; chain++) {
986			if (ipsec_copy_chain(dph,
987			    spr->ipr_hash[chain].hash_head,
988			    &dpr->ipr_hash[chain].hash_head))
989				goto abort_copy;
990		}
991	}
992
993	dph->iph_gen++;
994
995	rw_exit(&sph->iph_lock);
996	rw_exit(&dph->iph_lock);
997	return (0);
998
999abort_copy:
1000	ipsec_polhead_flush(dph, ns);
1001	rw_exit(&sph->iph_lock);
1002	rw_exit(&dph->iph_lock);
1003	return (ENOMEM);
1004}
1005
1006/*
1007 * Clone currently active policy to the inactive policy list.
1008 */
1009int
1010ipsec_clone_system_policy(netstack_t *ns)
1011{
1012	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1013
1014	return (ipsec_copy_polhead(&ipss->ipsec_system_policy,
1015	    &ipss->ipsec_inactive_policy, ns));
1016}
1017
1018/*
1019 * Extract the string from ipsec_policy_failure_msgs[type] and
1020 * log it.
1021 *
1022 */
1023void
1024ipsec_log_policy_failure(int type, char *func_name, ipha_t *ipha, ip6_t *ip6h,
1025    boolean_t secure, netstack_t *ns)
1026{
1027	char	sbuf[INET6_ADDRSTRLEN];
1028	char	dbuf[INET6_ADDRSTRLEN];
1029	char	*s;
1030	char	*d;
1031	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1032
1033	ASSERT((ipha == NULL && ip6h != NULL) ||
1034	    (ip6h == NULL && ipha != NULL));
1035
1036	if (ipha != NULL) {
1037		s = inet_ntop(AF_INET, &ipha->ipha_src, sbuf, sizeof (sbuf));
1038		d = inet_ntop(AF_INET, &ipha->ipha_dst, dbuf, sizeof (dbuf));
1039	} else {
1040		s = inet_ntop(AF_INET6, &ip6h->ip6_src, sbuf, sizeof (sbuf));
1041		d = inet_ntop(AF_INET6, &ip6h->ip6_dst, dbuf, sizeof (dbuf));
1042
1043	}
1044
1045	/* Always bump the policy failure counter. */
1046	ipss->ipsec_policy_failure_count[type]++;
1047
1048	ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
1049	    ipsec_policy_failure_msgs[type], func_name,
1050	    (secure ? "secure" : "not secure"), s, d);
1051}
1052
1053/*
1054 * Rate-limiting front-end to strlog() for AH and ESP.	Uses the ndd variables
1055 * in /dev/ip and the same rate-limiting clock so that there's a single
1056 * knob to turn to throttle the rate of messages.
1057 */
1058void
1059ipsec_rl_strlog(netstack_t *ns, short mid, short sid, char level, ushort_t sl,
1060    char *fmt, ...)
1061{
1062	va_list adx;
1063	hrtime_t current = gethrtime();
1064	ip_stack_t	*ipst = ns->netstack_ip;
1065	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1066
1067	sl |= SL_CONSOLE;
1068	/*
1069	 * Throttle logging to stop syslog from being swamped. If variable
1070	 * 'ipsec_policy_log_interval' is zero, don't log any messages at
1071	 * all, otherwise log only one message every 'ipsec_policy_log_interval'
1072	 * msec. Convert interval (in msec) to hrtime (in nsec).
1073	 */
1074
1075	if (ipst->ips_ipsec_policy_log_interval) {
1076		if (ipss->ipsec_policy_failure_last +
1077		    MSEC2NSEC(ipst->ips_ipsec_policy_log_interval) <= current) {
1078			va_start(adx, fmt);
1079			(void) vstrlog(mid, sid, level, sl, fmt, adx);
1080			va_end(adx);
1081			ipss->ipsec_policy_failure_last = current;
1082		}
1083	}
1084}
1085
1086void
1087ipsec_config_flush(netstack_t *ns)
1088{
1089	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1090
1091	rw_enter(&ipss->ipsec_system_policy.iph_lock, RW_WRITER);
1092	ipsec_polhead_flush(&ipss->ipsec_system_policy, ns);
1093	ipss->ipsec_next_policy_index = 1;
1094	rw_exit(&ipss->ipsec_system_policy.iph_lock);
1095	ipsec_action_reclaim_stack(ipss);
1096}
1097
1098/*
1099 * Clip a policy's min/max keybits vs. the capabilities of the
1100 * algorithm.
1101 */
1102static void
1103act_alg_adjust(uint_t algtype, uint_t algid,
1104    uint16_t *minbits, uint16_t *maxbits, netstack_t *ns)
1105{
1106	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1107	ipsec_alginfo_t *algp = ipss->ipsec_alglists[algtype][algid];
1108
1109	if (algp != NULL) {
1110		/*
1111		 * If passed-in minbits is zero, we assume the caller trusts
1112		 * us with setting the minimum key size.  We pick the
1113		 * algorithms DEFAULT key size for the minimum in this case.
1114		 */
1115		if (*minbits == 0) {
1116			*minbits = algp->alg_default_bits;
1117			ASSERT(*minbits >= algp->alg_minbits);
1118		} else {
1119			*minbits = MAX(MIN(*minbits, algp->alg_maxbits),
1120			    algp->alg_minbits);
1121		}
1122		if (*maxbits == 0)
1123			*maxbits = algp->alg_maxbits;
1124		else
1125			*maxbits = MIN(MAX(*maxbits, algp->alg_minbits),
1126			    algp->alg_maxbits);
1127		ASSERT(*minbits <= *maxbits);
1128	} else {
1129		*minbits = 0;
1130		*maxbits = 0;
1131	}
1132}
1133
1134/*
1135 * Check an action's requested algorithms against the algorithms currently
1136 * loaded in the system.
1137 */
1138boolean_t
1139ipsec_check_action(ipsec_act_t *act, int *diag, netstack_t *ns)
1140{
1141	ipsec_prot_t *ipp;
1142	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1143
1144	ipp = &act->ipa_apply;
1145
1146	if (ipp->ipp_use_ah &&
1147	    ipss->ipsec_alglists[IPSEC_ALG_AUTH][ipp->ipp_auth_alg] == NULL) {
1148		*diag = SPD_DIAGNOSTIC_UNSUPP_AH_ALG;
1149		return (B_FALSE);
1150	}
1151	if (ipp->ipp_use_espa &&
1152	    ipss->ipsec_alglists[IPSEC_ALG_AUTH][ipp->ipp_esp_auth_alg] ==
1153	    NULL) {
1154		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_AUTH_ALG;
1155		return (B_FALSE);
1156	}
1157	if (ipp->ipp_use_esp &&
1158	    ipss->ipsec_alglists[IPSEC_ALG_ENCR][ipp->ipp_encr_alg] == NULL) {
1159		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_ENCR_ALG;
1160		return (B_FALSE);
1161	}
1162
1163	act_alg_adjust(IPSEC_ALG_AUTH, ipp->ipp_auth_alg,
1164	    &ipp->ipp_ah_minbits, &ipp->ipp_ah_maxbits, ns);
1165	act_alg_adjust(IPSEC_ALG_AUTH, ipp->ipp_esp_auth_alg,
1166	    &ipp->ipp_espa_minbits, &ipp->ipp_espa_maxbits, ns);
1167	act_alg_adjust(IPSEC_ALG_ENCR, ipp->ipp_encr_alg,
1168	    &ipp->ipp_espe_minbits, &ipp->ipp_espe_maxbits, ns);
1169
1170	if (ipp->ipp_ah_minbits > ipp->ipp_ah_maxbits) {
1171		*diag = SPD_DIAGNOSTIC_UNSUPP_AH_KEYSIZE;
1172		return (B_FALSE);
1173	}
1174	if (ipp->ipp_espa_minbits > ipp->ipp_espa_maxbits) {
1175		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_AUTH_KEYSIZE;
1176		return (B_FALSE);
1177	}
1178	if (ipp->ipp_espe_minbits > ipp->ipp_espe_maxbits) {
1179		*diag = SPD_DIAGNOSTIC_UNSUPP_ESP_ENCR_KEYSIZE;
1180		return (B_FALSE);
1181	}
1182	/* TODO: sanity check lifetimes */
1183	return (B_TRUE);
1184}
1185
1186/*
1187 * Set up a single action during wildcard expansion..
1188 */
1189static void
1190ipsec_setup_act(ipsec_act_t *outact, ipsec_act_t *act,
1191    uint_t auth_alg, uint_t encr_alg, uint_t eauth_alg, netstack_t *ns)
1192{
1193	ipsec_prot_t *ipp;
1194
1195	*outact = *act;
1196	ipp = &outact->ipa_apply;
1197	ipp->ipp_auth_alg = (uint8_t)auth_alg;
1198	ipp->ipp_encr_alg = (uint8_t)encr_alg;
1199	ipp->ipp_esp_auth_alg = (uint8_t)eauth_alg;
1200
1201	act_alg_adjust(IPSEC_ALG_AUTH, auth_alg,
1202	    &ipp->ipp_ah_minbits, &ipp->ipp_ah_maxbits, ns);
1203	act_alg_adjust(IPSEC_ALG_AUTH, eauth_alg,
1204	    &ipp->ipp_espa_minbits, &ipp->ipp_espa_maxbits, ns);
1205	act_alg_adjust(IPSEC_ALG_ENCR, encr_alg,
1206	    &ipp->ipp_espe_minbits, &ipp->ipp_espe_maxbits, ns);
1207}
1208
1209/*
1210 * combinatoric expansion time: expand a wildcarded action into an
1211 * array of wildcarded actions; we return the exploded action list,
1212 * and return a count in *nact (output only).
1213 */
1214static ipsec_act_t *
1215ipsec_act_wildcard_expand(ipsec_act_t *act, uint_t *nact, netstack_t *ns)
1216{
1217	boolean_t use_ah, use_esp, use_espa;
1218	boolean_t wild_auth, wild_encr, wild_eauth;
1219	uint_t	auth_alg, auth_idx, auth_min, auth_max;
1220	uint_t	eauth_alg, eauth_idx, eauth_min, eauth_max;
1221	uint_t  encr_alg, encr_idx, encr_min, encr_max;
1222	uint_t	action_count, ai;
1223	ipsec_act_t *outact;
1224	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1225
1226	if (act->ipa_type != IPSEC_ACT_APPLY) {
1227		outact = kmem_alloc(sizeof (*act), KM_NOSLEEP);
1228		*nact = 1;
1229		if (outact != NULL)
1230			bcopy(act, outact, sizeof (*act));
1231		return (outact);
1232	}
1233	/*
1234	 * compute the combinatoric explosion..
1235	 *
1236	 * we assume a request for encr if esp_req is PREF_REQUIRED
1237	 * we assume a request for ah auth if ah_req is PREF_REQUIRED.
1238	 * we assume a request for esp auth if !ah and esp_req is PREF_REQUIRED
1239	 */
1240
1241	use_ah = act->ipa_apply.ipp_use_ah;
1242	use_esp = act->ipa_apply.ipp_use_esp;
1243	use_espa = act->ipa_apply.ipp_use_espa;
1244	auth_alg = act->ipa_apply.ipp_auth_alg;
1245	eauth_alg = act->ipa_apply.ipp_esp_auth_alg;
1246	encr_alg = act->ipa_apply.ipp_encr_alg;
1247
1248	wild_auth = use_ah && (auth_alg == 0);
1249	wild_eauth = use_espa && (eauth_alg == 0);
1250	wild_encr = use_esp && (encr_alg == 0);
1251
1252	action_count = 1;
1253	auth_min = auth_max = auth_alg;
1254	eauth_min = eauth_max = eauth_alg;
1255	encr_min = encr_max = encr_alg;
1256
1257	/*
1258	 * set up for explosion.. for each dimension, expand output
1259	 * size by the explosion factor.
1260	 *
1261	 * Don't include the "any" algorithms, if defined, as no
1262	 * kernel policies should be set for these algorithms.
1263	 */
1264
1265#define	SET_EXP_MINMAX(type, wild, alg, min, max, ipss)		\
1266	if (wild) {						\
1267		int nalgs = ipss->ipsec_nalgs[type];		\
1268		if (ipss->ipsec_alglists[type][alg] != NULL)	\
1269			nalgs--;				\
1270		action_count *= nalgs;				\
1271		min = 0;					\
1272		max = ipss->ipsec_nalgs[type] - 1;		\
1273	}
1274
1275	SET_EXP_MINMAX(IPSEC_ALG_AUTH, wild_auth, SADB_AALG_NONE,
1276	    auth_min, auth_max, ipss);
1277	SET_EXP_MINMAX(IPSEC_ALG_AUTH, wild_eauth, SADB_AALG_NONE,
1278	    eauth_min, eauth_max, ipss);
1279	SET_EXP_MINMAX(IPSEC_ALG_ENCR, wild_encr, SADB_EALG_NONE,
1280	    encr_min, encr_max, ipss);
1281
1282#undef	SET_EXP_MINMAX
1283
1284	/*
1285	 * ok, allocate the whole mess..
1286	 */
1287
1288	outact = kmem_alloc(sizeof (*outact) * action_count, KM_NOSLEEP);
1289	if (outact == NULL)
1290		return (NULL);
1291
1292	/*
1293	 * Now compute all combinations.  Note that non-wildcarded
1294	 * dimensions just get a single value from auth_min, while
1295	 * wildcarded dimensions indirect through the sortlist.
1296	 *
1297	 * We do encryption outermost since, at this time, there's
1298	 * greater difference in security and performance between
1299	 * encryption algorithms vs. authentication algorithms.
1300	 */
1301
1302	ai = 0;
1303
1304#define	WHICH_ALG(type, wild, idx, ipss) \
1305	((wild)?(ipss->ipsec_sortlist[type][idx]):(idx))
1306
1307	for (encr_idx = encr_min; encr_idx <= encr_max; encr_idx++) {
1308		encr_alg = WHICH_ALG(IPSEC_ALG_ENCR, wild_encr, encr_idx, ipss);
1309		if (wild_encr && encr_alg == SADB_EALG_NONE)
1310			continue;
1311		for (auth_idx = auth_min; auth_idx <= auth_max; auth_idx++) {
1312			auth_alg = WHICH_ALG(IPSEC_ALG_AUTH, wild_auth,
1313			    auth_idx, ipss);
1314			if (wild_auth && auth_alg == SADB_AALG_NONE)
1315				continue;
1316			for (eauth_idx = eauth_min; eauth_idx <= eauth_max;
1317			    eauth_idx++) {
1318				eauth_alg = WHICH_ALG(IPSEC_ALG_AUTH,
1319				    wild_eauth, eauth_idx, ipss);
1320				if (wild_eauth && eauth_alg == SADB_AALG_NONE)
1321					continue;
1322
1323				ipsec_setup_act(&outact[ai], act,
1324				    auth_alg, encr_alg, eauth_alg, ns);
1325				ai++;
1326			}
1327		}
1328	}
1329
1330#undef WHICH_ALG
1331
1332	ASSERT(ai == action_count);
1333	*nact = action_count;
1334	return (outact);
1335}
1336
1337/*
1338 * Extract the parts of an ipsec_prot_t from an old-style ipsec_req_t.
1339 */
1340static void
1341ipsec_prot_from_req(const ipsec_req_t *req, ipsec_prot_t *ipp)
1342{
1343	bzero(ipp, sizeof (*ipp));
1344	/*
1345	 * ipp_use_* are bitfields.  Look at "!!" in the following as a
1346	 * "boolean canonicalization" operator.
1347	 */
1348	ipp->ipp_use_ah = !!(req->ipsr_ah_req & IPSEC_PREF_REQUIRED);
1349	ipp->ipp_use_esp = !!(req->ipsr_esp_req & IPSEC_PREF_REQUIRED);
1350	ipp->ipp_use_espa = !!(req->ipsr_esp_auth_alg);
1351	ipp->ipp_use_se = !!(req->ipsr_self_encap_req & IPSEC_PREF_REQUIRED);
1352	ipp->ipp_use_unique = !!((req->ipsr_ah_req|req->ipsr_esp_req) &
1353	    IPSEC_PREF_UNIQUE);
1354	ipp->ipp_encr_alg = req->ipsr_esp_alg;
1355	/*
1356	 * SADB_AALG_ANY is a placeholder to distinguish "any" from
1357	 * "none" above.  If auth is required, as determined above,
1358	 * SADB_AALG_ANY becomes 0, which is the representation
1359	 * of "any" and "none" in PF_KEY v2.
1360	 */
1361	ipp->ipp_auth_alg = (req->ipsr_auth_alg != SADB_AALG_ANY) ?
1362	    req->ipsr_auth_alg : 0;
1363	ipp->ipp_esp_auth_alg = (req->ipsr_esp_auth_alg != SADB_AALG_ANY) ?
1364	    req->ipsr_esp_auth_alg : 0;
1365}
1366
1367/*
1368 * Extract a new-style action from a request.
1369 */
1370void
1371ipsec_actvec_from_req(const ipsec_req_t *req, ipsec_act_t **actp, uint_t *nactp,
1372    netstack_t *ns)
1373{
1374	struct ipsec_act act;
1375
1376	bzero(&act, sizeof (act));
1377	if ((req->ipsr_ah_req & IPSEC_PREF_NEVER) &&
1378	    (req->ipsr_esp_req & IPSEC_PREF_NEVER)) {
1379		act.ipa_type = IPSEC_ACT_BYPASS;
1380	} else {
1381		act.ipa_type = IPSEC_ACT_APPLY;
1382		ipsec_prot_from_req(req, &act.ipa_apply);
1383	}
1384	*actp = ipsec_act_wildcard_expand(&act, nactp, ns);
1385}
1386
1387/*
1388 * Convert a new-style "prot" back to an ipsec_req_t (more backwards compat).
1389 * We assume caller has already zero'ed *req for us.
1390 */
1391static int
1392ipsec_req_from_prot(ipsec_prot_t *ipp, ipsec_req_t *req)
1393{
1394	req->ipsr_esp_alg = ipp->ipp_encr_alg;
1395	req->ipsr_auth_alg = ipp->ipp_auth_alg;
1396	req->ipsr_esp_auth_alg = ipp->ipp_esp_auth_alg;
1397
1398	if (ipp->ipp_use_unique) {
1399		req->ipsr_ah_req |= IPSEC_PREF_UNIQUE;
1400		req->ipsr_esp_req |= IPSEC_PREF_UNIQUE;
1401	}
1402	if (ipp->ipp_use_se)
1403		req->ipsr_self_encap_req |= IPSEC_PREF_REQUIRED;
1404	if (ipp->ipp_use_ah)
1405		req->ipsr_ah_req |= IPSEC_PREF_REQUIRED;
1406	if (ipp->ipp_use_esp)
1407		req->ipsr_esp_req |= IPSEC_PREF_REQUIRED;
1408	return (sizeof (*req));
1409}
1410
1411/*
1412 * Convert a new-style action back to an ipsec_req_t (more backwards compat).
1413 * We assume caller has already zero'ed *req for us.
1414 */
1415static int
1416ipsec_req_from_act(ipsec_action_t *ap, ipsec_req_t *req)
1417{
1418	switch (ap->ipa_act.ipa_type) {
1419	case IPSEC_ACT_BYPASS:
1420		req->ipsr_ah_req = IPSEC_PREF_NEVER;
1421		req->ipsr_esp_req = IPSEC_PREF_NEVER;
1422		return (sizeof (*req));
1423	case IPSEC_ACT_APPLY:
1424		return (ipsec_req_from_prot(&ap->ipa_act.ipa_apply, req));
1425	}
1426	return (sizeof (*req));
1427}
1428
1429/*
1430 * Convert a new-style action back to an ipsec_req_t (more backwards compat).
1431 * We assume caller has already zero'ed *req for us.
1432 */
1433int
1434ipsec_req_from_head(ipsec_policy_head_t *ph, ipsec_req_t *req, int af)
1435{
1436	ipsec_policy_t *p;
1437
1438	/*
1439	 * FULL-PERSOCK: consult hash table, too?
1440	 */
1441	for (p = ph->iph_root[IPSEC_INBOUND].ipr_nonhash[af];
1442	    p != NULL;
1443	    p = p->ipsp_hash.hash_next) {
1444		if ((p->ipsp_sel->ipsl_key.ipsl_valid & IPSL_WILDCARD) == 0)
1445			return (ipsec_req_from_act(p->ipsp_act, req));
1446	}
1447	return (sizeof (*req));
1448}
1449
1450/*
1451 * Based on per-socket or latched policy, convert to an appropriate
1452 * IP_SEC_OPT ipsec_req_t for the socket option; return size so we can
1453 * be tail-called from ip.
1454 */
1455int
1456ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af)
1457{
1458	ipsec_latch_t *ipl;
1459	int rv = sizeof (ipsec_req_t);
1460
1461	bzero(req, sizeof (*req));
1462
1463	ASSERT(MUTEX_HELD(&connp->conn_lock));
1464	ipl = connp->conn_latch;
1465
1466	/*
1467	 * Find appropriate policy.  First choice is latched action;
1468	 * failing that, see latched policy; failing that,
1469	 * look at configured policy.
1470	 */
1471	if (ipl != NULL) {
1472		if (connp->conn_latch_in_action != NULL) {
1473			rv = ipsec_req_from_act(connp->conn_latch_in_action,
1474			    req);
1475			goto done;
1476		}
1477		if (connp->conn_latch_in_policy != NULL) {
1478			rv = ipsec_req_from_act(
1479			    connp->conn_latch_in_policy->ipsp_act, req);
1480			goto done;
1481		}
1482	}
1483	if (connp->conn_policy != NULL)
1484		rv = ipsec_req_from_head(connp->conn_policy, req, af);
1485done:
1486	return (rv);
1487}
1488
1489void
1490ipsec_actvec_free(ipsec_act_t *act, uint_t nact)
1491{
1492	kmem_free(act, nact * sizeof (*act));
1493}
1494
1495/*
1496 * Consumes a reference to ipsp.
1497 */
1498static mblk_t *
1499ipsec_check_loopback_policy(mblk_t *data_mp, ip_recv_attr_t *ira,
1500    ipsec_policy_t *ipsp)
1501{
1502	if (!(ira->ira_flags & IRAF_IPSEC_SECURE))
1503		return (data_mp);
1504
1505	ASSERT(ira->ira_flags & IRAF_LOOPBACK);
1506
1507	IPPOL_REFRELE(ipsp);
1508
1509	/*
1510	 * We should do an actual policy check here.  Revisit this
1511	 * when we revisit the IPsec API.  (And pass a conn_t in when we
1512	 * get there.)
1513	 */
1514
1515	return (data_mp);
1516}
1517
1518/*
1519 * Check that packet's inbound ports & proto match the selectors
1520 * expected by the SAs it traversed on the way in.
1521 */
1522static boolean_t
1523ipsec_check_ipsecin_unique(ip_recv_attr_t *ira, const char **reason,
1524    kstat_named_t **counter, uint64_t pkt_unique, netstack_t *ns)
1525{
1526	uint64_t ah_mask, esp_mask;
1527	ipsa_t *ah_assoc;
1528	ipsa_t *esp_assoc;
1529	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1530
1531	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1532	ASSERT(!(ira->ira_flags & IRAF_LOOPBACK));
1533
1534	ah_assoc = ira->ira_ipsec_ah_sa;
1535	esp_assoc = ira->ira_ipsec_esp_sa;
1536	ASSERT((ah_assoc != NULL) || (esp_assoc != NULL));
1537
1538	ah_mask = (ah_assoc != NULL) ? ah_assoc->ipsa_unique_mask : 0;
1539	esp_mask = (esp_assoc != NULL) ? esp_assoc->ipsa_unique_mask : 0;
1540
1541	if ((ah_mask == 0) && (esp_mask == 0))
1542		return (B_TRUE);
1543
1544	/*
1545	 * The pkt_unique check will also check for tunnel mode on the SA
1546	 * vs. the tunneled_packet boolean.  "Be liberal in what you receive"
1547	 * should not apply in this case.  ;)
1548	 */
1549
1550	if (ah_mask != 0 &&
1551	    ah_assoc->ipsa_unique_id != (pkt_unique & ah_mask)) {
1552		*reason = "AH inner header mismatch";
1553		*counter = DROPPER(ipss, ipds_spd_ah_innermismatch);
1554		return (B_FALSE);
1555	}
1556	if (esp_mask != 0 &&
1557	    esp_assoc->ipsa_unique_id != (pkt_unique & esp_mask)) {
1558		*reason = "ESP inner header mismatch";
1559		*counter = DROPPER(ipss, ipds_spd_esp_innermismatch);
1560		return (B_FALSE);
1561	}
1562	return (B_TRUE);
1563}
1564
1565static boolean_t
1566ipsec_check_ipsecin_action(ip_recv_attr_t *ira, mblk_t *mp, ipsec_action_t *ap,
1567    ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter,
1568    netstack_t *ns)
1569{
1570	boolean_t ret = B_TRUE;
1571	ipsec_prot_t *ipp;
1572	ipsa_t *ah_assoc;
1573	ipsa_t *esp_assoc;
1574	boolean_t decaps;
1575	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1576
1577	ASSERT((ipha == NULL && ip6h != NULL) ||
1578	    (ip6h == NULL && ipha != NULL));
1579
1580	if (ira->ira_flags & IRAF_LOOPBACK) {
1581		/*
1582		 * Besides accepting pointer-equivalent actions, we also
1583		 * accept any ICMP errors we generated for ourselves,
1584		 * regardless of policy.  If we do not wish to make this
1585		 * assumption in the future, check here, and where
1586		 * IXAF_TRUSTED_ICMP is initialized in ip.c and ip6.c.
1587		 */
1588		if (ap == ira->ira_ipsec_action ||
1589		    (ira->ira_flags & IRAF_TRUSTED_ICMP))
1590			return (B_TRUE);
1591
1592		/* Deep compare necessary here?? */
1593		*counter = DROPPER(ipss, ipds_spd_loopback_mismatch);
1594		*reason = "loopback policy mismatch";
1595		return (B_FALSE);
1596	}
1597	ASSERT(!(ira->ira_flags & IRAF_TRUSTED_ICMP));
1598	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1599
1600	ah_assoc = ira->ira_ipsec_ah_sa;
1601	esp_assoc = ira->ira_ipsec_esp_sa;
1602
1603	decaps = (ira->ira_flags & IRAF_IPSEC_DECAPS);
1604
1605	switch (ap->ipa_act.ipa_type) {
1606	case IPSEC_ACT_DISCARD:
1607	case IPSEC_ACT_REJECT:
1608		/* Should "fail hard" */
1609		*counter = DROPPER(ipss, ipds_spd_explicit);
1610		*reason = "blocked by policy";
1611		return (B_FALSE);
1612
1613	case IPSEC_ACT_BYPASS:
1614	case IPSEC_ACT_CLEAR:
1615		*counter = DROPPER(ipss, ipds_spd_got_secure);
1616		*reason = "expected clear, got protected";
1617		return (B_FALSE);
1618
1619	case IPSEC_ACT_APPLY:
1620		ipp = &ap->ipa_act.ipa_apply;
1621		/*
1622		 * As of now we do the simple checks of whether
1623		 * the datagram has gone through the required IPSEC
1624		 * protocol constraints or not. We might have more
1625		 * in the future like sensitive levels, key bits, etc.
1626		 * If it fails the constraints, check whether we would
1627		 * have accepted this if it had come in clear.
1628		 */
1629		if (ipp->ipp_use_ah) {
1630			if (ah_assoc == NULL) {
1631				ret = ipsec_inbound_accept_clear(mp, ipha,
1632				    ip6h);
1633				*counter = DROPPER(ipss, ipds_spd_got_clear);
1634				*reason = "unprotected not accepted";
1635				break;
1636			}
1637			ASSERT(ah_assoc != NULL);
1638			ASSERT(ipp->ipp_auth_alg != 0);
1639
1640			if (ah_assoc->ipsa_auth_alg !=
1641			    ipp->ipp_auth_alg) {
1642				*counter = DROPPER(ipss, ipds_spd_bad_ahalg);
1643				*reason = "unacceptable ah alg";
1644				ret = B_FALSE;
1645				break;
1646			}
1647		} else if (ah_assoc != NULL) {
1648			/*
1649			 * Don't allow this. Check IPSEC NOTE above
1650			 * ip_fanout_proto().
1651			 */
1652			*counter = DROPPER(ipss, ipds_spd_got_ah);
1653			*reason = "unexpected AH";
1654			ret = B_FALSE;
1655			break;
1656		}
1657		if (ipp->ipp_use_esp) {
1658			if (esp_assoc == NULL) {
1659				ret = ipsec_inbound_accept_clear(mp, ipha,
1660				    ip6h);
1661				*counter = DROPPER(ipss, ipds_spd_got_clear);
1662				*reason = "unprotected not accepted";
1663				break;
1664			}
1665			ASSERT(esp_assoc != NULL);
1666			ASSERT(ipp->ipp_encr_alg != 0);
1667
1668			if (esp_assoc->ipsa_encr_alg !=
1669			    ipp->ipp_encr_alg) {
1670				*counter = DROPPER(ipss, ipds_spd_bad_espealg);
1671				*reason = "unacceptable esp alg";
1672				ret = B_FALSE;
1673				break;
1674			}
1675			/*
1676			 * If the client does not need authentication,
1677			 * we don't verify the alogrithm.
1678			 */
1679			if (ipp->ipp_use_espa) {
1680				if (esp_assoc->ipsa_auth_alg !=
1681				    ipp->ipp_esp_auth_alg) {
1682					*counter = DROPPER(ipss,
1683					    ipds_spd_bad_espaalg);
1684					*reason = "unacceptable esp auth alg";
1685					ret = B_FALSE;
1686					break;
1687				}
1688			}
1689		} else if (esp_assoc != NULL) {
1690			/*
1691			 * Don't allow this. Check IPSEC NOTE above
1692			 * ip_fanout_proto().
1693			 */
1694			*counter = DROPPER(ipss, ipds_spd_got_esp);
1695			*reason = "unexpected ESP";
1696			ret = B_FALSE;
1697			break;
1698		}
1699		if (ipp->ipp_use_se) {
1700			if (!decaps) {
1701				ret = ipsec_inbound_accept_clear(mp, ipha,
1702				    ip6h);
1703				if (!ret) {
1704					/* XXX mutant? */
1705					*counter = DROPPER(ipss,
1706					    ipds_spd_bad_selfencap);
1707					*reason = "self encap not found";
1708					break;
1709				}
1710			}
1711		} else if (decaps) {
1712			/*
1713			 * XXX If the packet comes in tunneled and the
1714			 * recipient does not expect it to be tunneled, it
1715			 * is okay. But we drop to be consistent with the
1716			 * other cases.
1717			 */
1718			*counter = DROPPER(ipss, ipds_spd_got_selfencap);
1719			*reason = "unexpected self encap";
1720			ret = B_FALSE;
1721			break;
1722		}
1723		if (ira->ira_ipsec_action != NULL) {
1724			/*
1725			 * This can happen if we do a double policy-check on
1726			 * a packet
1727			 * XXX XXX should fix this case!
1728			 */
1729			IPACT_REFRELE(ira->ira_ipsec_action);
1730		}
1731		ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1732		ASSERT(ira->ira_ipsec_action == NULL);
1733		IPACT_REFHOLD(ap);
1734		ira->ira_ipsec_action = ap;
1735		break;	/* from switch */
1736	}
1737	return (ret);
1738}
1739
1740static boolean_t
1741spd_match_inbound_ids(ipsec_latch_t *ipl, ipsa_t *sa)
1742{
1743	ASSERT(ipl->ipl_ids_latched == B_TRUE);
1744	return ipsid_equal(ipl->ipl_remote_cid, sa->ipsa_src_cid) &&
1745	    ipsid_equal(ipl->ipl_local_cid, sa->ipsa_dst_cid);
1746}
1747
1748/*
1749 * Takes a latched conn and an inbound packet and returns a unique_id suitable
1750 * for SA comparisons.  Most of the time we will copy from the conn_t, but
1751 * there are cases when the conn_t is latched but it has wildcard selectors,
1752 * and then we need to fallback to scooping them out of the packet.
1753 *
1754 * Assume we'll never have 0 with a conn_t present, so use 0 as a failure.  We
1755 * can get away with this because we only have non-zero ports/proto for
1756 * latched conn_ts.
1757 *
1758 * Ideal candidate for an "inline" keyword, as we're JUST convoluted enough
1759 * to not be a nice macro.
1760 */
1761static uint64_t
1762conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h)
1763{
1764	ipsec_selector_t sel;
1765	uint8_t ulp = connp->conn_proto;
1766
1767	ASSERT(connp->conn_latch_in_policy != NULL);
1768
1769	if ((ulp == IPPROTO_TCP || ulp == IPPROTO_UDP || ulp == IPPROTO_SCTP) &&
1770	    (connp->conn_fport == 0 || connp->conn_lport == 0)) {
1771		/* Slow path - we gotta grab from the packet. */
1772		if (ipsec_init_inbound_sel(&sel, data_mp, ipha, ip6h,
1773		    SEL_NONE) != SELRET_SUCCESS) {
1774			/* Failure -> have caller free packet with ENOMEM. */
1775			return (0);
1776		}
1777		return (SA_UNIQUE_ID(sel.ips_remote_port, sel.ips_local_port,
1778		    sel.ips_protocol, 0));
1779	}
1780
1781#ifdef DEBUG_NOT_UNTIL_6478464
1782	if (ipsec_init_inbound_sel(&sel, data_mp, ipha, ip6h, SEL_NONE) ==
1783	    SELRET_SUCCESS) {
1784		ASSERT(sel.ips_local_port == connp->conn_lport);
1785		ASSERT(sel.ips_remote_port == connp->conn_fport);
1786		ASSERT(sel.ips_protocol == connp->conn_proto);
1787	}
1788	ASSERT(connp->conn_proto != 0);
1789#endif
1790
1791	return (SA_UNIQUE_ID(connp->conn_fport, connp->conn_lport, ulp, 0));
1792}
1793
1794/*
1795 * Called to check policy on a latched connection.
1796 * Note that we don't dereference conn_latch or conn_ihere since the conn might
1797 * be closing. The caller passes a held ipsec_latch_t instead.
1798 */
1799static boolean_t
1800ipsec_check_ipsecin_latch(ip_recv_attr_t *ira, mblk_t *mp, ipsec_latch_t *ipl,
1801    ipsec_action_t *ap, ipha_t *ipha, ip6_t *ip6h, const char **reason,
1802    kstat_named_t **counter, conn_t *connp, netstack_t *ns)
1803{
1804	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1805
1806	ASSERT(ipl->ipl_ids_latched == B_TRUE);
1807	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1808
1809	if (!(ira->ira_flags & IRAF_LOOPBACK)) {
1810		/*
1811		 * Over loopback, there aren't real security associations,
1812		 * so there are neither identities nor "unique" values
1813		 * for us to check the packet against.
1814		 */
1815		if (ira->ira_ipsec_ah_sa != NULL) {
1816			if (!spd_match_inbound_ids(ipl,
1817			    ira->ira_ipsec_ah_sa)) {
1818				*counter = DROPPER(ipss, ipds_spd_ah_badid);
1819				*reason = "AH identity mismatch";
1820				return (B_FALSE);
1821			}
1822		}
1823
1824		if (ira->ira_ipsec_esp_sa != NULL) {
1825			if (!spd_match_inbound_ids(ipl,
1826			    ira->ira_ipsec_esp_sa)) {
1827				*counter = DROPPER(ipss, ipds_spd_esp_badid);
1828				*reason = "ESP identity mismatch";
1829				return (B_FALSE);
1830			}
1831		}
1832
1833		/*
1834		 * Can fudge pkt_unique from connp because we're latched.
1835		 * In DEBUG kernels (see conn_to_unique()'s implementation),
1836		 * verify this even if it REALLY slows things down.
1837		 */
1838		if (!ipsec_check_ipsecin_unique(ira, reason, counter,
1839		    conn_to_unique(connp, mp, ipha, ip6h), ns)) {
1840			return (B_FALSE);
1841		}
1842	}
1843	return (ipsec_check_ipsecin_action(ira, mp, ap, ipha, ip6h, reason,
1844	    counter, ns));
1845}
1846
1847/*
1848 * Check to see whether this secured datagram meets the policy
1849 * constraints specified in ipsp.
1850 *
1851 * Called from ipsec_check_global_policy, and ipsec_check_inbound_policy.
1852 *
1853 * Consumes a reference to ipsp.
1854 * Returns the mblk if ok.
1855 */
1856static mblk_t *
1857ipsec_check_ipsecin_policy(mblk_t *data_mp, ipsec_policy_t *ipsp,
1858    ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, ip_recv_attr_t *ira,
1859    netstack_t *ns)
1860{
1861	ipsec_action_t *ap;
1862	const char *reason = "no policy actions found";
1863	ip_stack_t	*ipst = ns->netstack_ip;
1864	ipsec_stack_t	*ipss = ns->netstack_ipsec;
1865	kstat_named_t *counter;
1866
1867	counter = DROPPER(ipss, ipds_spd_got_secure);
1868
1869	ASSERT(ipsp != NULL);
1870
1871	ASSERT((ipha == NULL && ip6h != NULL) ||
1872	    (ip6h == NULL && ipha != NULL));
1873
1874	if (ira->ira_flags & IRAF_LOOPBACK)
1875		return (ipsec_check_loopback_policy(data_mp, ira, ipsp));
1876
1877	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
1878
1879	if (ira->ira_ipsec_action != NULL) {
1880		/*
1881		 * this can happen if we do a double policy-check on a packet
1882		 * Would be nice to be able to delete this test..
1883		 */
1884		IPACT_REFRELE(ira->ira_ipsec_action);
1885	}
1886	ASSERT(ira->ira_ipsec_action == NULL);
1887
1888	if (!SA_IDS_MATCH(ira->ira_ipsec_ah_sa, ira->ira_ipsec_esp_sa)) {
1889		reason = "inbound AH and ESP identities differ";
1890		counter = DROPPER(ipss, ipds_spd_ahesp_diffid);
1891		goto drop;
1892	}
1893
1894	if (!ipsec_check_ipsecin_unique(ira, &reason, &counter, pkt_unique,
1895	    ns))
1896		goto drop;
1897
1898	/*
1899	 * Ok, now loop through the possible actions and see if any
1900	 * of them work for us.
1901	 */
1902
1903	for (ap = ipsp->ipsp_act; ap != NULL; ap = ap->ipa_next) {
1904		if (ipsec_check_ipsecin_action(ira, data_mp, ap,
1905		    ipha, ip6h, &reason, &counter, ns)) {
1906			BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
1907			IPPOL_REFRELE(ipsp);
1908			return (data_mp);
1909		}
1910	}
1911drop:
1912	ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
1913	    "ipsec inbound policy mismatch: %s, packet dropped\n",
1914	    reason);
1915	IPPOL_REFRELE(ipsp);
1916	ASSERT(ira->ira_ipsec_action == NULL);
1917	BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
1918	ip_drop_packet(data_mp, B_TRUE, NULL, counter,
1919	    &ipss->ipsec_spd_dropper);
1920	return (NULL);
1921}
1922
1923/*
1924 * sleazy prefix-length-based compare.
1925 * another inlining candidate..
1926 */
1927boolean_t
1928ip_addr_match(uint8_t *addr1, int pfxlen, in6_addr_t *addr2p)
1929{
1930	int offset = pfxlen>>3;
1931	int bitsleft = pfxlen & 7;
1932	uint8_t *addr2 = (uint8_t *)addr2p;
1933
1934	/*
1935	 * and there was much evil..
1936	 * XXX should inline-expand the bcmp here and do this 32 bits
1937	 * or 64 bits at a time..
1938	 */
1939	return ((bcmp(addr1, addr2, offset) == 0) &&
1940	    ((bitsleft == 0) ||
1941	    (((addr1[offset] ^ addr2[offset]) & (0xff<<(8-bitsleft))) == 0)));
1942}
1943
1944static ipsec_policy_t *
1945ipsec_find_policy_chain(ipsec_policy_t *best, ipsec_policy_t *chain,
1946    ipsec_selector_t *sel, boolean_t is_icmp_inv_acq)
1947{
1948	ipsec_selkey_t *isel;
1949	ipsec_policy_t *p;
1950	int bpri = best ? best->ipsp_prio : 0;
1951
1952	for (p = chain; p != NULL; p = p->ipsp_hash.hash_next) {
1953		uint32_t valid;
1954
1955		if (p->ipsp_prio <= bpri)
1956			continue;
1957		isel = &p->ipsp_sel->ipsl_key;
1958		valid = isel->ipsl_valid;
1959
1960		if ((valid & IPSL_PROTOCOL) &&
1961		    (isel->ipsl_proto != sel->ips_protocol))
1962			continue;
1963
1964		if ((valid & IPSL_REMOTE_ADDR) &&
1965		    !ip_addr_match((uint8_t *)&isel->ipsl_remote,
1966		    isel->ipsl_remote_pfxlen, &sel->ips_remote_addr_v6))
1967			continue;
1968
1969		if ((valid & IPSL_LOCAL_ADDR) &&
1970		    !ip_addr_match((uint8_t *)&isel->ipsl_local,
1971		    isel->ipsl_local_pfxlen, &sel->ips_local_addr_v6))
1972			continue;
1973
1974		if ((valid & IPSL_REMOTE_PORT) &&
1975		    isel->ipsl_rport != sel->ips_remote_port)
1976			continue;
1977
1978		if ((valid & IPSL_LOCAL_PORT) &&
1979		    isel->ipsl_lport != sel->ips_local_port)
1980			continue;
1981
1982		if (!is_icmp_inv_acq) {
1983			if ((valid & IPSL_ICMP_TYPE) &&
1984			    (isel->ipsl_icmp_type > sel->ips_icmp_type ||
1985			    isel->ipsl_icmp_type_end < sel->ips_icmp_type)) {
1986				continue;
1987			}
1988
1989			if ((valid & IPSL_ICMP_CODE) &&
1990			    (isel->ipsl_icmp_code > sel->ips_icmp_code ||
1991			    isel->ipsl_icmp_code_end <
1992			    sel->ips_icmp_code)) {
1993				continue;
1994			}
1995		} else {
1996			/*
1997			 * special case for icmp inverse acquire
1998			 * we only want policies that aren't drop/pass
1999			 */
2000			if (p->ipsp_act->ipa_act.ipa_type != IPSEC_ACT_APPLY)
2001				continue;
2002		}
2003
2004		/* we matched all the packet-port-field selectors! */
2005		best = p;
2006		bpri = p->ipsp_prio;
2007	}
2008
2009	return (best);
2010}
2011
2012/*
2013 * Try to find and return the best policy entry under a given policy
2014 * root for a given set of selectors; the first parameter "best" is
2015 * the current best policy so far.  If "best" is non-null, we have a
2016 * reference to it.  We return a reference to a policy; if that policy
2017 * is not the original "best", we need to release that reference
2018 * before returning.
2019 */
2020ipsec_policy_t *
2021ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head,
2022    int direction, ipsec_selector_t *sel)
2023{
2024	ipsec_policy_t *curbest;
2025	ipsec_policy_root_t *root;
2026	uint8_t is_icmp_inv_acq = sel->ips_is_icmp_inv_acq;
2027	int af = sel->ips_isv4 ? IPSEC_AF_V4 : IPSEC_AF_V6;
2028
2029	curbest = best;
2030	root = &head->iph_root[direction];
2031
2032#ifdef DEBUG
2033	if (is_icmp_inv_acq) {
2034		if (sel->ips_isv4) {
2035			if (sel->ips_protocol != IPPROTO_ICMP) {
2036				cmn_err(CE_WARN, "ipsec_find_policy_head:"
2037				    " expecting icmp, got %d",
2038				    sel->ips_protocol);
2039			}
2040		} else {
2041			if (sel->ips_protocol != IPPROTO_ICMPV6) {
2042				cmn_err(CE_WARN, "ipsec_find_policy_head:"
2043				    " expecting icmpv6, got %d",
2044				    sel->ips_protocol);
2045			}
2046		}
2047	}
2048#endif
2049
2050	rw_enter(&head->iph_lock, RW_READER);
2051
2052	if (root->ipr_nchains > 0) {
2053		curbest = ipsec_find_policy_chain(curbest,
2054		    root->ipr_hash[selector_hash(sel, root)].hash_head, sel,
2055		    is_icmp_inv_acq);
2056	}
2057	curbest = ipsec_find_policy_chain(curbest, root->ipr_nonhash[af], sel,
2058	    is_icmp_inv_acq);
2059
2060	/*
2061	 * Adjust reference counts if we found anything new.
2062	 */
2063	if (curbest != best) {
2064		ASSERT(curbest != NULL);
2065		IPPOL_REFHOLD(curbest);
2066
2067		if (best != NULL) {
2068			IPPOL_REFRELE(best);
2069		}
2070	}
2071
2072	rw_exit(&head->iph_lock);
2073
2074	return (curbest);
2075}
2076
2077/*
2078 * Find the best system policy (either global or per-interface) which
2079 * applies to the given selector; look in all the relevant policy roots
2080 * to figure out which policy wins.
2081 *
2082 * Returns a reference to a policy; caller must release this
2083 * reference when done.
2084 */
2085ipsec_policy_t *
2086ipsec_find_policy(int direction, const conn_t *connp, ipsec_selector_t *sel,
2087    netstack_t *ns)
2088{
2089	ipsec_policy_t *p;
2090	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2091
2092	p = ipsec_find_policy_head(NULL, &ipss->ipsec_system_policy,
2093	    direction, sel);
2094	if ((connp != NULL) && (connp->conn_policy != NULL)) {
2095		p = ipsec_find_policy_head(p, connp->conn_policy,
2096		    direction, sel);
2097	}
2098
2099	return (p);
2100}
2101
2102/*
2103 * Check with global policy and see whether this inbound
2104 * packet meets the policy constraints.
2105 *
2106 * Locate appropriate policy from global policy, supplemented by the
2107 * conn's configured and/or cached policy if the conn is supplied.
2108 *
2109 * Dispatch to ipsec_check_ipsecin_policy if we have policy and an
2110 * encrypted packet to see if they match.
2111 *
2112 * Otherwise, see if the policy allows cleartext; if not, drop it on the
2113 * floor.
2114 */
2115mblk_t *
2116ipsec_check_global_policy(mblk_t *data_mp, conn_t *connp,
2117    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, netstack_t *ns)
2118{
2119	ipsec_policy_t *p;
2120	ipsec_selector_t sel;
2121	boolean_t policy_present;
2122	kstat_named_t *counter;
2123	uint64_t pkt_unique;
2124	ip_stack_t	*ipst = ns->netstack_ip;
2125	ipsec_stack_t	*ipss = ns->netstack_ipsec;
2126
2127	sel.ips_is_icmp_inv_acq = 0;
2128
2129	ASSERT((ipha == NULL && ip6h != NULL) ||
2130	    (ip6h == NULL && ipha != NULL));
2131
2132	if (ipha != NULL)
2133		policy_present = ipss->ipsec_inbound_v4_policy_present;
2134	else
2135		policy_present = ipss->ipsec_inbound_v6_policy_present;
2136
2137	if (!policy_present && connp == NULL) {
2138		/*
2139		 * No global policy and no per-socket policy;
2140		 * just pass it back (but we shouldn't get here in that case)
2141		 */
2142		return (data_mp);
2143	}
2144
2145	/*
2146	 * If we have cached policy, use it.
2147	 * Otherwise consult system policy.
2148	 */
2149	if ((connp != NULL) && (connp->conn_latch != NULL)) {
2150		p = connp->conn_latch_in_policy;
2151		if (p != NULL) {
2152			IPPOL_REFHOLD(p);
2153		}
2154		/*
2155		 * Fudge sel for UNIQUE_ID setting below.
2156		 */
2157		pkt_unique = conn_to_unique(connp, data_mp, ipha, ip6h);
2158	} else {
2159		/* Initialize the ports in the selector */
2160		if (ipsec_init_inbound_sel(&sel, data_mp, ipha, ip6h,
2161		    SEL_NONE) == SELRET_NOMEM) {
2162			/*
2163			 * Technically not a policy mismatch, but it is
2164			 * an internal failure.
2165			 */
2166			ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH,
2167			    "ipsec_init_inbound_sel", ipha, ip6h, B_TRUE, ns);
2168			counter = DROPPER(ipss, ipds_spd_nomem);
2169			goto fail;
2170		}
2171
2172		/*
2173		 * Find the policy which best applies.
2174		 *
2175		 * If we find global policy, we should look at both
2176		 * local policy and global policy and see which is
2177		 * stronger and match accordingly.
2178		 *
2179		 * If we don't find a global policy, check with
2180		 * local policy alone.
2181		 */
2182
2183		p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns);
2184		pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port,
2185		    sel.ips_local_port, sel.ips_protocol, 0);
2186	}
2187
2188	if (p == NULL) {
2189		if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
2190			/*
2191			 * We have no policy; default to succeeding.
2192			 * XXX paranoid system design doesn't do this.
2193			 */
2194			BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2195			return (data_mp);
2196		} else {
2197			counter = DROPPER(ipss, ipds_spd_got_secure);
2198			ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED,
2199			    "ipsec_check_global_policy", ipha, ip6h, B_TRUE,
2200			    ns);
2201			goto fail;
2202		}
2203	}
2204	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2205		return (ipsec_check_ipsecin_policy(data_mp, p, ipha, ip6h,
2206		    pkt_unique, ira, ns));
2207	}
2208	if (p->ipsp_act->ipa_allow_clear) {
2209		BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2210		IPPOL_REFRELE(p);
2211		return (data_mp);
2212	}
2213	IPPOL_REFRELE(p);
2214	/*
2215	 * If we reach here, we will drop the packet because it failed the
2216	 * global policy check because the packet was cleartext, and it
2217	 * should not have been.
2218	 */
2219	ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH,
2220	    "ipsec_check_global_policy", ipha, ip6h, B_FALSE, ns);
2221	counter = DROPPER(ipss, ipds_spd_got_clear);
2222
2223fail:
2224	ip_drop_packet(data_mp, B_TRUE, NULL, counter,
2225	    &ipss->ipsec_spd_dropper);
2226	BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2227	return (NULL);
2228}
2229
2230/*
2231 * We check whether an inbound datagram is a valid one
2232 * to accept in clear. If it is secure, it is the job
2233 * of IPSEC to log information appropriately if it
2234 * suspects that it may not be the real one.
2235 *
2236 * It is called only while fanning out to the ULP
2237 * where ULP accepts only secure data and the incoming
2238 * is clear. Usually we never accept clear datagrams in
2239 * such cases. ICMP is the only exception.
2240 *
2241 * NOTE : We don't call this function if the client (ULP)
2242 * is willing to accept things in clear.
2243 */
2244boolean_t
2245ipsec_inbound_accept_clear(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2246{
2247	ushort_t iph_hdr_length;
2248	icmph_t *icmph;
2249	icmp6_t *icmp6;
2250	uint8_t *nexthdrp;
2251
2252	ASSERT((ipha != NULL && ip6h == NULL) ||
2253	    (ipha == NULL && ip6h != NULL));
2254
2255	if (ip6h != NULL) {
2256		iph_hdr_length = ip_hdr_length_v6(mp, ip6h);
2257		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length,
2258		    &nexthdrp)) {
2259			return (B_FALSE);
2260		}
2261		if (*nexthdrp != IPPROTO_ICMPV6)
2262			return (B_FALSE);
2263		icmp6 = (icmp6_t *)(&mp->b_rptr[iph_hdr_length]);
2264		/* Match IPv6 ICMP policy as closely as IPv4 as possible. */
2265		switch (icmp6->icmp6_type) {
2266		case ICMP6_PARAM_PROB:
2267			/* Corresponds to port/proto unreach in IPv4. */
2268		case ICMP6_ECHO_REQUEST:
2269			/* Just like IPv4. */
2270			return (B_FALSE);
2271
2272		case MLD_LISTENER_QUERY:
2273		case MLD_LISTENER_REPORT:
2274		case MLD_LISTENER_REDUCTION:
2275			/*
2276			 * XXX Seperate NDD in IPv4 what about here?
2277			 * Plus, mcast is important to ND.
2278			 */
2279		case ICMP6_DST_UNREACH:
2280			/* Corresponds to HOST/NET unreachable in IPv4. */
2281		case ICMP6_PACKET_TOO_BIG:
2282		case ICMP6_ECHO_REPLY:
2283			/* These are trusted in IPv4. */
2284		case ND_ROUTER_SOLICIT:
2285		case ND_ROUTER_ADVERT:
2286		case ND_NEIGHBOR_SOLICIT:
2287		case ND_NEIGHBOR_ADVERT:
2288		case ND_REDIRECT:
2289			/* Trust ND messages for now. */
2290		case ICMP6_TIME_EXCEEDED:
2291		default:
2292			return (B_TRUE);
2293		}
2294	} else {
2295		/*
2296		 * If it is not ICMP, fail this request.
2297		 */
2298		if (ipha->ipha_protocol != IPPROTO_ICMP) {
2299#ifdef FRAGCACHE_DEBUG
2300			cmn_err(CE_WARN, "Dropping - ipha_proto = %d\n",
2301			    ipha->ipha_protocol);
2302#endif
2303			return (B_FALSE);
2304		}
2305		iph_hdr_length = IPH_HDR_LENGTH(ipha);
2306		icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
2307		/*
2308		 * It is an insecure icmp message. Check to see whether we are
2309		 * willing to accept this one.
2310		 */
2311
2312		switch (icmph->icmph_type) {
2313		case ICMP_ECHO_REPLY:
2314		case ICMP_TIME_STAMP_REPLY:
2315		case ICMP_INFO_REPLY:
2316		case ICMP_ROUTER_ADVERTISEMENT:
2317			/*
2318			 * We should not encourage clear replies if this
2319			 * client expects secure. If somebody is replying
2320			 * in clear some mailicious user watching both the
2321			 * request and reply, can do chosen-plain-text attacks.
2322			 * With global policy we might be just expecting secure
2323			 * but sending out clear. We don't know what the right
2324			 * thing is. We can't do much here as we can't control
2325			 * the sender here. Till we are sure of what to do,
2326			 * accept them.
2327			 */
2328			return (B_TRUE);
2329		case ICMP_ECHO_REQUEST:
2330		case ICMP_TIME_STAMP_REQUEST:
2331		case ICMP_INFO_REQUEST:
2332		case ICMP_ADDRESS_MASK_REQUEST:
2333		case ICMP_ROUTER_SOLICITATION:
2334		case ICMP_ADDRESS_MASK_REPLY:
2335			/*
2336			 * Don't accept this as somebody could be sending
2337			 * us plain text to get encrypted data. If we reply,
2338			 * it will lead to chosen plain text attack.
2339			 */
2340			return (B_FALSE);
2341		case ICMP_DEST_UNREACHABLE:
2342			switch (icmph->icmph_code) {
2343			case ICMP_FRAGMENTATION_NEEDED:
2344				/*
2345				 * Be in sync with icmp_inbound, where we have
2346				 * already set dce_pmtu
2347				 */
2348#ifdef FRAGCACHE_DEBUG
2349			cmn_err(CE_WARN, "ICMP frag needed\n");
2350#endif
2351				return (B_TRUE);
2352			case ICMP_HOST_UNREACHABLE:
2353			case ICMP_NET_UNREACHABLE:
2354				/*
2355				 * By accepting, we could reset a connection.
2356				 * How do we solve the problem of some
2357				 * intermediate router sending in-secure ICMP
2358				 * messages ?
2359				 */
2360				return (B_TRUE);
2361			case ICMP_PORT_UNREACHABLE:
2362			case ICMP_PROTOCOL_UNREACHABLE:
2363			default :
2364				return (B_FALSE);
2365			}
2366		case ICMP_SOURCE_QUENCH:
2367			/*
2368			 * If this is an attack, TCP will slow start
2369			 * because of this. Is it very harmful ?
2370			 */
2371			return (B_TRUE);
2372		case ICMP_PARAM_PROBLEM:
2373			return (B_FALSE);
2374		case ICMP_TIME_EXCEEDED:
2375			return (B_TRUE);
2376		case ICMP_REDIRECT:
2377			return (B_FALSE);
2378		default :
2379			return (B_FALSE);
2380		}
2381	}
2382}
2383
2384void
2385ipsec_latch_ids(ipsec_latch_t *ipl, ipsid_t *local, ipsid_t *remote)
2386{
2387	mutex_enter(&ipl->ipl_lock);
2388
2389	if (ipl->ipl_ids_latched) {
2390		/* I lost, someone else got here before me */
2391		mutex_exit(&ipl->ipl_lock);
2392		return;
2393	}
2394
2395	if (local != NULL)
2396		IPSID_REFHOLD(local);
2397	if (remote != NULL)
2398		IPSID_REFHOLD(remote);
2399
2400	ipl->ipl_local_cid = local;
2401	ipl->ipl_remote_cid = remote;
2402	ipl->ipl_ids_latched = B_TRUE;
2403	mutex_exit(&ipl->ipl_lock);
2404}
2405
2406void
2407ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira)
2408{
2409	ipsa_t *sa;
2410	ipsec_latch_t *ipl = connp->conn_latch;
2411
2412	if (!ipl->ipl_ids_latched) {
2413		ipsid_t *local = NULL;
2414		ipsid_t *remote = NULL;
2415
2416		if (!(ira->ira_flags & IRAF_LOOPBACK)) {
2417			ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2418			if (ira->ira_ipsec_esp_sa != NULL)
2419				sa = ira->ira_ipsec_esp_sa;
2420			else
2421				sa = ira->ira_ipsec_ah_sa;
2422			ASSERT(sa != NULL);
2423			local = sa->ipsa_dst_cid;
2424			remote = sa->ipsa_src_cid;
2425		}
2426		ipsec_latch_ids(ipl, local, remote);
2427	}
2428	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2429		if (connp->conn_latch_in_action != NULL) {
2430			/*
2431			 * Previously cached action.  This is probably
2432			 * harmless, but in DEBUG kernels, check for
2433			 * action equality.
2434			 *
2435			 * Preserve the existing action to preserve latch
2436			 * invariance.
2437			 */
2438			ASSERT(connp->conn_latch_in_action ==
2439			    ira->ira_ipsec_action);
2440			return;
2441		}
2442		connp->conn_latch_in_action = ira->ira_ipsec_action;
2443		IPACT_REFHOLD(connp->conn_latch_in_action);
2444	}
2445}
2446
2447/*
2448 * Check whether the policy constraints are met either for an
2449 * inbound datagram; called from IP in numerous places.
2450 *
2451 * Note that this is not a chokepoint for inbound policy checks;
2452 * see also ipsec_check_ipsecin_latch() and ipsec_check_global_policy()
2453 */
2454mblk_t *
2455ipsec_check_inbound_policy(mblk_t *mp, conn_t *connp,
2456    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira)
2457{
2458	boolean_t	ret;
2459	ipsec_latch_t	*ipl;
2460	ipsec_action_t	*ap;
2461	uint64_t	unique_id;
2462	ipsec_stack_t	*ipss;
2463	ip_stack_t	*ipst;
2464	netstack_t	*ns;
2465	ipsec_policy_head_t *policy_head;
2466	ipsec_policy_t	*p = NULL;
2467
2468	ASSERT(connp != NULL);
2469	ns = connp->conn_netstack;
2470	ipss = ns->netstack_ipsec;
2471	ipst = ns->netstack_ip;
2472
2473	if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
2474		/*
2475		 * This is the case where the incoming datagram is
2476		 * cleartext and we need to see whether this client
2477		 * would like to receive such untrustworthy things from
2478		 * the wire.
2479		 */
2480		ASSERT(mp != NULL);
2481
2482		mutex_enter(&connp->conn_lock);
2483		if (connp->conn_state_flags & CONN_CONDEMNED) {
2484			mutex_exit(&connp->conn_lock);
2485			ip_drop_packet(mp, B_TRUE, NULL,
2486			    DROPPER(ipss, ipds_spd_got_clear),
2487			    &ipss->ipsec_spd_dropper);
2488			BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2489			return (NULL);
2490		}
2491		if (connp->conn_latch != NULL) {
2492			/* Hold a reference in case the conn is closing */
2493			p = connp->conn_latch_in_policy;
2494			if (p != NULL)
2495				IPPOL_REFHOLD(p);
2496			mutex_exit(&connp->conn_lock);
2497			/*
2498			 * Policy is cached in the conn.
2499			 */
2500			if (p != NULL && !p->ipsp_act->ipa_allow_clear) {
2501				ret = ipsec_inbound_accept_clear(mp,
2502				    ipha, ip6h);
2503				if (ret) {
2504					BUMP_MIB(&ipst->ips_ip_mib,
2505					    ipsecInSucceeded);
2506					IPPOL_REFRELE(p);
2507					return (mp);
2508				} else {
2509					ipsec_log_policy_failure(
2510					    IPSEC_POLICY_MISMATCH,
2511					    "ipsec_check_inbound_policy", ipha,
2512					    ip6h, B_FALSE, ns);
2513					ip_drop_packet(mp, B_TRUE, NULL,
2514					    DROPPER(ipss, ipds_spd_got_clear),
2515					    &ipss->ipsec_spd_dropper);
2516					BUMP_MIB(&ipst->ips_ip_mib,
2517					    ipsecInFailed);
2518					IPPOL_REFRELE(p);
2519					return (NULL);
2520				}
2521			} else {
2522				BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2523				if (p != NULL)
2524					IPPOL_REFRELE(p);
2525				return (mp);
2526			}
2527		} else {
2528			policy_head = connp->conn_policy;
2529
2530			/* Hold a reference in case the conn is closing */
2531			if (policy_head != NULL)
2532				IPPH_REFHOLD(policy_head);
2533			mutex_exit(&connp->conn_lock);
2534			/*
2535			 * As this is a non-hardbound connection we need
2536			 * to look at both per-socket policy and global
2537			 * policy.
2538			 */
2539			mp = ipsec_check_global_policy(mp, connp,
2540			    ipha, ip6h, ira, ns);
2541			if (policy_head != NULL)
2542				IPPH_REFRELE(policy_head, ns);
2543			return (mp);
2544		}
2545	}
2546
2547	mutex_enter(&connp->conn_lock);
2548	/* Connection is closing */
2549	if (connp->conn_state_flags & CONN_CONDEMNED) {
2550		mutex_exit(&connp->conn_lock);
2551		ip_drop_packet(mp, B_TRUE, NULL,
2552		    DROPPER(ipss, ipds_spd_got_clear),
2553		    &ipss->ipsec_spd_dropper);
2554		BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2555		return (NULL);
2556	}
2557
2558	/*
2559	 * Once a connection is latched it remains so for life, the conn_latch
2560	 * pointer on the conn has not changed, simply initializing ipl here
2561	 * as the earlier initialization was done only in the cleartext case.
2562	 */
2563	if ((ipl = connp->conn_latch) == NULL) {
2564		mblk_t *retmp;
2565		policy_head = connp->conn_policy;
2566
2567		/* Hold a reference in case the conn is closing */
2568		if (policy_head != NULL)
2569			IPPH_REFHOLD(policy_head);
2570		mutex_exit(&connp->conn_lock);
2571		/*
2572		 * We don't have policies cached in the conn
2573		 * for this stream. So, look at the global
2574		 * policy. It will check against conn or global
2575		 * depending on whichever is stronger.
2576		 */
2577		retmp = ipsec_check_global_policy(mp, connp,
2578		    ipha, ip6h, ira, ns);
2579		if (policy_head != NULL)
2580			IPPH_REFRELE(policy_head, ns);
2581		return (retmp);
2582	}
2583
2584	IPLATCH_REFHOLD(ipl);
2585	/* Hold reference on conn_latch_in_action in case conn is closing */
2586	ap = connp->conn_latch_in_action;
2587	if (ap != NULL)
2588		IPACT_REFHOLD(ap);
2589	mutex_exit(&connp->conn_lock);
2590
2591	if (ap != NULL) {
2592		/* Policy is cached & latched; fast(er) path */
2593		const char *reason;
2594		kstat_named_t *counter;
2595
2596		if (ipsec_check_ipsecin_latch(ira, mp, ipl, ap,
2597		    ipha, ip6h, &reason, &counter, connp, ns)) {
2598			BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
2599			IPLATCH_REFRELE(ipl);
2600			IPACT_REFRELE(ap);
2601			return (mp);
2602		}
2603		ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0,
2604		    SL_ERROR|SL_WARN|SL_CONSOLE,
2605		    "ipsec inbound policy mismatch: %s, packet dropped\n",
2606		    reason);
2607		ip_drop_packet(mp, B_TRUE, NULL, counter,
2608		    &ipss->ipsec_spd_dropper);
2609		BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
2610		IPLATCH_REFRELE(ipl);
2611		IPACT_REFRELE(ap);
2612		return (NULL);
2613	}
2614	if ((p = connp->conn_latch_in_policy) == NULL) {
2615		ipsec_weird_null_inbound_policy++;
2616		IPLATCH_REFRELE(ipl);
2617		return (mp);
2618	}
2619
2620	unique_id = conn_to_unique(connp, mp, ipha, ip6h);
2621	IPPOL_REFHOLD(p);
2622	mp = ipsec_check_ipsecin_policy(mp, p, ipha, ip6h, unique_id, ira, ns);
2623	/*
2624	 * NOTE: ipsecIn{Failed,Succeeeded} bumped by
2625	 * ipsec_check_ipsecin_policy().
2626	 */
2627	if (mp != NULL)
2628		ipsec_latch_inbound(connp, ira);
2629	IPLATCH_REFRELE(ipl);
2630	return (mp);
2631}
2632
2633/*
2634 * Handle all sorts of cases like tunnel-mode and ICMP.
2635 */
2636static int
2637prepended_length(mblk_t *mp, uintptr_t hptr)
2638{
2639	int rc = 0;
2640
2641	while (mp != NULL) {
2642		if (hptr >= (uintptr_t)mp->b_rptr && hptr <
2643		    (uintptr_t)mp->b_wptr) {
2644			rc += (int)(hptr - (uintptr_t)mp->b_rptr);
2645			break;	/* out of while loop */
2646		}
2647		rc += (int)MBLKL(mp);
2648		mp = mp->b_cont;
2649	}
2650
2651	if (mp == NULL) {
2652		/*
2653		 * IF (big IF) we make it here by naturally exiting the loop,
2654		 * then ip6h isn't in the mblk chain "mp" at all.
2655		 *
2656		 * The only case where this happens is with a reversed IP
2657		 * header that gets passed up by inbound ICMP processing.
2658		 * This unfortunately triggers longstanding bug 6478464.  For
2659		 * now, just pass up 0 for the answer.
2660		 */
2661#ifdef DEBUG_NOT_UNTIL_6478464
2662		ASSERT(mp != NULL);
2663#endif
2664		rc = 0;
2665	}
2666
2667	return (rc);
2668}
2669
2670/*
2671 * Returns:
2672 *
2673 * SELRET_NOMEM --> msgpullup() needed to gather things failed.
2674 * SELRET_BADPKT --> If we're being called after tunnel-mode fragment
2675 *		     gathering, the initial fragment is too short for
2676 *		     useful data.  Only returned if SEL_TUNNEL_FIRSTFRAG is
2677 *		     set.
2678 * SELRET_SUCCESS --> "sel" now has initialized IPsec selector data.
2679 * SELRET_TUNFRAG --> This is a fragment in a tunnel-mode packet.  Caller
2680 *		      should put this packet in a fragment-gathering queue.
2681 *		      Only returned if SEL_TUNNEL_MODE and SEL_PORT_POLICY
2682 *		      is set.
2683 *
2684 * Note that ipha/ip6h can be in a different mblk (mp->b_cont) in the case
2685 * of tunneled packets.
2686 * Also, mp->b_rptr can be an ICMP error where ipha/ip6h is the packet in
2687 * error past the ICMP error.
2688 */
2689static selret_t
2690ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
2691    ip6_t *ip6h, uint8_t sel_flags)
2692{
2693	uint16_t *ports;
2694	int outer_hdr_len = 0;	/* For ICMP or tunnel-mode cases... */
2695	ushort_t hdr_len;
2696	mblk_t *spare_mp = NULL;
2697	uint8_t *nexthdrp, *transportp;
2698	uint8_t nexthdr;
2699	uint8_t icmp_proto;
2700	ip_pkt_t ipp;
2701	boolean_t port_policy_present = (sel_flags & SEL_PORT_POLICY);
2702	boolean_t is_icmp = (sel_flags & SEL_IS_ICMP);
2703	boolean_t tunnel_mode = (sel_flags & SEL_TUNNEL_MODE);
2704	boolean_t post_frag = (sel_flags & SEL_POST_FRAG);
2705
2706	ASSERT((ipha == NULL && ip6h != NULL) ||
2707	    (ipha != NULL && ip6h == NULL));
2708
2709	if (ip6h != NULL) {
2710		outer_hdr_len = prepended_length(mp, (uintptr_t)ip6h);
2711		nexthdr = ip6h->ip6_nxt;
2712		icmp_proto = IPPROTO_ICMPV6;
2713		sel->ips_isv4 = B_FALSE;
2714		sel->ips_local_addr_v6 = ip6h->ip6_dst;
2715		sel->ips_remote_addr_v6 = ip6h->ip6_src;
2716
2717		bzero(&ipp, sizeof (ipp));
2718
2719		switch (nexthdr) {
2720		case IPPROTO_HOPOPTS:
2721		case IPPROTO_ROUTING:
2722		case IPPROTO_DSTOPTS:
2723		case IPPROTO_FRAGMENT:
2724			/*
2725			 * Use ip_hdr_length_nexthdr_v6().  And have a spare
2726			 * mblk that's contiguous to feed it
2727			 */
2728			if ((spare_mp = msgpullup(mp, -1)) == NULL)
2729				return (SELRET_NOMEM);
2730			if (!ip_hdr_length_nexthdr_v6(spare_mp,
2731			    (ip6_t *)(spare_mp->b_rptr + outer_hdr_len),
2732			    &hdr_len, &nexthdrp)) {
2733				/* Malformed packet - caller frees. */
2734				ipsec_freemsg_chain(spare_mp);
2735				return (SELRET_BADPKT);
2736			}
2737			/* Repopulate now that we have the whole packet */
2738			ip6h = (ip6_t *)(spare_mp->b_rptr + outer_hdr_len);
2739			(void) ip_find_hdr_v6(spare_mp, ip6h, B_FALSE, &ipp,
2740			    NULL);
2741			nexthdr = *nexthdrp;
2742			/* We can just extract based on hdr_len now. */
2743			break;
2744		default:
2745			(void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL);
2746			hdr_len = IPV6_HDR_LEN;
2747			break;
2748		}
2749		if (port_policy_present && IS_V6_FRAGMENT(ipp) && !is_icmp) {
2750			/* IPv6 Fragment */
2751			ipsec_freemsg_chain(spare_mp);
2752			return (SELRET_TUNFRAG);
2753		}
2754		transportp = (uint8_t *)ip6h + hdr_len;
2755	} else {
2756		outer_hdr_len = prepended_length(mp, (uintptr_t)ipha);
2757		icmp_proto = IPPROTO_ICMP;
2758		sel->ips_isv4 = B_TRUE;
2759		sel->ips_local_addr_v4 = ipha->ipha_dst;
2760		sel->ips_remote_addr_v4 = ipha->ipha_src;
2761		nexthdr = ipha->ipha_protocol;
2762		hdr_len = IPH_HDR_LENGTH(ipha);
2763
2764		if (port_policy_present &&
2765		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags) &&
2766		    !is_icmp) {
2767			/* IPv4 Fragment */
2768			ipsec_freemsg_chain(spare_mp);
2769			return (SELRET_TUNFRAG);
2770		}
2771		transportp = (uint8_t *)ipha + hdr_len;
2772	}
2773	sel->ips_protocol = nexthdr;
2774
2775	if ((nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP &&
2776	    nexthdr != IPPROTO_SCTP && nexthdr != icmp_proto) ||
2777	    (!port_policy_present && !post_frag && tunnel_mode)) {
2778		sel->ips_remote_port = sel->ips_local_port = 0;
2779		ipsec_freemsg_chain(spare_mp);
2780		return (SELRET_SUCCESS);
2781	}
2782
2783	if (transportp + 4 > mp->b_wptr) {
2784		/* If we didn't pullup a copy already, do so now. */
2785		/*
2786		 * XXX performance, will upper-layers frequently split TCP/UDP
2787		 * apart from IP or options?  If so, perhaps we should revisit
2788		 * the spare_mp strategy.
2789		 */
2790		ipsec_hdr_pullup_needed++;
2791		if (spare_mp == NULL &&
2792		    (spare_mp = msgpullup(mp, -1)) == NULL) {
2793			return (SELRET_NOMEM);
2794		}
2795		transportp = &spare_mp->b_rptr[hdr_len + outer_hdr_len];
2796	}
2797
2798	if (nexthdr == icmp_proto) {
2799		sel->ips_icmp_type = *transportp++;
2800		sel->ips_icmp_code = *transportp;
2801		sel->ips_remote_port = sel->ips_local_port = 0;
2802	} else {
2803		ports = (uint16_t *)transportp;
2804		sel->ips_remote_port = *ports++;
2805		sel->ips_local_port = *ports;
2806	}
2807	ipsec_freemsg_chain(spare_mp);
2808	return (SELRET_SUCCESS);
2809}
2810
2811/*
2812 * This is called with a b_next chain of messages from the fragcache code,
2813 * hence it needs to discard a chain on error.
2814 */
2815static boolean_t
2816ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
2817    ip6_t *ip6h, int outer_hdr_len, ipsec_stack_t *ipss)
2818{
2819	/*
2820	 * XXX cut&paste shared with ipsec_init_inbound_sel
2821	 */
2822	uint16_t *ports;
2823	ushort_t hdr_len;
2824	mblk_t *spare_mp = NULL;
2825	uint8_t *nexthdrp;
2826	uint8_t nexthdr;
2827	uint8_t *typecode;
2828	uint8_t check_proto;
2829
2830	ASSERT((ipha == NULL && ip6h != NULL) ||
2831	    (ipha != NULL && ip6h == NULL));
2832
2833	if (ip6h != NULL) {
2834		check_proto = IPPROTO_ICMPV6;
2835		nexthdr = ip6h->ip6_nxt;
2836		switch (nexthdr) {
2837		case IPPROTO_HOPOPTS:
2838		case IPPROTO_ROUTING:
2839		case IPPROTO_DSTOPTS:
2840		case IPPROTO_FRAGMENT:
2841			/*
2842			 * Use ip_hdr_length_nexthdr_v6().  And have a spare
2843			 * mblk that's contiguous to feed it
2844			 */
2845			spare_mp = msgpullup(mp, -1);
2846			if (spare_mp == NULL ||
2847			    !ip_hdr_length_nexthdr_v6(spare_mp,
2848			    (ip6_t *)(spare_mp->b_rptr + outer_hdr_len),
2849			    &hdr_len, &nexthdrp)) {
2850				/* Always works, even if NULL. */
2851				ipsec_freemsg_chain(spare_mp);
2852				ip_drop_packet_chain(mp, B_FALSE, NULL,
2853				    DROPPER(ipss, ipds_spd_nomem),
2854				    &ipss->ipsec_spd_dropper);
2855				return (B_FALSE);
2856			} else {
2857				nexthdr = *nexthdrp;
2858				/* We can just extract based on hdr_len now. */
2859			}
2860			break;
2861		default:
2862			hdr_len = IPV6_HDR_LEN;
2863			break;
2864		}
2865	} else {
2866		check_proto = IPPROTO_ICMP;
2867		hdr_len = IPH_HDR_LENGTH(ipha);
2868		nexthdr = ipha->ipha_protocol;
2869	}
2870
2871	sel->ips_protocol = nexthdr;
2872	if (nexthdr != IPPROTO_TCP && nexthdr != IPPROTO_UDP &&
2873	    nexthdr != IPPROTO_SCTP && nexthdr != check_proto) {
2874		sel->ips_local_port = sel->ips_remote_port = 0;
2875		ipsec_freemsg_chain(spare_mp); /* Always works, even if NULL */
2876		return (B_TRUE);
2877	}
2878
2879	if (&mp->b_rptr[hdr_len] + 4 + outer_hdr_len > mp->b_wptr) {
2880		/* If we didn't pullup a copy already, do so now. */
2881		/*
2882		 * XXX performance, will upper-layers frequently split TCP/UDP
2883		 * apart from IP or options?  If so, perhaps we should revisit
2884		 * the spare_mp strategy.
2885		 *
2886		 * XXX should this be msgpullup(mp, hdr_len+4) ???
2887		 */
2888		if (spare_mp == NULL &&
2889		    (spare_mp = msgpullup(mp, -1)) == NULL) {
2890			ip_drop_packet_chain(mp, B_FALSE, NULL,
2891			    DROPPER(ipss, ipds_spd_nomem),
2892			    &ipss->ipsec_spd_dropper);
2893			return (B_FALSE);
2894		}
2895		ports = (uint16_t *)&spare_mp->b_rptr[hdr_len + outer_hdr_len];
2896	} else {
2897		ports = (uint16_t *)&mp->b_rptr[hdr_len + outer_hdr_len];
2898	}
2899
2900	if (nexthdr == check_proto) {
2901		typecode = (uint8_t *)ports;
2902		sel->ips_icmp_type = *typecode++;
2903		sel->ips_icmp_code = *typecode;
2904		sel->ips_remote_port = sel->ips_local_port = 0;
2905	} else {
2906		sel->ips_local_port = *ports++;
2907		sel->ips_remote_port = *ports;
2908	}
2909	ipsec_freemsg_chain(spare_mp);	/* Always works, even if NULL */
2910	return (B_TRUE);
2911}
2912
2913/*
2914 * Prepend an mblk with a ipsec_crypto_t to the message chain.
2915 * Frees the argument and returns NULL should the allocation fail.
2916 * Returns the pointer to the crypto data part.
2917 */
2918mblk_t *
2919ipsec_add_crypto_data(mblk_t *data_mp, ipsec_crypto_t **icp)
2920{
2921	mblk_t	*mp;
2922
2923	mp = allocb(sizeof (ipsec_crypto_t), BPRI_MED);
2924	if (mp == NULL) {
2925		freemsg(data_mp);
2926		return (NULL);
2927	}
2928	bzero(mp->b_rptr, sizeof (ipsec_crypto_t));
2929	mp->b_wptr += sizeof (ipsec_crypto_t);
2930	mp->b_cont = data_mp;
2931	mp->b_datap->db_type = M_EVENT;	/* For ASSERT */
2932	*icp = (ipsec_crypto_t *)mp->b_rptr;
2933	return (mp);
2934}
2935
2936/*
2937 * Remove what was prepended above. Return b_cont and a pointer to the
2938 * crypto data.
2939 * The caller must call ipsec_free_crypto_data for mblk once it is done
2940 * with the crypto data.
2941 */
2942mblk_t *
2943ipsec_remove_crypto_data(mblk_t *crypto_mp, ipsec_crypto_t **icp)
2944{
2945	ASSERT(crypto_mp->b_datap->db_type == M_EVENT);
2946	ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t));
2947
2948	*icp = (ipsec_crypto_t *)crypto_mp->b_rptr;
2949	return (crypto_mp->b_cont);
2950}
2951
2952/*
2953 * Free what was prepended above. Return b_cont.
2954 */
2955mblk_t *
2956ipsec_free_crypto_data(mblk_t *crypto_mp)
2957{
2958	mblk_t	*mp;
2959
2960	ASSERT(crypto_mp->b_datap->db_type == M_EVENT);
2961	ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t));
2962
2963	mp = crypto_mp->b_cont;
2964	freeb(crypto_mp);
2965	return (mp);
2966}
2967
2968/*
2969 * Create an ipsec_action_t based on the way an inbound packet was protected.
2970 * Used to reflect traffic back to a sender.
2971 *
2972 * We don't bother interning the action into the hash table.
2973 */
2974ipsec_action_t *
2975ipsec_in_to_out_action(ip_recv_attr_t *ira)
2976{
2977	ipsa_t *ah_assoc, *esp_assoc;
2978	uint_t auth_alg = 0, encr_alg = 0, espa_alg = 0;
2979	ipsec_action_t *ap;
2980	boolean_t unique;
2981
2982	ap = kmem_cache_alloc(ipsec_action_cache, KM_NOSLEEP);
2983
2984	if (ap == NULL)
2985		return (NULL);
2986
2987	bzero(ap, sizeof (*ap));
2988	HASH_NULL(ap, ipa_hash);
2989	ap->ipa_next = NULL;
2990	ap->ipa_refs = 1;
2991
2992	/*
2993	 * Get the algorithms that were used for this packet.
2994	 */
2995	ap->ipa_act.ipa_type = IPSEC_ACT_APPLY;
2996	ap->ipa_act.ipa_log = 0;
2997	ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2998
2999	ah_assoc = ira->ira_ipsec_ah_sa;
3000	ap->ipa_act.ipa_apply.ipp_use_ah = (ah_assoc != NULL);
3001
3002	esp_assoc = ira->ira_ipsec_esp_sa;
3003	ap->ipa_act.ipa_apply.ipp_use_esp = (esp_assoc != NULL);
3004
3005	if (esp_assoc != NULL) {
3006		encr_alg = esp_assoc->ipsa_encr_alg;
3007		espa_alg = esp_assoc->ipsa_auth_alg;
3008		ap->ipa_act.ipa_apply.ipp_use_espa = (espa_alg != 0);
3009	}
3010	if (ah_assoc != NULL)
3011		auth_alg = ah_assoc->ipsa_auth_alg;
3012
3013	ap->ipa_act.ipa_apply.ipp_encr_alg = (uint8_t)encr_alg;
3014	ap->ipa_act.ipa_apply.ipp_auth_alg = (uint8_t)auth_alg;
3015	ap->ipa_act.ipa_apply.ipp_esp_auth_alg = (uint8_t)espa_alg;
3016	ap->ipa_act.ipa_apply.ipp_use_se =
3017	    !!(ira->ira_flags & IRAF_IPSEC_DECAPS);
3018	unique = B_FALSE;
3019
3020	if (esp_assoc != NULL) {
3021		ap->ipa_act.ipa_apply.ipp_espa_minbits =
3022		    esp_assoc->ipsa_authkeybits;
3023		ap->ipa_act.ipa_apply.ipp_espa_maxbits =
3024		    esp_assoc->ipsa_authkeybits;
3025		ap->ipa_act.ipa_apply.ipp_espe_minbits =
3026		    esp_assoc->ipsa_encrkeybits;
3027		ap->ipa_act.ipa_apply.ipp_espe_maxbits =
3028		    esp_assoc->ipsa_encrkeybits;
3029		ap->ipa_act.ipa_apply.ipp_km_proto = esp_assoc->ipsa_kmp;
3030		ap->ipa_act.ipa_apply.ipp_km_cookie = esp_assoc->ipsa_kmc;
3031		if (esp_assoc->ipsa_flags & IPSA_F_UNIQUE)
3032			unique = B_TRUE;
3033	}
3034	if (ah_assoc != NULL) {
3035		ap->ipa_act.ipa_apply.ipp_ah_minbits =
3036		    ah_assoc->ipsa_authkeybits;
3037		ap->ipa_act.ipa_apply.ipp_ah_maxbits =
3038		    ah_assoc->ipsa_authkeybits;
3039		ap->ipa_act.ipa_apply.ipp_km_proto = ah_assoc->ipsa_kmp;
3040		ap->ipa_act.ipa_apply.ipp_km_cookie = ah_assoc->ipsa_kmc;
3041		if (ah_assoc->ipsa_flags & IPSA_F_UNIQUE)
3042			unique = B_TRUE;
3043	}
3044	ap->ipa_act.ipa_apply.ipp_use_unique = unique;
3045	ap->ipa_want_unique = unique;
3046	ap->ipa_allow_clear = B_FALSE;
3047	ap->ipa_want_se = !!(ira->ira_flags & IRAF_IPSEC_DECAPS);
3048	ap->ipa_want_ah = (ah_assoc != NULL);
3049	ap->ipa_want_esp = (esp_assoc != NULL);
3050
3051	ap->ipa_ovhd = ipsec_act_ovhd(&ap->ipa_act);
3052
3053	ap->ipa_act.ipa_apply.ipp_replay_depth = 0; /* don't care */
3054
3055	return (ap);
3056}
3057
3058
3059/*
3060 * Compute the worst-case amount of extra space required by an action.
3061 * Note that, because of the ESP considerations listed below, this is
3062 * actually not the same as the best-case reduction in the MTU; in the
3063 * future, we should pass additional information to this function to
3064 * allow the actual MTU impact to be computed.
3065 *
3066 * AH: Revisit this if we implement algorithms with
3067 * a verifier size of more than 12 bytes.
3068 *
3069 * ESP: A more exact but more messy computation would take into
3070 * account the interaction between the cipher block size and the
3071 * effective MTU, yielding the inner payload size which reflects a
3072 * packet with *minimum* ESP padding..
3073 */
3074int32_t
3075ipsec_act_ovhd(const ipsec_act_t *act)
3076{
3077	int32_t overhead = 0;
3078
3079	if (act->ipa_type == IPSEC_ACT_APPLY) {
3080		const ipsec_prot_t *ipp = &act->ipa_apply;
3081
3082		if (ipp->ipp_use_ah)
3083			overhead += IPSEC_MAX_AH_HDR_SIZE;
3084		if (ipp->ipp_use_esp) {
3085			overhead += IPSEC_MAX_ESP_HDR_SIZE;
3086			overhead += sizeof (struct udphdr);
3087		}
3088		if (ipp->ipp_use_se)
3089			overhead += IP_SIMPLE_HDR_LENGTH;
3090	}
3091	return (overhead);
3092}
3093
/*
 * Hash function for interned policy structures.  It is used only when
 * policies are created, so it is not performance-critical for packet
 * flows.
 *
 * For now every input lands in bucket 0: none of the arguments is
 * examined.
 *
 * Future work: canonicalize the structures hashed with this (i.e.,
 * zeroize padding) so that a real hash over [start, end) works
 * correctly.
 */
/* ARGSUSED */
static uint32_t
policy_hash(int size, const void *start, const void *end)
{
	const uint32_t only_bucket = 0;

	return (only_bucket);
}
3107
3108
/*
 * Hash function macros for each address type.
 *
 * Both macros reduce an address to a bucket index by taking it modulo
 * "n", the number of buckets; "n" must therefore be non-zero.
 *
 * IPSEC_IPV4_HASH(a, n): "a" is used directly as the value to reduce.
 * IPSEC_IPV6_HASH(a, n): only the last 32-bit word of "a"
 * (s6_addr32[3]) takes part in the hash.
 *
 * The IPV6 hash function assumes that the low order 32-bits of the
 * address (typically containing the low order 24 bits of the mac
 * address) are reasonably well-distributed.  Revisit this if we run
 * into trouble from lots of collisions on ::1 addresses and the like
 * (seems unlikely).
 */
#define	IPSEC_IPV4_HASH(a, n) ((a) % (n))
#define	IPSEC_IPV6_HASH(a, n) (((a).s6_addr32[3]) % (n))
3120
3121/*
3122 * These two hash functions should produce coordinated values
3123 * but have slightly different roles.
3124 */
3125static uint32_t
3126selkey_hash(const ipsec_selkey_t *selkey, netstack_t *ns)
3127{
3128	uint32_t valid = selkey->ipsl_valid;
3129	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3130
3131	if (!(valid & IPSL_REMOTE_ADDR))
3132		return (IPSEC_SEL_NOHASH);
3133
3134	if (valid & IPSL_IPV4) {
3135		if (selkey->ipsl_remote_pfxlen == 32) {
3136			return (IPSEC_IPV4_HASH(selkey->ipsl_remote.ipsad_v4,
3137			    ipss->ipsec_spd_hashsize));
3138		}
3139	}
3140	if (valid & IPSL_IPV6) {
3141		if (selkey->ipsl_remote_pfxlen == 128) {
3142			return (IPSEC_IPV6_HASH(selkey->ipsl_remote.ipsad_v6,
3143			    ipss->ipsec_spd_hashsize));
3144		}
3145	}
3146	return (IPSEC_SEL_NOHASH);
3147}
3148
3149static uint32_t
3150selector_hash(ipsec_selector_t *sel, ipsec_policy_root_t *root)
3151{
3152	if (sel->ips_isv4) {
3153		return (IPSEC_IPV4_HASH(sel->ips_remote_addr_v4,
3154		    root->ipr_nchains));
3155	}
3156	return (IPSEC_IPV6_HASH(sel->ips_remote_addr_v6, root->ipr_nchains));
3157}
3158
3159/*
3160 * Intern actions into the action hash table.
3161 */
3162ipsec_action_t *
3163ipsec_act_find(const ipsec_act_t *a, int n, netstack_t *ns)
3164{
3165	int i;
3166	uint32_t hval;
3167	ipsec_action_t *ap;
3168	ipsec_action_t *prev = NULL;
3169	int32_t overhead, maxovhd = 0;
3170	boolean_t allow_clear = B_FALSE;
3171	boolean_t want_ah = B_FALSE;
3172	boolean_t want_esp = B_FALSE;
3173	boolean_t want_se = B_FALSE;
3174	boolean_t want_unique = B_FALSE;
3175	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3176
3177	/*
3178	 * TODO: should canonicalize a[] (i.e., zeroize any padding)
3179	 * so we can use a non-trivial policy_hash function.
3180	 */
3181	ap = NULL;
3182	for (i = n-1; i >= 0; i--) {
3183		hval = policy_hash(IPSEC_ACTION_HASH_SIZE, &a[i], &a[n]);
3184
3185		HASH_LOCK(ipss->ipsec_action_hash, hval);
3186
3187		for (HASH_ITERATE(ap, ipa_hash,
3188		    ipss->ipsec_action_hash, hval)) {
3189			if (bcmp(&ap->ipa_act, &a[i], sizeof (*a)) != 0)
3190				continue;
3191			if (ap->ipa_next != prev)
3192				continue;
3193			break;
3194		}
3195		if (ap != NULL) {
3196			HASH_UNLOCK(ipss->ipsec_action_hash, hval);
3197			prev = ap;
3198			continue;
3199		}
3200		/*
3201		 * need to allocate a new one..
3202		 */
3203		ap = kmem_cache_alloc(ipsec_action_cache, KM_NOSLEEP);
3204		if (ap == NULL) {
3205			HASH_UNLOCK(ipss->ipsec_action_hash, hval);
3206			if (prev != NULL)
3207				ipsec_action_free(prev);
3208			return (NULL);
3209		}
3210		HASH_INSERT(ap, ipa_hash, ipss->ipsec_action_hash, hval);
3211
3212		ap->ipa_next = prev;
3213		ap->ipa_act = a[i];
3214
3215		overhead = ipsec_act_ovhd(&a[i]);
3216		if (maxovhd < overhead)
3217			maxovhd = overhead;
3218
3219		if ((a[i].ipa_type == IPSEC_ACT_BYPASS) ||
3220		    (a[i].ipa_type == IPSEC_ACT_CLEAR))
3221			allow_clear = B_TRUE;
3222		if (a[i].ipa_type == IPSEC_ACT_APPLY) {
3223			const ipsec_prot_t *ipp = &a[i].ipa_apply;
3224
3225			ASSERT(ipp->ipp_use_ah || ipp->ipp_use_esp);
3226			want_ah |= ipp->ipp_use_ah;
3227			want_esp |= ipp->ipp_use_esp;
3228			want_se |= ipp->ipp_use_se;
3229			want_unique |= ipp->ipp_use_unique;
3230		}
3231		ap->ipa_allow_clear = allow_clear;
3232		ap->ipa_want_ah = want_ah;
3233		ap->ipa_want_esp = want_esp;
3234		ap->ipa_want_se = want_se;
3235		ap->ipa_want_unique = want_unique;
3236		ap->ipa_refs = 1; /* from the hash table */
3237		ap->ipa_ovhd = maxovhd;
3238		if (prev)
3239			prev->ipa_refs++;
3240		prev = ap;
3241		HASH_UNLOCK(ipss->ipsec_action_hash, hval);
3242	}
3243
3244	ap->ipa_refs++;		/* caller's reference */
3245
3246	return (ap);
3247}
3248
3249/*
3250 * Called when refcount goes to 0, indicating that all references to this
3251 * node are gone.
3252 *
3253 * This does not unchain the action from the hash table.
3254 */
3255void
3256ipsec_action_free(ipsec_action_t *ap)
3257{
3258	for (;;) {
3259		ipsec_action_t *np = ap->ipa_next;
3260		ASSERT(ap->ipa_refs == 0);
3261		ASSERT(ap->ipa_hash.hash_pp == NULL);
3262		kmem_cache_free(ipsec_action_cache, ap);
3263		ap = np;
3264		/* Inlined IPACT_REFRELE -- avoid recursion */
3265		if (ap == NULL)
3266			break;
3267		membar_exit();
3268		if (atomic_dec_32_nv(&(ap)->ipa_refs) != 0)
3269			break;
3270		/* End inlined IPACT_REFRELE */
3271	}
3272}
3273
3274/*
3275 * Called when the action hash table goes away.
3276 *
3277 * The actions can be queued on an mblk with ipsec_in or
3278 * ipsec_out, hence the actions might still be around.
3279 * But we decrement ipa_refs here since we no longer have
3280 * a reference to the action from the hash table.
3281 */
3282static void
3283ipsec_action_free_table(ipsec_action_t *ap)
3284{
3285	while (ap != NULL) {
3286		ipsec_action_t *np = ap->ipa_next;
3287
3288		/* FIXME: remove? */
3289		(void) printf("ipsec_action_free_table(%p) ref %d\n",
3290		    (void *)ap, ap->ipa_refs);
3291		ASSERT(ap->ipa_refs > 0);
3292		IPACT_REFRELE(ap);
3293		ap = np;
3294	}
3295}
3296
3297/*
3298 * Need to walk all stack instances since the reclaim function
3299 * is global for all instances
3300 */
3301/* ARGSUSED */
3302static void
3303ipsec_action_reclaim(void *arg)
3304{
3305	netstack_handle_t nh;
3306	netstack_t *ns;
3307	ipsec_stack_t *ipss;
3308
3309	netstack_next_init(&nh);
3310	while ((ns = netstack_next(&nh)) != NULL) {
3311		/*
3312		 * netstack_next() can return a netstack_t with a NULL
3313		 * netstack_ipsec at boot time.
3314		 */
3315		if ((ipss = ns->netstack_ipsec) == NULL) {
3316			netstack_rele(ns);
3317			continue;
3318		}
3319		ipsec_action_reclaim_stack(ipss);
3320		netstack_rele(ns);
3321	}
3322	netstack_next_fini(&nh);
3323}
3324
3325/*
3326 * Periodically sweep action hash table for actions with refcount==1, and
3327 * nuke them.  We cannot do this "on demand" (i.e., from IPACT_REFRELE)
3328 * because we can't close the race between another thread finding the action
3329 * in the hash table without holding the bucket lock during IPACT_REFRELE.
3330 * Instead, we run this function sporadically to clean up after ourselves;
3331 * we also set it as the "reclaim" function for the action kmem_cache.
3332 *
3333 * Note that it may take several passes of ipsec_action_gc() to free all
3334 * "stale" actions.
3335 */
3336static void
3337ipsec_action_reclaim_stack(ipsec_stack_t *ipss)
3338{
3339	int i;
3340
3341	for (i = 0; i < IPSEC_ACTION_HASH_SIZE; i++) {
3342		ipsec_action_t *ap, *np;
3343
3344		/* skip the lock if nobody home */
3345		if (ipss->ipsec_action_hash[i].hash_head == NULL)
3346			continue;
3347
3348		HASH_LOCK(ipss->ipsec_action_hash, i);
3349		for (ap = ipss->ipsec_action_hash[i].hash_head;
3350		    ap != NULL; ap = np) {
3351			ASSERT(ap->ipa_refs > 0);
3352			np = ap->ipa_hash.hash_next;
3353			if (ap->ipa_refs > 1)
3354				continue;
3355			HASH_UNCHAIN(ap, ipa_hash,
3356			    ipss->ipsec_action_hash, i);
3357			IPACT_REFRELE(ap);
3358		}
3359		HASH_UNLOCK(ipss->ipsec_action_hash, i);
3360	}
3361}
3362
3363/*
3364 * Intern a selector set into the selector set hash table.
3365 * This is simpler than the actions case..
3366 */
3367static ipsec_sel_t *
3368ipsec_find_sel(ipsec_selkey_t *selkey, netstack_t *ns)
3369{
3370	ipsec_sel_t *sp;
3371	uint32_t hval, bucket;
3372	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3373
3374	/*
3375	 * Exactly one AF bit should be set in selkey.
3376	 */
3377	ASSERT(!(selkey->ipsl_valid & IPSL_IPV4) ^
3378	    !(selkey->ipsl_valid & IPSL_IPV6));
3379
3380	hval = selkey_hash(selkey, ns);
3381	/* Set pol_hval to uninitialized until we put it in a polhead. */
3382	selkey->ipsl_sel_hval = hval;
3383
3384	bucket = (hval == IPSEC_SEL_NOHASH) ? 0 : hval;
3385
3386	ASSERT(!HASH_LOCKED(ipss->ipsec_sel_hash, bucket));
3387	HASH_LOCK(ipss->ipsec_sel_hash, bucket);
3388
3389	for (HASH_ITERATE(sp, ipsl_hash, ipss->ipsec_sel_hash, bucket)) {
3390		if (bcmp(&sp->ipsl_key, selkey,
3391		    offsetof(ipsec_selkey_t, ipsl_pol_hval)) == 0)
3392			break;
3393	}
3394	if (sp != NULL) {
3395		sp->ipsl_refs++;
3396
3397		HASH_UNLOCK(ipss->ipsec_sel_hash, bucket);
3398		return (sp);
3399	}
3400
3401	sp = kmem_cache_alloc(ipsec_sel_cache, KM_NOSLEEP);
3402	if (sp == NULL) {
3403		HASH_UNLOCK(ipss->ipsec_sel_hash, bucket);
3404		return (NULL);
3405	}
3406
3407	HASH_INSERT(sp, ipsl_hash, ipss->ipsec_sel_hash, bucket);
3408	sp->ipsl_refs = 2;	/* one for hash table, one for caller */
3409	sp->ipsl_key = *selkey;
3410	/* Set to uninitalized and have insertion into polhead fix things. */
3411	if (selkey->ipsl_sel_hval != IPSEC_SEL_NOHASH)
3412		sp->ipsl_key.ipsl_pol_hval = 0;
3413	else
3414		sp->ipsl_key.ipsl_pol_hval = IPSEC_SEL_NOHASH;
3415
3416	HASH_UNLOCK(ipss->ipsec_sel_hash, bucket);
3417
3418	return (sp);
3419}
3420
3421static void
3422ipsec_sel_rel(ipsec_sel_t **spp, netstack_t *ns)
3423{
3424	ipsec_sel_t *sp = *spp;
3425	int hval = sp->ipsl_key.ipsl_sel_hval;
3426	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3427
3428	*spp = NULL;
3429
3430	if (hval == IPSEC_SEL_NOHASH)
3431		hval = 0;
3432
3433	ASSERT(!HASH_LOCKED(ipss->ipsec_sel_hash, hval));
3434	HASH_LOCK(ipss->ipsec_sel_hash, hval);
3435	if (--sp->ipsl_refs == 1) {
3436		HASH_UNCHAIN(sp, ipsl_hash, ipss->ipsec_sel_hash, hval);
3437		sp->ipsl_refs--;
3438		HASH_UNLOCK(ipss->ipsec_sel_hash, hval);
3439		ASSERT(sp->ipsl_refs == 0);
3440		kmem_cache_free(ipsec_sel_cache, sp);
3441		/* Caller unlocks */
3442		return;
3443	}
3444
3445	HASH_UNLOCK(ipss->ipsec_sel_hash, hval);
3446}
3447
3448/*
3449 * Free a policy rule which we know is no longer being referenced.
3450 */
3451void
3452ipsec_policy_free(ipsec_policy_t *ipp)
3453{
3454	ASSERT(ipp->ipsp_refs == 0);
3455	ASSERT(ipp->ipsp_sel != NULL);
3456	ASSERT(ipp->ipsp_act != NULL);
3457	ASSERT(ipp->ipsp_netstack != NULL);
3458
3459	ipsec_sel_rel(&ipp->ipsp_sel, ipp->ipsp_netstack);
3460	IPACT_REFRELE(ipp->ipsp_act);
3461	kmem_cache_free(ipsec_pol_cache, ipp);
3462}
3463
3464/*
3465 * Construction of new policy rules; construct a policy, and add it to
3466 * the appropriate tables.
3467 */
3468ipsec_policy_t *
3469ipsec_policy_create(ipsec_selkey_t *keys, const ipsec_act_t *a,
3470    int nacts, int prio, uint64_t *index_ptr, netstack_t *ns)
3471{
3472	ipsec_action_t *ap;
3473	ipsec_sel_t *sp;
3474	ipsec_policy_t *ipp;
3475	ipsec_stack_t	*ipss = ns->netstack_ipsec;
3476
3477	if (index_ptr == NULL)
3478		index_ptr = &ipss->ipsec_next_policy_index;
3479
3480	ipp = kmem_cache_alloc(ipsec_pol_cache, KM_NOSLEEP);
3481	ap = ipsec_act_find(a, nacts, ns);
3482	sp = ipsec_find_sel(keys, ns);
3483
3484	if ((ap == NULL) || (sp == NULL) || (ipp == NULL)) {
3485		if (ap != NULL) {
3486			IPACT_REFRELE(ap);
3487		}
3488		if (sp != NULL)
3489			ipsec_sel_rel(&sp, ns);
3490		if (ipp != NULL)
3491			kmem_cache_free(ipsec_pol_cache, ipp);
3492		return (NULL);
3493	}
3494
3495	HASH_NULL(ipp, ipsp_hash);
3496
3497	ipp->ipsp_netstack = ns;	/* Needed for ipsec_policy_free */
3498	ipp->ipsp_refs = 1;	/* caller's reference */
3499	ipp->ipsp_sel = sp;
3500	ipp->ipsp_act = ap;
3501	ipp->ipsp_prio = prio;	/* rule priority */
3502	ipp->ipsp_index = *index_ptr;
3503	(*index_ptr)++;
3504
3505	return (ipp);
3506}
3507
3508static void
3509ipsec_update_present_flags(ipsec_stack_t *ipss)
3510{
3511	boolean_t hashpol;
3512
3513	hashpol = (avl_numnodes(&ipss->ipsec_system_policy.iph_rulebyid) > 0);
3514
3515	if (hashpol) {
3516		ipss->ipsec_outbound_v4_policy_present = B_TRUE;
3517		ipss->ipsec_outbound_v6_policy_present = B_TRUE;
3518		ipss->ipsec_inbound_v4_policy_present = B_TRUE;
3519		ipss->ipsec_inbound_v6_policy_present = B_TRUE;
3520		return;
3521	}
3522
3523	ipss->ipsec_outbound_v4_policy_present = (NULL !=
3524	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_OUTBOUND].
3525	    ipr_nonhash[IPSEC_AF_V4]);
3526	ipss->ipsec_outbound_v6_policy_present = (NULL !=
3527	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_OUTBOUND].
3528	    ipr_nonhash[IPSEC_AF_V6]);
3529	ipss->ipsec_inbound_v4_policy_present = (NULL !=
3530	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_INBOUND].
3531	    ipr_nonhash[IPSEC_AF_V4]);
3532	ipss->ipsec_inbound_v6_policy_present = (NULL !=
3533	    ipss->ipsec_system_policy.iph_root[IPSEC_TYPE_INBOUND].
3534	    ipr_nonhash[IPSEC_AF_V6]);
3535}
3536
3537boolean_t
3538ipsec_policy_delete(ipsec_policy_head_t *php, ipsec_selkey_t *keys, int dir,
3539    netstack_t *ns)
3540{
3541	ipsec_sel_t *sp;
3542	ipsec_policy_t *ip, *nip, *head;
3543	int af;
3544	ipsec_policy_root_t *pr = &php->iph_root[dir];
3545
3546	sp = ipsec_find_sel(keys, ns);
3547
3548	if (sp == NULL)
3549		return (B_FALSE);
3550
3551	af = (sp->ipsl_key.ipsl_valid & IPSL_IPV4) ? IPSEC_AF_V4 : IPSEC_AF_V6;
3552
3553	rw_enter(&php->iph_lock, RW_WRITER);
3554
3555	if (sp->ipsl_key.ipsl_pol_hval == IPSEC_SEL_NOHASH) {
3556		head = pr->ipr_nonhash[af];
3557	} else {
3558		head = pr->ipr_hash[sp->ipsl_key.ipsl_pol_hval].hash_head;
3559	}
3560
3561	for (ip = head; ip != NULL; ip = nip) {
3562		nip = ip->ipsp_hash.hash_next;
3563		if (ip->ipsp_sel != sp) {
3564			continue;
3565		}
3566
3567		IPPOL_UNCHAIN(php, ip);
3568
3569		php->iph_gen++;
3570		ipsec_update_present_flags(ns->netstack_ipsec);
3571
3572		rw_exit(&php->iph_lock);
3573
3574		ipsec_sel_rel(&sp, ns);
3575
3576		return (B_TRUE);
3577	}
3578
3579	rw_exit(&php->iph_lock);
3580	ipsec_sel_rel(&sp, ns);
3581	return (B_FALSE);
3582}
3583
3584int
3585ipsec_policy_delete_index(ipsec_policy_head_t *php, uint64_t policy_index,
3586    netstack_t *ns)
3587{
3588	boolean_t found = B_FALSE;
3589	ipsec_policy_t ipkey;
3590	ipsec_policy_t *ip;
3591	avl_index_t where;
3592
3593	bzero(&ipkey, sizeof (ipkey));
3594	ipkey.ipsp_index = policy_index;
3595
3596	rw_enter(&php->iph_lock, RW_WRITER);
3597
3598	/*
3599	 * We could be cleverer here about the walk.
3600	 * but well, (k+1)*log(N) will do for now (k==number of matches,
3601	 * N==number of table entries
3602	 */
3603	for (;;) {
3604		ip = (ipsec_policy_t *)avl_find(&php->iph_rulebyid,
3605		    (void *)&ipkey, &where);
3606		ASSERT(ip == NULL);
3607
3608		ip = avl_nearest(&php->iph_rulebyid, where, AVL_AFTER);
3609
3610		if (ip == NULL)
3611			break;
3612
3613		if (ip->ipsp_index != policy_index) {
3614			ASSERT(ip->ipsp_index > policy_index);
3615			break;
3616		}
3617
3618		IPPOL_UNCHAIN(php, ip);
3619		found = B_TRUE;
3620	}
3621
3622	if (found) {
3623		php->iph_gen++;
3624		ipsec_update_present_flags(ns->netstack_ipsec);
3625	}
3626
3627	rw_exit(&php->iph_lock);
3628
3629	return (found ? 0 : ENOENT);
3630}
3631
3632/*
3633 * Given a constructed ipsec_policy_t policy rule, see if it can be entered
3634 * into the correct policy ruleset.  As a side-effect, it sets the hash
3635 * entries on "ipp"'s ipsp_pol_hval.
3636 *
3637 * Returns B_TRUE if it can be entered, B_FALSE if it can't be (because a
3638 * duplicate policy exists with exactly the same selectors), or an icmp
3639 * rule exists with a different encryption/authentication action.
3640 */
3641boolean_t
3642ipsec_check_policy(ipsec_policy_head_t *php, ipsec_policy_t *ipp, int direction)
3643{
3644	ipsec_policy_root_t *pr = &php->iph_root[direction];
3645	int af = -1;
3646	ipsec_policy_t *p2, *head;
3647	uint8_t check_proto;
3648	ipsec_selkey_t *selkey = &ipp->ipsp_sel->ipsl_key;
3649	uint32_t	valid = selkey->ipsl_valid;
3650
3651	if (valid & IPSL_IPV6) {
3652		ASSERT(!(valid & IPSL_IPV4));
3653		af = IPSEC_AF_V6;
3654		check_proto = IPPROTO_ICMPV6;
3655	} else {
3656		ASSERT(valid & IPSL_IPV4);
3657		af = IPSEC_AF_V4;
3658		check_proto = IPPROTO_ICMP;
3659	}
3660
3661	ASSERT(RW_WRITE_HELD(&php->iph_lock));
3662
3663	/*
3664	 * Double-check that we don't have any duplicate selectors here.
3665	 * Because selectors are interned below, we need only compare pointers
3666	 * for equality.
3667	 */
3668	if (selkey->ipsl_sel_hval == IPSEC_SEL_NOHASH) {
3669		head = pr->ipr_nonhash[af];
3670	} else {
3671		selkey->ipsl_pol_hval =
3672		    (selkey->ipsl_valid & IPSL_IPV4) ?
3673		    IPSEC_IPV4_HASH(selkey->ipsl_remote.ipsad_v4,
3674		    pr->ipr_nchains) :
3675		    IPSEC_IPV6_HASH(selkey->ipsl_remote.ipsad_v6,
3676		    pr->ipr_nchains);
3677
3678		head = pr->ipr_hash[selkey->ipsl_pol_hval].hash_head;
3679	}
3680
3681	for (p2 = head; p2 != NULL; p2 = p2->ipsp_hash.hash_next) {
3682		if (p2->ipsp_sel == ipp->ipsp_sel)
3683			return (B_FALSE);
3684	}
3685
3686	/*
3687	 * If it's ICMP and not a drop or pass rule, run through the ICMP
3688	 * rules and make sure the action is either new or the same as any
3689	 * other actions.  We don't have to check the full chain because
3690	 * discard and bypass will override all other actions
3691	 */
3692
3693	if (valid & IPSL_PROTOCOL &&
3694	    selkey->ipsl_proto == check_proto &&
3695	    (ipp->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_APPLY)) {
3696
3697		for (p2 = head; p2 != NULL; p2 = p2->ipsp_hash.hash_next) {
3698
3699			if (p2->ipsp_sel->ipsl_key.ipsl_valid & IPSL_PROTOCOL &&
3700			    p2->ipsp_sel->ipsl_key.ipsl_proto == check_proto &&
3701			    (p2->ipsp_act->ipa_act.ipa_type ==
3702			    IPSEC_ACT_APPLY)) {
3703				return (ipsec_compare_action(p2, ipp));
3704			}
3705		}
3706	}
3707
3708	return (B_TRUE);
3709}
3710
3711/*
3712 * compare the action chains of two policies for equality
3713 * B_TRUE -> effective equality
3714 */
3715
3716static boolean_t
3717ipsec_compare_action(ipsec_policy_t *p1, ipsec_policy_t *p2)
3718{
3719
3720	ipsec_action_t *act1, *act2;
3721
3722	/* We have a valid rule. Let's compare the actions */
3723	if (p1->ipsp_act == p2->ipsp_act) {
3724		/* same action. We are good */
3725		return (B_TRUE);
3726	}
3727
3728	/* we have to walk the chain */
3729
3730	act1 = p1->ipsp_act;
3731	act2 = p2->ipsp_act;
3732
3733	while (act1 != NULL && act2 != NULL) {
3734
3735		/* otherwise, Are we close enough? */
3736		if (act1->ipa_allow_clear != act2->ipa_allow_clear ||
3737		    act1->ipa_want_ah != act2->ipa_want_ah ||
3738		    act1->ipa_want_esp != act2->ipa_want_esp ||
3739		    act1->ipa_want_se != act2->ipa_want_se) {
3740			/* Nope, we aren't */
3741			return (B_FALSE);
3742		}
3743
3744		if (act1->ipa_want_ah) {
3745			if (act1->ipa_act.ipa_apply.ipp_auth_alg !=
3746			    act2->ipa_act.ipa_apply.ipp_auth_alg) {
3747				return (B_FALSE);
3748			}
3749
3750			if (act1->ipa_act.ipa_apply.ipp_ah_minbits !=
3751			    act2->ipa_act.ipa_apply.ipp_ah_minbits ||
3752			    act1->ipa_act.ipa_apply.ipp_ah_maxbits !=
3753			    act2->ipa_act.ipa_apply.ipp_ah_maxbits) {
3754				return (B_FALSE);
3755			}
3756		}
3757
3758		if (act1->ipa_want_esp) {
3759			if (act1->ipa_act.ipa_apply.ipp_use_esp !=
3760			    act2->ipa_act.ipa_apply.ipp_use_esp ||
3761			    act1->ipa_act.ipa_apply.ipp_use_espa !=
3762			    act2->ipa_act.ipa_apply.ipp_use_espa) {
3763				return (B_FALSE);
3764			}
3765
3766			if (act1->ipa_act.ipa_apply.ipp_use_esp) {
3767				if (act1->ipa_act.ipa_apply.ipp_encr_alg !=
3768				    act2->ipa_act.ipa_apply.ipp_encr_alg) {
3769					return (B_FALSE);
3770				}
3771
3772				if (act1->ipa_act.ipa_apply.ipp_espe_minbits !=
3773				    act2->ipa_act.ipa_apply.ipp_espe_minbits ||
3774				    act1->ipa_act.ipa_apply.ipp_espe_maxbits !=
3775				    act2->ipa_act.ipa_apply.ipp_espe_maxbits) {
3776					return (B_FALSE);
3777				}
3778			}
3779
3780			if (act1->ipa_act.ipa_apply.ipp_use_espa) {
3781				if (act1->ipa_act.ipa_apply.ipp_esp_auth_alg !=
3782				    act2->ipa_act.ipa_apply.ipp_esp_auth_alg) {
3783					return (B_FALSE);
3784				}
3785
3786				if (act1->ipa_act.ipa_apply.ipp_espa_minbits !=
3787				    act2->ipa_act.ipa_apply.ipp_espa_minbits ||
3788				    act1->ipa_act.ipa_apply.ipp_espa_maxbits !=
3789				    act2->ipa_act.ipa_apply.ipp_espa_maxbits) {
3790					return (B_FALSE);
3791				}
3792			}
3793
3794		}
3795
3796		act1 = act1->ipa_next;
3797		act2 = act2->ipa_next;
3798	}
3799
3800	if (act1 != NULL || act2 != NULL) {
3801		return (B_FALSE);
3802	}
3803
3804	return (B_TRUE);
3805}
3806
3807
3808/*
3809 * Given a constructed ipsec_policy_t policy rule, enter it into
3810 * the correct policy ruleset.
3811 *
3812 * ipsec_check_policy() is assumed to have succeeded first (to check for
3813 * duplicates).
3814 */
3815void
3816ipsec_enter_policy(ipsec_policy_head_t *php, ipsec_policy_t *ipp, int direction,
3817    netstack_t *ns)
3818{
3819	ipsec_policy_root_t *pr = &php->iph_root[direction];
3820	ipsec_selkey_t *selkey = &ipp->ipsp_sel->ipsl_key;
3821	uint32_t valid = selkey->ipsl_valid;
3822	uint32_t hval = selkey->ipsl_pol_hval;
3823	int af = -1;
3824
3825	ASSERT(RW_WRITE_HELD(&php->iph_lock));
3826
3827	if (valid & IPSL_IPV6) {
3828		ASSERT(!(valid & IPSL_IPV4));
3829		af = IPSEC_AF_V6;
3830	} else {
3831		ASSERT(valid & IPSL_IPV4);
3832		af = IPSEC_AF_V4;
3833	}
3834
3835	php->iph_gen++;
3836
3837	if (hval == IPSEC_SEL_NOHASH) {
3838		HASHLIST_INSERT(ipp, ipsp_hash, pr->ipr_nonhash[af]);
3839	} else {
3840		HASH_LOCK(pr->ipr_hash, hval);
3841		HASH_INSERT(ipp, ipsp_hash, pr->ipr_hash, hval);
3842		HASH_UNLOCK(pr->ipr_hash, hval);
3843	}
3844
3845	ipsec_insert_always(&php->iph_rulebyid, ipp);
3846
3847	ipsec_update_present_flags(ns->netstack_ipsec);
3848}
3849
3850static void
3851ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr)
3852{
3853	ipsec_policy_t *ip, *nip;
3854	int af, chain, nchain;
3855
3856	for (af = 0; af < IPSEC_NAF; af++) {
3857		for (ip = ipr->ipr_nonhash[af]; ip != NULL; ip = nip) {
3858			nip = ip->ipsp_hash.hash_next;
3859			IPPOL_UNCHAIN(php, ip);
3860		}
3861		ipr->ipr_nonhash[af] = NULL;
3862	}
3863	nchain = ipr->ipr_nchains;
3864
3865	for (chain = 0; chain < nchain; chain++) {
3866		for (ip = ipr->ipr_hash[chain].hash_head; ip != NULL;
3867		    ip = nip) {
3868			nip = ip->ipsp_hash.hash_next;
3869			IPPOL_UNCHAIN(php, ip);
3870		}
3871		ipr->ipr_hash[chain].hash_head = NULL;
3872	}
3873}
3874
3875/*
3876 * Create and insert inbound or outbound policy associated with actp for the
3877 * address family fam into the policy head ph.  Returns B_TRUE if policy was
3878 * inserted, and B_FALSE otherwise.
3879 */
3880boolean_t
3881ipsec_polhead_insert(ipsec_policy_head_t *ph, ipsec_act_t *actp, uint_t nact,
3882    int fam, int ptype, netstack_t *ns)
3883{
3884	ipsec_selkey_t		sel;
3885	ipsec_policy_t		*pol;
3886	ipsec_policy_root_t	*pr;
3887
3888	bzero(&sel, sizeof (sel));
3889	sel.ipsl_valid = (fam == IPSEC_AF_V4 ? IPSL_IPV4 : IPSL_IPV6);
3890	if ((pol = ipsec_policy_create(&sel, actp, nact, IPSEC_PRIO_SOCKET,
3891	    NULL, ns)) != NULL) {
3892		pr = &ph->iph_root[ptype];
3893		HASHLIST_INSERT(pol, ipsp_hash, pr->ipr_nonhash[fam]);
3894		ipsec_insert_always(&ph->iph_rulebyid, pol);
3895	}
3896	return (pol != NULL);
3897}
3898
3899void
3900ipsec_polhead_flush(ipsec_policy_head_t *php, netstack_t *ns)
3901{
3902	int dir;
3903
3904	ASSERT(RW_WRITE_HELD(&php->iph_lock));
3905
3906	for (dir = 0; dir < IPSEC_NTYPES; dir++)
3907		ipsec_ipr_flush(php, &php->iph_root[dir]);
3908
3909	php->iph_gen++;
3910	ipsec_update_present_flags(ns->netstack_ipsec);
3911}
3912
3913void
3914ipsec_polhead_free(ipsec_policy_head_t *php, netstack_t *ns)
3915{
3916	int dir;
3917
3918	ASSERT(php->iph_refs == 0);
3919
3920	rw_enter(&php->iph_lock, RW_WRITER);
3921	ipsec_polhead_flush(php, ns);
3922	rw_exit(&php->iph_lock);
3923	rw_destroy(&php->iph_lock);
3924	for (dir = 0; dir < IPSEC_NTYPES; dir++) {
3925		ipsec_policy_root_t *ipr = &php->iph_root[dir];
3926		int chain;
3927
3928		for (chain = 0; chain < ipr->ipr_nchains; chain++)
3929			mutex_destroy(&(ipr->ipr_hash[chain].hash_lock));
3930
3931	}
3932	ipsec_polhead_free_table(php);
3933	kmem_free(php, sizeof (*php));
3934}
3935
3936static void
3937ipsec_ipr_init(ipsec_policy_root_t *ipr)
3938{
3939	int af;
3940
3941	ipr->ipr_nchains = 0;
3942	ipr->ipr_hash = NULL;
3943
3944	for (af = 0; af < IPSEC_NAF; af++) {
3945		ipr->ipr_nonhash[af] = NULL;
3946	}
3947}
3948
3949ipsec_policy_head_t *
3950ipsec_polhead_create(void)
3951{
3952	ipsec_policy_head_t *php;
3953
3954	php = kmem_alloc(sizeof (*php), KM_NOSLEEP);
3955	if (php == NULL)
3956		return (php);
3957
3958	rw_init(&php->iph_lock, NULL, RW_DEFAULT, NULL);
3959	php->iph_refs = 1;
3960	php->iph_gen = 0;
3961
3962	ipsec_ipr_init(&php->iph_root[IPSEC_TYPE_INBOUND]);
3963	ipsec_ipr_init(&php->iph_root[IPSEC_TYPE_OUTBOUND]);
3964
3965	avl_create(&php->iph_rulebyid, ipsec_policy_cmpbyid,
3966	    sizeof (ipsec_policy_t), offsetof(ipsec_policy_t, ipsp_byid));
3967
3968	return (php);
3969}
3970
3971/*
3972 * Clone the policy head into a new polhead; release one reference to the
3973 * old one and return the only reference to the new one.
3974 * If the old one had a refcount of 1, just return it.
3975 */
3976ipsec_policy_head_t *
3977ipsec_polhead_split(ipsec_policy_head_t *php, netstack_t *ns)
3978{
3979	ipsec_policy_head_t *nphp;
3980
3981	if (php == NULL)
3982		return (ipsec_polhead_create());
3983	else if (php->iph_refs == 1)
3984		return (php);
3985
3986	nphp = ipsec_polhead_create();
3987	if (nphp == NULL)
3988		return (NULL);
3989
3990	if (ipsec_copy_polhead(php, nphp, ns) != 0) {
3991		ipsec_polhead_free(nphp, ns);
3992		return (NULL);
3993	}
3994	IPPH_REFRELE(php, ns);
3995	return (nphp);
3996}
3997
3998/*
 * When sending a response to an ICMP request or generating a RST
4000 * in the TCP case, the outbound packets need to go at the same level
4001 * of protection as the incoming ones i.e we associate our outbound
4002 * policy with how the packet came in. We call this after we have
4003 * accepted the incoming packet which may or may not have been in
4004 * clear and hence we are sending the reply back with the policy
4005 * matching the incoming datagram's policy.
4006 *
4007 * NOTE : This technology serves two purposes :
4008 *
4009 * 1) If we have multiple outbound policies, we send out a reply
4010 *    matching with how it came in rather than matching the outbound
4011 *    policy.
4012 *
 * 2) For asymmetric policies, we want to make sure that incoming
 *    and outgoing have the same level of protection. Asymmetric
 *    policies exist only with global policy where we may not have
 *    both outbound and inbound at the same time.
4017 *
4018 * NOTE2:	This function is called by cleartext cases, so it needs to be
4019 *		in IP proper.
4020 *
4021 * Note: the caller has moved other parts of ira into ixa already.
4022 */
4023boolean_t
4024ipsec_in_to_out(ip_recv_attr_t *ira, ip_xmit_attr_t *ixa, mblk_t *data_mp,
4025    ipha_t *ipha, ip6_t *ip6h)
4026{
4027	ipsec_selector_t sel;
4028	ipsec_action_t	*reflect_action = NULL;
4029	netstack_t	*ns = ixa->ixa_ipst->ips_netstack;
4030
4031	bzero((void*)&sel, sizeof (sel));
4032
4033	if (ira->ira_ipsec_action != NULL) {
4034		/* transfer reference.. */
4035		reflect_action = ira->ira_ipsec_action;
4036		ira->ira_ipsec_action = NULL;
4037	} else if (!(ira->ira_flags & IRAF_LOOPBACK))
4038		reflect_action = ipsec_in_to_out_action(ira);
4039
4040	/*
4041	 * The caller is going to send the datagram out which might
4042	 * go on the wire or delivered locally through ire_send_local.
4043	 *
4044	 * 1) If it goes out on the wire, new associations will be
4045	 *    obtained.
4046	 * 2) If it is delivered locally, ire_send_local will convert
4047	 *    this ip_xmit_attr_t back to a ip_recv_attr_t looking at the
4048	 *    requests.
4049	 */
4050	ixa->ixa_ipsec_action = reflect_action;
4051
4052	if (!ipsec_init_outbound_ports(&sel, data_mp, ipha, ip6h, 0,
4053	    ns->netstack_ipsec)) {
4054		/* Note: data_mp already consumed and ip_drop_packet done */
4055		return (B_FALSE);
4056	}
4057	ixa->ixa_ipsec_src_port = sel.ips_local_port;
4058	ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4059	ixa->ixa_ipsec_proto = sel.ips_protocol;
4060	ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4061	ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4062
4063	/*
4064	 * Don't use global policy for this, as we want
4065	 * to use the same protection that was applied to the inbound packet.
4066	 * Thus we set IXAF_NO_IPSEC is it arrived in the clear to make
4067	 * it be sent in the clear.
4068	 */
4069	if (ira->ira_flags & IRAF_IPSEC_SECURE)
4070		ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4071	else
4072		ixa->ixa_flags |= IXAF_NO_IPSEC;
4073
4074	return (B_TRUE);
4075}
4076
4077void
4078ipsec_out_release_refs(ip_xmit_attr_t *ixa)
4079{
4080	if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
4081		return;
4082
4083	if (ixa->ixa_ipsec_ah_sa != NULL) {
4084		IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
4085		ixa->ixa_ipsec_ah_sa = NULL;
4086	}
4087	if (ixa->ixa_ipsec_esp_sa != NULL) {
4088		IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
4089		ixa->ixa_ipsec_esp_sa = NULL;
4090	}
4091	if (ixa->ixa_ipsec_policy != NULL) {
4092		IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4093		ixa->ixa_ipsec_policy = NULL;
4094	}
4095	if (ixa->ixa_ipsec_action != NULL) {
4096		IPACT_REFRELE(ixa->ixa_ipsec_action);
4097		ixa->ixa_ipsec_action = NULL;
4098	}
4099	if (ixa->ixa_ipsec_latch) {
4100		IPLATCH_REFRELE(ixa->ixa_ipsec_latch);
4101		ixa->ixa_ipsec_latch = NULL;
4102	}
4103	/* Clear the soft references to the SAs */
4104	ixa->ixa_ipsec_ref[0].ipsr_sa = NULL;
4105	ixa->ixa_ipsec_ref[0].ipsr_bucket = NULL;
4106	ixa->ixa_ipsec_ref[0].ipsr_gen = 0;
4107	ixa->ixa_ipsec_ref[1].ipsr_sa = NULL;
4108	ixa->ixa_ipsec_ref[1].ipsr_bucket = NULL;
4109	ixa->ixa_ipsec_ref[1].ipsr_gen = 0;
4110	ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4111}
4112
4113void
4114ipsec_in_release_refs(ip_recv_attr_t *ira)
4115{
4116	if (!(ira->ira_flags & IRAF_IPSEC_SECURE))
4117		return;
4118
4119	if (ira->ira_ipsec_ah_sa != NULL) {
4120		IPSA_REFRELE(ira->ira_ipsec_ah_sa);
4121		ira->ira_ipsec_ah_sa = NULL;
4122	}
4123	if (ira->ira_ipsec_esp_sa != NULL) {
4124		IPSA_REFRELE(ira->ira_ipsec_esp_sa);
4125		ira->ira_ipsec_esp_sa = NULL;
4126	}
4127	if (ira->ira_ipsec_action != NULL) {
4128		IPACT_REFRELE(ira->ira_ipsec_action);
4129		ira->ira_ipsec_action = NULL;
4130	}
4131
4132	ira->ira_flags &= ~IRAF_IPSEC_SECURE;
4133}
4134
4135/*
4136 * This is called from ire_send_local when a packet
4137 * is looped back. We setup the ip_recv_attr_t "borrowing" the references
4138 * held by the callers.
4139 * Note that we don't do any IPsec but we carry the actions and IPSEC flags
4140 * across so that the fanout policy checks see that IPsec was applied.
4141 *
4142 * The caller should do ipsec_in_release_refs() on the ira by calling
4143 * ira_cleanup().
4144 */
4145void
4146ipsec_out_to_in(ip_xmit_attr_t *ixa, ill_t *ill, ip_recv_attr_t *ira)
4147{
4148	ipsec_policy_t *pol;
4149	ipsec_action_t *act;
4150
4151	/* Non-IPsec operations */
4152	ira->ira_free_flags = 0;
4153	ira->ira_zoneid = ixa->ixa_zoneid;
4154	ira->ira_cred = ixa->ixa_cred;
4155	ira->ira_cpid = ixa->ixa_cpid;
4156	ira->ira_tsl = ixa->ixa_tsl;
4157	ira->ira_ill = ira->ira_rill = ill;
4158	ira->ira_flags = ixa->ixa_flags & IAF_MASK;
4159	ira->ira_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
4160	ira->ira_pktlen = ixa->ixa_pktlen;
4161	ira->ira_ip_hdr_length = ixa->ixa_ip_hdr_length;
4162	ira->ira_protocol = ixa->ixa_protocol;
4163	ira->ira_mhip = NULL;
4164
4165	ira->ira_flags |= IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
4166
4167	ira->ira_sqp = ixa->ixa_sqp;
4168	ira->ira_ring = NULL;
4169
4170	ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
4171	ira->ira_rifindex = ira->ira_ruifindex;
4172
4173	if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
4174		return;
4175
4176	ira->ira_flags |= IRAF_IPSEC_SECURE;
4177
4178	ira->ira_ipsec_ah_sa = NULL;
4179	ira->ira_ipsec_esp_sa = NULL;
4180
4181	act = ixa->ixa_ipsec_action;
4182	if (act == NULL) {
4183		pol = ixa->ixa_ipsec_policy;
4184		if (pol != NULL) {
4185			act = pol->ipsp_act;
4186			IPACT_REFHOLD(act);
4187		}
4188	}
4189	ixa->ixa_ipsec_action = NULL;
4190	ira->ira_ipsec_action = act;
4191}
4192
4193/*
4194 * Consults global policy and per-socket policy to see whether this datagram
4195 * should go out secure. If so it updates the ip_xmit_attr_t
4196 * Should not be used when connecting, since then we want to latch the policy.
4197 *
4198 * If connp is NULL we just look at the global policy.
4199 *
4200 * Returns NULL if the packet was dropped, in which case the MIB has
4201 * been incremented and ip_drop_packet done.
4202 */
4203mblk_t *
4204ip_output_attach_policy(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
4205    const conn_t *connp, ip_xmit_attr_t *ixa)
4206{
4207	ipsec_selector_t sel;
4208	boolean_t	policy_present;
4209	ip_stack_t	*ipst = ixa->ixa_ipst;
4210	netstack_t	*ns = ipst->ips_netstack;
4211	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4212	ipsec_policy_t	*p;
4213
4214	ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen;
4215	ASSERT((ipha != NULL && ip6h == NULL) ||
4216	    (ip6h != NULL && ipha == NULL));
4217
4218	if (ipha != NULL)
4219		policy_present = ipss->ipsec_outbound_v4_policy_present;
4220	else
4221		policy_present = ipss->ipsec_outbound_v6_policy_present;
4222
4223	if (!policy_present && (connp == NULL || connp->conn_policy == NULL))
4224		return (mp);
4225
4226	bzero((void*)&sel, sizeof (sel));
4227
4228	if (ipha != NULL) {
4229		sel.ips_local_addr_v4 = ipha->ipha_src;
4230		sel.ips_remote_addr_v4 = ip_get_dst(ipha);
4231		sel.ips_isv4 = B_TRUE;
4232	} else {
4233		sel.ips_isv4 = B_FALSE;
4234		sel.ips_local_addr_v6 = ip6h->ip6_src;
4235		sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, NULL);
4236	}
4237	sel.ips_protocol = ixa->ixa_protocol;
4238
4239	if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0, ipss)) {
4240		if (ipha != NULL) {
4241			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
4242		} else {
4243			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
4244		}
4245		/* Note: mp already consumed and ip_drop_packet done */
4246		return (NULL);
4247	}
4248
4249	ASSERT(ixa->ixa_ipsec_policy == NULL);
4250	p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
4251	ixa->ixa_ipsec_policy = p;
4252	if (p != NULL) {
4253		ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4254		if (connp == NULL || connp->conn_policy == NULL)
4255			ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY;
4256	} else {
4257		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4258	}
4259
4260	/*
4261	 * Copy the right port information.
4262	 */
4263	ixa->ixa_ipsec_src_port = sel.ips_local_port;
4264	ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4265	ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4266	ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4267	ixa->ixa_ipsec_proto = sel.ips_protocol;
4268	return (mp);
4269}
4270
4271/*
4272 * When appropriate, this function caches inbound and outbound policy
4273 * for this connection. The outbound policy is stored in conn_ixa.
4274 * Note that it can not be used for SCTP since conn_faddr isn't set for SCTP.
4275 *
4276 * XXX need to work out more details about per-interface policy and
4277 * caching here!
4278 *
4279 * XXX may want to split inbound and outbound caching for ill..
4280 */
4281int
4282ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4)
4283{
4284	boolean_t global_policy_present;
4285	netstack_t	*ns = connp->conn_netstack;
4286	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4287
4288	connp->conn_ixa->ixa_ipsec_policy_gen =
4289	    ipss->ipsec_system_policy.iph_gen;
4290	/*
4291	 * There is no policy latching for ICMP sockets because we can't
4292	 * decide on which policy to use until we see the packet and get
4293	 * type/code selectors.
4294	 */
4295	if (connp->conn_proto == IPPROTO_ICMP ||
4296	    connp->conn_proto == IPPROTO_ICMPV6) {
4297		connp->conn_in_enforce_policy =
4298		    connp->conn_out_enforce_policy = B_TRUE;
4299		if (connp->conn_latch != NULL) {
4300			IPLATCH_REFRELE(connp->conn_latch);
4301			connp->conn_latch = NULL;
4302		}
4303		if (connp->conn_latch_in_policy != NULL) {
4304			IPPOL_REFRELE(connp->conn_latch_in_policy);
4305			connp->conn_latch_in_policy = NULL;
4306		}
4307		if (connp->conn_latch_in_action != NULL) {
4308			IPACT_REFRELE(connp->conn_latch_in_action);
4309			connp->conn_latch_in_action = NULL;
4310		}
4311		if (connp->conn_ixa->ixa_ipsec_policy != NULL) {
4312			IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy);
4313			connp->conn_ixa->ixa_ipsec_policy = NULL;
4314		}
4315		if (connp->conn_ixa->ixa_ipsec_action != NULL) {
4316			IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action);
4317			connp->conn_ixa->ixa_ipsec_action = NULL;
4318		}
4319		connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4320		return (0);
4321	}
4322
4323	global_policy_present = isv4 ?
4324	    (ipss->ipsec_outbound_v4_policy_present ||
4325	    ipss->ipsec_inbound_v4_policy_present) :
4326	    (ipss->ipsec_outbound_v6_policy_present ||
4327	    ipss->ipsec_inbound_v6_policy_present);
4328
4329	if ((connp->conn_policy != NULL) || global_policy_present) {
4330		ipsec_selector_t sel;
4331		ipsec_policy_t	*p;
4332
4333		if (connp->conn_latch == NULL &&
4334		    (connp->conn_latch = iplatch_create()) == NULL) {
4335			return (ENOMEM);
4336		}
4337
4338		bzero((void*)&sel, sizeof (sel));
4339
4340		sel.ips_protocol = connp->conn_proto;
4341		sel.ips_local_port = connp->conn_lport;
4342		sel.ips_remote_port = connp->conn_fport;
4343		sel.ips_is_icmp_inv_acq = 0;
4344		sel.ips_isv4 = isv4;
4345		if (isv4) {
4346			sel.ips_local_addr_v4 = connp->conn_laddr_v4;
4347			sel.ips_remote_addr_v4 = connp->conn_faddr_v4;
4348		} else {
4349			sel.ips_local_addr_v6 = connp->conn_laddr_v6;
4350			sel.ips_remote_addr_v6 = connp->conn_faddr_v6;
4351		}
4352
4353		p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns);
4354		if (connp->conn_latch_in_policy != NULL)
4355			IPPOL_REFRELE(connp->conn_latch_in_policy);
4356		connp->conn_latch_in_policy = p;
4357		connp->conn_in_enforce_policy = (p != NULL);
4358
4359		p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
4360		if (connp->conn_ixa->ixa_ipsec_policy != NULL)
4361			IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy);
4362		connp->conn_ixa->ixa_ipsec_policy = p;
4363		connp->conn_out_enforce_policy = (p != NULL);
4364		if (p != NULL) {
4365			connp->conn_ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4366			if (connp->conn_policy == NULL) {
4367				connp->conn_ixa->ixa_flags |=
4368				    IXAF_IPSEC_GLOBAL_POLICY;
4369			}
4370		} else {
4371			connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4372		}
4373		/* Clear the latched actions too, in case we're recaching. */
4374		if (connp->conn_ixa->ixa_ipsec_action != NULL) {
4375			IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action);
4376			connp->conn_ixa->ixa_ipsec_action = NULL;
4377		}
4378		if (connp->conn_latch_in_action != NULL) {
4379			IPACT_REFRELE(connp->conn_latch_in_action);
4380			connp->conn_latch_in_action = NULL;
4381		}
4382		connp->conn_ixa->ixa_ipsec_src_port = sel.ips_local_port;
4383		connp->conn_ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4384		connp->conn_ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4385		connp->conn_ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4386		connp->conn_ixa->ixa_ipsec_proto = sel.ips_protocol;
4387	} else {
4388		connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4389	}
4390
4391	/*
4392	 * We may or may not have policy for this endpoint.  We still set
4393	 * conn_policy_cached so that inbound datagrams don't have to look
4394	 * at global policy as policy is considered latched for these
4395	 * endpoints.  We should not set conn_policy_cached until the conn
4396	 * reflects the actual policy. If we *set* this before inheriting
4397	 * the policy there is a window where the check
4398	 * CONN_INBOUND_POLICY_PRESENT, will neither check with the policy
4399	 * on the conn (because we have not yet copied the policy on to
4400	 * conn and hence not set conn_in_enforce_policy) nor with the
4401	 * global policy (because conn_policy_cached is already set).
4402	 */
4403	connp->conn_policy_cached = B_TRUE;
4404	return (0);
4405}
4406
4407/*
4408 * When appropriate, this function caches outbound policy for faddr/fport.
4409 * It is used when we are not connected i.e., when we can not latch the
4410 * policy.
4411 */
4412void
4413ipsec_cache_outbound_policy(const conn_t *connp, const in6_addr_t *v6src,
4414    const in6_addr_t *v6dst, in_port_t dstport, ip_xmit_attr_t *ixa)
4415{
4416	boolean_t	isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0;
4417	boolean_t	global_policy_present;
4418	netstack_t	*ns = connp->conn_netstack;
4419	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4420
4421	ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen;
4422
4423	/*
4424	 * There is no policy caching for ICMP sockets because we can't
4425	 * decide on which policy to use until we see the packet and get
4426	 * type/code selectors.
4427	 */
4428	if (connp->conn_proto == IPPROTO_ICMP ||
4429	    connp->conn_proto == IPPROTO_ICMPV6) {
4430		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4431		if (ixa->ixa_ipsec_policy != NULL) {
4432			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4433			ixa->ixa_ipsec_policy = NULL;
4434		}
4435		if (ixa->ixa_ipsec_action != NULL) {
4436			IPACT_REFRELE(ixa->ixa_ipsec_action);
4437			ixa->ixa_ipsec_action = NULL;
4438		}
4439		return;
4440	}
4441
4442	global_policy_present = isv4 ?
4443	    (ipss->ipsec_outbound_v4_policy_present ||
4444	    ipss->ipsec_inbound_v4_policy_present) :
4445	    (ipss->ipsec_outbound_v6_policy_present ||
4446	    ipss->ipsec_inbound_v6_policy_present);
4447
4448	if ((connp->conn_policy != NULL) || global_policy_present) {
4449		ipsec_selector_t sel;
4450		ipsec_policy_t	*p;
4451
4452		bzero((void*)&sel, sizeof (sel));
4453
4454		sel.ips_protocol = connp->conn_proto;
4455		sel.ips_local_port = connp->conn_lport;
4456		sel.ips_remote_port = dstport;
4457		sel.ips_is_icmp_inv_acq = 0;
4458		sel.ips_isv4 = isv4;
4459		if (isv4) {
4460			IN6_V4MAPPED_TO_IPADDR(v6src, sel.ips_local_addr_v4);
4461			IN6_V4MAPPED_TO_IPADDR(v6dst, sel.ips_remote_addr_v4);
4462		} else {
4463			sel.ips_local_addr_v6 = *v6src;
4464			sel.ips_remote_addr_v6 = *v6dst;
4465		}
4466
4467		p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
4468		if (ixa->ixa_ipsec_policy != NULL)
4469			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4470		ixa->ixa_ipsec_policy = p;
4471		if (p != NULL) {
4472			ixa->ixa_flags |= IXAF_IPSEC_SECURE;
4473			if (connp->conn_policy == NULL)
4474				ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY;
4475		} else {
4476			ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4477		}
4478		/* Clear the latched actions too, in case we're recaching. */
4479		if (ixa->ixa_ipsec_action != NULL) {
4480			IPACT_REFRELE(ixa->ixa_ipsec_action);
4481			ixa->ixa_ipsec_action = NULL;
4482		}
4483
4484		ixa->ixa_ipsec_src_port = sel.ips_local_port;
4485		ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
4486		ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
4487		ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
4488		ixa->ixa_ipsec_proto = sel.ips_protocol;
4489	} else {
4490		ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
4491		if (ixa->ixa_ipsec_policy != NULL) {
4492			IPPOL_REFRELE(ixa->ixa_ipsec_policy);
4493			ixa->ixa_ipsec_policy = NULL;
4494		}
4495		if (ixa->ixa_ipsec_action != NULL) {
4496			IPACT_REFRELE(ixa->ixa_ipsec_action);
4497			ixa->ixa_ipsec_action = NULL;
4498		}
4499	}
4500}
4501
4502/*
4503 * Returns B_FALSE if the policy has gone stale.
4504 */
4505boolean_t
4506ipsec_outbound_policy_current(ip_xmit_attr_t *ixa)
4507{
4508	ipsec_stack_t	*ipss = ixa->ixa_ipst->ips_netstack->netstack_ipsec;
4509
4510	if (!(ixa->ixa_flags & IXAF_IPSEC_GLOBAL_POLICY))
4511		return (B_TRUE);
4512
4513	return (ixa->ixa_ipsec_policy_gen == ipss->ipsec_system_policy.iph_gen);
4514}
4515
4516void
4517iplatch_free(ipsec_latch_t *ipl)
4518{
4519	if (ipl->ipl_local_cid != NULL)
4520		IPSID_REFRELE(ipl->ipl_local_cid);
4521	if (ipl->ipl_remote_cid != NULL)
4522		IPSID_REFRELE(ipl->ipl_remote_cid);
4523	mutex_destroy(&ipl->ipl_lock);
4524	kmem_free(ipl, sizeof (*ipl));
4525}
4526
4527ipsec_latch_t *
4528iplatch_create()
4529{
4530	ipsec_latch_t *ipl = kmem_zalloc(sizeof (*ipl), KM_NOSLEEP);
4531	if (ipl == NULL)
4532		return (ipl);
4533	mutex_init(&ipl->ipl_lock, NULL, MUTEX_DEFAULT, NULL);
4534	ipl->ipl_refcnt = 1;
4535	return (ipl);
4536}
4537
4538/*
4539 * Hash function for ID hash table.
4540 */
4541static uint32_t
4542ipsid_hash(int idtype, char *idstring)
4543{
4544	uint32_t hval = idtype;
4545	unsigned char c;
4546
4547	while ((c = *idstring++) != 0) {
4548		hval = (hval << 4) | (hval >> 28);
4549		hval ^= c;
4550	}
4551	hval = hval ^ (hval >> 16);
4552	return (hval & (IPSID_HASHSIZE-1));
4553}
4554
4555/*
4556 * Look up identity string in hash table.  Return identity object
4557 * corresponding to the name -- either preexisting, or newly allocated.
4558 *
4559 * Return NULL if we need to allocate a new one and can't get memory.
4560 */
4561ipsid_t *
4562ipsid_lookup(int idtype, char *idstring, netstack_t *ns)
4563{
4564	ipsid_t *retval;
4565	char *nstr;
4566	int idlen = strlen(idstring) + 1;
4567	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4568	ipsif_t *bucket;
4569
4570	bucket = &ipss->ipsec_ipsid_buckets[ipsid_hash(idtype, idstring)];
4571
4572	mutex_enter(&bucket->ipsif_lock);
4573
4574	for (retval = bucket->ipsif_head; retval != NULL;
4575	    retval = retval->ipsid_next) {
4576		if (idtype != retval->ipsid_type)
4577			continue;
4578		if (bcmp(idstring, retval->ipsid_cid, idlen) != 0)
4579			continue;
4580
4581		IPSID_REFHOLD(retval);
4582		mutex_exit(&bucket->ipsif_lock);
4583		return (retval);
4584	}
4585
4586	retval = kmem_alloc(sizeof (*retval), KM_NOSLEEP);
4587	if (!retval) {
4588		mutex_exit(&bucket->ipsif_lock);
4589		return (NULL);
4590	}
4591
4592	nstr = kmem_alloc(idlen, KM_NOSLEEP);
4593	if (!nstr) {
4594		mutex_exit(&bucket->ipsif_lock);
4595		kmem_free(retval, sizeof (*retval));
4596		return (NULL);
4597	}
4598
4599	retval->ipsid_refcnt = 1;
4600	retval->ipsid_next = bucket->ipsif_head;
4601	if (retval->ipsid_next != NULL)
4602		retval->ipsid_next->ipsid_ptpn = &retval->ipsid_next;
4603	retval->ipsid_ptpn = &bucket->ipsif_head;
4604	retval->ipsid_type = idtype;
4605	retval->ipsid_cid = nstr;
4606	bucket->ipsif_head = retval;
4607	bcopy(idstring, nstr, idlen);
4608	mutex_exit(&bucket->ipsif_lock);
4609
4610	return (retval);
4611}
4612
4613/*
4614 * Garbage collect the identity hash table.
4615 */
4616void
4617ipsid_gc(netstack_t *ns)
4618{
4619	int i, len;
4620	ipsid_t *id, *nid;
4621	ipsif_t *bucket;
4622	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4623
4624	for (i = 0; i < IPSID_HASHSIZE; i++) {
4625		bucket = &ipss->ipsec_ipsid_buckets[i];
4626		mutex_enter(&bucket->ipsif_lock);
4627		for (id = bucket->ipsif_head; id != NULL; id = nid) {
4628			nid = id->ipsid_next;
4629			if (id->ipsid_refcnt == 0) {
4630				*id->ipsid_ptpn = nid;
4631				if (nid != NULL)
4632					nid->ipsid_ptpn = id->ipsid_ptpn;
4633				len = strlen(id->ipsid_cid) + 1;
4634				kmem_free(id->ipsid_cid, len);
4635				kmem_free(id, sizeof (*id));
4636			}
4637		}
4638		mutex_exit(&bucket->ipsif_lock);
4639	}
4640}
4641
4642/*
4643 * Return true if two identities are the same.
4644 */
4645boolean_t
4646ipsid_equal(ipsid_t *id1, ipsid_t *id2)
4647{
4648	if (id1 == id2)
4649		return (B_TRUE);
4650#ifdef DEBUG
4651	if ((id1 == NULL) || (id2 == NULL))
4652		return (B_FALSE);
4653	/*
4654	 * test that we're interning id's correctly..
4655	 */
4656	ASSERT((strcmp(id1->ipsid_cid, id2->ipsid_cid) != 0) ||
4657	    (id1->ipsid_type != id2->ipsid_type));
4658#endif
4659	return (B_FALSE);
4660}
4661
4662/*
4663 * Initialize identity table; called during module initialization.
4664 */
4665static void
4666ipsid_init(netstack_t *ns)
4667{
4668	ipsif_t *bucket;
4669	int i;
4670	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4671
4672	for (i = 0; i < IPSID_HASHSIZE; i++) {
4673		bucket = &ipss->ipsec_ipsid_buckets[i];
4674		mutex_init(&bucket->ipsif_lock, NULL, MUTEX_DEFAULT, NULL);
4675	}
4676}
4677
4678/*
4679 * Free identity table (preparatory to module unload)
4680 */
4681static void
4682ipsid_fini(netstack_t *ns)
4683{
4684	ipsif_t *bucket;
4685	int i;
4686	ipsec_stack_t	*ipss = ns->netstack_ipsec;
4687
4688	for (i = 0; i < IPSID_HASHSIZE; i++) {
4689		bucket = &ipss->ipsec_ipsid_buckets[i];
4690		ASSERT(bucket->ipsif_head == NULL);
4691		mutex_destroy(&bucket->ipsif_lock);
4692	}
4693}
4694
/*
 * Update the minimum and maximum supported key sizes for the specified
 * algorithm, which is either a member of a netstack alg array or about to be,
 * and therefore must be called holding ipsec_alg_lock for write.
 *
 * Two sets of values are computed:
 *  - alg_minbits / alg_maxbits / alg_default_bits, taken purely from the
 *    algorithm's own alg_key_sizes[] description; and
 *  - alg_ef_minbits / alg_ef_maxbits / alg_ef_default_bits, the "effective"
 *    values after intersecting with what the crypto framework reports for
 *    alg_mech_type.
 *
 * The second set is only computed for algorithms flagged ALG_FLAG_VALID
 * other than SADB_EALG_NULL.  ALG_FLAG_VALID is cleared if the framework
 * reports no usable mechanism, if alg_flag_check() rejects the parameters,
 * or if no key size is supported by both sides.
 *
 * alg		algorithm entry to update in place
 * alg_type	IPSEC_ALG_AUTH selects MAC usage; anything else selects
 *		encrypt + decrypt usage
 * ns		netstack owning the algorithm tables (used for the lock
 *		assertion only)
 */
void
ipsec_alg_fix_min_max(ipsec_alginfo_t *alg, ipsec_algtype_t alg_type,
    netstack_t *ns)
{
	size_t crypto_min = (size_t)-1, crypto_max = 0;
	size_t cur_crypto_min, cur_crypto_max;
	boolean_t is_valid;
	crypto_mechanism_info_t *mech_infos;
	uint_t nmech_infos;
	int crypto_rc, i;
	crypto_mech_usage_t mask;
	ipsec_stack_t	*ipss = ns->netstack_ipsec;

	ASSERT(RW_WRITE_HELD(&ipss->ipsec_alg_lock));

	/*
	 * Compute the min, max, and default key sizes (in number of
	 * increments to the default key size in bits) as defined
	 * by the algorithm mappings. This range of key sizes is used
	 * for policy related operations. The effective key sizes
	 * supported by the framework could be more limited than
	 * those defined for an algorithm.
	 *
	 * NOTE(review): alg_key_sizes[0] is read unconditionally, even when
	 * alg_nkey_sizes is 0; presumably the array always has at least one
	 * element -- confirm against the code that builds the entry.
	 */
	alg->alg_default_bits = alg->alg_key_sizes[0];
	alg->alg_default = 0;
	if (alg->alg_increment != 0) {
		/* key sizes are defined by range & increment */
		alg->alg_minbits = alg->alg_key_sizes[1];
		alg->alg_maxbits = alg->alg_key_sizes[2];
	} else if (alg->alg_nkey_sizes == 0) {
		/* no specified key size for algorithm */
		alg->alg_minbits = alg->alg_maxbits = 0;
	} else {
		/* key sizes are defined by enumeration */
		alg->alg_minbits = (uint16_t)-1;
		alg->alg_maxbits = 0;

		for (i = 0; i < alg->alg_nkey_sizes; i++) {
			if (alg->alg_key_sizes[i] < alg->alg_minbits)
				alg->alg_minbits = alg->alg_key_sizes[i];
			if (alg->alg_key_sizes[i] > alg->alg_maxbits)
				alg->alg_maxbits = alg->alg_key_sizes[i];
		}
	}

	/* Effective sizes are only meaningful for usable algorithms. */
	if (!(alg->alg_flags & ALG_FLAG_VALID))
		return;

	/*
	 * Mechanisms do not apply to the NULL encryption
	 * algorithm, so simply return for this case.
	 */
	if (alg->alg_id == SADB_EALG_NULL)
		return;

	/*
	 * Find the min and max key sizes supported by the cryptographic
	 * framework providers.
	 */

	/* get the key sizes supported by the framework */
	crypto_rc = crypto_get_all_mech_info(alg->alg_mech_type,
	    &mech_infos, &nmech_infos, KM_SLEEP);
	if (crypto_rc != CRYPTO_SUCCESS || nmech_infos == 0) {
		alg->alg_flags &= ~ALG_FLAG_VALID;
		return;
	}

	/*
	 * min and max key sizes supported by framework: take the smallest
	 * minimum and the largest maximum over all suitable entries.
	 */
	for (i = 0, is_valid = B_FALSE; i < nmech_infos; i++) {
		int unit_bits;

		/*
		 * Ignore entries that do not support the operations
		 * needed for the algorithm type.
		 */
		if (alg_type == IPSEC_ALG_AUTH) {
			mask = CRYPTO_MECH_USAGE_MAC;
		} else {
			mask = CRYPTO_MECH_USAGE_ENCRYPT |
			    CRYPTO_MECH_USAGE_DECRYPT;
		}
		if ((mech_infos[i].mi_usage & mask) != mask)
			continue;

		/* The framework reports sizes in bytes or bits; use bits. */
		unit_bits = (mech_infos[i].mi_keysize_unit ==
		    CRYPTO_KEYSIZE_UNIT_IN_BYTES)  ? 8 : 1;
		/* adjust min/max supported by framework */
		cur_crypto_min = mech_infos[i].mi_min_key_size * unit_bits;
		cur_crypto_max = mech_infos[i].mi_max_key_size * unit_bits;

		if (cur_crypto_min < crypto_min)
			crypto_min = cur_crypto_min;

		/*
		 * CRYPTO_EFFECTIVELY_INFINITE is a special value of
		 * the crypto framework which means "no upper limit".
		 */
		if (mech_infos[i].mi_max_key_size ==
		    CRYPTO_EFFECTIVELY_INFINITE) {
			crypto_max = (size_t)-1;
		} else if (cur_crypto_max > crypto_max) {
			crypto_max = cur_crypto_max;
		}

		is_valid = B_TRUE;
	}

	/* The array returned by the framework is ours to free. */
	kmem_free(mech_infos, sizeof (crypto_mechanism_info_t) *
	    nmech_infos);

	if (!is_valid) {
		/* no key sizes supported by framework */
		alg->alg_flags &= ~ALG_FLAG_VALID;
		return;
	}

	/*
	 * Determine the effective min and max key sizes from the
	 * alg_key_sizes[] defined for the algorithm entry, adjusted to
	 * those supported by the framework.
	 */
	alg->alg_ef_default_bits = alg->alg_key_sizes[0];

	/*
	 * For backwards compatibility, assume that the IV length
	 * is the same as the data length.
	 */
	alg->alg_ivlen = alg->alg_datalen;

	/*
	 * Copy any algorithm parameters (if provided) into dedicated
	 * elements in the ipsec_alginfo_t structure.
	 * There may be a better place to put this code.
	 */
	for (i = 0; i < alg->alg_nparams; i++) {
		switch (i) {
		case 0:
			/* Initialisation Vector length (bytes) */
			alg->alg_ivlen =  alg->alg_params[0];
			break;
		case 1:
			/* Integrity Check Vector length (bytes) */
			alg->alg_icvlen = alg->alg_params[1];
			break;
		case 2:
			/* Salt length (bytes) */
			alg->alg_saltlen = (uint8_t)alg->alg_params[2];
			break;
		default:
			/* Parameters beyond the third are ignored. */
			break;
		}
	}

	/* Default if the IV length is not specified. */
	if (alg_type == IPSEC_ALG_ENCR && alg->alg_ivlen == 0)
		alg->alg_ivlen = alg->alg_datalen;

	/*
	 * Validate the flag/parameter combination; this may clear or set
	 * ALG_FLAG_VALID.
	 */
	alg_flag_check(alg);

	if (alg->alg_increment != 0) {
		/* supported key sizes are defined by range & increment */
		crypto_min = ALGBITS_ROUND_UP(crypto_min, alg->alg_increment);
		crypto_max = ALGBITS_ROUND_DOWN(crypto_max, alg->alg_increment);

		/*
		 * NOTE(review): crypto_min and crypto_max are size_t and
		 * crypto_max may be (size_t)-1 ("no upper limit"); the
		 * casts below truncate to 16 bits -- confirm that values
		 * above 65535 cannot produce a misleading range here.
		 */
		alg->alg_ef_minbits = MAX(alg->alg_minbits,
		    (uint16_t)crypto_min);
		alg->alg_ef_maxbits = MIN(alg->alg_maxbits,
		    (uint16_t)crypto_max);

		/*
		 * If the sizes supported by the framework are outside
		 * the range of sizes defined by the algorithm mappings,
		 * the algorithm cannot be used. Check for this
		 * condition here.
		 */
		if (alg->alg_ef_minbits > alg->alg_ef_maxbits) {
			alg->alg_flags &= ~ALG_FLAG_VALID;
			return;
		}
		/* Clamp the default into the effective range. */
		if (alg->alg_ef_default_bits < alg->alg_ef_minbits)
			alg->alg_ef_default_bits = alg->alg_ef_minbits;
		if (alg->alg_ef_default_bits > alg->alg_ef_maxbits)
			alg->alg_ef_default_bits = alg->alg_ef_maxbits;
	} else if (alg->alg_nkey_sizes == 0) {
		/* no specified key size for algorithm */
		alg->alg_ef_minbits = alg->alg_ef_maxbits = 0;
	} else {
		/* supported key sizes are defined by enumeration */
		alg->alg_ef_minbits = (uint16_t)-1;
		alg->alg_ef_maxbits = 0;

		for (i = 0, is_valid = B_FALSE; i < alg->alg_nkey_sizes; i++) {
			/*
			 * Ignore the current key size if it is not in the
			 * range of sizes supported by the framework.
			 */
			if (alg->alg_key_sizes[i] < crypto_min ||
			    alg->alg_key_sizes[i] > crypto_max)
				continue;
			if (alg->alg_key_sizes[i] < alg->alg_ef_minbits)
				alg->alg_ef_minbits = alg->alg_key_sizes[i];
			if (alg->alg_key_sizes[i] > alg->alg_ef_maxbits)
				alg->alg_ef_maxbits = alg->alg_key_sizes[i];
			is_valid = B_TRUE;
		}

		/* None of the enumerated sizes is usable. */
		if (!is_valid) {
			alg->alg_flags &= ~ALG_FLAG_VALID;
			return;
		}
		alg->alg_ef_default = 0;
	}
}
4914
4915/*
4916 * Sanity check parameters provided by ipsecalgs(1m). Assume that
4917 * the algoritm is marked as valid, there is a check at the top
4918 * of this function. If any of the checks below fail, the algorithm
4919 * entry is invalid.
4920 */
4921void
4922alg_flag_check(ipsec_alginfo_t *alg)
4923{
4924	alg->alg_flags &= ~ALG_FLAG_VALID;
4925
4926	/*
4927	 * Can't have the algorithm marked as CCM and GCM.
4928	 * Check the ALG_FLAG_COMBINED and ALG_FLAG_COUNTERMODE
4929	 * flags are set for CCM & GCM.
4930	 */
4931	if ((alg->alg_flags & (ALG_FLAG_CCM|ALG_FLAG_GCM)) ==
4932	    (ALG_FLAG_CCM|ALG_FLAG_GCM))
4933		return;
4934	if (alg->alg_flags & (ALG_FLAG_CCM|ALG_FLAG_GCM)) {
4935		if (!(alg->alg_flags & ALG_FLAG_COUNTERMODE))
4936			return;
4937		if (!(alg->alg_flags & ALG_FLAG_COMBINED))
4938			return;
4939	}
4940
4941	/*
4942	 * For ALG_FLAG_COUNTERMODE, check the parameters
4943	 * fit in the ipsec_nonce_t structure.
4944	 */
4945	if (alg->alg_flags & ALG_FLAG_COUNTERMODE) {
4946		if (alg->alg_ivlen != sizeof (((ipsec_nonce_t *)NULL)->iv))
4947			return;
4948		if (alg->alg_saltlen > sizeof (((ipsec_nonce_t *)NULL)->salt))
4949			return;
4950	}
4951	if ((alg->alg_flags & ALG_FLAG_COMBINED) &&
4952	    (alg->alg_icvlen == 0))
4953		return;
4954
4955	/* all is well. */
4956	alg->alg_flags |= ALG_FLAG_VALID;
4957}
4958
4959/*
4960 * Free the memory used by the specified algorithm.
4961 */
4962void
4963ipsec_alg_free(ipsec_alginfo_t *alg)
4964{
4965	if (alg == NULL)
4966		return;
4967
4968	if (alg->alg_key_sizes != NULL) {
4969		kmem_free(alg->alg_key_sizes,
4970		    (alg->alg_nkey_sizes + 1) * sizeof (uint16_t));
4971		alg->alg_key_sizes = NULL;
4972	}
4973	if (alg->alg_block_sizes != NULL) {
4974		kmem_free(alg->alg_block_sizes,
4975		    (alg->alg_nblock_sizes + 1) * sizeof (uint16_t));
4976		alg->alg_block_sizes = NULL;
4977	}
4978	if (alg->alg_params != NULL) {
4979		kmem_free(alg->alg_params,
4980		    (alg->alg_nparams + 1) * sizeof (uint16_t));
4981		alg->alg_params = NULL;
4982	}
4983	kmem_free(alg, sizeof (*alg));
4984}
4985
4986/*
4987 * Check the validity of the specified key size for an algorithm.
4988 * Returns B_TRUE if key size is valid, B_FALSE otherwise.
4989 */
4990boolean_t
4991ipsec_valid_key_size(uint16_t key_size, ipsec_alginfo_t *alg)
4992{
4993	if (key_size < alg->alg_ef_minbits || key_size > alg->alg_ef_maxbits)
4994		return (B_FALSE);
4995
4996	if (alg->alg_increment == 0 && alg->alg_nkey_sizes != 0) {
4997		/*
4998		 * If the key sizes are defined by enumeration, the new
4999		 * key size must be equal to one of the supported values.
5000		 */
5001		int i;
5002
5003		for (i = 0; i < alg->alg_nkey_sizes; i++)
5004			if (key_size == alg->alg_key_sizes[i])
5005				break;
5006		if (i == alg->alg_nkey_sizes)
5007			return (B_FALSE);
5008	}
5009
5010	return (B_TRUE);
5011}
5012
5013/*
5014 * Callback function invoked by the crypto framework when a provider
5015 * registers or unregisters. This callback updates the algorithms
5016 * tables when a crypto algorithm is no longer available or becomes
5017 * available, and triggers the freeing/creation of context templates
5018 * associated with existing SAs, if needed.
5019 *
5020 * Need to walk all stack instances since the callback is global
5021 * for all instances
5022 */
5023void
5024ipsec_prov_update_callback(uint32_t event, void *event_arg)
5025{
5026	netstack_handle_t nh;
5027	netstack_t *ns;
5028
5029	netstack_next_init(&nh);
5030	while ((ns = netstack_next(&nh)) != NULL) {
5031		ipsec_prov_update_callback_stack(event, event_arg, ns);
5032		netstack_rele(ns);
5033	}
5034	netstack_next_fini(&nh);
5035}
5036
5037static void
5038ipsec_prov_update_callback_stack(uint32_t event, void *event_arg,
5039    netstack_t *ns)
5040{
5041	crypto_notify_event_change_t *prov_change =
5042	    (crypto_notify_event_change_t *)event_arg;
5043	uint_t algidx, algid, algtype, mech_count, mech_idx;
5044	ipsec_alginfo_t *alg;
5045	ipsec_alginfo_t oalg;
5046	crypto_mech_name_t *mechs;
5047	boolean_t alg_changed = B_FALSE;
5048	ipsec_stack_t	*ipss = ns->netstack_ipsec;
5049
5050	/* ignore events for which we didn't register */
5051	if (event != CRYPTO_EVENT_MECHS_CHANGED) {
5052		ip1dbg(("ipsec_prov_update_callback: unexpected event 0x%x "
5053		    " received from crypto framework\n", event));
5054		return;
5055	}
5056
5057	mechs = crypto_get_mech_list(&mech_count, KM_SLEEP);
5058	if (mechs == NULL)
5059		return;
5060
5061	/*
5062	 * Walk the list of currently defined IPsec algorithm. Update
5063	 * the algorithm valid flag and trigger an update of the
5064	 * SAs that depend on that algorithm.
5065	 */
5066	rw_enter(&ipss->ipsec_alg_lock, RW_WRITER);
5067	for (algtype = 0; algtype < IPSEC_NALGTYPES; algtype++) {
5068		for (algidx = 0; algidx < ipss->ipsec_nalgs[algtype];
5069		    algidx++) {
5070
5071			algid = ipss->ipsec_sortlist[algtype][algidx];
5072			alg = ipss->ipsec_alglists[algtype][algid];
5073			ASSERT(alg != NULL);
5074
5075			/*
5076			 * Skip the algorithms which do not map to the
5077			 * crypto framework provider being added or removed.
5078			 */
5079			if (strncmp(alg->alg_mech_name,
5080			    prov_change->ec_mech_name,
5081			    CRYPTO_MAX_MECH_NAME) != 0)
5082				continue;
5083
5084			/*
5085			 * Determine if the mechanism is valid. If it
5086			 * is not, mark the algorithm as being invalid. If
5087			 * it is, mark the algorithm as being valid.
5088			 */
5089			for (mech_idx = 0; mech_idx < mech_count; mech_idx++)
5090				if (strncmp(alg->alg_mech_name,
5091				    mechs[mech_idx], CRYPTO_MAX_MECH_NAME) == 0)
5092					break;
5093			if (mech_idx == mech_count &&
5094			    alg->alg_flags & ALG_FLAG_VALID) {
5095				alg->alg_flags &= ~ALG_FLAG_VALID;
5096				alg_changed = B_TRUE;
5097			} else if (mech_idx < mech_count &&
5098			    !(alg->alg_flags & ALG_FLAG_VALID)) {
5099				alg->alg_flags |= ALG_FLAG_VALID;
5100				alg_changed = B_TRUE;
5101			}
5102
5103			/*
5104			 * Update the supported key sizes, regardless
5105			 * of whether a crypto provider was added or
5106			 * removed.
5107			 */
5108			oalg = *alg;
5109			ipsec_alg_fix_min_max(alg, algtype, ns);
5110			if (!alg_changed &&
5111			    alg->alg_ef_minbits != oalg.alg_ef_minbits ||
5112			    alg->alg_ef_maxbits != oalg.alg_ef_maxbits ||
5113			    alg->alg_ef_default != oalg.alg_ef_default ||
5114			    alg->alg_ef_default_bits !=
5115			    oalg.alg_ef_default_bits)
5116				alg_changed = B_TRUE;
5117
5118			/*
5119			 * Update the affected SAs if a software provider is
5120			 * being added or removed.
5121			 */
5122			if (prov_change->ec_provider_type ==
5123			    CRYPTO_SW_PROVIDER)
5124				sadb_alg_update(algtype, alg->alg_id,
5125				    prov_change->ec_change ==
5126				    CRYPTO_MECH_ADDED, ns);
5127		}
5128	}
5129	rw_exit(&ipss->ipsec_alg_lock);
5130	crypto_free_mech_list(mechs, mech_count);
5131
5132	if (alg_changed) {
5133		/*
5134		 * An algorithm has changed, i.e. it became valid or
5135		 * invalid, or its support key sizes have changed.
5136		 * Notify ipsecah and ipsecesp of this change so
5137		 * that they can send a SADB_REGISTER to their consumers.
5138		 */
5139		ipsecah_algs_changed(ns);
5140		ipsecesp_algs_changed(ns);
5141	}
5142}
5143
5144/*
5145 * Registers with the crypto framework to be notified of crypto
5146 * providers changes. Used to update the algorithm tables and
5147 * to free or create context templates if needed. Invoked after IPsec
5148 * is loaded successfully.
5149 *
5150 * This is called separately for each IP instance, so we ensure we only
5151 * register once.
5152 */
5153void
5154ipsec_register_prov_update(void)
5155{
5156	if (prov_update_handle != NULL)
5157		return;
5158
5159	prov_update_handle = crypto_notify_events(
5160	    ipsec_prov_update_callback, CRYPTO_EVENT_MECHS_CHANGED);
5161}
5162
5163/*
5164 * Unregisters from the framework to be notified of crypto providers
5165 * changes. Called from ipsec_policy_g_destroy().
5166 */
5167static void
5168ipsec_unregister_prov_update(void)
5169{
5170	if (prov_update_handle != NULL)
5171		crypto_unnotify_events(prov_update_handle);
5172}
5173
5174/*
5175 * Tunnel-mode support routines.
5176 */
5177
5178/*
5179 * Returns an mblk chain suitable for putnext() if policies match and IPsec
5180 * SAs are available.  If there's no per-tunnel policy, or a match comes back
5181 * with no match, then still return the packet and have global policy take
5182 * a crack at it in IP.
5183 * This updates the ip_xmit_attr with the IPsec policy.
5184 *
5185 * Remember -> we can be forwarding packets.  Keep that in mind w.r.t.
5186 * inner-packet contents.
5187 */
5188mblk_t *
5189ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
5190    ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len,
5191    ip_xmit_attr_t *ixa)
5192{
5193	ipsec_policy_head_t *polhead;
5194	ipsec_selector_t sel;
5195	mblk_t *nmp;
5196	boolean_t is_fragment;
5197	ipsec_policy_t *pol;
5198	ipsec_tun_pol_t *itp = iptun->iptun_itp;
5199	netstack_t *ns = iptun->iptun_ns;
5200	ipsec_stack_t *ipss = ns->netstack_ipsec;
5201
5202	ASSERT(outer_ipv6 != NULL && outer_ipv4 == NULL ||
5203	    outer_ipv4 != NULL && outer_ipv6 == NULL);
5204	/* We take care of inners in a bit. */
5205
5206	/* Are the IPsec fields initialized at all? */
5207	if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) {
5208		ASSERT(ixa->ixa_ipsec_policy == NULL);
5209		ASSERT(ixa->ixa_ipsec_latch == NULL);
5210		ASSERT(ixa->ixa_ipsec_action == NULL);
5211		ASSERT(ixa->ixa_ipsec_ah_sa == NULL);
5212		ASSERT(ixa->ixa_ipsec_esp_sa == NULL);
5213	}
5214
5215	ASSERT(itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE));
5216	polhead = itp->itp_policy;
5217
5218	bzero(&sel, sizeof (sel));
5219	if (inner_ipv4 != NULL) {
5220		ASSERT(inner_ipv6 == NULL);
5221		sel.ips_isv4 = B_TRUE;
5222		sel.ips_local_addr_v4 = inner_ipv4->ipha_src;
5223		sel.ips_remote_addr_v4 = inner_ipv4->ipha_dst;
5224		sel.ips_protocol = (uint8_t)inner_ipv4->ipha_protocol;
5225	} else {
5226		ASSERT(inner_ipv6 != NULL);
5227		sel.ips_isv4 = B_FALSE;
5228		sel.ips_local_addr_v6 = inner_ipv6->ip6_src;
5229		/*
5230		 * We don't care about routing-header dests in the
5231		 * forwarding/tunnel path, so just grab ip6_dst.
5232		 */
5233		sel.ips_remote_addr_v6 = inner_ipv6->ip6_dst;
5234	}
5235
5236	if (itp->itp_flags & ITPF_P_PER_PORT_SECURITY) {
5237		/*
5238		 * Caller can prepend the outer header, which means
5239		 * inner_ipv[46] may be stuck in the middle.  Pullup the whole
5240		 * mess now if need-be, for easier processing later.  Don't
5241		 * forget to rewire the outer header too.
5242		 */
5243		if (mp->b_cont != NULL) {
5244			nmp = msgpullup(mp, -1);
5245			if (nmp == NULL) {
5246				ip_drop_packet(mp, B_FALSE, NULL,
5247				    DROPPER(ipss, ipds_spd_nomem),
5248				    &ipss->ipsec_spd_dropper);
5249				return (NULL);
5250			}
5251			freemsg(mp);
5252			mp = nmp;
5253			if (outer_ipv4 != NULL)
5254				outer_ipv4 = (ipha_t *)mp->b_rptr;
5255			else
5256				outer_ipv6 = (ip6_t *)mp->b_rptr;
5257			if (inner_ipv4 != NULL) {
5258				inner_ipv4 =
5259				    (ipha_t *)(mp->b_rptr + outer_hdr_len);
5260			} else {
5261				inner_ipv6 =
5262				    (ip6_t *)(mp->b_rptr + outer_hdr_len);
5263			}
5264		}
5265		if (inner_ipv4 != NULL) {
5266			is_fragment = IS_V4_FRAGMENT(
5267			    inner_ipv4->ipha_fragment_offset_and_flags);
5268		} else {
5269			sel.ips_remote_addr_v6 = ip_get_dst_v6(inner_ipv6, mp,
5270			    &is_fragment);
5271		}
5272
5273		if (is_fragment) {
5274			ipha_t *oiph;
5275			ipha_t *iph = NULL;
5276			ip6_t *ip6h = NULL;
5277			int hdr_len;
5278			uint16_t ip6_hdr_length;
5279			uint8_t v6_proto;
5280			uint8_t *v6_proto_p;
5281
5282			/*
5283			 * We have a fragment we need to track!
5284			 */
5285			mp = ipsec_fragcache_add(&itp->itp_fragcache, NULL, mp,
5286			    outer_hdr_len, ipss);
5287			if (mp == NULL)
5288				return (NULL);
5289			ASSERT(mp->b_cont == NULL);
5290
5291			/*
5292			 * If we get here, we have a full fragment chain
5293			 */
5294
5295			oiph = (ipha_t *)mp->b_rptr;
5296			if (IPH_HDR_VERSION(oiph) == IPV4_VERSION) {
5297				hdr_len = ((outer_hdr_len != 0) ?
5298				    IPH_HDR_LENGTH(oiph) : 0);
5299				iph = (ipha_t *)(mp->b_rptr + hdr_len);
5300			} else {
5301				ASSERT(IPH_HDR_VERSION(oiph) == IPV6_VERSION);
5302				ip6h = (ip6_t *)mp->b_rptr;
5303				if (!ip_hdr_length_nexthdr_v6(mp, ip6h,
5304				    &ip6_hdr_length, &v6_proto_p)) {
5305					ip_drop_packet_chain(mp, B_FALSE, NULL,
5306					    DROPPER(ipss,
5307					    ipds_spd_malformed_packet),
5308					    &ipss->ipsec_spd_dropper);
5309					return (NULL);
5310				}
5311				hdr_len = ip6_hdr_length;
5312			}
5313			outer_hdr_len = hdr_len;
5314
5315			if (sel.ips_isv4) {
5316				if (iph == NULL) {
5317					/* Was v6 outer */
5318					iph = (ipha_t *)(mp->b_rptr + hdr_len);
5319				}
5320				inner_ipv4 = iph;
5321				sel.ips_local_addr_v4 = inner_ipv4->ipha_src;
5322				sel.ips_remote_addr_v4 = inner_ipv4->ipha_dst;
5323				sel.ips_protocol =
5324				    (uint8_t)inner_ipv4->ipha_protocol;
5325			} else {
5326				inner_ipv6 = (ip6_t *)(mp->b_rptr +
5327				    hdr_len);
5328				sel.ips_local_addr_v6 = inner_ipv6->ip6_src;
5329				sel.ips_remote_addr_v6 = inner_ipv6->ip6_dst;
5330				if (!ip_hdr_length_nexthdr_v6(mp,
5331				    inner_ipv6, &ip6_hdr_length, &v6_proto_p)) {
5332					ip_drop_packet_chain(mp, B_FALSE, NULL,
5333					    DROPPER(ipss,
5334					    ipds_spd_malformed_frag),
5335					    &ipss->ipsec_spd_dropper);
5336					return (NULL);
5337				}
5338				v6_proto = *v6_proto_p;
5339				sel.ips_protocol = v6_proto;
5340#ifdef FRAGCACHE_DEBUG
5341				cmn_err(CE_WARN, "v6_sel.ips_protocol = %d\n",
5342				    sel.ips_protocol);
5343#endif
5344			}
5345			/* Ports are extracted below */
5346		}
5347
5348		/* Get ports... */
5349		if (!ipsec_init_outbound_ports(&sel, mp,
5350		    inner_ipv4, inner_ipv6, outer_hdr_len, ipss)) {
5351			/* callee did ip_drop_packet_chain() on mp. */
5352			return (NULL);
5353		}
5354#ifdef FRAGCACHE_DEBUG
5355		if (inner_ipv4 != NULL)
5356			cmn_err(CE_WARN,
5357			    "(v4) sel.ips_protocol = %d, "
5358			    "sel.ips_local_port = %d, "
5359			    "sel.ips_remote_port = %d\n",
5360			    sel.ips_protocol, ntohs(sel.ips_local_port),
5361			    ntohs(sel.ips_remote_port));
5362		if (inner_ipv6 != NULL)
5363			cmn_err(CE_WARN,
5364			    "(v6) sel.ips_protocol = %d, "
5365			    "sel.ips_local_port = %d, "
5366			    "sel.ips_remote_port = %d\n",
5367			    sel.ips_protocol, ntohs(sel.ips_local_port),
5368			    ntohs(sel.ips_remote_port));
5369#endif
5370		/* Success so far! */
5371	}
5372	rw_enter(&polhead->iph_lock, RW_READER);
5373	pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND, &sel);
5374	rw_exit(&polhead->iph_lock);
5375	if (pol == NULL) {
5376		/*
5377		 * No matching policy on this tunnel, drop the packet.
5378		 *
5379		 * NOTE:  Tunnel-mode tunnels are different from the
5380		 * IP global transport mode policy head.  For a tunnel-mode
5381		 * tunnel, we drop the packet in lieu of passing it
5382		 * along accepted the way a global-policy miss would.
5383		 *
5384		 * NOTE2:  "negotiate transport" tunnels should match ALL
5385		 * inbound packets, but we do not uncomment the ASSERT()
5386		 * below because if/when we open PF_POLICY, a user can
5387		 * shoot themself in the foot with a 0 priority.
5388		 */
5389
5390		/* ASSERT(itp->itp_flags & ITPF_P_TUNNEL); */
5391#ifdef FRAGCACHE_DEBUG
5392		cmn_err(CE_WARN, "ipsec_tun_outbound(): No matching tunnel "
5393		    "per-port policy\n");
5394#endif
5395		ip_drop_packet_chain(mp, B_FALSE, NULL,
5396		    DROPPER(ipss, ipds_spd_explicit),
5397		    &ipss->ipsec_spd_dropper);
5398		return (NULL);
5399	}
5400
5401#ifdef FRAGCACHE_DEBUG
5402	cmn_err(CE_WARN, "Having matching tunnel per-port policy\n");
5403#endif
5404
5405	/*
5406	 * NOTE: ixa_cleanup() function will release pol references.
5407	 */
5408	ixa->ixa_ipsec_policy = pol;
5409	/*
5410	 * NOTE: There is a subtle difference between iptun_zoneid and
5411	 * iptun_connp->conn_zoneid explained in iptun_conn_create().  When
5412	 * interacting with the ip module, we must use conn_zoneid.
5413	 */
5414	ixa->ixa_zoneid = iptun->iptun_connp->conn_zoneid;
5415
5416	ASSERT((outer_ipv4 != NULL) ? (ixa->ixa_flags & IXAF_IS_IPV4) :
5417	    !(ixa->ixa_flags & IXAF_IS_IPV4));
5418	ASSERT(ixa->ixa_ipsec_policy != NULL);
5419	ixa->ixa_flags |= IXAF_IPSEC_SECURE;
5420
5421	if (!(itp->itp_flags & ITPF_P_TUNNEL)) {
5422		/* Set up transport mode for tunnelled packets. */
5423		ixa->ixa_ipsec_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP :
5424		    IPPROTO_IPV6;
5425		return (mp);
5426	}
5427
5428	/* Fill in tunnel-mode goodies here. */
5429	ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
5430	/* XXX Do I need to fill in all of the goodies here? */
5431	if (inner_ipv4) {
5432		ixa->ixa_ipsec_inaf = AF_INET;
5433		ixa->ixa_ipsec_insrc[0] =
5434		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v4;
5435		ixa->ixa_ipsec_indst[0] =
5436		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v4;
5437	} else {
5438		ixa->ixa_ipsec_inaf = AF_INET6;
5439		ixa->ixa_ipsec_insrc[0] =
5440		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[0];
5441		ixa->ixa_ipsec_insrc[1] =
5442		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[1];
5443		ixa->ixa_ipsec_insrc[2] =
5444		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[2];
5445		ixa->ixa_ipsec_insrc[3] =
5446		    pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[3];
5447		ixa->ixa_ipsec_indst[0] =
5448		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[0];
5449		ixa->ixa_ipsec_indst[1] =
5450		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[1];
5451		ixa->ixa_ipsec_indst[2] =
5452		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[2];
5453		ixa->ixa_ipsec_indst[3] =
5454		    pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[3];
5455	}
5456	ixa->ixa_ipsec_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen;
5457	ixa->ixa_ipsec_indstpfx = pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen;
5458	/* NOTE:  These are used for transport mode too. */
5459	ixa->ixa_ipsec_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport;
5460	ixa->ixa_ipsec_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport;
5461	ixa->ixa_ipsec_proto = pol->ipsp_sel->ipsl_key.ipsl_proto;
5462
5463	return (mp);
5464}
5465
5466/*
5467 * NOTE: The following releases pol's reference and
5468 * calls ip_drop_packet() for me on NULL returns.
5469 */
5470mblk_t *
5471ipsec_check_ipsecin_policy_reasm(mblk_t *attr_mp, ipsec_policy_t *pol,
5472    ipha_t *inner_ipv4, ip6_t *inner_ipv6, uint64_t pkt_unique, netstack_t *ns)
5473{
5474	/* Assume attr_mp is a chain of b_next-linked ip_recv_attr mblk. */
5475	mblk_t *data_chain = NULL, *data_tail = NULL;
5476	mblk_t *next;
5477	mblk_t *data_mp;
5478	ip_recv_attr_t	iras;
5479
5480	while (attr_mp != NULL) {
5481		ASSERT(ip_recv_attr_is_mblk(attr_mp));
5482		next = attr_mp->b_next;
5483		attr_mp->b_next = NULL;  /* No tripping asserts. */
5484
5485		data_mp = attr_mp->b_cont;
5486		attr_mp->b_cont = NULL;
5487		if (!ip_recv_attr_from_mblk(attr_mp, &iras)) {
5488			/* The ill or ip_stack_t disappeared on us */
5489			freemsg(data_mp);	/* ip_drop_packet?? */
5490			ira_cleanup(&iras, B_TRUE);
5491			goto fail;
5492		}
5493
5494		/*
5495		 * Need IPPOL_REFHOLD(pol) for extras because
5496		 * ipsecin_policy does the refrele.
5497		 */
5498		IPPOL_REFHOLD(pol);
5499
5500		data_mp = ipsec_check_ipsecin_policy(data_mp, pol, inner_ipv4,
5501		    inner_ipv6, pkt_unique, &iras, ns);
5502		ira_cleanup(&iras, B_TRUE);
5503
5504		if (data_mp == NULL)
5505			goto fail;
5506
5507		if (data_tail == NULL) {
5508			/* First one */
5509			data_chain = data_tail = data_mp;
5510		} else {
5511			data_tail->b_next = data_mp;
5512			data_tail = data_mp;
5513		}
5514		attr_mp = next;
5515	}
5516	/*
5517	 * One last release because either the loop bumped it up, or we never
5518	 * called ipsec_check_ipsecin_policy().
5519	 */
5520	IPPOL_REFRELE(pol);
5521
5522	/* data_chain is ready for return to tun module. */
5523	return (data_chain);
5524
5525fail:
5526	/*
5527	 * Need to get rid of any extra pol
5528	 * references, and any remaining bits as well.
5529	 */
5530	IPPOL_REFRELE(pol);
5531	ipsec_freemsg_chain(data_chain);
5532	ipsec_freemsg_chain(next);	/* ipdrop stats? */
5533	return (NULL);
5534}
5535
5536/*
5537 * Return a message if the inbound packet passed an IPsec policy check.  Returns
5538 * NULL if it failed or if it is a fragment needing its friends before a
5539 * policy check can be performed.
5540 *
5541 * Expects a non-NULL data_mp, and a non-NULL polhead.
5542 * The returned mblk may be a b_next chain of packets if fragments
5543 * neeeded to be collected for a proper policy check.
5544 *
5545 * This function calls ip_drop_packet() on data_mp if need be.
5546 *
5547 * NOTE:  outer_hdr_len is signed.  If it's a negative value, the caller
5548 * is inspecting an ICMP packet.
5549 */
5550mblk_t *
5551ipsec_tun_inbound(ip_recv_attr_t *ira, mblk_t *data_mp, ipsec_tun_pol_t *itp,
5552    ipha_t *inner_ipv4, ip6_t *inner_ipv6, ipha_t *outer_ipv4,
5553    ip6_t *outer_ipv6, int outer_hdr_len, netstack_t *ns)
5554{
5555	ipsec_policy_head_t *polhead;
5556	ipsec_selector_t sel;
5557	ipsec_policy_t *pol;
5558	uint16_t tmpport;
5559	selret_t rc;
5560	boolean_t port_policy_present, is_icmp, global_present;
5561	in6_addr_t tmpaddr;
5562	ipaddr_t tmp4;
5563	uint8_t flags, *inner_hdr;
5564	ipsec_stack_t *ipss = ns->netstack_ipsec;
5565
5566	sel.ips_is_icmp_inv_acq = 0;
5567
5568	if (outer_ipv4 != NULL) {
5569		ASSERT(outer_ipv6 == NULL);
5570		global_present = ipss->ipsec_inbound_v4_policy_present;
5571	} else {
5572		ASSERT(outer_ipv6 != NULL);
5573		global_present = ipss->ipsec_inbound_v6_policy_present;
5574	}
5575
5576	ASSERT(inner_ipv4 != NULL && inner_ipv6 == NULL ||
5577	    inner_ipv4 == NULL && inner_ipv6 != NULL);
5578
5579	if (outer_hdr_len < 0) {
5580		outer_hdr_len = (-outer_hdr_len);
5581		is_icmp = B_TRUE;
5582	} else {
5583		is_icmp = B_FALSE;
5584	}
5585
5586	if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
5587		mblk_t *mp = data_mp;
5588
5589		polhead = itp->itp_policy;
5590		/*
5591		 * We need to perform full Tunnel-Mode enforcement,
5592		 * and we need to have inner-header data for such enforcement.
5593		 *
5594		 * See ipsec_init_inbound_sel() for the 0x80000000 on inbound
5595		 * and on return.
5596		 */
5597
5598		port_policy_present = ((itp->itp_flags &
5599		    ITPF_P_PER_PORT_SECURITY) ? B_TRUE : B_FALSE);
5600		/*
5601		 * NOTE:  Even if our policy is transport mode, set the
5602		 * SEL_TUNNEL_MODE flag so ipsec_init_inbound_sel() can
5603		 * do the right thing w.r.t. outer headers.
5604		 */
5605		flags = ((port_policy_present ? SEL_PORT_POLICY : SEL_NONE) |
5606		    (is_icmp ? SEL_IS_ICMP : SEL_NONE) | SEL_TUNNEL_MODE);
5607
5608		rc = ipsec_init_inbound_sel(&sel, data_mp, inner_ipv4,
5609		    inner_ipv6, flags);
5610
5611		switch (rc) {
5612		case SELRET_NOMEM:
5613			ip_drop_packet(data_mp, B_TRUE, NULL,
5614			    DROPPER(ipss, ipds_spd_nomem),
5615			    &ipss->ipsec_spd_dropper);
5616			return (NULL);
5617		case SELRET_TUNFRAG:
5618			/*
5619			 * At this point, if we're cleartext, we don't want
5620			 * to go there.
5621			 */
5622			if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
5623				ip_drop_packet(data_mp, B_TRUE, NULL,
5624				    DROPPER(ipss, ipds_spd_got_clear),
5625				    &ipss->ipsec_spd_dropper);
5626				return (NULL);
5627			}
5628
5629			/*
5630			 * Inner and outer headers may not be contiguous.
5631			 * Pullup the data_mp now to satisfy assumptions of
5632			 * ipsec_fragcache_add()
5633			 */
5634			if (data_mp->b_cont != NULL) {
5635				mblk_t *nmp;
5636
5637				nmp = msgpullup(data_mp, -1);
5638				if (nmp == NULL) {
5639					ip_drop_packet(data_mp, B_TRUE, NULL,
5640					    DROPPER(ipss, ipds_spd_nomem),
5641					    &ipss->ipsec_spd_dropper);
5642					return (NULL);
5643				}
5644				freemsg(data_mp);
5645				data_mp = nmp;
5646				if (outer_ipv4 != NULL)
5647					outer_ipv4 =
5648					    (ipha_t *)data_mp->b_rptr;
5649				else
5650					outer_ipv6 =
5651					    (ip6_t *)data_mp->b_rptr;
5652				if (inner_ipv4 != NULL) {
5653					inner_ipv4 =
5654					    (ipha_t *)(data_mp->b_rptr +
5655					    outer_hdr_len);
5656				} else {
5657					inner_ipv6 =
5658					    (ip6_t *)(data_mp->b_rptr +
5659					    outer_hdr_len);
5660				}
5661			}
5662
5663			/*
5664			 * If we need to queue the packet. First we
5665			 * get an mblk with the attributes. ipsec_fragcache_add
5666			 * will prepend that to the queued data and return
5667			 * a list of b_next messages each of which starts with
5668			 * the attribute mblk.
5669			 */
5670			mp = ip_recv_attr_to_mblk(ira);
5671			if (mp == NULL) {
5672				ip_drop_packet(data_mp, B_TRUE, NULL,
5673				    DROPPER(ipss, ipds_spd_nomem),
5674				    &ipss->ipsec_spd_dropper);
5675				return (NULL);
5676			}
5677
5678			mp = ipsec_fragcache_add(&itp->itp_fragcache,
5679			    mp, data_mp, outer_hdr_len, ipss);
5680
5681			if (mp == NULL) {
5682				/*
5683				 * Data is cached, fragment chain is not
5684				 * complete.
5685				 */
5686				return (NULL);
5687			}
5688
5689			/*
5690			 * If we get here, we have a full fragment chain.
5691			 * Reacquire headers and selectors from first fragment.
5692			 */
5693			ASSERT(ip_recv_attr_is_mblk(mp));
5694			data_mp = mp->b_cont;
5695			inner_hdr = data_mp->b_rptr;
5696			if (outer_ipv4 != NULL) {
5697				inner_hdr += IPH_HDR_LENGTH(
5698				    (ipha_t *)data_mp->b_rptr);
5699			} else {
5700				inner_hdr += ip_hdr_length_v6(data_mp,
5701				    (ip6_t *)data_mp->b_rptr);
5702			}
5703			ASSERT(inner_hdr <= data_mp->b_wptr);
5704
5705			if (inner_ipv4 != NULL) {
5706				inner_ipv4 = (ipha_t *)inner_hdr;
5707				inner_ipv6 = NULL;
5708			} else {
5709				inner_ipv6 = (ip6_t *)inner_hdr;
5710				inner_ipv4 = NULL;
5711			}
5712
5713			/*
5714			 * Use SEL_TUNNEL_MODE to take into account the outer
5715			 * header.  Use SEL_POST_FRAG so we always get ports.
5716			 */
5717			rc = ipsec_init_inbound_sel(&sel, data_mp,
5718			    inner_ipv4, inner_ipv6,
5719			    SEL_TUNNEL_MODE | SEL_POST_FRAG);
5720			switch (rc) {
5721			case SELRET_SUCCESS:
5722				/*
5723				 * Get to same place as first caller's
5724				 * SELRET_SUCCESS case.
5725				 */
5726				break;
5727			case SELRET_NOMEM:
5728				ip_drop_packet_chain(mp, B_TRUE, NULL,
5729				    DROPPER(ipss, ipds_spd_nomem),
5730				    &ipss->ipsec_spd_dropper);
5731				return (NULL);
5732			case SELRET_BADPKT:
5733				ip_drop_packet_chain(mp, B_TRUE, NULL,
5734				    DROPPER(ipss, ipds_spd_malformed_frag),
5735				    &ipss->ipsec_spd_dropper);
5736				return (NULL);
5737			case SELRET_TUNFRAG:
5738				cmn_err(CE_WARN, "(TUNFRAG on 2nd call...)");
5739				/* FALLTHRU */
5740			default:
5741				cmn_err(CE_WARN, "ipsec_init_inbound_sel(mark2)"
5742				    " returns bizarro 0x%x", rc);
5743				/* Guaranteed panic! */
5744				ASSERT(rc == SELRET_NOMEM);
5745				return (NULL);
5746			}
5747			/* FALLTHRU */
5748		case SELRET_SUCCESS:
5749			/*
5750			 * Common case:
5751			 * No per-port policy or a non-fragment.  Keep going.
5752			 */
5753			break;
5754		case SELRET_BADPKT:
5755			/*
5756			 * We may receive ICMP (with IPv6 inner) packets that
5757			 * trigger this return value.  Send 'em in for
5758			 * enforcement checking.
5759			 */
5760			cmn_err(CE_NOTE, "ipsec_tun_inbound(): "
5761			    "sending 'bad packet' in for enforcement");
5762			break;
5763		default:
5764			cmn_err(CE_WARN,
5765			    "ipsec_init_inbound_sel() returns bizarro 0x%x",
5766			    rc);
5767			ASSERT(rc == SELRET_NOMEM);	/* Guaranteed panic! */
5768			return (NULL);
5769		}
5770
5771		if (is_icmp) {
5772			/*
5773			 * Swap local/remote because this is an ICMP packet.
5774			 */
5775			tmpaddr = sel.ips_local_addr_v6;
5776			sel.ips_local_addr_v6 = sel.ips_remote_addr_v6;
5777			sel.ips_remote_addr_v6 = tmpaddr;
5778			tmpport = sel.ips_local_port;
5779			sel.ips_local_port = sel.ips_remote_port;
5780			sel.ips_remote_port = tmpport;
5781		}
5782
5783		/* find_policy_head() */
5784		rw_enter(&polhead->iph_lock, RW_READER);
5785		pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND,
5786		    &sel);
5787		rw_exit(&polhead->iph_lock);
5788		if (pol != NULL) {
5789			uint64_t pkt_unique;
5790
5791			if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
5792				if (!pol->ipsp_act->ipa_allow_clear) {
5793					/*
5794					 * XXX should never get here with
5795					 * tunnel reassembled fragments?
5796					 */
5797					ASSERT(mp == data_mp);
5798					ip_drop_packet(data_mp, B_TRUE, NULL,
5799					    DROPPER(ipss, ipds_spd_got_clear),
5800					    &ipss->ipsec_spd_dropper);
5801					IPPOL_REFRELE(pol);
5802					return (NULL);
5803				} else {
5804					IPPOL_REFRELE(pol);
5805					return (mp);
5806				}
5807			}
5808			pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port,
5809			    sel.ips_local_port,
5810			    (inner_ipv4 == NULL) ? IPPROTO_IPV6 :
5811			    IPPROTO_ENCAP, sel.ips_protocol);
5812
5813			/*
5814			 * NOTE: The following releases pol's reference and
5815			 * calls ip_drop_packet() for me on NULL returns.
5816			 *
5817			 * "sel" is still good here, so let's use it!
5818			 */
5819			if (data_mp == mp) {
5820				/* A single packet without attributes */
5821				data_mp = ipsec_check_ipsecin_policy(data_mp,
5822				    pol, inner_ipv4, inner_ipv6, pkt_unique,
5823				    ira, ns);
5824			} else {
5825				/*
5826				 * We pass in the b_next chain of attr_mp's
5827				 * and get back a b_next chain of data_mp's.
5828				 */
5829				data_mp = ipsec_check_ipsecin_policy_reasm(mp,
5830				    pol, inner_ipv4, inner_ipv6, pkt_unique,
5831				    ns);
5832			}
5833			return (data_mp);
5834		}
5835
5836		/*
5837		 * Else fallthru and check the global policy on the outer
5838		 * header(s) if this tunnel is an old-style transport-mode
5839		 * one.  Drop the packet explicitly (no policy entry) for
5840		 * a new-style tunnel-mode tunnel.
5841		 */
5842		if ((itp->itp_flags & ITPF_P_TUNNEL) && !is_icmp) {
5843			ip_drop_packet_chain(data_mp, B_TRUE, NULL,
5844			    DROPPER(ipss, ipds_spd_explicit),
5845			    &ipss->ipsec_spd_dropper);
5846			return (NULL);
5847		}
5848	}
5849
5850	/*
5851	 * NOTE:  If we reach here, we will not have packet chains from
5852	 * fragcache_add(), because the only way I get chains is on a
5853	 * tunnel-mode tunnel, which either returns with a pass, or gets
5854	 * hit by the ip_drop_packet_chain() call right above here.
5855	 */
5856	ASSERT(data_mp->b_next == NULL);
5857
5858	/* If no per-tunnel security, check global policy now. */
5859	if ((ira->ira_flags & IRAF_IPSEC_SECURE) && !global_present) {
5860		if (ira->ira_flags & IRAF_TRUSTED_ICMP) {
5861			/*
5862			 * This is an ICMP message that was geenrated locally.
5863			 * We should accept it.
5864			 */
5865			return (data_mp);
5866		}
5867
5868		ip_drop_packet(data_mp, B_TRUE, NULL,
5869		    DROPPER(ipss, ipds_spd_got_secure),
5870		    &ipss->ipsec_spd_dropper);
5871		return (NULL);
5872	}
5873
5874	if (is_icmp) {
5875		/*
5876		 * For ICMP packets, "outer_ipvN" is set to the outer header
5877		 * that is *INSIDE* the ICMP payload.  For global policy
5878		 * checking, we need to reverse src/dst on the payload in
5879		 * order to construct selectors appropriately.  See "ripha"
5880		 * constructions in ip.c.  To avoid a bug like 6478464 (see
5881		 * earlier in this file), we will actually exchange src/dst
5882		 * in the packet, and reverse if after the call to
5883		 * ipsec_check_global_policy().
5884		 */
5885		if (outer_ipv4 != NULL) {
5886			tmp4 = outer_ipv4->ipha_src;
5887			outer_ipv4->ipha_src = outer_ipv4->ipha_dst;
5888			outer_ipv4->ipha_dst = tmp4;
5889		} else {
5890			ASSERT(outer_ipv6 != NULL);
5891			tmpaddr = outer_ipv6->ip6_src;
5892			outer_ipv6->ip6_src = outer_ipv6->ip6_dst;
5893			outer_ipv6->ip6_dst = tmpaddr;
5894		}
5895	}
5896
5897	data_mp = ipsec_check_global_policy(data_mp, NULL, outer_ipv4,
5898	    outer_ipv6, ira, ns);
5899	if (data_mp == NULL)
5900		return (NULL);
5901
5902	if (is_icmp) {
5903		/* Set things back to normal. */
5904		if (outer_ipv4 != NULL) {
5905			tmp4 = outer_ipv4->ipha_src;
5906			outer_ipv4->ipha_src = outer_ipv4->ipha_dst;
5907			outer_ipv4->ipha_dst = tmp4;
5908		} else {
5909			/* No need for ASSERT()s now. */
5910			tmpaddr = outer_ipv6->ip6_src;
5911			outer_ipv6->ip6_src = outer_ipv6->ip6_dst;
5912			outer_ipv6->ip6_dst = tmpaddr;
5913		}
5914	}
5915
5916	/*
5917	 * At this point, we pretend it's a cleartext accepted
5918	 * packet.
5919	 */
5920	return (data_mp);
5921}
5922
5923/*
5924 * AVL comparison routine for our list of tunnel polheads.
5925 */
5926static int
5927tunnel_compare(const void *arg1, const void *arg2)
5928{
5929	ipsec_tun_pol_t *left, *right;
5930	int rc;
5931
5932	left = (ipsec_tun_pol_t *)arg1;
5933	right = (ipsec_tun_pol_t *)arg2;
5934
5935	rc = strncmp(left->itp_name, right->itp_name, LIFNAMSIZ);
5936	return (rc == 0 ? rc : (rc > 0 ? 1 : -1));
5937}
5938
5939/*
5940 * Free a tunnel policy node.
5941 */
5942void
5943itp_free(ipsec_tun_pol_t *node, netstack_t *ns)
5944{
5945	if (node->itp_policy != NULL) {
5946		IPPH_REFRELE(node->itp_policy, ns);
5947		node->itp_policy = NULL;
5948	}
5949	if (node->itp_inactive != NULL) {
5950		IPPH_REFRELE(node->itp_inactive, ns);
5951		node->itp_inactive = NULL;
5952	}
5953	mutex_destroy(&node->itp_lock);
5954	kmem_free(node, sizeof (*node));
5955}
5956
5957void
5958itp_unlink(ipsec_tun_pol_t *node, netstack_t *ns)
5959{
5960	ipsec_stack_t *ipss = ns->netstack_ipsec;
5961
5962	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER);
5963	ipss->ipsec_tunnel_policy_gen++;
5964	ipsec_fragcache_uninit(&node->itp_fragcache, ipss);
5965	avl_remove(&ipss->ipsec_tunnel_policies, node);
5966	rw_exit(&ipss->ipsec_tunnel_policy_lock);
5967	ITP_REFRELE(node, ns);
5968}
5969
5970/*
5971 * Public interface to look up a tunnel security policy by name.  Used by
5972 * spdsock mostly.  Returns "node" with a bumped refcnt.
5973 */
5974ipsec_tun_pol_t *
5975get_tunnel_policy(char *name, netstack_t *ns)
5976{
5977	ipsec_tun_pol_t *node, lookup;
5978	ipsec_stack_t *ipss = ns->netstack_ipsec;
5979
5980	(void) strncpy(lookup.itp_name, name, LIFNAMSIZ);
5981
5982	rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_READER);