xref: /illumos-gate/usr/src/uts/common/io/mac/mac_util.c (revision 9820c710)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * MAC Services Module - misc utilities
28  */
29 
30 #include <sys/types.h>
31 #include <sys/mac.h>
32 #include <sys/mac_impl.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/mac_client_impl.h>
35 #include <sys/mac_soft_ring.h>
36 #include <sys/strsubr.h>
37 #include <sys/strsun.h>
38 #include <sys/vlan.h>
39 #include <sys/pattr.h>
40 #include <sys/pci_tools.h>
41 #include <inet/ip.h>
42 #include <inet/ip_impl.h>
43 #include <inet/ip6.h>
44 #include <sys/vtrace.h>
45 #include <sys/dlpi.h>
46 #include <sys/sunndi.h>
47 #include <inet/ipsec_impl.h>
48 #include <inet/sadb.h>
49 #include <inet/ipsecesp.h>
50 #include <inet/ipsecah.h>
51 
52 /*
53  * Copy an mblk, preserving its hardware checksum flags.
54  */
55 static mblk_t *
56 mac_copymsg_cksum(mblk_t *mp)
57 {
58 	mblk_t *mp1;
59 	uint32_t start, stuff, end, value, flags;
60 
61 	mp1 = copymsg(mp);
62 	if (mp1 == NULL)
63 		return (NULL);
64 
65 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
66 	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
67 	    flags, KM_NOSLEEP);
68 
69 	return (mp1);
70 }
71 
72 /*
73  * Copy an mblk chain, preserving the hardware checksum flags of the
74  * individual mblks.
75  */
76 mblk_t *
77 mac_copymsgchain_cksum(mblk_t *mp)
78 {
79 	mblk_t *nmp = NULL;
80 	mblk_t **nmpp = &nmp;
81 
82 	for (; mp != NULL; mp = mp->b_next) {
83 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
84 			freemsgchain(nmp);
85 			return (NULL);
86 		}
87 
88 		nmpp = &((*nmpp)->b_next);
89 	}
90 
91 	return (nmp);
92 }
93 
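/*
 * Editorial usage sketch (not part of the original source; the variable
 * names are hypothetical): duplicate a chain while keeping the offload
 * metadata intact.
 *
 *	mblk_t *copy = mac_copymsgchain_cksum(orig_chain);
 *	if (copy == NULL)
 *		return;		(allocation failed; the caller keeps orig_chain)
 *
 * On success, each mblk of the copy carries the HCK_* flags of its original.
 */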
94 /*
95  * Process the specified mblk chain for proper handling of hardware
96  * checksum offload. This routine is invoked for loopback traffic
97  * between MAC clients.
98  * The function handles a NULL mblk chain passed as an argument.
99  */
100 mblk_t *
101 mac_fix_cksum(mblk_t *mp_chain)
102 {
103 	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
104 	uint32_t flags, start, stuff, end, value;
105 
106 	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
107 		uint16_t len;
108 		uint32_t offset;
109 		struct ether_header *ehp;
110 		uint16_t sap;
111 
112 		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
113 		    &flags);
114 		if (flags == 0)
115 			continue;
116 
117 		/*
118 		 * Since the processing of checksum offload for loopback
119 		 * traffic requires modification of the packet contents,
120 		 * ensure that we are always modifying our own copy.
121 		 */
122 		if (DB_REF(mp) > 1) {
123 			mp1 = copymsg(mp);
124 			if (mp1 == NULL)
125 				continue;
126 			mp1->b_next = mp->b_next;
127 			mp->b_next = NULL;
128 			freemsg(mp);
129 			if (prev != NULL)
130 				prev->b_next = mp1;
131 			else
132 				new_chain = mp1;
133 			mp = mp1;
134 		}
135 
136 		/*
137 		 * Ethernet, and optionally VLAN header.
138 		 */
139 		/* LINTED: improper alignment cast */
140 		ehp = (struct ether_header *)mp->b_rptr;
141 		if (ntohs(ehp->ether_type) == VLAN_TPID) {
142 			struct ether_vlan_header *evhp;
143 
144 			ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
145 			/* LINTED: improper alignment cast */
146 			evhp = (struct ether_vlan_header *)mp->b_rptr;
147 			sap = ntohs(evhp->ether_type);
148 			offset = sizeof (struct ether_vlan_header);
149 		} else {
150 			sap = ntohs(ehp->ether_type);
151 			offset = sizeof (struct ether_header);
152 		}
153 
154 		if (MBLKL(mp) <= offset) {
155 			offset -= MBLKL(mp);
156 			if (mp->b_cont == NULL) {
157 				/* corrupted packet, skip it */
158 				if (prev != NULL)
159 					prev->b_next = mp->b_next;
160 				else
161 					new_chain = mp->b_next;
162 				mp1 = mp->b_next;
163 				mp->b_next = NULL;
164 				freemsg(mp);
165 				mp = mp1;
166 				continue;
167 			}
168 			mp = mp->b_cont;
169 		}
170 
171 		if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
172 			ipha_t *ipha = NULL;
173 
174 			/*
175 			 * In order to compute the full and header
176 			 * checksums, we need to find and parse
177 			 * the IP and/or ULP headers.
178 			 */
179 
180 			sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
181 
182 			/*
183 			 * IP header.
184 			 */
185 			if (sap != ETHERTYPE_IP)
186 				continue;
187 
188 			ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
189 			/* LINTED: improper alignment cast */
190 			ipha = (ipha_t *)(mp->b_rptr + offset);
191 
192 			if (flags & HCK_FULLCKSUM) {
193 				ipaddr_t src, dst;
194 				uint32_t cksum;
195 				uint16_t *up;
196 				uint8_t proto;
197 
198 				/*
199 				 * Pointer to checksum field in ULP header.
200 				 */
201 				proto = ipha->ipha_protocol;
202 				ASSERT(ipha->ipha_version_and_hdr_length ==
203 				    IP_SIMPLE_HDR_VERSION);
204 				if (proto == IPPROTO_TCP) {
205 					/* LINTED: improper alignment cast */
206 					up = IPH_TCPH_CHECKSUMP(ipha,
207 					    IP_SIMPLE_HDR_LENGTH);
208 				} else {
209 					ASSERT(proto == IPPROTO_UDP);
210 					/* LINTED: improper alignment cast */
211 					up = IPH_UDPH_CHECKSUMP(ipha,
212 					    IP_SIMPLE_HDR_LENGTH);
213 				}
214 
215 				/*
216 				 * Pseudo-header checksum.
217 				 */
218 				src = ipha->ipha_src;
219 				dst = ipha->ipha_dst;
220 				len = ntohs(ipha->ipha_length) -
221 				    IP_SIMPLE_HDR_LENGTH;
222 
223 				cksum = (dst >> 16) + (dst & 0xFFFF) +
224 				    (src >> 16) + (src & 0xFFFF);
225 				cksum += htons(len);
226 
227 				/*
228 				 * The checksum value stored in the packet needs
229 				 * to be correct. Compute it here.
230 				 */
231 				*up = 0;
232 				cksum += (((proto) == IPPROTO_UDP) ?
233 				    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
234 				cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
235 				    offset, cksum);
236 				*(up) = (uint16_t)(cksum ? cksum : ~cksum);
237 
238 				flags |= HCK_FULLCKSUM_OK;
239 				value = 0xffff;
240 			}
241 
242 			if (flags & HCK_IPV4_HDRCKSUM) {
243 				ASSERT(ipha != NULL);
244 				ipha->ipha_hdr_checksum =
245 				    (uint16_t)ip_csum_hdr(ipha);
246 			}
247 		}
248 
249 		if (flags & HCK_PARTIALCKSUM) {
250 			uint16_t *up, partial, cksum;
251 			uchar_t *ipp; /* ptr to beginning of IP header */
252 
253 			if (mp->b_cont != NULL) {
254 				mblk_t *mp1;
255 
256 				mp1 = msgpullup(mp, offset + end);
257 				if (mp1 == NULL)
258 					continue;
259 				mp1->b_next = mp->b_next;
260 				mp->b_next = NULL;
261 				freemsg(mp);
262 				if (prev != NULL)
263 					prev->b_next = mp1;
264 				else
265 					new_chain = mp1;
266 				mp = mp1;
267 			}
268 
269 			ipp = mp->b_rptr + offset;
270 			/* LINTED: cast may result in improper alignment */
271 			up = (uint16_t *)((uchar_t *)ipp + stuff);
272 			partial = *up;
273 			*up = 0;
274 
275 			cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
276 			    end - start, partial);
277 			cksum = ~cksum;
278 			*up = cksum ? cksum : ~cksum;
279 
280 			/*
281 			 * Since we already computed the whole checksum,
282 			 * indicate to the stack that it has already
283 			 * been verified by the hardware.
284 			 */
285 			flags &= ~HCK_PARTIALCKSUM;
286 			flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
287 			value = 0xffff;
288 		}
289 
290 		(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
291 		    value, flags, KM_NOSLEEP);
292 	}
293 
294 	return (new_chain);
295 }
296 
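/*
 * Editorial usage sketch (not part of the original source; the helper name
 * is hypothetical): a loopback sender would run its chain through
 * mac_fix_cksum() before handing it to the receiving client, so checksums
 * that hardware would normally complete are computed (or marked verified)
 * in software.
 *
 *	mblk_t *chain = mac_fix_cksum(tx_chain);
 *	if (chain != NULL)
 *		deliver_to_peer(chain);
 *
 * Corrupted packets may be dropped and individual mblks may be replaced by
 * copies or pulled-up messages, so only the returned chain should be used.
 */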
297 /*
298  * Add VLAN tag to the specified mblk.
299  */
300 mblk_t *
301 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
302 {
303 	mblk_t *hmp;
304 	struct ether_vlan_header *evhp;
305 	struct ether_header *ehp;
306 	uint32_t start, stuff, end, value, flags;
307 
308 	ASSERT(pri != 0 || vid != 0);
309 
310 	/*
311 	 * Allocate an mblk for the new tagged ethernet header,
312 	 * and copy the MAC addresses and ethertype from the
313 	 * original header.
314 	 */
315 
316 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
317 	if (hmp == NULL) {
318 		freemsg(mp);
319 		return (NULL);
320 	}
321 
322 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
323 	ehp = (struct ether_header *)mp->b_rptr;
324 
325 	bcopy(ehp, evhp, (ETHERADDRL * 2));
326 	evhp->ether_type = ehp->ether_type;
327 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
328 
329 	hmp->b_wptr += sizeof (struct ether_vlan_header);
330 	mp->b_rptr += sizeof (struct ether_header);
331 
332 	/*
333 	 * Free the original message if it's now empty. Link the
334 	 * rest of the messages to the header message.
335 	 */
336 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
337 	(void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags,
338 	    KM_NOSLEEP);
339 	if (MBLKL(mp) == 0) {
340 		hmp->b_cont = mp->b_cont;
341 		freeb(mp);
342 	} else {
343 		hmp->b_cont = mp;
344 	}
345 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
346 
347 	/*
348 	 * Initialize the new TCI (Tag Control Information).
349 	 */
350 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
351 
352 	return (hmp);
353 }
354 
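/*
 * Editorial note (not part of the original source): the TCI written by
 * mac_add_vlan_tag() packs the 3-bit priority, a zero CFI bit and the
 * 12-bit VID as defined by IEEE 802.1Q. A hypothetical caller tagging a
 * packet for VID 100 at priority 3:
 *
 *	mp = mac_add_vlan_tag(mp, 3, 100);
 *	if (mp == NULL)
 *		return;		(the original message has already been freed)
 */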
355 /*
356  * Adds a VLAN tag with the specified VID and priority to each mblk of
357  * the specified chain.
358  */
359 mblk_t *
360 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
361 {
362 	mblk_t *next_mp, **prev, *mp;
363 
364 	mp = mp_chain;
365 	prev = &mp_chain;
366 
367 	while (mp != NULL) {
368 		next_mp = mp->b_next;
369 		mp->b_next = NULL;
370 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
371 			freemsgchain(next_mp);
372 			break;
373 		}
374 		*prev = mp;
375 		prev = &mp->b_next;
376 		mp = mp->b_next = next_mp;
377 	}
378 
379 	return (mp_chain);
380 }
381 
382 /*
383  * Strip VLAN tag
384  */
385 mblk_t *
386 mac_strip_vlan_tag(mblk_t *mp)
387 {
388 	mblk_t *newmp;
389 	struct ether_vlan_header *evhp;
390 
391 	evhp = (struct ether_vlan_header *)mp->b_rptr;
392 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
393 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
394 
395 		if (DB_REF(mp) > 1) {
396 			newmp = copymsg(mp);
397 			if (newmp == NULL)
398 				return (NULL);
399 			freemsg(mp);
400 			mp = newmp;
401 		}
402 
403 		evhp = (struct ether_vlan_header *)mp->b_rptr;
404 
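		/*
		 * Slide the destination and source MAC addresses (12 bytes)
		 * forward by VLAN_TAGSZ, overwriting the TPID and TCI fields,
		 * then advance b_rptr past the stale leading bytes so the
		 * packet begins with an untagged Ethernet header.
		 */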
405 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
406 		mp->b_rptr += VLAN_TAGSZ;
407 	}
408 	return (mp);
409 }
410 
411 /*
412  * Strip VLAN tag from each mblk of the chain.
413  */
414 mblk_t *
415 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
416 {
417 	mblk_t *mp, *next_mp, **prev;
418 
419 	mp = mp_chain;
420 	prev = &mp_chain;
421 
422 	while (mp != NULL) {
423 		next_mp = mp->b_next;
424 		mp->b_next = NULL;
425 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
426 			freemsgchain(next_mp);
427 			break;
428 		}
429 		*prev = mp;
430 		prev = &mp->b_next;
431 		mp = mp->b_next = next_mp;
432 	}
433 
434 	return (mp_chain);
435 }
436 
437 /*
438  * Default callback function. Used when the datapath is not yet initialized.
439  */
440 /* ARGSUSED */
441 void
442 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
443     boolean_t loopback)
444 {
445 	mblk_t	*mp1 = mp;
446 
447 	while (mp1 != NULL) {
448 		mp1->b_prev = NULL;
449 		mp1->b_queue = NULL;
450 		mp1 = mp1->b_next;
451 	}
452 	freemsgchain(mp);
453 }
454 
455 /*
456  * Determines the IPv6 header length accounting for all the optional IPv6
457  * headers (hop-by-hop, destination, routing and fragment). The header length
458  * and next header value (a transport header) are captured.
459  *
460  * Returns B_FALSE if the IP headers are not all in the same mblk;
461  * otherwise returns B_TRUE.
462  */
463 boolean_t
464 mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length,
465     uint8_t *next_hdr, boolean_t *ip_fragmented, uint32_t *ip_frag_ident)
466 {
467 	uint16_t length;
468 	uint_t	ehdrlen;
469 	uint8_t *whereptr;
470 	uint8_t *endptr;
471 	uint8_t *nexthdrp;
472 	ip6_dest_t *desthdr;
473 	ip6_rthdr_t *rthdr;
474 	ip6_frag_t *fraghdr;
475 
476 	endptr = mp->b_wptr;
477 	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
478 		return (B_FALSE);
479 	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
480 	length = IPV6_HDR_LEN;
481 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
482 
483 	if (ip_fragmented != NULL)
484 		*ip_fragmented = B_FALSE;
485 
486 	nexthdrp = &ip6h->ip6_nxt;
487 	while (whereptr < endptr) {
488 		/* Is there enough left for len + nexthdr? */
489 		if (whereptr + MIN_EHDR_LEN > endptr)
490 			break;
491 
492 		switch (*nexthdrp) {
493 		case IPPROTO_HOPOPTS:
494 		case IPPROTO_DSTOPTS:
495 			/* Assumes the headers are identical for hbh and dst */
496 			desthdr = (ip6_dest_t *)whereptr;
497 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
498 			if ((uchar_t *)desthdr + ehdrlen > endptr)
499 				return (B_FALSE);
500 			nexthdrp = &desthdr->ip6d_nxt;
501 			break;
502 		case IPPROTO_ROUTING:
503 			rthdr = (ip6_rthdr_t *)whereptr;
504 			ehdrlen = 8 * (rthdr->ip6r_len + 1);
505 			if ((uchar_t *)rthdr + ehdrlen > endptr)
506 				return (B_FALSE);
507 			nexthdrp = &rthdr->ip6r_nxt;
508 			break;
509 		case IPPROTO_FRAGMENT:
510 			fraghdr = (ip6_frag_t *)whereptr;
511 			ehdrlen = sizeof (ip6_frag_t);
512 			if ((uchar_t *)&fraghdr[1] > endptr)
513 				return (B_FALSE);
514 			nexthdrp = &fraghdr->ip6f_nxt;
515 			if (ip_fragmented != NULL)
516 				*ip_fragmented = B_TRUE;
517 			if (ip_frag_ident != NULL)
518 				*ip_frag_ident = fraghdr->ip6f_ident;
519 			break;
520 		case IPPROTO_NONE:
521 			/* No next header means we're finished */
522 		default:
523 			*hdr_length = length;
524 			*next_hdr = *nexthdrp;
525 			return (B_TRUE);
526 		}
527 		length += ehdrlen;
528 		whereptr += ehdrlen;
529 		*hdr_length = length;
530 		*next_hdr = *nexthdrp;
531 	}
532 	switch (*nexthdrp) {
533 	case IPPROTO_HOPOPTS:
534 	case IPPROTO_DSTOPTS:
535 	case IPPROTO_ROUTING:
536 	case IPPROTO_FRAGMENT:
537 		/*
538 		 * If any known extension headers are still to be processed,
539 		 * the packet is malformed (or at least the IP headers are not
540 		 * all in the same mblk), and that should never happen.
541 		 */
542 		return (B_FALSE);
543 
544 	default:
545 		/*
546 		 * If we get here, we know that all of the IP headers were in
547 		 * the same mblk, even if the ULP header is in the next mblk.
548 		 */
549 		*hdr_length = length;
550 		*next_hdr = *nexthdrp;
551 		return (B_TRUE);
552 	}
553 }
554 
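/*
 * Editorial usage sketch (not part of the original source; local names are
 * hypothetical): mac_pkt_hash() below uses this routine in essentially the
 * following way.
 *
 *	uint16_t hdr_len;
 *	uint8_t ulp;
 *	boolean_t frag;
 *	uint32_t frag_ident;
 *
 *	if (!mac_ip_hdr_length_v6(mp, ip6hp, &hdr_len, &ulp,
 *	    &frag, &frag_ident))
 *		return (0);	(headers are split across mblks; give up)
 *
 * On success, ulp holds the transport protocol and hdr_len the total IPv6
 * header length, including any extension headers.
 */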
555 typedef struct mac_dladm_intr {
556 	int	ino;
557 	int	cpu_id;
558 	char	driver_path[MAXPATHLEN];
559 	char	nexus_path[MAXPATHLEN];
560 } mac_dladm_intr_t;
561 
562 /* Bind the interrupt to cpu_num */
563 static int
564 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int ino)
565 {
566 	pcitool_intr_set_t	iset;
567 	int			err;
568 
569 	iset.ino = ino;
570 	iset.cpu_id = cpu_num;
571 	iset.user_version = PCITOOL_VERSION;
572 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
573 	    kcred, NULL);
574 
575 	return (err);
576 }
577 
578 /*
579  * Search the interrupt information. iget_p holds the info to be searched.
580  */
581 static boolean_t
582 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
583 {
584 	int	i;
585 	char	driver_path[2 * MAXPATHLEN];
586 
587 	for (i = 0; i < iget_p->num_devs; i++) {
588 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
589 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
590 		    ":%s%d", iget_p->dev[i].driver_name,
591 		    iget_p->dev[i].dev_inst);
592 		/* Match this entry's device path against the one we want */
593 		if (strcmp(driver_path, dln->driver_path) == 0) {
594 			dln->ino = iget_p->ino;
595 			dln->cpu_id = iget_p->cpu_id;
596 			return (B_TRUE);
597 		}
598 	}
599 	return (B_FALSE);
600 }
601 
602 /*
603  * Get information about ino, i.e. whether this is the interrupt for our
604  * device, where it is bound, etc.
605  */
606 static boolean_t
607 mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln)
608 {
609 	pcitool_intr_get_t	*iget_p;
610 	int			ipsz;
611 	int			nipsz;
612 	int			err;
613 	uint8_t			inum;
614 
615 	/*
616 	 * Check if SLEEP is OK, i.e. whether we could come here in response to
617 	 * changing the fanout due to some callback from the driver, say
618 	 * link speed changes.
619 	 */
620 	ipsz = PCITOOL_IGET_SIZE(0);
621 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
622 
623 	iget_p->num_devs_ret = 0;
624 	iget_p->user_version = PCITOOL_VERSION;
625 	iget_p->ino = ino;
626 
627 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
628 	    FKIOCTL, kcred, NULL);
629 	if (err != 0) {
630 		kmem_free(iget_p, ipsz);
631 		return (B_FALSE);
632 	}
633 	if (iget_p->num_devs == 0) {
634 		kmem_free(iget_p, ipsz);
635 		return (B_FALSE);
636 	}
637 	inum = iget_p->num_devs;
638 	if (iget_p->num_devs_ret < iget_p->num_devs) {
639 		/* Reallocate */
640 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
641 
642 		kmem_free(iget_p, ipsz);
643 		ipsz = nipsz;
644 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
645 
646 		iget_p->num_devs_ret = inum;
647 		iget_p->ino = ino;
648 		iget_p->user_version = PCITOOL_VERSION;
649 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
650 		    FKIOCTL, kcred, NULL);
651 		if (err != 0) {
652 			kmem_free(iget_p, ipsz);
653 			return (B_FALSE);
654 		}
655 		/* defensive */
656 		if (iget_p->num_devs != iget_p->num_devs_ret) {
657 			kmem_free(iget_p, ipsz);
658 			return (B_FALSE);
659 		}
660 	}
661 
662 	if (mac_search_intrinfo(iget_p, dln)) {
663 		kmem_free(iget_p, ipsz);
664 		return (B_TRUE);
665 	}
666 	kmem_free(iget_p, ipsz);
667 	return (B_FALSE);
668 }
669 
670 /*
671  * Get the interrupts and check each one to see if it is for our device.
672  */
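/*
 * (Editorial summary of the return values below: 0 if the device's
 * interrupt was found and is already bound to cpuid, 1 if it was found but
 * is bound to a different CPU, and -1 if it could not be located.)
 */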
673 static int
674 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
675 {
676 	pcitool_intr_info_t	intr_info;
677 	int			err;
678 	int			ino;
679 
680 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
681 	    FKIOCTL, kcred, NULL);
682 	if (err != 0)
683 		return (-1);
684 
685 	for (ino = 0; ino < intr_info.num_intr; ino++) {
686 		if (mac_get_single_intr(lh, ino, dln)) {
687 			if (dln->cpu_id == cpuid)
688 				return (0);
689 			return (1);
690 		}
691 	}
692 	return (-1);
693 }
694 
695 /*
696  * Obtain the nexus parent node info for mdip.
697  */
698 static dev_info_t *
699 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
700 {
701 	struct dev_info		*tdip = (struct dev_info *)mdip;
702 	struct ddi_minor_data	*minordata;
703 	int			circ;
704 	dev_info_t		*pdip;
705 	char			pathname[MAXPATHLEN];
706 
707 	while (tdip != NULL) {
708 		/*
709 		 * The netboot code could call this function while walking the
710 		 * device tree so we need to use ndi_devi_tryenter() here to
711 		 * avoid deadlock.
712 		 */
713 		if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
714 			break;
715 
716 		for (minordata = tdip->devi_minor; minordata != NULL;
717 		    minordata = minordata->next) {
718 			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
719 			    strlen(DDI_NT_INTRCTL)) == 0) {
720 				pdip = minordata->dip;
721 				(void) ddi_pathname(pdip, pathname);
722 				(void) snprintf(dln->nexus_path, MAXPATHLEN,
723 				    "/devices%s:intr", pathname);
724 				(void) ddi_pathname_minor(minordata, pathname);
725 				ndi_devi_exit((dev_info_t *)tdip, circ);
726 				return (pdip);
727 			}
728 		}
729 		ndi_devi_exit((dev_info_t *)tdip, circ);
730 		tdip = tdip->devi_parent;
731 	}
732 	return (NULL);
733 }
734 
735 /*
736  * For a primary MAC client, if the user has set a list of CPUs or
737  * we have obtained it implicitly, we try to retarget the interrupt
738  * for that device on one of the CPUs in the list.
739  * We assign the interrupt to the same CPU as the poll thread.
740  */
741 static boolean_t
742 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
743 {
744 	ldi_handle_t		lh = NULL;
745 	ldi_ident_t		li = NULL;
746 	int			err;
747 	int			ret;
748 	mac_dladm_intr_t	dln;
749 	dev_info_t		*dip;
750 	struct ddi_minor_data	*minordata;
751 
752 	dln.nexus_path[0] = '\0';
753 	dln.driver_path[0] = '\0';
754 
755 	minordata = ((struct dev_info *)mdip)->devi_minor;
756 	while (minordata != NULL) {
757 		if (minordata->type == DDM_MINOR)
758 			break;
759 		minordata = minordata->next;
760 	}
761 	if (minordata == NULL)
762 		return (B_FALSE);
763 
764 	(void) ddi_pathname_minor(minordata, dln.driver_path);
765 
766 	dip = mac_get_nexus_node(mdip, &dln);
767 	/* defensive */
768 	if (dip == NULL)
769 		return (B_FALSE);
770 
771 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
772 	if (err != 0)
773 		return (B_FALSE);
774 
775 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
776 	if (err != 0)
777 		return (B_FALSE);
778 
779 	ret = mac_validate_intr(lh, &dln, cpuid);
780 	if (ret < 0) {
781 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
782 		return (B_FALSE);
783 	}
784 	/* cmn_note? */
785 	if (ret != 0)
786 		if ((err = (mac_set_intr(lh, cpuid, dln.ino))) != 0) {
787 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
788 			return (B_FALSE);
789 		}
790 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
791 	return (B_TRUE);
792 }
793 
794 void
795 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
796 {
797 	dev_info_t		*mdip = (dev_info_t *)arg;
798 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
799 	mac_resource_props_t	*mrp;
800 	mac_perim_handle_t	mph;
801 
802 	if (cpuid == -1 || !mac_check_interrupt_binding(mdip, cpuid))
803 		return;
804 
805 	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
806 	mrp = MCIP_RESOURCE_PROPS(mcip);
807 	mrp->mrp_intr_cpu = cpuid;
808 	mac_perim_exit(mph);
809 }
810 
811 int32_t
812 mac_client_intr_cpu(mac_client_handle_t mch)
813 {
814 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
815 	mac_cpus_t		*srs_cpu;
816 	mac_soft_ring_set_t	*rx_srs;
817 	flow_entry_t		*flent = mcip->mci_flent;
818 	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
819 
820 	/*
821 	 * Check if we need to retarget the interrupt. We do this only
822 	 * for the primary MAC client, and only if it has the sole
823 	 * exclusive ring in the group.
824 	 */
825 	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
826 		rx_srs = flent->fe_rx_srs[1];
827 		srs_cpu = &rx_srs->srs_cpu;
828 		if (mrp->mrp_intr_cpu == srs_cpu->mc_pollid)
829 			return (-1);
830 		return (srs_cpu->mc_pollid);
831 	}
832 	return (-1);
833 }
834 
835 void *
836 mac_get_devinfo(mac_handle_t mh)
837 {
838 	mac_impl_t	*mip = (mac_impl_t *)mh;
839 
840 	return ((void *)mip->mi_dip);
841 }
842 
843 #define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
844 #define	PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
845 #define	PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
846 
847 uint64_t
848 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
849 {
850 	struct ether_header *ehp;
851 	uint64_t hash = 0;
852 	uint16_t sap;
853 	uint_t skip_len;
854 	uint8_t proto;
855 	boolean_t ip_fragmented;
856 
857 	/*
858 	 * We may want to have one of these per MAC type plugin in the
859 	 * future. For now it supports only Ethernet.
860 	 */
861 	if (media != DL_ETHER)
862 		return (0L);
863 
864 	/* for now we support only outbound packets */
865 	ASSERT(is_outbound);
866 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
867 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
868 
869 	/* compute L2 hash */
870 
871 	ehp = (struct ether_header *)mp->b_rptr;
872 
873 	if ((policy & MAC_PKT_HASH_L2) != 0) {
874 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
875 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
876 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
877 		policy &= ~MAC_PKT_HASH_L2;
878 	}
879 
880 	if (policy == 0)
881 		goto done;
882 
883 	/* skip ethernet header */
884 
885 	sap = ntohs(ehp->ether_type);
886 	if (sap == ETHERTYPE_VLAN) {
887 		struct ether_vlan_header *evhp;
888 		mblk_t *newmp = NULL;
889 
890 		skip_len = sizeof (struct ether_vlan_header);
891 		if (MBLKL(mp) < skip_len) {
892 			/* the vlan tag is the payload, pull up first */
893 			newmp = msgpullup(mp, -1);
894 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
895 				goto done;
896 			}
897 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
898 		} else {
899 			evhp = (struct ether_vlan_header *)mp->b_rptr;
900 		}
901 
902 		sap = ntohs(evhp->ether_type);
903 		freemsg(newmp);
904 	} else {
905 		skip_len = sizeof (struct ether_header);
906 	}
907 
908 	/* if ethernet header is in its own mblk, skip it */
909 	if (MBLKL(mp) <= skip_len) {
910 		skip_len -= MBLKL(mp);
911 		mp = mp->b_cont;
912 		if (mp == NULL)
913 			goto done;
914 	}
915 
916 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
917 
918 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
919 
920 	switch (sap) {
921 	case ETHERTYPE_IP: {
922 		ipha_t *iphp;
923 
924 		/*
925 		 * If the header is not aligned or the header doesn't fit
926 		 * in the mblk, bail now. Note that this may cause packet
927 		 * reordering.
928 		 */
929 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
930 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
931 		    !OK_32PTR((char *)iphp))
932 			goto done;
933 
934 		proto = iphp->ipha_protocol;
935 		skip_len += IPH_HDR_LENGTH(iphp);
936 
937 		/* Check if the packet is fragmented. */
938 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
939 		    IPH_OFFSET;
940 
941 		/*
942 		 * For fragmented packets, use addresses in addition to
943 		 * the frag_id to generate the hash in order to get
944 		 * better distribution.
945 		 */
946 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
947 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
948 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
949 
950 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
951 			    PKT_HASH_4BYTES(ip_dst));
952 			policy &= ~MAC_PKT_HASH_L3;
953 		}
954 
955 		if (ip_fragmented) {
956 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
957 			hash ^= PKT_HASH_2BYTES(identp);
958 			goto done;
959 		}
960 		break;
961 	}
962 	case ETHERTYPE_IPV6: {
963 		ip6_t *ip6hp;
964 		uint16_t hdr_length;
965 		uint32_t ip_frag_ident;
966 
967 		/*
968 		 * If the header is not aligned or the header doesn't fit
969 		 * in the mblk, bail now. Note that this may cause packet
970 		 * reordering.
971 		 */
972 
973 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
974 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
975 		    !OK_32PTR((char *)ip6hp))
976 			goto done;
977 
978 		if (!mac_ip_hdr_length_v6(mp, ip6hp, &hdr_length, &proto,
979 		    &ip_fragmented, &ip_frag_ident))
980 			goto done;
981 		skip_len += hdr_length;
982 
983 		/*
984 		 * For fragmented packets, use addresses in addition to
985 		 * the frag_id to generate the hash in order to get
986 		 * better distribution.
987 		 */
988 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
989 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
990 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
991 
992 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
993 			    PKT_HASH_4BYTES(ip_dst));
994 			policy &= ~MAC_PKT_HASH_L3;
995 		}
996 
997 		if (ip_fragmented) {
998 			uint8_t *identp = (uint8_t *)&ip_frag_ident;
999 			hash ^= PKT_HASH_4BYTES(identp);
1000 			goto done;
1001 		}
1002 		break;
1003 	}
1004 	default:
1005 		goto done;
1006 	}
1007 
1008 	if (policy == 0)
1009 		goto done;
1010 
1011 	/* if ip header is in its own mblk, skip it */
1012 	if (MBLKL(mp) <= skip_len) {
1013 		skip_len -= MBLKL(mp);
1014 		mp = mp->b_cont;
1015 		if (mp == NULL)
1016 			goto done;
1017 	}
1018 
1019 	/* parse ULP header */
1020 again:
1021 	switch (proto) {
1022 	case IPPROTO_TCP:
1023 	case IPPROTO_UDP:
1024 	case IPPROTO_ESP:
1025 	case IPPROTO_SCTP:
1026 		/*
1027 		 * These Internet Protocols are intentionally designed
1028 		 * for hashing from the get-go.  Port numbers are in the first
1029 		 * word for transports, SPI is first for ESP.
1030 		 */
1031 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1032 			goto done;
1033 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1034 		break;
1035 
1036 	case IPPROTO_AH: {
1037 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1038 		uint_t ah_length = AH_TOTAL_LEN(ah);
1039 
1040 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1041 			goto done;
1042 
1043 		proto = ah->ah_nexthdr;
1044 		skip_len += ah_length;
1045 
1046 		/* if AH header is in its own mblk, skip it */
1047 		if (MBLKL(mp) <= skip_len) {
1048 			skip_len -= MBLKL(mp);
1049 			mp = mp->b_cont;
1050 			if (mp == NULL)
1051 				goto done;
1052 		}
1053 
1054 		goto again;
1055 	}
1056 	}
1057 
1058 done:
1059 	return (hash);
1060 }
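/*
 * Editorial usage sketch (not part of the original source; the ring array
 * and count are hypothetical): a transmit fanout path could map the hash
 * onto one of its rings.
 *
 *	uint64_t h = mac_pkt_hash(DL_ETHER, mp,
 *	    MAC_PKT_HASH_L2 | MAC_PKT_HASH_L3, B_TRUE);
 *	ring = &tx_rings[h % tx_ring_cnt];
 */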
1061