xref: /illumos-gate/usr/src/uts/common/io/mac/mac_util.c (revision ec71f88e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2018 Joyent, Inc.
24  */
25 
26 /*
27  * MAC Services Module - misc utilities
28  */
29 
30 #include <sys/types.h>
31 #include <sys/mac.h>
32 #include <sys/mac_impl.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/mac_client_impl.h>
35 #include <sys/mac_soft_ring.h>
36 #include <sys/strsubr.h>
37 #include <sys/strsun.h>
38 #include <sys/vlan.h>
39 #include <sys/pattr.h>
40 #include <sys/pci_tools.h>
41 #include <inet/ip.h>
42 #include <inet/ip_impl.h>
43 #include <inet/ip6.h>
44 #include <sys/vtrace.h>
45 #include <sys/dlpi.h>
46 #include <sys/sunndi.h>
47 #include <inet/ipsec_impl.h>
48 #include <inet/sadb.h>
49 #include <inet/ipsecesp.h>
50 #include <inet/ipsecah.h>
51 
52 /*
53  * Copy an mblk, preserving its hardware checksum flags.
54  */
55 static mblk_t *
56 mac_copymsg_cksum(mblk_t *mp)
57 {
58 	mblk_t *mp1;
59 
60 	mp1 = copymsg(mp);
61 	if (mp1 == NULL)
62 		return (NULL);
63 
64 	mac_hcksum_clone(mp, mp1);
65 
66 	return (mp1);
67 }
68 
69 /*
70  * Copy an mblk chain, presenting the hardware checksum flags of the
71  * individual mblks.
72  */
73 mblk_t *
74 mac_copymsgchain_cksum(mblk_t *mp)
75 {
76 	mblk_t *nmp = NULL;
77 	mblk_t **nmpp = &nmp;
78 
79 	for (; mp != NULL; mp = mp->b_next) {
80 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
81 			freemsgchain(nmp);
82 			return (NULL);
83 		}
84 
85 		nmpp = &((*nmpp)->b_next);
86 	}
87 
88 	return (nmp);
89 }
90 
91 /*
92  * Process the specified mblk chain for proper handling of hardware
93  * checksum offload. This routine is invoked for loopback traffic
94  * between MAC clients.
95  * The function handles a NULL mblk chain passed as argument.
96  */
97 mblk_t *
98 mac_fix_cksum(mblk_t *mp_chain)
99 {
100 	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
101 	uint32_t flags, start, stuff, end, value;
102 
103 	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
104 		uint16_t len;
105 		uint32_t offset;
106 		struct ether_header *ehp;
107 		uint16_t sap;
108 
109 		mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
110 		if (flags == 0)
111 			continue;
112 
113 		/*
114 		 * Since the processing of checksum offload for loopback
115 		 * traffic requires modification of the packet contents,
116 		 * ensure sure that we are always modifying our own copy.
117 		 */
118 		if (DB_REF(mp) > 1) {
119 			mp1 = copymsg(mp);
120 			if (mp1 == NULL)
121 				continue;
122 			mp1->b_next = mp->b_next;
123 			mp->b_next = NULL;
124 			freemsg(mp);
125 			if (prev != NULL)
126 				prev->b_next = mp1;
127 			else
128 				new_chain = mp1;
129 			mp = mp1;
130 		}
131 
132 		/*
133 		 * Ethernet, and optionally VLAN header.
134 		 */
135 		/* LINTED: improper alignment cast */
136 		ehp = (struct ether_header *)mp->b_rptr;
137 		if (ntohs(ehp->ether_type) == VLAN_TPID) {
138 			struct ether_vlan_header *evhp;
139 
140 			ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
141 			/* LINTED: improper alignment cast */
142 			evhp = (struct ether_vlan_header *)mp->b_rptr;
143 			sap = ntohs(evhp->ether_type);
144 			offset = sizeof (struct ether_vlan_header);
145 		} else {
146 			sap = ntohs(ehp->ether_type);
147 			offset = sizeof (struct ether_header);
148 		}
149 
150 		if (MBLKL(mp) <= offset) {
151 			offset -= MBLKL(mp);
152 			if (mp->b_cont == NULL) {
153 				/* corrupted packet, skip it */
154 				if (prev != NULL)
155 					prev->b_next = mp->b_next;
156 				else
157 					new_chain = mp->b_next;
158 				mp1 = mp->b_next;
159 				mp->b_next = NULL;
160 				freemsg(mp);
161 				mp = mp1;
162 				continue;
163 			}
164 			mp = mp->b_cont;
165 		}
166 
167 		if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
168 			ipha_t *ipha = NULL;
169 
170 			/*
171 			 * In order to compute the full and header
172 			 * checksums, we need to find and parse
173 			 * the IP and/or ULP headers.
174 			 */
175 
176 			sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
177 
178 			/*
179 			 * IP header.
180 			 */
181 			if (sap != ETHERTYPE_IP)
182 				continue;
183 
184 			ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
185 			/* LINTED: improper alignment cast */
186 			ipha = (ipha_t *)(mp->b_rptr + offset);
187 
188 			if (flags & HCK_FULLCKSUM) {
189 				ipaddr_t src, dst;
190 				uint32_t cksum;
191 				uint16_t *up;
192 				uint8_t proto;
193 
194 				/*
195 				 * Pointer to checksum field in ULP header.
196 				 */
197 				proto = ipha->ipha_protocol;
198 				ASSERT(ipha->ipha_version_and_hdr_length ==
199 				    IP_SIMPLE_HDR_VERSION);
200 
201 				switch (proto) {
202 				case IPPROTO_TCP:
203 					/* LINTED: improper alignment cast */
204 					up = IPH_TCPH_CHECKSUMP(ipha,
205 					    IP_SIMPLE_HDR_LENGTH);
206 					break;
207 
208 				case IPPROTO_UDP:
209 					/* LINTED: improper alignment cast */
210 					up = IPH_UDPH_CHECKSUMP(ipha,
211 					    IP_SIMPLE_HDR_LENGTH);
212 					break;
213 
214 				default:
215 					cmn_err(CE_WARN, "mac_fix_cksum: "
216 					    "unexpected protocol: %d", proto);
217 					continue;
218 				}
219 
220 				/*
221 				 * Pseudo-header checksum.
222 				 */
223 				src = ipha->ipha_src;
224 				dst = ipha->ipha_dst;
225 				len = ntohs(ipha->ipha_length) -
226 				    IP_SIMPLE_HDR_LENGTH;
227 
228 				cksum = (dst >> 16) + (dst & 0xFFFF) +
229 				    (src >> 16) + (src & 0xFFFF);
230 				cksum += htons(len);
231 
232 				/*
233 				 * The checksum value stored in the packet needs
234 				 * to be correct. Compute it here.
235 				 */
236 				*up = 0;
237 				cksum += (((proto) == IPPROTO_UDP) ?
238 				    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
239 				cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
240 				    offset, cksum);
241 				*(up) = (uint16_t)(cksum ? cksum : ~cksum);
242 
243 				/*
244 				 * Flag the packet so that it appears
245 				 * that the checksum has already been
246 				 * verified by the hardware.
247 				 */
248 				flags &= ~HCK_FULLCKSUM;
249 				flags |= HCK_FULLCKSUM_OK;
250 				value = 0;
251 			}
252 
253 			if (flags & HCK_IPV4_HDRCKSUM) {
254 				ASSERT(ipha != NULL);
255 				ipha->ipha_hdr_checksum =
256 				    (uint16_t)ip_csum_hdr(ipha);
257 				flags &= ~HCK_IPV4_HDRCKSUM;
258 				flags |= HCK_IPV4_HDRCKSUM_OK;
259 
260 			}
261 		}
262 
263 		if (flags & HCK_PARTIALCKSUM) {
264 			uint16_t *up, partial, cksum;
265 			uchar_t *ipp; /* ptr to beginning of IP header */
266 
267 			if (mp->b_cont != NULL) {
268 				mblk_t *mp1;
269 
270 				mp1 = msgpullup(mp, offset + end);
271 				if (mp1 == NULL)
272 					continue;
273 				mp1->b_next = mp->b_next;
274 				mp->b_next = NULL;
275 				freemsg(mp);
276 				if (prev != NULL)
277 					prev->b_next = mp1;
278 				else
279 					new_chain = mp1;
280 				mp = mp1;
281 			}
282 
283 			ipp = mp->b_rptr + offset;
284 			/* LINTED: cast may result in improper alignment */
285 			up = (uint16_t *)((uchar_t *)ipp + stuff);
286 			partial = *up;
287 			*up = 0;
288 
289 			cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
290 			    end - start, partial);
291 			cksum = ~cksum;
292 			*up = cksum ? cksum : ~cksum;
293 
294 			/*
295 			 * Since we already computed the whole checksum,
296 			 * indicate to the stack that it has already
297 			 * been verified by the hardware.
298 			 */
299 			flags &= ~HCK_PARTIALCKSUM;
300 			flags |= HCK_FULLCKSUM_OK;
301 			value = 0;
302 		}
303 
304 		mac_hcksum_set(mp, start, stuff, end, value, flags);
305 	}
306 
307 	return (new_chain);
308 }
309 
310 /*
311  * Add VLAN tag to the specified mblk.
312  */
313 mblk_t *
314 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
315 {
316 	mblk_t *hmp;
317 	struct ether_vlan_header *evhp;
318 	struct ether_header *ehp;
319 
320 	ASSERT(pri != 0 || vid != 0);
321 
322 	/*
323 	 * Allocate an mblk for the new tagged ethernet header,
324 	 * and copy the MAC addresses and ethertype from the
325 	 * original header.
326 	 */
327 
328 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
329 	if (hmp == NULL) {
330 		freemsg(mp);
331 		return (NULL);
332 	}
333 
334 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
335 	ehp = (struct ether_header *)mp->b_rptr;
336 
337 	bcopy(ehp, evhp, (ETHERADDRL * 2));
338 	evhp->ether_type = ehp->ether_type;
339 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
340 
341 	hmp->b_wptr += sizeof (struct ether_vlan_header);
342 	mp->b_rptr += sizeof (struct ether_header);
343 
344 	/*
345 	 * Free the original message if it's now empty. Link the
346 	 * rest of messages to the header message.
347 	 */
348 	mac_hcksum_clone(mp, hmp);
349 	if (MBLKL(mp) == 0) {
350 		hmp->b_cont = mp->b_cont;
351 		freeb(mp);
352 	} else {
353 		hmp->b_cont = mp;
354 	}
355 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
356 
357 	/*
358 	 * Initialize the new TCI (Tag Control Information).
359 	 */
360 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
361 
362 	return (hmp);
363 }
364 
365 /*
366  * Adds a VLAN tag with the specified VID and priority to each mblk of
367  * the specified chain.
368  */
369 mblk_t *
370 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
371 {
372 	mblk_t *next_mp, **prev, *mp;
373 
374 	mp = mp_chain;
375 	prev = &mp_chain;
376 
377 	while (mp != NULL) {
378 		next_mp = mp->b_next;
379 		mp->b_next = NULL;
380 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
381 			freemsgchain(next_mp);
382 			break;
383 		}
384 		*prev = mp;
385 		prev = &mp->b_next;
386 		mp = mp->b_next = next_mp;
387 	}
388 
389 	return (mp_chain);
390 }
391 
392 /*
393  * Strip VLAN tag
394  */
395 mblk_t *
396 mac_strip_vlan_tag(mblk_t *mp)
397 {
398 	mblk_t *newmp;
399 	struct ether_vlan_header *evhp;
400 
401 	evhp = (struct ether_vlan_header *)mp->b_rptr;
402 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
403 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
404 
405 		if (DB_REF(mp) > 1) {
406 			newmp = copymsg(mp);
407 			if (newmp == NULL)
408 				return (NULL);
409 			freemsg(mp);
410 			mp = newmp;
411 		}
412 
413 		evhp = (struct ether_vlan_header *)mp->b_rptr;
414 
415 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
416 		mp->b_rptr += VLAN_TAGSZ;
417 	}
418 	return (mp);
419 }
420 
421 /*
422  * Strip VLAN tag from each mblk of the chain.
423  */
424 mblk_t *
425 mac_strip_vlan_tag_chain(mblk_t *mp_chain)
426 {
427 	mblk_t *mp, *next_mp, **prev;
428 
429 	mp = mp_chain;
430 	prev = &mp_chain;
431 
432 	while (mp != NULL) {
433 		next_mp = mp->b_next;
434 		mp->b_next = NULL;
435 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
436 			freemsgchain(next_mp);
437 			break;
438 		}
439 		*prev = mp;
440 		prev = &mp->b_next;
441 		mp = mp->b_next = next_mp;
442 	}
443 
444 	return (mp_chain);
445 }
446 
447 /*
448  * Default callback function. Used when the datapath is not yet initialized.
449  */
450 /* ARGSUSED */
451 void
452 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
453     boolean_t loopback)
454 {
455 	mblk_t	*mp1 = mp;
456 
457 	while (mp1 != NULL) {
458 		mp1->b_prev = NULL;
459 		mp1->b_queue = NULL;
460 		mp1 = mp1->b_next;
461 	}
462 	freemsgchain(mp);
463 }
464 
/*
 * Determines the IPv6 header length accounting for all the optional IPv6
 * headers (hop-by-hop, destination, routing and fragment). The header length
 * and next header value (a transport header) is captured.
 *
 * ip6h		start of the fixed IPv6 header
 * endptr	first byte past the data that may be examined
 * hdr_length	out: length of the fixed header plus the extension headers
 *		that were walked
 * next_hdr	out: protocol number following the walked headers
 * fragp	optional out: set to the fragment header if one is found,
 *		NULL otherwise; may itself be NULL
 *
 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
 * returns B_TRUE. Note that *hdr_length and *next_hdr are updated as each
 * extension header is walked, so they may have been written even when
 * B_FALSE is returned.
 */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;
	uint_t	ehdrlen;
	uint8_t *whereptr;
	uint8_t *nexthdrp;
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	/* The fixed header itself must fit before endptr. */
	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (B_FALSE);
	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	/*
	 * nexthdrp points at the "next header" field of the header that
	 * was parsed last; whereptr points at the header it describes.
	 */
	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* length field counts 8-byte units beyond the first */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr +  ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen =  8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr +  ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			/* The fragment header has a fixed size. */
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (B_FALSE);
			nexthdrp = &fraghdr->ip6f_nxt;
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			/* Anything else is treated as the end of the walk. */
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (B_TRUE);
		}
		/* Account for the extension header just walked. */
		length += ehdrlen;
		whereptr += ehdrlen;
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	/*
	 * The data ran out before a non-extension header was reached;
	 * decide based on what the last parsed header says comes next.
	 */
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet's malformed (or at least all the IP header(s)
		 * are not in the same mblk) - and that should never happen.
		 */
		return (B_FALSE);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (B_TRUE);
	}
}
560 
/*
 * The following set of routines are there to take care of interrupt
 * re-targeting for legacy (fixed) interrupts. Some older versions
 * of the popular NICs like e1000g do not support MSI-X interrupts
 * and they reserve fixed interrupts for RX/TX rings. To re-target
 * these interrupts, PCITOOL ioctls need to be used.
 *
 * mac_dladm_intr_t carries the state shared by these routines: the two
 * device paths used to find the interrupt, and the interrupt number and
 * CPU it was found on.
 */
typedef struct mac_dladm_intr {
	/* interrupt number, filled in by mac_search_intrinfo() on a match */
	int	ino;
	/* CPU the interrupt was found on, filled in on a match */
	int	cpu_id;
	/* path of the device's minor node, compared against PCITOOL data */
	char	driver_path[MAXPATHLEN];
	/* "/devices<path>:intr" node of the nexus, opened through LDI */
	char	nexus_path[MAXPATHLEN];
} mac_dladm_intr_t;
574 
575 /* Bind the interrupt to cpu_num */
576 static int
577 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
578 {
579 	pcitool_intr_set_t	iset;
580 	int			err;
581 
582 	iset.old_cpu = oldcpuid;
583 	iset.ino = ino;
584 	iset.cpu_id = cpu_num;
585 	iset.user_version = PCITOOL_VERSION;
586 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
587 	    kcred, NULL);
588 
589 	return (err);
590 }
591 
592 /*
593  * Search interrupt information. iget is filled in with the info to search
594  */
595 static boolean_t
596 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
597 {
598 	int	i;
599 	char	driver_path[2 * MAXPATHLEN];
600 
601 	for (i = 0; i < iget_p->num_devs; i++) {
602 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
603 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
604 		    ":%s%d", iget_p->dev[i].driver_name,
605 		    iget_p->dev[i].dev_inst);
606 		/* Match the device path for the device path */
607 		if (strcmp(driver_path, dln->driver_path) == 0) {
608 			dln->ino = iget_p->ino;
609 			dln->cpu_id = iget_p->cpu_id;
610 			return (B_TRUE);
611 		}
612 	}
613 	return (B_FALSE);
614 }
615 
616 /*
617  * Get information about ino, i.e. if this is the interrupt for our
618  * device and where it is bound etc.
619  */
620 static boolean_t
621 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
622     mac_dladm_intr_t *dln)
623 {
624 	pcitool_intr_get_t	*iget_p;
625 	int			ipsz;
626 	int			nipsz;
627 	int			err;
628 	uint8_t			inum;
629 
630 	/*
631 	 * Check if SLEEP is OK, i.e if could come here in response to
632 	 * changing the fanout due to some callback from the driver, say
633 	 * link speed changes.
634 	 */
635 	ipsz = PCITOOL_IGET_SIZE(0);
636 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
637 
638 	iget_p->num_devs_ret = 0;
639 	iget_p->user_version = PCITOOL_VERSION;
640 	iget_p->cpu_id = oldcpuid;
641 	iget_p->ino = ino;
642 
643 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
644 	    FKIOCTL, kcred, NULL);
645 	if (err != 0) {
646 		kmem_free(iget_p, ipsz);
647 		return (B_FALSE);
648 	}
649 	if (iget_p->num_devs == 0) {
650 		kmem_free(iget_p, ipsz);
651 		return (B_FALSE);
652 	}
653 	inum = iget_p->num_devs;
654 	if (iget_p->num_devs_ret < iget_p->num_devs) {
655 		/* Reallocate */
656 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
657 
658 		kmem_free(iget_p, ipsz);
659 		ipsz = nipsz;
660 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
661 
662 		iget_p->num_devs_ret = inum;
663 		iget_p->cpu_id = oldcpuid;
664 		iget_p->ino = ino;
665 		iget_p->user_version = PCITOOL_VERSION;
666 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
667 		    FKIOCTL, kcred, NULL);
668 		if (err != 0) {
669 			kmem_free(iget_p, ipsz);
670 			return (B_FALSE);
671 		}
672 		/* defensive */
673 		if (iget_p->num_devs != iget_p->num_devs_ret) {
674 			kmem_free(iget_p, ipsz);
675 			return (B_FALSE);
676 		}
677 	}
678 
679 	if (mac_search_intrinfo(iget_p, dln)) {
680 		kmem_free(iget_p, ipsz);
681 		return (B_TRUE);
682 	}
683 	kmem_free(iget_p, ipsz);
684 	return (B_FALSE);
685 }
686 
687 /*
688  * Get the interrupts and check each one to see if it is for our device.
689  */
690 static int
691 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
692 {
693 	pcitool_intr_info_t	intr_info;
694 	int			err;
695 	int			ino;
696 	int			oldcpuid;
697 
698 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
699 	    FKIOCTL, kcred, NULL);
700 	if (err != 0)
701 		return (-1);
702 
703 	for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
704 		for (ino = 0; ino < intr_info.num_intr; ino++) {
705 			if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
706 				if (dln->cpu_id == cpuid)
707 					return (0);
708 				return (1);
709 			}
710 		}
711 	}
712 	return (-1);
713 }
714 
715 /*
716  * Obtain the nexus parent node info. for mdip.
717  */
718 static dev_info_t *
719 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
720 {
721 	struct dev_info		*tdip = (struct dev_info *)mdip;
722 	struct ddi_minor_data	*minordata;
723 	int			circ;
724 	dev_info_t		*pdip;
725 	char			pathname[MAXPATHLEN];
726 
727 	while (tdip != NULL) {
728 		/*
729 		 * The netboot code could call this function while walking the
730 		 * device tree so we need to use ndi_devi_tryenter() here to
731 		 * avoid deadlock.
732 		 */
733 		if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
734 			break;
735 
736 		for (minordata = tdip->devi_minor; minordata != NULL;
737 		    minordata = minordata->next) {
738 			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
739 			    strlen(DDI_NT_INTRCTL)) == 0) {
740 				pdip = minordata->dip;
741 				(void) ddi_pathname(pdip, pathname);
742 				(void) snprintf(dln->nexus_path, MAXPATHLEN,
743 				    "/devices%s:intr", pathname);
744 				(void) ddi_pathname_minor(minordata, pathname);
745 				ndi_devi_exit((dev_info_t *)tdip, circ);
746 				return (pdip);
747 			}
748 		}
749 		ndi_devi_exit((dev_info_t *)tdip, circ);
750 		tdip = tdip->devi_parent;
751 	}
752 	return (NULL);
753 }
754 
755 /*
756  * For a primary MAC client, if the user has set a list or CPUs or
757  * we have obtained it implicitly, we try to retarget the interrupt
758  * for that device on one of the CPUs in the list.
759  * We assign the interrupt to the same CPU as the poll thread.
760  */
761 static boolean_t
762 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
763 {
764 	ldi_handle_t		lh = NULL;
765 	ldi_ident_t		li = NULL;
766 	int			err;
767 	int			ret;
768 	mac_dladm_intr_t	dln;
769 	dev_info_t		*dip;
770 	struct ddi_minor_data	*minordata;
771 
772 	dln.nexus_path[0] = '\0';
773 	dln.driver_path[0] = '\0';
774 
775 	minordata = ((struct dev_info *)mdip)->devi_minor;
776 	while (minordata != NULL) {
777 		if (minordata->type == DDM_MINOR)
778 			break;
779 		minordata = minordata->next;
780 	}
781 	if (minordata == NULL)
782 		return (B_FALSE);
783 
784 	(void) ddi_pathname_minor(minordata, dln.driver_path);
785 
786 	dip = mac_get_nexus_node(mdip, &dln);
787 	/* defensive */
788 	if (dip == NULL)
789 		return (B_FALSE);
790 
791 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
792 	if (err != 0)
793 		return (B_FALSE);
794 
795 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
796 	if (err != 0)
797 		return (B_FALSE);
798 
799 	ret = mac_validate_intr(lh, &dln, cpuid);
800 	if (ret < 0) {
801 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
802 		return (B_FALSE);
803 	}
804 	/* cmn_note? */
805 	if (ret != 0)
806 		if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
807 		    != 0) {
808 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
809 			return (B_FALSE);
810 		}
811 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
812 	return (B_TRUE);
813 }
814 
815 void
816 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
817 {
818 	dev_info_t		*mdip = (dev_info_t *)arg;
819 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
820 	mac_resource_props_t	*mrp;
821 	mac_perim_handle_t	mph;
822 	flow_entry_t		*flent = mcip->mci_flent;
823 	mac_soft_ring_set_t	*rx_srs;
824 	mac_cpus_t		*srs_cpu;
825 
826 	if (!mac_check_interrupt_binding(mdip, cpuid))
827 		cpuid = -1;
828 	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
829 	mrp = MCIP_RESOURCE_PROPS(mcip);
830 	mrp->mrp_rx_intr_cpu = cpuid;
831 	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
832 		rx_srs = flent->fe_rx_srs[1];
833 		srs_cpu = &rx_srs->srs_cpu;
834 		srs_cpu->mc_rx_intr_cpu = cpuid;
835 	}
836 	mac_perim_exit(mph);
837 }
838 
839 int32_t
840 mac_client_intr_cpu(mac_client_handle_t mch)
841 {
842 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
843 	mac_cpus_t		*srs_cpu;
844 	mac_soft_ring_set_t	*rx_srs;
845 	flow_entry_t		*flent = mcip->mci_flent;
846 	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
847 	mac_ring_t		*ring;
848 	mac_intr_t		*mintr;
849 
850 	/*
851 	 * Check if we need to retarget the interrupt. We do this only
852 	 * for the primary MAC client. We do this if we have the only
853 	 * exclusive ring in the group.
854 	 */
855 	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
856 		rx_srs = flent->fe_rx_srs[1];
857 		srs_cpu = &rx_srs->srs_cpu;
858 		ring = rx_srs->srs_ring;
859 		mintr = &ring->mr_info.mri_intr;
860 		/*
861 		 * If ddi_handle is present or the poll CPU is
862 		 * already bound to the interrupt CPU, return -1.
863 		 */
864 		if (mintr->mi_ddi_handle != NULL ||
865 		    ((mrp->mrp_ncpus != 0) &&
866 		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
867 			return (-1);
868 		}
869 		return (srs_cpu->mc_rx_pollid);
870 	}
871 	return (-1);
872 }
873 
874 void *
875 mac_get_devinfo(mac_handle_t mh)
876 {
877 	mac_impl_t	*mip = (mac_impl_t *)mh;
878 
879 	return ((void *)mip->mi_dip);
880 }
881 
882 #define	PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
883 #define	PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
884 #define	PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
885 
886 uint64_t
887 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
888 {
889 	struct ether_header *ehp;
890 	uint64_t hash = 0;
891 	uint16_t sap;
892 	uint_t skip_len;
893 	uint8_t proto;
894 	boolean_t ip_fragmented;
895 
896 	/*
897 	 * We may want to have one of these per MAC type plugin in the
898 	 * future. For now supports only ethernet.
899 	 */
900 	if (media != DL_ETHER)
901 		return (0L);
902 
903 	/* for now we support only outbound packets */
904 	ASSERT(is_outbound);
905 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
906 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
907 
908 	/* compute L2 hash */
909 
910 	ehp = (struct ether_header *)mp->b_rptr;
911 
912 	if ((policy & MAC_PKT_HASH_L2) != 0) {
913 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
914 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
915 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
916 		policy &= ~MAC_PKT_HASH_L2;
917 	}
918 
919 	if (policy == 0)
920 		goto done;
921 
922 	/* skip ethernet header */
923 
924 	sap = ntohs(ehp->ether_type);
925 	if (sap == ETHERTYPE_VLAN) {
926 		struct ether_vlan_header *evhp;
927 		mblk_t *newmp = NULL;
928 
929 		skip_len = sizeof (struct ether_vlan_header);
930 		if (MBLKL(mp) < skip_len) {
931 			/* the vlan tag is the payload, pull up first */
932 			newmp = msgpullup(mp, -1);
933 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
934 				goto done;
935 			}
936 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
937 		} else {
938 			evhp = (struct ether_vlan_header *)mp->b_rptr;
939 		}
940 
941 		sap = ntohs(evhp->ether_type);
942 		freemsg(newmp);
943 	} else {
944 		skip_len = sizeof (struct ether_header);
945 	}
946 
947 	/* if ethernet header is in its own mblk, skip it */
948 	if (MBLKL(mp) <= skip_len) {
949 		skip_len -= MBLKL(mp);
950 		mp = mp->b_cont;
951 		if (mp == NULL)
952 			goto done;
953 	}
954 
955 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
956 
957 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
958 
959 	switch (sap) {
960 	case ETHERTYPE_IP: {
961 		ipha_t *iphp;
962 
963 		/*
964 		 * If the header is not aligned or the header doesn't fit
965 		 * in the mblk, bail now. Note that this may cause packets
966 		 * reordering.
967 		 */
968 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
969 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
970 		    !OK_32PTR((char *)iphp))
971 			goto done;
972 
973 		proto = iphp->ipha_protocol;
974 		skip_len += IPH_HDR_LENGTH(iphp);
975 
976 		/* Check if the packet is fragmented. */
977 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
978 		    IPH_OFFSET;
979 
980 		/*
981 		 * For fragmented packets, use addresses in addition to
982 		 * the frag_id to generate the hash inorder to get
983 		 * better distribution.
984 		 */
985 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
986 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
987 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
988 
989 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
990 			    PKT_HASH_4BYTES(ip_dst));
991 			policy &= ~MAC_PKT_HASH_L3;
992 		}
993 
994 		if (ip_fragmented) {
995 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
996 			hash ^= PKT_HASH_2BYTES(identp);
997 			goto done;
998 		}
999 		break;
1000 	}
1001 	case ETHERTYPE_IPV6: {
1002 		ip6_t *ip6hp;
1003 		ip6_frag_t *frag = NULL;
1004 		uint16_t hdr_length;
1005 
1006 		/*
1007 		 * If the header is not aligned or the header doesn't fit
1008 		 * in the mblk, bail now. Note that this may cause packets
1009 		 * reordering.
1010 		 */
1011 
1012 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
1013 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
1014 		    !OK_32PTR((char *)ip6hp))
1015 			goto done;
1016 
1017 		if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
1018 		    &proto, &frag))
1019 			goto done;
1020 		skip_len += hdr_length;
1021 
1022 		/*
1023 		 * For fragmented packets, use addresses in addition to
1024 		 * the frag_id to generate the hash inorder to get
1025 		 * better distribution.
1026 		 */
1027 		if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
1028 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
1029 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
1030 
1031 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
1032 			    PKT_HASH_4BYTES(ip_dst));
1033 			policy &= ~MAC_PKT_HASH_L3;
1034 		}
1035 
1036 		if (frag != NULL) {
1037 			uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
1038 			hash ^= PKT_HASH_4BYTES(identp);
1039 			goto done;
1040 		}
1041 		break;
1042 	}
1043 	default:
1044 		goto done;
1045 	}
1046 
1047 	if (policy == 0)
1048 		goto done;
1049 
1050 	/* if ip header is in its own mblk, skip it */
1051 	if (MBLKL(mp) <= skip_len) {
1052 		skip_len -= MBLKL(mp);
1053 		mp = mp->b_cont;
1054 		if (mp == NULL)
1055 			goto done;
1056 	}
1057 
1058 	/* parse ULP header */
1059 again:
1060 	switch (proto) {
1061 	case IPPROTO_TCP:
1062 	case IPPROTO_UDP:
1063 	case IPPROTO_ESP:
1064 	case IPPROTO_SCTP:
1065 		/*
1066 		 * These Internet Protocols are intentionally designed
1067 		 * for hashing from the git-go.  Port numbers are in the first
1068 		 * word for transports, SPI is first for ESP.
1069 		 */
1070 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
1071 			goto done;
1072 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
1073 		break;
1074 
1075 	case IPPROTO_AH: {
1076 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
1077 		uint_t ah_length = AH_TOTAL_LEN(ah);
1078 
1079 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
1080 			goto done;
1081 
1082 		proto = ah->ah_nexthdr;
1083 		skip_len += ah_length;
1084 
1085 		/* if AH header is in its own mblk, skip it */
1086 		if (MBLKL(mp) <= skip_len) {
1087 			skip_len -= MBLKL(mp);
1088 			mp = mp->b_cont;
1089 			if (mp == NULL)
1090 				goto done;
1091 		}
1092 
1093 		goto again;
1094 	}
1095 	}
1096 
1097 done:
1098 	return (hash);
1099 }
1100