1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2019 Joyent, Inc.
24 */
25
26 /*
27 * MAC Services Module - misc utilities
28 */
29
30 #include <sys/types.h>
31 #include <sys/mac.h>
32 #include <sys/mac_impl.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/mac_client_impl.h>
35 #include <sys/mac_soft_ring.h>
36 #include <sys/strsubr.h>
37 #include <sys/strsun.h>
38 #include <sys/vlan.h>
39 #include <sys/pattr.h>
40 #include <sys/pci_tools.h>
41 #include <inet/ip.h>
42 #include <inet/ip_impl.h>
43 #include <inet/ip6.h>
44 #include <sys/vtrace.h>
45 #include <sys/dlpi.h>
46 #include <sys/sunndi.h>
47 #include <inet/ipsec_impl.h>
48 #include <inet/sadb.h>
49 #include <inet/ipsecesp.h>
50 #include <inet/ipsecah.h>
51 #include <inet/tcp.h>
52 #include <inet/udp_impl.h>
53 #include <inet/sctp_ip.h>
54
55 /*
56 * The next two functions are used for dropping packets or chains of
57 * packets, respectively. We could use one function for both but
58 * separating the use cases allows us to specify intent and prevent
59 * dropping more data than intended.
60 *
61 * The purpose of these functions is to aid the debugging effort,
62 * especially in production. Rather than use freemsg()/freemsgchain(),
63 * it's preferable to use these functions when dropping a packet in
64 * the MAC layer. These functions should only be used during
65 * unexpected conditions. That is, any time a packet is dropped
66 * outside of the regular, successful datapath. Consolidating all
67 * drops on these functions allows the user to trace one location and
68 * determine why the packet was dropped based on the msg. It also
69 * allows the user to inspect the packet before it is freed. Finally,
70 * it allows the user to avoid tracing freemsg()/freemsgchain() thus
71 * keeping the hot path running as efficiently as possible.
72 *
73 * NOTE: At this time not all MAC drops are aggregated on these
74 * functions; but that is the plan. This comment should be erased once
75 * completed.
76 */
77
78 /*PRINTFLIKE2*/
79 void
mac_drop_pkt(mblk_t * mp,const char * fmt,...)80 mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
81 {
82 va_list adx;
83 char msg[128];
84 char *msgp = msg;
85
86 ASSERT3P(mp->b_next, ==, NULL);
87
88 va_start(adx, fmt);
89 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
90 va_end(adx);
91
92 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
93 freemsg(mp);
94 }
95
96 /*PRINTFLIKE2*/
97 void
mac_drop_chain(mblk_t * chain,const char * fmt,...)98 mac_drop_chain(mblk_t *chain, const char *fmt, ...)
99 {
100 va_list adx;
101 char msg[128];
102 char *msgp = msg;
103
104 va_start(adx, fmt);
105 (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
106 va_end(adx);
107
108 /*
109 * We could use freemsgchain() for the actual freeing but
110 * since we are already walking the chain to fire the dtrace
111 * probe we might as well free the msg here too.
112 */
113 for (mblk_t *mp = chain, *next; mp != NULL; ) {
114 next = mp->b_next;
115 DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
116 freemsg(mp);
117 mp = next;
118 }
119 }
120
121 /*
122 * Copy an mblk, preserving its hardware checksum flags.
123 */
124 static mblk_t *
mac_copymsg_cksum(mblk_t * mp)125 mac_copymsg_cksum(mblk_t *mp)
126 {
127 mblk_t *mp1;
128
129 mp1 = copymsg(mp);
130 if (mp1 == NULL)
131 return (NULL);
132
133 mac_hcksum_clone(mp, mp1);
134
135 return (mp1);
136 }
137
138 /*
139 * Copy an mblk chain, presenting the hardware checksum flags of the
140 * individual mblks.
141 */
142 mblk_t *
mac_copymsgchain_cksum(mblk_t * mp)143 mac_copymsgchain_cksum(mblk_t *mp)
144 {
145 mblk_t *nmp = NULL;
146 mblk_t **nmpp = &nmp;
147
148 for (; mp != NULL; mp = mp->b_next) {
149 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
150 freemsgchain(nmp);
151 return (NULL);
152 }
153
154 nmpp = &((*nmpp)->b_next);
155 }
156
157 return (nmp);
158 }
159
/*
 * Calculate the ULP checksum for IPv4. Return true if the calculation
 * was successful, or false if an error occurred. If the latter, place
 * an error message into '*err'.
 */
static boolean_t
mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha,
    const char **err)
{
	const uint8_t proto = ipha->ipha_protocol;
	size_t len;
	const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha);
	/* ULP offset from start of L2. */
	const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz;
	ipaddr_t src, dst;
	uint32_t cksum;
	uint16_t *up;

	/*
	 * We need a pointer to the ULP checksum. We're assuming the
	 * ULP checksum pointer resides in the first mblk. Our native
	 * TCP stack should always put the headers in the first mblk,
	 * but currently we have no way to guarantee that other
	 * clients don't spread headers (or even header fields) across
	 * mblks.
	 */
	switch (proto) {
	case IPPROTO_TCP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
			*err = "mblk doesn't contain TCP header";
			goto bail;
		}

		up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz);
		cksum = IP_TCP_CSUM_COMP;
		break;

	case IPPROTO_UDP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
			*err = "mblk doesn't contain UDP header";
			goto bail;
		}

		up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz);
		cksum = IP_UDP_CSUM_COMP;
		break;

	case IPPROTO_SCTP: {
		sctp_hdr_t *sctph;

		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
			*err = "mblk doesn't contain SCTP header";
			goto bail;
		}

		/*
		 * SCTP's checksum involves no pseudo-header
		 * contribution; zero the field, compute over the
		 * message starting at the SCTP header, and return.
		 */
		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
		return (B_TRUE);
	}

	default:
		*err = "unexpected protocol";
		goto bail;

	}

	/* Pseudo-header checksum. */
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	/* ULP length = total IP datagram length minus the IP header. */
	len = ntohs(ipha->ipha_length) - ip_hdr_sz;

	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
	cksum += htons(len);

	/*
	 * We have already accounted for the pseudo checksum above.
	 * Make sure the ULP checksum field is zero before computing
	 * the rest.
	 */
	*up = 0;
	cksum = IP_CSUM(mp, ulp_offset, cksum);
	/* A computed value of zero is stored as all-ones. */
	*up = (uint16_t)(cksum ? cksum : ~cksum);

	return (B_TRUE);

bail:
	return (B_FALSE);
}
252
/*
 * Calculate the ULP checksum for IPv6. Return true if the calculation
 * was successful, or false if an error occurred. If the latter, place
 * an error message into '*err'.
 */
static boolean_t
mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err)
{
	ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset);
	const uint8_t proto = ip6h->ip6_nxt;
	/* View of the IPv6 header as 16-bit words, for the pseudo-header. */
	const uint16_t *iphs = (uint16_t *)ip6h;
	/* ULP offset from start of L2. */
	uint32_t ulp_offset;
	size_t len;
	uint32_t cksum;
	uint16_t *up;
	uint16_t ip_hdr_sz;

	/*
	 * Walk any extension headers to find the total IPv6 header
	 * length; bail if the header chain is malformed.
	 */
	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) {
		*err = "malformed IPv6 header";
		goto bail;
	}

	ulp_offset = ip_hdr_offset + ip_hdr_sz;

	/*
	 * We need a pointer to the ULP checksum. We're assuming the
	 * ULP checksum pointer resides in the first mblk. Our native
	 * TCP stack should always put the headers in the first mblk,
	 * but currently we have no way to guarantee that other
	 * clients don't spread headers (or even header fields) across
	 * mblks.
	 */
	switch (proto) {
	case IPPROTO_TCP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
			*err = "mblk doesn't contain TCP header";
			goto bail;
		}

		up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz);
		cksum = IP_TCP_CSUM_COMP;
		break;

	case IPPROTO_UDP:
		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
			*err = "mblk doesn't contain UDP header";
			goto bail;
		}

		up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz);
		cksum = IP_UDP_CSUM_COMP;
		break;

	case IPPROTO_SCTP: {
		sctp_hdr_t *sctph;

		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
			*err = "mblk doesn't contain SCTP header";
			goto bail;
		}

		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
		/*
		 * Zero out the checksum field to ensure proper
		 * checksum calculation.
		 */
		sctph->sh_chksum = 0;
		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
		return (B_TRUE);
	}

	default:
		*err = "unexpected protocol";
		goto bail;
	}

	/*
	 * The payload length includes the payload and the IPv6
	 * extension headers; the idea is to subtract the extension
	 * header length to get the real payload length.
	 */
	len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN);
	cksum += len;

	/*
	 * We accumulate the pseudo header checksum in cksum; then we
	 * call IP_CSUM to compute the checksum over the payload.
	 * Words 4-19 of the header are the source and destination
	 * addresses.
	 */
	cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] +
	    iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] +
	    iphs[16] + iphs[17] + iphs[18] + iphs[19];
	cksum = IP_CSUM(mp, ulp_offset, cksum);

	/* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */
	if (proto == IPPROTO_UDP && cksum == 0)
		cksum = ~cksum;

	*up = (uint16_t)cksum;

	return (B_TRUE);

bail:
	return (B_FALSE);
}
361
/*
 * Perform software checksum on a single message, if needed. The
 * emulation performed is determined by an intersection of the mblk's
 * flags and the emul flags requested. The emul flags are documented
 * in mac.h.
 *
 * Returns the (possibly replaced) mblk on success, or NULL after
 * dropping the packet on failure.
 */
static mblk_t *
mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
{
	mblk_t *skipped_hdr = NULL;
	uint32_t flags, start, stuff, end, value;
	uint32_t ip_hdr_offset;
	uint16_t etype;
	size_t ip_hdr_sz;
	struct ether_header *ehp;
	const char *err = "";

	/*
	 * This function should only be called from mac_hw_emul()
	 * which handles mblk chains and the shared ref case.
	 */
	ASSERT3P(mp->b_next, ==, NULL);

	mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);

	flags = DB_CKSUMFLAGS(mp);

	/* Why call this if checksum emulation isn't needed? */
	ASSERT3U(flags & (HCK_FLAGS), !=, 0);

	/*
	 * Ethernet, and optionally VLAN header. mac_hw_emul() has
	 * already verified we have enough data to read the L2 header.
	 */
	ehp = (struct ether_header *)mp->b_rptr;
	if (ntohs(ehp->ether_type) == VLAN_TPID) {
		struct ether_vlan_header *evhp;

		evhp = (struct ether_vlan_header *)mp->b_rptr;
		etype = ntohs(evhp->ether_type);
		ip_hdr_offset = sizeof (struct ether_vlan_header);
	} else {
		etype = ntohs(ehp->ether_type);
		ip_hdr_offset = sizeof (struct ether_header);
	}

	/*
	 * If this packet isn't IP, then leave it alone. We don't want
	 * to affect non-IP traffic like ARP. Assume the IP header
	 * doesn't include any options, for now. We will use the
	 * correct size later after we know there are enough bytes to
	 * at least fill out the basic header.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
		ip_hdr_sz = sizeof (ipha_t);
		break;
	case ETHERTYPE_IPV6:
		ip_hdr_sz = sizeof (ip6_t);
		break;
	default:
		return (mp);
	}

	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset);

	/*
	 * If the first mblk of this packet contains only the ethernet
	 * header, skip past it for now. Packets with their data
	 * contained in only a single mblk can then use the fastpaths
	 * tuned to that possibility.
	 */
	if (MBLKL(mp) == ip_hdr_offset) {
		/* The L3 header now starts at offset 0 of the next mblk. */
		ip_hdr_offset -= MBLKL(mp);
		/* This is guaranteed by mac_hw_emul(). */
		ASSERT3P(mp->b_cont, !=, NULL);
		skipped_hdr = mp;
		mp = mp->b_cont;
	}

	/*
	 * Both full and partial checksum rely on finding the IP
	 * header in the current mblk. Our native TCP stack honors
	 * this assumption but it's prudent to guard our future
	 * clients that might not honor this contract.
	 */
	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz);
	if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) {
		err = "mblk doesn't contain IP header";
		goto bail;
	}

	/*
	 * We are about to modify the header mblk; make sure we are
	 * modifying our own copy. The code that follows assumes that
	 * the IP/ULP headers exist in this mblk (and drops the
	 * message if they don't).
	 */
	if (DB_REF(mp) > 1) {
		mblk_t *tmp = copyb(mp);

		if (tmp == NULL) {
			err = "copyb failed";
			goto bail;
		}

		/* Splice the private copy in behind the skipped L2 mblk. */
		if (skipped_hdr != NULL) {
			ASSERT3P(skipped_hdr->b_cont, ==, mp);
			skipped_hdr->b_cont = tmp;
		}

		tmp->b_cont = mp->b_cont;
		freeb(mp);
		mp = tmp;
	}

	if (etype == ETHERTYPE_IP) {
		ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset);

		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
			if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err))
				goto bail;
		}

		/* We always update the ULP checksum flags. */
		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
			flags &= ~HCK_FULLCKSUM;
			flags |= HCK_FULLCKSUM_OK;
			value = 0;
		}

		/*
		 * While unlikely, it's possible to write code that
		 * might end up calling mac_sw_cksum() twice on the
		 * same mblk (performing both LSO and checksum
		 * emulation in a single mblk chain loop -- the LSO
		 * emulation inserts a new chain into the existing
		 * chain and then the loop iterates back over the new
		 * segments and emulates the checksum a second time).
		 * Normally this wouldn't be a problem, because the
		 * HCK_*_OK flags are supposed to indicate that we
		 * don't need to perform the work. But
		 * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
		 * same value; so we cannot use these flags to
		 * determine if the IP header checksum has already
		 * been calculated or not. For this reason, we zero
		 * out the checksum first. In the future, we
		 * should fix the HCK_* flags.
		 */
		if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
			ipha->ipha_hdr_checksum = 0;
			ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
			flags &= ~HCK_IPV4_HDRCKSUM;
			flags |= HCK_IPV4_HDRCKSUM_OK;
		}
	} else if (etype == ETHERTYPE_IPV6) {
		/* There is no IP header checksum for IPv6. */
		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
			if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err))
				goto bail;
			flags &= ~HCK_FULLCKSUM;
			flags |= HCK_FULLCKSUM_OK;
			value = 0;
		}
	}

	/*
	 * Partial checksum is the same for both IPv4 and IPv6.
	 */
	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
		uint16_t *up, partial, cksum;
		uchar_t *ipp; /* ptr to beginning of IP header */

		ipp = mp->b_rptr + ip_hdr_offset;
		/* 'stuff' is the client-supplied offset of the cksum field. */
		up = (uint16_t *)((uchar_t *)ipp + stuff);
		partial = *up;
		*up = 0;

		ASSERT3S(end, >, start);
		cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial);
		/* A computed value of zero is stored as all-ones. */
		*up = cksum != 0 ? cksum : ~cksum;
	}

	/* We always update the ULP checksum flags. */
	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
		flags &= ~HCK_PARTIALCKSUM;
		flags |= HCK_FULLCKSUM_OK;
		value = 0;
	}

	mac_hcksum_set(mp, start, stuff, end, value, flags);

	/* Don't forget to reattach the header. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);

		/*
		 * Duplicate the HCKSUM data into the header mblk.
		 * This mimics mac_add_vlan_tag which ensures that
		 * both the first mblk _and_ the first data bearing
		 * mblk possess the HCKSUM information. Consumers like
		 * IP will end up discarding the ether_header mblk, so
		 * for now, it is important that the data be available
		 * in both places.
		 */
		mac_hcksum_clone(mp, skipped_hdr);
		mp = skipped_hdr;
	}

	return (mp);

bail:
	/* Re-prepend the skipped L2 mblk so the whole packet is dropped. */
	if (skipped_hdr != NULL) {
		ASSERT3P(skipped_hdr->b_cont, ==, mp);
		mp = skipped_hdr;
	}

	mac_drop_pkt(mp, err);
	return (NULL);
}
582
583 /*
584 * Build a single data segment from an LSO packet. The mblk chain
585 * returned, seg_head, represents the data segment and is always
586 * exactly seg_len bytes long. The lso_mp and offset input/output
587 * parameters track our position in the LSO packet. This function
588 * exists solely as a helper to mac_sw_lso().
589 *
590 * Case A
591 *
592 * The current lso_mp is larger than the requested seg_len. The
593 * beginning of seg_head may start at the beginning of lso_mp or
594 * offset into it. In either case, a single mblk is returned, and
595 * *offset is updated to reflect our new position in the current
596 * lso_mp.
597 *
598 * +----------------------------+
599 * | in *lso_mp / out *lso_mp |
600 * +----------------------------+
601 * ^ ^
602 * | |
603 * | |
604 * | |
605 * +------------------------+
606 * | seg_head |
607 * +------------------------+
608 * ^ ^
609 * | |
610 * in *offset = 0 out *offset = seg_len
611 *
612 * |------ seg_len ----|
613 *
614 *
615 * +------------------------------+
616 * | in *lso_mp / out *lso_mp |
617 * +------------------------------+
618 * ^ ^
619 * | |
620 * | |
621 * | |
622 * +------------------------+
623 * | seg_head |
624 * +------------------------+
625 * ^ ^
626 * | |
627 * in *offset = N out *offset = N + seg_len
628 *
629 * |------ seg_len ----|
630 *
631 *
632 *
633 * Case B
634 *
635 * The requested seg_len consumes exactly the rest of the lso_mp.
636 * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
637 * The seg_head may start at the beginning of the lso_mp or at some
638 * offset into it. In either case we return a single mblk, reset
639 * *offset to zero, and walk to the next lso_mp.
640 *
641 * +------------------------+ +------------------------+
642 * | in *lso_mp |---------->| out *lso_mp |
643 * +------------------------+ +------------------------+
644 * ^ ^ ^
645 * | | |
646 * | | out *offset = 0
647 * | |
648 * +------------------------+
649 * | seg_head |
650 * +------------------------+
651 * ^
652 * |
653 * in *offset = 0
654 *
655 * |------ seg_len ----|
656 *
657 *
658 *
659 * +----------------------------+ +------------------------+
660 * | in *lso_mp |---------->| out *lso_mp |
661 * +----------------------------+ +------------------------+
662 * ^ ^ ^
663 * | | |
664 * | | out *offset = 0
665 * | |
666 * +------------------------+
667 * | seg_head |
668 * +------------------------+
669 * ^
670 * |
671 * in *offset = N
672 *
673 * |------ seg_len ----|
674 *
675 *
676 * Case C
677 *
678 * The requested seg_len is greater than the current lso_mp. In
679 * this case we must consume LSO mblks until we have enough data to
680 * satisfy either case (A) or (B) above. We will return multiple
681 * mblks linked via b_cont, offset will be set based on the cases
682 * above, and lso_mp will walk forward at least one mblk, but maybe
683 * more.
684 *
 * N.B. This diagram is not exhaustive. The seg_head may start on
686 * the beginning of an lso_mp. The seg_tail may end exactly on the
687 * boundary of an lso_mp. And there may be two (in this case the
688 * middle block wouldn't exist), three, or more mblks in the
689 * seg_head chain. This is meant as one example of what might
690 * happen. The main thing to remember is that the seg_tail mblk
691 * must be one of case (A) or (B) above.
692 *
693 * +------------------+ +----------------+ +------------------+
694 * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp |
695 * +------------------+ +----------------+ +------------------+
696 * ^ ^ ^ ^ ^ ^
697 * | | | | | |
698 * | | | | | |
699 * | | | | | |
700 * | | | | | |
701 * +------------+ +----------------+ +------------+
702 * | seg_head |--->| |--->| seg_tail |
703 * +------------+ +----------------+ +------------+
704 * ^ ^
705 * | |
706 * in *offset = N out *offset = MBLKL(seg_tail)
707 *
708 * |------------------- seg_len -------------------|
709 *
710 */
static mblk_t *
build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
{
	mblk_t *seg_head, *seg_tail, *seg_mp;

	ASSERT3P(*lso_mp, !=, NULL);
	/* The starting offset must lie strictly inside the current mblk. */
	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);

	/*
	 * Reference the LSO data rather than copying it: dupb()
	 * shares the underlying data block.
	 */
	seg_mp = dupb(*lso_mp);
	if (seg_mp == NULL)
		return (NULL);

	seg_head = seg_mp;
	seg_tail = seg_mp;

	/* Continue where we left off from in the lso_mp. */
	seg_mp->b_rptr += *offset;

last_mblk:
	/* Case (A) */
	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
		*offset += seg_len;
		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
		return (seg_head);
	}

	/* Case (B) */
	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		return (seg_head);
	}

	/* Case (C) */
	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);

	/*
	 * The current LSO mblk doesn't have enough data to satisfy
	 * seg_len -- continue peeling off LSO mblks to build the new
	 * segment message. If allocation fails we free the previously
	 * allocated segment mblks and return NULL.
	 */
	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
		seg_len -= MBLKL(seg_mp);
		*offset = 0;
		*lso_mp = (*lso_mp)->b_cont;
		seg_mp = dupb(*lso_mp);

		if (seg_mp == NULL) {
			freemsgchain(seg_head);
			return (NULL);
		}

		seg_tail->b_cont = seg_mp;
		seg_tail = seg_mp;
	}

	/*
	 * We've walked enough LSO mblks that we can now satisfy the
	 * remaining seg_len. At this point we need to jump back to
	 * determine if we have arrived at case (A) or (B).
	 */

	/* Just to be paranoid that we didn't underflow. */
	ASSERT3U(seg_len, <, IP_MAXPACKET);
	ASSERT3U(seg_len, >, 0);
	goto last_mblk;
}
780
781 /*
782 * Perform software segmentation of a single LSO message. Take an LSO
783 * message as input and return head/tail pointers as output. This
784 * function should not be invoked directly but instead through
785 * mac_hw_emul().
786 *
787 * The resulting chain is comprised of multiple (nsegs) MSS sized
788 * segments. Each segment will consist of two or more mblks joined by
789 * b_cont: a header and one or more data mblks. The header mblk is
790 * allocated anew for each message. The first segment's header is used
791 * as a template for the rest with adjustments made for things such as
792 * ID, sequence, length, TCP flags, etc. The data mblks reference into
793 * the existing LSO mblk (passed in as omp) by way of dupb(). Their
794 * b_rptr/b_wptr values are adjusted to reference only the fraction of
795 * the LSO message they are responsible for. At the successful
796 * completion of this function the original mblk (omp) is freed,
 * leaving the newly created segment chain as the only remaining
798 * reference to the data.
799 */
800 static void
mac_sw_lso(mblk_t * omp,mac_emul_t emul,mblk_t ** head,mblk_t ** tail,uint_t * count)801 mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
802 uint_t *count)
803 {
804 uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
805 uint32_t mss;
806 uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
807 uint32_t oleft;
808 uint_t nsegs, seg;
809 int len;
810
811 struct ether_vlan_header *oevh;
812 const ipha_t *oiph;
813 const tcph_t *otcph;
814 ipha_t *niph;
815 tcph_t *ntcph;
816 uint16_t ip_id;
817 uint32_t tcp_seq, tcp_sum, otcp_sum;
818
819 uint32_t offset;
820 mblk_t *odatamp;
821 mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
822 mblk_t *tmptail;
823
824 ASSERT3P(head, !=, NULL);
825 ASSERT3P(tail, !=, NULL);
826 ASSERT3P(count, !=, NULL);
827 ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
828
829 /* Assume we are dealing with a single LSO message. */
830 ASSERT3P(omp->b_next, ==, NULL);
831
832 /*
833 * XXX: This is a hack to deal with mac_add_vlan_tag().
834 *
835 * When VLANs are in play, mac_add_vlan_tag() creates a new
836 * mblk with just the ether_vlan_header and tacks it onto the
837 * front of 'omp'. This breaks the assumptions made below;
838 * namely that the TCP/IP headers are in the first mblk. In
839 * this case, since we already have to pay the cost of LSO
840 * emulation, we simply pull up everything. While this might
841 * seem irksome, keep in mind this will only apply in a couple
842 * of scenarios: a) an LSO-capable VLAN client sending to a
843 * non-LSO-capable client over the "MAC/bridge loopback"
844 * datapath or b) an LSO-capable VLAN client is sending to a
845 * client that, for whatever reason, doesn't have DLS-bypass
846 * enabled. Finally, we have to check for both a tagged and
847 * untagged sized mblk depending on if the mblk came via
848 * mac_promisc_dispatch() or mac_rx_deliver().
849 *
850 * In the future, two things should be done:
851 *
852 * 1. This function should make use of some yet to be
853 * implemented "mblk helpers". These helper functions would
854 * perform all the b_cont walking for us and guarantee safe
855 * access to the mblk data.
856 *
857 * 2. We should add some slop to the mblks so that
858 * mac_add_vlan_tag() can just edit the first mblk instead
859 * of allocating on the hot path.
860 */
861 if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
862 MBLKL(omp) == sizeof (struct ether_header)) {
863 mblk_t *tmp = msgpullup(omp, -1);
864
865 if (tmp == NULL) {
866 mac_drop_pkt(omp, "failed to pull up");
867 goto fail;
868 }
869
870 mac_hcksum_clone(omp, tmp);
871 freemsg(omp);
872 omp = tmp;
873 }
874
875 mss = DB_LSOMSS(omp);
876 ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
877 sizeof (struct ether_vlan_header));
878 opktlen = msgsize(omp);
879
880 /*
881 * First, get references to the IP and TCP headers and
882 * determine the total TCP length (header + data).
883 *
884 * Thanks to mac_hw_emul() we know that the first mblk must
885 * contain (at minimum) the full L2 header. However, this
886 * function assumes more than that. It assumes the L2/L3/L4
887 * headers are all contained in the first mblk of a message
888 * (i.e., no b_cont walking for headers). While this is a
889 * current reality (our native TCP stack and viona both
890 * enforce this) things may become more nuanced in the future
891 * (e.g. when introducing encap support or adding new
892 * clients). For now we guard against this case by dropping
893 * the packet.
894 */
895 oevh = (struct ether_vlan_header *)omp->b_rptr;
896 if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
897 oehlen = sizeof (struct ether_vlan_header);
898 else
899 oehlen = sizeof (struct ether_header);
900
901 ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
902 if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
903 mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
904 goto fail;
905 }
906
907 oiph = (ipha_t *)(omp->b_rptr + oehlen);
908 oiphlen = IPH_HDR_LENGTH(oiph);
909 otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
910 otcphlen = TCP_HDR_LENGTH(otcph);
911
912 /*
913 * Currently we only support LSO for TCP/IPv4.
914 */
915 if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
916 mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
917 IPH_HDR_VERSION(oiph));
918 goto fail;
919 }
920
921 if (oiph->ipha_protocol != IPPROTO_TCP) {
922 mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
923 oiph->ipha_protocol);
924 goto fail;
925 }
926
927 if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
928 mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
929 goto fail;
930 }
931
932 ohdrslen = oehlen + oiphlen + otcphlen;
933 if ((len = MBLKL(omp)) < ohdrslen) {
934 mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
935 ohdrslen);
936 goto fail;
937 }
938
939 /*
940 * Either we have data in the first mblk or it's just the
941 * header. In either case, we need to set rptr to the start of
942 * the TCP data.
943 */
944 if (len > ohdrslen) {
945 odatamp = omp;
946 offset = ohdrslen;
947 } else {
948 ASSERT3U(len, ==, ohdrslen);
949 odatamp = omp->b_cont;
950 offset = 0;
951 }
952
953 /* Make sure we still have enough data. */
954 ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
955
956 /*
	 * If a MAC negotiated LSO then it must negotiate both
958 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
959 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
960 * change during LSO segmentation (only the 3 fields of the
961 * pseudo header checksum don't change: src, dst, proto). Thus
962 * we would expect these flags (HCK_IPV4_HDRCKSUM |
963 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
964 * function to emulate those checksums in software. However,
965 * that assumes a world where we only expose LSO if the
966 * underlying hardware exposes LSO. Moving forward the plan is
967 * to assume LSO in the upper layers and have MAC perform
968 * software LSO when the underlying provider doesn't support
969 * it. In such a world, if the provider doesn't support LSO
970 * but does support hardware checksum offload, then we could
971 * simply perform the segmentation and allow the hardware to
972 * calculate the checksums. To the hardware it's just another
973 * chain of non-LSO packets.
974 */
975 ASSERT3S(DB_TYPE(omp), ==, M_DATA);
976 ocsum_flags = DB_CKSUMFLAGS(omp);
977 ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
978 ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
979
980 /*
981 * If hardware only provides partial checksum then software
982 * must supply the pseudo-header checksum. In the case of LSO
983 * we leave the TCP length at zero to be filled in by
984 * hardware. This function must handle two scenarios.
985 *
986 * 1. Being called by a MAC client on the Rx path to segment
987 * an LSO packet and calculate the checksum.
988 *
989 * 2. Being called by a MAC provider to segment an LSO packet.
990 * In this case the LSO segmentation is performed in
991 * software (by this routine) but the MAC provider should
992 * still calculate the TCP/IP checksums in hardware.
993 *
994 * To elaborate on the second case: we cannot have the
995 * scenario where IP sends LSO packets but the underlying HW
996 * doesn't support checksum offload -- because in that case
997 * TCP/IP would calculate the checksum in software (for the
998 * LSO packet) but then MAC would segment the packet and have
999 * to redo all the checksum work. So IP should never do LSO
1000 * if HW doesn't support both IP and TCP checksum.
1001 */
1002 if (ocsum_flags & HCK_PARTIALCKSUM) {
1003 ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
1004 ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
1005 }
1006
1007 odatalen = opktlen - ohdrslen;
1008
1009 /*
1010 * Subtract one to account for the case where the data length
1011 * is evenly divisble by the MSS. Add one to account for the
1012 * fact that the division will always result in one less
1013 * segment than needed.
1014 */
1015 nsegs = ((odatalen - 1) / mss) + 1;
1016 if (nsegs < 2) {
1017 mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
1018 goto fail;
1019 }
1020
1021 DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
1022 __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
1023 nsegs);
1024
1025 seg_chain = NULL;
1026 tmptail = seg_chain;
1027 oleft = odatalen;
1028
1029 for (uint_t i = 0; i < nsegs; i++) {
1030 boolean_t last_seg = ((i + 1) == nsegs);
1031 uint32_t seg_len;
1032
1033 /*
1034 * If we fail to allocate, then drop the partially
1035 * allocated chain as well as the LSO packet. Let the
1036 * sender deal with the fallout.
1037 */
1038 if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
1039 freemsgchain(seg_chain);
1040 mac_drop_pkt(omp, "failed to alloc segment header");
1041 goto fail;
1042 }
1043 ASSERT3P(nhdrmp->b_cont, ==, NULL);
1044
1045 if (seg_chain == NULL) {
1046 seg_chain = nhdrmp;
1047 } else {
1048 ASSERT3P(tmptail, !=, NULL);
1049 tmptail->b_next = nhdrmp;
1050 }
1051
1052 tmptail = nhdrmp;
1053
1054 /*
1055 * Calculate this segment's lengh. It's either the MSS
1056 * or whatever remains for the last segment.
1057 */
1058 seg_len = last_seg ? oleft : mss;
1059 ASSERT3U(seg_len, <=, mss);
1060 ndatamp = build_data_seg(&odatamp, &offset, seg_len);
1061
1062 if (ndatamp == NULL) {
1063 freemsgchain(seg_chain);
1064 mac_drop_pkt(omp, "LSO failed to segment data");
1065 goto fail;
1066 }
1067
1068 /* Attach data mblk to header mblk. */
1069 nhdrmp->b_cont = ndatamp;
1070 DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
1071 ASSERT3U(seg_len, <=, oleft);
1072 oleft -= seg_len;
1073 }
1074
1075 /* We should have consumed entire LSO msg. */
1076 ASSERT3S(oleft, ==, 0);
1077 ASSERT3P(odatamp, ==, NULL);
1078
1079 /*
1080 * All seg data mblks are referenced by the header mblks, null
1081 * out this pointer to catch any bad derefs.
1082 */
1083 ndatamp = NULL;
1084
1085 /*
1086 * Set headers and checksum for first segment.
1087 */
1088 nhdrmp = seg_chain;
1089 bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
1090 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1091 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1092 ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
1093 niph->ipha_length = htons(oiphlen + otcphlen + mss);
1094 niph->ipha_hdr_checksum = 0;
1095 ip_id = ntohs(niph->ipha_ident);
1096 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1097 tcp_seq = BE32_TO_U32(ntcph->th_seq);
1098 tcp_seq += mss;
1099
1100 /*
1101 * The first segment shouldn't:
1102 *
1103 * o indicate end of data transmission (FIN),
1104 * o indicate immediate handling of the data (PUSH).
1105 */
1106 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1107 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1108
1109 /*
1110 * If the underlying HW provides partial checksum, then make
1111 * sure to correct the pseudo header checksum before calling
1112 * mac_sw_cksum(). The native TCP stack doesn't include the
1113 * length field in the pseudo header when LSO is in play -- so
1114 * we need to calculate it here.
1115 */
1116 if (ocsum_flags & HCK_PARTIALCKSUM) {
1117 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1118 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1119 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1120 tcp_sum = BE16_TO_U16(ntcph->th_sum);
1121 otcp_sum = tcp_sum;
1122 tcp_sum += mss + otcphlen;
1123 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1124 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1125 }
1126
1127 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1128 (emul & MAC_HWCKSUM_EMULS)) {
1129 next_nhdrmp = nhdrmp->b_next;
1130 nhdrmp->b_next = NULL;
1131 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1132 nhdrmp->b_next = next_nhdrmp;
1133 next_nhdrmp = NULL;
1134
1135 /*
1136 * We may have freed the nhdrmp argument during
1137 * checksum emulation, make sure that seg_chain
1138 * references a valid mblk.
1139 */
1140 seg_chain = nhdrmp;
1141 }
1142
1143 ASSERT3P(nhdrmp, !=, NULL);
1144
1145 seg = 1;
1146 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1147 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1148 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
1149 uint_t, seg);
1150 seg++;
1151
1152 /* There better be at least 2 segs. */
1153 ASSERT3P(nhdrmp->b_next, !=, NULL);
1154 prev_nhdrmp = nhdrmp;
1155 nhdrmp = nhdrmp->b_next;
1156
1157 /*
1158 * Now adjust the headers of the middle segments. For each
1159 * header we need to adjust the following.
1160 *
1161 * o IP ID
1162 * o IP length
1163 * o TCP sequence
1164 * o TCP flags
1165 * o cksum flags
1166 * o cksum values (if MAC_HWCKSUM_EMUL is set)
1167 */
1168 for (; seg < nsegs; seg++) {
1169 /*
1170 * We use seg_chain as a reference to the first seg
1171 * header mblk -- this first header is a template for
1172 * the rest of the segments. This copy will include
1173 * the now updated checksum values from the first
1174 * header. We must reset these checksum values to
1175 * their original to make sure we produce the correct
1176 * value.
1177 */
1178 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1179 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1180 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1181 niph->ipha_ident = htons(++ip_id);
1182 ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1183 niph->ipha_length = htons(oiphlen + otcphlen + mss);
1184 niph->ipha_hdr_checksum = 0;
1185 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1186 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1187 tcp_seq += mss;
1188 /*
1189 * Just like the first segment, the middle segments
1190 * shouldn't have these flags set.
1191 */
1192 ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1193 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1194
1195 if (ocsum_flags & HCK_PARTIALCKSUM) {
1196 /*
1197 * First and middle segs have same
1198 * pseudo-header checksum.
1199 */
1200 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1201 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1202 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1203 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1204 }
1205
1206 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1207 (emul & MAC_HWCKSUM_EMULS)) {
1208 next_nhdrmp = nhdrmp->b_next;
1209 nhdrmp->b_next = NULL;
1210 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1211 nhdrmp->b_next = next_nhdrmp;
1212 next_nhdrmp = NULL;
1213 /* We may have freed the original nhdrmp. */
1214 prev_nhdrmp->b_next = nhdrmp;
1215 }
1216
1217 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1218 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1219 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
1220 uint_t, mss, uint_t, seg);
1221
1222 ASSERT3P(nhdrmp->b_next, !=, NULL);
1223 prev_nhdrmp = nhdrmp;
1224 nhdrmp = nhdrmp->b_next;
1225 }
1226
1227 /* Make sure we are on the last segment. */
1228 ASSERT3U(seg, ==, nsegs);
1229 ASSERT3P(nhdrmp->b_next, ==, NULL);
1230
1231 /*
1232 * Now we set the last segment header. The difference being
1233 * that FIN/PSH/RST flags are allowed.
1234 */
1235 bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1236 nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1237 niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1238 niph->ipha_ident = htons(++ip_id);
1239 len = msgsize(nhdrmp->b_cont);
1240 ASSERT3S(len, >, 0);
1241 niph->ipha_length = htons(oiphlen + otcphlen + len);
1242 niph->ipha_hdr_checksum = 0;
1243 ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1244 U32_TO_BE32(tcp_seq, ntcph->th_seq);
1245
1246 DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1247 if (ocsum_flags & HCK_PARTIALCKSUM) {
1248 DB_CKSUMSTART(nhdrmp) = ocsum_start;
1249 DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1250 DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1251 tcp_sum = otcp_sum;
1252 tcp_sum += len + otcphlen;
1253 tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1254 U16_TO_BE16(tcp_sum, ntcph->th_sum);
1255 }
1256
1257 if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1258 (emul & MAC_HWCKSUM_EMULS)) {
1259 /* This should be the last mblk. */
1260 ASSERT3P(nhdrmp->b_next, ==, NULL);
1261 nhdrmp = mac_sw_cksum(nhdrmp, emul);
1262 prev_nhdrmp->b_next = nhdrmp;
1263 }
1264
1265 DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1266 (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1267 (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
1268 uint_t, seg);
1269
1270 /*
1271 * Free the reference to the original LSO message as it is
1272 * being replaced by seg_cahin.
1273 */
1274 freemsg(omp);
1275 *head = seg_chain;
1276 *tail = nhdrmp;
1277 *count = nsegs;
1278 return;
1279
1280 fail:
1281 *head = NULL;
1282 *tail = NULL;
1283 *count = 0;
1284 }
1285
1286 #define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1287
1288 /*
1289 * Emulate various hardware offload features in software. Take a chain
1290 * of packets as input and emulate the hardware features specified in
1291 * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1292 * pointer given as input, and its tail pointer is written to
1293 * '*otail'. The number of packets in the new chain is written to
1294 * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1295 * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1296 * which case 'mp_chain' will simply stay a NULL chain.
1297 *
1298 * While unlikely, it is technically possible that this function could
1299 * receive a non-NULL chain as input and return a NULL chain as output
1300 * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1301 * zero). This could happen if all the packets in the chain are
1302 * dropped or if we fail to allocate new mblks. In this case, there is
1303 * nothing for the caller to free. In any event, the caller shouldn't
1304 * assume that '*mp_chain' is non-NULL on return.
1305 *
1306 * This function was written with three main use cases in mind.
1307 *
1308 * 1. To emulate hardware offloads when traveling mac-loopback (two
1309 * clients on the same mac). This is wired up in mac_tx_send().
1310 *
1311 * 2. To provide hardware offloads to the client when the underlying
1312 * provider cannot. This is currently wired up in mac_tx() but we
1313 * still only negotiate offloads when the underlying provider
1314 * supports them.
1315 *
1316 * 3. To emulate real hardware in simnet.
1317 */
void
mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
{
	mblk_t *head = NULL, *tail = NULL;
	uint_t count = 0;

	/* Only checksum and/or LSO emulation may be requested. */
	ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
	ASSERT3P(mp_chain, !=, NULL);

	for (mblk_t *mp = *mp_chain; mp != NULL; ) {
		mblk_t *tmp, *next, *tmphead, *tmptail;
		struct ether_header *ehp;
		uint32_t flags;
		uint_t len = MBLKL(mp), l2len;

		/* Perform LSO/cksum one message at a time. */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * For our sanity the first mblk should contain at
		 * least the full L2 header.
		 */
		if (len < sizeof (struct ether_header)) {
			mac_drop_pkt(mp, "packet too short (A): %u", len);
			mp = next;
			continue;
		}

		/* Account for the VLAN tag when sizing the L2 header. */
		ehp = (struct ether_header *)mp->b_rptr;
		if (ntohs(ehp->ether_type) == VLAN_TPID)
			l2len = sizeof (struct ether_vlan_header);
		else
			l2len = sizeof (struct ether_header);

		/*
		 * If the first mblk is solely the L2 header, then
		 * there better be more data.
		 */
		if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
			mac_drop_pkt(mp, "packet too short (C): %u", len);
			mp = next;
			continue;
		}

		DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);

		/*
		 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
		 * because we don't want to mask-out the LSO flag.
		 */
		flags = DB_CKSUMFLAGS(mp);

		if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
			uint_t tmpcount = 0;

			/*
			 * LSO fix-up handles checksum emulation
			 * inline (if requested). It also frees mp.
			 */
			mac_sw_lso(mp, emul, &tmphead, &tmptail,
			    &tmpcount);
			if (tmphead == NULL) {
				/* mac_sw_lso() freed the mp. */
				mp = next;
				continue;
			}
			count += tmpcount;
		} else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
			tmp = mac_sw_cksum(mp, emul);
			if (tmp == NULL) {
				/* mac_sw_cksum() freed the mp. */
				mp = next;
				continue;
			}
			tmphead = tmp;
			tmptail = tmp;
			count++;
		} else {
			/* There is nothing to emulate. */
			tmp = mp;
			tmphead = tmp;
			tmptail = tmp;
			count++;
		}

		/*
		 * The tmp mblk chain is either the start of the new
		 * chain or added to the tail of the new chain.
		 */
		if (head == NULL) {
			head = tmphead;
			tail = tmptail;
		} else {
			/* Attach the new mblk to the end of the new chain. */
			tail->b_next = tmphead;
			tail = tmptail;
		}

		mp = next;
	}

	/* Hand the (possibly empty) rebuilt chain back to the caller. */
	*mp_chain = head;

	if (otail != NULL)
		*otail = tail;

	if (ocount != NULL)
		*ocount = count;
}
1428
1429 /*
1430 * Add VLAN tag to the specified mblk.
1431 */
1432 mblk_t *
mac_add_vlan_tag(mblk_t * mp,uint_t pri,uint16_t vid)1433 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1434 {
1435 mblk_t *hmp;
1436 struct ether_vlan_header *evhp;
1437 struct ether_header *ehp;
1438
1439 ASSERT(pri != 0 || vid != 0);
1440
1441 /*
1442 * Allocate an mblk for the new tagged ethernet header,
1443 * and copy the MAC addresses and ethertype from the
1444 * original header.
1445 */
1446
1447 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1448 if (hmp == NULL) {
1449 freemsg(mp);
1450 return (NULL);
1451 }
1452
1453 evhp = (struct ether_vlan_header *)hmp->b_rptr;
1454 ehp = (struct ether_header *)mp->b_rptr;
1455
1456 bcopy(ehp, evhp, (ETHERADDRL * 2));
1457 evhp->ether_type = ehp->ether_type;
1458 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1459
1460 hmp->b_wptr += sizeof (struct ether_vlan_header);
1461 mp->b_rptr += sizeof (struct ether_header);
1462
1463 /*
1464 * Free the original message if it's now empty. Link the
1465 * rest of messages to the header message.
1466 */
1467 mac_hcksum_clone(mp, hmp);
1468 if (MBLKL(mp) == 0) {
1469 hmp->b_cont = mp->b_cont;
1470 freeb(mp);
1471 } else {
1472 hmp->b_cont = mp;
1473 }
1474 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1475
1476 /*
1477 * Initialize the new TCI (Tag Control Information).
1478 */
1479 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1480
1481 return (hmp);
1482 }
1483
1484 /*
1485 * Adds a VLAN tag with the specified VID and priority to each mblk of
1486 * the specified chain.
1487 */
1488 mblk_t *
mac_add_vlan_tag_chain(mblk_t * mp_chain,uint_t pri,uint16_t vid)1489 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1490 {
1491 mblk_t *next_mp, **prev, *mp;
1492
1493 mp = mp_chain;
1494 prev = &mp_chain;
1495
1496 while (mp != NULL) {
1497 next_mp = mp->b_next;
1498 mp->b_next = NULL;
1499 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1500 freemsgchain(next_mp);
1501 break;
1502 }
1503 *prev = mp;
1504 prev = &mp->b_next;
1505 mp = mp->b_next = next_mp;
1506 }
1507
1508 return (mp_chain);
1509 }
1510
1511 /*
1512 * Strip VLAN tag
1513 */
mblk_t *
mac_strip_vlan_tag(mblk_t *mp)
{
	mblk_t *newmp;
	struct ether_vlan_header *evhp;

	/* An untagged packet is returned unchanged. */
	evhp = (struct ether_vlan_header *)mp->b_rptr;
	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));

		/*
		 * The dblk is shared; copy before modifying it.
		 * NOTE(review): on copymsg() failure the original
		 * 'mp' is not freed here -- the chain variant still
		 * references it via its chain pointer, so confirm
		 * that before changing this path to freemsg().
		 */
		if (DB_REF(mp) > 1) {
			newmp = copymsg(mp);
			if (newmp == NULL)
				return (NULL);
			freemsg(mp);
			mp = newmp;
		}

		evhp = (struct ether_vlan_header *)mp->b_rptr;

		/*
		 * Slide the destination/source MAC addresses forward
		 * over the 4-byte tag, then advance b_rptr past it.
		 */
		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
		mp->b_rptr += VLAN_TAGSZ;
	}
	return (mp);
}
1539
1540 /*
1541 * Strip VLAN tag from each mblk of the chain.
1542 */
mblk_t *
mac_strip_vlan_tag_chain(mblk_t *mp_chain)
{
	mblk_t *mp, *next_mp, **prev;

	mp = mp_chain;
	prev = &mp_chain;

	while (mp != NULL) {
		/* Unlink the current packet before stripping it. */
		next_mp = mp->b_next;
		mp->b_next = NULL;
		/*
		 * On failure drop the remainder of the chain. Note
		 * that '*prev' is not updated, so the returned chain
		 * still ends with the (unstripped) mblk that
		 * mac_strip_vlan_tag() did not free.
		 */
		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
			freemsgchain(next_mp);
			break;
		}
		/* Relink the stripped packet and advance. */
		*prev = mp;
		prev = &mp->b_next;
		mp = mp->b_next = next_mp;
	}

	return (mp_chain);
}
1565
1566 /*
1567 * Default callback function. Used when the datapath is not yet initialized.
1568 */
1569 /* ARGSUSED */
void
mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
    boolean_t loopback)
{
	/* No consumer is ready yet; drop the whole chain. */
	freemsgchain(mp_chain);
}
1576
1577 /*
1578 * Determines the IPv6 header length accounting for all the optional IPv6
1579 * headers (hop-by-hop, destination, routing and fragment). The header length
1580 * and next header value (a transport header) is captured.
1581 *
1582 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
1583 * returns B_TRUE.
1584 */
boolean_t
mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
    uint8_t *next_hdr, ip6_frag_t **fragp)
{
	uint16_t length;
	uint_t ehdrlen;
	uint8_t *whereptr;
	uint8_t *nexthdrp;
	ip6_dest_t *desthdr;
	ip6_rthdr_t *rthdr;
	ip6_frag_t *fraghdr;

	/* The fixed IPv6 header must lie entirely before endptr. */
	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
		return (B_FALSE);
	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
	length = IPV6_HDR_LEN;
	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */

	if (fragp != NULL)
		*fragp = NULL;

	/*
	 * Walk the extension-header chain. 'nexthdrp' always points
	 * at the next-header byte of the header just consumed.
	 */
	nexthdrp = &ip6h->ip6_nxt;
	while (whereptr < endptr) {
		/* Is there enough left for len + nexthdr? */
		if (whereptr + MIN_EHDR_LEN > endptr)
			break;

		switch (*nexthdrp) {
		case IPPROTO_HOPOPTS:
		case IPPROTO_DSTOPTS:
			/* Assumes the headers are identical for hbh and dst */
			desthdr = (ip6_dest_t *)whereptr;
			/* ip6d_len is in 8-octet units, excluding the first */
			ehdrlen = 8 * (desthdr->ip6d_len + 1);
			if ((uchar_t *)desthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &desthdr->ip6d_nxt;
			break;
		case IPPROTO_ROUTING:
			rthdr = (ip6_rthdr_t *)whereptr;
			ehdrlen = 8 * (rthdr->ip6r_len + 1);
			if ((uchar_t *)rthdr + ehdrlen > endptr)
				return (B_FALSE);
			nexthdrp = &rthdr->ip6r_nxt;
			break;
		case IPPROTO_FRAGMENT:
			fraghdr = (ip6_frag_t *)whereptr;
			ehdrlen = sizeof (ip6_frag_t);
			if ((uchar_t *)&fraghdr[1] > endptr)
				return (B_FALSE);
			nexthdrp = &fraghdr->ip6f_nxt;
			/* Report the fragment header if the caller asked. */
			if (fragp != NULL)
				*fragp = fraghdr;
			break;
		case IPPROTO_NONE:
			/* No next header means we're finished */
		default:
			/* A transport (or unknown) header: we are done. */
			*hdr_length = length;
			*next_hdr = *nexthdrp;
			return (B_TRUE);
		}
		length += ehdrlen;
		whereptr += ehdrlen;
		*hdr_length = length;
		*next_hdr = *nexthdrp;
	}
	switch (*nexthdrp) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
	case IPPROTO_ROUTING:
	case IPPROTO_FRAGMENT:
		/*
		 * If any known extension headers are still to be processed,
		 * the packet's malformed (or at least all the IP header(s) are
		 * not in the same mblk - and that should never happen.
		 */
		return (B_FALSE);

	default:
		/*
		 * If we get here, we know that all of the IP headers were in
		 * the same mblk, even if the ULP header is in the next mblk.
		 */
		*hdr_length = length;
		*next_hdr = *nexthdrp;
		return (B_TRUE);
	}
}
1672
1673 /*
1674 * The following set of routines are there to take care of interrupt
1675 * re-targeting for legacy (fixed) interrupts. Some older versions
1676 * of the popular NICs like e1000g do not support MSI-X interrupts
1677 * and they reserve fixed interrupts for RX/TX rings. To re-target
1678 * these interrupts, PCITOOL ioctls need to be used.
1679 */
1680 typedef struct mac_dladm_intr {
1681 int ino;
1682 int cpu_id;
1683 char driver_path[MAXPATHLEN];
1684 char nexus_path[MAXPATHLEN];
1685 } mac_dladm_intr_t;
1686
1687 /* Bind the interrupt to cpu_num */
1688 static int
mac_set_intr(ldi_handle_t lh,processorid_t cpu_num,int oldcpuid,int ino)1689 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1690 {
1691 pcitool_intr_set_t iset;
1692 int err;
1693
1694 iset.old_cpu = oldcpuid;
1695 iset.ino = ino;
1696 iset.cpu_id = cpu_num;
1697 iset.user_version = PCITOOL_VERSION;
1698 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1699 kcred, NULL);
1700
1701 return (err);
1702 }
1703
1704 /*
1705 * Search interrupt information. iget is filled in with the info to search
1706 */
1707 static boolean_t
mac_search_intrinfo(pcitool_intr_get_t * iget_p,mac_dladm_intr_t * dln)1708 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1709 {
1710 int i;
1711 char driver_path[2 * MAXPATHLEN];
1712
1713 for (i = 0; i < iget_p->num_devs; i++) {
1714 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1715 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1716 ":%s%d", iget_p->dev[i].driver_name,
1717 iget_p->dev[i].dev_inst);
1718 /* Match the device path for the device path */
1719 if (strcmp(driver_path, dln->driver_path) == 0) {
1720 dln->ino = iget_p->ino;
1721 dln->cpu_id = iget_p->cpu_id;
1722 return (B_TRUE);
1723 }
1724 }
1725 return (B_FALSE);
1726 }
1727
1728 /*
1729 * Get information about ino, i.e. if this is the interrupt for our
1730 * device and where it is bound etc.
1731 */
1732 static boolean_t
mac_get_single_intr(ldi_handle_t lh,int oldcpuid,int ino,mac_dladm_intr_t * dln)1733 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
1734 mac_dladm_intr_t *dln)
1735 {
1736 pcitool_intr_get_t *iget_p;
1737 int ipsz;
1738 int nipsz;
1739 int err;
1740 uint8_t inum;
1741
1742 /*
1743 * Check if SLEEP is OK, i.e if could come here in response to
1744 * changing the fanout due to some callback from the driver, say
1745 * link speed changes.
1746 */
1747 ipsz = PCITOOL_IGET_SIZE(0);
1748 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1749
1750 iget_p->num_devs_ret = 0;
1751 iget_p->user_version = PCITOOL_VERSION;
1752 iget_p->cpu_id = oldcpuid;
1753 iget_p->ino = ino;
1754
1755 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1756 FKIOCTL, kcred, NULL);
1757 if (err != 0) {
1758 kmem_free(iget_p, ipsz);
1759 return (B_FALSE);
1760 }
1761 if (iget_p->num_devs == 0) {
1762 kmem_free(iget_p, ipsz);
1763 return (B_FALSE);
1764 }
1765 inum = iget_p->num_devs;
1766 if (iget_p->num_devs_ret < iget_p->num_devs) {
1767 /* Reallocate */
1768 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
1769
1770 kmem_free(iget_p, ipsz);
1771 ipsz = nipsz;
1772 iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1773
1774 iget_p->num_devs_ret = inum;
1775 iget_p->cpu_id = oldcpuid;
1776 iget_p->ino = ino;
1777 iget_p->user_version = PCITOOL_VERSION;
1778 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1779 FKIOCTL, kcred, NULL);
1780 if (err != 0) {
1781 kmem_free(iget_p, ipsz);
1782 return (B_FALSE);
1783 }
1784 /* defensive */
1785 if (iget_p->num_devs != iget_p->num_devs_ret) {
1786 kmem_free(iget_p, ipsz);
1787 return (B_FALSE);
1788 }
1789 }
1790
1791 if (mac_search_intrinfo(iget_p, dln)) {
1792 kmem_free(iget_p, ipsz);
1793 return (B_TRUE);
1794 }
1795 kmem_free(iget_p, ipsz);
1796 return (B_FALSE);
1797 }
1798
1799 /*
1800 * Get the interrupts and check each one to see if it is for our device.
1801 */
1802 static int
mac_validate_intr(ldi_handle_t lh,mac_dladm_intr_t * dln,processorid_t cpuid)1803 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1804 {
1805 pcitool_intr_info_t intr_info;
1806 int err;
1807 int ino;
1808 int oldcpuid;
1809
1810 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1811 FKIOCTL, kcred, NULL);
1812 if (err != 0)
1813 return (-1);
1814
1815 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
1816 for (ino = 0; ino < intr_info.num_intr; ino++) {
1817 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
1818 if (dln->cpu_id == cpuid)
1819 return (0);
1820 return (1);
1821 }
1822 }
1823 }
1824 return (-1);
1825 }
1826
1827 /*
1828 * Obtain the nexus parent node info. for mdip.
1829 */
static dev_info_t *
mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
{
	struct dev_info *tdip = (struct dev_info *)mdip;
	struct ddi_minor_data *minordata;
	int circ;
	dev_info_t *pdip;
	char pathname[MAXPATHLEN];

	/*
	 * Walk up the device tree from mdip looking for an ancestor
	 * that exposes an interrupt-control (DDI_NT_INTRCTL) minor
	 * node; that nexus provides the PCITOOL interface.
	 */
	while (tdip != NULL) {
		/*
		 * The netboot code could call this function while walking the
		 * device tree so we need to use ndi_devi_tryenter() here to
		 * avoid deadlock.
		 */
		if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0)
			break;

		for (minordata = tdip->devi_minor; minordata != NULL;
		    minordata = minordata->next) {
			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
			    strlen(DDI_NT_INTRCTL)) == 0) {
				pdip = minordata->dip;
				/* Record "/devices<path>:intr" for ldi open. */
				(void) ddi_pathname(pdip, pathname);
				(void) snprintf(dln->nexus_path, MAXPATHLEN,
				    "/devices%s:intr", pathname);
				(void) ddi_pathname_minor(minordata, pathname);
				ndi_devi_exit((dev_info_t *)tdip, circ);
				return (pdip);
			}
		}
		ndi_devi_exit((dev_info_t *)tdip, circ);
		tdip = tdip->devi_parent;
	}
	/* No interrupt-control nexus found above mdip. */
	return (NULL);
}
1866
1867 /*
1868 * For a primary MAC client, if the user has set a list or CPUs or
1869 * we have obtained it implicitly, we try to retarget the interrupt
1870 * for that device on one of the CPUs in the list.
1871 * We assign the interrupt to the same CPU as the poll thread.
1872 */
1873 static boolean_t
mac_check_interrupt_binding(dev_info_t * mdip,int32_t cpuid)1874 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1875 {
1876 ldi_handle_t lh = NULL;
1877 ldi_ident_t li = NULL;
1878 int err;
1879 int ret;
1880 mac_dladm_intr_t dln;
1881 dev_info_t *dip;
1882 struct ddi_minor_data *minordata;
1883
1884 dln.nexus_path[0] = '\0';
1885 dln.driver_path[0] = '\0';
1886
1887 minordata = ((struct dev_info *)mdip)->devi_minor;
1888 while (minordata != NULL) {
1889 if (minordata->type == DDM_MINOR)
1890 break;
1891 minordata = minordata->next;
1892 }
1893 if (minordata == NULL)
1894 return (B_FALSE);
1895
1896 (void) ddi_pathname_minor(minordata, dln.driver_path);
1897
1898 dip = mac_get_nexus_node(mdip, &dln);
1899 /* defensive */
1900 if (dip == NULL)
1901 return (B_FALSE);
1902
1903 err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1904 if (err != 0)
1905 return (B_FALSE);
1906
1907 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1908 if (err != 0)
1909 return (B_FALSE);
1910
1911 ret = mac_validate_intr(lh, &dln, cpuid);
1912 if (ret < 0) {
1913 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1914 return (B_FALSE);
1915 }
1916 /* cmn_note? */
1917 if (ret != 0)
1918 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
1919 != 0) {
1920 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1921 return (B_FALSE);
1922 }
1923 (void) ldi_close(lh, FREAD|FWRITE, kcred);
1924 return (B_TRUE);
1925 }
1926
void
mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
{
	dev_info_t *mdip = (dev_info_t *)arg;
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_resource_props_t *mrp;
	mac_perim_handle_t mph;
	flow_entry_t *flent = mcip->mci_flent;
	mac_soft_ring_set_t *rx_srs;
	mac_cpus_t *srs_cpu;

	/* Record -1 if the interrupt could not be bound to 'cpuid'. */
	if (!mac_check_interrupt_binding(mdip, cpuid))
		cpuid = -1;
	/* Update the resource props under the MAC perimeter. */
	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
	mrp = MCIP_RESOURCE_PROPS(mcip);
	mrp->mrp_rx_intr_cpu = cpuid;
	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
		/* Also record it on the second (non-TX) Rx SRS. */
		rx_srs = flent->fe_rx_srs[1];
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_rx_intr_cpu = cpuid;
	}
	mac_perim_exit(mph);
}
1950
int32_t
mac_client_intr_cpu(mac_client_handle_t mch)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_cpus_t *srs_cpu;
	mac_soft_ring_set_t *rx_srs;
	flow_entry_t *flent = mcip->mci_flent;
	mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_ring_t *ring;
	mac_intr_t *mintr;

	/*
	 * Check if we need to retarget the interrupt. We do this only
	 * for the primary MAC client. We do this if we have the only
	 * exclusive ring in the group.
	 */
	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
		rx_srs = flent->fe_rx_srs[1];
		srs_cpu = &rx_srs->srs_cpu;
		ring = rx_srs->srs_ring;
		mintr = &ring->mr_info.mri_intr;
		/*
		 * If ddi_handle is present or the poll CPU is
		 * already bound to the interrupt CPU, return -1.
		 */
		if (mintr->mi_ddi_handle != NULL ||
		    ((mrp->mrp_ncpus != 0) &&
		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
			return (-1);
		}
		/* Suggest the poll thread's CPU as the interrupt target. */
		return (srs_cpu->mc_rx_pollid);
	}
	/* -1: no retargeting needed/possible. */
	return (-1);
}
1985
1986 void *
mac_get_devinfo(mac_handle_t mh)1987 mac_get_devinfo(mac_handle_t mh)
1988 {
1989 mac_impl_t *mip = (mac_impl_t *)mh;
1990
1991 return ((void *)mip->mi_dip);
1992 }
1993
1994 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1])
1995 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3])
1996 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5])
1997
1998 uint64_t
mac_pkt_hash(uint_t media,mblk_t * mp,uint8_t policy,boolean_t is_outbound)1999 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
2000 {
2001 struct ether_header *ehp;
2002 uint64_t hash = 0;
2003 uint16_t sap;
2004 uint_t skip_len;
2005 uint8_t proto;
2006 boolean_t ip_fragmented;
2007
2008 /*
2009 * We may want to have one of these per MAC type plugin in the
2010 * future. For now supports only ethernet.
2011 */
2012 if (media != DL_ETHER)
2013 return (0L);
2014
2015 /* for now we support only outbound packets */
2016 ASSERT(is_outbound);
2017 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
2018 ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
2019
2020 /* compute L2 hash */
2021
2022 ehp = (struct ether_header *)mp->b_rptr;
2023
2024 if ((policy & MAC_PKT_HASH_L2) != 0) {
2025 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
2026 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
2027 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
2028 policy &= ~MAC_PKT_HASH_L2;
2029 }
2030
2031 if (policy == 0)
2032 goto done;
2033
2034 /* skip ethernet header */
2035
2036 sap = ntohs(ehp->ether_type);
2037 if (sap == ETHERTYPE_VLAN) {
2038 struct ether_vlan_header *evhp;
2039 mblk_t *newmp = NULL;
2040
2041 skip_len = sizeof (struct ether_vlan_header);
2042 if (MBLKL(mp) < skip_len) {
2043 /* the vlan tag is the payload, pull up first */
2044 newmp = msgpullup(mp, -1);
2045 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
2046 goto done;
2047 }
2048 evhp = (struct ether_vlan_header *)newmp->b_rptr;
2049 } else {
2050 evhp = (struct ether_vlan_header *)mp->b_rptr;
2051 }
2052
2053 sap = ntohs(evhp->ether_type);
2054 freemsg(newmp);
2055 } else {
2056 skip_len = sizeof (struct ether_header);
2057 }
2058
2059 /* if ethernet header is in its own mblk, skip it */
2060 if (MBLKL(mp) <= skip_len) {
2061 skip_len -= MBLKL(mp);
2062 mp = mp->b_cont;
2063 if (mp == NULL)
2064 goto done;
2065 }
2066
2067 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
2068
2069 /* compute IP src/dst addresses hash and skip IPv{4,6} header */
2070
2071 switch (sap) {
2072 case ETHERTYPE_IP: {
2073 ipha_t *iphp;
2074
2075 /*
2076 * If the header is not aligned or the header doesn't fit
2077 * in the mblk, bail now. Note that this may cause packets
2078 * reordering.
2079 */
2080 iphp = (ipha_t *)(mp->b_rptr + skip_len);
2081 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
2082 !OK_32PTR((char *)iphp))
2083 goto done;
2084
2085 proto = iphp->ipha_protocol;
2086 skip_len += IPH_HDR_LENGTH(iphp);
2087
2088 /* Check if the packet is fragmented. */
2089 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
2090 IPH_OFFSET;
2091
2092 /*
2093 * For fragmented packets, use addresses in addition to
2094 * the frag_id to generate the hash inorder to get
2095 * better distribution.
2096 */
2097 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2098 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2099 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2100
2101 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2102 PKT_HASH_4BYTES(ip_dst));
2103 policy &= ~MAC_PKT_HASH_L3;
2104 }
2105
2106 if (ip_fragmented) {
2107 uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
2108 hash ^= PKT_HASH_2BYTES(identp);
2109 goto done;
2110 }
2111 break;
2112 }
2113 case ETHERTYPE_IPV6: {
2114 ip6_t *ip6hp;
2115 ip6_frag_t *frag = NULL;
2116 uint16_t hdr_length;
2117
2118 /*
2119 * If the header is not aligned or the header doesn't fit
2120 * in the mblk, bail now. Note that this may cause packets
2121 * reordering.
2122 */
2123
2124 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2125 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2126 !OK_32PTR((char *)ip6hp))
2127 goto done;
2128
2129 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
2130 &proto, &frag))
2131 goto done;
2132 skip_len += hdr_length;
2133
2134 /*
2135 * For fragmented packets, use addresses in addition to
2136 * the frag_id to generate the hash inorder to get
2137 * better distribution.
2138 */
2139 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2140 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2141 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2142
2143 hash ^= (PKT_HASH_4BYTES(ip_src) ^
2144 PKT_HASH_4BYTES(ip_dst));
2145 policy &= ~MAC_PKT_HASH_L3;
2146 }
2147
2148 if (frag != NULL) {
2149 uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
2150 hash ^= PKT_HASH_4BYTES(identp);
2151 goto done;
2152 }
2153 break;
2154 }
2155 default:
2156 goto done;
2157 }
2158
2159 if (policy == 0)
2160 goto done;
2161
2162 /* if ip header is in its own mblk, skip it */
2163 if (MBLKL(mp) <= skip_len) {
2164 skip_len -= MBLKL(mp);
2165 mp = mp->b_cont;
2166 if (mp == NULL)
2167 goto done;
2168 }
2169
2170 /* parse ULP header */
2171 again:
2172 switch (proto) {
2173 case IPPROTO_TCP:
2174 case IPPROTO_UDP:
2175 case IPPROTO_ESP:
2176 case IPPROTO_SCTP:
2177 /*
2178 * These Internet Protocols are intentionally designed
2179 * for hashing from the git-go. Port numbers are in the first
2180 * word for transports, SPI is first for ESP.
2181 */
2182 if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2183 goto done;
2184 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2185 break;
2186
2187 case IPPROTO_AH: {
2188 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2189 uint_t ah_length = AH_TOTAL_LEN(ah);
2190
2191 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2192 goto done;
2193
2194 proto = ah->ah_nexthdr;
2195 skip_len += ah_length;
2196
2197 /* if AH header is in its own mblk, skip it */
2198 if (MBLKL(mp) <= skip_len) {
2199 skip_len -= MBLKL(mp);
2200 mp = mp->b_cont;
2201 if (mp == NULL)
2202 goto done;
2203 }
2204
2205 goto again;
2206 }
2207 }
2208
2209 done:
2210 return (hash);
2211 }
2212