1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2018 Joyent, Inc. 24 */ 25 26 /* 27 * MAC Services Module - misc utilities 28 */ 29 30 #include <sys/types.h> 31 #include <sys/mac.h> 32 #include <sys/mac_impl.h> 33 #include <sys/mac_client_priv.h> 34 #include <sys/mac_client_impl.h> 35 #include <sys/mac_soft_ring.h> 36 #include <sys/strsubr.h> 37 #include <sys/strsun.h> 38 #include <sys/vlan.h> 39 #include <sys/pattr.h> 40 #include <sys/pci_tools.h> 41 #include <inet/ip.h> 42 #include <inet/ip_impl.h> 43 #include <inet/ip6.h> 44 #include <sys/vtrace.h> 45 #include <sys/dlpi.h> 46 #include <sys/sunndi.h> 47 #include <inet/ipsec_impl.h> 48 #include <inet/sadb.h> 49 #include <inet/ipsecesp.h> 50 #include <inet/ipsecah.h> 51 52 /* 53 * Copy an mblk, preserving its hardware checksum flags. 54 */ 55 static mblk_t * 56 mac_copymsg_cksum(mblk_t *mp) 57 { 58 mblk_t *mp1; 59 60 mp1 = copymsg(mp); 61 if (mp1 == NULL) 62 return (NULL); 63 64 mac_hcksum_clone(mp, mp1); 65 66 return (mp1); 67 } 68 69 /* 70 * Copy an mblk chain, presenting the hardware checksum flags of the 71 * individual mblks. 72 */ 73 mblk_t * 74 mac_copymsgchain_cksum(mblk_t *mp) 75 { 76 mblk_t *nmp = NULL; 77 mblk_t **nmpp = &nmp; 78 79 for (; mp != NULL; mp = mp->b_next) { 80 if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) { 81 freemsgchain(nmp); 82 return (NULL); 83 } 84 85 nmpp = &((*nmpp)->b_next); 86 } 87 88 return (nmp); 89 } 90 91 /* 92 * Process the specified mblk chain for proper handling of hardware 93 * checksum offload. This routine is invoked for loopback traffic 94 * between MAC clients. 95 * The function handles a NULL mblk chain passed as argument. 96 */ 97 mblk_t * 98 mac_fix_cksum(mblk_t *mp_chain) 99 { 100 mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; 101 uint32_t flags, start, stuff, end, value; 102 103 for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { 104 uint16_t len; 105 uint32_t offset; 106 struct ether_header *ehp; 107 uint16_t sap; 108 109 mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags); 110 if (flags == 0) 111 continue; 112 113 /* 114 * Since the processing of checksum offload for loopback 115 * traffic requires modification of the packet contents, 116 * ensure sure that we are always modifying our own copy. 117 */ 118 if (DB_REF(mp) > 1) { 119 mp1 = copymsg(mp); 120 if (mp1 == NULL) 121 continue; 122 mp1->b_next = mp->b_next; 123 mp->b_next = NULL; 124 freemsg(mp); 125 if (prev != NULL) 126 prev->b_next = mp1; 127 else 128 new_chain = mp1; 129 mp = mp1; 130 } 131 132 /* 133 * Ethernet, and optionally VLAN header. 134 */ 135 /* LINTED: improper alignment cast */ 136 ehp = (struct ether_header *)mp->b_rptr; 137 if (ntohs(ehp->ether_type) == VLAN_TPID) { 138 struct ether_vlan_header *evhp; 139 140 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 141 /* LINTED: improper alignment cast */ 142 evhp = (struct ether_vlan_header *)mp->b_rptr; 143 sap = ntohs(evhp->ether_type); 144 offset = sizeof (struct ether_vlan_header); 145 } else { 146 sap = ntohs(ehp->ether_type); 147 offset = sizeof (struct ether_header); 148 } 149 150 if (MBLKL(mp) <= offset) { 151 offset -= MBLKL(mp); 152 if (mp->b_cont == NULL) { 153 /* corrupted packet, skip it */ 154 if (prev != NULL) 155 prev->b_next = mp->b_next; 156 else 157 new_chain = mp->b_next; 158 mp1 = mp->b_next; 159 mp->b_next = NULL; 160 freemsg(mp); 161 mp = mp1; 162 continue; 163 } 164 mp = mp->b_cont; 165 } 166 167 if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { 168 ipha_t *ipha = NULL; 169 170 /* 171 * In order to compute the full and header 172 * checksums, we need to find and parse 173 * the IP and/or ULP headers. 174 */ 175 176 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 177 178 /* 179 * IP header. 180 */ 181 if (sap != ETHERTYPE_IP) 182 continue; 183 184 ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); 185 /* LINTED: improper alignment cast */ 186 ipha = (ipha_t *)(mp->b_rptr + offset); 187 188 if (flags & HCK_FULLCKSUM) { 189 ipaddr_t src, dst; 190 uint32_t cksum; 191 uint16_t *up; 192 uint8_t proto; 193 194 /* 195 * Pointer to checksum field in ULP header. 196 */ 197 proto = ipha->ipha_protocol; 198 ASSERT(ipha->ipha_version_and_hdr_length == 199 IP_SIMPLE_HDR_VERSION); 200 201 switch (proto) { 202 case IPPROTO_TCP: 203 /* LINTED: improper alignment cast */ 204 up = IPH_TCPH_CHECKSUMP(ipha, 205 IP_SIMPLE_HDR_LENGTH); 206 break; 207 208 case IPPROTO_UDP: 209 /* LINTED: improper alignment cast */ 210 up = IPH_UDPH_CHECKSUMP(ipha, 211 IP_SIMPLE_HDR_LENGTH); 212 break; 213 214 default: 215 cmn_err(CE_WARN, "mac_fix_cksum: " 216 "unexpected protocol: %d", proto); 217 continue; 218 } 219 220 /* 221 * Pseudo-header checksum. 222 */ 223 src = ipha->ipha_src; 224 dst = ipha->ipha_dst; 225 len = ntohs(ipha->ipha_length) - 226 IP_SIMPLE_HDR_LENGTH; 227 228 cksum = (dst >> 16) + (dst & 0xFFFF) + 229 (src >> 16) + (src & 0xFFFF); 230 cksum += htons(len); 231 232 /* 233 * The checksum value stored in the packet needs 234 * to be correct. Compute it here. 235 */ 236 *up = 0; 237 cksum += (((proto) == IPPROTO_UDP) ? 238 IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); 239 cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + 240 offset, cksum); 241 *(up) = (uint16_t)(cksum ? cksum : ~cksum); 242 243 /* 244 * Flag the packet so that it appears 245 * that the checksum has already been 246 * verified by the hardware. 247 */ 248 flags &= ~HCK_FULLCKSUM; 249 flags |= HCK_FULLCKSUM_OK; 250 value = 0; 251 } 252 253 if (flags & HCK_IPV4_HDRCKSUM) { 254 ASSERT(ipha != NULL); 255 ipha->ipha_hdr_checksum = 256 (uint16_t)ip_csum_hdr(ipha); 257 flags &= ~HCK_IPV4_HDRCKSUM; 258 flags |= HCK_IPV4_HDRCKSUM_OK; 259 260 } 261 } 262 263 if (flags & HCK_PARTIALCKSUM) { 264 uint16_t *up, partial, cksum; 265 uchar_t *ipp; /* ptr to beginning of IP header */ 266 267 if (mp->b_cont != NULL) { 268 mblk_t *mp1; 269 270 mp1 = msgpullup(mp, offset + end); 271 if (mp1 == NULL) 272 continue; 273 mp1->b_next = mp->b_next; 274 mp->b_next = NULL; 275 freemsg(mp); 276 if (prev != NULL) 277 prev->b_next = mp1; 278 else 279 new_chain = mp1; 280 mp = mp1; 281 } 282 283 ipp = mp->b_rptr + offset; 284 /* LINTED: cast may result in improper alignment */ 285 up = (uint16_t *)((uchar_t *)ipp + stuff); 286 partial = *up; 287 *up = 0; 288 289 cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, 290 end - start, partial); 291 cksum = ~cksum; 292 *up = cksum ? cksum : ~cksum; 293 294 /* 295 * Since we already computed the whole checksum, 296 * indicate to the stack that it has already 297 * been verified by the hardware. 298 */ 299 flags &= ~HCK_PARTIALCKSUM; 300 flags |= HCK_FULLCKSUM_OK; 301 value = 0; 302 } 303 304 mac_hcksum_set(mp, start, stuff, end, value, flags); 305 } 306 307 return (new_chain); 308 } 309 310 /* 311 * Add VLAN tag to the specified mblk. 312 */ 313 mblk_t * 314 mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) 315 { 316 mblk_t *hmp; 317 struct ether_vlan_header *evhp; 318 struct ether_header *ehp; 319 320 ASSERT(pri != 0 || vid != 0); 321 322 /* 323 * Allocate an mblk for the new tagged ethernet header, 324 * and copy the MAC addresses and ethertype from the 325 * original header. 326 */ 327 328 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); 329 if (hmp == NULL) { 330 freemsg(mp); 331 return (NULL); 332 } 333 334 evhp = (struct ether_vlan_header *)hmp->b_rptr; 335 ehp = (struct ether_header *)mp->b_rptr; 336 337 bcopy(ehp, evhp, (ETHERADDRL * 2)); 338 evhp->ether_type = ehp->ether_type; 339 evhp->ether_tpid = htons(ETHERTYPE_VLAN); 340 341 hmp->b_wptr += sizeof (struct ether_vlan_header); 342 mp->b_rptr += sizeof (struct ether_header); 343 344 /* 345 * Free the original message if it's now empty. Link the 346 * rest of messages to the header message. 347 */ 348 mac_hcksum_clone(mp, hmp); 349 if (MBLKL(mp) == 0) { 350 hmp->b_cont = mp->b_cont; 351 freeb(mp); 352 } else { 353 hmp->b_cont = mp; 354 } 355 ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); 356 357 /* 358 * Initialize the new TCI (Tag Control Information). 359 */ 360 evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); 361 362 return (hmp); 363 } 364 365 /* 366 * Adds a VLAN tag with the specified VID and priority to each mblk of 367 * the specified chain. 368 */ 369 mblk_t * 370 mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) 371 { 372 mblk_t *next_mp, **prev, *mp; 373 374 mp = mp_chain; 375 prev = &mp_chain; 376 377 while (mp != NULL) { 378 next_mp = mp->b_next; 379 mp->b_next = NULL; 380 if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { 381 freemsgchain(next_mp); 382 break; 383 } 384 *prev = mp; 385 prev = &mp->b_next; 386 mp = mp->b_next = next_mp; 387 } 388 389 return (mp_chain); 390 } 391 392 /* 393 * Strip VLAN tag 394 */ 395 mblk_t * 396 mac_strip_vlan_tag(mblk_t *mp) 397 { 398 mblk_t *newmp; 399 struct ether_vlan_header *evhp; 400 401 evhp = (struct ether_vlan_header *)mp->b_rptr; 402 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { 403 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); 404 405 if (DB_REF(mp) > 1) { 406 newmp = copymsg(mp); 407 if (newmp == NULL) 408 return (NULL); 409 freemsg(mp); 410 mp = newmp; 411 } 412 413 evhp = (struct ether_vlan_header *)mp->b_rptr; 414 415 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); 416 mp->b_rptr += VLAN_TAGSZ; 417 } 418 return (mp); 419 } 420 421 /* 422 * Strip VLAN tag from each mblk of the chain. 423 */ 424 mblk_t * 425 mac_strip_vlan_tag_chain(mblk_t *mp_chain) 426 { 427 mblk_t *mp, *next_mp, **prev; 428 429 mp = mp_chain; 430 prev = &mp_chain; 431 432 while (mp != NULL) { 433 next_mp = mp->b_next; 434 mp->b_next = NULL; 435 if ((mp = mac_strip_vlan_tag(mp)) == NULL) { 436 freemsgchain(next_mp); 437 break; 438 } 439 *prev = mp; 440 prev = &mp->b_next; 441 mp = mp->b_next = next_mp; 442 } 443 444 return (mp_chain); 445 } 446 447 /* 448 * Default callback function. Used when the datapath is not yet initialized. 449 */ 450 /* ARGSUSED */ 451 void 452 mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, 453 boolean_t loopback) 454 { 455 mblk_t *mp1 = mp; 456 457 while (mp1 != NULL) { 458 mp1->b_prev = NULL; 459 mp1->b_queue = NULL; 460 mp1 = mp1->b_next; 461 } 462 freemsgchain(mp); 463 } 464 465 /* 466 * Determines the IPv6 header length accounting for all the optional IPv6 467 * headers (hop-by-hop, destination, routing and fragment). The header length 468 * and next header value (a transport header) is captured. 469 * 470 * Returns B_FALSE if all the IP headers are not in the same mblk otherwise 471 * returns B_TRUE. 472 */ 473 boolean_t 474 mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length, 475 uint8_t *next_hdr, ip6_frag_t **fragp) 476 { 477 uint16_t length; 478 uint_t ehdrlen; 479 uint8_t *whereptr; 480 uint8_t *nexthdrp; 481 ip6_dest_t *desthdr; 482 ip6_rthdr_t *rthdr; 483 ip6_frag_t *fraghdr; 484 485 if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) 486 return (B_FALSE); 487 ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); 488 length = IPV6_HDR_LEN; 489 whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ 490 491 if (fragp != NULL) 492 *fragp = NULL; 493 494 nexthdrp = &ip6h->ip6_nxt; 495 while (whereptr < endptr) { 496 /* Is there enough left for len + nexthdr? */ 497 if (whereptr + MIN_EHDR_LEN > endptr) 498 break; 499 500 switch (*nexthdrp) { 501 case IPPROTO_HOPOPTS: 502 case IPPROTO_DSTOPTS: 503 /* Assumes the headers are identical for hbh and dst */ 504 desthdr = (ip6_dest_t *)whereptr; 505 ehdrlen = 8 * (desthdr->ip6d_len + 1); 506 if ((uchar_t *)desthdr + ehdrlen > endptr) 507 return (B_FALSE); 508 nexthdrp = &desthdr->ip6d_nxt; 509 break; 510 case IPPROTO_ROUTING: 511 rthdr = (ip6_rthdr_t *)whereptr; 512 ehdrlen = 8 * (rthdr->ip6r_len + 1); 513 if ((uchar_t *)rthdr + ehdrlen > endptr) 514 return (B_FALSE); 515 nexthdrp = &rthdr->ip6r_nxt; 516 break; 517 case IPPROTO_FRAGMENT: 518 fraghdr = (ip6_frag_t *)whereptr; 519 ehdrlen = sizeof (ip6_frag_t); 520 if ((uchar_t *)&fraghdr[1] > endptr) 521 return (B_FALSE); 522 nexthdrp = &fraghdr->ip6f_nxt; 523 if (fragp != NULL) 524 *fragp = fraghdr; 525 break; 526 case IPPROTO_NONE: 527 /* No next header means we're finished */ 528 default: 529 *hdr_length = length; 530 *next_hdr = *nexthdrp; 531 return (B_TRUE); 532 } 533 length += ehdrlen; 534 whereptr += ehdrlen; 535 *hdr_length = length; 536 *next_hdr = *nexthdrp; 537 } 538 switch (*nexthdrp) { 539 case IPPROTO_HOPOPTS: 540 case IPPROTO_DSTOPTS: 541 case IPPROTO_ROUTING: 542 case IPPROTO_FRAGMENT: 543 /* 544 * If any know extension headers are still to be processed, 545 * the packet's malformed (or at least all the IP header(s) are 546 * not in the same mblk - and that should never happen. 547 */ 548 return (B_FALSE); 549 550 default: 551 /* 552 * If we get here, we know that all of the IP headers were in 553 * the same mblk, even if the ULP header is in the next mblk. 554 */ 555 *hdr_length = length; 556 *next_hdr = *nexthdrp; 557 return (B_TRUE); 558 } 559 } 560 561 /* 562 * The following set of routines are there to take care of interrupt 563 * re-targeting for legacy (fixed) interrupts. Some older versions 564 * of the popular NICs like e1000g do not support MSI-X interrupts 565 * and they reserve fixed interrupts for RX/TX rings. To re-target 566 * these interrupts, PCITOOL ioctls need to be used. 567 */ 568 typedef struct mac_dladm_intr { 569 int ino; 570 int cpu_id; 571 char driver_path[MAXPATHLEN]; 572 char nexus_path[MAXPATHLEN]; 573 } mac_dladm_intr_t; 574 575 /* Bind the interrupt to cpu_num */ 576 static int 577 mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino) 578 { 579 pcitool_intr_set_t iset; 580 int err; 581 582 iset.old_cpu = oldcpuid; 583 iset.ino = ino; 584 iset.cpu_id = cpu_num; 585 iset.user_version = PCITOOL_VERSION; 586 err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL, 587 kcred, NULL); 588 589 return (err); 590 } 591 592 /* 593 * Search interrupt information. iget is filled in with the info to search 594 */ 595 static boolean_t 596 mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) 597 { 598 int i; 599 char driver_path[2 * MAXPATHLEN]; 600 601 for (i = 0; i < iget_p->num_devs; i++) { 602 (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); 603 (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, 604 ":%s%d", iget_p->dev[i].driver_name, 605 iget_p->dev[i].dev_inst); 606 /* Match the device path for the device path */ 607 if (strcmp(driver_path, dln->driver_path) == 0) { 608 dln->ino = iget_p->ino; 609 dln->cpu_id = iget_p->cpu_id; 610 return (B_TRUE); 611 } 612 } 613 return (B_FALSE); 614 } 615 616 /* 617 * Get information about ino, i.e. if this is the interrupt for our 618 * device and where it is bound etc. 619 */ 620 static boolean_t 621 mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino, 622 mac_dladm_intr_t *dln) 623 { 624 pcitool_intr_get_t *iget_p; 625 int ipsz; 626 int nipsz; 627 int err; 628 uint8_t inum; 629 630 /* 631 * Check if SLEEP is OK, i.e if could come here in response to 632 * changing the fanout due to some callback from the driver, say 633 * link speed changes. 634 */ 635 ipsz = PCITOOL_IGET_SIZE(0); 636 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 637 638 iget_p->num_devs_ret = 0; 639 iget_p->user_version = PCITOOL_VERSION; 640 iget_p->cpu_id = oldcpuid; 641 iget_p->ino = ino; 642 643 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 644 FKIOCTL, kcred, NULL); 645 if (err != 0) { 646 kmem_free(iget_p, ipsz); 647 return (B_FALSE); 648 } 649 if (iget_p->num_devs == 0) { 650 kmem_free(iget_p, ipsz); 651 return (B_FALSE); 652 } 653 inum = iget_p->num_devs; 654 if (iget_p->num_devs_ret < iget_p->num_devs) { 655 /* Reallocate */ 656 nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); 657 658 kmem_free(iget_p, ipsz); 659 ipsz = nipsz; 660 iget_p = kmem_zalloc(ipsz, KM_SLEEP); 661 662 iget_p->num_devs_ret = inum; 663 iget_p->cpu_id = oldcpuid; 664 iget_p->ino = ino; 665 iget_p->user_version = PCITOOL_VERSION; 666 err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, 667 FKIOCTL, kcred, NULL); 668 if (err != 0) { 669 kmem_free(iget_p, ipsz); 670 return (B_FALSE); 671 } 672 /* defensive */ 673 if (iget_p->num_devs != iget_p->num_devs_ret) { 674 kmem_free(iget_p, ipsz); 675 return (B_FALSE); 676 } 677 } 678 679 if (mac_search_intrinfo(iget_p, dln)) { 680 kmem_free(iget_p, ipsz); 681 return (B_TRUE); 682 } 683 kmem_free(iget_p, ipsz); 684 return (B_FALSE); 685 } 686 687 /* 688 * Get the interrupts and check each one to see if it is for our device. 689 */ 690 static int 691 mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) 692 { 693 pcitool_intr_info_t intr_info; 694 int err; 695 int ino; 696 int oldcpuid; 697 698 err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, 699 FKIOCTL, kcred, NULL); 700 if (err != 0) 701 return (-1); 702 703 for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) { 704 for (ino = 0; ino < intr_info.num_intr; ino++) { 705 if (mac_get_single_intr(lh, oldcpuid, ino, dln)) { 706 if (dln->cpu_id == cpuid) 707 return (0); 708 return (1); 709 } 710 } 711 } 712 return (-1); 713 } 714 715 /* 716 * Obtain the nexus parent node info. for mdip. 717 */ 718 static dev_info_t * 719 mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln) 720 { 721 struct dev_info *tdip = (struct dev_info *)mdip; 722 struct ddi_minor_data *minordata; 723 int circ; 724 dev_info_t *pdip; 725 char pathname[MAXPATHLEN]; 726 727 while (tdip != NULL) { 728 /* 729 * The netboot code could call this function while walking the 730 * device tree so we need to use ndi_devi_tryenter() here to 731 * avoid deadlock. 732 */ 733 if (ndi_devi_tryenter((dev_info_t *)tdip, &circ) == 0) 734 break; 735 736 for (minordata = tdip->devi_minor; minordata != NULL; 737 minordata = minordata->next) { 738 if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL, 739 strlen(DDI_NT_INTRCTL)) == 0) { 740 pdip = minordata->dip; 741 (void) ddi_pathname(pdip, pathname); 742 (void) snprintf(dln->nexus_path, MAXPATHLEN, 743 "/devices%s:intr", pathname); 744 (void) ddi_pathname_minor(minordata, pathname); 745 ndi_devi_exit((dev_info_t *)tdip, circ); 746 return (pdip); 747 } 748 } 749 ndi_devi_exit((dev_info_t *)tdip, circ); 750 tdip = tdip->devi_parent; 751 } 752 return (NULL); 753 } 754 755 /* 756 * For a primary MAC client, if the user has set a list or CPUs or 757 * we have obtained it implicitly, we try to retarget the interrupt 758 * for that device on one of the CPUs in the list. 759 * We assign the interrupt to the same CPU as the poll thread. 760 */ 761 static boolean_t 762 mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) 763 { 764 ldi_handle_t lh = NULL; 765 ldi_ident_t li = NULL; 766 int err; 767 int ret; 768 mac_dladm_intr_t dln; 769 dev_info_t *dip; 770 struct ddi_minor_data *minordata; 771 772 dln.nexus_path[0] = '\0'; 773 dln.driver_path[0] = '\0'; 774 775 minordata = ((struct dev_info *)mdip)->devi_minor; 776 while (minordata != NULL) { 777 if (minordata->type == DDM_MINOR) 778 break; 779 minordata = minordata->next; 780 } 781 if (minordata == NULL) 782 return (B_FALSE); 783 784 (void) ddi_pathname_minor(minordata, dln.driver_path); 785 786 dip = mac_get_nexus_node(mdip, &dln); 787 /* defensive */ 788 if (dip == NULL) 789 return (B_FALSE); 790 791 err = ldi_ident_from_major(ddi_driver_major(dip), &li); 792 if (err != 0) 793 return (B_FALSE); 794 795 err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); 796 if (err != 0) 797 return (B_FALSE); 798 799 ret = mac_validate_intr(lh, &dln, cpuid); 800 if (ret < 0) { 801 (void) ldi_close(lh, FREAD|FWRITE, kcred); 802 return (B_FALSE); 803 } 804 /* cmn_note? */ 805 if (ret != 0) 806 if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino))) 807 != 0) { 808 (void) ldi_close(lh, FREAD|FWRITE, kcred); 809 return (B_FALSE); 810 } 811 (void) ldi_close(lh, FREAD|FWRITE, kcred); 812 return (B_TRUE); 813 } 814 815 void 816 mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) 817 { 818 dev_info_t *mdip = (dev_info_t *)arg; 819 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 820 mac_resource_props_t *mrp; 821 mac_perim_handle_t mph; 822 flow_entry_t *flent = mcip->mci_flent; 823 mac_soft_ring_set_t *rx_srs; 824 mac_cpus_t *srs_cpu; 825 826 if (!mac_check_interrupt_binding(mdip, cpuid)) 827 cpuid = -1; 828 mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); 829 mrp = MCIP_RESOURCE_PROPS(mcip); 830 mrp->mrp_rx_intr_cpu = cpuid; 831 if (flent != NULL && flent->fe_rx_srs_cnt == 2) { 832 rx_srs = flent->fe_rx_srs[1]; 833 srs_cpu = &rx_srs->srs_cpu; 834 srs_cpu->mc_rx_intr_cpu = cpuid; 835 } 836 mac_perim_exit(mph); 837 } 838 839 int32_t 840 mac_client_intr_cpu(mac_client_handle_t mch) 841 { 842 mac_client_impl_t *mcip = (mac_client_impl_t *)mch; 843 mac_cpus_t *srs_cpu; 844 mac_soft_ring_set_t *rx_srs; 845 flow_entry_t *flent = mcip->mci_flent; 846 mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); 847 mac_ring_t *ring; 848 mac_intr_t *mintr; 849 850 /* 851 * Check if we need to retarget the interrupt. We do this only 852 * for the primary MAC client. We do this if we have the only 853 * exclusive ring in the group. 854 */ 855 if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { 856 rx_srs = flent->fe_rx_srs[1]; 857 srs_cpu = &rx_srs->srs_cpu; 858 ring = rx_srs->srs_ring; 859 mintr = &ring->mr_info.mri_intr; 860 /* 861 * If ddi_handle is present or the poll CPU is 862 * already bound to the interrupt CPU, return -1. 863 */ 864 if (mintr->mi_ddi_handle != NULL || 865 ((mrp->mrp_ncpus != 0) && 866 (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) { 867 return (-1); 868 } 869 return (srs_cpu->mc_rx_pollid); 870 } 871 return (-1); 872 } 873 874 void * 875 mac_get_devinfo(mac_handle_t mh) 876 { 877 mac_impl_t *mip = (mac_impl_t *)mh; 878 879 return ((void *)mip->mi_dip); 880 } 881 882 #define PKT_HASH_2BYTES(x) ((x)[0] ^ (x)[1]) 883 #define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) 884 #define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) 885 886 uint64_t 887 mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) 888 { 889 struct ether_header *ehp; 890 uint64_t hash = 0; 891 uint16_t sap; 892 uint_t skip_len; 893 uint8_t proto; 894 boolean_t ip_fragmented; 895 896 /* 897 * We may want to have one of these per MAC type plugin in the 898 * future. For now supports only ethernet. 899 */ 900 if (media != DL_ETHER) 901 return (0L); 902 903 /* for now we support only outbound packets */ 904 ASSERT(is_outbound); 905 ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); 906 ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); 907 908 /* compute L2 hash */ 909 910 ehp = (struct ether_header *)mp->b_rptr; 911 912 if ((policy & MAC_PKT_HASH_L2) != 0) { 913 uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; 914 uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; 915 hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); 916 policy &= ~MAC_PKT_HASH_L2; 917 } 918 919 if (policy == 0) 920 goto done; 921 922 /* skip ethernet header */ 923 924 sap = ntohs(ehp->ether_type); 925 if (sap == ETHERTYPE_VLAN) { 926 struct ether_vlan_header *evhp; 927 mblk_t *newmp = NULL; 928 929 skip_len = sizeof (struct ether_vlan_header); 930 if (MBLKL(mp) < skip_len) { 931 /* the vlan tag is the payload, pull up first */ 932 newmp = msgpullup(mp, -1); 933 if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { 934 goto done; 935 } 936 evhp = (struct ether_vlan_header *)newmp->b_rptr; 937 } else { 938 evhp = (struct ether_vlan_header *)mp->b_rptr; 939 } 940 941 sap = ntohs(evhp->ether_type); 942 freemsg(newmp); 943 } else { 944 skip_len = sizeof (struct ether_header); 945 } 946 947 /* if ethernet header is in its own mblk, skip it */ 948 if (MBLKL(mp) <= skip_len) { 949 skip_len -= MBLKL(mp); 950 mp = mp->b_cont; 951 if (mp == NULL) 952 goto done; 953 } 954 955 sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; 956 957 /* compute IP src/dst addresses hash and skip IPv{4,6} header */ 958 959 switch (sap) { 960 case ETHERTYPE_IP: { 961 ipha_t *iphp; 962 963 /* 964 * If the header is not aligned or the header doesn't fit 965 * in the mblk, bail now. Note that this may cause packets 966 * reordering. 967 */ 968 iphp = (ipha_t *)(mp->b_rptr + skip_len); 969 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || 970 !OK_32PTR((char *)iphp)) 971 goto done; 972 973 proto = iphp->ipha_protocol; 974 skip_len += IPH_HDR_LENGTH(iphp); 975 976 /* Check if the packet is fragmented. */ 977 ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) & 978 IPH_OFFSET; 979 980 /* 981 * For fragmented packets, use addresses in addition to 982 * the frag_id to generate the hash inorder to get 983 * better distribution. 984 */ 985 if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) { 986 uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); 987 uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); 988 989 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 990 PKT_HASH_4BYTES(ip_dst)); 991 policy &= ~MAC_PKT_HASH_L3; 992 } 993 994 if (ip_fragmented) { 995 uint8_t *identp = (uint8_t *)&iphp->ipha_ident; 996 hash ^= PKT_HASH_2BYTES(identp); 997 goto done; 998 } 999 break; 1000 } 1001 case ETHERTYPE_IPV6: { 1002 ip6_t *ip6hp; 1003 ip6_frag_t *frag = NULL; 1004 uint16_t hdr_length; 1005 1006 /* 1007 * If the header is not aligned or the header doesn't fit 1008 * in the mblk, bail now. Note that this may cause packets 1009 * reordering. 1010 */ 1011 1012 ip6hp = (ip6_t *)(mp->b_rptr + skip_len); 1013 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || 1014 !OK_32PTR((char *)ip6hp)) 1015 goto done; 1016 1017 if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length, 1018 &proto, &frag)) 1019 goto done; 1020 skip_len += hdr_length; 1021 1022 /* 1023 * For fragmented packets, use addresses in addition to 1024 * the frag_id to generate the hash inorder to get 1025 * better distribution. 1026 */ 1027 if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) { 1028 uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); 1029 uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); 1030 1031 hash ^= (PKT_HASH_4BYTES(ip_src) ^ 1032 PKT_HASH_4BYTES(ip_dst)); 1033 policy &= ~MAC_PKT_HASH_L3; 1034 } 1035 1036 if (frag != NULL) { 1037 uint8_t *identp = (uint8_t *)&frag->ip6f_ident; 1038 hash ^= PKT_HASH_4BYTES(identp); 1039 goto done; 1040 } 1041 break; 1042 } 1043 default: 1044 goto done; 1045 } 1046 1047 if (policy == 0) 1048 goto done; 1049 1050 /* if ip header is in its own mblk, skip it */ 1051 if (MBLKL(mp) <= skip_len) { 1052 skip_len -= MBLKL(mp); 1053 mp = mp->b_cont; 1054 if (mp == NULL) 1055 goto done; 1056 } 1057 1058 /* parse ULP header */ 1059 again: 1060 switch (proto) { 1061 case IPPROTO_TCP: 1062 case IPPROTO_UDP: 1063 case IPPROTO_ESP: 1064 case IPPROTO_SCTP: 1065 /* 1066 * These Internet Protocols are intentionally designed 1067 * for hashing from the git-go. Port numbers are in the first 1068 * word for transports, SPI is first for ESP. 1069 */ 1070 if (mp->b_rptr + skip_len + 4 > mp->b_wptr) 1071 goto done; 1072 hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); 1073 break; 1074 1075 case IPPROTO_AH: { 1076 ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); 1077 uint_t ah_length = AH_TOTAL_LEN(ah); 1078 1079 if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) 1080 goto done; 1081 1082 proto = ah->ah_nexthdr; 1083 skip_len += ah_length; 1084 1085 /* if AH header is in its own mblk, skip it */ 1086 if (MBLKL(mp) <= skip_len) { 1087 skip_len -= MBLKL(mp); 1088 mp = mp->b_cont; 1089 if (mp == NULL) 1090 goto done; 1091 } 1092 1093 goto again; 1094 } 1095 } 1096 1097 done: 1098 return (hash); 1099 } 1100