xref: /illumos-gate/usr/src/uts/common/io/mac/mac_util.c (revision 3fe80ca4)
1da14cebeSEric Cheng /*
2da14cebeSEric Cheng  * CDDL HEADER START
3da14cebeSEric Cheng  *
4da14cebeSEric Cheng  * The contents of this file are subject to the terms of the
5da14cebeSEric Cheng  * Common Development and Distribution License (the "License").
6da14cebeSEric Cheng  * You may not use this file except in compliance with the License.
7da14cebeSEric Cheng  *
8da14cebeSEric Cheng  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9da14cebeSEric Cheng  * or http://www.opensolaris.org/os/licensing.
10da14cebeSEric Cheng  * See the License for the specific language governing permissions
11da14cebeSEric Cheng  * and limitations under the License.
12da14cebeSEric Cheng  *
13da14cebeSEric Cheng  * When distributing Covered Code, include this CDDL HEADER in each
14da14cebeSEric Cheng  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15da14cebeSEric Cheng  * If applicable, add the following below this CDDL HEADER, with the
16da14cebeSEric Cheng  * fields enclosed by brackets "[]" replaced with your own identifying
17da14cebeSEric Cheng  * information: Portions Copyright [yyyy] [name of copyright owner]
18da14cebeSEric Cheng  *
19da14cebeSEric Cheng  * CDDL HEADER END
20da14cebeSEric Cheng  */
21da14cebeSEric Cheng /*
225cd376e8SJimmy Vetayases  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23c61a1653SRyan Zezeski  * Copyright 2019 Joyent, Inc.
24ff4179b7SLuqman Aden  * Copyright 2023 Oxide Computer Company
25da14cebeSEric Cheng  */
26da14cebeSEric Cheng 
27da14cebeSEric Cheng /*
28da14cebeSEric Cheng  * MAC Services Module - misc utilities
29da14cebeSEric Cheng  */
30da14cebeSEric Cheng 
31da14cebeSEric Cheng #include <sys/types.h>
32da14cebeSEric Cheng #include <sys/mac.h>
33da14cebeSEric Cheng #include <sys/mac_impl.h>
34da14cebeSEric Cheng #include <sys/mac_client_priv.h>
35da14cebeSEric Cheng #include <sys/mac_client_impl.h>
36da14cebeSEric Cheng #include <sys/mac_soft_ring.h>
37da14cebeSEric Cheng #include <sys/strsubr.h>
38da14cebeSEric Cheng #include <sys/strsun.h>
39da14cebeSEric Cheng #include <sys/vlan.h>
40da14cebeSEric Cheng #include <sys/pattr.h>
41da14cebeSEric Cheng #include <sys/pci_tools.h>
42da14cebeSEric Cheng #include <inet/ip.h>
43da14cebeSEric Cheng #include <inet/ip_impl.h>
44da14cebeSEric Cheng #include <inet/ip6.h>
45da14cebeSEric Cheng #include <sys/vtrace.h>
46da14cebeSEric Cheng #include <sys/dlpi.h>
47da14cebeSEric Cheng #include <sys/sunndi.h>
48ae6aa22aSVenugopal Iyer #include <inet/ipsec_impl.h>
49ae6aa22aSVenugopal Iyer #include <inet/sadb.h>
50ae6aa22aSVenugopal Iyer #include <inet/ipsecesp.h>
51ae6aa22aSVenugopal Iyer #include <inet/ipsecah.h>
52c61a1653SRyan Zezeski #include <inet/tcp.h>
53c61a1653SRyan Zezeski #include <inet/udp_impl.h>
54c61a1653SRyan Zezeski #include <inet/sctp_ip.h>
55c61a1653SRyan Zezeski 
56c61a1653SRyan Zezeski /*
57c61a1653SRyan Zezeski  * The next two functions are used for dropping packets or chains of
58c61a1653SRyan Zezeski  * packets, respectively. We could use one function for both but
59c61a1653SRyan Zezeski  * separating the use cases allows us to specify intent and prevent
60c61a1653SRyan Zezeski  * dropping more data than intended.
61c61a1653SRyan Zezeski  *
62c61a1653SRyan Zezeski  * The purpose of these functions is to aid the debugging effort,
63c61a1653SRyan Zezeski  * especially in production. Rather than use freemsg()/freemsgchain(),
64c61a1653SRyan Zezeski  * it's preferable to use these functions when dropping a packet in
65c61a1653SRyan Zezeski  * the MAC layer. These functions should only be used during
66c61a1653SRyan Zezeski  * unexpected conditions. That is, any time a packet is dropped
67c61a1653SRyan Zezeski  * outside of the regular, successful datapath. Consolidating all
68c61a1653SRyan Zezeski  * drops on these functions allows the user to trace one location and
69c61a1653SRyan Zezeski  * determine why the packet was dropped based on the msg. It also
70c61a1653SRyan Zezeski  * allows the user to inspect the packet before it is freed. Finally,
71c61a1653SRyan Zezeski  * it allows the user to avoid tracing freemsg()/freemsgchain() thus
72c61a1653SRyan Zezeski  * keeping the hot path running as efficiently as possible.
73c61a1653SRyan Zezeski  *
74c61a1653SRyan Zezeski  * NOTE: At this time not all MAC drops are aggregated on these
75c61a1653SRyan Zezeski  * functions; but that is the plan. This comment should be erased once
76c61a1653SRyan Zezeski  * completed.
77c61a1653SRyan Zezeski  */
78c61a1653SRyan Zezeski 
79c61a1653SRyan Zezeski /*PRINTFLIKE2*/
80c61a1653SRyan Zezeski void
mac_drop_pkt(mblk_t * mp,const char * fmt,...)81c61a1653SRyan Zezeski mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
82c61a1653SRyan Zezeski {
83c61a1653SRyan Zezeski 	va_list adx;
84c61a1653SRyan Zezeski 	char msg[128];
85c61a1653SRyan Zezeski 	char *msgp = msg;
86c61a1653SRyan Zezeski 
87c61a1653SRyan Zezeski 	ASSERT3P(mp->b_next, ==, NULL);
88c61a1653SRyan Zezeski 
89c61a1653SRyan Zezeski 	va_start(adx, fmt);
90c61a1653SRyan Zezeski 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
91c61a1653SRyan Zezeski 	va_end(adx);
92c61a1653SRyan Zezeski 
93c61a1653SRyan Zezeski 	DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
94c61a1653SRyan Zezeski 	freemsg(mp);
95c61a1653SRyan Zezeski }
96c61a1653SRyan Zezeski 
97c61a1653SRyan Zezeski /*PRINTFLIKE2*/
98c61a1653SRyan Zezeski void
mac_drop_chain(mblk_t * chain,const char * fmt,...)99c61a1653SRyan Zezeski mac_drop_chain(mblk_t *chain, const char *fmt, ...)
100c61a1653SRyan Zezeski {
101c61a1653SRyan Zezeski 	va_list adx;
102c61a1653SRyan Zezeski 	char msg[128];
103c61a1653SRyan Zezeski 	char *msgp = msg;
104c61a1653SRyan Zezeski 
105c61a1653SRyan Zezeski 	va_start(adx, fmt);
106c61a1653SRyan Zezeski 	(void) vsnprintf(msgp, sizeof (msg), fmt, adx);
107c61a1653SRyan Zezeski 	va_end(adx);
108c61a1653SRyan Zezeski 
109c61a1653SRyan Zezeski 	/*
110c61a1653SRyan Zezeski 	 * We could use freemsgchain() for the actual freeing but
111c61a1653SRyan Zezeski 	 * since we are already walking the chain to fire the dtrace
112c61a1653SRyan Zezeski 	 * probe we might as well free the msg here too.
113c61a1653SRyan Zezeski 	 */
114c61a1653SRyan Zezeski 	for (mblk_t *mp = chain, *next; mp != NULL; ) {
115c61a1653SRyan Zezeski 		next = mp->b_next;
116c61a1653SRyan Zezeski 		DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
117ff4179b7SLuqman Aden 		mp->b_next = NULL;
118c61a1653SRyan Zezeski 		freemsg(mp);
119c61a1653SRyan Zezeski 		mp = next;
120c61a1653SRyan Zezeski 	}
121c61a1653SRyan Zezeski }
122da14cebeSEric Cheng 
123da14cebeSEric Cheng /*
124da14cebeSEric Cheng  * Copy an mblk, preserving its hardware checksum flags.
125da14cebeSEric Cheng  */
126da14cebeSEric Cheng static mblk_t *
mac_copymsg_cksum(mblk_t * mp)127da14cebeSEric Cheng mac_copymsg_cksum(mblk_t *mp)
128da14cebeSEric Cheng {
129da14cebeSEric Cheng 	mblk_t *mp1;
130da14cebeSEric Cheng 
131da14cebeSEric Cheng 	mp1 = copymsg(mp);
132da14cebeSEric Cheng 	if (mp1 == NULL)
133da14cebeSEric Cheng 		return (NULL);
134da14cebeSEric Cheng 
135ec71f88eSPatrick Mooney 	mac_hcksum_clone(mp, mp1);
136da14cebeSEric Cheng 
137da14cebeSEric Cheng 	return (mp1);
138da14cebeSEric Cheng }
139da14cebeSEric Cheng 
140da14cebeSEric Cheng /*
141da14cebeSEric Cheng  * Copy an mblk chain, presenting the hardware checksum flags of the
142da14cebeSEric Cheng  * individual mblks.
143da14cebeSEric Cheng  */
144da14cebeSEric Cheng mblk_t *
mac_copymsgchain_cksum(mblk_t * mp)145da14cebeSEric Cheng mac_copymsgchain_cksum(mblk_t *mp)
146da14cebeSEric Cheng {
147da14cebeSEric Cheng 	mblk_t *nmp = NULL;
148da14cebeSEric Cheng 	mblk_t **nmpp = &nmp;
149da14cebeSEric Cheng 
150da14cebeSEric Cheng 	for (; mp != NULL; mp = mp->b_next) {
151da14cebeSEric Cheng 		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
152da14cebeSEric Cheng 			freemsgchain(nmp);
153da14cebeSEric Cheng 			return (NULL);
154da14cebeSEric Cheng 		}
155da14cebeSEric Cheng 
156da14cebeSEric Cheng 		nmpp = &((*nmpp)->b_next);
157da14cebeSEric Cheng 	}
158da14cebeSEric Cheng 
159da14cebeSEric Cheng 	return (nmp);
160da14cebeSEric Cheng }
161da14cebeSEric Cheng 
162da14cebeSEric Cheng /*
163c61a1653SRyan Zezeski  * Calculate the ULP checksum for IPv4. Return true if the calculation
164c61a1653SRyan Zezeski  * was successful, or false if an error occurred. If the later, place
165c61a1653SRyan Zezeski  * an error message into '*err'.
166da14cebeSEric Cheng  */
167c61a1653SRyan Zezeski static boolean_t
mac_sw_cksum_ipv4(mblk_t * mp,uint32_t ip_hdr_offset,ipha_t * ipha,const char ** err)168c61a1653SRyan Zezeski mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha,
169c61a1653SRyan Zezeski     const char **err)
170c61a1653SRyan Zezeski {
171c61a1653SRyan Zezeski 	const uint8_t proto = ipha->ipha_protocol;
172c61a1653SRyan Zezeski 	size_t len;
173c61a1653SRyan Zezeski 	const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha);
174c61a1653SRyan Zezeski 	/* ULP offset from start of L2. */
175c61a1653SRyan Zezeski 	const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz;
176c61a1653SRyan Zezeski 	ipaddr_t src, dst;
177c61a1653SRyan Zezeski 	uint32_t cksum;
178c61a1653SRyan Zezeski 	uint16_t *up;
179c61a1653SRyan Zezeski 
180c61a1653SRyan Zezeski 	/*
181c61a1653SRyan Zezeski 	 * We need a pointer to the ULP checksum. We're assuming the
182c61a1653SRyan Zezeski 	 * ULP checksum pointer resides in the first mblk. Our native
183c61a1653SRyan Zezeski 	 * TCP stack should always put the headers in the first mblk,
184c61a1653SRyan Zezeski 	 * but currently we have no way to guarantee that other
185c61a1653SRyan Zezeski 	 * clients don't spread headers (or even header fields) across
186c61a1653SRyan Zezeski 	 * mblks.
187c61a1653SRyan Zezeski 	 */
188c61a1653SRyan Zezeski 	switch (proto) {
189c61a1653SRyan Zezeski 	case IPPROTO_TCP:
190c61a1653SRyan Zezeski 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
191c61a1653SRyan Zezeski 		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
192c61a1653SRyan Zezeski 			*err = "mblk doesn't contain TCP header";
193c61a1653SRyan Zezeski 			goto bail;
194c61a1653SRyan Zezeski 		}
195c61a1653SRyan Zezeski 
196c61a1653SRyan Zezeski 		up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz);
197c61a1653SRyan Zezeski 		cksum = IP_TCP_CSUM_COMP;
198c61a1653SRyan Zezeski 		break;
199c61a1653SRyan Zezeski 
200c61a1653SRyan Zezeski 	case IPPROTO_UDP:
201c61a1653SRyan Zezeski 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
202c61a1653SRyan Zezeski 		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
203c61a1653SRyan Zezeski 			*err = "mblk doesn't contain UDP header";
204c61a1653SRyan Zezeski 			goto bail;
205c61a1653SRyan Zezeski 		}
206c61a1653SRyan Zezeski 
207c61a1653SRyan Zezeski 		up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz);
208c61a1653SRyan Zezeski 		cksum = IP_UDP_CSUM_COMP;
209c61a1653SRyan Zezeski 		break;
210c61a1653SRyan Zezeski 
211c61a1653SRyan Zezeski 	case IPPROTO_SCTP: {
212c61a1653SRyan Zezeski 		sctp_hdr_t *sctph;
213c61a1653SRyan Zezeski 
214c61a1653SRyan Zezeski 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
215c61a1653SRyan Zezeski 		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
216c61a1653SRyan Zezeski 			*err = "mblk doesn't contain SCTP header";
217c61a1653SRyan Zezeski 			goto bail;
218c61a1653SRyan Zezeski 		}
219c61a1653SRyan Zezeski 
220c61a1653SRyan Zezeski 		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
221c61a1653SRyan Zezeski 		sctph->sh_chksum = 0;
222c61a1653SRyan Zezeski 		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
223c61a1653SRyan Zezeski 		return (B_TRUE);
224c61a1653SRyan Zezeski 	}
225c61a1653SRyan Zezeski 
226c61a1653SRyan Zezeski 	default:
227c61a1653SRyan Zezeski 		*err = "unexpected protocol";
228c61a1653SRyan Zezeski 		goto bail;
229c61a1653SRyan Zezeski 
230c61a1653SRyan Zezeski 	}
231c61a1653SRyan Zezeski 
232c61a1653SRyan Zezeski 	/* Pseudo-header checksum. */
233c61a1653SRyan Zezeski 	src = ipha->ipha_src;
234c61a1653SRyan Zezeski 	dst = ipha->ipha_dst;
235c61a1653SRyan Zezeski 	len = ntohs(ipha->ipha_length) - ip_hdr_sz;
236c61a1653SRyan Zezeski 
237c61a1653SRyan Zezeski 	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
238c61a1653SRyan Zezeski 	cksum += htons(len);
239c61a1653SRyan Zezeski 
240c61a1653SRyan Zezeski 	/*
241c61a1653SRyan Zezeski 	 * We have already accounted for the pseudo checksum above.
242c61a1653SRyan Zezeski 	 * Make sure the ULP checksum field is zero before computing
243c61a1653SRyan Zezeski 	 * the rest.
244c61a1653SRyan Zezeski 	 */
245c61a1653SRyan Zezeski 	*up = 0;
246c61a1653SRyan Zezeski 	cksum = IP_CSUM(mp, ulp_offset, cksum);
247c61a1653SRyan Zezeski 	*up = (uint16_t)(cksum ? cksum : ~cksum);
248c61a1653SRyan Zezeski 
249c61a1653SRyan Zezeski 	return (B_TRUE);
250c61a1653SRyan Zezeski 
251c61a1653SRyan Zezeski bail:
252c61a1653SRyan Zezeski 	return (B_FALSE);
253c61a1653SRyan Zezeski }
254c61a1653SRyan Zezeski 
255c61a1653SRyan Zezeski /*
256c61a1653SRyan Zezeski  * Calculate the ULP checksum for IPv6. Return true if the calculation
257c61a1653SRyan Zezeski  * was successful, or false if an error occurred. If the later, place
258c61a1653SRyan Zezeski  * an error message into '*err'.
259c61a1653SRyan Zezeski  */
260c61a1653SRyan Zezeski static boolean_t
mac_sw_cksum_ipv6(mblk_t * mp,uint32_t ip_hdr_offset,const char ** err)261c61a1653SRyan Zezeski mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err)
262da14cebeSEric Cheng {
263c61a1653SRyan Zezeski 	ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset);
264c61a1653SRyan Zezeski 	const uint8_t proto = ip6h->ip6_nxt;
265c61a1653SRyan Zezeski 	const uint16_t *iphs = (uint16_t *)ip6h;
266c61a1653SRyan Zezeski 	/* ULP offset from start of L2. */
267c61a1653SRyan Zezeski 	uint32_t ulp_offset;
268c61a1653SRyan Zezeski 	size_t len;
269c61a1653SRyan Zezeski 	uint32_t cksum;
270c61a1653SRyan Zezeski 	uint16_t *up;
271c61a1653SRyan Zezeski 	uint16_t ip_hdr_sz;
272c61a1653SRyan Zezeski 
273c61a1653SRyan Zezeski 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) {
274c61a1653SRyan Zezeski 		*err = "malformed IPv6 header";
275c61a1653SRyan Zezeski 		goto bail;
276c61a1653SRyan Zezeski 	}
277c61a1653SRyan Zezeski 
278c61a1653SRyan Zezeski 	ulp_offset = ip_hdr_offset + ip_hdr_sz;
279c61a1653SRyan Zezeski 
280c61a1653SRyan Zezeski 	/*
281c61a1653SRyan Zezeski 	 * We need a pointer to the ULP checksum. We're assuming the
282c61a1653SRyan Zezeski 	 * ULP checksum pointer resides in the first mblk. Our native
283c61a1653SRyan Zezeski 	 * TCP stack should always put the headers in the first mblk,
284c61a1653SRyan Zezeski 	 * but currently we have no way to guarantee that other
285c61a1653SRyan Zezeski 	 * clients don't spread headers (or even header fields) across
286c61a1653SRyan Zezeski 	 * mblks.
287c61a1653SRyan Zezeski 	 */
288c61a1653SRyan Zezeski 	switch (proto) {
289c61a1653SRyan Zezeski 	case IPPROTO_TCP:
290c61a1653SRyan Zezeski 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
291c61a1653SRyan Zezeski 		if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
292c61a1653SRyan Zezeski 			*err = "mblk doesn't contain TCP header";
293c61a1653SRyan Zezeski 			goto bail;
294c61a1653SRyan Zezeski 		}
295c61a1653SRyan Zezeski 
296c61a1653SRyan Zezeski 		up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz);
297c61a1653SRyan Zezeski 		cksum = IP_TCP_CSUM_COMP;
298c61a1653SRyan Zezeski 		break;
299c61a1653SRyan Zezeski 
300c61a1653SRyan Zezeski 	case IPPROTO_UDP:
301c61a1653SRyan Zezeski 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
302c61a1653SRyan Zezeski 		if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
303c61a1653SRyan Zezeski 			*err = "mblk doesn't contain UDP header";
304c61a1653SRyan Zezeski 			goto bail;
305c61a1653SRyan Zezeski 		}
306c61a1653SRyan Zezeski 
307c61a1653SRyan Zezeski 		up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz);
308c61a1653SRyan Zezeski 		cksum = IP_UDP_CSUM_COMP;
309c61a1653SRyan Zezeski 		break;
310c61a1653SRyan Zezeski 
311c61a1653SRyan Zezeski 	case IPPROTO_SCTP: {
312c61a1653SRyan Zezeski 		sctp_hdr_t *sctph;
313c61a1653SRyan Zezeski 
314c61a1653SRyan Zezeski 		ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
315c61a1653SRyan Zezeski 		if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
316c61a1653SRyan Zezeski 			*err = "mblk doesn't contain SCTP header";
317c61a1653SRyan Zezeski 			goto bail;
318c61a1653SRyan Zezeski 		}
319c61a1653SRyan Zezeski 
320c61a1653SRyan Zezeski 		sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
321c61a1653SRyan Zezeski 		/*
322c61a1653SRyan Zezeski 		 * Zero out the checksum field to ensure proper
323c61a1653SRyan Zezeski 		 * checksum calculation.
324c61a1653SRyan Zezeski 		 */
325c61a1653SRyan Zezeski 		sctph->sh_chksum = 0;
326c61a1653SRyan Zezeski 		sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
327c61a1653SRyan Zezeski 		return (B_TRUE);
328c61a1653SRyan Zezeski 	}
329c61a1653SRyan Zezeski 
330c61a1653SRyan Zezeski 	default:
331c61a1653SRyan Zezeski 		*err = "unexpected protocol";
332c61a1653SRyan Zezeski 		goto bail;
333c61a1653SRyan Zezeski 	}
334c61a1653SRyan Zezeski 
335c61a1653SRyan Zezeski 	/*
336c61a1653SRyan Zezeski 	 * The payload length includes the payload and the IPv6
337c61a1653SRyan Zezeski 	 * extension headers; the idea is to subtract the extension
338c61a1653SRyan Zezeski 	 * header length to get the real payload length.
339c61a1653SRyan Zezeski 	 */
340c61a1653SRyan Zezeski 	len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN);
341c61a1653SRyan Zezeski 	cksum += len;
342c61a1653SRyan Zezeski 
343c61a1653SRyan Zezeski 	/*
344c61a1653SRyan Zezeski 	 * We accumulate the pseudo header checksum in cksum; then we
345c61a1653SRyan Zezeski 	 * call IP_CSUM to compute the checksum over the payload.
346c61a1653SRyan Zezeski 	 */
347c61a1653SRyan Zezeski 	cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] +
348c61a1653SRyan Zezeski 	    iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] +
349c61a1653SRyan Zezeski 	    iphs[16] + iphs[17] + iphs[18] + iphs[19];
350c61a1653SRyan Zezeski 	cksum = IP_CSUM(mp, ulp_offset, cksum);
351c61a1653SRyan Zezeski 
352c61a1653SRyan Zezeski 	/* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */
353c61a1653SRyan Zezeski 	if (proto == IPPROTO_UDP && cksum == 0)
354c61a1653SRyan Zezeski 		cksum = ~cksum;
355c61a1653SRyan Zezeski 
356c61a1653SRyan Zezeski 	*up = (uint16_t)cksum;
357c61a1653SRyan Zezeski 
358c61a1653SRyan Zezeski 	return (B_TRUE);
359c61a1653SRyan Zezeski 
360c61a1653SRyan Zezeski bail:
361c61a1653SRyan Zezeski 	return (B_FALSE);
362c61a1653SRyan Zezeski }
363c61a1653SRyan Zezeski 
364c61a1653SRyan Zezeski /*
365c61a1653SRyan Zezeski  * Perform software checksum on a single message, if needed. The
366c61a1653SRyan Zezeski  * emulation performed is determined by an intersection of the mblk's
367c61a1653SRyan Zezeski  * flags and the emul flags requested. The emul flags are documented
368c61a1653SRyan Zezeski  * in mac.h.
369c61a1653SRyan Zezeski  */
370c61a1653SRyan Zezeski static mblk_t *
mac_sw_cksum(mblk_t * mp,mac_emul_t emul)371c61a1653SRyan Zezeski mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
372c61a1653SRyan Zezeski {
373c61a1653SRyan Zezeski 	mblk_t *skipped_hdr = NULL;
374da14cebeSEric Cheng 	uint32_t flags, start, stuff, end, value;
375c61a1653SRyan Zezeski 	uint32_t ip_hdr_offset;
376c61a1653SRyan Zezeski 	uint16_t etype;
377c61a1653SRyan Zezeski 	size_t ip_hdr_sz;
378c61a1653SRyan Zezeski 	struct ether_header *ehp;
379c61a1653SRyan Zezeski 	const char *err = "";
380da14cebeSEric Cheng 
381c61a1653SRyan Zezeski 	/*
382c61a1653SRyan Zezeski 	 * This function should only be called from mac_hw_emul()
383c61a1653SRyan Zezeski 	 * which handles mblk chains and the shared ref case.
384c61a1653SRyan Zezeski 	 */
385c61a1653SRyan Zezeski 	ASSERT3P(mp->b_next, ==, NULL);
386da14cebeSEric Cheng 
387c61a1653SRyan Zezeski 	mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
388c61a1653SRyan Zezeski 
389c61a1653SRyan Zezeski 	flags = DB_CKSUMFLAGS(mp);
390c61a1653SRyan Zezeski 
391c61a1653SRyan Zezeski 	/* Why call this if checksum emulation isn't needed? */
392c61a1653SRyan Zezeski 	ASSERT3U(flags & (HCK_FLAGS), !=, 0);
393c61a1653SRyan Zezeski 
394c61a1653SRyan Zezeski 	/*
395c61a1653SRyan Zezeski 	 * Ethernet, and optionally VLAN header. mac_hw_emul() has
396c61a1653SRyan Zezeski 	 * already verified we have enough data to read the L2 header.
397c61a1653SRyan Zezeski 	 */
398c61a1653SRyan Zezeski 	ehp = (struct ether_header *)mp->b_rptr;
399c61a1653SRyan Zezeski 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
400c61a1653SRyan Zezeski 		struct ether_vlan_header *evhp;
401c61a1653SRyan Zezeski 
402c61a1653SRyan Zezeski 		evhp = (struct ether_vlan_header *)mp->b_rptr;
403c61a1653SRyan Zezeski 		etype = ntohs(evhp->ether_type);
404c61a1653SRyan Zezeski 		ip_hdr_offset = sizeof (struct ether_vlan_header);
405c61a1653SRyan Zezeski 	} else {
406c61a1653SRyan Zezeski 		etype = ntohs(ehp->ether_type);
407c61a1653SRyan Zezeski 		ip_hdr_offset = sizeof (struct ether_header);
408c61a1653SRyan Zezeski 	}
409c61a1653SRyan Zezeski 
410c61a1653SRyan Zezeski 	/*
411c61a1653SRyan Zezeski 	 * If this packet isn't IP, then leave it alone. We don't want
412c61a1653SRyan Zezeski 	 * to affect non-IP traffic like ARP. Assume the IP header
413c61a1653SRyan Zezeski 	 * doesn't include any options, for now. We will use the
414c61a1653SRyan Zezeski 	 * correct size later after we know there are enough bytes to
415c61a1653SRyan Zezeski 	 * at least fill out the basic header.
416c61a1653SRyan Zezeski 	 */
417c61a1653SRyan Zezeski 	switch (etype) {
418c61a1653SRyan Zezeski 	case ETHERTYPE_IP:
419c61a1653SRyan Zezeski 		ip_hdr_sz = sizeof (ipha_t);
420c61a1653SRyan Zezeski 		break;
421c61a1653SRyan Zezeski 	case ETHERTYPE_IPV6:
422c61a1653SRyan Zezeski 		ip_hdr_sz = sizeof (ip6_t);
423c61a1653SRyan Zezeski 		break;
424c61a1653SRyan Zezeski 	default:
425c61a1653SRyan Zezeski 		return (mp);
426c61a1653SRyan Zezeski 	}
427c61a1653SRyan Zezeski 
428c61a1653SRyan Zezeski 	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset);
429c61a1653SRyan Zezeski 
430c61a1653SRyan Zezeski 	/*
431c61a1653SRyan Zezeski 	 * If the first mblk of this packet contains only the ethernet
432c61a1653SRyan Zezeski 	 * header, skip past it for now. Packets with their data
433c61a1653SRyan Zezeski 	 * contained in only a single mblk can then use the fastpaths
434c61a1653SRyan Zezeski 	 * tuned to that possibility.
435c61a1653SRyan Zezeski 	 */
436c61a1653SRyan Zezeski 	if (MBLKL(mp) == ip_hdr_offset) {
437c61a1653SRyan Zezeski 		ip_hdr_offset -= MBLKL(mp);
438c61a1653SRyan Zezeski 		/* This is guaranteed by mac_hw_emul(). */
439c61a1653SRyan Zezeski 		ASSERT3P(mp->b_cont, !=, NULL);
440c61a1653SRyan Zezeski 		skipped_hdr = mp;
441c61a1653SRyan Zezeski 		mp = mp->b_cont;
442c61a1653SRyan Zezeski 	}
443c61a1653SRyan Zezeski 
444c61a1653SRyan Zezeski 	/*
445c61a1653SRyan Zezeski 	 * Both full and partial checksum rely on finding the IP
446c61a1653SRyan Zezeski 	 * header in the current mblk. Our native TCP stack honors
447c61a1653SRyan Zezeski 	 * this assumption but it's prudent to guard our future
448c61a1653SRyan Zezeski 	 * clients that might not honor this contract.
449c61a1653SRyan Zezeski 	 */
450c61a1653SRyan Zezeski 	ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz);
451c61a1653SRyan Zezeski 	if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) {
452c61a1653SRyan Zezeski 		err = "mblk doesn't contain IP header";
453c61a1653SRyan Zezeski 		goto bail;
454c61a1653SRyan Zezeski 	}
455c61a1653SRyan Zezeski 
456c61a1653SRyan Zezeski 	/*
457c61a1653SRyan Zezeski 	 * We are about to modify the header mblk; make sure we are
458c61a1653SRyan Zezeski 	 * modifying our own copy. The code that follows assumes that
459c61a1653SRyan Zezeski 	 * the IP/ULP headers exist in this mblk (and drops the
460c61a1653SRyan Zezeski 	 * message if they don't).
461c61a1653SRyan Zezeski 	 */
462c61a1653SRyan Zezeski 	if (DB_REF(mp) > 1) {
463c61a1653SRyan Zezeski 		mblk_t *tmp = copyb(mp);
464c61a1653SRyan Zezeski 
465c61a1653SRyan Zezeski 		if (tmp == NULL) {
466c61a1653SRyan Zezeski 			err = "copyb failed";
467c61a1653SRyan Zezeski 			goto bail;
468c61a1653SRyan Zezeski 		}
469c61a1653SRyan Zezeski 
470c61a1653SRyan Zezeski 		if (skipped_hdr != NULL) {
471c61a1653SRyan Zezeski 			ASSERT3P(skipped_hdr->b_cont, ==, mp);
472c61a1653SRyan Zezeski 			skipped_hdr->b_cont = tmp;
473c61a1653SRyan Zezeski 		}
474c61a1653SRyan Zezeski 
475c61a1653SRyan Zezeski 		tmp->b_cont = mp->b_cont;
476c61a1653SRyan Zezeski 		freeb(mp);
477c61a1653SRyan Zezeski 		mp = tmp;
478c61a1653SRyan Zezeski 	}
479c61a1653SRyan Zezeski 
480c61a1653SRyan Zezeski 	if (etype == ETHERTYPE_IP) {
481c61a1653SRyan Zezeski 		ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset);
482c61a1653SRyan Zezeski 
483c61a1653SRyan Zezeski 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
484c61a1653SRyan Zezeski 			if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err))
485c61a1653SRyan Zezeski 				goto bail;
486c61a1653SRyan Zezeski 		}
487c61a1653SRyan Zezeski 
488c61a1653SRyan Zezeski 		/* We always update the ULP checksum flags. */
489c61a1653SRyan Zezeski 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
490c61a1653SRyan Zezeski 			flags &= ~HCK_FULLCKSUM;
491c61a1653SRyan Zezeski 			flags |= HCK_FULLCKSUM_OK;
492c61a1653SRyan Zezeski 			value = 0;
493c61a1653SRyan Zezeski 		}
494da14cebeSEric Cheng 
495da14cebeSEric Cheng 		/*
496c61a1653SRyan Zezeski 		 * While unlikely, it's possible to write code that
497c61a1653SRyan Zezeski 		 * might end up calling mac_sw_cksum() twice on the
498c61a1653SRyan Zezeski 		 * same mblk (performing both LSO and checksum
499c61a1653SRyan Zezeski 		 * emualtion in a single mblk chain loop -- the LSO
500c61a1653SRyan Zezeski 		 * emulation inserts a new chain into the existing
501c61a1653SRyan Zezeski 		 * chain and then the loop iterates back over the new
502c61a1653SRyan Zezeski 		 * segments and emulates the checksum a second time).
503c61a1653SRyan Zezeski 		 * Normally this wouldn't be a problem, because the
504c61a1653SRyan Zezeski 		 * HCK_*_OK flags are supposed to indicate that we
505c61a1653SRyan Zezeski 		 * don't need to do peform the work. But
506c61a1653SRyan Zezeski 		 * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
507c61a1653SRyan Zezeski 		 * same value; so we cannot use these flags to
508c61a1653SRyan Zezeski 		 * determine if the IP header checksum has already
509c61a1653SRyan Zezeski 		 * been calculated or not. For this reason, we zero
510c61a1653SRyan Zezeski 		 * out the the checksum first. In the future, we
511c61a1653SRyan Zezeski 		 * should fix the HCK_* flags.
512da14cebeSEric Cheng 		 */
513c61a1653SRyan Zezeski 		if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
514c61a1653SRyan Zezeski 			ipha->ipha_hdr_checksum = 0;
515c61a1653SRyan Zezeski 			ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
516c61a1653SRyan Zezeski 			flags &= ~HCK_IPV4_HDRCKSUM;
517c61a1653SRyan Zezeski 			flags |= HCK_IPV4_HDRCKSUM_OK;
518da14cebeSEric Cheng 		}
519c61a1653SRyan Zezeski 	} else if (etype == ETHERTYPE_IPV6) {
520c61a1653SRyan Zezeski 		/* There is no IP header checksum for IPv6. */
521c61a1653SRyan Zezeski 		if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
522c61a1653SRyan Zezeski 			if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err))
523c61a1653SRyan Zezeski 				goto bail;
524c61a1653SRyan Zezeski 			flags &= ~HCK_FULLCKSUM;
525c61a1653SRyan Zezeski 			flags |= HCK_FULLCKSUM_OK;
526c61a1653SRyan Zezeski 			value = 0;
527c61a1653SRyan Zezeski 		}
528c61a1653SRyan Zezeski 	}
529c61a1653SRyan Zezeski 
530c61a1653SRyan Zezeski 	/*
531c61a1653SRyan Zezeski 	 * Partial checksum is the same for both IPv4 and IPv6.
532c61a1653SRyan Zezeski 	 */
533c61a1653SRyan Zezeski 	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
534c61a1653SRyan Zezeski 		uint16_t *up, partial, cksum;
535c61a1653SRyan Zezeski 		uchar_t *ipp; /* ptr to beginning of IP header */
536c61a1653SRyan Zezeski 
537c61a1653SRyan Zezeski 		ipp = mp->b_rptr + ip_hdr_offset;
538c61a1653SRyan Zezeski 		up = (uint16_t *)((uchar_t *)ipp + stuff);
539c61a1653SRyan Zezeski 		partial = *up;
540c61a1653SRyan Zezeski 		*up = 0;
541c61a1653SRyan Zezeski 
542c61a1653SRyan Zezeski 		ASSERT3S(end, >, start);
543c61a1653SRyan Zezeski 		cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial);
544c61a1653SRyan Zezeski 		*up = cksum != 0 ? cksum : ~cksum;
545c61a1653SRyan Zezeski 	}
546c61a1653SRyan Zezeski 
547c61a1653SRyan Zezeski 	/* We always update the ULP checksum flags. */
548c61a1653SRyan Zezeski 	if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
549c61a1653SRyan Zezeski 		flags &= ~HCK_PARTIALCKSUM;
550c61a1653SRyan Zezeski 		flags |= HCK_FULLCKSUM_OK;
551c61a1653SRyan Zezeski 		value = 0;
552c61a1653SRyan Zezeski 	}
553c61a1653SRyan Zezeski 
554c61a1653SRyan Zezeski 	mac_hcksum_set(mp, start, stuff, end, value, flags);
555c61a1653SRyan Zezeski 
556c61a1653SRyan Zezeski 	/* Don't forget to reattach the header. */
557c61a1653SRyan Zezeski 	if (skipped_hdr != NULL) {
558c61a1653SRyan Zezeski 		ASSERT3P(skipped_hdr->b_cont, ==, mp);
559da14cebeSEric Cheng 
560da14cebeSEric Cheng 		/*
561c61a1653SRyan Zezeski 		 * Duplicate the HCKSUM data into the header mblk.
562c61a1653SRyan Zezeski 		 * This mimics mac_add_vlan_tag which ensures that
563c61a1653SRyan Zezeski 		 * both the first mblk _and_ the first data bearing
564c61a1653SRyan Zezeski 		 * mblk possess the HCKSUM information. Consumers like
565c61a1653SRyan Zezeski 		 * IP will end up discarding the ether_header mblk, so
566c61a1653SRyan Zezeski 		 * for now, it is important that the data be available
567c61a1653SRyan Zezeski 		 * in both places.
568da14cebeSEric Cheng 		 */
569c61a1653SRyan Zezeski 		mac_hcksum_clone(mp, skipped_hdr);
570c61a1653SRyan Zezeski 		mp = skipped_hdr;
571c61a1653SRyan Zezeski 	}
572c61a1653SRyan Zezeski 
573c61a1653SRyan Zezeski 	return (mp);
574c61a1653SRyan Zezeski 
575c61a1653SRyan Zezeski bail:
576c61a1653SRyan Zezeski 	if (skipped_hdr != NULL) {
577c61a1653SRyan Zezeski 		ASSERT3P(skipped_hdr->b_cont, ==, mp);
578c61a1653SRyan Zezeski 		mp = skipped_hdr;
579c61a1653SRyan Zezeski 	}
580c61a1653SRyan Zezeski 
581c61a1653SRyan Zezeski 	mac_drop_pkt(mp, err);
582c61a1653SRyan Zezeski 	return (NULL);
583c61a1653SRyan Zezeski }
584c61a1653SRyan Zezeski 
585c61a1653SRyan Zezeski /*
586c61a1653SRyan Zezeski  * Build a single data segment from an LSO packet. The mblk chain
587c61a1653SRyan Zezeski  * returned, seg_head, represents the data segment and is always
588c61a1653SRyan Zezeski  * exactly seg_len bytes long. The lso_mp and offset input/output
589c61a1653SRyan Zezeski  * parameters track our position in the LSO packet. This function
590c61a1653SRyan Zezeski  * exists solely as a helper to mac_sw_lso().
591c61a1653SRyan Zezeski  *
592c61a1653SRyan Zezeski  * Case A
593c61a1653SRyan Zezeski  *
594c61a1653SRyan Zezeski  *     The current lso_mp is larger than the requested seg_len. The
595c61a1653SRyan Zezeski  *     beginning of seg_head may start at the beginning of lso_mp or
596c61a1653SRyan Zezeski  *     offset into it. In either case, a single mblk is returned, and
597c61a1653SRyan Zezeski  *     *offset is updated to reflect our new position in the current
598c61a1653SRyan Zezeski  *     lso_mp.
599c61a1653SRyan Zezeski  *
600c61a1653SRyan Zezeski  *          +----------------------------+
601c61a1653SRyan Zezeski  *          |  in *lso_mp / out *lso_mp  |
602c61a1653SRyan Zezeski  *          +----------------------------+
603c61a1653SRyan Zezeski  *          ^                        ^
604c61a1653SRyan Zezeski  *          |                        |
605c61a1653SRyan Zezeski  *          |                        |
606c61a1653SRyan Zezeski  *          |                        |
607c61a1653SRyan Zezeski  *          +------------------------+
608c61a1653SRyan Zezeski  *          |        seg_head        |
609c61a1653SRyan Zezeski  *          +------------------------+
610c61a1653SRyan Zezeski  *          ^                        ^
611c61a1653SRyan Zezeski  *          |                        |
612c61a1653SRyan Zezeski  *   in *offset = 0        out *offset = seg_len
613c61a1653SRyan Zezeski  *
614c61a1653SRyan Zezeski  *          |------   seg_len    ----|
615c61a1653SRyan Zezeski  *
616c61a1653SRyan Zezeski  *
617c61a1653SRyan Zezeski  *       +------------------------------+
618c61a1653SRyan Zezeski  *       |   in *lso_mp / out *lso_mp   |
619c61a1653SRyan Zezeski  *       +------------------------------+
620c61a1653SRyan Zezeski  *          ^                        ^
621c61a1653SRyan Zezeski  *          |                        |
622c61a1653SRyan Zezeski  *          |                        |
623c61a1653SRyan Zezeski  *          |                        |
624c61a1653SRyan Zezeski  *          +------------------------+
625c61a1653SRyan Zezeski  *          |        seg_head        |
626c61a1653SRyan Zezeski  *          +------------------------+
627c61a1653SRyan Zezeski  *          ^                        ^
628c61a1653SRyan Zezeski  *          |                        |
629c61a1653SRyan Zezeski  *   in *offset = N        out *offset = N + seg_len
630c61a1653SRyan Zezeski  *
631c61a1653SRyan Zezeski  *          |------   seg_len    ----|
632c61a1653SRyan Zezeski  *
633c61a1653SRyan Zezeski  *
634c61a1653SRyan Zezeski  *
635c61a1653SRyan Zezeski  * Case B
636c61a1653SRyan Zezeski  *
637c61a1653SRyan Zezeski  *    The requested seg_len consumes exactly the rest of the lso_mp.
638c61a1653SRyan Zezeski  *    I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
639c61a1653SRyan Zezeski  *    The seg_head may start at the beginning of the lso_mp or at some
640c61a1653SRyan Zezeski  *    offset into it. In either case we return a single mblk, reset
641c61a1653SRyan Zezeski  *    *offset to zero, and walk to the next lso_mp.
642c61a1653SRyan Zezeski  *
643c61a1653SRyan Zezeski  *          +------------------------+           +------------------------+
644c61a1653SRyan Zezeski  *          |       in *lso_mp       |---------->|      out *lso_mp       |
645c61a1653SRyan Zezeski  *          +------------------------+           +------------------------+
646c61a1653SRyan Zezeski  *          ^                        ^           ^
647c61a1653SRyan Zezeski  *          |                        |           |
648c61a1653SRyan Zezeski  *          |                        |    out *offset = 0
649c61a1653SRyan Zezeski  *          |                        |
650c61a1653SRyan Zezeski  *          +------------------------+
651c61a1653SRyan Zezeski  *          |        seg_head        |
652c61a1653SRyan Zezeski  *          +------------------------+
653c61a1653SRyan Zezeski  *          ^
654c61a1653SRyan Zezeski  *          |
655c61a1653SRyan Zezeski  *   in *offset = 0
656c61a1653SRyan Zezeski  *
657c61a1653SRyan Zezeski  *          |------   seg_len    ----|
658c61a1653SRyan Zezeski  *
659c61a1653SRyan Zezeski  *
660c61a1653SRyan Zezeski  *
661c61a1653SRyan Zezeski  *      +----------------------------+           +------------------------+
662c61a1653SRyan Zezeski  *      |         in *lso_mp         |---------->|      out *lso_mp       |
663c61a1653SRyan Zezeski  *      +----------------------------+           +------------------------+
664c61a1653SRyan Zezeski  *          ^                        ^           ^
665c61a1653SRyan Zezeski  *          |                        |           |
666c61a1653SRyan Zezeski  *          |                        |    out *offset = 0
667c61a1653SRyan Zezeski  *          |                        |
668c61a1653SRyan Zezeski  *          +------------------------+
669c61a1653SRyan Zezeski  *          |        seg_head        |
670c61a1653SRyan Zezeski  *          +------------------------+
671c61a1653SRyan Zezeski  *          ^
672c61a1653SRyan Zezeski  *          |
673c61a1653SRyan Zezeski  *   in *offset = N
674c61a1653SRyan Zezeski  *
675c61a1653SRyan Zezeski  *          |------   seg_len    ----|
676c61a1653SRyan Zezeski  *
677c61a1653SRyan Zezeski  *
678c61a1653SRyan Zezeski  * Case C
679c61a1653SRyan Zezeski  *
680c61a1653SRyan Zezeski  *    The requested seg_len is greater than the current lso_mp. In
681c61a1653SRyan Zezeski  *    this case we must consume LSO mblks until we have enough data to
682c61a1653SRyan Zezeski  *    satisfy either case (A) or (B) above. We will return multiple
683c61a1653SRyan Zezeski  *    mblks linked via b_cont, offset will be set based on the cases
684c61a1653SRyan Zezeski  *    above, and lso_mp will walk forward at least one mblk, but maybe
685c61a1653SRyan Zezeski  *    more.
686c61a1653SRyan Zezeski  *
687c61a1653SRyan Zezeski  *    N.B. This diagram is not exhaustive. The seg_head may start on
688c61a1653SRyan Zezeski  *    the beginning of an lso_mp. The seg_tail may end exactly on the
689c61a1653SRyan Zezeski  *    boundary of an lso_mp. And there may be two (in this case the
690c61a1653SRyan Zezeski  *    middle block wouldn't exist), three, or more mblks in the
691c61a1653SRyan Zezeski  *    seg_head chain. This is meant as one example of what might
692c61a1653SRyan Zezeski  *    happen. The main thing to remember is that the seg_tail mblk
693c61a1653SRyan Zezeski  *    must be one of case (A) or (B) above.
694c61a1653SRyan Zezeski  *
695c61a1653SRyan Zezeski  *  +------------------+    +----------------+    +------------------+
696c61a1653SRyan Zezeski  *  |    in *lso_mp    |--->|    *lso_mp     |--->|   out *lso_mp    |
697c61a1653SRyan Zezeski  *  +------------------+    +----------------+    +------------------+
698c61a1653SRyan Zezeski  *        ^            ^    ^                ^    ^            ^
699c61a1653SRyan Zezeski  *        |            |    |                |    |            |
700c61a1653SRyan Zezeski  *        |            |    |                |    |            |
701c61a1653SRyan Zezeski  *        |            |    |                |    |            |
702c61a1653SRyan Zezeski  *        |            |    |                |    |            |
703c61a1653SRyan Zezeski  *        +------------+    +----------------+    +------------+
704c61a1653SRyan Zezeski  *        |  seg_head  |--->|                |--->|  seg_tail  |
705c61a1653SRyan Zezeski  *        +------------+    +----------------+    +------------+
706c61a1653SRyan Zezeski  *        ^                                                    ^
707c61a1653SRyan Zezeski  *        |                                                    |
708c61a1653SRyan Zezeski  *  in *offset = N                          out *offset = MBLKL(seg_tail)
709c61a1653SRyan Zezeski  *
710c61a1653SRyan Zezeski  *        |-------------------   seg_len    -------------------|
711c61a1653SRyan Zezeski  *
712c61a1653SRyan Zezeski  */
713c61a1653SRyan Zezeski static mblk_t *
build_data_seg(mblk_t ** lso_mp,uint32_t * offset,uint32_t seg_len)714c61a1653SRyan Zezeski build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
715c61a1653SRyan Zezeski {
716c61a1653SRyan Zezeski 	mblk_t *seg_head, *seg_tail, *seg_mp;
717c61a1653SRyan Zezeski 
718c61a1653SRyan Zezeski 	ASSERT3P(*lso_mp, !=, NULL);
719c61a1653SRyan Zezeski 	ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);
720c61a1653SRyan Zezeski 
721c61a1653SRyan Zezeski 	seg_mp = dupb(*lso_mp);
722c61a1653SRyan Zezeski 	if (seg_mp == NULL)
723c61a1653SRyan Zezeski 		return (NULL);
724c61a1653SRyan Zezeski 
725c61a1653SRyan Zezeski 	seg_head = seg_mp;
726c61a1653SRyan Zezeski 	seg_tail = seg_mp;
727c61a1653SRyan Zezeski 
728c61a1653SRyan Zezeski 	/* Continue where we left off from in the lso_mp. */
729c61a1653SRyan Zezeski 	seg_mp->b_rptr += *offset;
730c61a1653SRyan Zezeski 
731c61a1653SRyan Zezeski last_mblk:
732c61a1653SRyan Zezeski 	/* Case (A) */
733c61a1653SRyan Zezeski 	if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
734c61a1653SRyan Zezeski 		*offset += seg_len;
735c61a1653SRyan Zezeski 		seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
736c61a1653SRyan Zezeski 		return (seg_head);
737c61a1653SRyan Zezeski 	}
738c61a1653SRyan Zezeski 
739c61a1653SRyan Zezeski 	/* Case (B) */
740c61a1653SRyan Zezeski 	if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
741c61a1653SRyan Zezeski 		*offset = 0;
742c61a1653SRyan Zezeski 		*lso_mp = (*lso_mp)->b_cont;
743c61a1653SRyan Zezeski 		return (seg_head);
744c61a1653SRyan Zezeski 	}
745c61a1653SRyan Zezeski 
746c61a1653SRyan Zezeski 	/* Case (C) */
747c61a1653SRyan Zezeski 	ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);
748c61a1653SRyan Zezeski 
749c61a1653SRyan Zezeski 	/*
750c61a1653SRyan Zezeski 	 * The current LSO mblk doesn't have enough data to satisfy
751c61a1653SRyan Zezeski 	 * seg_len -- continue peeling off LSO mblks to build the new
752c61a1653SRyan Zezeski 	 * segment message. If allocation fails we free the previously
753c61a1653SRyan Zezeski 	 * allocated segment mblks and return NULL.
754c61a1653SRyan Zezeski 	 */
755c61a1653SRyan Zezeski 	while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
756c61a1653SRyan Zezeski 		ASSERT3U(MBLKL(seg_mp), <=, seg_len);
757c61a1653SRyan Zezeski 		seg_len -= MBLKL(seg_mp);
758c61a1653SRyan Zezeski 		*offset = 0;
759c61a1653SRyan Zezeski 		*lso_mp = (*lso_mp)->b_cont;
760c61a1653SRyan Zezeski 		seg_mp = dupb(*lso_mp);
761c61a1653SRyan Zezeski 
762c61a1653SRyan Zezeski 		if (seg_mp == NULL) {
763c61a1653SRyan Zezeski 			freemsgchain(seg_head);
764c61a1653SRyan Zezeski 			return (NULL);
765c61a1653SRyan Zezeski 		}
766c61a1653SRyan Zezeski 
767c61a1653SRyan Zezeski 		seg_tail->b_cont = seg_mp;
768c61a1653SRyan Zezeski 		seg_tail = seg_mp;
769c61a1653SRyan Zezeski 	}
770c61a1653SRyan Zezeski 
771c61a1653SRyan Zezeski 	/*
772c61a1653SRyan Zezeski 	 * We've walked enough LSO mblks that we can now satisfy the
773c61a1653SRyan Zezeski 	 * remaining seg_len. At this point we need to jump back to
774c61a1653SRyan Zezeski 	 * determine if we have arrived at case (A) or (B).
775c61a1653SRyan Zezeski 	 */
776c61a1653SRyan Zezeski 
777c61a1653SRyan Zezeski 	/* Just to be paranoid that we didn't underflow. */
778c61a1653SRyan Zezeski 	ASSERT3U(seg_len, <, IP_MAXPACKET);
779c61a1653SRyan Zezeski 	ASSERT3U(seg_len, >, 0);
780c61a1653SRyan Zezeski 	goto last_mblk;
781c61a1653SRyan Zezeski }
782c61a1653SRyan Zezeski 
783c61a1653SRyan Zezeski /*
784c61a1653SRyan Zezeski  * Perform software segmentation of a single LSO message. Take an LSO
785c61a1653SRyan Zezeski  * message as input and return head/tail pointers as output. This
786c61a1653SRyan Zezeski  * function should not be invoked directly but instead through
787c61a1653SRyan Zezeski  * mac_hw_emul().
788c61a1653SRyan Zezeski  *
789c61a1653SRyan Zezeski  * The resulting chain is comprised of multiple (nsegs) MSS sized
790c61a1653SRyan Zezeski  * segments. Each segment will consist of two or more mblks joined by
791c61a1653SRyan Zezeski  * b_cont: a header and one or more data mblks. The header mblk is
792c61a1653SRyan Zezeski  * allocated anew for each message. The first segment's header is used
793c61a1653SRyan Zezeski  * as a template for the rest with adjustments made for things such as
794c61a1653SRyan Zezeski  * ID, sequence, length, TCP flags, etc. The data mblks reference into
795c61a1653SRyan Zezeski  * the existing LSO mblk (passed in as omp) by way of dupb(). Their
796c61a1653SRyan Zezeski  * b_rptr/b_wptr values are adjusted to reference only the fraction of
797c61a1653SRyan Zezeski  * the LSO message they are responsible for. At the successful
798c61a1653SRyan Zezeski  * completion of this function the original mblk (omp) is freed,
799c61a1653SRyan Zezeski  * leaving the newly created segment chain as the only remaining
800c61a1653SRyan Zezeski  * reference to the data.
801c61a1653SRyan Zezeski  */
802c61a1653SRyan Zezeski static void
mac_sw_lso(mblk_t * omp,mac_emul_t emul,mblk_t ** head,mblk_t ** tail,uint_t * count)803c61a1653SRyan Zezeski mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
804c61a1653SRyan Zezeski     uint_t *count)
805c61a1653SRyan Zezeski {
806c61a1653SRyan Zezeski 	uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
807c61a1653SRyan Zezeski 	uint32_t mss;
808c61a1653SRyan Zezeski 	uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
809c61a1653SRyan Zezeski 	uint32_t oleft;
810c61a1653SRyan Zezeski 	uint_t nsegs, seg;
811c61a1653SRyan Zezeski 	int len;
812c61a1653SRyan Zezeski 
813c61a1653SRyan Zezeski 	struct ether_vlan_header *oevh;
814c61a1653SRyan Zezeski 	const ipha_t *oiph;
815c61a1653SRyan Zezeski 	const tcph_t *otcph;
816c61a1653SRyan Zezeski 	ipha_t *niph;
817c61a1653SRyan Zezeski 	tcph_t *ntcph;
818c61a1653SRyan Zezeski 	uint16_t ip_id;
819c61a1653SRyan Zezeski 	uint32_t tcp_seq, tcp_sum, otcp_sum;
820c61a1653SRyan Zezeski 
821c61a1653SRyan Zezeski 	uint32_t offset;
822c61a1653SRyan Zezeski 	mblk_t *odatamp;
823c61a1653SRyan Zezeski 	mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
824c61a1653SRyan Zezeski 	mblk_t *tmptail;
825c61a1653SRyan Zezeski 
826c61a1653SRyan Zezeski 	ASSERT3P(head, !=, NULL);
827c61a1653SRyan Zezeski 	ASSERT3P(tail, !=, NULL);
828c61a1653SRyan Zezeski 	ASSERT3P(count, !=, NULL);
829c61a1653SRyan Zezeski 	ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
830c61a1653SRyan Zezeski 
831c61a1653SRyan Zezeski 	/* Assume we are dealing with a single LSO message. */
832c61a1653SRyan Zezeski 	ASSERT3P(omp->b_next, ==, NULL);
833c61a1653SRyan Zezeski 
834c61a1653SRyan Zezeski 	/*
835c61a1653SRyan Zezeski 	 * XXX: This is a hack to deal with mac_add_vlan_tag().
836c61a1653SRyan Zezeski 	 *
837c61a1653SRyan Zezeski 	 * When VLANs are in play, mac_add_vlan_tag() creates a new
838c61a1653SRyan Zezeski 	 * mblk with just the ether_vlan_header and tacks it onto the
839c61a1653SRyan Zezeski 	 * front of 'omp'. This breaks the assumptions made below;
840c61a1653SRyan Zezeski 	 * namely that the TCP/IP headers are in the first mblk. In
841c61a1653SRyan Zezeski 	 * this case, since we already have to pay the cost of LSO
842c61a1653SRyan Zezeski 	 * emulation, we simply pull up everything. While this might
843c61a1653SRyan Zezeski 	 * seem irksome, keep in mind this will only apply in a couple
844c61a1653SRyan Zezeski 	 * of scenarios: a) an LSO-capable VLAN client sending to a
845c61a1653SRyan Zezeski 	 * non-LSO-capable client over the "MAC/bridge loopback"
846c61a1653SRyan Zezeski 	 * datapath or b) an LSO-capable VLAN client is sending to a
847c61a1653SRyan Zezeski 	 * client that, for whatever reason, doesn't have DLS-bypass
848c61a1653SRyan Zezeski 	 * enabled. Finally, we have to check for both a tagged and
849c61a1653SRyan Zezeski 	 * untagged sized mblk depending on if the mblk came via
850c61a1653SRyan Zezeski 	 * mac_promisc_dispatch() or mac_rx_deliver().
851c61a1653SRyan Zezeski 	 *
852c61a1653SRyan Zezeski 	 * In the future, two things should be done:
853c61a1653SRyan Zezeski 	 *
854c61a1653SRyan Zezeski 	 * 1. This function should make use of some yet to be
855c61a1653SRyan Zezeski 	 *    implemented "mblk helpers". These helper functions would
856c61a1653SRyan Zezeski 	 *    perform all the b_cont walking for us and guarantee safe
857c61a1653SRyan Zezeski 	 *    access to the mblk data.
858c61a1653SRyan Zezeski 	 *
859c61a1653SRyan Zezeski 	 * 2. We should add some slop to the mblks so that
860c61a1653SRyan Zezeski 	 *    mac_add_vlan_tag() can just edit the first mblk instead
861c61a1653SRyan Zezeski 	 *    of allocating on the hot path.
862c61a1653SRyan Zezeski 	 */
863c61a1653SRyan Zezeski 	if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
864c61a1653SRyan Zezeski 	    MBLKL(omp) == sizeof (struct ether_header)) {
865c61a1653SRyan Zezeski 		mblk_t *tmp = msgpullup(omp, -1);
866c61a1653SRyan Zezeski 
867c61a1653SRyan Zezeski 		if (tmp == NULL) {
868c61a1653SRyan Zezeski 			mac_drop_pkt(omp, "failed to pull up");
869c61a1653SRyan Zezeski 			goto fail;
870c61a1653SRyan Zezeski 		}
871c61a1653SRyan Zezeski 
872c61a1653SRyan Zezeski 		mac_hcksum_clone(omp, tmp);
873c61a1653SRyan Zezeski 		freemsg(omp);
874c61a1653SRyan Zezeski 		omp = tmp;
875c61a1653SRyan Zezeski 	}
876c61a1653SRyan Zezeski 
877c61a1653SRyan Zezeski 	mss = DB_LSOMSS(omp);
878c61a1653SRyan Zezeski 	ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
879c61a1653SRyan Zezeski 	    sizeof (struct ether_vlan_header));
880c61a1653SRyan Zezeski 	opktlen = msgsize(omp);
881c61a1653SRyan Zezeski 
882c61a1653SRyan Zezeski 	/*
883c61a1653SRyan Zezeski 	 * First, get references to the IP and TCP headers and
884c61a1653SRyan Zezeski 	 * determine the total TCP length (header + data).
885c61a1653SRyan Zezeski 	 *
886c61a1653SRyan Zezeski 	 * Thanks to mac_hw_emul() we know that the first mblk must
887c61a1653SRyan Zezeski 	 * contain (at minimum) the full L2 header. However, this
888c61a1653SRyan Zezeski 	 * function assumes more than that. It assumes the L2/L3/L4
889c61a1653SRyan Zezeski 	 * headers are all contained in the first mblk of a message
890c61a1653SRyan Zezeski 	 * (i.e., no b_cont walking for headers). While this is a
891c61a1653SRyan Zezeski 	 * current reality (our native TCP stack and viona both
892c61a1653SRyan Zezeski 	 * enforce this) things may become more nuanced in the future
893c61a1653SRyan Zezeski 	 * (e.g. when introducing encap support or adding new
894c61a1653SRyan Zezeski 	 * clients). For now we guard against this case by dropping
895c61a1653SRyan Zezeski 	 * the packet.
896c61a1653SRyan Zezeski 	 */
897c61a1653SRyan Zezeski 	oevh = (struct ether_vlan_header *)omp->b_rptr;
898c61a1653SRyan Zezeski 	if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
899c61a1653SRyan Zezeski 		oehlen = sizeof (struct ether_vlan_header);
900c61a1653SRyan Zezeski 	else
901c61a1653SRyan Zezeski 		oehlen = sizeof (struct ether_header);
902c61a1653SRyan Zezeski 
903c61a1653SRyan Zezeski 	ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
904c61a1653SRyan Zezeski 	if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
905c61a1653SRyan Zezeski 		mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
906c61a1653SRyan Zezeski 		goto fail;
907c61a1653SRyan Zezeski 	}
908c61a1653SRyan Zezeski 
909c61a1653SRyan Zezeski 	oiph = (ipha_t *)(omp->b_rptr + oehlen);
910c61a1653SRyan Zezeski 	oiphlen = IPH_HDR_LENGTH(oiph);
911c61a1653SRyan Zezeski 	otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
912c61a1653SRyan Zezeski 	otcphlen = TCP_HDR_LENGTH(otcph);
913c61a1653SRyan Zezeski 
914c61a1653SRyan Zezeski 	/*
915c61a1653SRyan Zezeski 	 * Currently we only support LSO for TCP/IPv4.
916c61a1653SRyan Zezeski 	 */
917c61a1653SRyan Zezeski 	if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
918c61a1653SRyan Zezeski 		mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
919c61a1653SRyan Zezeski 		    IPH_HDR_VERSION(oiph));
920c61a1653SRyan Zezeski 		goto fail;
921c61a1653SRyan Zezeski 	}
922c61a1653SRyan Zezeski 
923c61a1653SRyan Zezeski 	if (oiph->ipha_protocol != IPPROTO_TCP) {
924c61a1653SRyan Zezeski 		mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
925c61a1653SRyan Zezeski 		    oiph->ipha_protocol);
926c61a1653SRyan Zezeski 		goto fail;
927c61a1653SRyan Zezeski 	}
928c61a1653SRyan Zezeski 
929c61a1653SRyan Zezeski 	if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
930c61a1653SRyan Zezeski 		mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
931c61a1653SRyan Zezeski 		goto fail;
932c61a1653SRyan Zezeski 	}
933c61a1653SRyan Zezeski 
934c61a1653SRyan Zezeski 	ohdrslen = oehlen + oiphlen + otcphlen;
935c61a1653SRyan Zezeski 	if ((len = MBLKL(omp)) < ohdrslen) {
936c61a1653SRyan Zezeski 		mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
937c61a1653SRyan Zezeski 		    ohdrslen);
938c61a1653SRyan Zezeski 		goto fail;
939c61a1653SRyan Zezeski 	}
940c61a1653SRyan Zezeski 
941c61a1653SRyan Zezeski 	/*
942c61a1653SRyan Zezeski 	 * Either we have data in the first mblk or it's just the
943c61a1653SRyan Zezeski 	 * header. In either case, we need to set rptr to the start of
944c61a1653SRyan Zezeski 	 * the TCP data.
945c61a1653SRyan Zezeski 	 */
946c61a1653SRyan Zezeski 	if (len > ohdrslen) {
947c61a1653SRyan Zezeski 		odatamp = omp;
948c61a1653SRyan Zezeski 		offset = ohdrslen;
949c61a1653SRyan Zezeski 	} else {
950c61a1653SRyan Zezeski 		ASSERT3U(len, ==, ohdrslen);
951c61a1653SRyan Zezeski 		odatamp = omp->b_cont;
952c61a1653SRyan Zezeski 		offset = 0;
953c61a1653SRyan Zezeski 	}
954c61a1653SRyan Zezeski 
955c61a1653SRyan Zezeski 	/* Make sure we still have enough data. */
956c61a1653SRyan Zezeski 	ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
957c61a1653SRyan Zezeski 
958c61a1653SRyan Zezeski 	/*
959c61a1653SRyan Zezeski 	 * If a MAC negotiated LSO then it must negotiate both
960c61a1653SRyan Zezeski 	 * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
961c61a1653SRyan Zezeski 	 * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
962c61a1653SRyan Zezeski 	 * change during LSO segmentation (only the 3 fields of the
963c61a1653SRyan Zezeski 	 * pseudo header checksum don't change: src, dst, proto). Thus
964c61a1653SRyan Zezeski 	 * we would expect these flags (HCK_IPV4_HDRCKSUM |
965c61a1653SRyan Zezeski 	 * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
966c61a1653SRyan Zezeski 	 * function to emulate those checksums in software. However,
967c61a1653SRyan Zezeski 	 * that assumes a world where we only expose LSO if the
968c61a1653SRyan Zezeski 	 * underlying hardware exposes LSO. Moving forward the plan is
969c61a1653SRyan Zezeski 	 * to assume LSO in the upper layers and have MAC perform
970c61a1653SRyan Zezeski 	 * software LSO when the underlying provider doesn't support
971c61a1653SRyan Zezeski 	 * it. In such a world, if the provider doesn't support LSO
972c61a1653SRyan Zezeski 	 * but does support hardware checksum offload, then we could
973c61a1653SRyan Zezeski 	 * simply perform the segmentation and allow the hardware to
974c61a1653SRyan Zezeski 	 * calculate the checksums. To the hardware it's just another
975c61a1653SRyan Zezeski 	 * chain of non-LSO packets.
976c61a1653SRyan Zezeski 	 */
977c61a1653SRyan Zezeski 	ASSERT3S(DB_TYPE(omp), ==, M_DATA);
978c61a1653SRyan Zezeski 	ocsum_flags = DB_CKSUMFLAGS(omp);
979c61a1653SRyan Zezeski 	ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
980c61a1653SRyan Zezeski 	ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
981c61a1653SRyan Zezeski 
982c61a1653SRyan Zezeski 	/*
983c61a1653SRyan Zezeski 	 * If hardware only provides partial checksum then software
984c61a1653SRyan Zezeski 	 * must supply the pseudo-header checksum. In the case of LSO
985c61a1653SRyan Zezeski 	 * we leave the TCP length at zero to be filled in by
986c61a1653SRyan Zezeski 	 * hardware. This function must handle two scenarios.
987c61a1653SRyan Zezeski 	 *
988c61a1653SRyan Zezeski 	 * 1. Being called by a MAC client on the Rx path to segment
989c61a1653SRyan Zezeski 	 *    an LSO packet and calculate the checksum.
990c61a1653SRyan Zezeski 	 *
991c61a1653SRyan Zezeski 	 * 2. Being called by a MAC provider to segment an LSO packet.
992c61a1653SRyan Zezeski 	 *    In this case the LSO segmentation is performed in
993c61a1653SRyan Zezeski 	 *    software (by this routine) but the MAC provider should
994c61a1653SRyan Zezeski 	 *    still calculate the TCP/IP checksums in hardware.
995c61a1653SRyan Zezeski 	 *
996c61a1653SRyan Zezeski 	 *  To elaborate on the second case: we cannot have the
997c61a1653SRyan Zezeski 	 *  scenario where IP sends LSO packets but the underlying HW
998c61a1653SRyan Zezeski 	 *  doesn't support checksum offload -- because in that case
999c61a1653SRyan Zezeski 	 *  TCP/IP would calculate the checksum in software (for the
1000c61a1653SRyan Zezeski 	 *  LSO packet) but then MAC would segment the packet and have
1001c61a1653SRyan Zezeski 	 *  to redo all the checksum work. So IP should never do LSO
1002c61a1653SRyan Zezeski 	 *  if HW doesn't support both IP and TCP checksum.
1003c61a1653SRyan Zezeski 	 */
1004c61a1653SRyan Zezeski 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1005c61a1653SRyan Zezeski 		ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
1006c61a1653SRyan Zezeski 		ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
1007c61a1653SRyan Zezeski 	}
1008c61a1653SRyan Zezeski 
1009c61a1653SRyan Zezeski 	odatalen = opktlen - ohdrslen;
1010c61a1653SRyan Zezeski 
1011c61a1653SRyan Zezeski 	/*
1012c61a1653SRyan Zezeski 	 * Subtract one to account for the case where the data length
1013c61a1653SRyan Zezeski 	 * is evenly divisible by the MSS. Add one to account for the
1014c61a1653SRyan Zezeski 	 * fact that the division will always result in one less
1015c61a1653SRyan Zezeski 	 * segment than needed.
1016c61a1653SRyan Zezeski 	 */
1017c61a1653SRyan Zezeski 	nsegs = ((odatalen - 1) / mss) + 1;
1018c61a1653SRyan Zezeski 	if (nsegs < 2) {
1019c61a1653SRyan Zezeski 		mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
1020c61a1653SRyan Zezeski 		goto fail;
1021c61a1653SRyan Zezeski 	}
1022c61a1653SRyan Zezeski 
1023c61a1653SRyan Zezeski 	DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
1024c61a1653SRyan Zezeski 	    __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
1025c61a1653SRyan Zezeski 	    nsegs);
1026c61a1653SRyan Zezeski 
1027c61a1653SRyan Zezeski 	seg_chain = NULL;
1028c61a1653SRyan Zezeski 	tmptail = seg_chain;
1029c61a1653SRyan Zezeski 	oleft = odatalen;
1030c61a1653SRyan Zezeski 
1031c61a1653SRyan Zezeski 	for (uint_t i = 0; i < nsegs; i++) {
1032c61a1653SRyan Zezeski 		boolean_t last_seg = ((i + 1) == nsegs);
1033c61a1653SRyan Zezeski 		uint32_t seg_len;
1034c61a1653SRyan Zezeski 
1035c61a1653SRyan Zezeski 		/*
1036c61a1653SRyan Zezeski 		 * If we fail to allocate, then drop the partially
1037c61a1653SRyan Zezeski 		 * allocated chain as well as the LSO packet. Let the
1038c61a1653SRyan Zezeski 		 * sender deal with the fallout.
1039c61a1653SRyan Zezeski 		 */
1040c61a1653SRyan Zezeski 		if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
1041c61a1653SRyan Zezeski 			freemsgchain(seg_chain);
1042c61a1653SRyan Zezeski 			mac_drop_pkt(omp, "failed to alloc segment header");
1043c61a1653SRyan Zezeski 			goto fail;
1044c61a1653SRyan Zezeski 		}
1045c61a1653SRyan Zezeski 		ASSERT3P(nhdrmp->b_cont, ==, NULL);
1046c61a1653SRyan Zezeski 
1047c61a1653SRyan Zezeski 		if (seg_chain == NULL) {
1048c61a1653SRyan Zezeski 			seg_chain = nhdrmp;
1049da14cebeSEric Cheng 		} else {
1050c61a1653SRyan Zezeski 			ASSERT3P(tmptail, !=, NULL);
1051c61a1653SRyan Zezeski 			tmptail->b_next = nhdrmp;
1052da14cebeSEric Cheng 		}
1053da14cebeSEric Cheng 
1054c61a1653SRyan Zezeski 		tmptail = nhdrmp;
1055c61a1653SRyan Zezeski 
1056c61a1653SRyan Zezeski 		/*
1057c61a1653SRyan Zezeski 		 * Calculate this segment's length. It's either the MSS
1058c61a1653SRyan Zezeski 		 * or whatever remains for the last segment.
1059c61a1653SRyan Zezeski 		 */
1060c61a1653SRyan Zezeski 		seg_len = last_seg ? oleft : mss;
1061c61a1653SRyan Zezeski 		ASSERT3U(seg_len, <=, mss);
1062c61a1653SRyan Zezeski 		ndatamp = build_data_seg(&odatamp, &offset, seg_len);
1063c61a1653SRyan Zezeski 
1064c61a1653SRyan Zezeski 		if (ndatamp == NULL) {
1065c61a1653SRyan Zezeski 			freemsgchain(seg_chain);
1066c61a1653SRyan Zezeski 			mac_drop_pkt(omp, "LSO failed to segment data");
1067c61a1653SRyan Zezeski 			goto fail;
1068da14cebeSEric Cheng 		}
1069da14cebeSEric Cheng 
1070c61a1653SRyan Zezeski 		/* Attach data mblk to header mblk. */
1071c61a1653SRyan Zezeski 		nhdrmp->b_cont = ndatamp;
1072c61a1653SRyan Zezeski 		DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
1073c61a1653SRyan Zezeski 		ASSERT3U(seg_len, <=, oleft);
1074c61a1653SRyan Zezeski 		oleft -= seg_len;
1075c61a1653SRyan Zezeski 	}
1076da14cebeSEric Cheng 
1077c61a1653SRyan Zezeski 	/* We should have consumed entire LSO msg. */
1078c61a1653SRyan Zezeski 	ASSERT3S(oleft, ==, 0);
1079c61a1653SRyan Zezeski 	ASSERT3P(odatamp, ==, NULL);
1080da14cebeSEric Cheng 
1081c61a1653SRyan Zezeski 	/*
1082c61a1653SRyan Zezeski 	 * All seg data mblks are referenced by the header mblks, null
1083c61a1653SRyan Zezeski 	 * out this pointer to catch any bad derefs.
1084c61a1653SRyan Zezeski 	 */
1085c61a1653SRyan Zezeski 	ndatamp = NULL;
1086da14cebeSEric Cheng 
1087c61a1653SRyan Zezeski 	/*
1088c61a1653SRyan Zezeski 	 * Set headers and checksum for first segment.
1089c61a1653SRyan Zezeski 	 */
1090c61a1653SRyan Zezeski 	nhdrmp = seg_chain;
1091c61a1653SRyan Zezeski 	bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
1092c61a1653SRyan Zezeski 	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1093c61a1653SRyan Zezeski 	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1094c61a1653SRyan Zezeski 	ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
1095c61a1653SRyan Zezeski 	niph->ipha_length = htons(oiphlen + otcphlen + mss);
1096c61a1653SRyan Zezeski 	niph->ipha_hdr_checksum = 0;
1097c61a1653SRyan Zezeski 	ip_id = ntohs(niph->ipha_ident);
1098c61a1653SRyan Zezeski 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1099c61a1653SRyan Zezeski 	tcp_seq = BE32_TO_U32(ntcph->th_seq);
1100c61a1653SRyan Zezeski 	tcp_seq += mss;
1101c61a1653SRyan Zezeski 
1102c61a1653SRyan Zezeski 	/*
1103c61a1653SRyan Zezeski 	 * The first segment shouldn't:
1104c61a1653SRyan Zezeski 	 *
1105c61a1653SRyan Zezeski 	 *	o indicate end of data transmission (FIN),
1106c61a1653SRyan Zezeski 	 *	o indicate immediate handling of the data (PUSH).
1107c61a1653SRyan Zezeski 	 */
1108c61a1653SRyan Zezeski 	ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1109c61a1653SRyan Zezeski 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1110c61a1653SRyan Zezeski 
1111c61a1653SRyan Zezeski 	/*
1112c61a1653SRyan Zezeski 	 * If the underlying HW provides partial checksum, then make
1113c61a1653SRyan Zezeski 	 * sure to correct the pseudo header checksum before calling
1114c61a1653SRyan Zezeski 	 * mac_sw_cksum(). The native TCP stack doesn't include the
1115c61a1653SRyan Zezeski 	 * length field in the pseudo header when LSO is in play -- so
1116c61a1653SRyan Zezeski 	 * we need to calculate it here.
1117c61a1653SRyan Zezeski 	 */
1118c61a1653SRyan Zezeski 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1119c61a1653SRyan Zezeski 		DB_CKSUMSTART(nhdrmp) = ocsum_start;
1120c61a1653SRyan Zezeski 		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1121c61a1653SRyan Zezeski 		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1122c61a1653SRyan Zezeski 		tcp_sum = BE16_TO_U16(ntcph->th_sum);
1123c61a1653SRyan Zezeski 		otcp_sum = tcp_sum;
1124c61a1653SRyan Zezeski 		tcp_sum += mss + otcphlen;
1125c61a1653SRyan Zezeski 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1126c61a1653SRyan Zezeski 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1127c61a1653SRyan Zezeski 	}
1128c61a1653SRyan Zezeski 
1129c61a1653SRyan Zezeski 	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1130c61a1653SRyan Zezeski 	    (emul & MAC_HWCKSUM_EMULS)) {
1131c61a1653SRyan Zezeski 		next_nhdrmp = nhdrmp->b_next;
1132c61a1653SRyan Zezeski 		nhdrmp->b_next = NULL;
1133c61a1653SRyan Zezeski 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1134c61a1653SRyan Zezeski 		nhdrmp->b_next = next_nhdrmp;
1135c61a1653SRyan Zezeski 		next_nhdrmp = NULL;
1136c61a1653SRyan Zezeski 
1137c61a1653SRyan Zezeski 		/*
1138c61a1653SRyan Zezeski 		 * We may have freed the nhdrmp argument during
1139c61a1653SRyan Zezeski 		 * checksum emulation, make sure that seg_chain
1140c61a1653SRyan Zezeski 		 * references a valid mblk.
1141c61a1653SRyan Zezeski 		 */
1142c61a1653SRyan Zezeski 		seg_chain = nhdrmp;
1143c61a1653SRyan Zezeski 	}
1144c61a1653SRyan Zezeski 
1145c61a1653SRyan Zezeski 	ASSERT3P(nhdrmp, !=, NULL);
1146c61a1653SRyan Zezeski 
1147c61a1653SRyan Zezeski 	seg = 1;
1148c61a1653SRyan Zezeski 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1149c61a1653SRyan Zezeski 	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1150c61a1653SRyan Zezeski 	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
1151c61a1653SRyan Zezeski 	    uint_t, seg);
1152c61a1653SRyan Zezeski 	seg++;
1153c61a1653SRyan Zezeski 
1154c61a1653SRyan Zezeski 	/* There better be at least 2 segs. */
1155c61a1653SRyan Zezeski 	ASSERT3P(nhdrmp->b_next, !=, NULL);
1156c61a1653SRyan Zezeski 	prev_nhdrmp = nhdrmp;
1157c61a1653SRyan Zezeski 	nhdrmp = nhdrmp->b_next;
1158c61a1653SRyan Zezeski 
1159c61a1653SRyan Zezeski 	/*
1160c61a1653SRyan Zezeski 	 * Now adjust the headers of the middle segments. For each
1161c61a1653SRyan Zezeski 	 * header we need to adjust the following.
1162c61a1653SRyan Zezeski 	 *
1163c61a1653SRyan Zezeski 	 *	o IP ID
1164c61a1653SRyan Zezeski 	 *	o IP length
1165c61a1653SRyan Zezeski 	 *	o TCP sequence
1166c61a1653SRyan Zezeski 	 *	o TCP flags
1167c61a1653SRyan Zezeski 	 *	o cksum flags
1168c61a1653SRyan Zezeski 	 *	o cksum values (if MAC_HWCKSUM_EMUL is set)
1169c61a1653SRyan Zezeski 	 */
1170c61a1653SRyan Zezeski 	for (; seg < nsegs; seg++) {
1171c61a1653SRyan Zezeski 		/*
1172c61a1653SRyan Zezeski 		 * We use seg_chain as a reference to the first seg
1173c61a1653SRyan Zezeski 		 * header mblk -- this first header is a template for
1174c61a1653SRyan Zezeski 		 * the rest of the segments. This copy will include
1175c61a1653SRyan Zezeski 		 * the now updated checksum values from the first
1176c61a1653SRyan Zezeski 		 * header. We must reset these checksum values to
1177c61a1653SRyan Zezeski 		 * their original to make sure we produce the correct
1178c61a1653SRyan Zezeski 		 * value.
1179c61a1653SRyan Zezeski 		 */
1180c61a1653SRyan Zezeski 		bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1181c61a1653SRyan Zezeski 		nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1182c61a1653SRyan Zezeski 		niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1183c61a1653SRyan Zezeski 		niph->ipha_ident = htons(++ip_id);
1184c61a1653SRyan Zezeski 		ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
1185c61a1653SRyan Zezeski 		niph->ipha_length = htons(oiphlen + otcphlen + mss);
1186c61a1653SRyan Zezeski 		niph->ipha_hdr_checksum = 0;
1187c61a1653SRyan Zezeski 		ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1188c61a1653SRyan Zezeski 		U32_TO_BE32(tcp_seq, ntcph->th_seq);
1189c61a1653SRyan Zezeski 		tcp_seq += mss;
1190c61a1653SRyan Zezeski 		/*
1191c61a1653SRyan Zezeski 		 * Just like the first segment, the middle segments
1192c61a1653SRyan Zezeski 		 * shouldn't have these flags set.
1193c61a1653SRyan Zezeski 		 */
1194c61a1653SRyan Zezeski 		ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
1195c61a1653SRyan Zezeski 		DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1196c61a1653SRyan Zezeski 
1197c61a1653SRyan Zezeski 		if (ocsum_flags & HCK_PARTIALCKSUM) {
1198da14cebeSEric Cheng 			/*
1199c61a1653SRyan Zezeski 			 * First and middle segs have same
1200c61a1653SRyan Zezeski 			 * pseudo-header checksum.
1201da14cebeSEric Cheng 			 */
1202c61a1653SRyan Zezeski 			U16_TO_BE16(tcp_sum, ntcph->th_sum);
1203c61a1653SRyan Zezeski 			DB_CKSUMSTART(nhdrmp) = ocsum_start;
1204c61a1653SRyan Zezeski 			DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1205c61a1653SRyan Zezeski 			DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1206c61a1653SRyan Zezeski 		}
1207da14cebeSEric Cheng 
1208c61a1653SRyan Zezeski 		if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1209c61a1653SRyan Zezeski 		    (emul & MAC_HWCKSUM_EMULS)) {
1210c61a1653SRyan Zezeski 			next_nhdrmp = nhdrmp->b_next;
1211c61a1653SRyan Zezeski 			nhdrmp->b_next = NULL;
1212c61a1653SRyan Zezeski 			nhdrmp = mac_sw_cksum(nhdrmp, emul);
1213c61a1653SRyan Zezeski 			nhdrmp->b_next = next_nhdrmp;
1214c61a1653SRyan Zezeski 			next_nhdrmp = NULL;
1215c61a1653SRyan Zezeski 			/* We may have freed the original nhdrmp. */
1216c61a1653SRyan Zezeski 			prev_nhdrmp->b_next = nhdrmp;
1217c61a1653SRyan Zezeski 		}
1218da14cebeSEric Cheng 
1219c61a1653SRyan Zezeski 		DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1220c61a1653SRyan Zezeski 		    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1221c61a1653SRyan Zezeski 		    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
1222c61a1653SRyan Zezeski 		    uint_t, mss, uint_t, seg);
12230dc2366fSVenugopal Iyer 
1224c61a1653SRyan Zezeski 		ASSERT3P(nhdrmp->b_next, !=, NULL);
1225c61a1653SRyan Zezeski 		prev_nhdrmp = nhdrmp;
1226c61a1653SRyan Zezeski 		nhdrmp = nhdrmp->b_next;
1227c61a1653SRyan Zezeski 	}
1228c61a1653SRyan Zezeski 
1229c61a1653SRyan Zezeski 	/* Make sure we are on the last segment. */
1230c61a1653SRyan Zezeski 	ASSERT3U(seg, ==, nsegs);
1231c61a1653SRyan Zezeski 	ASSERT3P(nhdrmp->b_next, ==, NULL);
1232c61a1653SRyan Zezeski 
1233c61a1653SRyan Zezeski 	/*
1234c61a1653SRyan Zezeski 	 * Now we set the last segment header. The difference being
1235c61a1653SRyan Zezeski 	 * that FIN/PSH/RST flags are allowed.
1236c61a1653SRyan Zezeski 	 */
1237c61a1653SRyan Zezeski 	bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
1238c61a1653SRyan Zezeski 	nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
1239c61a1653SRyan Zezeski 	niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
1240c61a1653SRyan Zezeski 	niph->ipha_ident = htons(++ip_id);
1241c61a1653SRyan Zezeski 	len = msgsize(nhdrmp->b_cont);
1242c61a1653SRyan Zezeski 	ASSERT3S(len, >, 0);
1243c61a1653SRyan Zezeski 	niph->ipha_length = htons(oiphlen + otcphlen + len);
1244c61a1653SRyan Zezeski 	niph->ipha_hdr_checksum = 0;
1245c61a1653SRyan Zezeski 	ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
1246c61a1653SRyan Zezeski 	U32_TO_BE32(tcp_seq, ntcph->th_seq);
1247c61a1653SRyan Zezeski 
1248c61a1653SRyan Zezeski 	DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
1249c61a1653SRyan Zezeski 	if (ocsum_flags & HCK_PARTIALCKSUM) {
1250c61a1653SRyan Zezeski 		DB_CKSUMSTART(nhdrmp) = ocsum_start;
1251c61a1653SRyan Zezeski 		DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
1252c61a1653SRyan Zezeski 		DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
1253c61a1653SRyan Zezeski 		tcp_sum = otcp_sum;
1254c61a1653SRyan Zezeski 		tcp_sum += len + otcphlen;
1255c61a1653SRyan Zezeski 		tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
1256c61a1653SRyan Zezeski 		U16_TO_BE16(tcp_sum, ntcph->th_sum);
1257c61a1653SRyan Zezeski 	}
1258c61a1653SRyan Zezeski 
1259c61a1653SRyan Zezeski 	if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
1260c61a1653SRyan Zezeski 	    (emul & MAC_HWCKSUM_EMULS)) {
1261c61a1653SRyan Zezeski 		/* This should be the last mblk. */
1262c61a1653SRyan Zezeski 		ASSERT3P(nhdrmp->b_next, ==, NULL);
1263c61a1653SRyan Zezeski 		nhdrmp = mac_sw_cksum(nhdrmp, emul);
1264c61a1653SRyan Zezeski 		prev_nhdrmp->b_next = nhdrmp;
1265c61a1653SRyan Zezeski 	}
1266c61a1653SRyan Zezeski 
1267c61a1653SRyan Zezeski 	DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
1268c61a1653SRyan Zezeski 	    (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
1269c61a1653SRyan Zezeski 	    (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
1270c61a1653SRyan Zezeski 	    uint_t, seg);
1271c61a1653SRyan Zezeski 
1272c61a1653SRyan Zezeski 	/*
1273c61a1653SRyan Zezeski 	 * Free the reference to the original LSO message as it is
1274c61a1653SRyan Zezeski 	 * being replaced by seg_chain.
1275c61a1653SRyan Zezeski 	 */
1276c61a1653SRyan Zezeski 	freemsg(omp);
1277c61a1653SRyan Zezeski 	*head = seg_chain;
1278c61a1653SRyan Zezeski 	*tail = nhdrmp;
1279c61a1653SRyan Zezeski 	*count = nsegs;
1280c61a1653SRyan Zezeski 	return;
1281c61a1653SRyan Zezeski 
1282c61a1653SRyan Zezeski fail:
1283c61a1653SRyan Zezeski 	*head = NULL;
1284c61a1653SRyan Zezeski 	*tail = NULL;
1285c61a1653SRyan Zezeski 	*count = 0;
1286c61a1653SRyan Zezeski }
1287c61a1653SRyan Zezeski 
1288c61a1653SRyan Zezeski #define	HCK_NEEDED	(HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
1289c61a1653SRyan Zezeski 
1290c61a1653SRyan Zezeski /*
1291c61a1653SRyan Zezeski  * Emulate various hardware offload features in software. Take a chain
1292c61a1653SRyan Zezeski  * of packets as input and emulate the hardware features specified in
1293c61a1653SRyan Zezeski  * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
1294c61a1653SRyan Zezeski  * pointer given as input, and its tail pointer is written to
1295c61a1653SRyan Zezeski  * '*otail'. The number of packets in the new chain is written to
1296c61a1653SRyan Zezeski  * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
1297c61a1653SRyan Zezeski  * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
1298c61a1653SRyan Zezeski  * which case 'mp_chain' will simply stay a NULL chain.
1299c61a1653SRyan Zezeski  *
1300c61a1653SRyan Zezeski  * While unlikely, it is technically possible that this function could
1301c61a1653SRyan Zezeski  * receive a non-NULL chain as input and return a NULL chain as output
1302c61a1653SRyan Zezeski  * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
1303c61a1653SRyan Zezeski  * zero). This could happen if all the packets in the chain are
1304c61a1653SRyan Zezeski  * dropped or if we fail to allocate new mblks. In this case, there is
1305c61a1653SRyan Zezeski  * nothing for the caller to free. In any event, the caller shouldn't
1306c61a1653SRyan Zezeski  * assume that '*mp_chain' is non-NULL on return.
1307c61a1653SRyan Zezeski  *
1308c61a1653SRyan Zezeski  * This function was written with three main use cases in mind.
1309c61a1653SRyan Zezeski  *
1310c61a1653SRyan Zezeski  * 1. To emulate hardware offloads when traveling mac-loopback (two
1311c61a1653SRyan Zezeski  *    clients on the same mac). This is wired up in mac_tx_send().
1312c61a1653SRyan Zezeski  *
1313c61a1653SRyan Zezeski  * 2. To provide hardware offloads to the client when the underlying
1314c61a1653SRyan Zezeski  *    provider cannot. This is currently wired up in mac_tx() but we
1315c61a1653SRyan Zezeski  *    still only negotiate offloads when the underlying provider
1316c61a1653SRyan Zezeski  *    supports them.
1317c61a1653SRyan Zezeski  *
1318c61a1653SRyan Zezeski  * 3. To emulate real hardware in simnet.
1319c61a1653SRyan Zezeski  */
1320c61a1653SRyan Zezeski void
mac_hw_emul(mblk_t ** mp_chain,mblk_t ** otail,uint_t * ocount,mac_emul_t emul)1321c61a1653SRyan Zezeski mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
1322c61a1653SRyan Zezeski {
1323c61a1653SRyan Zezeski 	mblk_t *head = NULL, *tail = NULL;
1324c61a1653SRyan Zezeski 	uint_t count = 0;
1325c61a1653SRyan Zezeski 
1326c61a1653SRyan Zezeski 	ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
1327c61a1653SRyan Zezeski 	ASSERT3P(mp_chain, !=, NULL);
1328c61a1653SRyan Zezeski 
1329c61a1653SRyan Zezeski 	for (mblk_t *mp = *mp_chain; mp != NULL; ) {
1330c61a1653SRyan Zezeski 		mblk_t *tmp, *next, *tmphead, *tmptail;
1331c61a1653SRyan Zezeski 		struct ether_header *ehp;
1332c61a1653SRyan Zezeski 		uint32_t flags;
1333c61a1653SRyan Zezeski 		uint_t len = MBLKL(mp), l2len;
1334c61a1653SRyan Zezeski 
1335c61a1653SRyan Zezeski 		/* Perform LSO/cksum one message at a time. */
1336c61a1653SRyan Zezeski 		next = mp->b_next;
1337c61a1653SRyan Zezeski 		mp->b_next = NULL;
1338c61a1653SRyan Zezeski 
1339c61a1653SRyan Zezeski 		/*
1340c61a1653SRyan Zezeski 		 * For our sanity the first mblk should contain at
1341c61a1653SRyan Zezeski 		 * least the full L2 header.
1342c61a1653SRyan Zezeski 		 */
1343c61a1653SRyan Zezeski 		if (len < sizeof (struct ether_header)) {
1344c61a1653SRyan Zezeski 			mac_drop_pkt(mp, "packet too short (A): %u", len);
1345c61a1653SRyan Zezeski 			mp = next;
1346c61a1653SRyan Zezeski 			continue;
1347da14cebeSEric Cheng 		}
1348da14cebeSEric Cheng 
1349c61a1653SRyan Zezeski 		ehp = (struct ether_header *)mp->b_rptr;
1350c61a1653SRyan Zezeski 		if (ntohs(ehp->ether_type) == VLAN_TPID)
1351c61a1653SRyan Zezeski 			l2len = sizeof (struct ether_vlan_header);
1352c61a1653SRyan Zezeski 		else
1353c61a1653SRyan Zezeski 			l2len = sizeof (struct ether_header);
1354c61a1653SRyan Zezeski 
1355c61a1653SRyan Zezeski 		/*
1356c61a1653SRyan Zezeski 		 * If the first mblk is solely the L2 header, then
1357c61a1653SRyan Zezeski 		 * there better be more data.
1358c61a1653SRyan Zezeski 		 */
1359c61a1653SRyan Zezeski 		if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
1360c61a1653SRyan Zezeski 			mac_drop_pkt(mp, "packet too short (C): %u", len);
1361c61a1653SRyan Zezeski 			mp = next;
1362c61a1653SRyan Zezeski 			continue;
1363c61a1653SRyan Zezeski 		}
1364da14cebeSEric Cheng 
1365c61a1653SRyan Zezeski 		DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);
1366c61a1653SRyan Zezeski 
1367c61a1653SRyan Zezeski 		/*
1368c61a1653SRyan Zezeski 		 * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
1369c61a1653SRyan Zezeski 		 * because we don't want to mask-out the LSO flag.
1370c61a1653SRyan Zezeski 		 */
1371c61a1653SRyan Zezeski 		flags = DB_CKSUMFLAGS(mp);
1372da14cebeSEric Cheng 
1373c61a1653SRyan Zezeski 		if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
1374c61a1653SRyan Zezeski 			uint_t tmpcount = 0;
1375da14cebeSEric Cheng 
1376da14cebeSEric Cheng 			/*
1377c61a1653SRyan Zezeski 			 * LSO fix-up handles checksum emulation
1378c61a1653SRyan Zezeski 			 * inline (if requested). It also frees mp.
1379da14cebeSEric Cheng 			 */
1380c61a1653SRyan Zezeski 			mac_sw_lso(mp, emul, &tmphead, &tmptail,
1381c61a1653SRyan Zezeski 			    &tmpcount);
1382c61a1653SRyan Zezeski 			if (tmphead == NULL) {
1383c61a1653SRyan Zezeski 				/* mac_sw_lso() freed the mp. */
1384c61a1653SRyan Zezeski 				mp = next;
1385c61a1653SRyan Zezeski 				continue;
1386c61a1653SRyan Zezeski 			}
1387c61a1653SRyan Zezeski 			count += tmpcount;
1388c61a1653SRyan Zezeski 		} else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
1389c61a1653SRyan Zezeski 			tmp = mac_sw_cksum(mp, emul);
1390c61a1653SRyan Zezeski 			if (tmp == NULL) {
1391c61a1653SRyan Zezeski 				/* mac_sw_cksum() freed the mp. */
1392c61a1653SRyan Zezeski 				mp = next;
1393c61a1653SRyan Zezeski 				continue;
1394c61a1653SRyan Zezeski 			}
1395c61a1653SRyan Zezeski 			tmphead = tmp;
1396c61a1653SRyan Zezeski 			tmptail = tmp;
1397c61a1653SRyan Zezeski 			count++;
1398c61a1653SRyan Zezeski 		} else {
1399c61a1653SRyan Zezeski 			/* There is nothing to emulate. */
1400c61a1653SRyan Zezeski 			tmp = mp;
1401c61a1653SRyan Zezeski 			tmphead = tmp;
1402c61a1653SRyan Zezeski 			tmptail = tmp;
1403c61a1653SRyan Zezeski 			count++;
1404da14cebeSEric Cheng 		}
1405da14cebeSEric Cheng 
1406c61a1653SRyan Zezeski 		/*
1407c61a1653SRyan Zezeski 		 * The tmp mblk chain is either the start of the new
1408c61a1653SRyan Zezeski 		 * chain or added to the tail of the new chain.
1409c61a1653SRyan Zezeski 		 */
1410c61a1653SRyan Zezeski 		if (head == NULL) {
1411c61a1653SRyan Zezeski 			head = tmphead;
1412c61a1653SRyan Zezeski 			tail = tmptail;
1413c61a1653SRyan Zezeski 		} else {
1414c61a1653SRyan Zezeski 			/* Attach the new mblk to the end of the new chain. */
1415c61a1653SRyan Zezeski 			tail->b_next = tmphead;
1416c61a1653SRyan Zezeski 			tail = tmptail;
1417c61a1653SRyan Zezeski 		}
1418c61a1653SRyan Zezeski 
1419c61a1653SRyan Zezeski 		mp = next;
1420da14cebeSEric Cheng 	}
1421da14cebeSEric Cheng 
1422c61a1653SRyan Zezeski 	*mp_chain = head;
1423c61a1653SRyan Zezeski 
1424c61a1653SRyan Zezeski 	if (otail != NULL)
1425c61a1653SRyan Zezeski 		*otail = tail;
1426c61a1653SRyan Zezeski 
1427c61a1653SRyan Zezeski 	if (ocount != NULL)
1428c61a1653SRyan Zezeski 		*ocount = count;
1429da14cebeSEric Cheng }
1430da14cebeSEric Cheng 
1431da14cebeSEric Cheng /*
1432da14cebeSEric Cheng  * Add VLAN tag to the specified mblk.
1433da14cebeSEric Cheng  */
1434da14cebeSEric Cheng mblk_t *
mac_add_vlan_tag(mblk_t * mp,uint_t pri,uint16_t vid)1435da14cebeSEric Cheng mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid)
1436da14cebeSEric Cheng {
1437da14cebeSEric Cheng 	mblk_t *hmp;
1438da14cebeSEric Cheng 	struct ether_vlan_header *evhp;
1439da14cebeSEric Cheng 	struct ether_header *ehp;
1440da14cebeSEric Cheng 
1441da14cebeSEric Cheng 	ASSERT(pri != 0 || vid != 0);
1442da14cebeSEric Cheng 
1443da14cebeSEric Cheng 	/*
1444da14cebeSEric Cheng 	 * Allocate an mblk for the new tagged ethernet header,
1445da14cebeSEric Cheng 	 * and copy the MAC addresses and ethertype from the
1446da14cebeSEric Cheng 	 * original header.
1447da14cebeSEric Cheng 	 */
1448da14cebeSEric Cheng 
1449da14cebeSEric Cheng 	hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
1450da14cebeSEric Cheng 	if (hmp == NULL) {
1451da14cebeSEric Cheng 		freemsg(mp);
1452da14cebeSEric Cheng 		return (NULL);
1453da14cebeSEric Cheng 	}
1454da14cebeSEric Cheng 
1455da14cebeSEric Cheng 	evhp = (struct ether_vlan_header *)hmp->b_rptr;
1456da14cebeSEric Cheng 	ehp = (struct ether_header *)mp->b_rptr;
1457da14cebeSEric Cheng 
1458da14cebeSEric Cheng 	bcopy(ehp, evhp, (ETHERADDRL * 2));
1459da14cebeSEric Cheng 	evhp->ether_type = ehp->ether_type;
1460da14cebeSEric Cheng 	evhp->ether_tpid = htons(ETHERTYPE_VLAN);
1461da14cebeSEric Cheng 
1462da14cebeSEric Cheng 	hmp->b_wptr += sizeof (struct ether_vlan_header);
1463da14cebeSEric Cheng 	mp->b_rptr += sizeof (struct ether_header);
1464da14cebeSEric Cheng 
1465da14cebeSEric Cheng 	/*
1466da14cebeSEric Cheng 	 * Free the original message if it's now empty. Link the
1467da14cebeSEric Cheng 	 * rest of messages to the header message.
1468da14cebeSEric Cheng 	 */
1469ec71f88eSPatrick Mooney 	mac_hcksum_clone(mp, hmp);
1470da14cebeSEric Cheng 	if (MBLKL(mp) == 0) {
1471da14cebeSEric Cheng 		hmp->b_cont = mp->b_cont;
1472da14cebeSEric Cheng 		freeb(mp);
1473da14cebeSEric Cheng 	} else {
1474da14cebeSEric Cheng 		hmp->b_cont = mp;
1475da14cebeSEric Cheng 	}
1476da14cebeSEric Cheng 	ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header));
1477da14cebeSEric Cheng 
1478da14cebeSEric Cheng 	/*
1479da14cebeSEric Cheng 	 * Initialize the new TCI (Tag Control Information).
1480da14cebeSEric Cheng 	 */
1481da14cebeSEric Cheng 	evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid));
1482da14cebeSEric Cheng 
1483da14cebeSEric Cheng 	return (hmp);
1484da14cebeSEric Cheng }
1485da14cebeSEric Cheng 
1486da14cebeSEric Cheng /*
1487da14cebeSEric Cheng  * Adds a VLAN tag with the specified VID and priority to each mblk of
1488da14cebeSEric Cheng  * the specified chain.
1489da14cebeSEric Cheng  */
1490da14cebeSEric Cheng mblk_t *
mac_add_vlan_tag_chain(mblk_t * mp_chain,uint_t pri,uint16_t vid)1491da14cebeSEric Cheng mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid)
1492da14cebeSEric Cheng {
1493da14cebeSEric Cheng 	mblk_t *next_mp, **prev, *mp;
1494da14cebeSEric Cheng 
1495da14cebeSEric Cheng 	mp = mp_chain;
1496da14cebeSEric Cheng 	prev = &mp_chain;
1497da14cebeSEric Cheng 
1498da14cebeSEric Cheng 	while (mp != NULL) {
1499da14cebeSEric Cheng 		next_mp = mp->b_next;
1500da14cebeSEric Cheng 		mp->b_next = NULL;
1501da14cebeSEric Cheng 		if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) {
1502da14cebeSEric Cheng 			freemsgchain(next_mp);
1503da14cebeSEric Cheng 			break;
1504da14cebeSEric Cheng 		}
1505da14cebeSEric Cheng 		*prev = mp;
1506da14cebeSEric Cheng 		prev = &mp->b_next;
1507da14cebeSEric Cheng 		mp = mp->b_next = next_mp;
1508da14cebeSEric Cheng 	}
1509da14cebeSEric Cheng 
1510da14cebeSEric Cheng 	return (mp_chain);
1511da14cebeSEric Cheng }
1512da14cebeSEric Cheng 
1513da14cebeSEric Cheng /*
1514da14cebeSEric Cheng  * Strip VLAN tag
1515da14cebeSEric Cheng  */
1516da14cebeSEric Cheng mblk_t *
mac_strip_vlan_tag(mblk_t * mp)1517da14cebeSEric Cheng mac_strip_vlan_tag(mblk_t *mp)
1518da14cebeSEric Cheng {
1519da14cebeSEric Cheng 	mblk_t *newmp;
1520da14cebeSEric Cheng 	struct ether_vlan_header *evhp;
1521da14cebeSEric Cheng 
1522da14cebeSEric Cheng 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1523da14cebeSEric Cheng 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
1524da14cebeSEric Cheng 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1525da14cebeSEric Cheng 
1526da14cebeSEric Cheng 		if (DB_REF(mp) > 1) {
1527da14cebeSEric Cheng 			newmp = copymsg(mp);
1528da14cebeSEric Cheng 			if (newmp == NULL)
1529da14cebeSEric Cheng 				return (NULL);
1530da14cebeSEric Cheng 			freemsg(mp);
1531da14cebeSEric Cheng 			mp = newmp;
1532da14cebeSEric Cheng 		}
1533da14cebeSEric Cheng 
1534da14cebeSEric Cheng 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1535da14cebeSEric Cheng 
1536da14cebeSEric Cheng 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1537da14cebeSEric Cheng 		mp->b_rptr += VLAN_TAGSZ;
1538da14cebeSEric Cheng 	}
1539da14cebeSEric Cheng 	return (mp);
1540da14cebeSEric Cheng }
1541da14cebeSEric Cheng 
1542da14cebeSEric Cheng /*
1543da14cebeSEric Cheng  * Strip VLAN tag from each mblk of the chain.
1544da14cebeSEric Cheng  */
1545da14cebeSEric Cheng mblk_t *
mac_strip_vlan_tag_chain(mblk_t * mp_chain)1546da14cebeSEric Cheng mac_strip_vlan_tag_chain(mblk_t *mp_chain)
1547da14cebeSEric Cheng {
1548da14cebeSEric Cheng 	mblk_t *mp, *next_mp, **prev;
1549da14cebeSEric Cheng 
1550da14cebeSEric Cheng 	mp = mp_chain;
1551da14cebeSEric Cheng 	prev = &mp_chain;
1552da14cebeSEric Cheng 
1553da14cebeSEric Cheng 	while (mp != NULL) {
1554da14cebeSEric Cheng 		next_mp = mp->b_next;
1555da14cebeSEric Cheng 		mp->b_next = NULL;
1556da14cebeSEric Cheng 		if ((mp = mac_strip_vlan_tag(mp)) == NULL) {
1557da14cebeSEric Cheng 			freemsgchain(next_mp);
1558da14cebeSEric Cheng 			break;
1559da14cebeSEric Cheng 		}
1560da14cebeSEric Cheng 		*prev = mp;
1561da14cebeSEric Cheng 		prev = &mp->b_next;
1562da14cebeSEric Cheng 		mp = mp->b_next = next_mp;
1563da14cebeSEric Cheng 	}
1564da14cebeSEric Cheng 
1565da14cebeSEric Cheng 	return (mp_chain);
1566da14cebeSEric Cheng }
1567da14cebeSEric Cheng 
1568da14cebeSEric Cheng /*
1569da14cebeSEric Cheng  * Default callback function. Used when the datapath is not yet initialized.
1570da14cebeSEric Cheng  */
1571da14cebeSEric Cheng /* ARGSUSED */
1572da14cebeSEric Cheng void
mac_rx_def(void * arg,mac_resource_handle_t resource,mblk_t * mp_chain,boolean_t loopback)1573c61a1653SRyan Zezeski mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
1574da14cebeSEric Cheng     boolean_t loopback)
1575da14cebeSEric Cheng {
1576c61a1653SRyan Zezeski 	freemsgchain(mp_chain);
1577da14cebeSEric Cheng }
1578da14cebeSEric Cheng 
1579da14cebeSEric Cheng /*
1580da14cebeSEric Cheng  * Determines the IPv6 header length accounting for all the optional IPv6
1581da14cebeSEric Cheng  * headers (hop-by-hop, destination, routing and fragment). The header length
1582da14cebeSEric Cheng  * and next header value (a transport header) is captured.
1583da14cebeSEric Cheng  *
1584da14cebeSEric Cheng  * Returns B_FALSE if all the IP headers are not in the same mblk otherwise
1585da14cebeSEric Cheng  * returns B_TRUE.
1586da14cebeSEric Cheng  */
1587da14cebeSEric Cheng boolean_t
mac_ip_hdr_length_v6(ip6_t * ip6h,uint8_t * endptr,uint16_t * hdr_length,uint8_t * next_hdr,ip6_frag_t ** fragp)15880dc2366fSVenugopal Iyer mac_ip_hdr_length_v6(ip6_t *ip6h, uint8_t *endptr, uint16_t *hdr_length,
15890dc2366fSVenugopal Iyer     uint8_t *next_hdr, ip6_frag_t **fragp)
1590da14cebeSEric Cheng {
1591da14cebeSEric Cheng 	uint16_t length;
1592da14cebeSEric Cheng 	uint_t	ehdrlen;
1593da14cebeSEric Cheng 	uint8_t *whereptr;
1594da14cebeSEric Cheng 	uint8_t *nexthdrp;
1595da14cebeSEric Cheng 	ip6_dest_t *desthdr;
1596da14cebeSEric Cheng 	ip6_rthdr_t *rthdr;
1597da14cebeSEric Cheng 	ip6_frag_t *fraghdr;
1598da14cebeSEric Cheng 
1599da14cebeSEric Cheng 	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
1600da14cebeSEric Cheng 		return (B_FALSE);
1601bd670b35SErik Nordmark 	ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
1602da14cebeSEric Cheng 	length = IPV6_HDR_LEN;
1603da14cebeSEric Cheng 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
1604da14cebeSEric Cheng 
16050dc2366fSVenugopal Iyer 	if (fragp != NULL)
16060dc2366fSVenugopal Iyer 		*fragp = NULL;
16079820c710SBaban Kenkre 
1608da14cebeSEric Cheng 	nexthdrp = &ip6h->ip6_nxt;
1609da14cebeSEric Cheng 	while (whereptr < endptr) {
1610da14cebeSEric Cheng 		/* Is there enough left for len + nexthdr? */
1611da14cebeSEric Cheng 		if (whereptr + MIN_EHDR_LEN > endptr)
1612da14cebeSEric Cheng 			break;
1613da14cebeSEric Cheng 
1614da14cebeSEric Cheng 		switch (*nexthdrp) {
1615da14cebeSEric Cheng 		case IPPROTO_HOPOPTS:
1616da14cebeSEric Cheng 		case IPPROTO_DSTOPTS:
1617da14cebeSEric Cheng 			/* Assumes the headers are identical for hbh and dst */
1618da14cebeSEric Cheng 			desthdr = (ip6_dest_t *)whereptr;
1619da14cebeSEric Cheng 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
1620da14cebeSEric Cheng 			if ((uchar_t *)desthdr +  ehdrlen > endptr)
1621da14cebeSEric Cheng 				return (B_FALSE);
1622da14cebeSEric Cheng 			nexthdrp = &desthdr->ip6d_nxt;
1623da14cebeSEric Cheng 			break;
1624da14cebeSEric Cheng 		case IPPROTO_ROUTING:
1625da14cebeSEric Cheng 			rthdr = (ip6_rthdr_t *)whereptr;
1626da14cebeSEric Cheng 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
1627da14cebeSEric Cheng 			if ((uchar_t *)rthdr +  ehdrlen > endptr)
1628da14cebeSEric Cheng 				return (B_FALSE);
1629da14cebeSEric Cheng 			nexthdrp = &rthdr->ip6r_nxt;
1630da14cebeSEric Cheng 			break;
1631da14cebeSEric Cheng 		case IPPROTO_FRAGMENT:
1632da14cebeSEric Cheng 			fraghdr = (ip6_frag_t *)whereptr;
1633da14cebeSEric Cheng 			ehdrlen = sizeof (ip6_frag_t);
1634da14cebeSEric Cheng 			if ((uchar_t *)&fraghdr[1] > endptr)
1635da14cebeSEric Cheng 				return (B_FALSE);
1636da14cebeSEric Cheng 			nexthdrp = &fraghdr->ip6f_nxt;
16370dc2366fSVenugopal Iyer 			if (fragp != NULL)
16380dc2366fSVenugopal Iyer 				*fragp = fraghdr;
1639da14cebeSEric Cheng 			break;
1640da14cebeSEric Cheng 		case IPPROTO_NONE:
1641da14cebeSEric Cheng 			/* No next header means we're finished */
1642da14cebeSEric Cheng 		default:
1643da14cebeSEric Cheng 			*hdr_length = length;
1644da14cebeSEric Cheng 			*next_hdr = *nexthdrp;
1645da14cebeSEric Cheng 			return (B_TRUE);
1646da14cebeSEric Cheng 		}
1647da14cebeSEric Cheng 		length += ehdrlen;
1648da14cebeSEric Cheng 		whereptr += ehdrlen;
1649da14cebeSEric Cheng 		*hdr_length = length;
1650da14cebeSEric Cheng 		*next_hdr = *nexthdrp;
1651da14cebeSEric Cheng 	}
1652da14cebeSEric Cheng 	switch (*nexthdrp) {
1653da14cebeSEric Cheng 	case IPPROTO_HOPOPTS:
1654da14cebeSEric Cheng 	case IPPROTO_DSTOPTS:
1655da14cebeSEric Cheng 	case IPPROTO_ROUTING:
1656da14cebeSEric Cheng 	case IPPROTO_FRAGMENT:
1657da14cebeSEric Cheng 		/*
1658da14cebeSEric Cheng 		 * If any know extension headers are still to be processed,
1659da14cebeSEric Cheng 		 * the packet's malformed (or at least all the IP header(s) are
1660da14cebeSEric Cheng 		 * not in the same mblk - and that should never happen.
1661da14cebeSEric Cheng 		 */
1662da14cebeSEric Cheng 		return (B_FALSE);
1663da14cebeSEric Cheng 
1664da14cebeSEric Cheng 	default:
1665da14cebeSEric Cheng 		/*
1666da14cebeSEric Cheng 		 * If we get here, we know that all of the IP headers were in
1667da14cebeSEric Cheng 		 * the same mblk, even if the ULP header is in the next mblk.
1668da14cebeSEric Cheng 		 */
1669da14cebeSEric Cheng 		*hdr_length = length;
1670da14cebeSEric Cheng 		*next_hdr = *nexthdrp;
1671da14cebeSEric Cheng 		return (B_TRUE);
1672da14cebeSEric Cheng 	}
1673da14cebeSEric Cheng }
1674da14cebeSEric Cheng 
16750dc2366fSVenugopal Iyer /*
16760dc2366fSVenugopal Iyer  * The following set of routines are there to take care of interrupt
16770dc2366fSVenugopal Iyer  * re-targeting for legacy (fixed) interrupts. Some older versions
16780dc2366fSVenugopal Iyer  * of the popular NICs like e1000g do not support MSI-X interrupts
16790dc2366fSVenugopal Iyer  * and they reserve fixed interrupts for RX/TX rings. To re-target
16800dc2366fSVenugopal Iyer  * these interrupts, PCITOOL ioctls need to be used.
16810dc2366fSVenugopal Iyer  */
1682da14cebeSEric Cheng typedef struct mac_dladm_intr {
1683da14cebeSEric Cheng 	int	ino;
1684da14cebeSEric Cheng 	int	cpu_id;
1685da14cebeSEric Cheng 	char	driver_path[MAXPATHLEN];
1686da14cebeSEric Cheng 	char	nexus_path[MAXPATHLEN];
1687da14cebeSEric Cheng } mac_dladm_intr_t;
1688da14cebeSEric Cheng 
1689da14cebeSEric Cheng /* Bind the interrupt to cpu_num */
1690da14cebeSEric Cheng static int
mac_set_intr(ldi_handle_t lh,processorid_t cpu_num,int oldcpuid,int ino)16917ff178cdSJimmy Vetayases mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int oldcpuid, int ino)
1692da14cebeSEric Cheng {
1693da14cebeSEric Cheng 	pcitool_intr_set_t	iset;
1694da14cebeSEric Cheng 	int			err;
1695da14cebeSEric Cheng 
16967ff178cdSJimmy Vetayases 	iset.old_cpu = oldcpuid;
1697da14cebeSEric Cheng 	iset.ino = ino;
1698da14cebeSEric Cheng 	iset.cpu_id = cpu_num;
1699da14cebeSEric Cheng 	iset.user_version = PCITOOL_VERSION;
1700da14cebeSEric Cheng 	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
1701da14cebeSEric Cheng 	    kcred, NULL);
1702da14cebeSEric Cheng 
1703da14cebeSEric Cheng 	return (err);
1704da14cebeSEric Cheng }
1705da14cebeSEric Cheng 
1706da14cebeSEric Cheng /*
1707da14cebeSEric Cheng  * Search interrupt information. iget is filled in with the info to search
1708da14cebeSEric Cheng  */
1709da14cebeSEric Cheng static boolean_t
mac_search_intrinfo(pcitool_intr_get_t * iget_p,mac_dladm_intr_t * dln)1710da14cebeSEric Cheng mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln)
1711da14cebeSEric Cheng {
1712da14cebeSEric Cheng 	int	i;
1713da14cebeSEric Cheng 	char	driver_path[2 * MAXPATHLEN];
1714da14cebeSEric Cheng 
1715da14cebeSEric Cheng 	for (i = 0; i < iget_p->num_devs; i++) {
1716da14cebeSEric Cheng 		(void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN);
1717da14cebeSEric Cheng 		(void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN,
1718da14cebeSEric Cheng 		    ":%s%d", iget_p->dev[i].driver_name,
1719da14cebeSEric Cheng 		    iget_p->dev[i].dev_inst);
1720da14cebeSEric Cheng 		/* Match the device path for the device path */
1721da14cebeSEric Cheng 		if (strcmp(driver_path, dln->driver_path) == 0) {
1722da14cebeSEric Cheng 			dln->ino = iget_p->ino;
1723da14cebeSEric Cheng 			dln->cpu_id = iget_p->cpu_id;
1724da14cebeSEric Cheng 			return (B_TRUE);
1725da14cebeSEric Cheng 		}
1726da14cebeSEric Cheng 	}
1727da14cebeSEric Cheng 	return (B_FALSE);
1728da14cebeSEric Cheng }
1729da14cebeSEric Cheng 
1730da14cebeSEric Cheng /*
1731da14cebeSEric Cheng  * Get information about ino, i.e. if this is the interrupt for our
1732da14cebeSEric Cheng  * device and where it is bound etc.
1733da14cebeSEric Cheng  */
1734da14cebeSEric Cheng static boolean_t
mac_get_single_intr(ldi_handle_t lh,int oldcpuid,int ino,mac_dladm_intr_t * dln)17357ff178cdSJimmy Vetayases mac_get_single_intr(ldi_handle_t lh, int oldcpuid, int ino,
17367ff178cdSJimmy Vetayases     mac_dladm_intr_t *dln)
1737da14cebeSEric Cheng {
1738da14cebeSEric Cheng 	pcitool_intr_get_t	*iget_p;
1739da14cebeSEric Cheng 	int			ipsz;
1740da14cebeSEric Cheng 	int			nipsz;
1741da14cebeSEric Cheng 	int			err;
1742da14cebeSEric Cheng 	uint8_t			inum;
1743da14cebeSEric Cheng 
1744da14cebeSEric Cheng 	/*
1745da14cebeSEric Cheng 	 * Check if SLEEP is OK, i.e if could come here in response to
1746da14cebeSEric Cheng 	 * changing the fanout due to some callback from the driver, say
1747da14cebeSEric Cheng 	 * link speed changes.
1748da14cebeSEric Cheng 	 */
1749da14cebeSEric Cheng 	ipsz = PCITOOL_IGET_SIZE(0);
1750da14cebeSEric Cheng 	iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1751da14cebeSEric Cheng 
1752da14cebeSEric Cheng 	iget_p->num_devs_ret = 0;
1753da14cebeSEric Cheng 	iget_p->user_version = PCITOOL_VERSION;
17547ff178cdSJimmy Vetayases 	iget_p->cpu_id = oldcpuid;
1755da14cebeSEric Cheng 	iget_p->ino = ino;
1756da14cebeSEric Cheng 
1757da14cebeSEric Cheng 	err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1758da14cebeSEric Cheng 	    FKIOCTL, kcred, NULL);
1759da14cebeSEric Cheng 	if (err != 0) {
1760da14cebeSEric Cheng 		kmem_free(iget_p, ipsz);
1761da14cebeSEric Cheng 		return (B_FALSE);
1762da14cebeSEric Cheng 	}
1763da14cebeSEric Cheng 	if (iget_p->num_devs == 0) {
1764da14cebeSEric Cheng 		kmem_free(iget_p, ipsz);
1765da14cebeSEric Cheng 		return (B_FALSE);
1766da14cebeSEric Cheng 	}
1767da14cebeSEric Cheng 	inum = iget_p->num_devs;
1768da14cebeSEric Cheng 	if (iget_p->num_devs_ret < iget_p->num_devs) {
1769da14cebeSEric Cheng 		/* Reallocate */
1770da14cebeSEric Cheng 		nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs);
1771da14cebeSEric Cheng 
1772da14cebeSEric Cheng 		kmem_free(iget_p, ipsz);
1773da14cebeSEric Cheng 		ipsz = nipsz;
1774da14cebeSEric Cheng 		iget_p = kmem_zalloc(ipsz, KM_SLEEP);
1775da14cebeSEric Cheng 
1776da14cebeSEric Cheng 		iget_p->num_devs_ret = inum;
17777ff178cdSJimmy Vetayases 		iget_p->cpu_id = oldcpuid;
1778da14cebeSEric Cheng 		iget_p->ino = ino;
1779da14cebeSEric Cheng 		iget_p->user_version = PCITOOL_VERSION;
1780da14cebeSEric Cheng 		err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p,
1781da14cebeSEric Cheng 		    FKIOCTL, kcred, NULL);
1782da14cebeSEric Cheng 		if (err != 0) {
1783da14cebeSEric Cheng 			kmem_free(iget_p, ipsz);
1784da14cebeSEric Cheng 			return (B_FALSE);
1785da14cebeSEric Cheng 		}
1786da14cebeSEric Cheng 		/* defensive */
1787da14cebeSEric Cheng 		if (iget_p->num_devs != iget_p->num_devs_ret) {
1788da14cebeSEric Cheng 			kmem_free(iget_p, ipsz);
1789da14cebeSEric Cheng 			return (B_FALSE);
1790da14cebeSEric Cheng 		}
1791da14cebeSEric Cheng 	}
1792da14cebeSEric Cheng 
1793da14cebeSEric Cheng 	if (mac_search_intrinfo(iget_p, dln)) {
1794da14cebeSEric Cheng 		kmem_free(iget_p, ipsz);
1795da14cebeSEric Cheng 		return (B_TRUE);
1796da14cebeSEric Cheng 	}
1797da14cebeSEric Cheng 	kmem_free(iget_p, ipsz);
1798da14cebeSEric Cheng 	return (B_FALSE);
1799da14cebeSEric Cheng }
1800da14cebeSEric Cheng 
1801da14cebeSEric Cheng /*
1802da14cebeSEric Cheng  * Get the interrupts and check each one to see if it is for our device.
1803da14cebeSEric Cheng  */
1804da14cebeSEric Cheng static int
mac_validate_intr(ldi_handle_t lh,mac_dladm_intr_t * dln,processorid_t cpuid)1805da14cebeSEric Cheng mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid)
1806da14cebeSEric Cheng {
1807da14cebeSEric Cheng 	pcitool_intr_info_t	intr_info;
1808da14cebeSEric Cheng 	int			err;
1809da14cebeSEric Cheng 	int			ino;
18107ff178cdSJimmy Vetayases 	int			oldcpuid;
1811da14cebeSEric Cheng 
1812da14cebeSEric Cheng 	err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info,
1813da14cebeSEric Cheng 	    FKIOCTL, kcred, NULL);
1814da14cebeSEric Cheng 	if (err != 0)
1815da14cebeSEric Cheng 		return (-1);
1816da14cebeSEric Cheng 
18177ff178cdSJimmy Vetayases 	for (oldcpuid = 0; oldcpuid < intr_info.num_cpu; oldcpuid++) {
18187ff178cdSJimmy Vetayases 		for (ino = 0; ino < intr_info.num_intr; ino++) {
18197ff178cdSJimmy Vetayases 			if (mac_get_single_intr(lh, oldcpuid, ino, dln)) {
18207ff178cdSJimmy Vetayases 				if (dln->cpu_id == cpuid)
18217ff178cdSJimmy Vetayases 					return (0);
18227ff178cdSJimmy Vetayases 				return (1);
18237ff178cdSJimmy Vetayases 			}
1824da14cebeSEric Cheng 		}
1825da14cebeSEric Cheng 	}
1826da14cebeSEric Cheng 	return (-1);
1827da14cebeSEric Cheng }
1828da14cebeSEric Cheng 
1829da14cebeSEric Cheng /*
1830da14cebeSEric Cheng  * Obtain the nexus parent node info. for mdip.
1831da14cebeSEric Cheng  */
1832da14cebeSEric Cheng static dev_info_t *
mac_get_nexus_node(dev_info_t * mdip,mac_dladm_intr_t * dln)1833da14cebeSEric Cheng mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln)
1834da14cebeSEric Cheng {
1835da14cebeSEric Cheng 	struct dev_info		*tdip = (struct dev_info *)mdip;
1836da14cebeSEric Cheng 	struct ddi_minor_data	*minordata;
1837da14cebeSEric Cheng 	dev_info_t		*pdip;
1838da14cebeSEric Cheng 	char			pathname[MAXPATHLEN];
1839da14cebeSEric Cheng 
1840da14cebeSEric Cheng 	while (tdip != NULL) {
1841c36aa31cSEric Cheng 		/*
1842c36aa31cSEric Cheng 		 * The netboot code could call this function while walking the
1843c36aa31cSEric Cheng 		 * device tree so we need to use ndi_devi_tryenter() here to
1844c36aa31cSEric Cheng 		 * avoid deadlock.
1845c36aa31cSEric Cheng 		 */
1846*3fe80ca4SDan Cross 		if (ndi_devi_tryenter((dev_info_t *)tdip) == 0)
1847c36aa31cSEric Cheng 			break;
1848c36aa31cSEric Cheng 
1849da14cebeSEric Cheng 		for (minordata = tdip->devi_minor; minordata != NULL;
1850da14cebeSEric Cheng 		    minordata = minordata->next) {
1851da14cebeSEric Cheng 			if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL,
1852da14cebeSEric Cheng 			    strlen(DDI_NT_INTRCTL)) == 0) {
1853da14cebeSEric Cheng 				pdip = minordata->dip;
1854da14cebeSEric Cheng 				(void) ddi_pathname(pdip, pathname);
1855da14cebeSEric Cheng 				(void) snprintf(dln->nexus_path, MAXPATHLEN,
1856da14cebeSEric Cheng 				    "/devices%s:intr", pathname);
1857da14cebeSEric Cheng 				(void) ddi_pathname_minor(minordata, pathname);
1858*3fe80ca4SDan Cross 				ndi_devi_exit((dev_info_t *)tdip);
1859da14cebeSEric Cheng 				return (pdip);
1860da14cebeSEric Cheng 			}
1861da14cebeSEric Cheng 		}
1862*3fe80ca4SDan Cross 		ndi_devi_exit((dev_info_t *)tdip);
1863da14cebeSEric Cheng 		tdip = tdip->devi_parent;
1864da14cebeSEric Cheng 	}
1865da14cebeSEric Cheng 	return (NULL);
1866da14cebeSEric Cheng }
1867da14cebeSEric Cheng 
1868da14cebeSEric Cheng /*
1869da14cebeSEric Cheng  * For a primary MAC client, if the user has set a list or CPUs or
1870da14cebeSEric Cheng  * we have obtained it implicitly, we try to retarget the interrupt
1871da14cebeSEric Cheng  * for that device on one of the CPUs in the list.
1872da14cebeSEric Cheng  * We assign the interrupt to the same CPU as the poll thread.
1873da14cebeSEric Cheng  */
1874da14cebeSEric Cheng static boolean_t
mac_check_interrupt_binding(dev_info_t * mdip,int32_t cpuid)1875da14cebeSEric Cheng mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid)
1876da14cebeSEric Cheng {
1877da14cebeSEric Cheng 	ldi_handle_t		lh = NULL;
1878da14cebeSEric Cheng 	ldi_ident_t		li = NULL;
1879da14cebeSEric Cheng 	int			err;
1880da14cebeSEric Cheng 	int			ret;
1881da14cebeSEric Cheng 	mac_dladm_intr_t	dln;
1882da14cebeSEric Cheng 	dev_info_t		*dip;
1883da14cebeSEric Cheng 	struct ddi_minor_data	*minordata;
1884da14cebeSEric Cheng 
1885da14cebeSEric Cheng 	dln.nexus_path[0] = '\0';
1886da14cebeSEric Cheng 	dln.driver_path[0] = '\0';
1887da14cebeSEric Cheng 
1888da14cebeSEric Cheng 	minordata = ((struct dev_info *)mdip)->devi_minor;
1889da14cebeSEric Cheng 	while (minordata != NULL) {
1890da14cebeSEric Cheng 		if (minordata->type == DDM_MINOR)
1891da14cebeSEric Cheng 			break;
1892da14cebeSEric Cheng 		minordata = minordata->next;
1893da14cebeSEric Cheng 	}
1894da14cebeSEric Cheng 	if (minordata == NULL)
1895da14cebeSEric Cheng 		return (B_FALSE);
1896da14cebeSEric Cheng 
1897da14cebeSEric Cheng 	(void) ddi_pathname_minor(minordata, dln.driver_path);
1898da14cebeSEric Cheng 
1899da14cebeSEric Cheng 	dip = mac_get_nexus_node(mdip, &dln);
1900da14cebeSEric Cheng 	/* defensive */
1901da14cebeSEric Cheng 	if (dip == NULL)
1902da14cebeSEric Cheng 		return (B_FALSE);
1903da14cebeSEric Cheng 
1904da14cebeSEric Cheng 	err = ldi_ident_from_major(ddi_driver_major(dip), &li);
1905da14cebeSEric Cheng 	if (err != 0)
1906da14cebeSEric Cheng 		return (B_FALSE);
1907da14cebeSEric Cheng 
1908da14cebeSEric Cheng 	err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li);
1909da14cebeSEric Cheng 	if (err != 0)
1910da14cebeSEric Cheng 		return (B_FALSE);
1911da14cebeSEric Cheng 
1912da14cebeSEric Cheng 	ret = mac_validate_intr(lh, &dln, cpuid);
1913da14cebeSEric Cheng 	if (ret < 0) {
1914da14cebeSEric Cheng 		(void) ldi_close(lh, FREAD|FWRITE, kcred);
1915da14cebeSEric Cheng 		return (B_FALSE);
1916da14cebeSEric Cheng 	}
1917da14cebeSEric Cheng 	/* cmn_note? */
1918da14cebeSEric Cheng 	if (ret != 0)
19197ff178cdSJimmy Vetayases 		if ((err = (mac_set_intr(lh, cpuid, dln.cpu_id, dln.ino)))
19207ff178cdSJimmy Vetayases 		    != 0) {
1921da14cebeSEric Cheng 			(void) ldi_close(lh, FREAD|FWRITE, kcred);
1922da14cebeSEric Cheng 			return (B_FALSE);
1923da14cebeSEric Cheng 		}
1924da14cebeSEric Cheng 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
1925da14cebeSEric Cheng 	return (B_TRUE);
1926da14cebeSEric Cheng }
1927da14cebeSEric Cheng 
1928da14cebeSEric Cheng void
mac_client_set_intr_cpu(void * arg,mac_client_handle_t mch,int32_t cpuid)1929da14cebeSEric Cheng mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid)
1930da14cebeSEric Cheng {
1931da14cebeSEric Cheng 	dev_info_t		*mdip = (dev_info_t *)arg;
1932da14cebeSEric Cheng 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1933da14cebeSEric Cheng 	mac_resource_props_t	*mrp;
1934da14cebeSEric Cheng 	mac_perim_handle_t	mph;
19350dc2366fSVenugopal Iyer 	flow_entry_t		*flent = mcip->mci_flent;
19360dc2366fSVenugopal Iyer 	mac_soft_ring_set_t	*rx_srs;
19370dc2366fSVenugopal Iyer 	mac_cpus_t		*srs_cpu;
1938da14cebeSEric Cheng 
19390dc2366fSVenugopal Iyer 	if (!mac_check_interrupt_binding(mdip, cpuid))
19400dc2366fSVenugopal Iyer 		cpuid = -1;
1941da14cebeSEric Cheng 	mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph);
1942da14cebeSEric Cheng 	mrp = MCIP_RESOURCE_PROPS(mcip);
19430dc2366fSVenugopal Iyer 	mrp->mrp_rx_intr_cpu = cpuid;
19440dc2366fSVenugopal Iyer 	if (flent != NULL && flent->fe_rx_srs_cnt == 2) {
19450dc2366fSVenugopal Iyer 		rx_srs = flent->fe_rx_srs[1];
19460dc2366fSVenugopal Iyer 		srs_cpu = &rx_srs->srs_cpu;
19470dc2366fSVenugopal Iyer 		srs_cpu->mc_rx_intr_cpu = cpuid;
19480dc2366fSVenugopal Iyer 	}
1949da14cebeSEric Cheng 	mac_perim_exit(mph);
1950da14cebeSEric Cheng }
1951da14cebeSEric Cheng 
1952da14cebeSEric Cheng int32_t
mac_client_intr_cpu(mac_client_handle_t mch)1953da14cebeSEric Cheng mac_client_intr_cpu(mac_client_handle_t mch)
1954da14cebeSEric Cheng {
1955da14cebeSEric Cheng 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1956da14cebeSEric Cheng 	mac_cpus_t		*srs_cpu;
1957da14cebeSEric Cheng 	mac_soft_ring_set_t	*rx_srs;
1958da14cebeSEric Cheng 	flow_entry_t		*flent = mcip->mci_flent;
1959da14cebeSEric Cheng 	mac_resource_props_t	*mrp = MCIP_RESOURCE_PROPS(mcip);
19600dc2366fSVenugopal Iyer 	mac_ring_t		*ring;
19610dc2366fSVenugopal Iyer 	mac_intr_t		*mintr;
1962da14cebeSEric Cheng 
1963da14cebeSEric Cheng 	/*
1964da14cebeSEric Cheng 	 * Check if we need to retarget the interrupt. We do this only
1965da14cebeSEric Cheng 	 * for the primary MAC client. We do this if we have the only
19660dc2366fSVenugopal Iyer 	 * exclusive ring in the group.
1967da14cebeSEric Cheng 	 */
1968da14cebeSEric Cheng 	if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) {
1969da14cebeSEric Cheng 		rx_srs = flent->fe_rx_srs[1];
1970da14cebeSEric Cheng 		srs_cpu = &rx_srs->srs_cpu;
19710dc2366fSVenugopal Iyer 		ring = rx_srs->srs_ring;
19720dc2366fSVenugopal Iyer 		mintr = &ring->mr_info.mri_intr;
19730dc2366fSVenugopal Iyer 		/*
19740dc2366fSVenugopal Iyer 		 * If ddi_handle is present or the poll CPU is
19750dc2366fSVenugopal Iyer 		 * already bound to the interrupt CPU, return -1.
19760dc2366fSVenugopal Iyer 		 */
19770dc2366fSVenugopal Iyer 		if (mintr->mi_ddi_handle != NULL ||
19780dc2366fSVenugopal Iyer 		    ((mrp->mrp_ncpus != 0) &&
19790dc2366fSVenugopal Iyer 		    (mrp->mrp_rx_intr_cpu == srs_cpu->mc_rx_pollid))) {
1980da14cebeSEric Cheng 			return (-1);
19810dc2366fSVenugopal Iyer 		}
19820dc2366fSVenugopal Iyer 		return (srs_cpu->mc_rx_pollid);
1983da14cebeSEric Cheng 	}
1984da14cebeSEric Cheng 	return (-1);
1985da14cebeSEric Cheng }
1986da14cebeSEric Cheng 
1987da14cebeSEric Cheng void *
mac_get_devinfo(mac_handle_t mh)1988da14cebeSEric Cheng mac_get_devinfo(mac_handle_t mh)
1989da14cebeSEric Cheng {
1990da14cebeSEric Cheng 	mac_impl_t	*mip = (mac_impl_t *)mh;
1991da14cebeSEric Cheng 
1992da14cebeSEric Cheng 	return ((void *)mip->mi_dip);
1993da14cebeSEric Cheng }
1994ae6aa22aSVenugopal Iyer 
/*
 * XOR together the first 2, 4 or 6 bytes of the byte array x.
 * Note that x is evaluated more than once.
 */
#define	PKT_HASH_2BYTES(x)	((x)[0] ^ (x)[1])
#define	PKT_HASH_4BYTES(x)	(PKT_HASH_2BYTES(x) ^ (x)[2] ^ (x)[3])
#define	PKT_HASH_MAC(x)		(PKT_HASH_4BYTES(x) ^ (x)[4] ^ (x)[5])
1998ae6aa22aSVenugopal Iyer 
1999ae6aa22aSVenugopal Iyer uint64_t
mac_pkt_hash(uint_t media,mblk_t * mp,uint8_t policy,boolean_t is_outbound)2000ae6aa22aSVenugopal Iyer mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound)
2001ae6aa22aSVenugopal Iyer {
2002ae6aa22aSVenugopal Iyer 	struct ether_header *ehp;
2003ae6aa22aSVenugopal Iyer 	uint64_t hash = 0;
2004ae6aa22aSVenugopal Iyer 	uint16_t sap;
2005ae6aa22aSVenugopal Iyer 	uint_t skip_len;
2006ae6aa22aSVenugopal Iyer 	uint8_t proto;
20079820c710SBaban Kenkre 	boolean_t ip_fragmented;
2008ae6aa22aSVenugopal Iyer 
2009ae6aa22aSVenugopal Iyer 	/*
2010ae6aa22aSVenugopal Iyer 	 * We may want to have one of these per MAC type plugin in the
2011ae6aa22aSVenugopal Iyer 	 * future. For now supports only ethernet.
2012ae6aa22aSVenugopal Iyer 	 */
2013ae6aa22aSVenugopal Iyer 	if (media != DL_ETHER)
2014ae6aa22aSVenugopal Iyer 		return (0L);
2015ae6aa22aSVenugopal Iyer 
2016ae6aa22aSVenugopal Iyer 	/* for now we support only outbound packets */
2017ae6aa22aSVenugopal Iyer 	ASSERT(is_outbound);
2018ae6aa22aSVenugopal Iyer 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
2019ae6aa22aSVenugopal Iyer 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
2020ae6aa22aSVenugopal Iyer 
2021ae6aa22aSVenugopal Iyer 	/* compute L2 hash */
2022ae6aa22aSVenugopal Iyer 
2023ae6aa22aSVenugopal Iyer 	ehp = (struct ether_header *)mp->b_rptr;
2024ae6aa22aSVenugopal Iyer 
2025ae6aa22aSVenugopal Iyer 	if ((policy & MAC_PKT_HASH_L2) != 0) {
2026ae6aa22aSVenugopal Iyer 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
2027ae6aa22aSVenugopal Iyer 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
2028ae6aa22aSVenugopal Iyer 		hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst);
2029ae6aa22aSVenugopal Iyer 		policy &= ~MAC_PKT_HASH_L2;
2030ae6aa22aSVenugopal Iyer 	}
2031ae6aa22aSVenugopal Iyer 
2032ae6aa22aSVenugopal Iyer 	if (policy == 0)
2033ae6aa22aSVenugopal Iyer 		goto done;
2034ae6aa22aSVenugopal Iyer 
2035ae6aa22aSVenugopal Iyer 	/* skip ethernet header */
2036ae6aa22aSVenugopal Iyer 
2037ae6aa22aSVenugopal Iyer 	sap = ntohs(ehp->ether_type);
2038ae6aa22aSVenugopal Iyer 	if (sap == ETHERTYPE_VLAN) {
2039ae6aa22aSVenugopal Iyer 		struct ether_vlan_header *evhp;
2040ae6aa22aSVenugopal Iyer 		mblk_t *newmp = NULL;
2041ae6aa22aSVenugopal Iyer 
2042ae6aa22aSVenugopal Iyer 		skip_len = sizeof (struct ether_vlan_header);
2043ae6aa22aSVenugopal Iyer 		if (MBLKL(mp) < skip_len) {
2044ae6aa22aSVenugopal Iyer 			/* the vlan tag is the payload, pull up first */
2045ae6aa22aSVenugopal Iyer 			newmp = msgpullup(mp, -1);
2046ae6aa22aSVenugopal Iyer 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
2047ae6aa22aSVenugopal Iyer 				goto done;
2048ae6aa22aSVenugopal Iyer 			}
2049ae6aa22aSVenugopal Iyer 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
2050ae6aa22aSVenugopal Iyer 		} else {
2051ae6aa22aSVenugopal Iyer 			evhp = (struct ether_vlan_header *)mp->b_rptr;
2052ae6aa22aSVenugopal Iyer 		}
2053ae6aa22aSVenugopal Iyer 
2054ae6aa22aSVenugopal Iyer 		sap = ntohs(evhp->ether_type);
2055ae6aa22aSVenugopal Iyer 		freemsg(newmp);
2056ae6aa22aSVenugopal Iyer 	} else {
2057ae6aa22aSVenugopal Iyer 		skip_len = sizeof (struct ether_header);
2058ae6aa22aSVenugopal Iyer 	}
2059ae6aa22aSVenugopal Iyer 
2060ae6aa22aSVenugopal Iyer 	/* if ethernet header is in its own mblk, skip it */
2061ae6aa22aSVenugopal Iyer 	if (MBLKL(mp) <= skip_len) {
2062ae6aa22aSVenugopal Iyer 		skip_len -= MBLKL(mp);
2063ae6aa22aSVenugopal Iyer 		mp = mp->b_cont;
2064ae6aa22aSVenugopal Iyer 		if (mp == NULL)
2065ae6aa22aSVenugopal Iyer 			goto done;
2066ae6aa22aSVenugopal Iyer 	}
2067ae6aa22aSVenugopal Iyer 
2068ae6aa22aSVenugopal Iyer 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
2069ae6aa22aSVenugopal Iyer 
2070ae6aa22aSVenugopal Iyer 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
2071ae6aa22aSVenugopal Iyer 
2072ae6aa22aSVenugopal Iyer 	switch (sap) {
2073ae6aa22aSVenugopal Iyer 	case ETHERTYPE_IP: {
2074ae6aa22aSVenugopal Iyer 		ipha_t *iphp;
2075ae6aa22aSVenugopal Iyer 
2076ae6aa22aSVenugopal Iyer 		/*
2077ae6aa22aSVenugopal Iyer 		 * If the header is not aligned or the header doesn't fit
2078ae6aa22aSVenugopal Iyer 		 * in the mblk, bail now. Note that this may cause packets
2079ae6aa22aSVenugopal Iyer 		 * reordering.
2080ae6aa22aSVenugopal Iyer 		 */
2081ae6aa22aSVenugopal Iyer 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
2082ae6aa22aSVenugopal Iyer 		if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
2083ae6aa22aSVenugopal Iyer 		    !OK_32PTR((char *)iphp))
2084ae6aa22aSVenugopal Iyer 			goto done;
2085ae6aa22aSVenugopal Iyer 
2086ae6aa22aSVenugopal Iyer 		proto = iphp->ipha_protocol;
2087ae6aa22aSVenugopal Iyer 		skip_len += IPH_HDR_LENGTH(iphp);
2088ae6aa22aSVenugopal Iyer 
20899820c710SBaban Kenkre 		/* Check if the packet is fragmented. */
20909820c710SBaban Kenkre 		ip_fragmented = ntohs(iphp->ipha_fragment_offset_and_flags) &
20919820c710SBaban Kenkre 		    IPH_OFFSET;
20929820c710SBaban Kenkre 
20939820c710SBaban Kenkre 		/*
20949820c710SBaban Kenkre 		 * For fragmented packets, use addresses in addition to
20959820c710SBaban Kenkre 		 * the frag_id to generate the hash inorder to get
20969820c710SBaban Kenkre 		 * better distribution.
20979820c710SBaban Kenkre 		 */
20989820c710SBaban Kenkre 		if (ip_fragmented || (policy & MAC_PKT_HASH_L3) != 0) {
2099ae6aa22aSVenugopal Iyer 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
2100ae6aa22aSVenugopal Iyer 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
2101ae6aa22aSVenugopal Iyer 
2102ae6aa22aSVenugopal Iyer 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2103ae6aa22aSVenugopal Iyer 			    PKT_HASH_4BYTES(ip_dst));
2104ae6aa22aSVenugopal Iyer 			policy &= ~MAC_PKT_HASH_L3;
2105ae6aa22aSVenugopal Iyer 		}
21069820c710SBaban Kenkre 
21079820c710SBaban Kenkre 		if (ip_fragmented) {
21089820c710SBaban Kenkre 			uint8_t *identp = (uint8_t *)&iphp->ipha_ident;
21099820c710SBaban Kenkre 			hash ^= PKT_HASH_2BYTES(identp);
21109820c710SBaban Kenkre 			goto done;
21119820c710SBaban Kenkre 		}
2112ae6aa22aSVenugopal Iyer 		break;
2113ae6aa22aSVenugopal Iyer 	}
2114ae6aa22aSVenugopal Iyer 	case ETHERTYPE_IPV6: {
2115ae6aa22aSVenugopal Iyer 		ip6_t *ip6hp;
21160dc2366fSVenugopal Iyer 		ip6_frag_t *frag = NULL;
2117ae6aa22aSVenugopal Iyer 		uint16_t hdr_length;
2118ae6aa22aSVenugopal Iyer 
2119ae6aa22aSVenugopal Iyer 		/*
2120ae6aa22aSVenugopal Iyer 		 * If the header is not aligned or the header doesn't fit
2121ae6aa22aSVenugopal Iyer 		 * in the mblk, bail now. Note that this may cause packets
2122ae6aa22aSVenugopal Iyer 		 * reordering.
2123ae6aa22aSVenugopal Iyer 		 */
2124ae6aa22aSVenugopal Iyer 
2125ae6aa22aSVenugopal Iyer 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
2126ae6aa22aSVenugopal Iyer 		if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
2127ae6aa22aSVenugopal Iyer 		    !OK_32PTR((char *)ip6hp))
2128ae6aa22aSVenugopal Iyer 			goto done;
2129ae6aa22aSVenugopal Iyer 
21300dc2366fSVenugopal Iyer 		if (!mac_ip_hdr_length_v6(ip6hp, mp->b_wptr, &hdr_length,
21310dc2366fSVenugopal Iyer 		    &proto, &frag))
2132ae6aa22aSVenugopal Iyer 			goto done;
2133ae6aa22aSVenugopal Iyer 		skip_len += hdr_length;
2134ae6aa22aSVenugopal Iyer 
21359820c710SBaban Kenkre 		/*
21369820c710SBaban Kenkre 		 * For fragmented packets, use addresses in addition to
21379820c710SBaban Kenkre 		 * the frag_id to generate the hash inorder to get
21389820c710SBaban Kenkre 		 * better distribution.
21399820c710SBaban Kenkre 		 */
21400dc2366fSVenugopal Iyer 		if (frag != NULL || (policy & MAC_PKT_HASH_L3) != 0) {
2141ae6aa22aSVenugopal Iyer 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
2142ae6aa22aSVenugopal Iyer 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
2143ae6aa22aSVenugopal Iyer 
2144ae6aa22aSVenugopal Iyer 			hash ^= (PKT_HASH_4BYTES(ip_src) ^
2145ae6aa22aSVenugopal Iyer 			    PKT_HASH_4BYTES(ip_dst));
2146ae6aa22aSVenugopal Iyer 			policy &= ~MAC_PKT_HASH_L3;
2147ae6aa22aSVenugopal Iyer 		}
21489820c710SBaban Kenkre 
21490dc2366fSVenugopal Iyer 		if (frag != NULL) {
21500dc2366fSVenugopal Iyer 			uint8_t *identp = (uint8_t *)&frag->ip6f_ident;
21519820c710SBaban Kenkre 			hash ^= PKT_HASH_4BYTES(identp);
21529820c710SBaban Kenkre 			goto done;
21539820c710SBaban Kenkre 		}
2154ae6aa22aSVenugopal Iyer 		break;
2155ae6aa22aSVenugopal Iyer 	}
2156ae6aa22aSVenugopal Iyer 	default:
2157ae6aa22aSVenugopal Iyer 		goto done;
2158ae6aa22aSVenugopal Iyer 	}
2159ae6aa22aSVenugopal Iyer 
2160ae6aa22aSVenugopal Iyer 	if (policy == 0)
2161ae6aa22aSVenugopal Iyer 		goto done;
2162ae6aa22aSVenugopal Iyer 
2163ae6aa22aSVenugopal Iyer 	/* if ip header is in its own mblk, skip it */
2164ae6aa22aSVenugopal Iyer 	if (MBLKL(mp) <= skip_len) {
2165ae6aa22aSVenugopal Iyer 		skip_len -= MBLKL(mp);
2166ae6aa22aSVenugopal Iyer 		mp = mp->b_cont;
2167ae6aa22aSVenugopal Iyer 		if (mp == NULL)
2168ae6aa22aSVenugopal Iyer 			goto done;
2169ae6aa22aSVenugopal Iyer 	}
2170ae6aa22aSVenugopal Iyer 
2171ae6aa22aSVenugopal Iyer 	/* parse ULP header */
2172ae6aa22aSVenugopal Iyer again:
2173ae6aa22aSVenugopal Iyer 	switch (proto) {
2174ae6aa22aSVenugopal Iyer 	case IPPROTO_TCP:
2175ae6aa22aSVenugopal Iyer 	case IPPROTO_UDP:
2176ae6aa22aSVenugopal Iyer 	case IPPROTO_ESP:
2177ae6aa22aSVenugopal Iyer 	case IPPROTO_SCTP:
2178ae6aa22aSVenugopal Iyer 		/*
2179ae6aa22aSVenugopal Iyer 		 * These Internet Protocols are intentionally designed
2180ae6aa22aSVenugopal Iyer 		 * for hashing from the git-go.  Port numbers are in the first
2181ae6aa22aSVenugopal Iyer 		 * word for transports, SPI is first for ESP.
2182ae6aa22aSVenugopal Iyer 		 */
2183ae6aa22aSVenugopal Iyer 		if (mp->b_rptr + skip_len + 4 > mp->b_wptr)
2184ae6aa22aSVenugopal Iyer 			goto done;
2185ae6aa22aSVenugopal Iyer 		hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len));
2186ae6aa22aSVenugopal Iyer 		break;
2187ae6aa22aSVenugopal Iyer 
2188ae6aa22aSVenugopal Iyer 	case IPPROTO_AH: {
2189ae6aa22aSVenugopal Iyer 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
2190ae6aa22aSVenugopal Iyer 		uint_t ah_length = AH_TOTAL_LEN(ah);
2191ae6aa22aSVenugopal Iyer 
2192ae6aa22aSVenugopal Iyer 		if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr)
2193ae6aa22aSVenugopal Iyer 			goto done;
2194ae6aa22aSVenugopal Iyer 
2195ae6aa22aSVenugopal Iyer 		proto = ah->ah_nexthdr;
2196ae6aa22aSVenugopal Iyer 		skip_len += ah_length;
2197ae6aa22aSVenugopal Iyer 
2198ae6aa22aSVenugopal Iyer 		/* if AH header is in its own mblk, skip it */
2199ae6aa22aSVenugopal Iyer 		if (MBLKL(mp) <= skip_len) {
2200ae6aa22aSVenugopal Iyer 			skip_len -= MBLKL(mp);
2201ae6aa22aSVenugopal Iyer 			mp = mp->b_cont;
2202ae6aa22aSVenugopal Iyer 			if (mp == NULL)
2203ae6aa22aSVenugopal Iyer 				goto done;
2204ae6aa22aSVenugopal Iyer 		}
2205ae6aa22aSVenugopal Iyer 
2206ae6aa22aSVenugopal Iyer 		goto again;
2207ae6aa22aSVenugopal Iyer 	}
2208ae6aa22aSVenugopal Iyer 	}
2209ae6aa22aSVenugopal Iyer 
2210ae6aa22aSVenugopal Iyer done:
2211ae6aa22aSVenugopal Iyer 	return (hash);
2212ae6aa22aSVenugopal Iyer }
2213