xref: /illumos-gate/usr/src/uts/common/io/ixgbe/ixgbe_tx.c (revision e39a7b5b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
24  */
25 
26 /*
27  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
29  * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
30  * Copyright 2020 Joyent, Inc.
31  */
32 
33 #include "ixgbe_sw.h"
34 
35 static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t **,
36     link_list_t *, const void *, size_t);
37 static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t **,
38     link_list_t *, uint8_t *, size_t);
39 static uint_t ixgbe_tcb_done(tx_control_block_t *);
40 static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
41     ixgbe_tx_context_t *, size_t);
42 static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
43 static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *,
44     link_list_t *);
45 
46 static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
47 static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
48     ixgbe_tx_context_t *);
49 static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
50     ixgbe_tx_context_t *);
51 
52 #ifndef IXGBE_DEBUG
53 #pragma inline(ixgbe_save_desc)
54 #pragma inline(ixgbe_get_context)
55 #pragma inline(ixgbe_check_context)
56 #pragma inline(ixgbe_fill_context)
57 #endif
58 
59 /*
60  * ixgbe_ring_tx
61  *
62  * To transmit one mblk through one specified ring.
63  *
64  * One mblk can consist of several fragments, each fragment
65  * will be processed with different methods based on the size.
66  * For the fragments with size less than the bcopy threshold,
67  * they will be processed by using bcopy; otherwise, they will
68  * be processed by using DMA binding.
69  *
70  * To process the mblk, for each fragment, we pass a pointer to the location
71  * of the current transmit control block (tcb) (initialized to NULL) to either
72  * ixgbe_tx_copy() or ixgbe_tx_bind() (based on the size of the mblk fragment).
73  * ixgbe_tx_copy() and ixgbe_tx_bind() will either continue to use the current
74  * if possible, or close out the current tcb, allocate a new tcb, and update
75  * the passed location (tx_control_block_t **) to reflect the new current tcb.
76  *
77  * Since bound mblk fragments require their own tcb, the close, allocate new,
78  * and update steps occur on every call to ixgbe_tx_bind(), but since
79  * consecutive small mblk fragments can be combined into a single tcb, the
80  * close, allocate new, and update steps may not occur on every call to
81  * ixgbe_tx_copy(). If the current tcb is already being used to copy data and
82  * we call ixgbe_tx_copy(), if there is enough room in the current tcb for
83  * the current mblk fragment, we append the data from the mblk fragment. If
84  * we call ixgbe_tx_copy() and the current tcb isn't being used to copy (i.e.
85  * the previous iteration of the loop called ixgbe_tx_bind()), or doesn't
86  * have enough space for the mblk fragment, we close out the current tcb,
87  * grab a new tcb from the free list, and update the current tcb to the
88  * newly obtained tcb.
89  *
90  * When LSO (large segment offload) is enabled, we first copy the packet
91  * headers (ethernet, IP, and TCP/UDP) into their own descriptor before
92  * processing the remainder of the packet. The remaining bytes of the packet
93  * are then copied or mapped based on the fragment size as described above.
94  *
95  * Through the entire processing of a packet, we keep track of the number of
96  * DMA descriptors being used (either bound or pre-bound buffers used for
97  * copying) by this packet. Each tcb requires at least one DMA descriptor, but
98  * may require more than one. When a tcb is closed by ixgbe_tx_bind() or
99  * ixgbe_tx_copy(), it does so by calling ixgbe_tcb_done() which returns the
100  * number of DMA descriptors that are closed (ready for the HW). Since the
101  * hardware limits the number of descriptors that can be used to transmit a
102  * single packet, if the total number DMA descriptors required to transmit
103  * this packet exceeds this limit, we perform a msgpullup() and try again.
104  * Since our DMA attributes limit the number of DMA cookies allowed to
105  * map a single span of memory to a value (MAX_COOKIE) less than the
106  * maximum number of descriptors allowed for a packet (IXGBE_TX_DESC_LIMIT),
107  * as long as sufficient tcbs are available, we should always be able to
108  * process a packet that's contained in a single mblk_t (no additional
109  * fragments).
110  *
111  * Once all of the tcbs have been setup, ixgbe_tx_fill_ring() is called to
112  * setup the tx ring to transmit the tcbs and then tell the HW to start
113  * transmitting. When transmission is complete, an interrupt is triggered
114  * which calls the appropriate recycle routine to place the tcbs that were
115  * used in transmission back in the free list. We may also try to
116  * recycle any available tcbs when the size of the tcb free list gets low
117  * or if the watchdog timer triggers.
118  *
119  */
120 mblk_t *
121 ixgbe_ring_tx(void *arg, mblk_t *orig_mp)
122 {
123 	ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
124 	ixgbe_t *ixgbe = tx_ring->ixgbe;
125 	mblk_t *mp = orig_mp;
126 	mblk_t *pull_mp = NULL;
127 	tx_control_block_t *tcb;
128 	size_t mbsize, offset, len;
129 	uint32_t desc_total;
130 	uint32_t copy_thresh;
131 	int desc_num;
132 	ixgbe_tx_context_t tx_context, *ctx = NULL;
133 	link_list_t pending_list;
134 	boolean_t limit_retry = B_FALSE;
135 
136 	ASSERT(mp->b_next == NULL);
137 
138 	if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
139 	    (ixgbe->ixgbe_state & IXGBE_ERROR) ||
140 	    (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
141 	    !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
142 	    ixgbe->link_state != LINK_STATE_UP) {
143 		freemsg(mp);
144 		return (NULL);
145 	}
146 
147 	copy_thresh = ixgbe->tx_copy_thresh;
148 
149 	mbsize = msgsize(mp);
150 
151 	if (ixgbe->tx_hcksum_enable) {
152 		/*
153 		 * Retrieve checksum context information from the mblk
154 		 * that will be used to decide whether/how to fill the
155 		 * context descriptor.
156 		 */
157 		ctx = &tx_context;
158 		if (ixgbe_get_context(mp, ctx) < 0) {
159 			freemsg(mp);
160 			return (NULL);
161 		}
162 
163 		/*
164 		 * If the mblk size exceeds the max size ixgbe could
165 		 * process, then discard this mblk, and return NULL.
166 		 */
167 		if ((ctx->lso_flag &&
168 		    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
169 		    (!ctx->lso_flag &&
170 		    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
171 			freemsg(mp);
172 			IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
173 			return (NULL);
174 		}
175 	}
176 
177 	/*
178 	 * If we use too many descriptors (see comments below), we may do
179 	 * pull_mp = msgpullup(orig_mp, -1), and jump back to here. As such,
180 	 * any time we error return past here, we should check and free
181 	 * pull_mp if != NULL.
182 	 */
183 retry:
184 	/*
185 	 * Check and recycle tx descriptors.
186 	 * The recycle threshold here should be selected carefully
187 	 */
188 	if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
189 		tx_ring->tx_recycle(tx_ring);
190 	}
191 
192 	/*
193 	 * After the recycling, if the tbd_free is less than the
194 	 * overload_threshold, assert overload, return mp;
195 	 * and we need to re-schedule the tx again.
196 	 */
197 	if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
198 		tx_ring->reschedule = B_TRUE;
199 		tx_ring->stat_overload++;
200 		if (pull_mp != NULL)
201 			freemsg(pull_mp);
202 		return (orig_mp);
203 	}
204 
205 	/*
206 	 * The pending_list is a linked list that is used to save
207 	 * the tx control blocks that have packet data processed
208 	 * but have not put the data to the tx descriptor ring.
209 	 * It is used to reduce the lock contention of the tx_lock.
210 	 */
211 	LINK_LIST_INIT(&pending_list);
212 
213 	tcb = NULL;
214 	desc_num = 0;
215 	desc_total = 0;
216 	offset = 0;
217 
218 	/*
219 	 * For LSO, we always copy the packet header (Ethernet + IP + TCP/UDP)
220 	 * into a single descriptor separate from the remaining data.
221 	 */
222 	if ((ctx != NULL) && ctx->lso_flag) {
223 		size_t hdr_len;
224 
225 		hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
226 
227 		/*
228 		 * copy the first hdr_len bytes of mp (i.e. the Ethernet, IP,
229 		 * and TCP/UDP headers) into tcb.
230 		 */
231 		for (len = hdr_len; mp != NULL && len > 0; mp = mp->b_cont) {
232 			size_t mlen = MBLKL(mp);
233 			size_t amt = MIN(mlen, len);
234 			int ret;
235 
236 			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list,
237 			    mp->b_rptr, amt);
238 			/*
239 			 * Since we're trying to copy all of the headers into
240 			 * a single buffer in a single tcb, if ixgbe_tx_copy()
241 			 * returns anything but 0, it means either no tcbs
242 			 * are available (< 0), or while copying, we spilled
243 			 * over and couldn't fit all the headers into a
244 			 * single tcb.
245 			 */
246 			if (ret != 0) {
247 				if (ret > 0)
248 					tx_ring->stat_lso_header_fail++;
249 				goto tx_failure;
250 			}
251 
252 			len -= amt;
253 
254 			/*
255 			 * If we copy less than the full amount of this
256 			 * mblk_t, we have some amount to copy below.
257 			 */
258 			if (amt < mlen) {
259 				offset = amt;
260 				break;
261 			}
262 		}
263 
264 		ASSERT0(len);
265 
266 		/*
267 		 * Finish off the header tcb, and start anew for the
268 		 * rest of the packet.
269 		 */
270 		desc_total += ixgbe_tcb_done(tcb);
271 		tcb = NULL;
272 	}
273 
274 	/*
275 	 * Process each remaining segment in the packet -- either binding
276 	 * the dblk_t or copying the contents of the dblk_t to an already
277 	 * bound buffer. When we copy, we will accumulate consecutive small
278 	 * (less than copy_thresh bytes) segments into a single tcb buffer
279 	 * until no more can fit (or we encounter a segment larger than
280 	 * copy_thresh and bind the dblk_t).
281 	 *
282 	 * Both ixgbe_tx_bind() and ixgbe_tx_copy() will allocate new
283 	 * transmit control blocks (tcb)s as needed (and append them onto
284 	 * 'pending_list'). Both functions also replace 'tcb' with the new
285 	 * tcb when they allocate a new tcb.
286 	 *
287 	 * We stop trying to process the packet once the number of descriptors
288 	 * used equals IXGBE_TX_DESC_LIMIT. Even if we're copying into the
289 	 * IXGBE_TX_DESC_LIMIT-th descriptor, we won't have room to add a
290 	 * context descriptor (since we're already at the limit), so there's
291 	 * no point in continuing. We'll pull up the mblk_t (see below)
292 	 * and try again.
293 	 */
294 	while (mp != NULL && desc_total < IXGBE_TX_DESC_LIMIT) {
295 		uint8_t *rptr = mp->b_rptr + offset;
296 		int ret;
297 
298 		len = MBLKL(mp) - offset;
299 		offset = 0;
300 
301 		if (len > copy_thresh) {
302 			ret = ixgbe_tx_bind(tx_ring, &tcb, &pending_list, rptr,
303 			    len);
304 		} else {
305 			ret = ixgbe_tx_copy(tx_ring, &tcb, &pending_list, rptr,
306 			    len);
307 		}
308 
309 		if (ret < 0)
310 			goto tx_failure;
311 
312 		desc_total += ret;
313 		mp = mp->b_cont;
314 	}
315 
316 	/* Finish off the last tcb */
317 	desc_total += ixgbe_tcb_done(tcb);
318 
319 	/*
320 	 * 82598/82599 chipset has a limitation that no more than 32 tx
321 	 * descriptors can be transmited out at one time. As noted above,
322 	 * we need to include space for a context descriptor in case its
323 	 * necessary, so we do this even if desc_total == IXGBE_TX_DESC_LIMIT
324 	 * as well as when it exceeds the limit.
325 	 *
326 	 * If we exceed this limit, we take the hit, do a msgpullup(), and
327 	 * then try again. Our DMA attributes guarantee we should never use
328 	 * more than MAX_COOKIE (18) descriptors to map a single mblk_t, so we
329 	 * should only need to retry once.
330 	 */
331 	if (desc_total >= IXGBE_TX_DESC_LIMIT) {
332 		/* We shouldn't hit this path twice */
333 		VERIFY0(limit_retry);
334 
335 		tx_ring->stat_break_tbd_limit++;
336 
337 		/* Release all the tcbs we used previously */
338 		ixgbe_put_free_list(tx_ring, &pending_list);
339 		desc_total = 0;
340 		offset = 0;
341 
342 		pull_mp = msgpullup(orig_mp, -1);
343 		if (pull_mp == NULL) {
344 			tx_ring->reschedule = B_TRUE;
345 			return (orig_mp);
346 		}
347 
348 		mp = pull_mp;
349 		limit_retry = B_TRUE;
350 		goto retry;
351 	}
352 
353 	/*
354 	 * Before filling the tx descriptor ring with the data, we need to
355 	 * ensure there are adequate free descriptors for transmit
356 	 * (including one context descriptor).
357 	 * Do not use up all the tx descriptors.
358 	 * Otherwise tx recycle will fail and cause false hang.
359 	 */
360 	if (tx_ring->tbd_free <= (desc_total + 1)) {
361 		tx_ring->tx_recycle(tx_ring);
362 	}
363 
364 	mutex_enter(&tx_ring->tx_lock);
365 	/*
366 	 * If the number of free tx descriptors is not enough for transmit
367 	 * then return mp.
368 	 *
369 	 * Note: we must put this check under the mutex protection to
370 	 * ensure the correctness when multiple threads access it in
371 	 * parallel.
372 	 */
373 	if (tx_ring->tbd_free <= (desc_total + 1)) {
374 		tx_ring->stat_fail_no_tbd++;
375 		mutex_exit(&tx_ring->tx_lock);
376 		goto tx_failure;
377 	}
378 
379 	/*
380 	 * Attach the mblk_t we've setup to the last control block.
381 	 * This is only done once we know there are enough free descriptors
382 	 * to transmit so that the cleanup in tx_failure doesn't try to
383 	 * call freemsg() on mp (since we will want to return it).
384 	 */
385 	tcb->mp = (pull_mp != NULL) ? pull_mp : orig_mp;
386 
387 	desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
388 	    mbsize);
389 
390 	ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));
391 
392 	tx_ring->stat_obytes += mbsize;
393 	tx_ring->stat_opackets++;
394 
395 	mutex_exit(&tx_ring->tx_lock);
396 
397 	/*
398 	 * Now that tx is done, if we pulled up the original message, we
399 	 * can free the original message since it is no longer being
400 	 * used.
401 	 */
402 	if (pull_mp != NULL) {
403 		freemsg(orig_mp);
404 	}
405 
406 	return (NULL);
407 
408 tx_failure:
409 	/*
410 	 * If transmission fails, need to free the pulling up mblk.
411 	 */
412 	if (pull_mp) {
413 		freemsg(pull_mp);
414 	}
415 
416 	/*
417 	 * tcb->mp should not be set until we know we can transmit (see above),
418 	 * so it should always be NULL if we get here.
419 	 */
420 	VERIFY3P(tcb->mp, ==, NULL);
421 
422 	/*
423 	 * Return the tx control blocks in the pending list to the free list.
424 	 */
425 	ixgbe_put_free_list(tx_ring, &pending_list);
426 
427 	/* Transmit failed, do not drop the mblk, rechedule the transmit */
428 	tx_ring->reschedule = B_TRUE;
429 
430 	return (orig_mp);
431 }
432 
433 /*
434  * ixgbe_tx_copy
435  *
436  * Copy the mblk fragment to the pre-allocated tx buffer. Return -1 on error,
437  * otherwise return the number of descriptors we've completed in this call.
438  */
439 static int
440 ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
441     link_list_t *pending_list, const void *buf, size_t len)
442 {
443 	tx_control_block_t *tcb = *tcbp;
444 	dma_buffer_t *tx_buf;
445 	uint32_t desc_num = 0;
446 
447 	/*
448 	 * We need a new tcb -- either the current one (tcb) is NULL because
449 	 * we just started, tcb is being used for DMA, or tcb isn't large enough
450 	 * to hold the contents we need to copy.
451 	 */
452 	if (tcb == NULL || tcb->tx_type == USE_DMA ||
453 	    tcb->tx_buf.len + len > tcb->tx_buf.size) {
454 		tx_control_block_t *newtcb;
455 
456 		newtcb = ixgbe_get_free_list(tx_ring, pending_list);
457 		if (newtcb == NULL)
458 			return (-1);
459 
460 		newtcb->tx_type = USE_COPY;
461 
462 		if (tcb != NULL)
463 			desc_num += ixgbe_tcb_done(tcb);
464 		*tcbp = tcb = newtcb;
465 	}
466 
467 	ASSERT3S(tcb->tx_type, ==, USE_COPY);
468 	tx_buf = &tcb->tx_buf;
469 
470 	/*
471 	 * Copy the packet data of the mblk fragment into the
472 	 * pre-allocated tx buffer, which is maintained by the
473 	 * tx control block.
474 	 *
475 	 * Several mblk fragments can be copied into one tx buffer.
476 	 * The destination address of the current copied fragment in
477 	 * the tx buffer is next to the end of the previous copied
478 	 * fragment.
479 	 */
480 	if (len > 0) {
481 		bcopy(buf, tx_buf->address + tx_buf->len, len);
482 
483 		tx_buf->len += len;
484 		tcb->frag_num++;
485 	}
486 
487 	return (desc_num);
488 }
489 
490 /*
491  * ixgbe_tx_bind
492  *
493  * Bind the mblk fragment with DMA. Returns -1 on error, otherwise it
494  * returns the number of descriptors completed in this call. This count
495  * can include descriptors that weren't filled in by the current call to
496  * ixgbe_tx_bind() but were being used (but not yet completed) in previous
497  * calls to ixgbe_tx_bind() or ixgbe_tx_copy().
498  */
499 static int
500 ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t **tcbp,
501     link_list_t *pending_list, uint8_t *buf, size_t len)
502 {
503 	tx_control_block_t *tcb = NULL;
504 	uint_t desc_num = 0;
505 	int status;
506 
507 	tcb = ixgbe_get_free_list(tx_ring, pending_list);
508 	if (tcb == NULL)
509 		return (-1);
510 
511 	/*
512 	 * Use DMA binding to process the mblk fragment
513 	 */
514 	status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
515 	    (caddr_t)buf, len,
516 	    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
517 	    0, NULL, NULL);
518 
519 	if (status != DDI_DMA_MAPPED) {
520 		tx_ring->stat_fail_dma_bind++;
521 		return (-1);
522 	}
523 
524 	tcb->frag_num++;
525 	tcb->tx_type = USE_DMA;
526 
527 	/*
528 	 * If there was an old tcb, we're about to replace it. Finish
529 	 * setting up the old tcb so we can replace it with the new one.
530 	 */
531 	if (*tcbp != NULL)
532 		desc_num += ixgbe_tcb_done(*tcbp);
533 
534 	*tcbp = tcb;
535 	return (desc_num);
536 }
537 
538 /*
539  * Once we're done populating a tcb (either by binding or copying into
540  * a buffer in the tcb), get it ready for tx and return the number of
541  * descriptors used.
542  */
543 static uint_t
544 ixgbe_tcb_done(tx_control_block_t *tcb)
545 {
546 	uint_t desc_num = 0;
547 
548 	if (tcb->tx_type == USE_DMA) {
549 		const ddi_dma_cookie_t *c;
550 
551 		for (c = ddi_dma_cookie_iter(tcb->tx_dma_handle, NULL);
552 		    c != NULL;
553 		    c = ddi_dma_cookie_iter(tcb->tx_dma_handle, c)) {
554 			/*
555 			 * Save the address and length to the private data
556 			 * structure of the tx control block, which will be
557 			 * used to fill the tx descriptor ring after all the
558 			 * fragments are processed.
559 			 */
560 			ixgbe_save_desc(tcb, c->dmac_laddress, c->dmac_size);
561 			desc_num++;
562 		}
563 	} else if (tcb->tx_type == USE_COPY) {
564 		dma_buffer_t *tx_buf = &tcb->tx_buf;
565 
566 		DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);
567 		ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);
568 		desc_num++;
569 	} else {
570 		panic("invalid tcb type");
571 	}
572 
573 	return (desc_num);
574 }
575 
/*
 * ixgbe_get_context
 *
 * Parse the checksum/LSO offload metadata attached to the mblk, along with
 * the packet headers themselves, filling in 'ctx' for later use when
 * deciding whether to build a context descriptor.
 *
 * Returns 0 on success (including the no-offload case where hckflags is 0),
 * -1 if LSO was requested without the h/w checksum flags it requires, and
 * -2 if the ethertype is neither IPv4 nor IPv6.
 *
 * Side effect: for LSO packets, this zeroes the IP total-length field and
 * (for IPv4) the IP header checksum field in the packet in place.
 */
static int
ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
{
	uint32_t start;
	uint32_t hckflags;
	uint32_t lsoflags;
	uint32_t lsocksum;
	uint32_t mss;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
	unsigned char *pos;
	ushort_t etype;
	uint32_t mac_hdr_len;
	uint32_t l4_proto;
	uint32_t l4_hdr_len;

	ASSERT(mp != NULL);

	/* Pull the checksum offload request (if any) off the mblk. */
	mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
	bzero(ctx, sizeof (ixgbe_tx_context_t));

	/* No offload requested -- leave ctx zeroed. */
	if (hckflags == 0) {
		return (0);
	}

	ctx->hcksum_flags = hckflags;

	mac_lso_get(mp, &mss, &lsoflags);
	ctx->mss = mss;
	ctx->lso_flag = (lsoflags == HW_LSO);

	etype = 0;
	mac_hdr_len = 0;
	l4_proto = 0;

	/*
	 * Firstly get the position of the ether_type/ether_tpid.
	 * Here we don't assume the ether (VLAN) header is fully included
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 *
	 * Throughout this function, 'size' is the cumulative byte count
	 * of all fragments seen so far, 'len' is the length of the
	 * current fragment, and 'offset' is the absolute byte offset of
	 * the field being sought; the loops advance to the fragment that
	 * contains 'offset', and 'pos' is then its address within it.
	 */
	size = len = MBLKL(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;

	etype = ntohs(*(ushort_t *)(uintptr_t)pos);
	if (etype == ETHERTYPE_VLAN) {
		/*
		 * Get the position of the ether_type in VLAN header
		 */
		offset = offsetof(struct ether_vlan_header, ether_type);
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		etype = ntohs(*(ushort_t *)(uintptr_t)pos);
		mac_hdr_len = sizeof (struct ether_vlan_header);
	} else {
		mac_hdr_len = sizeof (struct ether_header);
	}

	/*
	 * Here we don't assume the IP(V6) header is fully included in
	 * one mblk fragment.
	 */
	lsocksum = HCK_PARTIALCKSUM;
	ctx->l3_proto = etype;
	switch (etype) {
	case ETHERTYPE_IP:
		if (ctx->lso_flag) {
			/*
			 * For LSO the hardware recomputes the IP total
			 * length per segment, so zero it in the packet.
			 */
			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;

			/* Likewise zero the IPv4 header checksum. */
			offset = offsetof(ipha_t, ipha_hdr_checksum) +
			    mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;

			/*
			 * To perform ixgbe LSO, here also need to fill
			 * the tcp checksum field of the packet with the
			 * following pseudo-header checksum:
			 * (ip_source_addr, ip_destination_addr, l4_proto)
			 * Currently the tcp/ip stack has done it.
			 */
			lsocksum |= HCK_IPV4_HDRCKSUM;
		}

		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	case ETHERTYPE_IPV6:
		/*
		 * We need to zero out the length in the header.
		 */
		if (ctx->lso_flag) {
			offset = offsetof(ip6_t, ip6_plen) + mac_hdr_len;
			while (size <= offset) {
				mp = mp->b_cont;
				ASSERT(mp != NULL);
				len = MBLKL(mp);
				size += len;
			}
			pos = mp->b_rptr + offset + len - size;
			*((uint16_t *)(uintptr_t)(pos)) = 0;
		}

		/*
		 * NOTE(review): this reads ip6_nxt directly, assuming no
		 * IPv6 extension headers precede the L4 header -- confirm
		 * that upper layers never request offload on packets with
		 * extension headers.
		 */
		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_proto = *(uint8_t *)pos;
		break;
	default:
		/* Unrecoverable error */
		IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
		return (-2);
	}

	if (ctx->lso_flag) {
		/*
		 * LSO relies on tx h/w checksum, so here will drop the packet
		 * if h/w checksum flag is not declared.
		 */
		if ((ctx->hcksum_flags & lsocksum) != lsocksum) {
			IXGBE_DEBUGLOG_2(NULL, "ixgbe_tx: h/w checksum flags "
			    "are not set for LSO, found 0x%x, needed bits 0x%x",
			    ctx->hcksum_flags, lsocksum);
			return (-1);
		}


		/*
		 * Locate the L4 header ('start' is the checksum start
		 * offset relative to the IP header) and take its length.
		 * NOTE(review): TCP_HDR_LENGTH() is applied regardless of
		 * l4_proto -- presumably LSO is only ever requested for
		 * TCP; confirm against mac_lso_get() callers.
		 */
		offset = mac_hdr_len + start;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;

		l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
	} else {
		/*
		 * l4 header length is only required for LSO
		 */
		l4_hdr_len = 0;
	}

	ctx->mac_hdr_len = mac_hdr_len;
	/* checksum start offset -- assumed to equal the IP header length */
	ctx->ip_hdr_len = start;
	ctx->l4_proto = l4_proto;
	ctx->l4_hdr_len = l4_hdr_len;

	return (0);
}
774 
775 /*
776  * ixgbe_check_context
777  *
778  * Check if a new context descriptor is needed
779  */
780 static boolean_t
781 ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
782 {
783 	ixgbe_tx_context_t *last;
784 
785 	if (ctx == NULL)
786 		return (B_FALSE);
787 
788 	/*
789 	 * Compare the context data retrieved from the mblk and the
790 	 * stored data of the last context descriptor. The data need
791 	 * to be checked are:
792 	 *	hcksum_flags
793 	 *	l4_proto
794 	 *	mac_hdr_len
795 	 *	ip_hdr_len
796 	 *	lso_flag
797 	 *	mss (only checked for LSO)
798 	 *	l4_hr_len (only checked for LSO)
799 	 * Either one of the above data is changed, a new context descriptor
800 	 * will be needed.
801 	 */
802 	last = &tx_ring->tx_context;
803 
804 	if ((ctx->hcksum_flags != last->hcksum_flags) ||
805 	    (ctx->l4_proto != last->l4_proto) ||
806 	    (ctx->l3_proto != last->l3_proto) ||
807 	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
808 	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
809 	    (ctx->lso_flag != last->lso_flag) ||
810 	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
811 	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
812 		return (B_TRUE);
813 	}
814 
815 	return (B_FALSE);
816 }
817 
818 /*
819  * ixgbe_fill_context
820  *
821  * Fill the context descriptor with hardware checksum informations
822  */
823 static void
824 ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
825     ixgbe_tx_context_t *ctx)
826 {
827 	/*
828 	 * Fill the context descriptor with the checksum
829 	 * context information we've got.
830 	 */
831 	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
832 	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
833 	    IXGBE_ADVTXD_MACLEN_SHIFT;
834 
835 	ctx_tbd->type_tucmd_mlhl =
836 	    IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;
837 	/*
838 	 * When we have a TX context set up, we enforce that the ethertype is
839 	 * either IPv4 or IPv6 in ixgbe_get_tx_context().
840 	 */
841 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_IPV4_HDRCKSUM) {
842 		if (ctx->l3_proto == ETHERTYPE_IP) {
843 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;
844 		} else {
845 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV6;
846 		}
847 	}
848 
849 	if (ctx->lso_flag || ctx->hcksum_flags & HCK_PARTIALCKSUM) {
850 		switch (ctx->l4_proto) {
851 		case IPPROTO_TCP:
852 			ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
853 			break;
854 		case IPPROTO_UDP:
855 			/*
856 			 * We don't have to explicitly set:
857 			 *	ctx_tbd->type_tucmd_mlhl |=
858 			 *	    IXGBE_ADVTXD_TUCMD_L4T_UDP;
859 			 * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
860 			 */
861 			break;
862 		default:
863 			/* Unrecoverable error */
864 			IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
865 			break;
866 		}
867 	}
868 
869 	ctx_tbd->seqnum_seed = 0;
870 
871 	if (ctx->lso_flag) {
872 		ctx_tbd->mss_l4len_idx =
873 		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
874 		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
875 	} else {
876 		ctx_tbd->mss_l4len_idx = 0;
877 	}
878 }
879 
880 /*
881  * ixgbe_tx_fill_ring
882  *
883  * Fill the tx descriptor ring with the data
884  */
885 static int
886 ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
887     ixgbe_tx_context_t *ctx, size_t mbsize)
888 {
889 	struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
890 	boolean_t load_context;
891 	uint32_t index, tcb_index, desc_num;
892 	union ixgbe_adv_tx_desc *tbd, *first_tbd;
893 	tx_control_block_t *tcb, *first_tcb;
894 	uint32_t hcksum_flags;
895 	int i;
896 
897 	ASSERT(mutex_owned(&tx_ring->tx_lock));
898 
899 	tbd = NULL;
900 	first_tbd = NULL;
901 	first_tcb = NULL;
902 	desc_num = 0;
903 	hcksum_flags = 0;
904 	load_context = B_FALSE;
905 
906 	/*
907 	 * Get the index of the first tx descriptor that will be filled,
908 	 * and the index of the first work list item that will be attached
909 	 * with the first used tx control block in the pending list.
910 	 * Note: the two indexes are the same.
911 	 */
912 	index = tx_ring->tbd_tail;
913 	tcb_index = tx_ring->tbd_tail;
914 
915 	if (ctx != NULL) {
916 		hcksum_flags = ctx->hcksum_flags;
917 
918 		/*
919 		 * Check if a new context descriptor is needed for this packet
920 		 */
921 		load_context = ixgbe_check_context(tx_ring, ctx);
922 
923 		if (load_context) {
924 			tbd = &tx_ring->tbd_ring[index];
925 
926 			/*
927 			 * Fill the context descriptor with the
928 			 * hardware checksum offload informations.
929 			 */
930 			ixgbe_fill_context(
931 			    (struct ixgbe_adv_tx_context_desc *)tbd, ctx);
932 
933 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
934 			desc_num++;
935 
936 			/*
937 			 * Store the checksum context data if
938 			 * a new context descriptor is added
939 			 */
940 			tx_ring->tx_context = *ctx;
941 		}
942 	}
943 
944 	first_tbd = &tx_ring->tbd_ring[index];
945 
946 	/*
947 	 * Fill tx data descriptors with the data saved in the pending list.
948 	 * The tx control blocks in the pending list are added to the work list
949 	 * at the same time.
950 	 *
951 	 * The work list is strictly 1:1 corresponding to the descriptor ring.
952 	 * One item of the work list corresponds to one tx descriptor. Because
953 	 * one tx control block can span multiple tx descriptors, the tx
954 	 * control block will be added to the first work list item that
955 	 * corresponds to the first tx descriptor generated from that tx
956 	 * control block.
957 	 */
958 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
959 	first_tcb = tcb;
960 	while (tcb != NULL) {
961 
962 		for (i = 0; i < tcb->desc_num; i++) {
963 			tbd = &tx_ring->tbd_ring[index];
964 
965 			tbd->read.buffer_addr = tcb->desc[i].address;
966 			tbd->read.cmd_type_len = tcb->desc[i].length;
967 
968 			tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
969 			    | IXGBE_ADVTXD_DTYP_DATA;
970 
971 			tbd->read.olinfo_status = 0;
972 
973 			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
974 			desc_num++;
975 		}
976 
977 		/*
978 		 * Add the tx control block to the work list
979 		 */
980 		ASSERT(tx_ring->work_list[tcb_index] == NULL);
981 		tx_ring->work_list[tcb_index] = tcb;
982 
983 		tcb_index = index;
984 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
985 	}
986 
987 	if (load_context) {
988 		/*
989 		 * Count the context descriptor for
990 		 * the first tx control block.
991 		 */
992 		first_tcb->desc_num++;
993 	}
994 	first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);
995 
996 	/*
997 	 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
998 	 * valid in the first descriptor of the packet.
999 	 * Setting paylen in every first_tbd for all parts.
1000 	 * 82599, X540 and X550 require the packet length in paylen field
1001 	 * with or without LSO and 82598 will ignore it in non-LSO mode.
1002 	 */
1003 	ASSERT(first_tbd != NULL);
1004 	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
1005 
1006 	switch (hw->mac.type) {
1007 	case ixgbe_mac_82598EB:
1008 		if (ctx != NULL && ctx->lso_flag) {
1009 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1010 			first_tbd->read.olinfo_status |=
1011 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1012 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1013 		}
1014 		break;
1015 
1016 	case ixgbe_mac_82599EB:
1017 	case ixgbe_mac_X540:
1018 	case ixgbe_mac_X550:
1019 	case ixgbe_mac_X550EM_x:
1020 	case ixgbe_mac_X550EM_a:
1021 		if (ctx != NULL && ctx->lso_flag) {
1022 			first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
1023 			first_tbd->read.olinfo_status |=
1024 			    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
1025 			    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
1026 		} else {
1027 			first_tbd->read.olinfo_status |=
1028 			    (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
1029 		}
1030 		break;
1031 
1032 	default:
1033 		break;
1034 	}
1035 
1036 	/* Set hardware checksum bits */
1037 	if (hcksum_flags != 0) {
1038 		if (hcksum_flags & HCK_IPV4_HDRCKSUM)
1039 			first_tbd->read.olinfo_status |=
1040 			    IXGBE_ADVTXD_POPTS_IXSM;
1041 		if (hcksum_flags & HCK_PARTIALCKSUM)
1042 			first_tbd->read.olinfo_status |=
1043 			    IXGBE_ADVTXD_POPTS_TXSM;
1044 	}
1045 
1046 	/*
1047 	 * The last descriptor of packet needs End Of Packet (EOP),
1048 	 * and Report Status (RS) bits set
1049 	 */
1050 	ASSERT(tbd != NULL);
1051 	tbd->read.cmd_type_len |=
1052 	    IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;
1053 
1054 	/*
1055 	 * Sync the DMA buffer of the tx descriptor ring
1056 	 */
1057 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);
1058 
1059 	/*
1060 	 * Update the number of the free tx descriptors.
1061 	 * The mutual exclusion between the transmission and the recycling
1062 	 * (for the tx descriptor ring and the work list) is implemented
1063 	 * with the atomic operation on the number of the free tx descriptors.
1064 	 *
1065 	 * Note: we should always decrement the counter tbd_free before
1066 	 * advancing the hardware TDT pointer to avoid the race condition -
1067 	 * before the counter tbd_free is decremented, the transmit of the
1068 	 * tx descriptors has done and the counter tbd_free is increased by
1069 	 * the tx recycling.
1070 	 */
1071 	i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
1072 	ASSERT(i >= 0);
1073 
1074 	tx_ring->tbd_tail = index;
1075 
1076 	/*
1077 	 * Advance the hardware TDT pointer of the tx descriptor ring
1078 	 */
1079 	IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);
1080 
1081 	if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
1082 	    DDI_FM_OK) {
1083 		ddi_fm_service_impact(tx_ring->ixgbe->dip,
1084 		    DDI_SERVICE_DEGRADED);
1085 		atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
1086 	}
1087 
1088 	return (desc_num);
1089 }
1090 
1091 /*
1092  * ixgbe_save_desc
1093  *
1094  * Save the address/length pair to the private array
1095  * of the tx control block. The address/length pairs
1096  * will be filled into the tx descriptor ring later.
1097  */
1098 static void
1099 ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
1100 {
1101 	sw_desc_t *desc;
1102 
1103 	desc = &tcb->desc[tcb->desc_num];
1104 	desc->address = address;
1105 	desc->length = length;
1106 
1107 	tcb->desc_num++;
1108 }
1109 
1110 /*
1111  * ixgbe_tx_recycle_legacy
1112  *
1113  * Recycle the tx descriptors and tx control blocks.
1114  *
1115  * The work list is traversed to check if the corresponding
1116  * tx descriptors have been transmitted. If so, the resources
1117  * bound to the tx control blocks will be freed, and those
1118  * tx control blocks will be returned to the free list.
1119  */
1120 uint32_t
1121 ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
1122 {
1123 	uint32_t index, last_index, prev_index;
1124 	int desc_num;
1125 	boolean_t desc_done;
1126 	tx_control_block_t *tcb;
1127 	link_list_t pending_list;
1128 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1129 
1130 	mutex_enter(&tx_ring->recycle_lock);
1131 
1132 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1133 
1134 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1135 		tx_ring->recycle_fail = 0;
1136 		tx_ring->stall_watchdog = 0;
1137 		if (tx_ring->reschedule) {
1138 			tx_ring->reschedule = B_FALSE;
1139 			mac_tx_ring_update(ixgbe->mac_hdl,
1140 			    tx_ring->ring_handle);
1141 		}
1142 		mutex_exit(&tx_ring->recycle_lock);
1143 		return (0);
1144 	}
1145 
1146 	/*
1147 	 * Sync the DMA buffer of the tx descriptor ring
1148 	 */
1149 	DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1150 
1151 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1152 		mutex_exit(&tx_ring->recycle_lock);
1153 		ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
1154 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1155 		return (0);
1156 	}
1157 
1158 	LINK_LIST_INIT(&pending_list);
1159 	desc_num = 0;
1160 	index = tx_ring->tbd_head;	/* Index of next tbd/tcb to recycle */
1161 
1162 	tcb = tx_ring->work_list[index];
1163 	ASSERT(tcb != NULL);
1164 
1165 	while (tcb != NULL) {
1166 		/*
1167 		 * Get the last tx descriptor of this packet.
1168 		 * If the last tx descriptor is done, then
1169 		 * we can recycle all descriptors of a packet
1170 		 * which usually includes several tx control blocks.
1171 		 * For 82599, LSO descriptors can not be recycled
1172 		 * unless the whole packet's transmission is done.
1173 		 * That's why packet level recycling is used here.
1174 		 * For 82598, there's not such limit.
1175 		 */
1176 		last_index = tcb->last_index;
1177 		/*
1178 		 * MAX_TX_RING_SIZE is used to judge whether
1179 		 * the index is a valid value or not.
1180 		 */
1181 		if (last_index == MAX_TX_RING_SIZE)
1182 			break;
1183 
1184 		/*
1185 		 * Check if the Descriptor Done bit is set
1186 		 */
1187 		desc_done = tx_ring->tbd_ring[last_index].wb.status &
1188 		    IXGBE_TXD_STAT_DD;
1189 		if (desc_done) {
1190 			/*
1191 			 * recycle all descriptors of the packet
1192 			 */
1193 			while (tcb != NULL) {
1194 				/*
1195 				 * Strip off the tx control block from
1196 				 * the work list, and add it to the
1197 				 * pending list.
1198 				 */
1199 				tx_ring->work_list[index] = NULL;
1200 				LIST_PUSH_TAIL(&pending_list, &tcb->link);
1201 
1202 				/*
1203 				 * Count the total number of the tx
1204 				 * descriptors recycled
1205 				 */
1206 				desc_num += tcb->desc_num;
1207 
1208 				index = NEXT_INDEX(index, tcb->desc_num,
1209 				    tx_ring->ring_size);
1210 
1211 				tcb = tx_ring->work_list[index];
1212 
1213 				prev_index = PREV_INDEX(index, 1,
1214 				    tx_ring->ring_size);
1215 				if (prev_index == last_index)
1216 					break;
1217 			}
1218 		} else {
1219 			break;
1220 		}
1221 	}
1222 
1223 	/*
1224 	 * If no tx descriptors are recycled, no need to do more processing
1225 	 */
1226 	if (desc_num == 0) {
1227 		tx_ring->recycle_fail++;
1228 		mutex_exit(&tx_ring->recycle_lock);
1229 		return (0);
1230 	}
1231 
1232 	tx_ring->recycle_fail = 0;
1233 	tx_ring->stall_watchdog = 0;
1234 
1235 	/*
1236 	 * Update the head index of the tx descriptor ring
1237 	 */
1238 	tx_ring->tbd_head = index;
1239 
1240 	/*
1241 	 * Update the number of the free tx descriptors with atomic operations
1242 	 */
1243 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1244 
1245 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1246 	    (tx_ring->reschedule)) {
1247 		tx_ring->reschedule = B_FALSE;
1248 		mac_tx_ring_update(ixgbe->mac_hdl,
1249 		    tx_ring->ring_handle);
1250 	}
1251 	mutex_exit(&tx_ring->recycle_lock);
1252 
1253 	/*
1254 	 * Add the tx control blocks in the pending list to the free list.
1255 	 */
1256 	ixgbe_put_free_list(tx_ring, &pending_list);
1257 
1258 	return (desc_num);
1259 }
1260 
1261 /*
1262  * ixgbe_tx_recycle_head_wb
1263  *
1264  * Check the head write-back, and recycle all the transmitted
1265  * tx descriptors and tx control blocks.
1266  */
1267 uint32_t
1268 ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
1269 {
1270 	uint32_t index;
1271 	uint32_t head_wb;
1272 	int desc_num;
1273 	tx_control_block_t *tcb;
1274 	link_list_t pending_list;
1275 	ixgbe_t *ixgbe = tx_ring->ixgbe;
1276 
1277 	mutex_enter(&tx_ring->recycle_lock);
1278 
1279 	ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);
1280 
1281 	if (tx_ring->tbd_free == tx_ring->ring_size) {
1282 		tx_ring->recycle_fail = 0;
1283 		tx_ring->stall_watchdog = 0;
1284 		if (tx_ring->reschedule) {
1285 			tx_ring->reschedule = B_FALSE;
1286 			mac_tx_ring_update(ixgbe->mac_hdl,
1287 			    tx_ring->ring_handle);
1288 		}
1289 		mutex_exit(&tx_ring->recycle_lock);
1290 		return (0);
1291 	}
1292 
1293 	/*
1294 	 * Sync the DMA buffer of the tx descriptor ring
1295 	 *
1296 	 * Note: For head write-back mode, the tx descriptors will not
1297 	 * be written back, but the head write-back value is stored at
1298 	 * the last extra tbd at the end of the DMA area, we still need
1299 	 * to sync the head write-back value for kernel.
1300 	 *
1301 	 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1302 	 */
1303 	(void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
1304 	    sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
1305 	    sizeof (uint32_t),
1306 	    DDI_DMA_SYNC_FORKERNEL);
1307 
1308 	if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
1309 		mutex_exit(&tx_ring->recycle_lock);
1310 		ddi_fm_service_impact(ixgbe->dip,
1311 		    DDI_SERVICE_DEGRADED);
1312 		atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
1313 		return (0);
1314 	}
1315 
1316 	LINK_LIST_INIT(&pending_list);
1317 	desc_num = 0;
1318 	index = tx_ring->tbd_head;	/* Next index to clean */
1319 
1320 	/*
1321 	 * Get the value of head write-back
1322 	 */
1323 	head_wb = *tx_ring->tbd_head_wb;
1324 	while (index != head_wb) {
1325 		tcb = tx_ring->work_list[index];
1326 		ASSERT(tcb != NULL);
1327 
1328 		if (OFFSET(index, head_wb, tx_ring->ring_size) <
1329 		    tcb->desc_num) {
1330 			/*
1331 			 * The current tx control block is not
1332 			 * completely transmitted, stop recycling
1333 			 */
1334 			break;
1335 		}
1336 
1337 		/*
1338 		 * Strip off the tx control block from the work list,
1339 		 * and add it to the pending list.
1340 		 */
1341 		tx_ring->work_list[index] = NULL;
1342 		LIST_PUSH_TAIL(&pending_list, &tcb->link);
1343 
1344 		/*
1345 		 * Advance the index of the tx descriptor ring
1346 		 */
1347 		index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);
1348 
1349 		/*
1350 		 * Count the total number of the tx descriptors recycled
1351 		 */
1352 		desc_num += tcb->desc_num;
1353 	}
1354 
1355 	/*
1356 	 * If no tx descriptors are recycled, no need to do more processing
1357 	 */
1358 	if (desc_num == 0) {
1359 		tx_ring->recycle_fail++;
1360 		mutex_exit(&tx_ring->recycle_lock);
1361 		return (0);
1362 	}
1363 
1364 	tx_ring->recycle_fail = 0;
1365 	tx_ring->stall_watchdog = 0;
1366 
1367 	/*
1368 	 * Update the head index of the tx descriptor ring
1369 	 */
1370 	tx_ring->tbd_head = index;
1371 
1372 	/*
1373 	 * Update the number of the free tx descriptors with atomic operations
1374 	 */
1375 	atomic_add_32(&tx_ring->tbd_free, desc_num);
1376 
1377 	if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
1378 	    (tx_ring->reschedule)) {
1379 		tx_ring->reschedule = B_FALSE;
1380 		mac_tx_ring_update(ixgbe->mac_hdl,
1381 		    tx_ring->ring_handle);
1382 	}
1383 	mutex_exit(&tx_ring->recycle_lock);
1384 
1385 	/*
1386 	 * Add the tx control blocks in the pending list to the free list.
1387 	 */
1388 	ixgbe_put_free_list(tx_ring, &pending_list);
1389 
1390 	return (desc_num);
1391 }
1392 
1393 /*
1394  * ixgbe_free_tcb - free up the tx control block
1395  *
1396  * Free the resources of the tx control block, including
1397  * unbind the previously bound DMA handle, and reset other
1398  * control fields.
1399  */
1400 void
1401 ixgbe_free_tcb(tx_control_block_t *tcb)
1402 {
1403 	if (tcb == NULL)
1404 		return;
1405 
1406 	switch (tcb->tx_type) {
1407 	case USE_COPY:
1408 		/*
1409 		 * Reset the buffer length that is used for copy
1410 		 */
1411 		tcb->tx_buf.len = 0;
1412 		break;
1413 	case USE_DMA:
1414 		/*
1415 		 * Release the DMA resource that is used for
1416 		 * DMA binding.
1417 		 */
1418 		(void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
1419 		break;
1420 	default:
1421 		break;
1422 	}
1423 
1424 	/*
1425 	 * Free the mblk
1426 	 */
1427 	if (tcb->mp != NULL) {
1428 		freemsg(tcb->mp);
1429 		tcb->mp = NULL;
1430 	}
1431 
1432 	tcb->tx_type = USE_NONE;
1433 	tcb->last_index = MAX_TX_RING_SIZE;
1434 	tcb->frag_num = 0;
1435 	tcb->desc_num = 0;
1436 }
1437 
1438 /*
1439  * ixgbe_get_free_list - Get a free tx control block from the free list.
1440  * Returns the tx control block and appends it to list.
1441  *
1442  * The atomic operation on the number of the available tx control block
1443  * in the free list is used to keep this routine mutual exclusive with
1444  * the routine ixgbe_put_check_list.
1445  */
1446 static tx_control_block_t *
1447 ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *list)
1448 {
1449 	tx_control_block_t *tcb;
1450 
1451 	/*
1452 	 * Check and update the number of the free tx control block
1453 	 * in the free list.
1454 	 */
1455 	if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0) {
1456 		tx_ring->stat_fail_no_tcb++;
1457 		return (NULL);
1458 	}
1459 
1460 	mutex_enter(&tx_ring->tcb_head_lock);
1461 
1462 	tcb = tx_ring->free_list[tx_ring->tcb_head];
1463 	ASSERT(tcb != NULL);
1464 	tx_ring->free_list[tx_ring->tcb_head] = NULL;
1465 	tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
1466 	    tx_ring->free_list_size);
1467 
1468 	mutex_exit(&tx_ring->tcb_head_lock);
1469 
1470 	LIST_PUSH_TAIL(list, &tcb->link);
1471 	return (tcb);
1472 }
1473 
1474 /*
1475  * ixgbe_put_free_list
1476  *
1477  * Put a list of used tx control blocks back to the free list
1478  *
1479  * A mutex is used here to ensure the serialization. The mutual exclusion
1480  * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
1481  * the atomic operation on the counter tcb_free.
1482  */
1483 void
1484 ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
1485 {
1486 	uint32_t index;
1487 	int tcb_num;
1488 	tx_control_block_t *tcb;
1489 
1490 	for (tcb = (tx_control_block_t *)LIST_GET_HEAD(pending_list);
1491 	    tcb != NULL;
1492 	    tcb = (tx_control_block_t *)LIST_GET_NEXT(pending_list, tcb)) {
1493 		/*
1494 		 * Despite the name, ixgbe_free_tcb() just releases the
1495 		 * resources in tcb, but does not free tcb itself.
1496 		 */
1497 		ixgbe_free_tcb(tcb);
1498 	}
1499 
1500 	mutex_enter(&tx_ring->tcb_tail_lock);
1501 
1502 	index = tx_ring->tcb_tail;
1503 
1504 	tcb_num = 0;
1505 	tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1506 	while (tcb != NULL) {
1507 		ASSERT(tx_ring->free_list[index] == NULL);
1508 		tx_ring->free_list[index] = tcb;
1509 
1510 		tcb_num++;
1511 
1512 		index = NEXT_INDEX(index, 1, tx_ring->free_list_size);
1513 
1514 		tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
1515 	}
1516 
1517 	tx_ring->tcb_tail = index;
1518 
1519 	/*
1520 	 * Update the number of the free tx control block
1521 	 * in the free list. This operation must be placed
1522 	 * under the protection of the lock.
1523 	 */
1524 	atomic_add_32(&tx_ring->tcb_free, tcb_num);
1525 
1526 	mutex_exit(&tx_ring->tcb_tail_lock);
1527 }
1528