1/*
2 * Copyright (c) 2008-2016 Solarflare Communications Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 *    this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright notice,
11 *    this list of conditions and the following disclaimer in the documentation
12 *    and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *
26 * The views and conclusions contained in the software and documentation are
27 * those of the authors and should not be interpreted as representing official
28 * policies, either expressed or implied, of the FreeBSD Project.
29 */
30
31#include <sys/types.h>
32#include <sys/sysmacros.h>
33#include <sys/ddi.h>
34#include <sys/sunddi.h>
35#include <sys/atomic.h>
36#include <sys/stream.h>
37#include <sys/strsun.h>
38#include <sys/strsubr.h>
39#include <sys/pattr.h>
40#include <sys/cpu.h>
41
42#include <sys/ethernet.h>
43#include <inet/ip.h>
44
45#include <netinet/in.h>
46#include <netinet/ip.h>
47#include <netinet/tcp.h>
48
49#include "sfxge.h"
50
51#include "efx.h"
52
53/* TXQ flush response timeout (in microseconds) */
54#define	SFXGE_TX_QFLUSH_USEC	(2000000)
55
56/* See sfxge.conf.private for descriptions */
57#define	SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT 4096
58#define	SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT 256
59
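/*
 * Each TXQ keeps a deferred packet list (DPL) with two parts: a "get" list
 * that is only manipulated under the TXQ lock, and a "put" list that other
 * threads may append to atomically without taking the lock (see
 * sfxge_tx_qdpl_add() and sfxge_tx_qdpl_swizzle() below).  The limits above
 * bound the length of each list before senders see ENOSPC backpressure.
 *
 * Both limits can be tuned from the driver configuration file; a sketch of
 * the corresponding properties (illustrative values only):
 *
 *	tx_dpl_get_pkt_limit=4096;
 *	tx_dpl_put_pkt_limit=256;
 *
 * The properties are read in sfxge_tx_qinit() via ddi_prop_get_int().
 */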
60
61/* Transmit buffer DMA attributes */
62static ddi_device_acc_attr_t sfxge_tx_buffer_devacc = {
63
64	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
65	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
66	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
67};
68
69static ddi_dma_attr_t sfxge_tx_buffer_dma_attr = {
70	DMA_ATTR_V0,		/* dma_attr_version	*/
71	0,			/* dma_attr_addr_lo	*/
72	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
73	0xffffffffffffffffull,	/* dma_attr_count_max	*/
74	SFXGE_TX_BUFFER_SIZE,	/* dma_attr_align	*/
75	0xffffffff,		/* dma_attr_burstsizes	*/
76	1,			/* dma_attr_minxfer	*/
77	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
78	0xffffffffffffffffull,	/* dma_attr_seg		*/
79	1,			/* dma_attr_sgllen	*/
80	1,			/* dma_attr_granular	*/
81	0			/* dma_attr_flags	*/
82};
83
84/* Transmit mapping DMA attributes */
85static ddi_dma_attr_t sfxge_tx_mapping_dma_attr = {
86	DMA_ATTR_V0,		/* dma_attr_version	*/
87	0,			/* dma_attr_addr_lo	*/
88	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
89	0xffffffffffffffffull,	/* dma_attr_count_max	*/
90	1,			/* dma_attr_align	*/
91	0xffffffff,		/* dma_attr_burstsizes	*/
92	1,			/* dma_attr_minxfer	*/
93	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
94	0xffffffffffffffffull,	/* dma_attr_seg		*/
95	0x7fffffff,		/* dma_attr_sgllen	*/
96	1,			/* dma_attr_granular	*/
97	0			/* dma_attr_flags	*/
98};
99
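/*
 * Note that the mapping attributes above allow a very long scatter-gather
 * list (dma_attr_sgllen), because mblk fragments are bound in place for DMA
 * rather than copied, whereas the buffer and queue attributes require a
 * single, suitably aligned cookie.
 */
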
100/* Transmit queue DMA attributes */
101static ddi_device_acc_attr_t sfxge_txq_devacc = {
102
103	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
104	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
105	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
106};
107
108static ddi_dma_attr_t sfxge_txq_dma_attr = {
109	DMA_ATTR_V0,		/* dma_attr_version	*/
110	0,			/* dma_attr_addr_lo	*/
111	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
112	0xffffffffffffffffull,	/* dma_attr_count_max	*/
113	EFX_BUF_SIZE,		/* dma_attr_align	*/
114	0xffffffff,		/* dma_attr_burstsizes	*/
115	1,			/* dma_attr_minxfer	*/
116	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
117	0xffffffffffffffffull,	/* dma_attr_seg		*/
118	1,			/* dma_attr_sgllen	*/
119	1,			/* dma_attr_granular	*/
120	0			/* dma_attr_flags	*/
121};
122
123
/*
 * An sfxge_tx_qdpl_swizzle() can happen when the DPL get list is one packet
 * under the limit, and must then move every packet from the DPL put list to
 * the get list. Hence this is the real maximum length of the TX DPL get list.
 */
129static int
130sfxge_tx_dpl_get_pkt_max(sfxge_txq_t *stp)
131{
132	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
133	return (stdp->get_pkt_limit + stdp->put_pkt_limit - 1);
134}
135
136
137static int
138sfxge_tx_packet_ctor(void *buf, void *arg, int kmflags)
139{
140	_NOTE(ARGUNUSED(arg, kmflags))
141
142	bzero(buf, sizeof (sfxge_tx_packet_t));
143
144	return (0);
145}
146
147static void
148sfxge_tx_packet_dtor(void *buf, void *arg)
149{
150	sfxge_tx_packet_t *stpp = buf;
151
152	_NOTE(ARGUNUSED(arg))
153
154	SFXGE_OBJ_CHECK(stpp, sfxge_tx_packet_t);
155}
156
157static int
158sfxge_tx_buffer_ctor(void *buf, void *arg, int kmflags)
159{
160	sfxge_tx_buffer_t *stbp = buf;
161	sfxge_t *sp = arg;
162	sfxge_dma_buffer_attr_t dma_attr;
163	int rc;
164
165	bzero(buf, sizeof (sfxge_tx_buffer_t));
166
167	dma_attr.sdba_dip	 = sp->s_dip;
168	dma_attr.sdba_dattrp	 = &sfxge_tx_buffer_dma_attr;
169	dma_attr.sdba_callback	 = ((kmflags == KM_SLEEP) ?
170	    DDI_DMA_SLEEP : DDI_DMA_DONTWAIT);
171	dma_attr.sdba_length	 = SFXGE_TX_BUFFER_SIZE;
172	dma_attr.sdba_memflags	 = DDI_DMA_STREAMING;
173	dma_attr.sdba_devaccp	 = &sfxge_tx_buffer_devacc;
174	dma_attr.sdba_bindflags	 = DDI_DMA_WRITE | DDI_DMA_STREAMING;
175	dma_attr.sdba_maxcookies = 1;
176	dma_attr.sdba_zeroinit	 = B_FALSE;
177
178	if ((rc = sfxge_dma_buffer_create(&(stbp->stb_esm), &dma_attr)) != 0)
179		goto fail1;
180
181	return (0);
182
183fail1:
184	DTRACE_PROBE1(fail1, int, rc);
185
186	SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
187
188	return (-1);
189}
190
191static void
192sfxge_tx_buffer_dtor(void *buf, void *arg)
193{
194	sfxge_tx_buffer_t *stbp = buf;
195
196	_NOTE(ARGUNUSED(arg))
197
198	sfxge_dma_buffer_destroy(&(stbp->stb_esm));
199
200	SFXGE_OBJ_CHECK(stbp, sfxge_tx_buffer_t);
201}
202
203static int
204sfxge_tx_mapping_ctor(void *buf, void *arg, int kmflags)
205{
206	sfxge_tx_mapping_t *stmp = buf;
207	sfxge_t *sp = arg;
208	dev_info_t *dip = sp->s_dip;
209	int rc;
210
211	bzero(buf, sizeof (sfxge_tx_mapping_t));
212
213	stmp->stm_sp = sp;
214
215	/* Allocate DMA handle */
216	rc = ddi_dma_alloc_handle(dip, &sfxge_tx_mapping_dma_attr,
217	    (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
218	    NULL, &(stmp->stm_dma_handle));
219	if (rc != DDI_SUCCESS)
220		goto fail1;
221
222	return (0);
223
224fail1:
225	DTRACE_PROBE1(fail1, int, rc);
226
227	stmp->stm_sp = NULL;
228
229	SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
230
231	return (-1);
232}
233
234static void
235sfxge_tx_mapping_dtor(void *buf, void *arg)
236{
237	sfxge_tx_mapping_t *stmp = buf;
238
239	ASSERT3P(stmp->stm_sp, ==, arg);
240
241	/* Free the DMA handle */
242	ddi_dma_free_handle(&(stmp->stm_dma_handle));
243	stmp->stm_dma_handle = NULL;
244
245	stmp->stm_sp = NULL;
246
247	SFXGE_OBJ_CHECK(stmp, sfxge_tx_mapping_t);
248}
249
250static int
251sfxge_tx_qctor(void *buf, void *arg, int kmflags)
252{
253	sfxge_txq_t *stp = buf;
254	efsys_mem_t *esmp = &(stp->st_mem);
255	sfxge_t *sp = arg;
256	sfxge_dma_buffer_attr_t dma_attr;
257	sfxge_tx_dpl_t *stdp;
258	int rc;
259
260	/* Compile-time structure layout checks */
261	EFX_STATIC_ASSERT(sizeof (stp->__st_u1.__st_s1) <=
262	    sizeof (stp->__st_u1.__st_pad));
263	EFX_STATIC_ASSERT(sizeof (stp->__st_u2.__st_s2) <=
264	    sizeof (stp->__st_u2.__st_pad));
265	EFX_STATIC_ASSERT(sizeof (stp->__st_u3.__st_s3) <=
266	    sizeof (stp->__st_u3.__st_pad));
267	EFX_STATIC_ASSERT(sizeof (stp->__st_u4.__st_s4) <=
268	    sizeof (stp->__st_u4.__st_pad));
269
270	bzero(buf, sizeof (sfxge_txq_t));
271
272	stp->st_sp = sp;
273
274	dma_attr.sdba_dip	 = sp->s_dip;
275	dma_attr.sdba_dattrp	 = &sfxge_txq_dma_attr;
276	dma_attr.sdba_callback	 = DDI_DMA_SLEEP;
277	dma_attr.sdba_length	 = EFX_TXQ_SIZE(SFXGE_TX_NDESCS);
278	dma_attr.sdba_memflags	 = DDI_DMA_CONSISTENT;
279	dma_attr.sdba_devaccp	 = &sfxge_txq_devacc;
280	dma_attr.sdba_bindflags	 = DDI_DMA_READ | DDI_DMA_CONSISTENT;
281	dma_attr.sdba_maxcookies = EFX_TXQ_NBUFS(SFXGE_TX_NDESCS);
282	dma_attr.sdba_zeroinit	 = B_FALSE;
283
284	if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
285		goto fail1;
286
287	/* Allocate some buffer table entries */
288	if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS),
289	    &(stp->st_id))) != 0)
290		goto fail2;
291
292	/* Allocate the descriptor array */
293	if ((stp->st_eb = kmem_zalloc(sizeof (efx_buffer_t) *
294	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS), kmflags)) == NULL) {
295		rc = ENOMEM;
296		goto fail3;
297	}
298
299	/* Allocate the context arrays */
300	if ((stp->st_stmp = kmem_zalloc(sizeof (sfxge_tx_mapping_t *) *
301	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
302		rc = ENOMEM;
303		goto fail4;
304	}
305
306	if ((stp->st_stbp = kmem_zalloc(sizeof (sfxge_tx_buffer_t *) *
307	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
308		rc = ENOMEM;
309		goto fail5;
310	}
311
312	if ((stp->st_mp = kmem_zalloc(sizeof (mblk_t *) *
313	    SFXGE_TX_NDESCS, kmflags)) == NULL) {
314		rc = ENOMEM;
315		goto fail6;
316	}
317
318	/* Initialize the deferred packet list */
319	stdp = &(stp->st_dpl);
320	stdp->std_getp = &(stdp->std_get);
321
322	stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
323
324	return (0);
325
326fail6:
327	DTRACE_PROBE(fail6);
328
329	kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
330	stp->st_stbp = NULL;
331
332fail5:
333	DTRACE_PROBE(fail5);
334
335	kmem_free(stp->st_stmp,
336	    sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
337	stp->st_stmp = NULL;
338
339fail4:
340	DTRACE_PROBE(fail4);
341
342	/* Free the descriptor array */
343	kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
344	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
345	stp->st_eb = NULL;
346
347fail3:
348	DTRACE_PROBE(fail3);
349
350	/* Free the buffer table entries */
351	sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
352	stp->st_id = 0;
353
354fail2:
355	DTRACE_PROBE(fail2);
356
357	/* Tear down DMA setup */
358	sfxge_dma_buffer_destroy(esmp);
359
360fail1:
361	DTRACE_PROBE1(fail1, int, rc);
362
363	stp->st_sp = NULL;
364
365	SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
366
367	return (-1);
368}
369
370static void
371sfxge_tx_qdtor(void *buf, void *arg)
372{
373	sfxge_txq_t *stp = buf;
374	efsys_mem_t *esmp = &(stp->st_mem);
375	sfxge_t *sp = stp->st_sp;
376	sfxge_tx_dpl_t *stdp;
377
378	_NOTE(ARGUNUSED(arg))
379
380	stp->st_unblock = 0;
381
382	/* Tear down the deferred packet list */
383	stdp = &(stp->st_dpl);
384	ASSERT3P(stdp->std_getp, ==, &(stdp->std_get));
385	stdp->std_getp = NULL;
386
387	/* Free the context arrays */
388	kmem_free(stp->st_mp, sizeof (mblk_t *) * SFXGE_TX_NDESCS);
389	stp->st_mp = NULL;
390
391	kmem_free(stp->st_stbp, sizeof (sfxge_tx_buffer_t *) * SFXGE_TX_NDESCS);
392	stp->st_stbp = NULL;
393
394	kmem_free(stp->st_stmp,
395	    sizeof (sfxge_tx_mapping_t *) * SFXGE_TX_NDESCS);
396	stp->st_stmp = NULL;
397
398	/* Free the descriptor array */
399	kmem_free(stp->st_eb, sizeof (efx_buffer_t) *
400	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
401	stp->st_eb = NULL;
402
403	/* Free the buffer table entries */
404	sfxge_sram_buf_tbl_free(sp, stp->st_id, EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
405	stp->st_id = 0;
406
	/* Tear down DMA setup */
408	sfxge_dma_buffer_destroy(esmp);
409
410	stp->st_sp = NULL;
411
412	SFXGE_OBJ_CHECK(stp, sfxge_txq_t);
413}
414
415static void
416sfxge_tx_packet_destroy(sfxge_t *sp, sfxge_tx_packet_t *stpp)
417{
418	kmem_cache_free(sp->s_tpc, stpp);
419}
420
421static sfxge_tx_packet_t *
422sfxge_tx_packet_create(sfxge_t *sp)
423{
424	sfxge_tx_packet_t *stpp;
425
426	stpp = kmem_cache_alloc(sp->s_tpc, KM_NOSLEEP);
427
428	return (stpp);
429}
430
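/*
 * The "fpp" is the per-TXQ free packet pool: a cache of sfxge_tx_packet_t
 * structures (at most SFXGE_TX_FPP_MAX entries) maintained under the TXQ
 * lock so that the transmit path can usually avoid a round trip to the
 * kmem cache.
 */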
431static inline int
432sfxge_tx_qfpp_put(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp)
433{
434	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
435
436	ASSERT(mutex_owned(&(stp->st_lock)));
437
438	ASSERT3P(stpp->stp_next, ==, NULL);
439	ASSERT3P(stpp->stp_mp, ==, NULL);
440	ASSERT3P(stpp->stp_etherhp, ==, NULL);
441	ASSERT3P(stpp->stp_iphp, ==, NULL);
442	ASSERT3P(stpp->stp_thp, ==, NULL);
443	ASSERT3U(stpp->stp_off, ==, 0);
444	ASSERT3U(stpp->stp_size, ==, 0);
445	ASSERT3U(stpp->stp_mss, ==, 0);
446	ASSERT3U(stpp->stp_dpl_put_len, ==, 0);
447
448	if (stfp->stf_count < SFXGE_TX_FPP_MAX) {
449		/* Add to the start of the list */
450		stpp->stp_next = stfp->stf_stpp;
451		stfp->stf_stpp = stpp;
452		stfp->stf_count++;
453
454		return (0);
455	}
456
457	DTRACE_PROBE(fpp_full);
458	return (ENOSPC);
459}
460
461static inline sfxge_tx_packet_t *
462sfxge_tx_qfpp_get(sfxge_txq_t *stp)
463{
464	sfxge_tx_packet_t *stpp;
465	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
466
467	ASSERT(mutex_owned(&(stp->st_lock)));
468
469	stpp = stfp->stf_stpp;
470	if (stpp == NULL) {
471		ASSERT3U(stfp->stf_count, ==, 0);
472		return (NULL);
473	}
474
475	/* Remove item from the head of the list */
476	stfp->stf_stpp = stpp->stp_next;
477	stpp->stp_next = NULL;
478
479	ASSERT3U(stfp->stf_count, >, 0);
480	stfp->stf_count--;
481
482	if (stfp->stf_count != 0) {
483		ASSERT(stfp->stf_stpp != NULL);
484		prefetch_read_many(stfp->stf_stpp);
485	}
486	return (stpp);
487}
488
489static void
490sfxge_tx_qfpp_empty(sfxge_txq_t *stp)
491{
492	sfxge_t *sp = stp->st_sp;
493	sfxge_tx_fpp_t *stfp = &(stp->st_fpp);
494	sfxge_tx_packet_t *stpp;
495
496	mutex_enter(&(stp->st_lock));
497
498	stpp = stfp->stf_stpp;
499	stfp->stf_stpp = NULL;
500
501	while (stpp != NULL) {
502		sfxge_tx_packet_t *next;
503
504		next = stpp->stp_next;
505		stpp->stp_next = NULL;
506
507		ASSERT3U(stfp->stf_count, >, 0);
508		stfp->stf_count--;
509
510		sfxge_tx_packet_destroy(sp, stpp);
511
512		stpp = next;
513	}
514	ASSERT3U(stfp->stf_count, ==, 0);
515
516	mutex_exit(&(stp->st_lock));
517}
518
519static inline void
520sfxge_tx_qfbp_put(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp)
521{
522	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
523
524	ASSERT3P(stbp->stb_next, ==, NULL);
525	ASSERT3U(stbp->stb_off, ==, 0);
526	ASSERT3U(stbp->stb_esm.esm_used, ==, 0);
527
528	stbp->stb_next = stfp->stf_stbp;
529	stfp->stf_stbp = stbp;
530	stfp->stf_count++;
531}
532
533
534static inline sfxge_tx_buffer_t *
535sfxge_tx_qfbp_get(sfxge_txq_t *stp)
536{
537	sfxge_tx_buffer_t *stbp;
538	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
539
540	stbp = stfp->stf_stbp;
541	if (stbp == NULL) {
542		ASSERT3U(stfp->stf_count, ==, 0);
543		return (NULL);
544	}
545
546	stfp->stf_stbp = stbp->stb_next;
547	stbp->stb_next = NULL;
548
549	ASSERT3U(stfp->stf_count, >, 0);
550	stfp->stf_count--;
551
552	if (stfp->stf_count != 0) {
553		ASSERT(stfp->stf_stbp != NULL);
554		prefetch_read_many(stfp->stf_stbp);
555	}
556
557	return (stbp);
558}
559
560static void
561sfxge_tx_qfbp_empty(sfxge_txq_t *stp)
562{
563	sfxge_t *sp = stp->st_sp;
564	sfxge_tx_fbp_t *stfp = &(stp->st_fbp);
565	sfxge_tx_buffer_t *stbp;
566
567	mutex_enter(&(stp->st_lock));
568
569	stbp = stfp->stf_stbp;
570	stfp->stf_stbp = NULL;
571
572	while (stbp != NULL) {
573		sfxge_tx_buffer_t *next;
574
575		next = stbp->stb_next;
576		stbp->stb_next = NULL;
577
578		ASSERT3U(stfp->stf_count, >, 0);
579		stfp->stf_count--;
580
581		kmem_cache_free(sp->s_tbc, stbp);
582
583		stbp = next;
584	}
585	ASSERT3U(stfp->stf_count, ==, 0);
586
587	mutex_exit(&(stp->st_lock));
588}
589
590static inline void
591sfxge_tx_qfmp_put(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp)
592{
593	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
594
595	ASSERT3P(stmp->stm_next, ==, NULL);
596	ASSERT3P(stmp->stm_mp, ==, NULL);
597	ASSERT3P(stmp->stm_base, ==, NULL);
598	ASSERT3U(stmp->stm_off, ==, 0);
599	ASSERT3U(stmp->stm_size, ==, 0);
600
601	stmp->stm_next = stfp->stf_stmp;
602	stfp->stf_stmp = stmp;
603	stfp->stf_count++;
604}
605
606static inline sfxge_tx_mapping_t *
607sfxge_tx_qfmp_get(sfxge_txq_t *stp)
608{
609	sfxge_tx_mapping_t *stmp;
610	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
611
612	stmp = stfp->stf_stmp;
613	if (stmp == NULL) {
614		ASSERT3U(stfp->stf_count, ==, 0);
615		return (NULL);
616	}
617
618	stfp->stf_stmp = stmp->stm_next;
619	stmp->stm_next = NULL;
620
621	ASSERT3U(stfp->stf_count, >, 0);
622	stfp->stf_count--;
623
624	if (stfp->stf_count != 0) {
625		ASSERT(stfp->stf_stmp != NULL);
626		prefetch_read_many(stfp->stf_stmp);
627	}
628	return (stmp);
629}
630
631static void
632sfxge_tx_qfmp_empty(sfxge_txq_t *stp)
633{
634	sfxge_t *sp = stp->st_sp;
635	sfxge_tx_fmp_t *stfp = &(stp->st_fmp);
636	sfxge_tx_mapping_t *stmp;
637
638	mutex_enter(&(stp->st_lock));
639
640	stmp = stfp->stf_stmp;
641	stfp->stf_stmp = NULL;
642
643	while (stmp != NULL) {
644		sfxge_tx_mapping_t *next;
645
646		next = stmp->stm_next;
647		stmp->stm_next = NULL;
648
649		ASSERT3U(stfp->stf_count, >, 0);
650		stfp->stf_count--;
651
652		kmem_cache_free(sp->s_tmc, stmp);
653
654		stmp = next;
655	}
656	ASSERT3U(stfp->stf_count, ==, 0);
657
658	mutex_exit(&(stp->st_lock));
659}
660
661static void
662sfxge_tx_msgb_unbind(sfxge_tx_mapping_t *stmp)
663{
664	bzero(stmp->stm_addr, sizeof (uint64_t) * SFXGE_TX_MAPPING_NADDR);
665	stmp->stm_off = 0;
666
667	(void) ddi_dma_unbind_handle(stmp->stm_dma_handle);
668
669	stmp->stm_size = 0;
670	stmp->stm_base = NULL;
671
672	stmp->stm_mp = NULL;
673}
674
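/*
 * DMA mappings are recorded as a set of 4 KB "descriptor pages": stm_addr[]
 * holds the 4 KB-aligned address of each page covered by the message block
 * and stm_off is the offset of the data within the first page (see
 * sfxge_tx_msgb_bind() below).  Fragments built from a mapping never cross
 * a descriptor page boundary.
 */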
675#define	SFXGE_TX_DESCSHIFT	12
676#define	SFXGE_TX_DESCSIZE	(1 << 12)
677
678#define	SFXGE_TX_DESCOFFSET	(SFXGE_TX_DESCSIZE - 1)
679#define	SFXGE_TX_DESCMASK	(~SFXGE_TX_DESCOFFSET)
680
681static int
682sfxge_tx_msgb_bind(mblk_t *mp, sfxge_tx_mapping_t *stmp)
683{
684	ddi_dma_cookie_t dmac;
685	unsigned int ncookies;
686	size_t size;
687	unsigned int n;
688	int rc;
689
690	ASSERT(mp != NULL);
691	ASSERT3U(DB_TYPE(mp), ==, M_DATA);
692
693	ASSERT(stmp->stm_mp == NULL);
694	stmp->stm_mp = mp;
695
696	stmp->stm_base = (caddr_t)(mp->b_rptr);
697	stmp->stm_size = MBLKL(mp);
698
699	/* Bind the STREAMS block to the mapping */
700	rc = ddi_dma_addr_bind_handle(stmp->stm_dma_handle, NULL,
701	    stmp->stm_base, stmp->stm_size, DDI_DMA_WRITE | DDI_DMA_STREAMING,
702	    DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
703	if (rc != DDI_DMA_MAPPED)
704		goto fail1;
705
706	ASSERT3U(ncookies, <=, SFXGE_TX_MAPPING_NADDR);
707
708	/*
709	 * Construct an array of addresses and an initial
710	 * offset.
711	 */
712	n = 0;
713	stmp->stm_addr[n++] = dmac.dmac_laddress & SFXGE_TX_DESCMASK;
714	DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress & SFXGE_TX_DESCMASK);
715
716	stmp->stm_off = dmac.dmac_laddress & SFXGE_TX_DESCOFFSET;
717
718	size = MIN(SFXGE_TX_DESCSIZE - stmp->stm_off, dmac.dmac_size);
719	dmac.dmac_laddress += size;
720	dmac.dmac_size -= size;
721
722	for (;;) {
723		ASSERT3U(n, <, SFXGE_TX_MAPPING_NADDR);
724
725		if (dmac.dmac_size == 0) {
726			if (--ncookies == 0)
727				break;
728
729			ddi_dma_nextcookie(stmp->stm_dma_handle, &dmac);
730		}
731
732		ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCMASK) != 0);
733		ASSERT((dmac.dmac_laddress & SFXGE_TX_DESCOFFSET) == 0);
734		stmp->stm_addr[n++] = dmac.dmac_laddress;
735		DTRACE_PROBE1(addr, uint64_t, dmac.dmac_laddress);
736
737		size = MIN(SFXGE_TX_DESCSIZE, dmac.dmac_size);
738		dmac.dmac_laddress += size;
739		dmac.dmac_size -= size;
740	}
741	ASSERT3U(n, <=, SFXGE_TX_MAPPING_NADDR);
742
743	return (0);
744
745fail1:
746	DTRACE_PROBE1(fail1, int, rc);
747
748	stmp->stm_size = 0;
749	stmp->stm_base = NULL;
750
751	stmp->stm_mp = NULL;
752
753	return (-1);
754}
755
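/*
 * Reap descriptors in the range [st_reaped, st_completed): return the
 * mappings and buffers attached to each completed descriptor to the per-TXQ
 * free pools so that they can be reused.  st_completed itself is advanced
 * by TX completion event processing.
 */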
756static void
757sfxge_tx_qreap(sfxge_txq_t *stp)
758{
759	unsigned int reaped;
760
761	ASSERT(mutex_owned(&(stp->st_lock)));
762
763	reaped = stp->st_reaped;
764	while (reaped != stp->st_completed) {
765		unsigned int id;
766		sfxge_tx_mapping_t *stmp;
767		sfxge_tx_buffer_t *stbp;
768
769		id = reaped++ & (SFXGE_TX_NDESCS - 1);
770
771		ASSERT3P(stp->st_mp[id], ==, NULL);
772
773		if ((stmp = stp->st_stmp[id]) != NULL) {
774			stp->st_stmp[id] = NULL;
775
776			/* Free all the mappings */
777			do {
778				sfxge_tx_mapping_t *next;
779
780				next = stmp->stm_next;
781				stmp->stm_next = NULL;
782
783				sfxge_tx_qfmp_put(stp, stmp);
784
785				stmp = next;
786			} while (stmp != NULL);
787		}
788
789		if ((stbp = stp->st_stbp[id]) != NULL) {
790			stp->st_stbp[id] = NULL;
791
792			/* Free all the buffers */
793			do {
794				sfxge_tx_buffer_t *next;
795
796				next = stbp->stb_next;
797				stbp->stb_next = NULL;
798
799				stbp->stb_esm.esm_used = 0;
800				stbp->stb_off = 0;
801
802				sfxge_tx_qfbp_put(stp, stbp);
803
804				stbp = next;
805			} while (stbp != NULL);
806		}
807	}
808	stp->st_reaped = reaped;
809}
810
811static void
812sfxge_tx_qlist_abort(sfxge_txq_t *stp)
813{
814	unsigned int id;
815	sfxge_tx_mapping_t *stmp;
816	sfxge_tx_buffer_t *stbp;
817	mblk_t *mp;
818
819	ASSERT(mutex_owned(&(stp->st_lock)));
820
821	id = stp->st_added & (SFXGE_TX_NDESCS - 1);
822
823	/* Clear the completion information */
824	stmp = stp->st_stmp[id];
825	stp->st_stmp[id] = NULL;
826
827	/* Free any mappings that were used */
828	while (stmp != NULL) {
829		sfxge_tx_mapping_t *next;
830
831		next = stmp->stm_next;
832		stmp->stm_next = NULL;
833
834		if (stmp->stm_mp != NULL)
835			sfxge_tx_msgb_unbind(stmp);
836
837		sfxge_tx_qfmp_put(stp, stmp);
838
839		stmp = next;
840	}
841
842	stbp = stp->st_stbp[id];
843	stp->st_stbp[id] = NULL;
844
845	/* Free any buffers that were used */
846	while (stbp != NULL) {
847		sfxge_tx_buffer_t *next;
848
849		next = stbp->stb_next;
850		stbp->stb_next = NULL;
851
852		stbp->stb_off = 0;
853		stbp->stb_esm.esm_used = 0;
854
855		sfxge_tx_qfbp_put(stp, stbp);
856
857		stbp = next;
858	}
859
860	mp = stp->st_mp[id];
861	stp->st_mp[id] = NULL;
862
863	if (mp != NULL)
864		freemsg(mp);
865
866	/* Clear the fragment list */
867	stp->st_n = 0;
868}
869
/* Push descriptors to the TX ring, blocking the queue if there is no space */
871static void
872sfxge_tx_qlist_post(sfxge_txq_t *stp)
873{
874	unsigned int id;
875	unsigned int level;
876	unsigned int available;
877	int rc;
878
879	ASSERT(mutex_owned(&(stp->st_lock)));
880
881	ASSERT(stp->st_n != 0);
882
883again:
884	level = stp->st_added - stp->st_reaped;
885	available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;
886
887	id = stp->st_added & (SFXGE_TX_NDESCS - 1);
888
889	if (available < stp->st_n) {
890		rc = ENOSPC;
891		goto fail1;
892	}
893
894	ASSERT3U(available, >=, stp->st_n);
895
896	/* Post the fragment list */
897	if ((rc = efx_tx_qpost(stp->st_etp, stp->st_eb, stp->st_n,
898	    stp->st_reaped, &(stp->st_added))) != 0)
899		goto fail2;
900
	/*
	 * If the list took more than a single descriptor then we need to
	 * move the completion information so it is referenced by the last
	 * descriptor.
	 */
906	if (((stp->st_added - 1) & (SFXGE_TX_NDESCS - 1)) != id) {
907		sfxge_tx_mapping_t *stmp;
908		sfxge_tx_buffer_t *stbp;
909		mblk_t *mp;
910
911		stmp = stp->st_stmp[id];
912		stp->st_stmp[id] = NULL;
913
914		stbp = stp->st_stbp[id];
915		stp->st_stbp[id] = NULL;
916
917		mp = stp->st_mp[id];
918		stp->st_mp[id] = NULL;
919
920		id = (stp->st_added - 1) & (SFXGE_TX_NDESCS - 1);
921
922		ASSERT(stp->st_stmp[id] == NULL);
923		stp->st_stmp[id] = stmp;
924
925		ASSERT(stp->st_stbp[id] == NULL);
926		stp->st_stbp[id] = stbp;
927
928		ASSERT(stp->st_mp[id] == NULL);
929		stp->st_mp[id] = mp;
930	}
931
932	/* Clear the list */
933	stp->st_n = 0;
934
935	ASSERT3U(stp->st_unblock, ==, SFXGE_TXQ_NOT_BLOCKED);
936	return;
937
938fail2:
939	DTRACE_PROBE(fail2);
940fail1:
941	DTRACE_PROBE1(fail1, int, rc);
942
943	ASSERT(rc == ENOSPC);
944
945	level = stp->st_added - stp->st_completed;
946	available = EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) - level;
947
	/*
	 * If there would be enough space once we have reaped the completed
	 * mappings and buffers, and enough descriptors have completed to make
	 * reaping worthwhile, then reap now and try posting again.
	 */
953	if (stp->st_n <= available &&
954	    stp->st_completed - stp->st_reaped >= SFXGE_TX_BATCH) {
955		sfxge_tx_qreap(stp);
956
957		goto again;
958	}
959
960	/* Set the unblock level */
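	/*
	 * st_unblock records the ring fill level at or below which the queue
	 * may be unblocked; sfxge_tx_qunblock() compares the current level
	 * against it.  A repeated failure to post escalates the level from
	 * LEVEL1 to LEVEL2.
	 */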
961	if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED) {
962		stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL1;
963	} else {
964		ASSERT(stp->st_unblock == SFXGE_TXQ_UNBLOCK_LEVEL1);
965
966		stp->st_unblock = SFXGE_TXQ_UNBLOCK_LEVEL2;
967	}
968
969	/*
970	 * Avoid a race with completion interrupt handling that could leave the
971	 * queue blocked.
972	 *
973	 * NOTE: The use of st_pending rather than st_completed is intentional
974	 *	 as st_pending is updated per-event rather than per-batch and
975	 *	 therefore avoids needless deferring.
976	 */
977	if (stp->st_pending == stp->st_added) {
978		sfxge_tx_qreap(stp);
979
980		stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
981		goto again;
982	}
983
984	ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED);
985}
986
987static int
988sfxge_tx_kstat_update(kstat_t *ksp, int rw)
989{
990	sfxge_txq_t *stp = ksp->ks_private;
991	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
992	kstat_named_t *knp;
993	int rc;
994
995	ASSERT(mutex_owned(&(stp->st_lock)));
996
997	if (rw != KSTAT_READ) {
998		rc = EACCES;
999		goto fail1;
1000	}
1001
1002	if (stp->st_state != SFXGE_TXQ_STARTED)
1003		goto done;
1004
1005	efx_tx_qstats_update(stp->st_etp, stp->st_stat);
1006	knp = (kstat_named_t *)ksp->ks_data + TX_NQSTATS;
1007	knp->value.ui64 = stdp->get_pkt_limit;
1008	knp++;
1009	knp->value.ui64 = stdp->put_pkt_limit;
1010	knp++;
1011	knp->value.ui64 = stdp->get_full_count;
1012	knp++;
1013	knp->value.ui64 = stdp->put_full_count;
1014
1015done:
1016	return (0);
1017
1018fail1:
1019	DTRACE_PROBE1(fail1, int, rc);
1020
1021	return (rc);
1022}
1023
1024static int
1025sfxge_tx_kstat_init(sfxge_txq_t *stp)
1026{
1027	sfxge_t *sp = stp->st_sp;
1028	unsigned int index = stp->st_index;
1029	dev_info_t *dip = sp->s_dip;
1030	kstat_t *ksp;
1031	kstat_named_t *knp;
1032	char name[MAXNAMELEN];
1033	unsigned int id;
1034	int rc;
1035
1036	/* Create the set */
1037	(void) snprintf(name, MAXNAMELEN - 1, "%s_txq%04d",
1038	    ddi_driver_name(dip), index);
1039
1040	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
1041	    ddi_get_instance(dip), name, "queue", KSTAT_TYPE_NAMED,
1042	    TX_NQSTATS + 4, 0)) == NULL) {
1043		rc = ENOMEM;
1044		goto fail1;
1045	}
1046
1047	stp->st_ksp = ksp;
1048
1049	ksp->ks_update = sfxge_tx_kstat_update;
1050	ksp->ks_private = stp;
1051	ksp->ks_lock = &(stp->st_lock);
1052
1053	/* Initialise the named stats */
1054	stp->st_stat = knp = ksp->ks_data;
1055	for (id = 0; id < TX_NQSTATS; id++) {
1056		kstat_named_init(knp, (char *)efx_tx_qstat_name(sp->s_enp, id),
1057		    KSTAT_DATA_UINT64);
1058		knp++;
1059	}
1060	kstat_named_init(knp, "dpl_get_pkt_limit", KSTAT_DATA_UINT64);
1061	knp++;
1062	kstat_named_init(knp, "dpl_put_pkt_limit", KSTAT_DATA_UINT64);
1063	knp++;
1064	kstat_named_init(knp, "dpl_get_full_count", KSTAT_DATA_UINT64);
1065	knp++;
1066	kstat_named_init(knp, "dpl_put_full_count", KSTAT_DATA_UINT64);
1067
1068	kstat_install(ksp);
1069	return (0);
1070
1071fail1:
1072	DTRACE_PROBE1(fail1, int, rc);
1073
1074	return (rc);
1075}
1076
1077static void
1078sfxge_tx_kstat_fini(sfxge_txq_t *stp)
1079{
1080	/* Destroy the set */
1081	kstat_delete(stp->st_ksp);
1082	stp->st_ksp = NULL;
1083	stp->st_stat = NULL;
1084}
1085
1086static int
1087sfxge_tx_qinit(sfxge_t *sp, unsigned int index, sfxge_txq_type_t type,
1088    unsigned int evq)
1089{
1090	sfxge_txq_t *stp;
1091	sfxge_tx_dpl_t *stdp;
1092	int rc;
1093
1094	ASSERT3U(index, <, EFX_ARRAY_SIZE(sp->s_stp));
1095	ASSERT3U(type, <, SFXGE_TXQ_NTYPES);
1096	ASSERT3U(evq, <, EFX_ARRAY_SIZE(sp->s_sep));
1097
1098	if ((stp = kmem_cache_alloc(sp->s_tqc, KM_SLEEP)) == NULL) {
1099		rc = ENOMEM;
1100		goto fail1;
1101	}
1102	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_UNINITIALIZED);
1103
1104	stdp = &(stp->st_dpl);
1105
1106	stp->st_index = index;
1107	stp->st_type = type;
1108	stp->st_evq = evq;
1109
1110	mutex_init(&(stp->st_lock), NULL, MUTEX_DRIVER,
1111	    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
1112
1113	/* Initialize the statistics */
1114	if ((rc = sfxge_tx_kstat_init(stp)) != 0)
1115		goto fail2;
1116
1117	stdp->get_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
1118	    DDI_PROP_DONTPASS, "tx_dpl_get_pkt_limit",
1119	    SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT);
1120
1121	stdp->put_pkt_limit = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
1122	    DDI_PROP_DONTPASS, "tx_dpl_put_pkt_limit",
1123	    SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT);
1124
1125	/* Allocate a per-EVQ label for events from this TXQ */
1126	if ((rc = sfxge_ev_txlabel_alloc(sp, evq, stp, &(stp->st_label))) != 0)
1127		goto fail2;
1128
1129	stp->st_state = SFXGE_TXQ_INITIALIZED;
1130
1131	/* Attach the TXQ to the driver */
1132	ASSERT3P(sp->s_stp[index], ==, NULL);
1133	sp->s_stp[index] = stp;
1134	sp->s_tx_qcount++;
1135
1136	return (0);
1137
1138fail2:
1139	DTRACE_PROBE(fail2);
1140
1141	sfxge_tx_kstat_fini(stp);
1142
1143
1144	stp->st_evq = 0;
1145	stp->st_type = 0;
1146	stp->st_index = 0;
1147
1148	mutex_destroy(&(stp->st_lock));
1149
1150	kmem_cache_free(sp->s_tqc, stp);
1151
1152fail1:
1153	DTRACE_PROBE1(fail1, int, rc);
1154
1155	return (rc);
1156}
1157
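/*
 * Start a transmit queue: program the buffer table entries backing the
 * descriptor ring, create the common-code TXQ with checksum offload flags
 * appropriate to the queue type, and enable it.
 */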
1158static int
1159sfxge_tx_qstart(sfxge_t *sp, unsigned int index)
1160{
1161	sfxge_txq_t *stp = sp->s_stp[index];
1162	efx_nic_t *enp = sp->s_enp;
1163	efsys_mem_t *esmp;
1164	sfxge_evq_t *sep;
1165	unsigned int evq;
1166	unsigned int flags;
1167	unsigned int desc_index;
1168	int rc;
1169
1170	mutex_enter(&(stp->st_lock));
1171
1172	esmp = &(stp->st_mem);
1173	evq = stp->st_evq;
1174	sep = sp->s_sep[evq];
1175
1176	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
1177	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
1178
1179	/* Zero the memory */
1180	bzero(esmp->esm_base, EFX_TXQ_SIZE(SFXGE_TX_NDESCS));
1181
1182	/* Program the buffer table */
1183	if ((rc = sfxge_sram_buf_tbl_set(sp, stp->st_id, esmp,
1184	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS))) != 0)
1185		goto fail1;
1186
1187	switch (stp->st_type) {
1188	case SFXGE_TXQ_NON_CKSUM:
1189		flags = 0;
1190		break;
1191
1192	case SFXGE_TXQ_IP_CKSUM:
1193		flags = EFX_TXQ_CKSUM_IPV4;
1194		break;
1195
1196	case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
1197		flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
1198		break;
1199
1200	default:
1201		ASSERT(B_FALSE);
1202
1203		flags = 0;
1204		break;
1205	}
1206
1207	/* Create the transmit queue */
1208	if ((rc = efx_tx_qcreate(enp, index, stp->st_label, esmp,
1209	    SFXGE_TX_NDESCS, stp->st_id, flags, sep->se_eep,
1210	    &(stp->st_etp), &desc_index)) != 0)
1211		goto fail2;
1212
1213	/* Initialise queue descriptor indexes */
1214	stp->st_added = desc_index;
1215	stp->st_pending = desc_index;
1216	stp->st_completed = desc_index;
1217	stp->st_reaped = desc_index;
1218
1219	/* Enable the transmit queue */
1220	efx_tx_qenable(stp->st_etp);
1221
1222	stp->st_state = SFXGE_TXQ_STARTED;
1223
1224	mutex_exit(&(stp->st_lock));
1225
1226	return (0);
1227
1228fail2:
1229	DTRACE_PROBE(fail2);
1230
1231	/* Clear entries from the buffer table */
1232	sfxge_sram_buf_tbl_clear(sp, stp->st_id,
1233	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
1234
1235fail1:
1236	DTRACE_PROBE1(fail1, int, rc);
1237
1238	mutex_exit(&(stp->st_lock));
1239
1240	return (rc);
1241}
1242
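/*
 * Add fragments for a DMA mapping to the pending fragment list, starting at
 * *offp within the mapping and consuming at most *limitp bytes.  One
 * efx_buffer_t is generated for each descriptor page chunk, and *offp and
 * *limitp are updated to reflect the bytes consumed.
 */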
1243static inline int
1244sfxge_tx_qmapping_add(sfxge_txq_t *stp, sfxge_tx_mapping_t *stmp,
1245    size_t *offp, size_t *limitp)
1246{
1247	mblk_t *mp;
1248	size_t mapping_off;
1249	size_t mapping_size;
1250	int rc;
1251
1252	ASSERT3U(*offp, <, stmp->stm_size);
1253	ASSERT(*limitp != 0);
1254
1255	mp = stmp->stm_mp;
1256
1257	ASSERT3P(stmp->stm_base, ==, mp->b_rptr);
1258	ASSERT3U(stmp->stm_size, ==, MBLKL(mp));
1259
1260	mapping_off = stmp->stm_off + *offp;
1261	mapping_size = stmp->stm_size - *offp;
1262
1263	while (mapping_size != 0 && *limitp != 0) {
1264		size_t page =
1265		    mapping_off >> SFXGE_TX_DESCSHIFT;
1266		size_t page_off =
1267		    mapping_off & SFXGE_TX_DESCOFFSET;
1268		size_t page_size =
1269		    SFXGE_TX_DESCSIZE - page_off;
1270		efx_buffer_t *ebp;
1271
1272		ASSERT3U(page, <, SFXGE_TX_MAPPING_NADDR);
1273		ASSERT((stmp->stm_addr[page] & SFXGE_TX_DESCMASK) != 0);
1274
1275		page_size = MIN(page_size, mapping_size);
1276		page_size = MIN(page_size, *limitp);
1277
1278		ASSERT3U(stp->st_n, <=,
1279		    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
1280		if (stp->st_n ==
1281		    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
1282			rc = ENOSPC;
1283			goto fail1;
1284		}
1285
1286		ebp = &(stp->st_eb[stp->st_n++]);
1287		ebp->eb_addr = stmp->stm_addr[page] +
1288		    page_off;
1289		ebp->eb_size = page_size;
1290
1291		*offp += page_size;
1292		*limitp -= page_size;
1293
1294		mapping_off += page_size;
1295		mapping_size -= page_size;
1296
1297		ebp->eb_eop = (*limitp == 0 ||
1298		    (mapping_size == 0 && mp->b_cont == NULL));
1299
1300		DTRACE_PROBE5(tx_mapping_add,
1301		    unsigned int, stp->st_index,
1302		    unsigned int, stp->st_n - 1,
1303		    uint64_t, ebp->eb_addr,
1304		    size_t, ebp->eb_size,
1305		    boolean_t, ebp->eb_eop);
1306	}
1307
1308	ASSERT3U(*offp, <=, stmp->stm_size);
1309
1310	return (0);
1311
1312fail1:
1313	DTRACE_PROBE1(fail1, int, rc);
1314
1315	return (rc);
1316}
1317
1318static inline int
1319sfxge_tx_qbuffer_add(sfxge_txq_t *stp, sfxge_tx_buffer_t *stbp, boolean_t eop)
1320{
1321	efx_buffer_t *ebp;
1322	int rc;
1323
1324	ASSERT3U(stp->st_n, <=,
1325	    EFX_TXQ_LIMIT(SFXGE_TX_NDESCS));
1326	if (stp->st_n == EFX_TXQ_LIMIT(SFXGE_TX_NDESCS)) {
1327		rc = ENOSPC;
1328		goto fail1;
1329	}
1330
1331	ebp = &(stp->st_eb[stp->st_n++]);
1332	ebp->eb_addr = stbp->stb_esm.esm_addr + stbp->stb_off;
1333	ebp->eb_size = stbp->stb_esm.esm_used - stbp->stb_off;
1334	ebp->eb_eop = eop;
1335
1336	(void) ddi_dma_sync(stbp->stb_esm.esm_dma_handle,
1337	    stbp->stb_off, ebp->eb_size,
1338	    DDI_DMA_SYNC_FORDEV);
1339
1340	stbp->stb_off = stbp->stb_esm.esm_used;
1341
1342	DTRACE_PROBE5(tx_buffer_add,
1343	    unsigned int, stp->st_index,
1344	    unsigned int, stp->st_n - 1,
1345	    uint64_t, ebp->eb_addr, size_t, ebp->eb_size,
1346	    boolean_t, ebp->eb_eop);
1347
1348	return (0);
1349
1350fail1:
1351	DTRACE_PROBE1(fail1, int, rc);
1352
1353	return (rc);
1354}
1355
1356static inline boolean_t
1357sfxge_tx_msgb_copy(mblk_t *mp, sfxge_tx_buffer_t *stbp, size_t *offp,
1358    size_t *limitp)
1359{
1360	size_t data_off;
1361	size_t data_size;
1362	size_t copy_off;
1363	size_t copy_size;
1364	boolean_t eop;
1365
1366	ASSERT3U(*offp, <=, MBLKL(mp));
1367	ASSERT(*limitp != 0);
1368
1369	data_off = *offp;
1370	data_size = MBLKL(mp) - *offp;
1371
1372	copy_off = stbp->stb_esm.esm_used;
1373	copy_size = SFXGE_TX_BUFFER_SIZE - copy_off;
1374
1375	copy_size = MIN(copy_size, data_size);
1376	copy_size = MIN(copy_size, *limitp);
1377
1378	bcopy(mp->b_rptr + data_off,
1379	    stbp->stb_esm.esm_base + copy_off, copy_size);
1380
1381	stbp->stb_esm.esm_used += copy_size;
1382	ASSERT3U(stbp->stb_esm.esm_used, <=,
1383	    SFXGE_TX_BUFFER_SIZE);
1384
1385	*offp += copy_size;
1386	*limitp -= copy_size;
1387
1388	data_off += copy_size;
1389	data_size -= copy_size;
1390
1391	eop = (*limitp == 0 ||
1392	    (data_size == 0 && mp->b_cont == NULL));
1393
1394	ASSERT3U(*offp, <=, MBLKL(mp));
1395
1396	return (eop);
1397}
1398
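/*
 * Add "size" bytes of payload, starting at *offp within *mpp, to the pending
 * fragment list.  Data blocks of at least SFXGE_TX_COPY_THRESHOLD bytes are
 * bound for DMA where possible; shorter blocks, blocks we are part way
 * through, and blocks for which binding fails are copied into transmit
 * buffers instead.
 */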
1399static int
1400sfxge_tx_qpayload_fragment(sfxge_txq_t *stp, unsigned int id, mblk_t **mpp,
1401    size_t *offp, size_t size, boolean_t copy)
1402{
1403	sfxge_t *sp = stp->st_sp;
1404	mblk_t *mp = *mpp;
1405	size_t off = *offp;
1406	sfxge_tx_buffer_t *stbp;
1407	sfxge_tx_mapping_t *stmp;
1408	int rc;
1409
1410	stbp = stp->st_stbp[id];
1411	ASSERT(stbp == NULL || (stbp->stb_esm.esm_used == stbp->stb_off));
1412
1413	stmp = stp->st_stmp[id];
1414
1415	while (size != 0) {
1416		boolean_t eop;
1417
1418		ASSERT(mp != NULL);
1419
1420		if (mp->b_cont != NULL)
1421			prefetch_read_many(mp->b_cont);
1422
1423		ASSERT3U(off, <, MBLKL(mp));
1424
1425		if (copy)
1426			goto copy;
1427
1428		/*
1429		 * Check whether we have already mapped this data block for
1430		 * DMA.
1431		 */
1432		if (stmp == NULL || stmp->stm_mp != mp) {
1433			/*
1434			 * If we are part way through copying a data block then
1435			 * there's no point in trying to map it for DMA.
1436			 */
1437			if (off != 0)
1438				goto copy;
1439
1440			/*
1441			 * If the data block is too short then the cost of
1442			 * mapping it for DMA would outweigh the cost of
1443			 * copying it.
1444			 */
1445			if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
1446				goto copy;
1447
1448			/* Try to grab a transmit mapping from the pool */
1449			stmp = sfxge_tx_qfmp_get(stp);
1450			if (stmp == NULL) {
1451				/*
1452				 * The pool was empty so allocate a new
1453				 * mapping.
1454				 */
1455				if ((stmp = kmem_cache_alloc(sp->s_tmc,
1456				    KM_NOSLEEP)) == NULL)
1457					goto copy;
1458			}
1459
1460			/* Add the DMA mapping to the list */
1461			stmp->stm_next = stp->st_stmp[id];
1462			stp->st_stmp[id] = stmp;
1463
1464			/* Try to bind the data block to the mapping */
1465			if (sfxge_tx_msgb_bind(mp, stmp) != 0)
1466				goto copy;
1467		}
1468		ASSERT3P(stmp->stm_mp, ==, mp);
1469
1470		/*
1471		 * If we have a partially filled buffer then we must add it to
1472		 * the fragment list before adding the mapping.
1473		 */
1474		if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
1475			rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1476			if (rc != 0)
1477				goto fail1;
1478		}
1479
1480		/* Add the mapping to the fragment list */
1481		rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
1482		if (rc != 0)
1483			goto fail2;
1484
1485		ASSERT(off == MBLKL(mp) || size == 0);
1486
1487		/*
		 * If the data block has been exhausted then skip over the
1489		 * control block and advance to the next data block.
1490		 */
1491		if (off == MBLKL(mp)) {
1492			mp = mp->b_cont;
1493			off = 0;
1494		}
1495
1496		continue;
1497
1498copy:
1499		if (stbp == NULL ||
1500		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
1501			/* Try to grab a buffer from the pool */
1502			stbp = sfxge_tx_qfbp_get(stp);
1503			if (stbp == NULL) {
1504				/*
1505				 * The pool was empty so allocate a new
1506				 * buffer.
1507				 */
1508				if ((stbp = kmem_cache_alloc(sp->s_tbc,
1509				    KM_NOSLEEP)) == NULL) {
1510					rc = ENOMEM;
1511					goto fail3;
1512				}
1513			}
1514
1515			/* Add it to the list */
1516			stbp->stb_next = stp->st_stbp[id];
1517			stp->st_stbp[id] = stbp;
1518		}
1519
1520		/* Copy as much of the data block as we can into the buffer */
1521		eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);
1522
1523		ASSERT(off == MBLKL(mp) || size == 0 ||
1524		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);
1525
1526		/*
1527		 * If we have reached the end of the packet, or the buffer is
1528		 * full, then add the buffer to the fragment list.
1529		 */
1530		if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
1531			rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
1532			if (rc != 0)
1533				goto fail4;
1534		}
1535
1536		/*
		 * If the data block has been exhausted then advance to the next
1538		 * one.
1539		 */
1540		if (off == MBLKL(mp)) {
1541			mp = mp->b_cont;
1542			off = 0;
1543		}
1544	}
1545
1546	*mpp = mp;
1547	*offp = off;
1548
1549	return (0);
1550
1551fail4:
1552	DTRACE_PROBE(fail4);
1553fail3:
1554	DTRACE_PROBE(fail3);
1555fail2:
1556	DTRACE_PROBE(fail2);
1557fail1:
1558	DTRACE_PROBE1(fail1, int, rc);
1559
1560	return (rc);
1561}
1562
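/*
 * Fragment an LSO packet in software: the Ethernet, IP and TCP headers are
 * copied into transmit buffers once per segment (with the IP id and TCP
 * sequence number adjusted for each segment), and the payload is then added
 * one MSS at a time using sfxge_tx_qpayload_fragment().
 */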
1563static int
1564sfxge_tx_qlso_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
1565    boolean_t copy)
1566{
1567	sfxge_t *sp = stp->st_sp;
1568	mblk_t *mp = stpp->stp_mp;
1569	struct ether_header *etherhp = stpp->stp_etherhp;
1570	struct ip *iphp = stpp->stp_iphp;
1571	struct tcphdr *thp = stpp->stp_thp;
1572	size_t size = stpp->stp_size;
1573	size_t off = stpp->stp_off;
1574	size_t mss = stpp->stp_mss;
1575	unsigned int id;
1576	caddr_t hp;
1577	size_t ehs, hs;
1578	uint16_t start_len;
1579	uint16_t start_id;
1580	uint16_t ip_id;
1581	uint8_t start_flags;
1582	uint32_t start_seq;
1583	uint32_t th_seq;
1584	size_t lss;
1585	sfxge_tx_buffer_t *stbp;
1586	int rc;
1587
1588	ASSERT(mutex_owned(&(stp->st_lock)));
1589
1590	if ((DB_LSOFLAGS(mp) & HW_LSO) == 0) {
1591		rc = EINVAL;
1592		goto fail1;
1593	}
1594
1595	id = stp->st_added & (SFXGE_TX_NDESCS - 1);
1596
1597	ASSERT(stp->st_n == 0);
1598	ASSERT(stp->st_stbp[id] == NULL);
1599	ASSERT(stp->st_stmp[id] == NULL);
1600
1601	ehs = (etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
1602	    sizeof (struct ether_vlan_header) :
1603	    sizeof (struct ether_header);
1604	if (msgdsize(mp) != ehs + ntohs(iphp->ip_len)) {
1605		rc = EINVAL;
1606		goto fail2;
1607	}
1608
1609	/* The payload offset is equivalent to the size of the headers */
1610	hp = (caddr_t)(mp->b_rptr);
1611	hs = off;
1612
1613	/*
1614	 * If the initial data block only contains the headers then advance
1615	 * to the next one.
1616	 */
1617	if (hs > MBLKL(mp)) {
1618		rc = EINVAL;
1619		goto fail3;
1620	}
1621	mp->b_rptr += hs;
1622
1623	if (MBLKL(mp) == 0)
1624		mp = mp->b_cont;
1625
1626	off = 0;
1627
1628	/* Check IP and TCP headers are suitable for LSO */
1629	if (((iphp->ip_off & ~htons(IP_DF)) != 0) ||
1630	    ((thp->th_flags & (TH_URG | TH_SYN)) != 0) ||
1631	    (thp->th_urp != 0)) {
1632		rc = EINVAL;
1633		goto fail4;
1634	}
1635
1636	if (size + (thp->th_off << 2) + (iphp->ip_hl << 2) !=
1637	    ntohs(iphp->ip_len)) {
1638		rc = EINVAL;
1639		goto fail4;
1640	}
1641
1642	/*
	 * Get the base IP id. The stack leaves enough of a gap in id space
1644	 * for us to increment this for each segment we send out.
1645	 */
1646	start_len = ntohs(iphp->ip_len);
1647	start_id = ip_id = ntohs(iphp->ip_id);
1648
1649	/* Get the base TCP sequence number and flags */
1650	start_flags = thp->th_flags;
1651	start_seq = th_seq = ntohl(thp->th_seq);
1652
1653	/* Adjust the header for interim segments */
1654	iphp->ip_len = htons((iphp->ip_hl << 2) + (thp->th_off << 2) + mss);
1655	thp->th_flags = start_flags & ~(TH_PUSH | TH_FIN);
1656
1657	lss = size;
1658	if ((lss / mss) >= (EFX_TXQ_LIMIT(SFXGE_TX_NDESCS) / 2)) {
1659		rc = EINVAL;
1660		goto fail5;
1661	}
1662
1663	stbp = NULL;
1664	while (lss != 0) {
1665		size_t ss = MIN(lss, mss);
1666		boolean_t eol = (ss == lss);
1667
1668		/* Adjust the header for this segment */
1669		iphp->ip_id = htons(ip_id);
1670		ip_id++;
1671
1672		thp->th_seq = htonl(th_seq);
1673		th_seq += ss;
1674
1675		/* If this is the final segment then do some extra adjustment */
1676		if (eol) {
1677			iphp->ip_len = htons((iphp->ip_hl << 2) +
1678			    (thp->th_off << 2) + ss);
1679			thp->th_flags = start_flags;
1680		}
1681
1682		if (stbp == NULL ||
1683		    stbp->stb_esm.esm_used + hs > SFXGE_TX_BUFFER_SIZE) {
1684			/* Try to grab a buffer from the pool */
1685			stbp = sfxge_tx_qfbp_get(stp);
1686			if (stbp == NULL) {
1687				/*
1688				 * The pool was empty so allocate a new
1689				 * buffer.
1690				 */
1691				if ((stbp = kmem_cache_alloc(sp->s_tbc,
1692				    KM_NOSLEEP)) == NULL) {
1693					rc = ENOMEM;
1694					goto fail6;
1695				}
1696			}
1697
1698			/* Add it to the list */
1699			stbp->stb_next = stp->st_stbp[id];
1700			stp->st_stbp[id] = stbp;
1701		}
1702
1703		/* Copy in the headers */
1704		ASSERT3U(stbp->stb_off, ==, stbp->stb_esm.esm_used);
1705		bcopy(hp, stbp->stb_esm.esm_base + stbp->stb_off, hs);
1706		stbp->stb_esm.esm_used += hs;
1707
1708		/* Add the buffer to the fragment list */
1709		rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1710		if (rc != 0)
1711			goto fail7;
1712
1713		/* Add the payload to the fragment list */
1714		if ((rc = sfxge_tx_qpayload_fragment(stp, id, &mp, &off,
1715		    ss, copy)) != 0)
1716			goto fail8;
1717
1718		lss -= ss;
1719	}
1720	ASSERT3U(off, ==, 0);
1721	ASSERT3P(mp, ==, NULL);
1722
1723	ASSERT3U(th_seq - start_seq, ==, size);
1724
1725	/*
1726	 * If no part of the packet has been mapped for DMA then we can free
1727	 * it now, otherwise it can only be freed on completion.
1728	 */
1729	if (stp->st_stmp[id] == NULL)
1730		freemsg(stpp->stp_mp);
1731	else
1732		stp->st_mp[id] = stpp->stp_mp;
1733
1734	stpp->stp_mp = NULL;
1735
1736	return (0);
1737
1738fail8:
1739	DTRACE_PROBE(fail8);
1740fail7:
1741	DTRACE_PROBE(fail7);
1742fail6:
1743	DTRACE_PROBE(fail6);
1744fail5:
1745	DTRACE_PROBE(fail5);
1746
1747	/* Restore the header */
1748	thp->th_seq = htonl(start_seq);
1749	thp->th_flags = start_flags;
1750
1751	iphp->ip_len = htons(start_len);
1752	iphp->ip_id = htons(start_id);
1753
1754fail4:
1755	DTRACE_PROBE(fail4);
1756
1757	mp = stpp->stp_mp;
1758	mp->b_rptr -= hs;
1759
1760	ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
1761	    sizeof (struct ether_vlan_header) :
1762	    sizeof (struct ether_header)) +
1763	    ntohs(iphp->ip_len), ==, msgdsize(mp));
1764
1765	ASSERT(stp->st_mp[id] == NULL);
1766
1767fail3:
1768	DTRACE_PROBE(fail3);
1769fail2:
1770	DTRACE_PROBE(fail2);
1771fail1:
1772	DTRACE_PROBE1(fail1, int, rc);
1773
1774	return (rc);
1775}
1776
1777static int
1778sfxge_tx_qpacket_fragment(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp,
1779    boolean_t copy)
1780{
1781	sfxge_t *sp = stp->st_sp;
1782	mblk_t *mp = stpp->stp_mp;
1783	unsigned int id;
1784	size_t off;
1785	size_t size;
1786	sfxge_tx_mapping_t *stmp;
1787	sfxge_tx_buffer_t *stbp;
1788	int rc;
1789
1790	ASSERT(mutex_owned(&(stp->st_lock)));
1791
1792	ASSERT(stp->st_n == 0);
1793
1794	id = stp->st_added & (SFXGE_TX_NDESCS - 1);
1795
1796	ASSERT(stp->st_stbp[id] == NULL);
1797	ASSERT(stp->st_stmp[id] == NULL);
1798
1799	off = 0;
1800	size = LONG_MAX;	/* must be larger than the packet */
1801
1802	stbp = NULL;
1803	stmp = NULL;
1804
1805	while (mp != NULL) {
1806		boolean_t eop;
1807
1808		ASSERT(mp != NULL);
1809
1810		if (mp->b_cont != NULL)
1811			prefetch_read_many(mp->b_cont);
1812
1813		ASSERT(stmp == NULL || stmp->stm_mp != mp);
1814
1815		if (copy)
1816			goto copy;
1817
1818		/*
1819		 * If we are part way through copying a data block then there's
1820		 * no point in trying to map it for DMA.
1821		 */
1822		if (off != 0)
1823			goto copy;
1824
1825		/*
1826		 * If the data block is too short then the cost of mapping it
1827		 * for DMA would outweigh the cost of copying it.
1828		 *
1829		 * TX copy break
1830		 */
1831		if (MBLKL(mp) < SFXGE_TX_COPY_THRESHOLD)
1832			goto copy;
1833
1834		/* Try to grab a transmit mapping from the pool */
1835		stmp = sfxge_tx_qfmp_get(stp);
1836		if (stmp == NULL) {
1837			/*
1838			 * The pool was empty so allocate a new
1839			 * mapping.
1840			 */
1841			if ((stmp = kmem_cache_alloc(sp->s_tmc,
1842			    KM_NOSLEEP)) == NULL)
1843				goto copy;
1844		}
1845
1846		/* Add the DMA mapping to the list */
1847		stmp->stm_next = stp->st_stmp[id];
1848		stp->st_stmp[id] = stmp;
1849
1850		/* Try to bind the data block to the mapping */
1851		if (sfxge_tx_msgb_bind(mp, stmp) != 0)
1852			goto copy;
1853
1854		/*
1855		 * If we have a partially filled buffer then we must add it to
1856		 * the fragment list before adding the mapping.
1857		 */
1858		if (stbp != NULL && (stbp->stb_esm.esm_used > stbp->stb_off)) {
1859			rc = sfxge_tx_qbuffer_add(stp, stbp, B_FALSE);
1860			if (rc != 0)
1861				goto fail1;
1862		}
1863
1864		/* Add the mapping to the fragment list */
1865		rc = sfxge_tx_qmapping_add(stp, stmp, &off, &size);
1866		if (rc != 0)
1867			goto fail2;
1868
1869		ASSERT3U(off, ==, MBLKL(mp));
1870
1871		/* Advance to the next data block */
1872		mp = mp->b_cont;
1873		off = 0;
1874		continue;
1875
1876copy:
1877		if (stbp == NULL ||
1878		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE) {
1879			/* Try to grab a buffer from the pool */
1880			stbp = sfxge_tx_qfbp_get(stp);
1881			if (stbp == NULL) {
1882				/*
1883				 * The pool was empty so allocate a new
1884				 * buffer.
1885				 */
1886				if ((stbp = kmem_cache_alloc(sp->s_tbc,
1887				    KM_NOSLEEP)) == NULL) {
1888					rc = ENOMEM;
1889					goto fail3;
1890				}
1891			}
1892
1893			/* Add it to the list */
1894			stbp->stb_next = stp->st_stbp[id];
1895			stp->st_stbp[id] = stbp;
1896		}
1897
1898		/* Copy as much of the data block as we can into the buffer */
1899		eop = sfxge_tx_msgb_copy(mp, stbp, &off, &size);
1900
1901		ASSERT(off == MBLKL(mp) ||
1902		    stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE);
1903
1904		/*
1905		 * If we have reached the end of the packet, or the buffer is
1906		 * full, then add the buffer to the fragment list.
1907		 */
1908		if (stbp->stb_esm.esm_used == SFXGE_TX_BUFFER_SIZE || eop) {
1909			rc = sfxge_tx_qbuffer_add(stp, stbp, eop);
1910			if (rc != 0)
1911				goto fail4;
1912		}
1913
1914		/*
		 * If the data block has been exhausted then advance to the next
1916		 * one.
1917		 */
1918		if (off == MBLKL(mp)) {
1919			mp = mp->b_cont;
1920			off = 0;
1921		}
1922	}
1923	ASSERT3U(off, ==, 0);
1924	ASSERT3P(mp, ==, NULL);
1925	ASSERT3U(size, !=, 0);
1926
1927	/*
1928	 * If no part of the packet has been mapped for DMA then we can free
1929	 * it now, otherwise it can only be freed on completion.
1930	 */
1931	if (stp->st_stmp[id] == NULL)
1932		freemsg(stpp->stp_mp);
1933	else
1934		stp->st_mp[id] = stpp->stp_mp;
1935
1936	stpp->stp_mp = NULL;
1937
1938	return (0);
1939
1940fail4:
1941	DTRACE_PROBE(fail4);
1942fail3:
1943	DTRACE_PROBE(fail3);
1944fail2:
1945	DTRACE_PROBE(fail2);
1946fail1:
1947	DTRACE_PROBE1(fail1, int, rc);
1948
1949	ASSERT(stp->st_stmp[id] == NULL);
1950
1951	return (rc);
1952}
1953
1954
1955#define	SFXGE_TX_QDPL_PUT_PENDING(_stp)					\
1956	((_stp)->st_dpl.std_put != 0)
1957
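/*
 * Move the contents of the TX DPL put list onto the tail of the get list.
 * The put list is built up in LIFO order by atomic exchange, so it must be
 * reversed here to preserve packet ordering before being appended.
 */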
1958static void
1959sfxge_tx_qdpl_swizzle(sfxge_txq_t *stp)
1960{
1961	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
1962	volatile uintptr_t *putp;
1963	uintptr_t put;
1964	sfxge_tx_packet_t *stpp;
1965	sfxge_tx_packet_t *p;
1966	sfxge_tx_packet_t **pp;
1967	unsigned int count;
1968
1969	ASSERT(mutex_owned(&(stp->st_lock)));
1970
	/*
	 * In-flight TX packets are guaranteed to cause more TX completions,
	 * and hence more swizzles must happen.
	 */
1975	ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
1976	if (stdp->std_count >= stdp->get_pkt_limit)
1977		return;
1978
1979	/* Acquire the put list - replacing with an empty list */
1980	putp = &(stdp->std_put);
1981	put = atomic_swap_ulong(putp, 0);
1982	stpp = (void *)put;
1983
1984	if (stpp == NULL)
1985		return;
1986
1987	/* Reverse the list */
1988	pp = &(stpp->stp_next);
1989	p = NULL;
1990
1991	count = 0;
1992	do {
1993		sfxge_tx_packet_t *next;
1994
1995		next = stpp->stp_next;
1996
1997		stpp->stp_next = p;
1998		p = stpp;
1999
2000		count++;
2001		stpp = next;
2002	} while (stpp != NULL);
2003
2004	/* Add it to the tail of the get list */
2005	ASSERT3P(*pp, ==, NULL);
2006
2007	*(stdp->std_getp) = p;
2008	stdp->std_getp = pp;
2009	stdp->std_count += count;
2010	ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2011
2012	DTRACE_PROBE2(dpl_counts, int, stdp->std_count, int, count);
2013}
2014
2015
/*
 * If the TXQ lock is held, add the TX DPL put list and this packet to the
 * TX DPL get list. If the TXQ lock is not held, atomically add this packet
 * to the TX DPL put list.
 *
 * The only possible error is ENOSPC (used for TX backpressure), returned
 * when either the TX DPL put list or the get list becomes full. In both
 * cases there must be future TX completions (as represented by the packets
 * on the DPL get lists).
 *
 * This ensures that in the future mac_tx_update() will be called from
 * sfxge_tx_qcomplete().
 */
2027static inline int
2028sfxge_tx_qdpl_add(sfxge_txq_t *stp, sfxge_tx_packet_t *stpp, int locked)
2029{
2030	sfxge_tx_dpl_t *stdp = &stp->st_dpl;
2031
2032	ASSERT3P(stpp->stp_next, ==, NULL);
2033
2034	if (locked) {
2035		ASSERT(mutex_owned(&stp->st_lock));
2036
2037		if (stdp->std_count >= stdp->get_pkt_limit) {
2038			stdp->get_full_count++;
2039			return (ENOSPC);
2040		}
2041
2042		/* Reverse the put list onto the get list */
2043		sfxge_tx_qdpl_swizzle(stp);
2044
2045		/* Add to the tail of the get list */
2046		*(stdp->std_getp) = stpp;
2047		stdp->std_getp = &stpp->stp_next;
2048		stdp->std_count++;
2049		ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2050
2051	} else {
2052		volatile uintptr_t *putp;
2053		uintptr_t old;
2054		uintptr_t new;
2055		sfxge_tx_packet_t *old_pkt;
2056
2057		putp = &(stdp->std_put);
2058		new = (uintptr_t)stpp;
2059
		/* Add to the head of the put list, tracking its length */
2061		do {
2062			old = *putp;
2063			old_pkt =  (sfxge_tx_packet_t *)old;
2064
2065			stpp->stp_dpl_put_len = old ?
2066			    old_pkt->stp_dpl_put_len + 1 : 1;
2067
2068			if (stpp->stp_dpl_put_len >= stdp->put_pkt_limit) {
2069				stpp->stp_next = 0;
2070				stpp->stp_dpl_put_len = 0;
2071				stdp->put_full_count++;
2072				return (ENOSPC);
2073			}
2074
2075			stpp->stp_next = (void *)old;
2076		} while (atomic_cas_ulong(putp, old, new) != old);
2077	}
2078	return (0);
2079}
2080
2081
/* Take all packets from the DPL get list and try to send them to the HW */
2083static void
2084sfxge_tx_qdpl_drain(sfxge_txq_t *stp)
2085{
2086	sfxge_t *sp = stp->st_sp;
2087	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2088	unsigned int pushed = stp->st_added;
2089	sfxge_tx_packet_t *stpp;
2090	unsigned int count;
2091
2092	ASSERT(mutex_owned(&(stp->st_lock)));
2093
2094	prefetch_read_many(sp->s_enp);
2095	prefetch_read_many(stp->st_etp);
2096
2097	stpp = stdp->std_get;
2098	count = stdp->std_count;
2099
2100	while (count != 0) {
2101		sfxge_tx_packet_t *next;
2102		boolean_t copy;
2103		int rc;
2104
2105		ASSERT(stpp != NULL);
2106
2107		/* Split stpp off */
2108		next = stpp->stp_next;
2109		stpp->stp_next = NULL;
2110
2111		if (next != NULL)
2112			prefetch_read_many(next);
2113
2114		if (stp->st_state != SFXGE_TXQ_STARTED)
2115			goto reject;
2116
2117		copy = B_FALSE;
2118
2119again:
2120		/* Fragment the packet */
2121		if (stpp->stp_mss != 0) {
2122			rc = sfxge_tx_qlso_fragment(stp, stpp, copy);
2123		} else {
2124			rc = sfxge_tx_qpacket_fragment(stp, stpp, copy);
2125		}
2126
2127		switch (rc) {
2128		case 0:
2129			break;
2130
2131		case ENOSPC:
2132			if (!copy)
2133				goto copy;
2134
2135		/*FALLTHRU*/
2136		default:
2137			goto reject;
2138		}
2139
2140		/* Free the packet structure */
2141		stpp->stp_etherhp = NULL;
2142		stpp->stp_iphp = NULL;
2143		stpp->stp_thp = NULL;
2144		stpp->stp_off = 0;
2145		stpp->stp_size = 0;
2146		stpp->stp_mss = 0;
2147		stpp->stp_dpl_put_len = 0;
2148
2149		ASSERT3P(stpp->stp_mp, ==, NULL);
2150
2151		if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
2152			sfxge_tx_packet_destroy(sp, stpp);
2153			stpp = NULL;
2154		}
2155
2156		--count;
2157		stpp = next;
2158
2159		/* Post the packet */
2160		sfxge_tx_qlist_post(stp);
2161
2162		if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED)
2163			goto defer;
2164
2165		if (stp->st_added - pushed >= SFXGE_TX_BATCH) {
2166			efx_tx_qpush(stp->st_etp, stp->st_added, pushed);
2167			pushed = stp->st_added;
2168		}
2169
2170		continue;
2171
2172copy:
2173		/* Abort the current fragment list */
2174		sfxge_tx_qlist_abort(stp);
2175
2176		/* Try copying the packet to flatten it */
2177		ASSERT(!copy);
2178		copy = B_TRUE;
2179
2180		goto again;
2181
2182reject:
2183		/* Abort the current fragment list */
2184		sfxge_tx_qlist_abort(stp);
2185
2186		/* Discard the packet */
2187		freemsg(stpp->stp_mp);
2188		stpp->stp_mp = NULL;
2189
2190		/* Free the packet structure */
2191		stpp->stp_etherhp = NULL;
2192		stpp->stp_iphp = NULL;
2193		stpp->stp_thp = NULL;
2194		stpp->stp_off = 0;
2195		stpp->stp_size = 0;
2196		stpp->stp_mss = 0;
2197		stpp->stp_dpl_put_len = 0;
2198
2199		if (sfxge_tx_qfpp_put(stp, stpp) != 0) {
2200			sfxge_tx_packet_destroy(sp, stpp);
2201			stpp = NULL;
2202		}
2203
2204		--count;
2205		stpp = next;
2206		continue;
2207defer:
2208		DTRACE_PROBE1(defer, unsigned int, stp->st_index);
2209		break;
2210	}
2211
2212	if (count == 0) {
2213		/* New empty get list */
2214		ASSERT3P(stpp, ==, NULL);
2215		stdp->std_get = NULL;
2216		stdp->std_count = 0;
2217
2218		stdp->std_getp = &(stdp->std_get);
2219	} else {
		/* Shorten the list by moving the head */
2221		stdp->std_get = stpp;
2222		stdp->std_count = count;
2223		ASSERT3U(stdp->std_count, <=, sfxge_tx_dpl_get_pkt_max(stp));
2224	}
2225
2226	if (stp->st_added != pushed)
2227		efx_tx_qpush(stp->st_etp, stp->st_added, pushed);
2228
2229	ASSERT(stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED ||
2230	    stdp->std_count == 0);
2231}
2232
2233/* Swizzle the deferred packet list and try to push it to the HW */
2234static inline void
2235sfxge_tx_qdpl_service(sfxge_txq_t *stp)
2236{
2237	do {
2238		ASSERT(mutex_owned(&(stp->st_lock)));
2239
2240		if (SFXGE_TX_QDPL_PUT_PENDING(stp))
2241			sfxge_tx_qdpl_swizzle(stp);
2242
2243		if (stp->st_unblock == SFXGE_TXQ_NOT_BLOCKED)
2244			sfxge_tx_qdpl_drain(stp);
2245
2246		mutex_exit(&(stp->st_lock));
2247
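		/*
		 * If more packets arrived on the put list while the queue was
		 * being serviced, go round again (provided the lock can be
		 * re-acquired) so that they are not left on the DPL.
		 */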
2248		if (!SFXGE_TX_QDPL_PUT_PENDING(stp))
2249			break;
2250	} while (mutex_tryenter(&(stp->st_lock)));
2251}
2252
2253static void
2254sfxge_tx_qdpl_flush_locked(sfxge_txq_t *stp)
2255{
2256	sfxge_t *sp = stp->st_sp;
2257	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2258	sfxge_tx_packet_t *stpp;
2259	unsigned int count;
2260
2261	ASSERT(mutex_owned(&(stp->st_lock)));
2262
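	/*
	 * Discard every deferred packet: the mblks are freed and the packet
	 * structures are destroyed outright rather than being returned to
	 * the free packet pool.
	 */
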
2263	/* Swizzle put list to the get list */
2264	sfxge_tx_qdpl_swizzle(stp);
2265
2266	stpp = stdp->std_get;
2267	count = stdp->std_count;
2268
2269	while (count != 0) {
2270		sfxge_tx_packet_t *next;
2271
2272		next = stpp->stp_next;
2273		stpp->stp_next = NULL;
2274
2275		/* Discard the packet */
2276		freemsg(stpp->stp_mp);
2277		stpp->stp_mp = NULL;
2278
2279		/* Free the packet structure */
2280		stpp->stp_etherhp = NULL;
2281		stpp->stp_iphp = NULL;
2282		stpp->stp_thp = NULL;
2283		stpp->stp_off = 0;
2284		stpp->stp_size = 0;
2285		stpp->stp_mss = 0;
2286		stpp->stp_dpl_put_len = 0;
2287
2288		sfxge_tx_packet_destroy(sp, stpp);
2289
2290		--count;
2291		stpp = next;
2292	}
2293
2294	ASSERT3P(stpp, ==, NULL);
2295
2296	/* Empty list */
2297	stdp->std_get = NULL;
2298	stdp->std_count = 0;
2299	stdp->std_getp = &(stdp->std_get);
2300}
2301
2302
2303void
2304sfxge_tx_qdpl_flush(sfxge_txq_t *stp)
2305{
2306	mutex_enter(&(stp->st_lock));
2307	sfxge_tx_qdpl_flush_locked(stp);
2308	mutex_exit(&(stp->st_lock));
2309}
2310
2311
2312static void
2313sfxge_tx_qunblock(sfxge_txq_t *stp)
2314{
2315	sfxge_t *sp = stp->st_sp;
2316	unsigned int evq = stp->st_evq;
2317	sfxge_evq_t *sep = sp->s_sep[evq];
2318
2319	ASSERT(mutex_owned(&(sep->se_lock)));
2320
2321	mutex_enter(&(stp->st_lock));
2322
2323	if (stp->st_state != SFXGE_TXQ_STARTED) {
2324		mutex_exit(&(stp->st_lock));
2325		return;
2326	}
2327
2328	if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
2329		unsigned int level;
2330
2331		level = stp->st_added - stp->st_completed;
2332		if (level <= stp->st_unblock) {
2333			stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
2334			sfxge_tx_qlist_post(stp);
2335		}
2336	}
2337
2338	sfxge_tx_qdpl_service(stp);
2339	/* lock has been dropped */
2340}
2341
2342void
2343sfxge_tx_qcomplete(sfxge_txq_t *stp)
2344{
2345	sfxge_t *sp = stp->st_sp;
2346	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2347	unsigned int evq = stp->st_evq;
2348	sfxge_evq_t *sep = sp->s_sep[evq];
2349	unsigned int completed;
2350
2351	ASSERT(mutex_owned(&(sep->se_lock)));
2352
2353	completed = stp->st_completed;
2354	while (completed != stp->st_pending) {
2355		unsigned int id;
2356		sfxge_tx_mapping_t *stmp;
2357
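		/*
		 * Mask off the descriptor ring index; this relies on
		 * SFXGE_TX_NDESCS being a power of two.
		 */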
2358		id = completed++ & (SFXGE_TX_NDESCS - 1);
2359
2360		if ((stmp = stp->st_stmp[id]) != NULL) {
2361			mblk_t *mp;
2362
2363			/* Unbind all the mappings */
2364			do {
2365				ASSERT(stmp->stm_mp != NULL);
2366				sfxge_tx_msgb_unbind(stmp);
2367
2368				stmp = stmp->stm_next;
2369			} while (stmp != NULL);
2370
2371			/*
2372			 * Now that the packet is no longer mapped for DMA it
2373			 * can be freed.
2374			 */
2375			mp = stp->st_mp[id];
2376			stp->st_mp[id] = NULL;
2377
2378			ASSERT(mp != NULL);
2379			freemsg(mp);
2380		}
2381	}
2382	stp->st_completed = completed;
2383
2384	/* Check whether we need to unblock the queue */
2385	if (stp->st_unblock != SFXGE_TXQ_NOT_BLOCKED) {
2386		unsigned int level;
2387
2388		level = stp->st_added - stp->st_completed;
2389		if (level <= stp->st_unblock)
2390			sfxge_tx_qunblock(stp);
2391	}
2392
2393	/* Release TX backpressure caused by the TX DPL put/get list being full */
2394	if (stdp->std_count < stdp->get_pkt_limit)
2395		mac_tx_update(sp->s_mh);
2396}
2397
2398void
2399sfxge_tx_qflush_done(sfxge_txq_t *stp)
2400{
2401	sfxge_t *sp = stp->st_sp;
2402	boolean_t flush_pending = B_FALSE;
2403
2404	ASSERT(mutex_owned(&(sp->s_sep[stp->st_evq]->se_lock)));
2405
2406	mutex_enter(&(stp->st_lock));
2407
2408	switch (stp->st_state) {
2409	case SFXGE_TXQ_INITIALIZED:
2410		/* Ignore flush event after TxQ destroyed */
2411		break;
2412
2413	case SFXGE_TXQ_FLUSH_PENDING:
2414		flush_pending = B_TRUE;
2415		stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2416		break;
2417
2418	case SFXGE_TXQ_FLUSH_FAILED:
2419		/* MC may have rebooted before handling the flush request */
2420		stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2421		break;
2422
2423	case SFXGE_TXQ_STARTED:
2424		/*
2425		 * The MC initiated the flush, either on MC reboot or because
2426		 * of a bad Tx descriptor
2427		 */
2428		stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2429		break;
2430
2431	case SFXGE_TXQ_FLUSH_DONE:
2432		/* Ignore unexpected extra flush event */
2433		ASSERT(B_FALSE);
2434		break;
2435
2436	default:
2437		ASSERT(B_FALSE);
2438	}
2439
2441	mutex_exit(&(stp->st_lock));
2442
2443	if (flush_pending == B_FALSE) {
2444		/* Flush was not pending */
2445		return;
2446	}
2447
2448	mutex_enter(&(sp->s_tx_flush_lock));
2449	sp->s_tx_flush_pending--;
2450	if (sp->s_tx_flush_pending <= 0) {
2451		/* All queues flushed: wakeup sfxge_tx_stop() */
2452		cv_signal(&(sp->s_tx_flush_kv));
2453	}
2454	mutex_exit(&(sp->s_tx_flush_lock));
2455}
2456
2457static void
2458sfxge_tx_qflush(sfxge_t *sp, unsigned int index, boolean_t wait_for_flush)
2459{
2460	sfxge_txq_t *stp = sp->s_stp[index];
2461	int rc;
2462
2463	ASSERT(mutex_owned(&(sp->s_state_lock)));
2464	ASSERT(mutex_owned(&(sp->s_tx_flush_lock)));
2465
2466	mutex_enter(&(stp->st_lock));
2467
2468	/* Prepare to flush and stop the queue */
2469	if (stp->st_state == SFXGE_TXQ_STARTED) {
2470		/* Flush the transmit queue */
2471		if ((rc = efx_tx_qflush(stp->st_etp)) == EALREADY) {
2472			/* Already flushed, may be initiated by MC */
2473			stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2474		} else if (rc != 0) {
2475			/* Unexpected error */
2476			stp->st_state = SFXGE_TXQ_FLUSH_FAILED;
2477		} else if (wait_for_flush) {
2478			stp->st_state = SFXGE_TXQ_FLUSH_PENDING;
2479			sp->s_tx_flush_pending++;
2480		} else {
2481			/* Assume the flush is done */
2482			stp->st_state = SFXGE_TXQ_FLUSH_DONE;
2483		}
2484	}
2485
2486	mutex_exit(&(stp->st_lock));
2487}
2488
2489static void
2490sfxge_tx_qstop(sfxge_t *sp, unsigned int index)
2491{
2492	sfxge_txq_t *stp = sp->s_stp[index];
2493	unsigned int evq = stp->st_evq;
2494	sfxge_evq_t *sep = sp->s_sep[evq];
2495
2496	mutex_enter(&(sep->se_lock));
2497	mutex_enter(&(stp->st_lock));
2498
2499	if (stp->st_state == SFXGE_TXQ_INITIALIZED)
2500		goto done;
2501
2502	ASSERT(stp->st_state == SFXGE_TXQ_FLUSH_PENDING ||
2503	    stp->st_state == SFXGE_TXQ_FLUSH_DONE ||
2504	    stp->st_state == SFXGE_TXQ_FLUSH_FAILED);
2505
2506	/* All queues should have been flushed */
2507	if (stp->st_sp->s_tx_flush_pending != 0) {
2508		dev_err(sp->s_dip, CE_NOTE,
2509		    SFXGE_CMN_ERR "txq[%d] stop with flush_pending=%d",
2510		    index, stp->st_sp->s_tx_flush_pending);
2511	}
2512	if (stp->st_state == SFXGE_TXQ_FLUSH_FAILED) {
2513		dev_err(sp->s_dip, CE_NOTE,
2514		    SFXGE_CMN_ERR "txq[%d] flush failed", index);
2515	}
2516
2517	/* Destroy the transmit queue */
2518	efx_tx_qdestroy(stp->st_etp);
2519	stp->st_etp = NULL;
2520
2521	/* Clear entries from the buffer table */
2522	sfxge_sram_buf_tbl_clear(sp, stp->st_id,
2523	    EFX_TXQ_NBUFS(SFXGE_TX_NDESCS));
2524
2525	sfxge_tx_qlist_abort(stp);
2526	ASSERT3U(stp->st_n, ==, 0);
2527
2528	stp->st_unblock = SFXGE_TXQ_NOT_BLOCKED;
2529
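	/*
	 * Mark all added descriptors as pending so that sfxge_tx_qcomplete()
	 * below releases any mblks still held against the ring.
	 */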
2530	stp->st_pending = stp->st_added;
2531
2532	sfxge_tx_qcomplete(stp);
2533	ASSERT3U(stp->st_completed, ==, stp->st_pending);
2534
2535	sfxge_tx_qreap(stp);
2536	ASSERT3U(stp->st_reaped, ==, stp->st_completed);
2537
2538	/*
2539	 * Ensure the deferred packet list is cleared.
2540	 * This can race with sfxge_tx_packet_add() adding to the put list.
2541	 */
2542	sfxge_tx_qdpl_flush_locked(stp);
2543
2544	stp->st_added = 0;
2545	stp->st_pending = 0;
2546	stp->st_completed = 0;
2547	stp->st_reaped = 0;
2548
2549	stp->st_state = SFXGE_TXQ_INITIALIZED;
2550
2551done:
2552	mutex_exit(&(stp->st_lock));
2553	mutex_exit(&(sep->se_lock));
2554}
2555
2556static void
2557sfxge_tx_qfini(sfxge_t *sp, unsigned int index)
2558{
2559	sfxge_txq_t *stp = sp->s_stp[index];
2560	sfxge_tx_dpl_t *stdp = &(stp->st_dpl);
2561
2562	ASSERT3U(stp->st_state, ==, SFXGE_TXQ_INITIALIZED);
2563	stp->st_state = SFXGE_TXQ_UNINITIALIZED;
2564
2565	/* Detach the TXQ from the driver */
2566	sp->s_stp[index] = NULL;
2567	ASSERT(sp->s_tx_qcount > 0);
2568	sp->s_tx_qcount--;
2569
2570	/* Free the EVQ label for events from this TXQ */
2571	(void) sfxge_ev_txlabel_free(sp, stp->st_evq, stp, stp->st_label);
2572	stp->st_label = 0;
2573
2574	/* Tear down the statistics */
2575	sfxge_tx_kstat_fini(stp);
2576
2577	/* Ensure the deferred packet list is empty */
2578	ASSERT3U(stdp->std_count, ==, 0);
2579	ASSERT3P(stdp->std_get, ==, NULL);
2580	ASSERT3U(stdp->std_put, ==, 0);
2581
2582	/* Clear the free buffer pool */
2583	sfxge_tx_qfbp_empty(stp);
2584
2585	/* Clear the free mapping pool */
2586	sfxge_tx_qfmp_empty(stp);
2587
2588	/* Clear the free packet pool */
2589	sfxge_tx_qfpp_empty(stp);
2590
2591	mutex_destroy(&(stp->st_lock));
2592
2593	stp->st_evq = 0;
2594	stp->st_type = 0;
2595	stp->st_index = 0;
2596
2597	kmem_cache_free(sp->s_tqc, stp);
2598}
2599
2600int
2601sfxge_tx_init(sfxge_t *sp)
2602{
2603	sfxge_intr_t *sip = &(sp->s_intr);
2604	char name[MAXNAMELEN];
2605	sfxge_txq_type_t qtype;
2606	unsigned int txq, evq;
2607	int index;
2608	int rc;
2609
2610	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_packet_cache",
2611	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2612
2613	sp->s_tpc = kmem_cache_create(name, sizeof (sfxge_tx_packet_t),
2614	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_packet_ctor, sfxge_tx_packet_dtor,
2615	    NULL, sp, NULL, 0);
2616	ASSERT(sp->s_tpc != NULL);
2617
2618	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_buffer_cache",
2619	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2620
2621	sp->s_tbc = kmem_cache_create(name, sizeof (sfxge_tx_buffer_t),
2622	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_buffer_ctor, sfxge_tx_buffer_dtor,
2623	    NULL, sp, NULL, 0);
2624	ASSERT(sp->s_tbc != NULL);
2625
2626	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_tx_mapping_cache",
2627	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2628
2629	sp->s_tmc = kmem_cache_create(name, sizeof (sfxge_tx_mapping_t),
2630	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_mapping_ctor, sfxge_tx_mapping_dtor,
2631	    NULL, sp, NULL, 0);
2632	ASSERT(sp->s_tmc != NULL);
2633
2634	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_txq_cache",
2635	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));
2636
2637	sp->s_tqc = kmem_cache_create(name, sizeof (sfxge_txq_t),
2638	    SFXGE_CPU_CACHE_SIZE, sfxge_tx_qctor, sfxge_tx_qdtor, NULL, sp,
2639	    NULL, 0);
2640	ASSERT(sp->s_tqc != NULL);
2641
2642	/* Initialize the transmit queues. */
2643	sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM]		= sip->si_nalloc;
2644	sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM]		= 1;
2645	sp->s_tx_scale_max[SFXGE_TXQ_IP_TCP_UDP_CKSUM]	= sip->si_nalloc;
2646
2647	/* Ensure minimum queue counts required by sfxge_tx_packet_add(). */
2648	if (sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] < 1)
2649		sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM] = 1;
2650
2651	if (sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] < 1)
2652		sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM] = 1;
2653
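	/*
	 * Lay the TXQs out contiguously by type: record the base index for
	 * each type in s_tx_scale_base[] and create one queue per event
	 * queue index up to the type's scale limit.
	 */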
2654	txq = 0;
2655	for (qtype = 0; qtype < SFXGE_TXQ_NTYPES; qtype++) {
2656		unsigned int tx_scale = sp->s_tx_scale_max[qtype];
2657
2658		if (txq + tx_scale > EFX_ARRAY_SIZE(sp->s_stp)) {
2659			rc = EINVAL;
2660			goto fail1;
2661		}
2662
2663		sp->s_tx_scale_base[qtype] = txq;
2664
2665		for (evq = 0; evq < tx_scale; evq++) {
2666			if ((rc = sfxge_tx_qinit(sp, txq, qtype, evq)) != 0) {
2667				goto fail2;
2668			}
2669			txq++;
2670		}
2671		ASSERT3U(txq, <=, EFX_ARRAY_SIZE(sp->s_stp));
2672	}
2673
2674	return (0);
2675
2676fail2:
2677	DTRACE_PROBE(fail2);
2678
2679fail1:
2680	DTRACE_PROBE1(fail1, int, rc);
2681
2682	index = EFX_ARRAY_SIZE(sp->s_stp);
2683	while (--index >= 0) {
2684		if (sp->s_stp[index] != NULL)
2685			sfxge_tx_qfini(sp, index);
2686	}
2687
2688	kmem_cache_destroy(sp->s_tqc);
2689	sp->s_tqc = NULL;
2690
2691	kmem_cache_destroy(sp->s_tmc);
2692	sp->s_tmc = NULL;
2693
2694	kmem_cache_destroy(sp->s_tbc);
2695	sp->s_tbc = NULL;
2696
2697	kmem_cache_destroy(sp->s_tpc);
2698	sp->s_tpc = NULL;
2699
2700	return (rc);
2701}
2702
2703int
2704sfxge_tx_start(sfxge_t *sp)
2705{
2706	efx_nic_t *enp = sp->s_enp;
2707	int index;
2708	int rc;
2709
2710	/* Initialize the transmit module */
2711	if ((rc = efx_tx_init(enp)) != 0)
2712		goto fail1;
2713
2714	for (index = 0; index < EFX_ARRAY_SIZE(sp->s_stp); index++) {
2715		if (sp->s_stp[index] != NULL)
2716			if ((rc = sfxge_tx_qstart(sp, index)) != 0)
2717				goto fail2;
2718	}
2719
2720	return (0);
2721
2722fail2:
2723	DTRACE_PROBE(fail2);
2724
2725	sfxge_tx_stop(sp);
2726
2727fail1:
2728	DTRACE_PROBE1(fail1, int, rc);
2729
2730	return (rc);
2731}
2732
2733
2734/*
2735 * Add a packet to the TX Deferred Packet List (DPL) and, if the TX queue
2736 * lock can be acquired, call sfxge_tx_qdpl_service() to fragment the packet
2737 * and push it to the H/W transmit descriptor ring.
2738 *
2739 * If ENOSPC is returned then either the DPL is full or the packet structure
2740 * allocation failed; the mblk is not freed so that the caller can return it
2741 * from mc_tx() to back-pressure the OS stack.
2742 *
2743 * For all other errors the mblk is freed here.
2744 */
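/*
 * For illustration only, a minimal sketch of how a GLDv3 mc_tx() entry point
 * might consume this interface; the function name below is hypothetical and
 * is not taken from the driver's GLD code. On ENOSPC the remaining chain is
 * handed back to MAC to back-pressure the stack; for any other error the
 * mblk has already been freed, so the loop simply moves on:
 *
 *	static mblk_t *
 *	example_mc_tx(void *arg, mblk_t *mp)
 *	{
 *		sfxge_t *sp = arg;
 *
 *		while (mp != NULL) {
 *			mblk_t *next = mp->b_next;
 *
 *			mp->b_next = NULL;
 *			if (sfxge_tx_packet_add(sp, mp) == ENOSPC) {
 *				mp->b_next = next;
 *				return (mp);
 *			}
 *			mp = next;
 *		}
 *		return (NULL);
 *	}
 */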
2745int
2746sfxge_tx_packet_add(sfxge_t *sp, mblk_t *mp)
2747{
2748	struct ether_header *etherhp;
2749	struct ip *iphp;
2750	struct tcphdr *thp;
2751	size_t off;
2752	size_t size;
2753	size_t mss;
2754	sfxge_txq_t *stp;
2755	unsigned int txq;
2756	int index;
2757	boolean_t locked;
2758	sfxge_tx_packet_t *stpp;
2759	sfxge_packet_type_t pkt_type;
2760	uint16_t sport, dport;
2761	int rc = 0;
2762
2763	ASSERT3P(mp->b_next, ==, NULL);
2764	ASSERT(!(DB_CKSUMFLAGS(mp) & HCK_PARTIALCKSUM));
2765
2766	/*
2767	 * Do not enqueue packets during startup/shutdown.
2768	 *
2769	 * NOTE: This access to the state is NOT protected by the state lock.
2770	 * It is an imperfect test, and anything that still gets onto the
2771	 * get/put deferred packet lists is cleaned up by (possibly repeated)
2772	 * calls to sfxge_can_destroy().
2773	 */
2774	if (sp->s_state != SFXGE_STARTED) {
2775		rc = EINVAL;
2776		goto fail1;
2777	}
2778
2779	etherhp = NULL;
2780	iphp = NULL;
2781	thp = NULL;
2782	off = 0;
2783	size = 0;
2784	mss = 0;
2785
2786	/* Check whether we need the header pointers for LSO segmentation */
2787	if (DB_LSOFLAGS(mp) & HW_LSO) {
2788		/* LSO segmentation relies on hardware checksum offload */
2789		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
2790
2791		if ((mss = DB_LSOMSS(mp)) == 0) {
2792			rc = EINVAL;
2793			goto fail1;
2794		}
2795
2796		pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp,
2797		    &off, &size, &sport, &dport);
2798
2799		if (pkt_type != SFXGE_PACKET_TYPE_IPV4_TCP ||
2800		    etherhp == NULL ||
2801		    iphp == NULL ||
2802		    thp == NULL ||
2803		    off == 0) {
2804			rc = EINVAL;
2805			goto fail2;
2806		}
2807	}
2808
2809	/* Choose the appropriate transmit queue */
2810	if (DB_CKSUMFLAGS(mp) & HCK_FULLCKSUM) {
2811		sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2812
2813		if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
2814			uint32_t hash;
2815
2816			if (srsp->srs_count > 1) {
2817				/*
2818				 * If we have not already parsed the headers
2819				 * for LSO segmentation then we need to do it
2820				 * now so we can calculate the hash.
2821				 */
2822				if (thp == NULL) {
2823					(void) sfxge_pkthdr_parse(mp, &etherhp,
2824					    &iphp, &thp, &off, &size,
2825					    &sport, &dport);
2826				}
2827
2828				if (thp != NULL) {
2829					SFXGE_TCP_HASH(sp,
2830					    &iphp->ip_dst.s_addr,
2831					    thp->th_dport,
2832					    &iphp->ip_src.s_addr,
2833					    thp->th_sport, hash);
2834
2835					index = srsp->srs_tbl[hash %
2836					    SFXGE_RX_SCALE_MAX];
2837				} else if (iphp != NULL) {
2838					/*
2839					 * Calculate IPv4 4-tuple hash, with
2840					 * TCP/UDP/SCTP src/dest ports. Ports
2841					 * are zero for other IPv4 protocols.
2842					 */
2843					SFXGE_IP_HASH(sp,
2844					    &iphp->ip_dst.s_addr, dport,
2845					    &iphp->ip_src.s_addr, sport, hash);
2846
2847					index = srsp->srs_tbl[hash %
2848					    SFXGE_RX_SCALE_MAX];
2849				} else {
2850					/*
2851					 * Other traffic always goes to the
2852					 * queue in the zero-th entry of the
2853					 * RSS table.
2854					 */
2855					index = srsp->srs_tbl[0];
2856				}
2857			} else {
2858				/*
2859				 * It does not matter what the hash is
2860				 * because all the RSS table entries will be
2861				 * the same.
2862				 */
2863				index = srsp->srs_tbl[0];
2864			}
2865
2866			/*
2867			 * Find the event queue corresponding to the hash in
2868			 * the RSS table.
2869			 */
2870			txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
2871			    index;
2872			stp = sp->s_stp[txq];
2873			ASSERT3U(stp->st_evq, ==, index);
2874		} else {
2875			index = 0;
2876			txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_TCP_UDP_CKSUM] +
2877			    index;
2878			stp = sp->s_stp[txq];
2879		}
2880	} else if (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) {
2881		ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_IP_CKSUM], >=, 1);
2882		index = 0;
2883		txq = sp->s_tx_scale_base[SFXGE_TXQ_IP_CKSUM] + index;
2884		stp = sp->s_stp[txq];
2885	} else {
2886		/*
2887		 * No hardware checksum offload requested.
2888		 */
2889		sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
2890
2891		if (srsp->srs_state == SFXGE_RX_SCALE_STARTED) {
2892			uint32_t hash = 0;
2893
2894			if (srsp->srs_count > 1) {
2895				if (iphp == NULL) {
2896					(void) sfxge_pkthdr_parse(mp, &etherhp,
2897					    &iphp, &thp, &off, &size,
2898					    &sport, &dport);
2899				}
2900
2901				if (iphp != NULL) {
2902					/*
2903					 * Calculate IPv4 4-tuple hash, with
2904					 * TCP/UDP/SCTP src/dest ports. Ports
2905					 * are zero for other IPv4 protocols.
2906					 */
2907					SFXGE_IP_HASH(sp,
2908					    &iphp->ip_dst.s_addr, dport,
2909					    &iphp->ip_src.s_addr, sport, hash);
2910
2911					hash = hash % SFXGE_RX_SCALE_MAX;
2912				}
2913			}
2914			index = srsp->srs_tbl[hash];
2915
2916			/*
2917			 * The RSS table (indexed by hash) gives the RXQ index
2918			 * (mapped 1:1 with EVQs). Find the TXQ that uses the
2919			 * same EVQ as the RX data path.
2920			 */
2921			ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM],
2922			    >, index);
2923			txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
2924			stp = sp->s_stp[txq];
2925			ASSERT3U(stp->st_evq, ==, index);
2926		} else {
2927			ASSERT3U(sp->s_tx_scale_max[SFXGE_TXQ_NON_CKSUM], >, 0);
2928			index = 0;
2929			txq = sp->s_tx_scale_base[SFXGE_TXQ_NON_CKSUM] + index;
2930			stp = sp->s_stp[txq];
2931		}
2934	}
2935	ASSERT(stp != NULL);
2936
2937	ASSERT(mss == 0 || (DB_LSOFLAGS(mp) & HW_LSO));
2938
2939	/* Try to grab the lock */
2940	locked = mutex_tryenter(&(stp->st_lock));
2941
2942	if (locked) {
2943		/* Try to grab a packet from the pool */
2944		stpp = sfxge_tx_qfpp_get(stp);
2945	} else {
2946		stpp = NULL;
2947	}
2948
2949	if (stpp == NULL) {
2950		/*
2951		 * Either the pool was empty or we do not hold the lock, so
2952		 * allocate a new packet.
2953		 */
2954		if ((stpp = sfxge_tx_packet_create(sp)) == NULL) {
2955			rc = ENOSPC;
2956			goto fail3;
2957		}
2958	}
2959
2960	stpp->stp_mp = mp;
2961	stpp->stp_etherhp = etherhp;
2962	stpp->stp_iphp = iphp;
2963	stpp->stp_thp = thp;
2964	stpp->stp_off = off;
2965	stpp->stp_size = size;
2966	stpp->stp_mss = mss;
2967	stpp->stp_dpl_put_len = 0;
2968
2969	rc = sfxge_tx_qdpl_add(stp, stpp, locked);
2970	if (rc != 0) {
2971		/* ENOSPC can occur when the DPL get or put list is full */
2972		ASSERT3U(rc, ==, ENOSPC);
2973
2974		/*
2975		 * Note: in the unlocked DPL-put-list-full case there is no
2976		 * need to worry about a race with a locked
2977		 * sfxge_tx_qdpl_swizzle(), as we know that the TX DPL put
2978		 * list was full and would have been swizzled to the TX DPL
2979		 * get list; this guarantees future TX completions and calls
2980		 * to mac_tx_update() via sfxge_tx_qcomplete().
2981		 */
2982		goto fail4;
2983	}
2984
2985	/* Try to grab the lock again */
2986	if (!locked)
2987		locked = mutex_tryenter(&(stp->st_lock));
2988
2989	if (locked) {
2990		/* Try to service the list */
2991		sfxge_tx_qdpl_service(stp);
2992		/* lock has been dropped */
2993	}
2994
2995	return (0);
2996
2997fail4:
2998	DTRACE_PROBE(fail4);
2999	sfxge_tx_packet_destroy(sp, stpp);
3000fail3:
3001	DTRACE_PROBE(fail3);
3002	if (locked)
3003		mutex_exit(&(stp->st_lock));
3004fail2:
3005	DTRACE_PROBE(fail2);
3006fail1:
3007	DTRACE_PROBE1(fail1, int, rc);
3008
3009	if (rc != ENOSPC)
3010		freemsg(mp);
3011	return (rc);
3012}
3013
3014void
3015sfxge_tx_stop(sfxge_t *sp)
3016{
3017	efx_nic_t *enp = sp->s_enp;
3018	clock_t timeout;
3019	boolean_t wait_for_flush;
3020	int index;
3021
3022	ASSERT(mutex_owned(&(sp->s_state_lock)));
3023
3024	mutex_enter(&(sp->s_tx_flush_lock));
3025
3026	/* Flush all the queues */
3027	if (sp->s_hw_err == SFXGE_HW_OK) {
3028		wait_for_flush = B_TRUE;
3029	} else {
3030		/*
3031		 * The flag indicates a possible hardware failure.
3032		 * Attempt the flush, but do not wait for it to complete.
3033		 */
3034		wait_for_flush = B_FALSE;
3035	}
3036
3037	/* Prepare queues to stop and flush the hardware ring */
3038	index = EFX_ARRAY_SIZE(sp->s_stp);
3039	while (--index >= 0) {
3040		if (sp->s_stp[index] != NULL)
3041			sfxge_tx_qflush(sp, index, wait_for_flush);
3042	}
3043
3044	if (wait_for_flush == B_FALSE)
3045		goto flush_done;
3046
3047	/* Wait up to SFXGE_TX_QFLUSH_USEC for queue flushing to complete */
3048	timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_TX_QFLUSH_USEC);
3049
3050	while (sp->s_tx_flush_pending > 0) {
3051		if (cv_timedwait(&(sp->s_tx_flush_kv), &(sp->s_tx_flush_lock),
3052		    timeout) < 0) {
3053			/* Timeout waiting for queues to flush */
3054			dev_info_t *dip = sp->s_dip;
3055
3056			DTRACE_PROBE(timeout);
3057			dev_err(dip, CE_NOTE,
3058			    SFXGE_CMN_ERR "tx qflush timeout");
3059			break;
3060		}
3061	}
3062
3063flush_done:
3064	sp->s_tx_flush_pending = 0;
3065	mutex_exit(&(sp->s_tx_flush_lock));
3066
3067	/* Stop all the queues */
3068	index = EFX_ARRAY_SIZE(sp->s_stp);
3069	while (--index >= 0) {
3070		if (sp->s_stp[index] != NULL)
3071			sfxge_tx_qstop(sp, index);
3072	}
3073
3074	/* Tear down the transmit module */
3075	efx_tx_fini(enp);
3076}
3077
3078void
3079sfxge_tx_fini(sfxge_t *sp)
3080{
3081	int index;
3082
3083	index = EFX_ARRAY_SIZE(sp->s_stp);
3084	while (--index >= 0) {
3085		if (sp->s_stp[index] != NULL)
3086			sfxge_tx_qfini(sp, index);
3087	}
3088
3089	kmem_cache_destroy(sp->s_tqc);
3090	sp->s_tqc = NULL;
3091
3092	kmem_cache_destroy(sp->s_tmc);
3093	sp->s_tmc = NULL;
3094
3095	kmem_cache_destroy(sp->s_tbc);
3096	sp->s_tbc = NULL;
3097
3098	kmem_cache_destroy(sp->s_tpc);
3099	sp->s_tpc = NULL;
3100}
3101