/*
 * Copyright (c) 2008-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/strft.h>
#include <sys/ksynch.h>
#include <sys/ethernet.h>
#include <sys/crc32.h>
#include <sys/pattr.h>
#include <sys/cpu.h>

#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include "sfxge.h"

#include "efx.h"

/* RXQ flush response timeout (in microseconds) */
#define	SFXGE_RX_QFLUSH_USEC	(2000000)

/* RXQ flush tries in the case of failure */
#define	SFXGE_RX_QFLUSH_TRIES	(5)

/* RXQ default packet buffer preallocation (number of packet buffers) */
#define	SFXGE_RX_QPREALLOC	(0)

/* Receive packet DMA attributes */
static ddi_device_acc_attr_t sfxge_rx_packet_devacc = {

	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rx_packet_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version	*/
	0,			/* dma_attr_addr_lo	*/
	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
	0xffffffffffffffffull,	/* dma_attr_count_max	*/
	SFXGE_CPU_CACHE_SIZE,	/* dma_attr_align	*/
	0xffffffff,		/* dma_attr_burstsizes	*/
	1,			/* dma_attr_minxfer	*/
	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
	0xffffffffffffffffull,	/* dma_attr_seg		*/
	1,			/* dma_attr_sgllen	*/
	1,			/* dma_attr_granular	*/
	0			/* dma_attr_flags	*/
};

/* Receive queue DMA attributes */
static ddi_device_acc_attr_t sfxge_rxq_devacc = {

	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
};

static ddi_dma_attr_t sfxge_rxq_dma_attr = {
	DMA_ATTR_V0,		/* dma_attr_version	*/
	0,			/* dma_attr_addr_lo	*/
	0xffffffffffffffffull,	/* dma_attr_addr_hi	*/
	0xffffffffffffffffull,	/* dma_attr_count_max	*/
	EFX_BUF_SIZE,		/* dma_attr_align	*/
	0xffffffff,		/* dma_attr_burstsizes	*/
	1,			/* dma_attr_minxfer	*/
	0xffffffffffffffffull,	/* dma_attr_maxxfer	*/
	0xffffffffffffffffull,	/* dma_attr_seg		*/
	1,			/* dma_attr_sgllen	*/
	1,			/* dma_attr_granular	*/
	0			/* dma_attr_flags	*/
};

/* Forward declaration */
static void sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc);

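/*
 * kmem cache constructor/destructor pair for receive packets: the
 * constructor zeroes the object and allocates the per-packet DMA
 * handle that the destructor releases.
 */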
static int
sfxge_rx_packet_ctor(void *buf, void *arg, int kmflags)
{
	sfxge_rx_packet_t *srpp = buf;
	sfxge_t *sp = arg;
	dev_info_t *dip = sp->s_dip;
	int err;

	ASSERT3U(sizeof (srpp->__srp_u1.__srp_s1), <=,
	    sizeof (srpp->__srp_u1.__srp_pad));
	ASSERT3U(sizeof (srpp->__srp_u2.__srp_s2), <=,
	    sizeof (srpp->__srp_u2.__srp_pad));

	bzero(buf, sizeof (sfxge_rx_packet_t));

	/* Allocate a DMA handle */
	err = ddi_dma_alloc_handle(dip, &sfxge_rx_packet_dma_attr,
	    (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
	    NULL, &(srpp->srp_dma_handle));
	if (err != DDI_SUCCESS)
		goto fail1;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, err);

	SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);

	return (-1);
}

static void
sfxge_rx_packet_dtor(void *buf, void *arg)
{
	sfxge_rx_packet_t *srpp = buf;

	_NOTE(ARGUNUSED(arg))

	/* Free the DMA handle */
	ddi_dma_free_handle(&(srpp->srp_dma_handle));
	srpp->srp_dma_handle = NULL;

	SFXGE_OBJ_CHECK(srpp, sfxge_rx_packet_t);
}

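/*
 * kmem cache constructor for receive queues: sets up the descriptor
 * ring DMA buffer, buffer table entries, context array, flow table and
 * the per-CPU put lists of the free packet pool.
 */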
static int
sfxge_rx_qctor(void *buf, void *arg, int kmflags)
{
	sfxge_rxq_t *srp = buf;
	efsys_mem_t *esmp = &(srp->sr_mem);
	sfxge_t *sp = arg;
	sfxge_dma_buffer_attr_t dma_attr;
	sfxge_rx_fpp_t *srfppp;
	int nprealloc;
	unsigned int id;
	int rc;

	/* Compile-time structure layout checks */
	EFX_STATIC_ASSERT(sizeof (srp->__sr_u1.__sr_s1) <=
	    sizeof (srp->__sr_u1.__sr_pad));
	EFX_STATIC_ASSERT(sizeof (srp->__sr_u2.__sr_s2) <=
	    sizeof (srp->__sr_u2.__sr_pad));
	EFX_STATIC_ASSERT(sizeof (srp->__sr_u3.__sr_s3) <=
	    sizeof (srp->__sr_u3.__sr_pad));

	bzero(buf, sizeof (sfxge_rxq_t));

	srp->sr_sp = sp;

	dma_attr.sdba_dip	 = sp->s_dip;
	dma_attr.sdba_dattrp	 = &sfxge_rxq_dma_attr;
	dma_attr.sdba_callback	 = DDI_DMA_SLEEP;
	dma_attr.sdba_length	 = EFX_RXQ_SIZE(sp->s_rxq_size);
	dma_attr.sdba_memflags	 = DDI_DMA_CONSISTENT;
	dma_attr.sdba_devaccp	 = &sfxge_rxq_devacc;
	dma_attr.sdba_bindflags	 = DDI_DMA_READ | DDI_DMA_CONSISTENT;
	dma_attr.sdba_maxcookies = 1;
	dma_attr.sdba_zeroinit	 = B_FALSE;

	if ((rc = sfxge_dma_buffer_create(esmp, &dma_attr)) != 0)
		goto fail1;

	/* Allocate some buffer table entries */
	if ((rc = sfxge_sram_buf_tbl_alloc(sp, EFX_RXQ_NBUFS(sp->s_rxq_size),
	    &(srp->sr_id))) != 0)
		goto fail2;

	/* Allocate the context array */
	if ((srp->sr_srpp = kmem_zalloc(sizeof (sfxge_rx_packet_t *) *
	    sp->s_rxq_size, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail3;
	}

	/* Allocate the flow table */
	if ((srp->sr_flow = kmem_zalloc(sizeof (sfxge_rx_flow_t) *
	    SFXGE_MAX_FLOW, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail4;
	}

	srp->sr_srfpp = &(srp->sr_srfp);
	srp->sr_rto = drv_usectohz(200000);

	srp->sr_mpp = &(srp->sr_mp);

	/* Initialize the free packet pool */
	srfppp = &(srp->sr_fpp);
	if ((srfppp->srfpp_putp = kmem_zalloc(SFXGE_CPU_CACHE_SIZE *
	    SFXGE_RX_FPP_NSLOTS, kmflags)) == NULL) {
		rc = ENOMEM;
		goto fail5;
	}
	for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
		sfxge_rx_fpp_putlist_t *putp;
		size_t off;

		off = id * SFXGE_CPU_CACHE_SIZE;
		putp = (void *)(srfppp->srfpp_putp + off);

		putp->srfpl_putp = NULL;
		putp->srfpl_putpp = &(putp->srfpl_putp);
		mutex_init(&(putp->srfpl_lock), NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(sp->s_intr.si_intr_pri));
	}

	cv_init(&(srp->sr_flush_kv), NULL, CV_DRIVER, NULL);

	/* Preallocate some packets on the free packet pool */
	nprealloc = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_prealloc_pkt_buffers", SFXGE_RX_QPREALLOC);
	sfxge_rx_qpreallocate(srp, nprealloc);

	return (0);

fail5:
	DTRACE_PROBE(fail5);

	srp->sr_mpp = NULL;

	srp->sr_rto = 0;
	srp->sr_srfpp = NULL;

	/* Free the flow table */
	kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
	    SFXGE_MAX_FLOW);
	srp->sr_flow = NULL;

fail4:
	DTRACE_PROBE(fail4);

	/* Free the context array */
	kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
	    sp->s_rxq_size);
	srp->sr_srpp = NULL;

fail3:
	DTRACE_PROBE(fail3);

	/* Free the buffer table entries */
	sfxge_sram_buf_tbl_free(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));
	srp->sr_id = 0;

fail2:
	DTRACE_PROBE(fail2);
	/* Remove dma setup */
	sfxge_dma_buffer_destroy(esmp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	srp->sr_sp = NULL;

	SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);

	return (-1);
}

static void
sfxge_rx_qdtor(void *buf, void *arg)
{
	sfxge_rxq_t *srp = buf;
	efsys_mem_t *esmp = &(srp->sr_mem);
	sfxge_t *sp = srp->sr_sp;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int id;

	_NOTE(ARGUNUSED(arg))

	cv_destroy(&(srp->sr_flush_kv));

	/* Tear down the free packet pool */
	for (id = 0; id < SFXGE_RX_FPP_NSLOTS; id++) {
		sfxge_rx_fpp_putlist_t *putp;
		size_t off;

		off = id * SFXGE_CPU_CACHE_SIZE;
		putp = (void *)(srfppp->srfpp_putp + off);

		putp->srfpl_putpp = NULL;
		mutex_destroy(&(putp->srfpl_lock));

		SFXGE_OBJ_CHECK(putp, sfxge_rx_fpp_putlist_t);
	}
	kmem_free(srfppp->srfpp_putp, SFXGE_CPU_CACHE_SIZE *
	    SFXGE_RX_FPP_NSLOTS);
	srfppp->srfpp_putp = NULL;

	srp->sr_mpp = NULL;

	srp->sr_rto = 0;
	srp->sr_srfpp = NULL;

	/* Free the flow table */
	kmem_free(srp->sr_flow, sizeof (sfxge_rx_flow_t) *
	    SFXGE_MAX_FLOW);
	srp->sr_flow = NULL;

	/* Free the context array */
	kmem_free(srp->sr_srpp, sizeof (sfxge_rx_packet_t *) *
	    sp->s_rxq_size);
	srp->sr_srpp = NULL;

	/* Free the buffer table entries */
	sfxge_sram_buf_tbl_free(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));
	srp->sr_id = 0;

	/* Tear down dma setup */
	sfxge_dma_buffer_destroy(esmp);

	SFXGE_OBJ_CHECK(srp, sfxge_rxq_t);
}

/* Note: This function takes ownership of *srpp. */
static inline void
sfxge_rx_qfpp_put(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	mblk_t *mp = srpp->srp_mp;
	unsigned int id;
	size_t off;
	sfxge_rx_fpp_putlist_t *putp;

	ASSERT3P(mp->b_next, ==, NULL);
	ASSERT3P(mp->b_prev, ==, NULL);

	id = CPU->cpu_seqid & SFXGE_RX_FPP_MASK;
	off = id * SFXGE_CPU_CACHE_SIZE;

	ASSERT3P(srpp->srp_putp, ==, srfppp->srfpp_putp);
	putp = (void *)(srpp->srp_putp + off);

	mutex_enter(&(putp->srfpl_lock));
	putp->srfpl_count++;
	*putp->srfpl_putpp = mp;
	putp->srfpl_putpp = &(mp->b_next);
	mutex_exit(&(putp->srfpl_lock));
}

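/*
 * Fold the contents of every per-CPU put list onto the get list,
 * adjusting the loaned/available counts as packets return. Called with
 * the event queue lock held; returns the number of packets still on loan.
 */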
static unsigned int
sfxge_rx_qfpp_swizzle(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int start;
	unsigned int id;
	mblk_t *p;
	mblk_t **pp;
	unsigned int count;
	unsigned int loaned;

	ASSERT(mutex_owned(&(sep->se_lock)));

	/* We want to access the put list for the current CPU last */
	id = start = (CPU->cpu_seqid + 1) & SFXGE_RX_FPP_MASK;

	do {
		sfxge_rx_fpp_putlist_t *putp;
		size_t off;

		off = id * SFXGE_CPU_CACHE_SIZE;
		id  = (id + 1) & SFXGE_RX_FPP_MASK;

		putp = (void *)(srfppp->srfpp_putp + off);

		/* Acquire the put list */
		mutex_enter(&(putp->srfpl_lock));

		p = putp->srfpl_putp;
		pp = putp->srfpl_putpp;
		count = putp->srfpl_count;

		putp->srfpl_putp = NULL;
		putp->srfpl_putpp = &(putp->srfpl_putp);
		putp->srfpl_count = 0;

		mutex_exit(&(putp->srfpl_lock));

		if (p == NULL)
			continue;

		/* Add the list to the head of the get list */
		*pp = srfppp->srfpp_get;
		srfppp->srfpp_get = p;

		/* Adjust the counters */
		ASSERT3U(srfppp->srfpp_loaned, >=, count);
		srfppp->srfpp_loaned -= count;
		srfppp->srfpp_count += count;

#if 0
		/* NOTE: this probe is disabled because it is expensive!! */
		DTRACE_PROBE2(count,
		    unsigned int, (id - 1) & SFXGE_RX_FPP_MASK,
		    unsigned int, count);
#endif

	} while (id != start);

	/* Return the number of packets yet to appear in the put list */
	loaned = srfppp->srfpp_loaned;

	return (loaned);
}

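/* Look up the free routine (frtn_t) attached to an mblk's data block */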
#define	DB_FRTNP(mp)	((mp)->b_datap->db_frtnp)

static void
sfxge_rx_qfpp_empty(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rx_fpp_t *srfppp;
	mblk_t *mp;

	mutex_enter(&(sep->se_lock));
	srfppp = &(srp->sr_fpp);

	/* Swizzle put list to get list */
	(void) sfxge_rx_qfpp_swizzle(srp);
	ASSERT3U(srfppp->srfpp_loaned, ==, 0);

	mp = srfppp->srfpp_get;
	srfppp->srfpp_get = NULL;

	/* Free the remainder */
	while (mp != NULL) {
		mblk_t *next;
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;

		next = mp->b_next;
		mp->b_next = NULL;

		ASSERT3U(srfppp->srfpp_count, >, 0);
		srfppp->srfpp_count--;

		freep = DB_FRTNP(mp);
		/*
		 * ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
		 *   is implied by srpp test below
		 */
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, mp);
		ASSERT3P(mp->b_cont, ==, NULL);
		srpp->srp_recycle = B_FALSE;

		freeb(mp);

		mp = next;
	}
	ASSERT3U(srfppp->srfpp_count, ==, 0);

	srfppp->srfpp_min = 0;

	mutex_exit(&(sep->se_lock));
}

/*
 * This is an estimate of all memory consumed per RX packet.
 * It can be inaccurate, but sp->s_rx_pkt_mem_alloc mustn't drift.
 */
static uint64_t
sfxge_rx_pkt_mem_approx(const sfxge_rx_packet_t *srpp)
{
	return (srpp->srp_mblksize + sizeof (mblk_t) + sizeof (dblk_t) +
	    sizeof (sfxge_rx_packet_t));
}

static void
sfxge_rx_qpacket_destroy(sfxge_rxq_t *srp, sfxge_rx_packet_t *srpp)
{
	sfxge_t *sp = srp->sr_sp;
	int64_t delta = sfxge_rx_pkt_mem_approx(srpp);

	ASSERT(!(srpp->srp_recycle));
	ASSERT3P(srpp->srp_mp, ==, NULL);

	srpp->srp_off = 0;
	srpp->srp_thp = NULL;
	srpp->srp_iphp = NULL;
	srpp->srp_etherhp = NULL;
	srpp->srp_size = 0;
	srpp->srp_flags = 0;

	bzero(&(srpp->srp_free), sizeof (frtn_t));

	srpp->srp_mblksize = 0;
	srpp->srp_base = NULL;

	/* Unbind the DMA memory from the DMA handle */
	srpp->srp_addr = 0;
	(void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

	/* Free the DMA memory */
	srpp->srp_base = NULL;
	ddi_dma_mem_free(&(srpp->srp_acc_handle));
	srpp->srp_acc_handle = NULL;

	srpp->srp_putp = NULL;
	srpp->srp_srp = NULL;

	kmem_cache_free(sp->s_rpc, srpp);
	if (sp->s_rx_pkt_mem_max)
		atomic_add_64(&sp->s_rx_pkt_mem_alloc, -delta);
}

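/*
 * desballoc() free callback: either recycle the packet back onto the
 * free packet pool (allocating a replacement mblk first) or tear the
 * packet down completely.
 */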
static void
sfxge_rx_qpacket_free(void *arg)
{
	sfxge_rx_packet_t *srpp = arg;
	sfxge_rxq_t *srp = srpp->srp_srp;

	/*
	 * WARNING "man -s 9f esballoc" states:
	 * => runs sync from the thread calling freeb()
	 * => must not sleep, or access data structures that could be freed
	 */

	/* Check whether we want to recycle the receive packets */
	if (srpp->srp_recycle) {
		frtn_t *freep;
		mblk_t *mp;
		size_t size;

		freep = &(srpp->srp_free);
		ASSERT3P(freep->free_func, ==, sfxge_rx_qpacket_free);
		ASSERT3P(freep->free_arg, ==, (caddr_t)srpp);

		/*
		 * Allocate a matching mblk_t before the current one is
		 * freed.
		 */
		size = srpp->srp_mblksize;

		if ((mp = desballoc(srpp->srp_base, size, BPRI_HI,
		    freep)) != NULL) {
			srpp->srp_mp = mp;

			/* NORMAL recycled case */
			sfxge_rx_qfpp_put(srp, srpp);
			return;
		}
	}

	srpp->srp_mp = NULL;

	sfxge_rx_qpacket_destroy(srp, srpp);
}

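/*
 * Allocate and DMA-map a new receive packet, wrapping the buffer in a
 * desballoc()'d mblk so that freeb() hands it back to the pool.
 */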
static sfxge_rx_packet_t *
sfxge_rx_qpacket_create(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	sfxge_rx_packet_t *srpp;
	size_t size;
	caddr_t base;
	size_t unit;
	ddi_dma_cookie_t dmac;
	unsigned int ncookies;
	frtn_t *freep;
	mblk_t *mp;
	int err;
	int rc;

	size = sp->s_rx_buffer_size;

	if (sp->s_rx_pkt_mem_max &&
	    (sp->s_rx_pkt_mem_alloc + size >= sp->s_rx_pkt_mem_max)) {
		DTRACE_PROBE(rx_pkt_mem_max);
		srp->sr_kstat.srk_rx_pkt_mem_limit++;
		return (NULL);
	}

	/* Allocate a new packet */
	if ((srpp = kmem_cache_alloc(sp->s_rpc, KM_NOSLEEP)) == NULL) {
		srp->sr_kstat.srk_kcache_alloc_nomem++;
		rc = ENOMEM;
		goto fail1;
	}

	srpp->srp_srp = srp;
	srpp->srp_putp = srfppp->srfpp_putp;

	/* Allocate some DMA memory */
	err = ddi_dma_mem_alloc(srpp->srp_dma_handle, size,
	    &sfxge_rx_packet_devacc, DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    NULL, &base, &unit, &(srpp->srp_acc_handle));
	switch (err) {
	case DDI_SUCCESS:
		break;

	case DDI_FAILURE:
		srp->sr_kstat.srk_dma_alloc_nomem++;
		rc = ENOMEM;
		goto fail2;

	default:
		srp->sr_kstat.srk_dma_alloc_fail++;
		rc = EFAULT;
		goto fail2;
	}

	/* Adjust the buffer to align the start of the DMA area correctly */
	base += sp->s_rx_buffer_align;
	size -= sp->s_rx_buffer_align;

	/* Bind the DMA memory to the DMA handle */
	err = ddi_dma_addr_bind_handle(srpp->srp_dma_handle, NULL,
	    base, size, DDI_DMA_READ | DDI_DMA_STREAMING,
	    DDI_DMA_DONTWAIT, NULL, &dmac, &ncookies);
	switch (err) {
	case DDI_DMA_MAPPED:
		break;

	case DDI_DMA_INUSE:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = EEXIST;
		goto fail3;

	case DDI_DMA_NORESOURCES:
		srp->sr_kstat.srk_dma_bind_nomem++;
		rc = ENOMEM;
		goto fail3;

	case DDI_DMA_NOMAPPING:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = ENOTSUP;
		goto fail3;

	case DDI_DMA_TOOBIG:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = EFBIG;
		goto fail3;

	default:
		srp->sr_kstat.srk_dma_bind_fail++;
		rc = EFAULT;
		goto fail3;
	}
	ASSERT3U(ncookies, ==, 1);

	srpp->srp_addr = dmac.dmac_laddress;

	srpp->srp_base = (unsigned char *)base;
	srpp->srp_mblksize = size;

	/*
	 * Allocate a STREAMS block: We use size 1 so that the allocator will
	 * use the first (and smallest) dblk cache.
	 */
	freep = &(srpp->srp_free);
	freep->free_func = sfxge_rx_qpacket_free;
	freep->free_arg  = (caddr_t)srpp;

	if ((mp = desballoc(srpp->srp_base, size, BPRI_HI, freep)) == NULL) {
		srp->sr_kstat.srk_desballoc_fail++;
		rc = ENOMEM;
		goto fail4;
	}

	srpp->srp_mp = mp;
	srpp->srp_recycle = B_TRUE;

	if (sp->s_rx_pkt_mem_max) {
		int64_t delta = sfxge_rx_pkt_mem_approx(srpp);
		atomic_add_64(&sp->s_rx_pkt_mem_alloc, delta);
	}

	return (srpp);

fail4:
	DTRACE_PROBE(fail4);

	bzero(&(srpp->srp_free), sizeof (frtn_t));

	srpp->srp_mblksize = 0;
	srpp->srp_base = NULL;

	/* Unbind the DMA memory from the DMA handle */
	srpp->srp_addr = 0;
	(void) ddi_dma_unbind_handle(srpp->srp_dma_handle);

fail3:
	DTRACE_PROBE(fail3);

	/* Free the DMA memory */
	ddi_dma_mem_free(&(srpp->srp_acc_handle));
	srpp->srp_acc_handle = NULL;

fail2:
	DTRACE_PROBE(fail2);

	srpp->srp_putp = NULL;
	srpp->srp_srp = NULL;

	kmem_cache_free(sp->s_rpc, srpp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (NULL);
}

#define	SFXGE_REFILL_BATCH  64

/* Try to refill the RX descriptor ring from the associated free pkt pool */
static void
sfxge_rx_qrefill(sfxge_rxq_t *srp, unsigned int target)
{
	sfxge_t *sp = srp->sr_sp;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
	mblk_t *mp;
	int ntodo;
	unsigned int count;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;

	prefetch_read_many(sp->s_enp);
	prefetch_read_many(srp->sr_erp);

	ASSERT(mutex_owned(&(sep->se_lock)));

	if (srp->sr_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = srp->sr_added - srp->sr_completed;
	ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
	ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
	ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

	if (ntodo == 0)
		goto out;

	(void) sfxge_rx_qfpp_swizzle(srp);

	mp = srfppp->srfpp_get;
	count = srfppp->srfpp_count;
	mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

	batch = 0;
	while (ntodo-- > 0) {
		mblk_t *next;
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;
		unsigned int id;

		if (mp == NULL)
			break;

		next = mp->b_next;
		mp->b_next = NULL;

		if (next != NULL)
			prefetch_read_many(next);

		freep = DB_FRTNP(mp);
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, mp);

		/* The MTU may have changed since the packet was allocated */
		if (MBLKSIZE(mp) != mblksize) {
			srpp->srp_recycle = B_FALSE;

			freeb(mp);

			--count;
			mp = next;
			continue;
		}

		srpp->srp_off = 0;
		srpp->srp_thp = NULL;
		srpp->srp_iphp = NULL;
		srpp->srp_etherhp = NULL;
		srpp->srp_size = 0;
		srpp->srp_flags = EFX_DISCARD;

		id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
		ASSERT(srp->sr_srpp[id] == NULL);
		srp->sr_srpp[id] = srpp;

		addr[batch++] = srpp->srp_addr;
		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
			    srp->sr_completed, srp->sr_added);
			srp->sr_added += batch;
			batch = 0;
		}

		--count;
		mp = next;
	}

	srfppp->srfpp_get = mp;
	srfppp->srfpp_count = count;

	if (batch != 0) {
		efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
		    srp->sr_completed, srp->sr_added);
		srp->sr_added += batch;
	}

	efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);

out:
	if (srfppp->srfpp_count < srfppp->srfpp_min)
		srfppp->srfpp_min = srfppp->srfpp_count;
}

/* Preallocate packets and put them in the free packet pool */
static void
sfxge_rx_qpreallocate(sfxge_rxq_t *srp, int nprealloc)
{
	sfxge_rx_fpp_t *srfppp = &((srp)->sr_fpp);
	srfppp->srfpp_lowat = nprealloc;
	while (nprealloc-- > 0) {
		sfxge_rx_packet_t *srpp;

		if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
			break;
		sfxge_rx_qfpp_put(srp, srpp);
	}
}

/* Try to refill the RX descriptor ring by allocating new packets */
static void
sfxge_rx_qfill(sfxge_rxq_t *srp, unsigned int target)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
	mblk_t *mp = NULL;

	prefetch_read_many(sp->s_enp);
	prefetch_read_many(srp->sr_erp);

	ASSERT(mutex_owned(&(sep->se_lock)));

	if (srp->sr_state != SFXGE_RXQ_STARTED)
		return;

	rxfill = srp->sr_added - srp->sr_completed;
	ASSERT3U(rxfill, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));
	ntodo = min(EFX_RXQ_LIMIT(sp->s_rxq_size) - rxfill, target);
	ASSERT3U(ntodo, <=, EFX_RXQ_LIMIT(sp->s_rxq_size));

	if (ntodo == 0)
		return;

	mblksize = sp->s_rx_buffer_size - sp->s_rx_buffer_align;

	batch = 0;
	while (ntodo-- > 0) {
		sfxge_rx_packet_t *srpp;
		unsigned int id;

		if ((srpp = sfxge_rx_qpacket_create(srp)) == NULL)
			break;

		mp = srpp->srp_mp;

		ASSERT3U(MBLKSIZE(mp), ==, mblksize);

		ASSERT3U(srpp->srp_off, ==, 0);
		ASSERT3P(srpp->srp_thp, ==, NULL);
		ASSERT3P(srpp->srp_iphp, ==, NULL);
		ASSERT3P(srpp->srp_etherhp, ==, NULL);
		ASSERT3U(srpp->srp_size, ==, 0);

		srpp->srp_flags = EFX_DISCARD;

		id = (srp->sr_added + batch) & (sp->s_rxq_size - 1);
		ASSERT(srp->sr_srpp[id] == NULL);
		srp->sr_srpp[id] = srpp;

		addr[batch++] = srpp->srp_addr;
		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
			    srp->sr_completed, srp->sr_added);
			srp->sr_added += batch;
			batch = 0;
		}
	}

	if (batch != 0) {
		efx_rx_qpost(srp->sr_erp, addr, mblksize, batch,
		    srp->sr_completed, srp->sr_added);
		srp->sr_added += batch;
	}

	efx_rx_qpush(srp->sr_erp, srp->sr_added, &srp->sr_pushed);
}

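/*
 * Release surplus packets from the free packet pool, trimming it down
 * to the larger of the low water mark and its recent minimum occupancy.
 */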
void
sfxge_rx_qfpp_trim(sfxge_rxq_t *srp)
{
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	mblk_t *p;
	mblk_t **pp;
	int count;

	ASSERT(mutex_owned(&(sep->se_lock)));

	if (srp->sr_state != SFXGE_RXQ_STARTED)
		goto done;

	/* Make sure the queue is full */
	sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

	/* The refill may have emptied the pool */
	if (srfppp->srfpp_min == 0)
		goto done;

	/* Don't trim below the pool's low water mark */
	if (srfppp->srfpp_count <= srfppp->srfpp_lowat)
		goto done;

	ASSERT(srfppp->srfpp_min <= srfppp->srfpp_count);

	/* Trim to the largest of srfppp->srfpp_min and srfppp->srfpp_lowat */
	if (srfppp->srfpp_lowat > srfppp->srfpp_min)
		count = srfppp->srfpp_count - srfppp->srfpp_lowat;
	else
		count = srfppp->srfpp_count - srfppp->srfpp_min;

	/* Walk the get list */
	pp = &(srfppp->srfpp_get);
	while (--count >= 0) {
		ASSERT(pp);
		p = *pp;
		ASSERT(p != NULL);

		pp = &(p->b_next);
	}
	ASSERT(pp);
	p = *pp;

	/* Truncate the get list */
	*pp = NULL;

	/* Free the remainder */
	while (p != NULL) {
		mblk_t *next;
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;

		next = p->b_next;
		p->b_next = NULL;

		ASSERT3U(srfppp->srfpp_min, >, 0);
		srfppp->srfpp_min--;
		srfppp->srfpp_count--;

		freep = DB_FRTNP(p);
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, p);

		srpp->srp_recycle = B_FALSE;

		freeb(p);

		p = next;
	}

done:
	srfppp->srfpp_min = srfppp->srfpp_count;
}

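/*
 * Periodic timeout(9f) handler: post a "trim" magic event so the free
 * packet pool is pruned in event queue context, then reschedule.
 */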
static void
sfxge_rx_qpoll(void *arg)
{
	sfxge_rxq_t *srp = arg;
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	uint16_t magic;

	/*
	 * man timeout(9f) states that this code should adhere to the
	 * same requirements as a softirq handler - DO NOT BLOCK
	 */

	/*
	 * Post an event to the event queue to cause the free packet pool to be
	 * trimmed if it is oversize.
	 */
	magic = SFXGE_MAGIC_RX_QFPP_TRIM | index;

#if defined(DEBUG)
	/* This is guaranteed due to the start/stop order of rx and ev */
	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);
#else
	/*
	 * Bug22691 WORKAROUND:
	 * This handler has been observed in the field to be invoked for a
	 * queue in the INITIALIZED state, which should never happen.
	 * Until the mechanism for this is properly understood, add defensive
	 * checks.
	 */
	if ((sep->se_state != SFXGE_EVQ_STARTED) ||
	    (srp->sr_state != SFXGE_RXQ_STARTED) ||
	    (!sep->se_eep)) {
		dev_err(sp->s_dip, CE_WARN, SFXGE_CMN_ERR
		    "RXQ[%d] bad state in sfxge_rx_qpoll %d %d %p",
		    index, sep->se_state, srp->sr_state, sep->se_eep);
		return;
	}
#endif
	efx_ev_qpost(sep->se_eep, magic);

	srp->sr_tid = timeout(sfxge_rx_qpoll, srp,
	    drv_usectohz(sp->s_rxq_poll_usec));
}

static void
sfxge_rx_qpoll_start(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];

	ASSERT(mutex_owned(&(sep->se_lock)));
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

	/* Schedule a poll */
	ASSERT3P(srp->sr_tid, ==, 0);
	srp->sr_tid = timeout(sfxge_rx_qpoll, srp, 0);
}

static void
sfxge_rx_qpoll_stop(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	timeout_id_t tid;

	ASSERT(mutex_owned(&(sep->se_lock)));
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

	/*
	 * Cancel the qpoll timer. Care is needed as this function
	 * can race with sfxge_rx_qpoll() for timeout id updates.
	 *
	 * Do not hold locks used by any timeout(9f) handlers across
	 * calls to untimeout(9f) as this will deadlock.
	 */
	tid = 0;
	while ((srp->sr_tid != 0) && (srp->sr_tid != tid)) {
		tid = srp->sr_tid;
		(void) untimeout(tid);
	}
	srp->sr_tid = 0;
}

static int
sfxge_rx_kstat_update(kstat_t *ksp, int rw)
{
	sfxge_rxq_t *srp = ksp->ks_private;
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	kstat_named_t *knp;
	int rc;

	if (rw != KSTAT_READ) {
		rc = EACCES;
		goto fail1;
	}

	ASSERT(mutex_owned(&(sep->se_lock)));
	if (srp->sr_state != SFXGE_RXQ_STARTED)
		goto done;

	knp = ksp->ks_data;
	/* NB pointer post-increment below */
	knp++->value.ui32 = srp->sr_kstat.srk_rx_pkt_mem_limit;
	knp++->value.ui32 = srp->sr_kstat.srk_kcache_alloc_nomem;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_nomem;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_alloc_fail;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_nomem;
	knp++->value.ui32 = srp->sr_kstat.srk_dma_bind_fail;
	knp++->value.ui32 = srp->sr_kstat.srk_desballoc_fail;
	knp++->value.ui32 = srp->sr_kstat.srk_rxq_empty_discard;

done:
	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static int
sfxge_rx_kstat_init(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	dev_info_t *dip = sp->s_dip;
	char name[MAXNAMELEN];
	kstat_t *ksp;
	kstat_named_t *knp;
	int rc;

	/* Create the set */
	(void) snprintf(name, MAXNAMELEN - 1, "%s_rxq%04d",
	    ddi_driver_name(dip), index);

	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
	    ddi_get_instance(dip), name, "rxq", KSTAT_TYPE_NAMED,
	    SFXGE_RX_NSTATS, 0)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}

	srp->sr_ksp = ksp;

	ksp->ks_update = sfxge_rx_kstat_update;
	ksp->ks_private = srp;
	ksp->ks_lock = &(sep->se_lock);

	/* Initialise the named stats */
	knp = ksp->ks_data;
	kstat_named_init(knp, "rx_pkt_mem_limit", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "kcache_alloc_nomem", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_alloc_nomem", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_alloc_fail", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_bind_nomem", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "dma_bind_fail", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "desballoc_fail", KSTAT_DATA_UINT32);
	knp++;
	kstat_named_init(knp, "rxq_empty_discard", KSTAT_DATA_UINT32);

	kstat_install(ksp);
	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static int
sfxge_rx_qinit(sfxge_t *sp, unsigned int index)
{
	sfxge_rxq_t *srp;
	int rc;

	ASSERT3U(index, <, SFXGE_RX_SCALE_MAX);

	if ((srp = kmem_cache_alloc(sp->s_rqc, KM_SLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_UNINITIALIZED);

	srp->sr_index = index;
	sp->s_srp[index] = srp;

	if ((rc = sfxge_rx_kstat_init(srp)) != 0)
		goto fail2;

	srp->sr_state = SFXGE_RXQ_INITIALIZED;

	return (0);

fail2:
	DTRACE_PROBE(fail2);
	kmem_cache_free(sp->s_rqc, srp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static int
sfxge_rx_qstart(sfxge_t *sp, unsigned int index)
{
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rxq_t *srp;
	efsys_mem_t *esmp;
	efx_nic_t *enp;
	unsigned int level;
	int rc;

	mutex_enter(&(sep->se_lock));
	srp = sp->s_srp[index];
	enp = sp->s_enp;
	esmp = &(srp->sr_mem);

	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
	ASSERT3U(sep->se_state, ==, SFXGE_EVQ_STARTED);

	/* Zero the memory */
	bzero(esmp->esm_base, EFX_RXQ_SIZE(sp->s_rxq_size));

	/* Program the buffer table */
	if ((rc = sfxge_sram_buf_tbl_set(sp, srp->sr_id, esmp,
	    EFX_RXQ_NBUFS(sp->s_rxq_size))) != 0)
		goto fail1;

	/* Create the receive queue */
	if ((rc = efx_rx_qcreate(enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sp->s_rxq_size, srp->sr_id, sep->se_eep, &(srp->sr_erp)))
	    != 0)
		goto fail2;

	/* Enable the receive queue */
	efx_rx_qenable(srp->sr_erp);

	/* Set the water marks */
	srp->sr_hiwat = EFX_RXQ_LIMIT(sp->s_rxq_size) * 9 / 10;
	srp->sr_lowat = srp->sr_hiwat / 2;

	srp->sr_state = SFXGE_RXQ_STARTED;
	srp->sr_flush = SFXGE_FLUSH_INACTIVE;

	sfxge_rx_qpoll_start(srp);

	/* Try to fill the queue from the pool */
	sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

	/*
	 * If there were insufficient buffers in the pool to reach at least
	 * a batch then allocate some.
	 */
	level = srp->sr_added - srp->sr_completed;
	if (level < SFXGE_RX_BATCH)
		sfxge_rx_qfill(srp, SFXGE_RX_BATCH);

	mutex_exit(&(sep->se_lock));

	return (0);

fail2:
	DTRACE_PROBE(fail2);

	/* Clear entries from the buffer table */
	sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(sep->se_lock));

	return (rc);
}

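/*
 * Complete a coalesced flow: patch the IP total length and the first
 * TCP header, then append the merged chain to the queue's packet list.
 */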
static void
sfxge_rx_qflow_complete(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp)
{
	mblk_t *mp;
	struct ether_header *etherhp;
	struct ip *iphp;
	struct tcphdr *thp;

	if (srfp->srf_mp == NULL)
		return;

	mp = srfp->srf_mp;
	etherhp = srfp->srf_etherhp;
	iphp = srfp->srf_iphp;
	thp = srfp->srf_last_thp;

	ASSERT3U(((etherhp->ether_type == htons(ETHERTYPE_VLAN)) ?
	    sizeof (struct ether_vlan_header) :
	    sizeof (struct ether_header)) +
	    srfp->srf_len, ==, msgdsize(mp));

	ASSERT3U(srfp->srf_len & 0xffff, ==, srfp->srf_len);
	iphp->ip_len = htons(srfp->srf_len);

	srfp->srf_first_thp->th_ack = thp->th_ack;
	srfp->srf_first_thp->th_win = thp->th_win;
	srfp->srf_first_thp->th_flags = thp->th_flags;

	DTRACE_PROBE2(flow_complete, uint32_t, srfp->srf_tag,
	    size_t, srfp->srf_len);

	srfp->srf_mp = NULL;
	srfp->srf_len = 0;

	ASSERT(mp->b_next == NULL);
	*(srp->sr_mpp) = mp;
	srp->sr_mpp = &(mp->b_next);
}

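/*
 * Try to append a TCP segment to an existing flow, returning B_TRUE on
 * success. Out-of-order segments, pure ACKs, URG/SYN/RST segments and
 * flows still in slow-start complete the pending chain instead.
 */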
static boolean_t
sfxge_rx_qflow_add(sfxge_rxq_t *srp, sfxge_rx_flow_t *srfp,
    sfxge_rx_packet_t *srpp, clock_t now)
{
	sfxge_t *sp = srp->sr_sp;
	struct ether_header *etherhp = srpp->srp_etherhp;
	struct ip *iphp = srpp->srp_iphp;
	struct tcphdr *thp = srpp->srp_thp;
	size_t off = srpp->srp_off;
	size_t size = (size_t)(srpp->srp_size);
	mblk_t *mp = srpp->srp_mp;
	uint32_t seq;
	unsigned int shift;

	ASSERT3U(MBLKL(mp), ==, off + size);
	ASSERT3U(DB_CKSUMFLAGS(mp), ==,
	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK | HCK_IPV4_HDRCKSUM);

	seq = htonl(thp->th_seq);

	/*
	 * If the time between this segment and the last is greater than RTO
	 * then consider this a new flow.
	 */
	if (now - srfp->srf_lbolt > srp->sr_rto) {
		srfp->srf_count = 1;
		srfp->srf_seq = seq + size;

		goto fail1;
	}

	if (seq != srfp->srf_seq) {
		if (srfp->srf_count > SFXGE_SLOW_START)
			srfp->srf_count = SFXGE_SLOW_START;

		srfp->srf_count >>= 1;

		srfp->srf_count++;
		srfp->srf_seq = seq + size;

		goto fail2;
	}

	/* Update the in-order segment count and sequence number */
	srfp->srf_count++;
	srfp->srf_seq = seq + size;

	/* Don't merge across pure ACK, URG, SYN or RST segments */
	if (size == 0 || thp->th_flags & (TH_URG | TH_SYN | TH_RST) ||
	    thp->th_urp != 0)
		goto fail3;

	/*
	 * If the in-order segment count has not yet reached the slow-start
	 * threshold then we cannot coalesce.
	 */
	if (srfp->srf_count < SFXGE_SLOW_START)
		goto fail4;

	/* Scale up the packet size from 4k (the maximum being 64k) */
	ASSERT3U(srfp->srf_count, >=, SFXGE_SLOW_START);
	shift = MIN(srfp->srf_count - SFXGE_SLOW_START + 12, 16);
	if (srfp->srf_len + size >= (1 << shift))
		sfxge_rx_qflow_complete(srp, srfp);

	ASSERT(mp->b_cont == NULL);

	if (srfp->srf_mp == NULL) {
		/* First packet in this flow */
		srfp->srf_etherhp = etherhp;
		srfp->srf_iphp = iphp;
		srfp->srf_first_thp = srfp->srf_last_thp = thp;

		ASSERT3P(mp->b_cont, ==, NULL);
		srfp->srf_mp = mp;
		srfp->srf_mpp = &(mp->b_cont);

		srfp->srf_len = ntohs(iphp->ip_len);

		/*
		 * If the flow is not already in the list of occupied flows then
		 * add it.
		 */
		if (srfp->srf_next == NULL &&
		    srp->sr_srfpp != &(srfp->srf_next)) {
			*(srp->sr_srfpp) = srfp;
			srp->sr_srfpp = &(srfp->srf_next);
		}
	} else {
		/* Later packet in this flow - skip TCP header */
		srfp->srf_last_thp = thp;

		mp->b_rptr += off;
		ASSERT3U(MBLKL(mp), ==, size);

		ASSERT3P(mp->b_cont, ==, NULL);
		*(srfp->srf_mpp) = mp;
		srfp->srf_mpp = &(mp->b_cont);

		srfp->srf_len += size;

		ASSERT(srfp->srf_next != NULL ||
		    srp->sr_srfpp == &(srfp->srf_next));
	}

	DTRACE_PROBE2(flow_add, uint32_t, srfp->srf_tag, size_t, size);

	/*
	 * Try to align coalesced segments on push boundaries, unless they
	 * are too frequent.
	 */
	if (sp->s_rx_coalesce_mode == SFXGE_RX_COALESCE_ALLOW_PUSH &&
	    thp->th_flags & TH_PUSH)
		sfxge_rx_qflow_complete(srp, srfp);

	srfp->srf_lbolt = now;
	return (B_TRUE);

fail4:
fail3:
fail2:
fail1:
	sfxge_rx_qflow_complete(srp, srfp);

	srfp->srf_lbolt = now;
	return (B_FALSE);
}

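/*
 * Walk the completed packet chain, hashing each TCP segment to a flow
 * table entry and merging consecutive in-order segments into larger
 * mblk chains before they are passed up the stack.
 */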
void
sfxge_rx_qpacket_coalesce(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	clock_t now;
	mblk_t *mp;
	sfxge_rx_flow_t *srfp;

	ASSERT(sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF);

	now = ddi_get_lbolt();

	mp = srp->sr_mp;

	srp->sr_mp = NULL;
	srp->sr_mpp = &(srp->sr_mp);

	/* Start with the last flow to be appended to */
	srfp = *(srp->sr_srfpp);

	while (mp != NULL) {
		frtn_t *freep;
		sfxge_rx_packet_t *srpp;
		struct ether_header *etherhp;
		struct ip *iphp;
		struct tcphdr *thp;
		size_t off;
		size_t size;
		uint16_t ether_tci;
		uint32_t hash;
		uint32_t tag;
		mblk_t *next;
		sfxge_packet_type_t pkt_type;
		uint16_t sport, dport;

		next = mp->b_next;
		mp->b_next = NULL;

		if (next != NULL)
			prefetch_read_many(next);

		freep = DB_FRTNP(mp);
		/*LINTED*/
		srpp = (sfxge_rx_packet_t *)(freep->free_arg);
		ASSERT3P(srpp->srp_mp, ==, mp);

		/* If the packet is not TCP then we cannot coalesce it */
		if (~(srpp->srp_flags) & EFX_PKT_TCP)
			goto reject;

		/*
		 * If the packet is not fully checksummed then we cannot
		 * coalesce it.
		 */
		if (~(srpp->srp_flags) & (EFX_CKSUM_TCPUDP | EFX_CKSUM_IPV4))
			goto reject;

		/* Parse the TCP header */
		pkt_type = sfxge_pkthdr_parse(mp, &etherhp, &iphp, &thp, &off,
		    &size, &sport, &dport);
		ASSERT(pkt_type == SFXGE_PACKET_TYPE_IPV4_TCP);
		ASSERT(etherhp != NULL);
		ASSERT(iphp != NULL);
		ASSERT(thp != NULL);
		ASSERT(off != 0);

		if ((iphp->ip_off & ~htons(IP_DF)) != 0)
			goto reject;

		if (etherhp->ether_type == htons(ETHERTYPE_VLAN)) {
			struct ether_vlan_header *ethervhp;

			ethervhp = (struct ether_vlan_header *)etherhp;
			ether_tci = ethervhp->ether_tci;
		} else {
			ether_tci = 0;
		}

		/*
		 * Make sure any minimum length padding is stripped
		 * before we try to add the packet to a flow.
		 */
		ASSERT3U(sp->s_rx_prefix_size + MBLKL(mp), ==,
		    (size_t)(srpp->srp_size));
		ASSERT3U(sp->s_rx_prefix_size + off + size, <=,
		    (size_t)(srpp->srp_size));

		if (sp->s_rx_prefix_size + off + size <
		    (size_t)(srpp->srp_size))
			mp->b_wptr = mp->b_rptr + off + size;

		/*
		 * If there is no current flow, or the segment does not match
		 * the current flow then we must attempt to look up the
		 * correct flow in the table.
		 */
		if (srfp == NULL)
			goto lookup;

		if (srfp->srf_saddr != iphp->ip_src.s_addr ||
		    srfp->srf_daddr != iphp->ip_dst.s_addr)
			goto lookup;

		if (srfp->srf_sport != thp->th_sport ||
		    srfp->srf_dport != thp->th_dport)
			goto lookup;

		if (srfp->srf_tci != ether_tci)
			goto lookup;

add:
		ASSERT(srfp != NULL);

		srpp->srp_etherhp = etherhp;
		srpp->srp_iphp = iphp;
		srpp->srp_thp = thp;
		srpp->srp_off = off;

		ASSERT3U(size, <, (1 << 16));
		srpp->srp_size = (uint16_t)size;

		/* Try to append the packet to the flow */
		if (!sfxge_rx_qflow_add(srp, srfp, srpp, now))
			goto reject;

		mp = next;
		continue;

lookup:
		/*
		 * If there is a prefix area then read the hash from that,
		 * otherwise calculate it.
		 */
		if (sp->s_rx_prefix_size != 0) {
			hash = efx_psuedo_hdr_hash_get(sp->s_enp,
			    EFX_RX_HASHALG_TOEPLITZ,
			    DB_BASE(mp));
		} else {
			SFXGE_TCP_HASH(sp,
			    &iphp->ip_src.s_addr,
			    thp->th_sport,
			    &iphp->ip_dst.s_addr,
			    thp->th_dport,
			    hash);
		}

		srfp = &(srp->sr_flow[(hash >> 6) % SFXGE_MAX_FLOW]);
		tag = hash + 1; /* Make sure it's not zero */

		/*
		 * If the flow we have found does not match the hash then
		 * it may be an unused flow, or it may be stale.
		 */
		if (tag != srfp->srf_tag) {
			if (srfp->srf_count != 0) {
				if (now - srfp->srf_lbolt <= srp->sr_rto)
					goto reject;
			}

			if (srfp->srf_mp != NULL)
				goto reject;

			/* Start a new flow */
			ASSERT(srfp->srf_next == NULL);

			srfp->srf_tag = tag;

			srfp->srf_saddr = iphp->ip_src.s_addr;
			srfp->srf_daddr = iphp->ip_dst.s_addr;
			srfp->srf_sport = thp->th_sport;
			srfp->srf_dport = thp->th_dport;
			srfp->srf_tci = ether_tci;

			srfp->srf_count = 0;
			srfp->srf_seq = ntohl(thp->th_seq);

			srfp->srf_lbolt = now;
			goto add;
		}

		/*
		 * If the flow we have found does match the hash then it could
		 * still be an alias.
		 */
		if (srfp->srf_saddr != iphp->ip_src.s_addr ||
		    srfp->srf_daddr != iphp->ip_dst.s_addr)
			goto reject;

		if (srfp->srf_sport != thp->th_sport ||
		    srfp->srf_dport != thp->th_dport)
			goto reject;

		if (srfp->srf_tci != ether_tci)
			goto reject;

		goto add;

reject:
		*(srp->sr_mpp) = mp;
		srp->sr_mpp = &(mp->b_next);

		mp = next;
	}
}

void
sfxge_rx_qcomplete(sfxge_rxq_t *srp, boolean_t eop)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	unsigned int completed;
	sfxge_rx_fpp_t *srfppp = &(srp->sr_fpp);
	unsigned int level;

	ASSERT(mutex_owned(&(sep->se_lock)));

	ASSERT(srp->sr_mp == NULL);
	ASSERT(srp->sr_mpp == &(srp->sr_mp));

	completed = srp->sr_completed;
	while (completed != srp->sr_pending) {
		unsigned int id;
		sfxge_rx_packet_t *srpp;
		mblk_t *mp;
		size_t size;
		uint16_t flags;
		int rc;

		id = completed++ & (sp->s_rxq_size - 1);

		if (srp->sr_pending - completed >= 4) {
			unsigned int prefetch;

			prefetch = (id + 4) & (sp->s_rxq_size - 1);

			srpp = srp->sr_srpp[prefetch];
			ASSERT(srpp != NULL);

			mp = srpp->srp_mp;
			prefetch_read_many(mp->b_datap);
		} else if (completed == srp->sr_pending) {
			prefetch_read_many(srp->sr_mp);
		}

		srpp = srp->sr_srpp[id];
		ASSERT(srpp != NULL);

		srp->sr_srpp[id] = NULL;

		mp = srpp->srp_mp;
		ASSERT(mp->b_cont == NULL);

		/* when called from sfxge_rx_qstop() */
		if (srp->sr_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (srpp->srp_flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Make the data visible to the kernel */
		rc = ddi_dma_sync(srpp->srp_dma_handle, 0,
		    sp->s_rx_buffer_size, DDI_DMA_SYNC_FORKERNEL);
		ASSERT3P(rc, ==, DDI_SUCCESS);

		/* Read the length from the pseudo header if required */
		if (srpp->srp_flags & EFX_PKT_PREFIX_LEN) {
			rc = efx_psuedo_hdr_pkt_length_get(sp->s_enp,
			    mp->b_rptr,
			    &srpp->srp_size);
			ASSERT3P(rc, ==, 0);
			srpp->srp_size += sp->s_rx_prefix_size;
		}

		/* Set up the packet length */
		ASSERT3P(mp->b_rptr, ==, DB_BASE(mp));
		mp->b_rptr += sp->s_rx_prefix_size;

		prefetch_read_many(mp->b_rptr);

		ASSERT3P(mp->b_wptr, ==, DB_BASE(mp));
		mp->b_wptr += (size_t)(srpp->srp_size);
		ASSERT3P(mp->b_wptr, <=, DB_LIM(mp));

		/* Calculate the maximum packet size */
		size = sp->s_mtu;
		size += (srpp->srp_flags & EFX_PKT_VLAN_TAGGED) ?
		    sizeof (struct ether_vlan_header) :
		    sizeof (struct ether_header);

		if (MBLKL(mp) > size)
			goto discard;

		/* Check for loopback packets */
		if (!(srpp->srp_flags & EFX_PKT_IPV4) &&
		    !(srpp->srp_flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = (struct ether_header *)(mp->b_rptr);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				DTRACE_PROBE(loopback);

				srp->sr_loopback++;
				goto discard;
			}
		}

		/* Set up the checksum information */
		flags = 0;

		if (srpp->srp_flags & EFX_CKSUM_IPV4) {
			ASSERT(srpp->srp_flags & EFX_PKT_IPV4);
			flags |= HCK_IPV4_HDRCKSUM;
		}

		if (srpp->srp_flags & EFX_CKSUM_TCPUDP) {
			ASSERT(srpp->srp_flags & EFX_PKT_TCP ||
			    srpp->srp_flags & EFX_PKT_UDP);
			flags |= HCK_FULLCKSUM | HCK_FULLCKSUM_OK;
		}

		DB_CKSUMSTART(mp) = 0;
		DB_CKSUMSTUFF(mp) = 0;
		DB_CKSUMEND(mp) = 0;
		DB_CKSUMFLAGS(mp) = flags;
		DB_CKSUM16(mp) = 0;

		/* Add the packet to the tail of the chain */
		srfppp->srfpp_loaned++;

		ASSERT(mp->b_next == NULL);
		*(srp->sr_mpp) = mp;
		srp->sr_mpp = &(mp->b_next);

		continue;

discard:
		/* Return the packet to the pool */
		srfppp->srfpp_loaned++;
		freeb(mp); /* Equivalent to freemsg() as b_cont==0 */
	}
	srp->sr_completed = completed;

	/* Attempt to coalesce any TCP packets */
	if (sp->s_rx_coalesce_mode != SFXGE_RX_COALESCE_OFF)
		sfxge_rx_qpacket_coalesce(srp);

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (srp->sr_srfp != NULL && eop) {
		sfxge_rx_flow_t *srfp;

		srfp = srp->sr_srfp;

		srp->sr_srfp = NULL;
		srp->sr_srfpp = &(srp->sr_srfp);

		do {
			sfxge_rx_flow_t *next;

			next = srfp->srf_next;
			srfp->srf_next = NULL;

			sfxge_rx_qflow_complete(srp, srfp);

			srfp = next;
		} while (srfp != NULL);
	}

	level = srp->sr_pushed - srp->sr_completed;

	/* If there are any packets then pass them up the stack */
	if (srp->sr_mp != NULL) {
		mblk_t *mp;

		mp = srp->sr_mp;

		srp->sr_mp = NULL;
		srp->sr_mpp = &(srp->sr_mp);

		if (level == 0) {
			/* Try to refill ASAP */
			sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
			level = srp->sr_pushed - srp->sr_completed;
		}

		/*
		 * If the RXQ is still empty, discard and recycle the
		 * current entry to ensure that the ring always
		 * contains at least one descriptor. This ensures that
		 * the next hardware RX will trigger an event
		 * (possibly delayed by interrupt moderation) and
		 * trigger another refill/fill attempt.
		 *
		 * Note this drops a complete LRO fragment from the
		 * start of the batch.
		 *
		 * Note also that copymsgchain() does not help with
		 * resource starvation here, unless we are short of DMA
		 * mappings.
		 */
		if (level == 0) {
			mblk_t *nmp;

			srp->sr_kstat.srk_rxq_empty_discard++;
			DTRACE_PROBE1(rxq_empty_discard, int, index);
			nmp = mp->b_next;
			if (nmp)
				sfxge_gld_rx_post(sp, index, nmp);
			/* level == 0, so the top-up below will swizzle/repost */
			freemsg(mp);
		} else {
			sfxge_gld_rx_post(sp, index, mp);
		}
	}

	/* Top up the queue if necessary */
	if (level < srp->sr_hiwat) {
		sfxge_rx_qrefill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));

		level = srp->sr_added - srp->sr_completed;
		if (level < srp->sr_lowat)
			sfxge_rx_qfill(srp, EFX_RXQ_LIMIT(sp->s_rxq_size));
	}
}

void
sfxge_rx_qflush_done(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	boolean_t flush_pending;

	ASSERT(mutex_owned(&(sep->se_lock)));

	/*
	 * Flush successful: wakeup sfxge_rx_qstop() if flush is pending.
	 *
	 * A delayed flush event received after RxQ stop has timed out
	 * will be ignored, as then the flush state will not be PENDING
	 * (see SFCbug22989).
	 */
	flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
	srp->sr_flush = SFXGE_FLUSH_DONE;
	if (flush_pending)
		cv_broadcast(&(srp->sr_flush_kv));
}

void
sfxge_rx_qflush_failed(sfxge_rxq_t *srp)
{
	sfxge_t *sp = srp->sr_sp;
	unsigned int index = srp->sr_index;
	sfxge_evq_t *sep = sp->s_sep[index];
	boolean_t flush_pending;

	ASSERT(mutex_owned(&(sep->se_lock)));

	/*
	 * Flush failed: wakeup sfxge_rx_qstop() if flush is pending.
	 *
	 * A delayed flush event received after RxQ stop has timed out
	 * will be ignored, as then the flush state will not be PENDING
	 * (see SFCbug22989).
	 */
	flush_pending = (srp->sr_flush == SFXGE_FLUSH_PENDING);
	srp->sr_flush = SFXGE_FLUSH_FAILED;
	if (flush_pending)
		cv_broadcast(&(srp->sr_flush_kv));
}

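/*
 * Stop the receive queue: cancel polling, flush the hardware queue
 * (waiting up to SFXGE_RX_QFLUSH_USEC for completion), then destroy
 * the queue and reclaim all outstanding packets.
 */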
static void
sfxge_rx_qstop(sfxge_t *sp, unsigned int index)
{
	dev_info_t *dip = sp->s_dip;
	sfxge_evq_t *sep = sp->s_sep[index];
	sfxge_rxq_t *srp;
	clock_t timeout;
	unsigned int flush_tries = SFXGE_RX_QFLUSH_TRIES;
	int rc;

	ASSERT(mutex_owned(&(sp->s_state_lock)));

	mutex_enter(&(sep->se_lock));

	srp = sp->s_srp[index];
	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_STARTED);

	sfxge_rx_qpoll_stop(srp);

	/* Further packets are discarded by sfxge_rx_qcomplete() */
	srp->sr_state = SFXGE_RXQ_INITIALIZED;

	if (sp->s_hw_err != SFXGE_HW_OK) {
		/*
		 * Flag indicates possible hardware failure.
		 * Attempt flush but do not wait for it to complete.
		 */
		srp->sr_flush = SFXGE_FLUSH_DONE;
		(void) efx_rx_qflush(srp->sr_erp);
	}

	/* Wait up to 2 seconds for queue flushing to complete */
	timeout = ddi_get_lbolt() + drv_usectohz(SFXGE_RX_QFLUSH_USEC);

	while (srp->sr_flush != SFXGE_FLUSH_DONE && flush_tries-- > 0) {
		if ((rc = efx_rx_qflush(srp->sr_erp)) != 0) {
			if (rc == EALREADY)
				srp->sr_flush = SFXGE_FLUSH_DONE;
			else
				srp->sr_flush = SFXGE_FLUSH_FAILED;
			break;
		}
		srp->sr_flush = SFXGE_FLUSH_PENDING;
		if (cv_timedwait(&(srp->sr_flush_kv), &(sep->se_lock),
		    timeout) < 0) {
			/* Timeout waiting for successful or failed flush */
			dev_err(dip, CE_NOTE,
			    SFXGE_CMN_ERR "rxq[%d] flush timeout", index);
			break;
		}
	}

	if (srp->sr_flush == SFXGE_FLUSH_FAILED)
		dev_err(dip, CE_NOTE,
		    SFXGE_CMN_ERR "rxq[%d] flush failed", index);

	DTRACE_PROBE1(flush, sfxge_flush_state_t, srp->sr_flush);
	srp->sr_flush = SFXGE_FLUSH_DONE;

	/* Destroy the receive queue */
	efx_rx_qdestroy(srp->sr_erp);
	srp->sr_erp = NULL;

	/* Clear entries from the buffer table */
	sfxge_sram_buf_tbl_clear(sp, srp->sr_id,
	    EFX_RXQ_NBUFS(sp->s_rxq_size));

	/*
	 * Free any unused RX packets which had descriptors on the RXQ.
	 * Packets will be discarded as state != STARTED.
	 */
	srp->sr_pending = srp->sr_added;
	sfxge_rx_qcomplete(srp, B_TRUE);

	ASSERT3U(srp->sr_completed, ==, srp->sr_pending);

	srp->sr_added = 0;
	srp->sr_pushed = 0;
	srp->sr_pending = 0;
	srp->sr_completed = 0;
	srp->sr_loopback = 0;

	srp->sr_lowat = 0;
	srp->sr_hiwat = 0;

	mutex_exit(&(sep->se_lock));
}
2051
2052static void
2053sfxge_rx_kstat_fini(sfxge_rxq_t *srp)
2054{
2055	kstat_delete(srp->sr_ksp);
2056	srp->sr_ksp = NULL;
2057}
2058
2059static void
2060sfxge_rx_qfini(sfxge_t *sp, unsigned int index)
2061{
2062	sfxge_rxq_t *srp = sp->s_srp[index];
2063
2064	ASSERT3U(srp->sr_state, ==, SFXGE_RXQ_INITIALIZED);
2065
2066	sp->s_srp[index] = NULL;
2067	srp->sr_state = SFXGE_RXQ_UNINITIALIZED;
2068
2069	sfxge_rx_kstat_fini(srp);
2070
2071	/* Empty the pool */
2072	sfxge_rx_qfpp_empty(srp);
2073
2074	srp->sr_index = 0;
2075
2076	kmem_cache_free(sp->s_rqc, srp);
2077}

static int
sfxge_rx_scale_kstat_update(kstat_t *ksp, int rw)
{
	sfxge_t *sp = ksp->ks_private;
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	kstat_named_t *knp;
	unsigned int index;
	unsigned int entry;
	unsigned int *freq;
	int rc;

	ASSERT(mutex_owned(&(srsp->srs_lock)));

	if (rw != KSTAT_READ) {
		rc = EACCES;
		goto fail1;
	}

	if ((freq = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
	    KM_NOSLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail2;
	}

	for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
		index = srsp->srs_tbl[entry];

		freq[index]++;
	}
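	/*
	 * freq[] is now a histogram of the RSS indirection table: freq[i]
	 * counts how many table slots steer traffic to event queue i.  For
	 * example, if srs_tbl[] alternates between queues 0 and 1, then
	 * freq[0] and freq[1] each end up at SFXGE_RX_SCALE_MAX / 2.
	 */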

	knp = ksp->ks_data;
	for (index = 0; index < sip->si_nalloc; index++) {
		knp->value.ui64 = freq[index];
		knp++;
	}

	knp->value.ui64 = srsp->srs_count;

	kmem_free(freq, sizeof (unsigned int) * sip->si_nalloc);

	return (0);

fail2:
	DTRACE_PROBE(fail2);
fail1:
	DTRACE_PROBE1(fail1, int, rc);
	return (rc);
}

static int
sfxge_rx_scale_kstat_init(sfxge_t *sp)
{
	dev_info_t *dip = sp->s_dip;
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	char name[MAXNAMELEN];
	kstat_t *ksp;
	kstat_named_t *knp;
	unsigned int index;
	int rc;

	/* Create the set */
	(void) snprintf(name, MAXNAMELEN - 1, "%s_rss", ddi_driver_name(dip));

	if ((ksp = kstat_create((char *)ddi_driver_name(dip),
	    ddi_get_instance(dip), name, "rss", KSTAT_TYPE_NAMED,
	    sip->si_nalloc + 1, 0)) == NULL) {
		rc = ENOMEM;
		goto fail1;
	}

	srsp->srs_ksp = ksp;

	ksp->ks_update = sfxge_rx_scale_kstat_update;
	ksp->ks_private = sp;
	ksp->ks_lock = &(srsp->srs_lock);

	/* Initialise the named stats */
	knp = ksp->ks_data;
	for (index = 0; index < sip->si_nalloc; index++) {
		char name[MAXNAMELEN];

		(void) snprintf(name, MAXNAMELEN - 1, "evq%04d_count", index);
		kstat_named_init(knp, name, KSTAT_DATA_UINT64);
		knp++;
	}

	kstat_named_init(knp, "scale", KSTAT_DATA_UINT64);
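	/*
	 * The set therefore holds one "evqNNNN_count" entry per allocated
	 * event queue plus a trailing "scale" entry reporting srs_count.
	 * It can be inspected from userland with, for example:
	 *
	 *	# kstat -m sfxge -n sfxge_rss
	 */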

	kstat_install(ksp);
	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static void
sfxge_rx_scale_kstat_fini(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

	/* Destroy the set */
	kstat_delete(srsp->srs_ksp);
	srsp->srs_ksp = NULL;
}

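/*
 * The RSS channel count is tunable via the "rx_scale_count" driver.conf
 * property.  A hypothetical sfxge.conf entry limiting the driver to four
 * channels might read:
 *
 *	rx_scale_count = 4;
 *
 * Zero or a negative value selects one channel per logical CPU.
 */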
unsigned int
sfxge_rx_scale_prop_get(sfxge_t *sp)
{
	int rx_scale;

	rx_scale = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_scale_count", SFXGE_RX_SCALE_MAX);
	/* Zero and negative values select the number of logical CPUs */
	if (rx_scale <= 0)
		rx_scale = ncpus;

	return (rx_scale);
}

static int
sfxge_rx_scale_init(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	int rc;

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_UNINITIALIZED);

	/* Create the table of per-CPU usage counts */
	srsp->srs_cpu = kmem_zalloc(sizeof (unsigned int) * NCPU, KM_SLEEP);

	mutex_init(&(srsp->srs_lock), NULL, MUTEX_DRIVER, NULL);

	/* We need at least one event queue */
	srsp->srs_count = sfxge_rx_scale_prop_get(sp);
	if (srsp->srs_count > sip->si_nalloc)
		srsp->srs_count = sip->si_nalloc;
	if (srsp->srs_count < 1)
		srsp->srs_count = 1;

	/* Set up the kstats */
	if ((rc = sfxge_rx_scale_kstat_init(sp)) != 0)
		goto fail1;

	srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);
	mutex_destroy(&(srsp->srs_lock));

	/* The caller will not call sfxge_rx_scale_fini(); free the table */
	kmem_free(srsp->srs_cpu, sizeof (unsigned int) * NCPU);
	srsp->srs_cpu = NULL;

	return (rc);
}

void
sfxge_rx_scale_update(void *arg)
{
	sfxge_t *sp = arg;
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip;
	processorid_t id;
	unsigned int count;
	unsigned int *tbl;
	unsigned int *rating;
	unsigned int entry;
	int rc;

	mutex_enter(&(srsp->srs_lock));

	if (srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
		rc = EFAULT;
		goto fail1;
	}

	if ((tbl = kmem_zalloc(sizeof (unsigned int) * SFXGE_RX_SCALE_MAX,
	    KM_NOSLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail2;
	}

	sip = &(sp->s_intr);
	if ((rating = kmem_zalloc(sizeof (unsigned int) * sip->si_nalloc,
	    KM_NOSLEEP)) == NULL) {
		rc = ENOMEM;
		goto fail3;
	}

	mutex_enter(&cpu_lock);

	/*
	 * Subtract any current CPU, core, cache and chip usage from the
	 * global contention tables.
	 */
	for (id = 0; id < NCPU; id++) {
		ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
		sfxge_cpu[id] -= srsp->srs_cpu[id];
		srsp->srs_cpu[id] = 0;
	}

	ASSERT(srsp->srs_count != 0);

	/* Choose as many event queues as we need */
	for (count = 0; count < srsp->srs_count; count++) {
		unsigned int index;
		sfxge_evq_t *sep;
		unsigned int choice;
		unsigned int choice_rating;

		bzero(rating, sizeof (unsigned int) * sip->si_nalloc);

		/*
		 * Rate each event queue on its global level of CPU
		 * contention.
		 */
		for (index = 0; index < sip->si_nalloc; index++) {
			sep = sp->s_sep[index];

			id = sep->se_cpu_id;
			rating[index] += sfxge_cpu[id];
		}

		/* Choose the queue with the lowest CPU contention */
		choice = 0;
		choice_rating = rating[0];

		for (index = 1; index < sip->si_nalloc; index++) {
			if (rating[index] < choice_rating) {
				choice = index;
				choice_rating = rating[index];
			}
		}
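		/*
		 * Note that only a strictly lower rating displaces the
		 * current choice, so ties resolve to the lowest-numbered
		 * event queue.
		 */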

		/* Add our choice to the condensed RSS table */
		tbl[count] = choice;

		/* Add information to the global contention tables */
		sep = sp->s_sep[choice];

		id = sep->se_cpu_id;
		srsp->srs_cpu[id]++;
		sfxge_cpu[id]++;
	}

	mutex_exit(&cpu_lock);

	/* Build the expanded RSS table */
	count = 0;
	for (entry = 0; entry < SFXGE_RX_SCALE_MAX; entry++) {
		unsigned int index;

		index = tbl[count];
		count = (count + 1) % srsp->srs_count;

		srsp->srs_tbl[entry] = index;
	}
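	/*
	 * The condensed table is replicated round-robin across the full
	 * indirection table.  For example, if srs_count were 3 and tbl[]
	 * held {0, 2, 5}, srs_tbl[] would repeat 0, 2, 5, 0, 2, 5, ...
	 * for all SFXGE_RX_SCALE_MAX entries.
	 */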

	/* Program the expanded RSS table into the hardware */
	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
	    SFXGE_RX_SCALE_MAX);

	mutex_exit(&(srsp->srs_lock));
	kmem_free(rating, sizeof (unsigned int) * sip->si_nalloc);
	kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
	return;

fail3:
	DTRACE_PROBE(fail3);
	kmem_free(tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);
fail2:
	DTRACE_PROBE(fail2);
fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(srsp->srs_lock));
}

static int
sfxge_rx_scale_start(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	int rc;

	mutex_enter(&(srsp->srs_lock));

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);

	/* Clear down the RSS table */
	bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);

	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
	    SFXGE_RX_SCALE_MAX);

	if ((rc = sfxge_toeplitz_hash_init(sp)) != 0)
		goto fail1;

	srsp->srs_state = SFXGE_RX_SCALE_STARTED;

	mutex_exit(&(srsp->srs_lock));

	/* sfxge_t->s_state_lock held */
	(void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
	    DDI_SLEEP);

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(srsp->srs_lock));

	return (rc);
}

int
sfxge_rx_scale_count_get(sfxge_t *sp, unsigned int *countp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	int rc;

	mutex_enter(&(srsp->srs_lock));

	if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
	    srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
		rc = ENOTSUP;
		goto fail1;
	}

	*countp = srsp->srs_count;

	mutex_exit(&(srsp->srs_lock));

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(srsp->srs_lock));

	return (rc);
}

int
sfxge_rx_scale_count_set(sfxge_t *sp, unsigned int count)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	sfxge_intr_t *sip = &(sp->s_intr);
	int dispatch = 1;
	int rc;

	if (count < 1 || count > sip->si_nalloc) {
		rc = EINVAL;
		goto fail1;
	}

	mutex_enter(&(srsp->srs_lock));

	if (srsp->srs_state != SFXGE_RX_SCALE_INITIALIZED &&
	    srsp->srs_state != SFXGE_RX_SCALE_STARTED) {
		rc = ENOTSUP;
		goto fail2;
	}

	srsp->srs_count = count;

	if (srsp->srs_state != SFXGE_RX_SCALE_STARTED)
		dispatch = 0;

	mutex_exit(&(srsp->srs_lock));

	if (dispatch) {
		/* No locks held */
		(void) ddi_taskq_dispatch(sp->s_tqp, sfxge_rx_scale_update, sp,
		    DDI_SLEEP);
	}

	return (0);

fail2:
	DTRACE_PROBE(fail2);

	mutex_exit(&(srsp->srs_lock));

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

static void
sfxge_rx_scale_stop(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);
	processorid_t id;

	mutex_enter(&(srsp->srs_lock));

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_STARTED);

	srsp->srs_state = SFXGE_RX_SCALE_INITIALIZED;

	mutex_enter(&cpu_lock);

	/*
	 * Subtract any current CPU, core, cache and chip usage from the
	 * global contention tables.
	 */
	for (id = 0; id < NCPU; id++) {
		ASSERT3U(sfxge_cpu[id], >=, srsp->srs_cpu[id]);
		sfxge_cpu[id] -= srsp->srs_cpu[id];
		srsp->srs_cpu[id] = 0;
	}

	mutex_exit(&cpu_lock);

	/* Clear down the RSS table */
	bzero(srsp->srs_tbl, sizeof (unsigned int) * SFXGE_RX_SCALE_MAX);

	(void) efx_rx_scale_tbl_set(sp->s_enp, srsp->srs_tbl,
	    SFXGE_RX_SCALE_MAX);

	mutex_exit(&(srsp->srs_lock));
}

static void
sfxge_rx_scale_fini(sfxge_t *sp)
{
	sfxge_rx_scale_t *srsp = &(sp->s_rx_scale);

	ASSERT3U(srsp->srs_state, ==, SFXGE_RX_SCALE_INITIALIZED);

	srsp->srs_state = SFXGE_RX_SCALE_UNINITIALIZED;

	/* Tear down the kstats */
	sfxge_rx_scale_kstat_fini(sp);

	srsp->srs_count = 0;

	mutex_destroy(&(srsp->srs_lock));

	/* Destroy the table of per-CPU usage counts */
	kmem_free(srsp->srs_cpu, sizeof (unsigned int) * NCPU);
	srsp->srs_cpu = NULL;

	sfxge_toeplitz_hash_fini(sp);
}

int
sfxge_rx_init(sfxge_t *sp)
{
	sfxge_intr_t *sip = &(sp->s_intr);
	char name[MAXNAMELEN];
	int index;
	int rc;

	if (sip->si_state == SFXGE_INTR_UNINITIALIZED) {
		rc = EINVAL;
		goto fail1;
	}

	if ((rc = sfxge_rx_scale_init(sp)) != 0)
		goto fail2;

	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_rx_packet_cache",
	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

	sp->s_rpc = kmem_cache_create(name, sizeof (sfxge_rx_packet_t),
	    SFXGE_CPU_CACHE_SIZE, sfxge_rx_packet_ctor, sfxge_rx_packet_dtor,
	    NULL, sp, NULL, 0);
	ASSERT(sp->s_rpc != NULL);

	(void) snprintf(name, MAXNAMELEN - 1, "%s%d_rxq_cache",
	    ddi_driver_name(sp->s_dip), ddi_get_instance(sp->s_dip));

	sp->s_rqc = kmem_cache_create(name, sizeof (sfxge_rxq_t),
	    SFXGE_CPU_CACHE_SIZE, sfxge_rx_qctor, sfxge_rx_qdtor, NULL, sp,
	    NULL, 0);
	ASSERT(sp->s_rqc != NULL);

	sp->s_rx_pkt_mem_max = ddi_prop_get_int64(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_pkt_mem_max", 0); /* 0 => disabled */

	/* Initialize the receive queue(s) */
	for (index = 0; index < sip->si_nalloc; index++) {
		if ((rc = sfxge_rx_qinit(sp, index)) != 0)
			goto fail3;
	}

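	/*
	 * The RX coalescing behaviour below is likewise tunable via a
	 * driver.conf property; a hypothetical sfxge.conf entry might read:
	 *
	 *	rx_coalesce_mode = 1;
	 *
	 * where the value is one of the sfxge_rx_coalesce_mode_t settings
	 * (SFXGE_RX_COALESCE_OFF by default).
	 */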
	sp->s_rx_coalesce_mode = ddi_prop_get_int(DDI_DEV_T_ANY, sp->s_dip,
	    DDI_PROP_DONTPASS, "rx_coalesce_mode", SFXGE_RX_COALESCE_OFF);

	return (0);

fail3:
	DTRACE_PROBE(fail3);

	/* Tear down the receive queue(s) */
	while (--index >= 0)
		sfxge_rx_qfini(sp, index);

	kmem_cache_destroy(sp->s_rqc);
	sp->s_rqc = NULL;

	kmem_cache_destroy(sp->s_rpc);
	sp->s_rpc = NULL;

	sfxge_rx_scale_fini(sp);

fail2:
	DTRACE_PROBE(fail2);
fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

int
sfxge_rx_start(sfxge_t *sp)
{
	sfxge_mac_t *smp = &(sp->s_mac);
	sfxge_intr_t *sip;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align;
	int index;
	int rc;

	mutex_enter(&(smp->sm_lock));

	/* Calculate the receive packet buffer size and alignment */
	sp->s_rx_buffer_size = EFX_MAC_PDU(sp->s_mtu);

	encp = efx_nic_cfg_get(sp->s_enp);

	/* Packet buffer allocations are cache line aligned */
	EFSYS_ASSERT3U(encp->enc_rx_buf_align_start, <=, SFXGE_CPU_CACHE_SIZE);

	if (sp->s_family == EFX_FAMILY_HUNTINGTON) {
		sp->s_rx_prefix_size = encp->enc_rx_prefix_size;

		hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);

		/* Ensure IP headers are 32-bit aligned */
		sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
		sp->s_rx_buffer_size += sp->s_rx_buffer_align;

	} else if (encp->enc_features & EFX_FEATURE_LFSR_HASH_INSERT) {
		sp->s_rx_prefix_size = encp->enc_rx_prefix_size;

		/*
		 * Place the start of the buffer a prefix length minus 2
		 * before the start of a cache line. This ensures that the
		 * last two bytes of the prefix (which is where the LFSR hash
		 * is located) are in the same cache line as the headers, and
		 * the IP header is 32-bit aligned.
		 */
		sp->s_rx_buffer_align =
		    SFXGE_CPU_CACHE_SIZE - (encp->enc_rx_prefix_size - 2);
		sp->s_rx_buffer_size += sp->s_rx_buffer_align;
	} else {
		sp->s_rx_prefix_size = 0;

		/*
		 * Place the start of the buffer 2 bytes after a cache line
		 * boundary so that the headers fit into the cache line and
		 * the IP header is 32-bit aligned.
		 */
		hdrlen = sp->s_rx_prefix_size + sizeof (struct ether_header);

		sp->s_rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
		sp->s_rx_buffer_size += sp->s_rx_buffer_align;
	}
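	/*
	 * Worked example for the no-prefix case: hdrlen is then just the
	 * 14-byte Ethernet header, so P2ROUNDUP(14, 4) - 14 == 2 and the
	 * buffer starts 2 bytes past a cache line boundary, leaving the IP
	 * header 4-byte aligned.
	 */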

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sp->s_rx_buffer_size = P2ROUNDUP(sp->s_rx_buffer_size, align);

	/* Initialize the receive module */
	if ((rc = efx_rx_init(sp->s_enp)) != 0)
		goto fail1;

	mutex_exit(&(smp->sm_lock));

	if ((rc = sfxge_rx_scale_start(sp)) != 0)
		goto fail2;

	/* Start the receive queue(s) */
	sip = &(sp->s_intr);
	for (index = 0; index < sip->si_nalloc; index++) {
		if ((rc = sfxge_rx_qstart(sp, index)) != 0)
			goto fail3;
	}

	ASSERT3U(sp->s_srp[0]->sr_state, ==, SFXGE_RXQ_STARTED);
	/* RX scaling was started above, so srs_count is valid here */
	ASSERT3U(sp->s_rx_scale.srs_state, ==, SFXGE_RX_SCALE_STARTED);
	rc = efx_mac_filter_default_rxq_set(sp->s_enp, sp->s_srp[0]->sr_erp,
	    sp->s_rx_scale.srs_count > 1);
	if (rc != 0)
		goto fail4;

	return (0);

fail4:
	DTRACE_PROBE(fail4);

fail3:
	DTRACE_PROBE(fail3);

	/* Stop the receive queue(s) */
	while (--index >= 0)
		sfxge_rx_qstop(sp, index);

	sfxge_rx_scale_stop(sp);

fail2:
	DTRACE_PROBE(fail2);

	mutex_enter(&(smp->sm_lock));

	/* Tear down the receive module */
	efx_rx_fini(sp->s_enp);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	mutex_exit(&(smp->sm_lock));

	return (rc);
}

void
sfxge_rx_coalesce_mode_get(sfxge_t *sp, sfxge_rx_coalesce_mode_t *modep)
{
	*modep = sp->s_rx_coalesce_mode;
}

int
sfxge_rx_coalesce_mode_set(sfxge_t *sp, sfxge_rx_coalesce_mode_t mode)
{
	int rc;

	switch (mode) {
	case SFXGE_RX_COALESCE_OFF:
	case SFXGE_RX_COALESCE_DISALLOW_PUSH:
	case SFXGE_RX_COALESCE_ALLOW_PUSH:
		break;

	default:
		rc = EINVAL;
		goto fail1;
	}

	sp->s_rx_coalesce_mode = mode;

	return (0);

fail1:
	DTRACE_PROBE1(fail1, int, rc);

	return (rc);
}

void
sfxge_rx_stop(sfxge_t *sp)
{
	sfxge_mac_t *smp = &(sp->s_mac);
	sfxge_intr_t *sip = &(sp->s_intr);
	efx_nic_t *enp = sp->s_enp;
	int index;

	ASSERT(mutex_owned(&(sp->s_state_lock)));

	efx_mac_filter_default_rxq_clear(enp);

	/* Stop the receive queue(s) */
	index = sip->si_nalloc;
	while (--index >= 0) {
		/* TBD: Flush RXQs in parallel; HW has limit + may need retry */
		sfxge_rx_qstop(sp, index);
	}

	sfxge_rx_scale_stop(sp);

	mutex_enter(&(smp->sm_lock));

	/* Tear down the receive module */
	efx_rx_fini(enp);

	sp->s_rx_buffer_align = 0;
	sp->s_rx_prefix_size = 0;
	sp->s_rx_buffer_size = 0;

	mutex_exit(&(smp->sm_lock));
}

unsigned int
sfxge_rx_loaned(sfxge_t *sp)
{
	sfxge_intr_t *sip = &(sp->s_intr);
	int index;
	unsigned int loaned;

	ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);

	loaned = 0;
	for (index = 0; index < sip->si_nalloc; index++) {
		sfxge_rxq_t *srp = sp->s_srp[index];
		sfxge_evq_t *sep = sp->s_sep[srp->sr_index];

		mutex_enter(&(sep->se_lock));

		loaned += sfxge_rx_qfpp_swizzle(srp);

		mutex_exit(&(sep->se_lock));
	}

	return (loaned);
}

void
sfxge_rx_fini(sfxge_t *sp)
{
	sfxge_intr_t *sip = &(sp->s_intr);
	int index;

	ASSERT3U(sip->si_state, ==, SFXGE_INTR_INITIALIZED);

	sp->s_rx_coalesce_mode = SFXGE_RX_COALESCE_OFF;

	/* Tear down the receive queue(s) */
	index = sip->si_nalloc;
	while (--index >= 0)
		sfxge_rx_qfini(sp, index);

	ASSERT3U(sp->s_rx_pkt_mem_alloc, ==, 0);

	kmem_cache_destroy(sp->s_rqc);
	sp->s_rqc = NULL;

	kmem_cache_destroy(sp->s_rpc);
	sp->s_rpc = NULL;

	sfxge_rx_scale_fini(sp);
}