1da14cebeSEric Cheng /*
2da14cebeSEric Cheng  * CDDL HEADER START
3da14cebeSEric Cheng  *
4da14cebeSEric Cheng  * The contents of this file are subject to the terms of the
5da14cebeSEric Cheng  * Common Development and Distribution License (the "License").
6da14cebeSEric Cheng  * You may not use this file except in compliance with the License.
7da14cebeSEric Cheng  *
8da14cebeSEric Cheng  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9da14cebeSEric Cheng  * or http://www.opensolaris.org/os/licensing.
10da14cebeSEric Cheng  * See the License for the specific language governing permissions
11da14cebeSEric Cheng  * and limitations under the License.
12da14cebeSEric Cheng  *
13da14cebeSEric Cheng  * When distributing Covered Code, include this CDDL HEADER in each
14da14cebeSEric Cheng  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15da14cebeSEric Cheng  * If applicable, add the following below this CDDL HEADER, with the
16da14cebeSEric Cheng  * fields enclosed by brackets "[]" replaced with your own identifying
17da14cebeSEric Cheng  * information: Portions Copyright [yyyy] [name of copyright owner]
18da14cebeSEric Cheng  *
19da14cebeSEric Cheng  * CDDL HEADER END
20da14cebeSEric Cheng  */
21da14cebeSEric Cheng 
22da14cebeSEric Cheng /*
230dc2366fSVenugopal Iyer  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24da14cebeSEric Cheng  * Use is subject to license terms.
255647635cSRyan Zezeski  * Copyright 2017 Joyent, Inc.
26da14cebeSEric Cheng  */
27da14cebeSEric Cheng 
28da14cebeSEric Cheng #ifndef	_SYS_MAC_SOFT_RING_H
29da14cebeSEric Cheng #define	_SYS_MAC_SOFT_RING_H
30da14cebeSEric Cheng 
31da14cebeSEric Cheng #ifdef	__cplusplus
32da14cebeSEric Cheng extern "C" {
33da14cebeSEric Cheng #endif
34da14cebeSEric Cheng 
35da14cebeSEric Cheng #include <sys/types.h>
36da14cebeSEric Cheng #include <sys/cpuvar.h>
370dc2366fSVenugopal Iyer #include <sys/cpupart.h>
38da14cebeSEric Cheng #include <sys/processor.h>
39da14cebeSEric Cheng #include <sys/stream.h>
40da14cebeSEric Cheng #include <sys/squeue.h>
41da14cebeSEric Cheng #include <sys/dlpi.h>
42da14cebeSEric Cheng #include <sys/mac_impl.h>
430dc2366fSVenugopal Iyer #include <sys/mac_stat.h>
44da14cebeSEric Cheng 
45da14cebeSEric Cheng #define	S_RING_NAMELEN 64
46da14cebeSEric Cheng 
472b24ab6bSSebastien Roy #define	MAX_SR_FANOUT	24
48da14cebeSEric Cheng 
49da14cebeSEric Cheng extern boolean_t mac_soft_ring_enable;
50da14cebeSEric Cheng extern boolean_t mac_latency_optimize;
51da14cebeSEric Cheng 
52da14cebeSEric Cheng typedef struct mac_soft_ring_s mac_soft_ring_t;
53da14cebeSEric Cheng typedef struct mac_soft_ring_set_s mac_soft_ring_set_t;
54da14cebeSEric Cheng 
55da14cebeSEric Cheng typedef void (*mac_soft_ring_drain_func_t)(mac_soft_ring_t *);
56da14cebeSEric Cheng typedef mac_tx_cookie_t (*mac_tx_func_t)(mac_soft_ring_set_t *, mblk_t *,
57da14cebeSEric Cheng     uintptr_t, uint16_t, mblk_t **);
58da14cebeSEric Cheng 
59da14cebeSEric Cheng 
60da14cebeSEric Cheng /* Tx notify callback */
61da14cebeSEric Cheng typedef struct mac_tx_notify_cb_s {
62da14cebeSEric Cheng 	mac_cb_t		mtnf_link;	/* Linked list of callbacks */
63da14cebeSEric Cheng 	mac_tx_notify_t		mtnf_fn;	/* The callback function */
64da14cebeSEric Cheng 	void			*mtnf_arg;	/* Callback function argument */
65da14cebeSEric Cheng } mac_tx_notify_cb_t;
66da14cebeSEric Cheng 
67da14cebeSEric Cheng struct mac_soft_ring_s {
68da14cebeSEric Cheng 	/* Keep the most used members 64bytes cache aligned */
69da14cebeSEric Cheng 	kmutex_t	s_ring_lock;	/* lock before using any member */
70da14cebeSEric Cheng 	uint16_t	s_ring_type;	/* processing model of the sq */
71da14cebeSEric Cheng 	uint16_t	s_ring_state;	/* state flags and message count */
72da14cebeSEric Cheng 	int		s_ring_count;	/* # of mblocks in mac_soft_ring */
73da14cebeSEric Cheng 	size_t		s_ring_size;	/* Size of data queued */
74da14cebeSEric Cheng 	mblk_t		*s_ring_first;	/* first mblk chain or NULL */
75da14cebeSEric Cheng 	mblk_t		*s_ring_last;	/* last mblk chain or NULL */
76da14cebeSEric Cheng 
77da14cebeSEric Cheng 	mac_direct_rx_t	s_ring_rx_func;
78da14cebeSEric Cheng 	void		*s_ring_rx_arg1;
79da14cebeSEric Cheng 	mac_resource_handle_t  s_ring_rx_arg2;
80da14cebeSEric Cheng 
81da14cebeSEric Cheng 	/*
82da14cebeSEric Cheng 	 * Threshold after which packets get dropped.
83da14cebeSEric Cheng 	 * Is always greater than s_ring_tx_hiwat
84da14cebeSEric Cheng 	 */
85da14cebeSEric Cheng 	int		s_ring_tx_max_q_cnt;
86da14cebeSEric Cheng 	/* # of mblocks after which to apply flow control */
87da14cebeSEric Cheng 	int		s_ring_tx_hiwat;
88da14cebeSEric Cheng 	/* # of mblocks after which to relieve flow control */
89da14cebeSEric Cheng 	int		s_ring_tx_lowat;
90da14cebeSEric Cheng 	boolean_t	s_ring_tx_woken_up;
91da14cebeSEric Cheng 	uint32_t	s_ring_hiwat_cnt;	/* times blocked for Tx descs */
92da14cebeSEric Cheng 
93da14cebeSEric Cheng 	void		*s_ring_tx_arg1;
94da14cebeSEric Cheng 	void		*s_ring_tx_arg2;
95da14cebeSEric Cheng 
96da14cebeSEric Cheng 	/* Tx notify callback */
97da14cebeSEric Cheng 	mac_cb_info_t	s_ring_notify_cb_info;		/* cb list info */
98da14cebeSEric Cheng 	mac_cb_t	*s_ring_notify_cb_list;		/* The cb list */
99da14cebeSEric Cheng 
100da14cebeSEric Cheng 	clock_t		s_ring_awaken;	/* time async thread was awakened */
101da14cebeSEric Cheng 
102da14cebeSEric Cheng 	kthread_t	*s_ring_run;	/* Current thread processing sq */
103da14cebeSEric Cheng 	processorid_t	s_ring_cpuid;	/* processor to bind to */
104da14cebeSEric Cheng 	processorid_t	s_ring_cpuid_save;	/* saved cpuid during offline */
105da14cebeSEric Cheng 	kcondvar_t	s_ring_async;	/* async thread blocks on */
106da14cebeSEric Cheng 	clock_t		s_ring_wait;	/* lbolts to wait after a fill() */
107da14cebeSEric Cheng 	timeout_id_t	s_ring_tid;	/* timer id of pending timeout() */
108da14cebeSEric Cheng 	kthread_t	*s_ring_worker;	/* kernel thread id */
109da14cebeSEric Cheng 	char		s_ring_name[S_RING_NAMELEN + 1];
110da14cebeSEric Cheng 	uint32_t	s_ring_total_inpkt;
1110dc2366fSVenugopal Iyer 	uint32_t	s_ring_total_rbytes;
112da14cebeSEric Cheng 	uint32_t	s_ring_drops;
113da14cebeSEric Cheng 	struct mac_client_impl_s *s_ring_mcip;
114da14cebeSEric Cheng 	kstat_t		*s_ring_ksp;
115da14cebeSEric Cheng 
116da14cebeSEric Cheng 	/* Teardown, poll disable control ops */
117da14cebeSEric Cheng 	kcondvar_t	s_ring_client_cv; /* Client wait for control op */
118da14cebeSEric Cheng 
119da14cebeSEric Cheng 	mac_soft_ring_set_t *s_ring_set;   /* The SRS this ring belongs to */
120da14cebeSEric Cheng 	mac_soft_ring_t	*s_ring_next;
121da14cebeSEric Cheng 	mac_soft_ring_t	*s_ring_prev;
122da14cebeSEric Cheng 	mac_soft_ring_drain_func_t s_ring_drain_func;
1230dc2366fSVenugopal Iyer 
1240dc2366fSVenugopal Iyer 	mac_tx_stats_t	s_st_stat;
125da14cebeSEric Cheng };
126da14cebeSEric Cheng 
127da14cebeSEric Cheng typedef void (*mac_srs_drain_proc_t)(mac_soft_ring_set_t *, uint_t);
128da14cebeSEric Cheng 
129da14cebeSEric Cheng /* Transmit side Soft Ring Set */
130da14cebeSEric Cheng typedef struct mac_srs_tx_s {
131da14cebeSEric Cheng 	/* Members for Tx size processing */
132da14cebeSEric Cheng 	uint32_t	st_mode;
133da14cebeSEric Cheng 	mac_tx_func_t	st_func;
134da14cebeSEric Cheng 	void		*st_arg1;
135da14cebeSEric Cheng 	void		*st_arg2;
136da14cebeSEric Cheng 	mac_group_t	*st_group;	/* TX group for share */
137da14cebeSEric Cheng 	boolean_t	st_woken_up;
138da14cebeSEric Cheng 
139da14cebeSEric Cheng 	/*
140da14cebeSEric Cheng 	 * st_max_q_cnt is the queue depth threshold to limit
141da14cebeSEric Cheng 	 * outstanding packets on the Tx SRS. Once the limit
142da14cebeSEric Cheng 	 * is reached, Tx SRS will drop packets until the
143da14cebeSEric Cheng 	 * limit goes below the threshold.
144da14cebeSEric Cheng 	 */
145da14cebeSEric Cheng 	uint32_t	st_max_q_cnt;	/* max. outstanding packets */
146da14cebeSEric Cheng 	/*
147da14cebeSEric Cheng 	 * st_hiwat is used Tx serializer and bandwidth mode.
148da14cebeSEric Cheng 	 * This is the queue depth threshold upto which
149da14cebeSEric Cheng 	 * packets will get buffered with no flow-control
150da14cebeSEric Cheng 	 * back pressure applied to the caller. Once this
151da14cebeSEric Cheng 	 * threshold is reached, back pressure will be
152da14cebeSEric Cheng 	 * applied to the caller of mac_tx() (mac_tx() starts
153da14cebeSEric Cheng 	 * returning a cookie to indicate a blocked SRS).
154da14cebeSEric Cheng 	 * st_hiwat should always be lesser than or equal to
155da14cebeSEric Cheng 	 * st_max_q_cnt.
156da14cebeSEric Cheng 	 */
157da14cebeSEric Cheng 	uint32_t	st_hiwat;	/* mblk cnt to apply flow control */
158da14cebeSEric Cheng 	uint32_t	st_lowat;	/* mblk cnt to relieve flow control */
1590dc2366fSVenugopal Iyer 	uint32_t	st_hiwat_cnt; /* times blocked for Tx descs */
1600dc2366fSVenugopal Iyer 	mac_tx_stats_t	st_stat;
1610dc2366fSVenugopal Iyer 	mac_capab_aggr_t	st_capab_aggr;
162da14cebeSEric Cheng 	/*
1630dc2366fSVenugopal Iyer 	 * st_soft_rings is used as an array to store aggr Tx soft
1640dc2366fSVenugopal Iyer 	 * rings. When aggr_find_tx_ring() returns a pseudo ring,
1650dc2366fSVenugopal Iyer 	 * the associated soft ring has to be found. st_soft_rings
1660dc2366fSVenugopal Iyer 	 * array stores the soft ring associated with a pseudo Tx
1670dc2366fSVenugopal Iyer 	 * ring and it can be accessed using the pseudo ring
1680dc2366fSVenugopal Iyer 	 * index (mr_index). Note that the ring index is unique
1690dc2366fSVenugopal Iyer 	 * for each ring in a group.
170da14cebeSEric Cheng 	 */
1710dc2366fSVenugopal Iyer 	mac_soft_ring_t **st_soft_rings;
172da14cebeSEric Cheng } mac_srs_tx_t;
173da14cebeSEric Cheng 
174da14cebeSEric Cheng /* Receive side Soft Ring Set */
175da14cebeSEric Cheng typedef struct mac_srs_rx_s {
176da14cebeSEric Cheng 	/*
177da14cebeSEric Cheng 	 * Upcall Function for fanout, Rx processing etc. Perhaps
178da14cebeSEric Cheng 	 * the same 3 members below can be used for Tx
179da14cebeSEric Cheng 	 * processing, but looking around, mac_rx_func_t has
180da14cebeSEric Cheng 	 * proliferated too much into various files at different
181da14cebeSEric Cheng 	 * places. I am leaving the consolidation battle for
182da14cebeSEric Cheng 	 * another day.
183da14cebeSEric Cheng 	 */
184da14cebeSEric Cheng 	mac_direct_rx_t		sr_func;	/* srs_lock */
185da14cebeSEric Cheng 	void			*sr_arg1;	/* srs_lock */
1867ec6bfcfSToomas Soome 	mac_resource_handle_t	sr_arg2;	/* srs_lock */
187da14cebeSEric Cheng 	mac_rx_func_t		sr_lower_proc;	/* Atomically changed */
188da14cebeSEric Cheng 	uint32_t		sr_poll_pkt_cnt;
189da14cebeSEric Cheng 	uint32_t		sr_poll_thres;
190da14cebeSEric Cheng 
191da14cebeSEric Cheng 	/* mblk cnt to apply flow control */
192da14cebeSEric Cheng 	uint32_t		sr_hiwat;
193da14cebeSEric Cheng 	/* mblk cnt to relieve flow control */
194da14cebeSEric Cheng 	uint32_t		sr_lowat;
1950dc2366fSVenugopal Iyer 	mac_rx_stats_t		sr_stat;
196da14cebeSEric Cheng 
197da14cebeSEric Cheng 	/* Times polling was enabled */
198da14cebeSEric Cheng 	uint32_t		sr_poll_on;
199da14cebeSEric Cheng 	/* Times polling was enabled by worker thread */
200da14cebeSEric Cheng 	uint32_t		sr_worker_poll_on;
201da14cebeSEric Cheng 	/* Times polling was disabled */
202da14cebeSEric Cheng 	uint32_t		sr_poll_off;
203da14cebeSEric Cheng 	/* Poll thread signalled count */
204da14cebeSEric Cheng 	uint32_t		sr_poll_thr_sig;
205da14cebeSEric Cheng 	/* Poll thread busy */
206da14cebeSEric Cheng 	uint32_t		sr_poll_thr_busy;
207da14cebeSEric Cheng 	/* SRS drains, stays in poll mode but doesn't poll */
208da14cebeSEric Cheng 	uint32_t		sr_poll_drain_no_poll;
209da14cebeSEric Cheng 	/*
210da14cebeSEric Cheng 	 * SRS has nothing to do and no packets in H/W but
211da14cebeSEric Cheng 	 * there is a backlog in softrings. SRS stays in
212da14cebeSEric Cheng 	 * poll mode but doesn't do polling.
213da14cebeSEric Cheng 	 */
214da14cebeSEric Cheng 	uint32_t		sr_poll_no_poll;
215da14cebeSEric Cheng 	/* Active polling restarted */
216da14cebeSEric Cheng 	uint32_t		sr_below_hiwat;
217da14cebeSEric Cheng 	/* Found packets in last poll so try and poll again */
218da14cebeSEric Cheng 	uint32_t		sr_poll_again;
219da14cebeSEric Cheng 	/*
220da14cebeSEric Cheng 	 * Packets in queue but poll thread not allowed to process so
221da14cebeSEric Cheng 	 * signal the worker thread.
222da14cebeSEric Cheng 	 */
223da14cebeSEric Cheng 	uint32_t		sr_poll_sig_worker;
224da14cebeSEric Cheng 	/*
225da14cebeSEric Cheng 	 * Poll thread has nothing to do and H/W has nothing so
226da14cebeSEric Cheng 	 * reenable the interrupts.
227da14cebeSEric Cheng 	 */
228da14cebeSEric Cheng 	uint32_t		sr_poll_intr_enable;
229da14cebeSEric Cheng 	/*
230da14cebeSEric Cheng 	 * Poll thread has nothing to do and worker thread was already
231da14cebeSEric Cheng 	 * running so it can decide to reenable interrupt or poll again.
232da14cebeSEric Cheng 	 */
233da14cebeSEric Cheng 	uint32_t		sr_poll_goto_sleep;
234da14cebeSEric Cheng 	/* Worker thread goes back to draining the queue */
235da14cebeSEric Cheng 	uint32_t		sr_drain_again;
2368ac29891SEric Cheng 	/* More Packets in queue so signal the poll thread to drain */
237ae6aa22aSVenugopal Iyer 	uint32_t		sr_drain_poll_sig;
2388ac29891SEric Cheng 	/* More Packets in queue so signal the worker thread to drain */
2398ac29891SEric Cheng 	uint32_t		sr_drain_worker_sig;
240da14cebeSEric Cheng 	/* Poll thread is already running so worker has nothing to do */
241da14cebeSEric Cheng 	uint32_t		sr_drain_poll_running;
242da14cebeSEric Cheng 	/* We have packets already queued so keep polling */
243da14cebeSEric Cheng 	uint32_t		sr_drain_keep_polling;
244da14cebeSEric Cheng 	/* Drain is done and interrupts are reenabled */
245da14cebeSEric Cheng 	uint32_t		sr_drain_finish_intr;
246da14cebeSEric Cheng 	/* Polling thread needs to schedule worker wakeup */
247da14cebeSEric Cheng 	uint32_t		sr_poll_worker_wakeup;
248da14cebeSEric Cheng } mac_srs_rx_t;
249da14cebeSEric Cheng 
250da14cebeSEric Cheng /*
251da14cebeSEric Cheng  * mac_soft_ring_set_s:
252da14cebeSEric Cheng  * This is used both for Tx and Rx side. The srs_type identifies Rx or
253da14cebeSEric Cheng  * Tx type.
254da14cebeSEric Cheng  *
255da14cebeSEric Cheng  * Note that the structure is carefully crafted, with Rx elements coming
256da14cebeSEric Cheng  * first followed by Tx specific members. Future additions to this
257da14cebeSEric Cheng  * structure should follow the same guidelines.
258da14cebeSEric Cheng  *
 * Rx-side notes:
 * mac_rx_classify_flow_add() always creates a mac_soft_ring_set_t and fn_flow
 * points to info from it (func = srs_lower_proc, arg = soft_ring_set). On
 * the interrupt path, srs_lower_proc does B/W adjustment, switches to polling
 * mode (if poll capable) and feeds the packets to soft_ring_list via the
 * chosen fanout type (specified by srs_type). In poll mode, the poll thread,
 * which is also a pointer, can pick up the packets and feed them to various
 * soft_ring_list.
267da14cebeSEric Cheng  *
 * The srs_type can either be protocol based or fanout based, where fanout
 * itself can be of various types.
270da14cebeSEric Cheng  *
 * The polling works by turning off interrupts as soon as packets
 * are queued on the soft ring set. Once the backlog is clear and the poll
 * thread returns empty handed, i.e. the Rx ring doesn't have anything, the
 * interrupt is turned back on. For this purpose we keep a separate
 * srs_poll_pkt_cnt counter which tracks the packets queued between SRS
 * and the soft rings as well. The counter is incremented when packets
 * are queued and decremented when SRS processes them (in case it has
 * no soft rings) or the soft rings process them. It's important that
 * in case SRS has softrings, the decrement doesn't happen till the
 * packet is processed by the soft rings, since it takes very little time
 * for SRS to queue packets from SRS to soft rings and it will keep
 * bringing more packets into the system faster than soft rings can
 * process them.
284da14cebeSEric Cheng  *
285da14cebeSEric Cheng  * Tx side notes:
286da14cebeSEric Cheng  * The srs structure acts as a serializer with a worker thread. The
287da14cebeSEric Cheng  * default behavior of srs though is to act as a pass-thru. The queues
288da14cebeSEric Cheng  * (srs_first, srs_last, srs_count) get used when Tx ring runs out of Tx
289da14cebeSEric Cheng  * descriptors or to enforce bandwidth limits.
290da14cebeSEric Cheng  *
291da14cebeSEric Cheng  * When multiple Tx rings are present, the SRS state will be set to
292da14cebeSEric Cheng  * SRS_FANOUT_OTH. Outgoing packets coming into mac_tx_srs_process()
293da14cebeSEric Cheng  * function will be fanned out to one of the Tx side soft rings based on
294da14cebeSEric Cheng  * a hint passed in mac_tx_srs_process(). Each soft ring, in turn, will
295da14cebeSEric Cheng  * be associated with a distinct h/w Tx ring.
296da14cebeSEric Cheng  */
297da14cebeSEric Cheng 
298da14cebeSEric Cheng struct mac_soft_ring_set_s {
299da14cebeSEric Cheng 	/*
300da14cebeSEric Cheng 	 * Common elements, common to both Rx and Tx SRS type.
301da14cebeSEric Cheng 	 * The following block of fields are protected by srs_lock
302da14cebeSEric Cheng 	 */