1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2020 Joyent, Inc.
25 * Copyright 2015 Garrett D'Amore <garrett@damore.org>
26 * Copyright 2020 RackTop Systems, Inc.
27 */
28
29/*
30 * MAC Services Module
31 *
32 * The GLDv3 framework locking -  The MAC layer
33 * --------------------------------------------
34 *
35 * The MAC layer is central to the GLD framework and can provide the locking
36 * framework needed for itself and for the use of MAC clients. MAC end points
37 * are fairly disjoint and don't share a lot of state. So a coarse grained
38 * multi-threading scheme is to single thread all create/modify/delete or set
39 * type of control operations on a per mac end point while allowing data threads
40 * concurrently.
41 *
42 * Control operations (set) that modify a mac end point are always serialized on
43 * a per mac end point basis, We have at most 1 such thread per mac end point
44 * at a time.
45 *
46 * All other operations that are not serialized are essentially multi-threaded.
47 * For example a control operation (get) like getting statistics which may not
48 * care about reading values atomically or data threads sending or receiving
49 * data. Mostly these type of operations don't modify the control state. Any
50 * state these operations care about are protected using traditional locks.
51 *
52 * The perimeter only serializes serial operations. It does not imply there
53 * aren't any other concurrent operations. However a serialized operation may
54 * sometimes need to make sure it is the only thread. In this case it needs
55 * to use reference counting mechanisms to cv_wait until any current data
56 * threads are done.
57 *
58 * The mac layer itself does not hold any locks across a call to another layer.
59 * The perimeter is however held across a down call to the driver to make the
60 * whole control operation atomic with respect to other control operations.
61 * Also the data path and get type control operations may proceed concurrently.
62 * These operations synchronize with the single serial operation on a given mac
63 * end point using regular locks. The perimeter ensures that conflicting
64 * operations like say a mac_multicast_add and a mac_multicast_remove on the
65 * same mac end point don't interfere with each other and also ensures that the
66 * changes in the mac layer and the call to the underlying driver to say add a
67 * multicast address are done atomically without interference from a thread
68 * trying to delete the same address.
69 *
70 * For example, consider
71 * mac_multicst_add()
72 * {
73 *	mac_perimeter_enter();	serialize all control operations
74 *
75 *	grab list lock		protect against access by data threads
76 *	add to list
77 *	drop list lock
78 *
79 *	call driver's mi_multicst
80 *
81 *	mac_perimeter_exit();
82 * }
83 *
84 * To lessen the number of serialization locks and simplify the lock hierarchy,
85 * we serialize all the control operations on a per mac end point by using a
86 * single serialization lock called the perimeter. We allow recursive entry into
87 * the perimeter to facilitate use of this mechanism by both the mac client and
88 * the MAC layer itself.
89 *
90 * MAC client means an entity that does an operation on a mac handle
91 * obtained from a mac_open/mac_client_open. Similarly MAC driver means
92 * an entity that does an operation on a mac handle obtained from a
93 * mac_register. An entity could be both client and driver but on different
94 * handles eg. aggr. and should only make the corresponding mac interface calls
95 * i.e. mac driver interface or mac client interface as appropriate for that
96 * mac handle.
97 *
98 * General rules.
99 * -------------
100 *
101 * R1. The lock order of upcall threads is natually opposite to downcall
102 * threads. Hence upcalls must not hold any locks across layers for fear of
103 * recursive lock enter and lock order violation. This applies to all layers.
104 *
105 * R2. The perimeter is just another lock. Since it is held in the down
106 * direction, acquiring the perimeter in an upcall is prohibited as it would
107 * cause a deadlock. This applies to all layers.
108 *
109 * Note that upcalls that need to grab the mac perimeter (for example
110 * mac_notify upcalls) can still achieve that by posting the request to a
111 * thread, which can then grab all the required perimeters and locks in the
112 * right global order. Note that in the above example the mac layer iself
113 * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
114 * to the client must do that. Please see the aggr code for an example.
115 *
116 * MAC client rules
117 * ----------------
118 *
119 * R3. A MAC client may use the MAC provided perimeter facility to serialize
120 * control operations on a per mac end point. It does this by by acquring
121 * and holding the perimeter across a sequence of calls to the mac layer.
122 * This ensures atomicity across the entire block of mac calls. In this
123 * model the MAC client must not hold any client locks across the calls to
124 * the mac layer. This model is the preferred solution.
125 *
126 * R4. However if a MAC client has a lot of global state across all mac end
127 * points the per mac end point serialization may not be sufficient. In this
128 * case the client may choose to use global locks or use its own serialization.
129 * To avoid deadlocks, these client layer locks held across the mac calls
130 * in the control path must never be acquired by the data path for the reason
131 * mentioned below.
132 *
133 * (Assume that a control operation that holds a client lock blocks in the
134 * mac layer waiting for upcall reference counts to drop to zero. If an upcall
135 * data thread that holds this reference count, tries to acquire the same
136 * client lock subsequently it will deadlock).
137 *
138 * A MAC client may follow either the R3 model or the R4 model, but can't
139 * mix both. In the former, the hierarchy is Perim -> client locks, but in
140 * the latter it is client locks -> Perim.
141 *
142 * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
143 * context since they may block while trying to acquire the perimeter.
144 * In addition some calls may block waiting for upcall refcnts to come down to
145 * zero.
146 *
147 * R6. MAC clients must make sure that they are single threaded and all threads
148 * from the top (in particular data threads) have finished before calling
149 * mac_client_close. The MAC framework does not track the number of client
150 * threads using the mac client handle. Also mac clients must make sure
151 * they have undone all the control operations before calling mac_client_close.
152 * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
153 * mac_unicast_add/mac_multicast_add.
154 *
155 * MAC framework rules
156 * -------------------
157 *
158 * R7. The mac layer itself must not hold any mac layer locks (except the mac
159 * perimeter) across a call to any other layer from the mac layer. The call to
160 * any other layer could be via mi_* entry points, classifier entry points into
161 * the driver or via upcall pointers into layers above. The mac perimeter may
162 * be acquired or held only in the down direction, for e.g. when calling into
163 * a mi_* driver enty point to provide atomicity of the operation.
164 *
165 * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
166 * mac driver interfaces, the MAC layer must provide a cut out for control
167 * interfaces like upcall notifications and start them in a separate thread.
168 *
169 * R9. Note that locking order also implies a plumbing order. For example
170 * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
171 * to plumb in any other order must be failed at mac_open time, otherwise it
172 * could lead to deadlocks due to inverse locking order.
173 *
174 * R10. MAC driver interfaces must not block since the driver could call them
175 * in interrupt context.
176 *
177 * R11. Walkers must preferably not hold any locks while calling walker
178 * callbacks. Instead these can operate on reference counts. In simple
179 * callbacks it may be ok to hold a lock and call the callbacks, but this is
180 * harder to maintain in the general case of arbitrary callbacks.
181 *
182 * R12. The MAC layer must protect upcall notification callbacks using reference
183 * counts rather than holding locks across the callbacks.
184 *
185 * R13. Given the variety of drivers, it is preferable if the MAC layer can make
186 * sure that any pointers (such as mac ring pointers) it passes to the driver
187 * remain valid until mac unregister time. Currently the mac layer achieves
188 * this by using generation numbers for rings and freeing the mac rings only
189 * at unregister time.  The MAC layer must provide a layer of indirection and
190 * must not expose underlying driver rings or driver data structures/pointers
191 * directly to MAC clients.
192 *
193 * MAC driver rules
194 * ----------------
195 *
196 * R14. It would be preferable if MAC drivers don't hold any locks across any
197 * mac call. However at a minimum they must not hold any locks across data
198 * upcalls. They must also make sure that all references to mac data structures
199 * are cleaned up and that it is single threaded at mac_unregister time.
200 *
201 * R15. MAC driver interfaces don't block and so the action may be done
202 * asynchronously in a separate thread as for example handling notifications.
203 * The driver must not assume that the action is complete when the call
204 * returns.
205 *
206 * R16. Drivers must maintain a generation number per Rx ring, and pass it
207 * back to mac_rx_ring(); They are expected to increment the generation
208 * number whenever the ring's stop routine is invoked.
209 * See comments in mac_rx_ring();
210 *
211 * R17 Similarly mi_stop is another synchronization point and the driver must
212 * ensure that all upcalls are done and there won't be any future upcall
213 * before returning from mi_stop.
214 *
215 * R18. The driver may assume that all set/modify control operations via
216 * the mi_* entry points are single threaded on a per mac end point.
217 *
218 * Lock and Perimeter hierarchy scenarios
219 * ---------------------------------------
220 *
221 * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
222 *
223 * ft_lock -> fe_lock [mac_flow_lookup]
224 *
225 * mi_rw_lock -> fe_lock [mac_bcast_send]
226 *
227 * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
228 *
229 * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
230 *
231 * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
232 *
233 * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
234 * client to driver. In the case of clients that explictly use the mac provided
235 * perimeter mechanism for its serialization, the hierarchy is
236 * Perimeter -> mac layer locks, since the client never holds any locks across
237 * the mac calls. In the case of clients that use its own locks the hierarchy
238 * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
239 * calls mac_perim_enter/exit in this case.
240 *
241 * Subflow creation rules
242 * ---------------------------
243 * o In case of a user specified cpulist present on underlying link and flows,
244 * the flows cpulist must be a subset of the underlying link.
245 * o In case of a user specified fanout mode present on link and flow, the
246 * subflow fanout count has to be less than or equal to that of the
247 * underlying link. The cpu-bindings for the subflows will be a subset of
248 * the underlying link.
249 * o In case if no cpulist specified on both underlying link and flow, the
250 * underlying link relies on a  MAC tunable to provide out of box fanout.
251 * The subflow will have no cpulist (the subflow will be unbound)
252 * o In case if no cpulist is specified on the underlying link, a subflow can
253 * carry  either a user-specified cpulist or fanout count. The cpu-bindings
254 * for the subflow will not adhere to restriction that they need to be subset
255 * of the underlying link.
256 * o In case where the underlying link is carrying either a user specified
257 * cpulist or fanout mode and for a unspecified subflow, the subflow will be
258 * created unbound.
259 * o While creating unbound subflows, bandwidth mode changes attempt to
260 * figure a right fanout count. In such cases the fanout count will override
261 * the unbound cpu-binding behavior.
262 * o In addition to this, while cycling between flow and link properties, we
263 * impose a restriction that if a link property has a subflow with
264 * user-specified attributes, we will not allow changing the link property.
265 * The administrator needs to reset all the user specified properties for the
266 * subflows before attempting a link property change.
267 * Some of the above rules can be overridden by specifying additional command
268 * line options while creating or modifying link or subflow properties.
269 *
270 * Datapath
271 * --------
272 *
273 * For information on the datapath, the world of soft rings, hardware rings, how
274 * it is structured, and the path of an mblk_t between a driver and a mac
275 * client, see mac_sched.c.
276 */
277
278#include <sys/types.h>
279#include <sys/conf.h>
280#include <sys/id_space.h>
281#include <sys/esunddi.h>
282#include <sys/stat.h>
283#include <sys/mkdev.h>
284#include <sys/stream.h>
285#include <sys/strsun.h>
286#include <sys/strsubr.h>
287#include <sys/dlpi.h>
288#include <sys/list.h>
289#include <sys/modhash.h>
290#include <sys/mac_provider.h>
291#include <sys/mac_client_impl.h>
292#include <sys/mac_soft_ring.h>
293#include <sys/mac_stat.h>
294#include <sys/mac_impl.h>
295#include <sys/mac.h>
296#include <sys/dls.h>
297#include <sys/dld.h>
298#include <sys/modctl.h>
299#include <sys/fs/dv_node.h>
300#include <sys/thread.h>
301#include <sys/proc.h>
302#include <sys/callb.h>
303#include <sys/cpuvar.h>
304#include <sys/atomic.h>
305#include <sys/bitmap.h>
306#include <sys/sdt.h>
307#include <sys/mac_flow.h>
308#include <sys/ddi_intr_impl.h>
309#include <sys/disp.h>
310#include <sys/sdt.h>
311#include <sys/vnic.h>
312#include <sys/vnic_impl.h>
313#include <sys/vlan.h>
314#include <inet/ip.h>
315#include <inet/ip6.h>
316#include <sys/exacct.h>
317#include <sys/exacct_impl.h>
318#include <inet/nd.h>
319#include <sys/ethernet.h>
320#include <sys/pool.h>
321#include <sys/pool_pset.h>
322#include <sys/cpupart.h>
323#include <inet/wifi_ioctl.h>
324#include <net/wpa.h>
325
326#define	IMPL_HASHSZ	67	/* prime */
327
328kmem_cache_t		*i_mac_impl_cachep;
329mod_hash_t		*i_mac_impl_hash;
330krwlock_t		i_mac_impl_lock;
331uint_t			i_mac_impl_count;
332static kmem_cache_t	*mac_ring_cache;
333static id_space_t	*minor_ids;
334static uint32_t		minor_count;
335static pool_event_cb_t	mac_pool_event_reg;
336
337/*
338 * Logging stuff. Perhaps mac_logging_interval could be broken into
339 * mac_flow_log_interval and mac_link_log_interval if we want to be
340 * able to schedule them differently.
341 */
342uint_t			mac_logging_interval;
343boolean_t		mac_flow_log_enable;
344boolean_t		mac_link_log_enable;
345timeout_id_t		mac_logging_timer;
346
347#define	MACTYPE_KMODDIR	"mac"
348#define	MACTYPE_HASHSZ	67
349static mod_hash_t	*i_mactype_hash;
350/*
351 * i_mactype_lock synchronizes threads that obtain references to mactype_t
352 * structures through i_mactype_getplugin().
353 */
354static kmutex_t		i_mactype_lock;
355
356/*
357 * mac_tx_percpu_cnt
358 *
359 * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
360 * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
361 * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
362 * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
363 */
364int mac_tx_percpu_cnt;
365int mac_tx_percpu_cnt_max = 128;
366
367/*
368 * Call back functions for the bridge module.  These are guaranteed to be valid
369 * when holding a reference on a link or when holding mip->mi_bridge_lock and
370 * mi_bridge_link is non-NULL.
371 */
372mac_bridge_tx_t mac_bridge_tx_cb;
373mac_bridge_rx_t mac_bridge_rx_cb;
374mac_bridge_ref_t mac_bridge_ref_cb;
375mac_bridge_ls_t mac_bridge_ls_cb;
376
377static int i_mac_constructor(void *, void *, int);
378static void i_mac_destructor(void *, void *);
379static int i_mac_ring_ctor(void *, void *, int);
380static void i_mac_ring_dtor(void *, void *);
381static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
382void mac_tx_client_flush(mac_client_impl_t *);
383void mac_tx_client_block(mac_client_impl_t *);
384static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
385static int mac_start_group_and_rings(mac_group_t *);
386static void mac_stop_group_and_rings(mac_group_t *);
387static void mac_pool_event_cb(pool_event_t, int, void *);
388
389typedef struct netinfo_s {
390	list_node_t	ni_link;
391	void		*ni_record;
392	int		ni_size;
393	int		ni_type;
394} netinfo_t;
395
396/*
397 * Module initialization functions.
398 */
399
400void
401mac_init(void)
402{
403	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
404	    boot_max_ncpus);
405
406	/* Upper bound is mac_tx_percpu_cnt_max */
407	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
408		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
409
410	if (mac_tx_percpu_cnt < 1) {
411		/* Someone set max_tx_percpu_cnt_max to 0 or less */
412		mac_tx_percpu_cnt = 1;
413	}
414
415	ASSERT(mac_tx_percpu_cnt >= 1);
416	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
417	/*
418	 * Make it of the form 2**N - 1 in the range
419	 * [0 .. mac_tx_percpu_cnt_max - 1]
420	 */
421	mac_tx_percpu_cnt--;
422
423	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
424	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
425	    NULL, NULL, NULL, 0);
426	ASSERT(i_mac_impl_cachep != NULL);
427
428	mac_ring_cache = kmem_cache_create("mac_ring_cache",
429	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
430	    NULL, NULL, 0);
431	ASSERT(mac_ring_cache != NULL);
432
433	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
434	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
435	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
436	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
437
438	mac_flow_init();
439	mac_soft_ring_init();
440	mac_bcast_init();
441	mac_client_init();
442
443	i_mac_impl_count = 0;
444
445	i_mactype_hash = mod_hash_create_extended("mactype_hash",
446	    MACTYPE_HASHSZ,
447	    mod_hash_null_keydtor, mod_hash_null_valdtor,
448	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
449
450	/*
451	 * Allocate an id space to manage minor numbers. The range of the
452	 * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
453	 * leaves half of the 32-bit minors available for driver private use.
454	 */
455	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
456	    MAC_PRIVATE_MINOR-1);
457	ASSERT(minor_ids != NULL);
458	minor_count = 0;
459
460	/* Let's default to 20 seconds */
461	mac_logging_interval = 20;
462	mac_flow_log_enable = B_FALSE;
463	mac_link_log_enable = B_FALSE;
464	mac_logging_timer = NULL;
465
466	/* Register to be notified of noteworthy pools events */
467	mac_pool_event_reg.pec_func =  mac_pool_event_cb;
468	mac_pool_event_reg.pec_arg = NULL;
469	pool_event_cb_register(&mac_pool_event_reg);
470}
471
472int
473mac_fini(void)
474{
475
476	if (i_mac_impl_count > 0 || minor_count > 0)
477		return (EBUSY);
478
479	pool_event_cb_unregister(&mac_pool_event_reg);
480
481	id_space_destroy(minor_ids);
482	mac_flow_fini();
483
484	mod_hash_destroy_hash(i_mac_impl_hash);
485	rw_destroy(&i_mac_impl_lock);
486
487	mac_client_fini();
488	kmem_cache_destroy(mac_ring_cache);
489
490	mod_hash_destroy_hash(i_mactype_hash);
491	mac_soft_ring_finish();
492
493
494	return (0);
495}
496
497/*
498 * Initialize a GLDv3 driver's device ops.  A driver that manages its own ops
499 * (e.g. softmac) may pass in a NULL ops argument.
500 */
501void
502mac_init_ops(struct dev_ops *ops, const char *name)
503{
504	major_t major = ddi_name_to_major((char *)name);
505
506	/*
507	 * By returning on error below, we are not letting the driver continue
508	 * in an undefined context.  The mac_register() function will faill if
509	 * DN_GLDV3_DRIVER isn't set.
510	 */
511	if (major == DDI_MAJOR_T_NONE)
512		return;
513	LOCK_DEV_OPS(&devnamesp[major].dn_lock);
514	devnamesp[major].dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER);
515	UNLOCK_DEV_OPS(&devnamesp[major].dn_lock);
516	if (ops != NULL)
517		dld_init_ops(ops, name);
518}
519
520void
521mac_fini_ops(struct dev_ops *ops)
522{
523	dld_fini_ops(ops);
524}
525
526/*ARGSUSED*/
527static int
528i_mac_constructor(void *buf, void *arg, int kmflag)
529{
530	mac_impl_t	*mip = buf;
531
532	bzero(buf, sizeof (mac_impl_t));
533
534	mip->mi_linkstate = LINK_STATE_UNKNOWN;
535
536	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
537	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
538	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
539	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
540
541	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
542	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
543	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
544	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
545
546	mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);
547
548	return (0);
549}
550
551/*ARGSUSED*/
552static void
553i_mac_destructor(void *buf, void *arg)
554{
555	mac_impl_t	*mip = buf;
556	mac_cb_info_t	*mcbi;
557
558	ASSERT(mip->mi_ref == 0);
559	ASSERT(mip->mi_active == 0);
560	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
561	ASSERT(mip->mi_devpromisc == 0);
562	ASSERT(mip->mi_ksp == NULL);
563	ASSERT(mip->mi_kstat_count == 0);
564	ASSERT(mip->mi_nclients == 0);
565	ASSERT(mip->mi_nactiveclients == 0);
566	ASSERT(mip->mi_single_active_client == NULL);
567	ASSERT(mip->mi_state_flags == 0);
568	ASSERT(mip->mi_factory_addr == NULL);
569	ASSERT(mip->mi_factory_addr_num == 0);
570	ASSERT(mip->mi_default_tx_ring == NULL);
571
572	mcbi = &mip->mi_notify_cb_info;
573	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
574	ASSERT(mip->mi_notify_bits == 0);
575	ASSERT(mip->mi_notify_thread == NULL);
576	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
577	mcbi->mcbi_lockp = NULL;
578
579	mcbi = &mip->mi_promisc_cb_info;
580	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
581	ASSERT(mip->mi_promisc_list == NULL);
582	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
583	mcbi->mcbi_lockp = NULL;
584
585	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
586	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
587
588	rw_destroy(&mip->mi_rw_lock);
589
590	mutex_destroy(&mip->mi_promisc_lock);
591	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
592	mutex_destroy(&mip->mi_notify_lock);
593	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
594	mutex_destroy(&mip->mi_ring_lock);
595
596	ASSERT(mip->mi_bridge_link == NULL);
597}
598
599/* ARGSUSED */
600static int
601i_mac_ring_ctor(void *buf, void *arg, int kmflag)
602{
603	mac_ring_t *ring = (mac_ring_t *)buf;
604
605	bzero(ring, sizeof (mac_ring_t));
606	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
607	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
608	ring->mr_state = MR_FREE;
609	return (0);
610}
611
612/* ARGSUSED */
613static void
614i_mac_ring_dtor(void *buf, void *arg)
615{
616	mac_ring_t *ring = (mac_ring_t *)buf;
617
618	cv_destroy(&ring->mr_cv);
619	mutex_destroy(&ring->mr_lock);
620}
621
622/*
623 * Common functions to do mac callback addition and deletion. Currently this is
624 * used by promisc callbacks and notify callbacks. List addition and deletion
625 * need to take care of list walkers. List walkers in general, can't hold list
626 * locks and make upcall callbacks due to potential lock order and recursive
627 * reentry issues. Instead list walkers increment the list walker count to mark
628 * the presence of a walker thread. Addition can be carefully done to ensure
629 * that the list walker always sees either the old list or the new list.
630 * However the deletion can't be done while the walker is active, instead the
631 * deleting thread simply marks the entry as logically deleted. The last walker
632 * physically deletes and frees up the logically deleted entries when the walk
633 * is complete.
634 */
635void
636mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
637    mac_cb_t *mcb_elem)
638{
639	mac_cb_t	*p;
640	mac_cb_t	**pp;
641
642	/* Verify it is not already in the list */
643	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
644		if (p == mcb_elem)
645			break;
646	}
647	VERIFY(p == NULL);
648
649	/*
650	 * Add it to the head of the callback list. The membar ensures that
651	 * the following list pointer manipulations reach global visibility
652	 * in exactly the program order below.
653	 */
654	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
655
656	mcb_elem->mcb_nextp = *mcb_head;
657	membar_producer();
658	*mcb_head = mcb_elem;
659}
660
661/*
662 * Mark the entry as logically deleted. If there aren't any walkers unlink
663 * from the list. In either case return the corresponding status.
664 */
665boolean_t
666mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
667    mac_cb_t *mcb_elem)
668{
669	mac_cb_t	*p;
670	mac_cb_t	**pp;
671
672	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
673	/*
674	 * Search the callback list for the entry to be removed
675	 */
676	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
677		if (p == mcb_elem)
678			break;
679	}
680	VERIFY(p != NULL);
681
682	/*
683	 * If there are walkers just mark it as deleted and the last walker
684	 * will remove from the list and free it.
685	 */
686	if (mcbi->mcbi_walker_cnt != 0) {
687		p->mcb_flags |= MCB_CONDEMNED;
688		mcbi->mcbi_del_cnt++;
689		return (B_FALSE);
690	}
691
692	ASSERT(mcbi->mcbi_del_cnt == 0);
693	*pp = p->mcb_nextp;
694	p->mcb_nextp = NULL;
695	return (B_TRUE);
696}
697
698/*
699 * Wait for all pending callback removals to be completed
700 */
701void
702mac_callback_remove_wait(mac_cb_info_t *mcbi)
703{
704	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
705	while (mcbi->mcbi_del_cnt != 0) {
706		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
707		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
708	}
709}
710
711void
712mac_callback_barrier(mac_cb_info_t *mcbi)
713{
714	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
715	ASSERT3U(mcbi->mcbi_barrier_cnt, <, UINT_MAX);
716
717	if (mcbi->mcbi_walker_cnt == 0) {
718		return;
719	}
720
721	mcbi->mcbi_barrier_cnt++;
722	do {
723		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
724	} while (mcbi->mcbi_walker_cnt > 0);
725	mcbi->mcbi_barrier_cnt--;
726	cv_broadcast(&mcbi->mcbi_cv);
727}
728
729void
730mac_callback_walker_enter(mac_cb_info_t *mcbi)
731{
732	mutex_enter(mcbi->mcbi_lockp);
733	/*
734	 * Incoming walkers should give precedence to timely clean-up of
735	 * deleted callback entries and requested barriers.
736	 */
737	while (mcbi->mcbi_del_cnt > 0 || mcbi->mcbi_barrier_cnt > 0) {
738		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
739	}
740	mcbi->mcbi_walker_cnt++;
741	mutex_exit(mcbi->mcbi_lockp);
742}
743
744/*
745 * The last mac callback walker does the cleanup. Walk the list and unlik
746 * all the logically deleted entries and construct a temporary list of
747 * removed entries. Return the list of removed entries to the caller.
748 */
749static mac_cb_t *
750mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
751{
752	mac_cb_t	*p;
753	mac_cb_t	**pp;
754	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
755	int	cnt = 0;
756
757	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
758	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
759
760	pp = mcb_head;
761	while (*pp != NULL) {
762		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
763			p = *pp;
764			*pp = p->mcb_nextp;
765			p->mcb_nextp = rmlist;
766			rmlist = p;
767			cnt++;
768			continue;
769		}
770		pp = &(*pp)->mcb_nextp;
771	}
772
773	ASSERT(mcbi->mcbi_del_cnt == cnt);
774	mcbi->mcbi_del_cnt = 0;
775	return (rmlist);
776}
777
778void
779mac_callback_walker_exit(mac_cb_info_t *mcbi, mac_cb_t **headp,
780    boolean_t is_promisc)
781{
782	boolean_t do_wake = B_FALSE;
783
784	mutex_enter(mcbi->mcbi_lockp);
785
786	/* If walkers remain, nothing more can be done for now */
787	if (--mcbi->mcbi_walker_cnt != 0) {
788		mutex_exit(mcbi->mcbi_lockp);
789		return;
790	}
791
792	if (mcbi->mcbi_del_cnt != 0) {
793		mac_cb_t *rmlist;
794
795		rmlist = mac_callback_walker_cleanup(mcbi, headp);
796
797		if (!is_promisc) {
798			/* The "normal" non-promisc callback clean-up */
799			mac_callback_free(rmlist);
800		} else {
801			mac_cb_t *mcb, *mcb_next;
802
803			/*
804			 * The promisc callbacks are in 2 lists, one off the
805			 * 'mip' and another off the 'mcip' threaded by
806			 * mpi_mi_link and mpi_mci_link respectively.  There
807			 * is, however, only a single shared total walker
808			 * count, and an entry cannot be physically unlinked if
809			 * a walker is active on either list. The last walker
810			 * does this cleanup of logically deleted entries.
811			 *
812			 * With a list of callbacks deleted from above from
813			 * mi_promisc_list (headp), remove the corresponding
814			 * entry from mci_promisc_list (headp_pair) and free
815			 * the structure.
816			 */
817			for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
818				mac_promisc_impl_t *mpip;
819				mac_client_impl_t *mcip;
820
821				mcb_next = mcb->mcb_nextp;
822				mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
823				mcip = mpip->mpi_mcip;
824
825				ASSERT3P(&mcip->mci_mip->mi_promisc_cb_info,
826				    ==, mcbi);
827				ASSERT3P(&mcip->mci_mip->mi_promisc_list,
828				    ==, headp);
829
830				VERIFY(mac_callback_remove(mcbi,
831				    &mcip->mci_promisc_list,
832				    &mpip->mpi_mci_link));
833				mcb->mcb_flags = 0;
834				mcb->mcb_nextp = NULL;
835				kmem_cache_free(mac_promisc_impl_cache, mpip);
836			}
837		}
838
839		/*
840		 * Wake any walker threads that could be waiting in
841		 * mac_callback_walker_enter() until deleted items have been
842		 * cleaned from the list.
843		 */
844		do_wake = B_TRUE;
845	}
846
847	if (mcbi->mcbi_barrier_cnt != 0) {
848		/*
849		 * One or more threads are waiting for all walkers to exit the
850		 * callback list.  Notify them, now that the list is clear.
851		 */
852		do_wake = B_TRUE;
853	}
854
855	if (do_wake) {
856		cv_broadcast(&mcbi->mcbi_cv);
857	}
858	mutex_exit(mcbi->mcbi_lockp);
859}
860
861static boolean_t
862mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
863{
864	mac_cb_t	*mcb;
865
866	/* Verify it is not already in the list */
867	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
868		if (mcb == mcb_elem)
869			return (B_TRUE);
870	}
871
872	return (B_FALSE);
873}
874
875static boolean_t
876mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
877{
878	boolean_t	found;
879
880	mutex_enter(mcbi->mcbi_lockp);
881	found = mac_callback_lookup(mcb_headp, mcb_elem);
882	mutex_exit(mcbi->mcbi_lockp);
883
884	return (found);
885}
886
887/* Free the list of removed callbacks */
888void
889mac_callback_free(mac_cb_t *rmlist)
890{
891	mac_cb_t	*mcb;
892	mac_cb_t	*mcb_next;
893
894	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
895		mcb_next = mcb->mcb_nextp;
896		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
897	}
898}
899
900void
901i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
902{
903	mac_cb_info_t	*mcbi;
904
905	/*
906	 * Signal the notify thread even after mi_ref has become zero and
907	 * mi_disabled is set. The synchronization with the notify thread
908	 * happens in mac_unregister and that implies the driver must make
909	 * sure it is single-threaded (with respect to mac calls) and that
910	 * all pending mac calls have returned before it calls mac_unregister
911	 */
912	rw_enter(&i_mac_impl_lock, RW_READER);
913	if (mip->mi_state_flags & MIS_DISABLED)
914		goto exit;
915
916	/*
917	 * Guard against incorrect notifications.  (Running a newer
918	 * mac client against an older implementation?)
919	 */
920	if (type >= MAC_NNOTE)
921		goto exit;
922
923	mcbi = &mip->mi_notify_cb_info;
924	mutex_enter(mcbi->mcbi_lockp);
925	mip->mi_notify_bits |= (1 << type);
926	cv_broadcast(&mcbi->mcbi_cv);
927	mutex_exit(mcbi->mcbi_lockp);
928
929exit:
930	rw_exit(&i_mac_impl_lock);
931}
932
933/*
934 * Mac serialization primitives. Please see the block comment at the
935 * top of the file.
936 */
937void
938i_mac_perim_enter(mac_impl_t *mip)
939{
940	mac_client_impl_t	*mcip;
941
942	if (mip->mi_state_flags & MIS_IS_VNIC) {
943		/*
944		 * This is a VNIC. Return the lower mac since that is what
945		 * we want to serialize on.
946		 */
947		mcip = mac_vnic_lower(mip);
948		mip = mcip->mci_mip;
949	}
950
951	mutex_enter(&mip->mi_perim_lock);
952	if (mip->mi_perim_owner == curthread) {
953		mip->mi_perim_ocnt++;
954		mutex_exit(&mip->mi_perim_lock);
955		return;
956	}
957
958	while (mip->mi_perim_owner != NULL)
959		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
960
961	mip->mi_perim_owner = curthread;
962	ASSERT(mip->mi_perim_ocnt == 0);
963	mip->mi_perim_ocnt++;
964#ifdef DEBUG
965	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
966	    MAC_PERIM_STACK_DEPTH);
967#endif
968	mutex_exit(&mip->mi_perim_lock);
969}
970
971int
972i_mac_perim_enter_nowait(mac_impl_t *mip)
973{
974	/*
975	 * The vnic is a special case, since the serialization is done based
976	 * on the lower mac. If the lower mac is busy, it does not imply the
977	 * vnic can't be unregistered. But in the case of other drivers,
978	 * a busy perimeter or open mac handles implies that the mac is busy
979	 * and can't be unregistered.
980	 */
981	if (mip->mi_state_flags & MIS_IS_VNIC) {
982		i_mac_perim_enter(mip);
983		return (0);
984	}
985
986	mutex_enter(&mip->mi_perim_lock);
987	if (mip->mi_perim_owner != NULL) {
988		mutex_exit(&mip->mi_perim_lock);
989		return (EBUSY);
990	}
991	ASSERT(mip->mi_perim_ocnt == 0);
992	mip->mi_perim_owner = curthread;
993	mip->mi_perim_ocnt++;
994	mutex_exit(&mip->mi_perim_lock);
995
996	return (0);
997}
998
999void
1000i_mac_perim_exit(mac_impl_t *mip)
1001{
1002	mac_client_impl_t *mcip;
1003
1004	if (mip->mi_state_flags & MIS_IS_VNIC) {
1005		/*
1006		 * This is a VNIC. Return the lower mac since that is what
1007		 * we want to serialize on.
1008		 */
1009		mcip = mac_vnic_lower(mip);
1010		mip = mcip->mci_mip;
1011	}
1012
1013	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
1014
1015	mutex_enter(&mip->mi_perim_lock);
1016	if (--mip->mi_perim_ocnt == 0) {
1017		mip->mi_perim_owner = NULL;
1018		cv_signal(&mip->mi_perim_cv);
1019	}
1020	mutex_exit(&mip->mi_perim_lock);
1021}
1022
1023/*
1024 * Returns whether the current thread holds the mac perimeter. Used in making
1025 * assertions.
1026 */
1027boolean_t
1028mac_perim_held(mac_handle_t mh)
1029{
1030	mac_impl_t	*mip = (mac_impl_t *)mh;
1031	mac_client_impl_t *mcip;
1032
1033	if (mip->mi_state_flags & MIS_IS_VNIC) {
1034		/*
1035		 * This is a VNIC. Return the lower mac since that is what
1036		 * we want to serialize on.
1037		 */
1038		mcip = mac_vnic_lower(mip);
1039		mip = mcip->mci_mip;
1040	}
1041	return (mip->mi_perim_owner == curthread);
1042}
1043
1044/*
1045 * mac client interfaces to enter the mac perimeter of a mac end point, given
1046 * its mac handle, or macname or linkid.
1047 */
1048void
1049mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
1050{
1051	mac_impl_t	*mip = (mac_impl_t *)mh;
1052
1053	i_mac_perim_enter(mip);
1054	/*
1055	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
1056	 * mac_open has been done internally while entering the perimeter.
1057	 * This information is used in mac_perim_exit
1058	 */
1059	MAC_ENCODE_MPH(*mphp, mip, 0);
1060}
1061
1062int
1063mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
1064{
1065	int	err;
1066	mac_handle_t	mh;
1067
1068	if ((err = mac_open(name, &mh)) != 0)
1069		return (err);
1070
1071	mac_perim_enter_by_mh(mh, mphp);
1072	MAC_ENCODE_MPH(*mphp, mh, 1);
1073	return (0);
1074}
1075
1076int
1077mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
1078{
1079	int	err;
1080	mac_handle_t	mh;
1081
1082	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
1083		return (err);
1084
1085	mac_perim_enter_by_mh(mh, mphp);
1086	MAC_ENCODE_MPH(*mphp, mh, 1);
1087	return (0);
1088}
1089
1090void
1091mac_perim_exit(mac_perim_handle_t mph)
1092{
1093	mac_impl_t	*mip;
1094	boolean_t	need_close;
1095
1096	MAC_DECODE_MPH(mph, mip, need_close);
1097	i_mac_perim_exit(mip);
1098	if (need_close)
1099		mac_close((mac_handle_t)mip);
1100}
1101
1102int
1103mac_hold(const char *macname, mac_impl_t **pmip)
1104{
1105	mac_impl_t	*mip;
1106	int		err;
1107
1108	/*
1109	 * Check the device name length to make sure it won't overflow our
1110	 * buffer.
1111	 */
1112	if (strlen(macname) >= MAXNAMELEN)
1113		return (EINVAL);
1114
1115	/*
1116	 * Look up its entry in the global hash table.
1117	 */
1118	rw_enter(&i_mac_impl_lock, RW_WRITER);
1119	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
1120	    (mod_hash_val_t *)&mip);
1121
1122	if (err != 0) {
1123		rw_exit(&i_mac_impl_lock);
1124		return (ENOENT);
1125	}
1126
1127	if (mip->mi_state_flags & MIS_DISABLED) {
1128		rw_exit(&i_mac_impl_lock);
1129		return (ENOENT);
1130	}
1131
1132	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
1133		rw_exit(&i_mac_impl_lock);
1134		return (EBUSY);
1135	}
1136
1137	mip->mi_ref++;
1138	rw_exit(&i_mac_impl_lock);
1139
1140	*pmip = mip;
1141	return (0);
1142}
1143
1144void
1145mac_rele(mac_impl_t *mip)
1146{
1147	rw_enter(&i_mac_impl_lock, RW_WRITER);
1148	ASSERT(mip->mi_ref != 0);
1149	if (--mip->mi_ref == 0) {
1150		ASSERT(mip->mi_nactiveclients == 0 &&
1151		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
1152	}
1153	rw_exit(&i_mac_impl_lock);
1154}
1155
1156/*
1157 * Private GLDv3 function to start a MAC instance.
1158 */
1159int
1160mac_start(mac_handle_t mh)
1161{
1162	mac_impl_t	*mip = (mac_impl_t *)mh;
1163	int		err = 0;
1164	mac_group_t	*defgrp;
1165
1166	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1167	ASSERT(mip->mi_start != NULL);
1168
1169	/*
1170	 * Check whether the device is already started.
1171	 */
1172	if (mip->mi_active++ == 0) {
1173		mac_ring_t *ring = NULL;
1174
1175		/*
1176		 * Start the device.
1177		 */
1178		err = mip->mi_start(mip->mi_driver);
1179		if (err != 0) {
1180			mip->mi_active--;
1181			return (err);
1182		}
1183
1184		/*
1185		 * Start the default tx ring.
1186		 */
1187		if (mip->mi_default_tx_ring != NULL) {
1188
1189			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1190			if (ring->mr_state != MR_INUSE) {
1191				err = mac_start_ring(ring);
1192				if (err != 0) {
1193					mip->mi_active--;
1194					return (err);
1195				}
1196			}
1197		}
1198
1199		if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
1200			/*
1201			 * Start the default group which is responsible
1202			 * for receiving broadcast and multicast
1203			 * traffic for both primary and non-primary
1204			 * MAC clients.
1205			 */
1206			ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED);
1207			err = mac_start_group_and_rings(defgrp);
1208			if (err != 0) {
1209				mip->mi_active--;
1210				if ((ring != NULL) &&
1211				    (ring->mr_state == MR_INUSE))
1212					mac_stop_ring(ring);
1213				return (err);
1214			}
1215			mac_set_group_state(defgrp, MAC_GROUP_STATE_SHARED);
1216		}
1217	}
1218
1219	return (err);
1220}
1221
1222/*
1223 * Private GLDv3 function to stop a MAC instance.
1224 */
1225void
1226mac_stop(mac_handle_t mh)
1227{
1228	mac_impl_t	*mip = (mac_impl_t *)mh;
1229	mac_group_t	*grp;
1230
1231	ASSERT(mip->mi_stop != NULL);
1232	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1233
1234	/*
1235	 * Check whether the device is still needed.
1236	 */
1237	ASSERT(mip->mi_active != 0);
1238	if (--mip->mi_active == 0) {
1239		if ((grp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) {
1240			/*
1241			 * There should be no more active clients since the
1242			 * MAC is being stopped. Stop the default RX group
1243			 * and transition it back to registered state.
1244			 *
1245			 * When clients are torn down, the groups
1246			 * are release via mac_release_rx_group which
1247			 * knows the the default group is always in
1248			 * started mode since broadcast uses it. So
1249			 * we can assert that their are no clients
1250			 * (since mac_bcast_add doesn't register itself
1251			 * as a client) and group is in SHARED state.
1252			 */
1253			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
1254			ASSERT(MAC_GROUP_NO_CLIENT(grp) &&
1255			    mip->mi_nactiveclients == 0);
1256			mac_stop_group_and_rings(grp);
1257			mac_set_group_state(grp, MAC_GROUP_STATE_REGISTERED);
1258		}
1259
1260		if (mip->mi_default_tx_ring != NULL) {
1261			mac_ring_t *ring;
1262
1263			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1264			if (ring->mr_state == MR_INUSE) {
1265				mac_stop_ring(ring);
1266				ring->mr_flag = 0;
1267			}
1268		}
1269
1270		/*
1271		 * Stop the device.
1272		 */
1273		mip->mi_stop(mip->mi_driver);
1274	}
1275}
1276
1277int
1278i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
1279{
1280	int		err = 0;
1281
1282	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1283	ASSERT(mip->mi_setpromisc != NULL);
1284
1285	if (on) {
1286		/*
1287		 * Enable promiscuous mode on the device if not yet enabled.
1288		 */
1289		if (mip->mi_devpromisc++ == 0) {
1290			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
1291			if (err != 0) {
1292				mip->mi_devpromisc--;
1293				return (err);
1294			}
1295			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1296		}
1297	} else {
1298		if (mip->mi_devpromisc == 0)
1299			return (EPROTO);
1300
1301		/*
1302		 * Disable promiscuous mode on the device if this is the last
1303		 * enabling.
1304		 */
1305		if (--mip->mi_devpromisc == 0) {
1306			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
1307			if (err != 0) {
1308				mip->mi_devpromisc++;
1309				return (err);
1310			}
1311			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1312		}
1313	}
1314
1315	return (0);
1316}
1317
1318/*
1319 * The promiscuity state can change any time. If the caller needs to take
1320 * actions that are atomic with the promiscuity state, then the caller needs
1321 * to bracket the entire sequence with mac_perim_enter/exit
1322 */
1323boolean_t
1324mac_promisc_get(mac_handle_t mh)
1325{
1326	mac_impl_t		*mip = (mac_impl_t *)mh;
1327
1328	/*
1329	 * Return the current promiscuity.
1330	 */
1331	return (mip->mi_devpromisc != 0);
1332}
1333
1334/*
1335 * Invoked at MAC instance attach time to initialize the list
1336 * of factory MAC addresses supported by a MAC instance. This function
1337 * builds a local cache in the mac_impl_t for the MAC addresses
1338 * supported by the underlying hardware. The MAC clients themselves
1339 * use the mac_addr_factory*() functions to query and reserve
1340 * factory MAC addresses.
1341 */
1342void
1343mac_addr_factory_init(mac_impl_t *mip)
1344{
1345	mac_capab_multifactaddr_t capab;
1346	uint8_t *addr;
1347	int i;
1348
1349	/*
1350	 * First round to see how many factory MAC addresses are available.
1351	 */
1352	bzero(&capab, sizeof (capab));
1353	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
1354	    &capab) || (capab.mcm_naddr == 0)) {
1355		/*
1356		 * The MAC instance doesn't support multiple factory
1357		 * MAC addresses, we're done here.
1358		 */
1359		return;
1360	}
1361
1362	/*
1363	 * Allocate the space and get all the factory addresses.
1364	 */
1365	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
1366	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
1367
1368	mip->mi_factory_addr_num = capab.mcm_naddr;
1369	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
1370	    sizeof (mac_factory_addr_t), KM_SLEEP);
1371
1372	for (i = 0; i < capab.mcm_naddr; i++) {
1373		bcopy(addr + i * MAXMACADDRLEN,
1374		    mip->mi_factory_addr[i].mfa_addr,
1375		    mip->mi_type->mt_addr_length);
1376		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
1377	}
1378
1379	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
1380}
1381
1382void
1383mac_addr_factory_fini(mac_impl_t *mip)
1384{
1385	if (mip->mi_factory_addr == NULL) {
1386		ASSERT(mip->mi_factory_addr_num == 0);
1387		return;
1388	}
1389
1390	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1391	    sizeof (mac_factory_addr_t));
1392
1393	mip->mi_factory_addr = NULL;
1394	mip->mi_factory_addr_num = 0;
1395}
1396
1397/*
1398 * Reserve a factory MAC address. If *slot is set to -1, the function
1399 * attempts to reserve any of the available factory MAC addresses and
1400 * returns the reserved slot id. If no slots are available, the function
1401 * returns ENOSPC. If *slot is not set to -1, the function reserves
1402 * the specified slot if it is available, or returns EBUSY is the slot
1403 * is already used. Returns ENOTSUP if the underlying MAC does not
1404 * support multiple factory addresses. If the slot number is not -1 but
1405 * is invalid, returns EINVAL.
1406 */
1407int
1408mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
1409{
1410	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1411	mac_impl_t *mip = mcip->mci_mip;
1412	int i, ret = 0;
1413
1414	i_mac_perim_enter(mip);
1415	/*
1416	 * Protect against concurrent readers that may need a self-consistent
1417	 * view of the factory addresses
1418	 */
1419	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1420
1421	if (mip->mi_factory_addr_num == 0) {
1422		ret = ENOTSUP;
1423		goto bail;
1424	}
1425
1426	if (*slot != -1) {
1427		/* check the specified slot */
1428		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
1429			ret = EINVAL;
1430			goto bail;
1431		}
1432		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
1433			ret = EBUSY;
1434			goto bail;
1435		}
1436	} else {
1437		/* pick the next available slot */
1438		for (i = 0; i < mip->mi_factory_addr_num; i++) {
1439			if (!mip->mi_factory_addr[i].mfa_in_use)
1440				break;
1441		}
1442
1443		if (i == mip->mi_factory_addr_num) {
1444			ret = ENOSPC;
1445			goto bail;
1446		}
1447		*slot = i+1;
1448	}
1449
1450	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
1451	mip->mi_factory_addr[*slot-1].mfa_client = mcip;
1452
1453bail:
1454	rw_exit(&mip->mi_rw_lock);
1455	i_mac_perim_exit(mip);
1456	return (ret);
1457}
1458
1459/*
1460 * Release the specified factory MAC address slot.
1461 */
1462void
1463mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
1464{
1465	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1466	mac_impl_t *mip = mcip->mci_mip;
1467
1468	i_mac_perim_enter(mip);
1469	/*
1470	 * Protect against concurrent readers that may need a self-consistent
1471	 * view of the factory addresses
1472	 */
1473	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1474
1475	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1476	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
1477
1478	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
1479
1480	rw_exit(&mip->mi_rw_lock);
1481	i_mac_perim_exit(mip);
1482}
1483
1484/*
1485 * Stores in mac_addr the value of the specified MAC address. Returns
1486 * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1487 * The caller must provide a string of at least MAXNAMELEN bytes.
1488 */
1489void
1490mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
1491    uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
1492{
1493	mac_impl_t *mip = (mac_impl_t *)mh;
1494	boolean_t in_use;
1495
1496	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1497
1498	/*
1499	 * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
1500	 * and mi_rw_lock
1501	 */
1502	rw_enter(&mip->mi_rw_lock, RW_READER);
1503	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
1504	*addr_len = mip->mi_type->mt_addr_length;
1505	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
1506	if (in_use && client_name != NULL) {
1507		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
1508		    client_name, MAXNAMELEN);
1509	}
1510	if (in_use_arg != NULL)
1511		*in_use_arg = in_use;
1512	rw_exit(&mip->mi_rw_lock);
1513}
1514
1515/*
1516 * Returns the number of factory MAC addresses (in addition to the
1517 * primary MAC address), 0 if the underlying MAC doesn't support
1518 * that feature.
1519 */
1520uint_t
1521mac_addr_factory_num(mac_handle_t mh)
1522{
1523	mac_impl_t *mip = (mac_impl_t *)mh;
1524
1525	return (mip->mi_factory_addr_num);
1526}
1527
1528
1529void
1530mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1531{
1532	mac_ring_t	*ring;
1533
1534	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1535		ring->mr_flag &= ~flag;
1536}
1537
1538/*
1539 * The following mac_hwrings_xxx() functions are private mac client functions
1540 * used by the aggr driver to access and control the underlying HW Rx group
1541 * and rings. In this case, the aggr driver has exclusive control of the
1542 * underlying HW Rx group/rings, it calls the following functions to
1543 * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
1544 * addresses, or set up the Rx callback.
1545 */
1546/* ARGSUSED */
1547static void
1548mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1549    mblk_t *mp_chain, boolean_t loopback)
1550{
1551	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
1552	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1553	mac_direct_rx_t		proc;
1554	void			*arg1;
1555	mac_resource_handle_t	arg2;
1556
1557	proc = srs_rx->sr_func;
1558	arg1 = srs_rx->sr_arg1;
1559	arg2 = mac_srs->srs_mrh;
1560
1561	proc(arg1, arg2, mp_chain, NULL);
1562}
1563
1564/*
1565 * This function is called to get the list of HW rings that are reserved by
1566 * an exclusive mac client.
1567 *
1568 * Return value: the number of HW rings.
1569 */
1570int
1571mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1572    mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1573{
1574	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1575	flow_entry_t		*flent = mcip->mci_flent;
1576	mac_group_t		*grp;
1577	mac_ring_t		*ring;
1578	int			cnt = 0;
1579
1580	if (rtype == MAC_RING_TYPE_RX) {
1581		grp = flent->fe_rx_ring_group;
1582	} else if (rtype == MAC_RING_TYPE_TX) {
1583		grp = flent->fe_tx_ring_group;
1584	} else {
1585		ASSERT(B_FALSE);
1586		return (-1);
1587	}
1588
1589	/*
1590	 * The MAC client did not reserve an Rx group, return directly.
1591	 * This is probably because the underlying MAC does not support
1592	 * any groups.
1593	 */
1594	if (hwgh != NULL)
1595		*hwgh = NULL;
1596	if (grp == NULL)
1597		return (0);
1598	/*
1599	 * This group must be reserved by this MAC client.
1600	 */
1601	ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1602	    (mcip == MAC_GROUP_ONLY_CLIENT(grp)));
1603
1604	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
1605		ASSERT(cnt < MAX_RINGS_PER_GROUP);
1606		hwrh[cnt] = (mac_ring_handle_t)ring;
1607	}
1608	if (hwgh != NULL)
1609		*hwgh = (mac_group_handle_t)grp;
1610
1611	return (cnt);
1612}
1613
1614/*
1615 * Get the HW ring handles of the given group index. If the MAC
1616 * doesn't have a group at this index, or any groups at all, then 0 is
1617 * returned and hwgh is set to NULL. This is a private client API. The
1618 * MAC perimeter must be held when calling this function.
1619 *
1620 * mh: A handle to the MAC that owns the group.
1621 *
1622 * idx: The index of the HW group to be read.
1623 *
1624 * hwgh: If non-NULL, contains a handle to the HW group on return.
1625 *
1626 * hwrh: An array of ring handles pointing to the HW rings in the
1627 * group. The array must be large enough to hold a handle to each ring
1628 * in the group. To be safe, this array should be of size MAX_RINGS_PER_GROUP.
1629 *
1630 * rtype: Used to determine if we are fetching Rx or Tx rings.
1631 *
1632 * Returns the number of rings in the group.
1633 */
1634uint_t
1635mac_hwrings_idx_get(mac_handle_t mh, uint_t idx, mac_group_handle_t *hwgh,
1636    mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1637{
1638	mac_impl_t		*mip = (mac_impl_t *)mh;
1639	mac_group_t		*grp;
1640	mac_ring_t		*ring;
1641	uint_t			cnt = 0;
1642
1643	/*
1644	 * The MAC perimeter must be held when accessing the
1645	 * mi_{rx,tx}_groups fields.
1646	 */
1647	ASSERT(MAC_PERIM_HELD(mh));
1648	ASSERT(rtype == MAC_RING_TYPE_RX || rtype == MAC_RING_TYPE_TX);
1649
1650	if (rtype == MAC_RING_TYPE_RX) {
1651		grp = mip->mi_rx_groups;
1652	} else {
1653		ASSERT(rtype == MAC_RING_TYPE_TX);
1654		grp = mip->mi_tx_groups;
1655	}
1656
1657	while (grp != NULL && grp->mrg_index != idx)
1658		grp = grp->mrg_next;
1659
1660	/*
1661	 * If the MAC doesn't have a group at this index or doesn't
1662	 * impelement RINGS capab, then set hwgh to NULL and return 0.
1663	 */
1664	if (hwgh != NULL)
1665		*hwgh = NULL;
1666
1667	if (grp == NULL)
1668		return (0);
1669
1670	ASSERT3U(idx, ==, grp->mrg_index);
1671
1672	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) {
1673		ASSERT3U(cnt, <, MAX_RINGS_PER_GROUP);
1674		hwrh[cnt] = (mac_ring_handle_t)ring;
1675	}
1676
1677	/* A group should always have at least one ring. */
1678	ASSERT3U(cnt, >, 0);
1679
1680	if (hwgh != NULL)
1681		*hwgh = (mac_group_handle_t)grp;
1682
1683	return (cnt);
1684}
1685
1686/*
1687 * This function is called to get info about Tx/Rx rings.
1688 *
1689 * Return value: returns uint_t which will have various bits set
1690 * that indicates different properties of the ring.
1691 */
1692uint_t
1693mac_hwring_getinfo(mac_ring_handle_t rh)
1694{
1695	mac_ring_t *ring = (mac_ring_t *)rh;
1696	mac_ring_info_t *info = &ring->mr_info;
1697
1698	return (info->mri_flags);
1699}
1700
1701/*
1702 * Set the passthru callback on the hardware ring.
1703 */
1704void
1705mac_hwring_set_passthru(mac_ring_handle_t hwrh, mac_rx_t fn, void *arg1,
1706    mac_resource_handle_t arg2)
1707{
1708	mac_ring_t *hwring = (mac_ring_t *)hwrh;
1709
1710	ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX);
1711
1712	hwring->mr_classify_type = MAC_PASSTHRU_CLASSIFIER;
1713
1714	hwring->mr_pt_fn = fn;
1715	hwring->mr_pt_arg1 = arg1;
1716	hwring->mr_pt_arg2 = arg2;
1717}
1718
1719/*
1720 * Clear the passthru callback on the hardware ring.
1721 */
1722void
1723mac_hwring_clear_passthru(mac_ring_handle_t hwrh)
1724{
1725	mac_ring_t *hwring = (mac_ring_t *)hwrh;
1726
1727	ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX);
1728
1729	hwring->mr_classify_type = MAC_NO_CLASSIFIER;
1730
1731	hwring->mr_pt_fn = NULL;
1732	hwring->mr_pt_arg1 = NULL;
1733	hwring->mr_pt_arg2 = NULL;
1734}
1735
1736void
1737mac_client_set_flow_cb(mac_client_handle_t mch, mac_rx_t func, void *arg1)
1738{
1739	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1740	flow_entry_t		*flent = mcip->mci_flent;
1741
1742	mutex_enter(&flent->fe_lock);
1743	flent->fe_cb_fn = (flow_fn_t)func;
1744	flent->fe_cb_arg1 = arg1;
1745	flent->fe_cb_arg2 = NULL;
1746	flent->fe_flags &= ~FE_MC_NO_DATAPATH;
1747	mutex_exit(&flent->fe_lock);
1748}
1749
1750void
1751mac_client_clear_flow_cb(mac_client_handle_t mch)
1752{
1753	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1754	flow_entry_t		*flent = mcip->mci_flent;
1755
1756	mutex_enter(&flent->fe_lock);
1757	flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
1758	flent->fe_cb_arg1 = NULL;
1759	flent->fe_cb_arg2 = NULL;
1760	flent->fe_flags |= FE_MC_NO_DATAPATH;
1761	mutex_exit(&flent->fe_lock);
1762}
1763
1764/*
1765 * Export ddi interrupt handles from the HW ring to the pseudo ring and
1766 * setup the RX callback of the mac client which exclusively controls
1767 * HW ring.
1768 */
1769void
1770mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh,
1771    mac_ring_handle_t pseudo_rh)
1772{
1773	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1774	mac_ring_t		*pseudo_ring;
1775	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1776
1777	if (pseudo_rh != NULL) {
1778		pseudo_ring = (mac_ring_t *)pseudo_rh;
1779		/* Export the ddi handles to pseudo ring */
1780		pseudo_ring->mr_info.mri_intr.mi_ddi_handle =
1781		    hw_ring->mr_info.mri_intr.mi_ddi_handle;
1782		pseudo_ring->mr_info.mri_intr.mi_ddi_shared =
1783		    hw_ring->mr_info.mri_intr.mi_ddi_shared;
1784		/*
1785		 * Save a pointer to pseudo ring in the hw ring. If
1786		 * interrupt handle changes, the hw ring will be
1787		 * notified of the change (see mac_ring_intr_set())
1788		 * and the appropriate change has to be made to
1789		 * the pseudo ring that has exported the ddi handle.
1790		 */
1791		hw_ring->mr_prh = pseudo_rh;
1792	}
1793
1794	if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1795		ASSERT(!(mac_srs->srs_type & SRST_TX));
1796		mac_srs->srs_mrh = prh;
1797		mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1798	}
1799}
1800
1801void
1802mac_hwring_teardown(mac_ring_handle_t hwrh)
1803{
1804	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1805	mac_soft_ring_set_t	*mac_srs;
1806
1807	if (hw_ring == NULL)
1808		return;
1809	hw_ring->mr_prh = NULL;
1810	if (hw_ring->mr_type == MAC_RING_TYPE_RX) {
1811		mac_srs = hw_ring->mr_srs;
1812		ASSERT(!(mac_srs->srs_type & SRST_TX));
1813		mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1814		mac_srs->srs_mrh = NULL;
1815	}
1816}
1817
1818int
1819mac_hwring_disable_intr(mac_ring_handle_t rh)
1820{
1821	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1822	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1823
1824	return (intr->mi_disable(intr->mi_handle));
1825}
1826
1827int
1828mac_hwring_enable_intr(mac_ring_handle_t rh)
1829{
1830	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1831	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1832
1833	return (intr->mi_enable(intr->mi_handle));
1834}
1835
1836/*
1837 * Start the HW ring pointed to by rh.
1838 *
1839 * This is used by special MAC clients that are MAC themselves and
1840 * need to exert control over the underlying HW rings of the NIC.
1841 */
1842int
1843mac_hwring_start(mac_ring_handle_t rh)
1844{
1845	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1846	int rv = 0;
1847
1848	if (rr_ring->mr_state != MR_INUSE)
1849		rv = mac_start_ring(rr_ring);
1850
1851	return (rv);
1852}
1853
1854/*
1855 * Stop the HW ring pointed to by rh. Also see mac_hwring_start().
1856 */
1857void
1858mac_hwring_stop(mac_ring_handle_t rh)
1859{
1860	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1861
1862	if (rr_ring->mr_state != MR_FREE)
1863		mac_stop_ring(rr_ring);
1864}
1865
1866/*
1867 * Remove the quiesced flag from the HW ring pointed to by rh.
1868 *
1869 * This is used by special MAC clients that are MAC themselves and
1870 * need to exert control over the underlying HW rings of the NIC.
1871 */
1872int
1873mac_hwring_activate(mac_ring_handle_t rh)
1874{
1875	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1876
1877	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1878	return (0);
1879}
1880
1881/*
1882 * Quiesce the HW ring pointed to by rh. Also see mac_hwring_activate().
1883 */
1884void
1885mac_hwring_quiesce(mac_ring_handle_t rh)
1886{
1887	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1888
1889	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1890}
1891
1892mblk_t *
1893mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1894{
1895	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1896	mac_ring_info_t *info = &rr_ring->mr_info;
1897
1898	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1899}
1900
1901/*
1902 * Send packets through a selected tx ring.
1903 */
1904mblk_t *
1905mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1906{
1907	mac_ring_t *ring = (mac_ring_t *)rh;
1908	mac_ring_info_t *info = &ring->mr_info;
1909
1910	ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
1911	    ring->mr_state >= MR_INUSE);
1912	return (info->mri_tx(info->mri_driver, mp));
1913}
1914
1915/*
1916 * Query stats for a particular rx/tx ring
1917 */
1918int
1919mac_hwring_getstat(mac_ring_handle_t rh, uint_t stat, uint64_t *val)
1920{
1921	mac_ring_t	*ring = (mac_ring_t *)rh;
1922	mac_ring_info_t *info = &ring->mr_info;
1923
1924	return (info->mri_stat(info->mri_driver, stat, val));
1925}
1926
1927/*
1928 * Private function that is only used by aggr to send packets through
1929 * a port/Tx ring. Since aggr exposes a pseudo Tx ring even for ports
1930 * that does not expose Tx rings, aggr_ring_tx() entry point needs
1931 * access to mac_impl_t to send packets through m_tx() entry point.
1932 * It accomplishes this by calling mac_hwring_send_priv() function.
1933 */
1934mblk_t *
1935mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp)
1936{
1937	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1938	mac_impl_t *mip = mcip->mci_mip;
1939
1940	return (mac_provider_tx(mip, rh, mp, mcip));
1941}
1942
1943/*
1944 * Private function that is only used by aggr to update the default transmission
1945 * ring. Because aggr exposes a pseudo Tx ring even for ports that may
1946 * temporarily be down, it may need to update the default ring that is used by
1947 * MAC such that it refers to a link that can actively be used to send traffic.
1948 * Note that this is different from the case where the port has been removed
1949 * from the group. In those cases, all of the rings will be torn down because
1950 * the ring will no longer exist. It's important to give aggr a case where the
1951 * rings can still exist such that it may be able to continue to send LACP PDUs
1952 * to potentially restore the link.
1953 */
1954void
1955mac_hwring_set_default(mac_handle_t mh, mac_ring_handle_t rh)
1956{
1957	mac_impl_t *mip = (mac_impl_t *)mh;
1958	mac_ring_t *ring = (mac_ring_t *)rh;
1959
1960	ASSERT(MAC_PERIM_HELD(mh));
1961	VERIFY(mip->mi_state_flags & MIS_IS_AGGR);
1962
1963	/*
1964	 * We used to condition this assignment on the ring's
1965	 * 'mr_state' being one of 'MR_INUSE'. However, there are
1966	 * cases where this is called before the ring has any active
1967	 * clients, and therefore is not marked as in use. Since the
1968	 * sole purpose of this function is for aggr to make sure
1969	 * 'mi_default_tx_ring' matches 'lg_tx_ports[0]', its
1970	 * imperative that we update its value regardless of ring
1971	 * state. Otherwise, we can end up in a state where
1972	 * 'mi_default_tx_ring' points to a pseudo ring of a downed
1973	 * port, even when 'lg_tx_ports[0]' points to a port that is
1974	 * up.
1975	 */
1976	mip->mi_default_tx_ring = rh;
1977}
1978
1979int
1980mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1981{
1982	mac_group_t *group = (mac_group_t *)gh;
1983
1984	return (mac_group_addmac(group, addr));
1985}
1986
1987int
1988mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1989{
1990	mac_group_t *group = (mac_group_t *)gh;
1991
1992	return (mac_group_remmac(group, addr));
1993}
1994
1995/*
1996 * Program the group's HW VLAN filter if it has such support.
1997 * Otherwise, the group will implicitly accept tagged traffic and
1998 * there is nothing to do.
1999 */
2000int
2001mac_hwgroup_addvlan(mac_group_handle_t gh, uint16_t vid)
2002{
2003	mac_group_t *group = (mac_group_t *)gh;
2004
2005	if (!MAC_GROUP_HW_VLAN(group))
2006		return (0);
2007
2008	return (mac_group_addvlan(group, vid));
2009}
2010
2011int
2012mac_hwgroup_remvlan(mac_group_handle_t gh, uint16_t vid)
2013{
2014	mac_group_t *group = (mac_group_t *)gh;
2015
2016	if (!MAC_GROUP_HW_VLAN(group))
2017		return (0);
2018
2019	return (mac_group_remvlan(group, vid));
2020}
2021
2022/*
2023 * Determine if a MAC has HW VLAN support. This is a private API
2024 * consumed by aggr. In the future it might be nice to have a bitfield
2025 * in mac_capab_rings_t to track which forms of HW filtering are
2026 * supported by the MAC.
2027 */
2028boolean_t
2029mac_has_hw_vlan(mac_handle_t mh)
2030{
2031	mac_impl_t *mip = (mac_impl_t *)mh;
2032
2033	return (MAC_GROUP_HW_VLAN(mip->mi_rx_groups));
2034}
2035
2036/*
2037 * Get the number of Rx HW groups on this MAC.
2038 */
2039uint_t
2040mac_get_num_rx_groups(mac_handle_t mh)
2041{
2042	mac_impl_t *mip = (mac_impl_t *)mh;
2043
2044	ASSERT(MAC_PERIM_HELD(mh));
2045	return (mip->mi_rx_group_count);
2046}
2047
2048int
2049mac_set_promisc(mac_handle_t mh, boolean_t value)
2050{
2051	mac_impl_t *mip = (mac_impl_t *)mh;
2052
2053	ASSERT(MAC_PERIM_HELD(mh));
2054	return (i_mac_promisc_set(mip, value));
2055}
2056
2057/*
2058 * Set the RX group to be shared/reserved. Note that the group must be
2059 * started/stopped outside of this function.
2060 */
2061void
2062mac_set_group_state(mac_group_t *grp, mac_group_state_t state)
2063{
2064	/*
2065	 * If there is no change in the group state, just return.
2066	 */
2067	if (grp->mrg_state == state)
2068		return;
2069
2070	switch (state) {
2071	case MAC_GROUP_STATE_RESERVED:
2072		/*
2073		 * Successfully reserved the group.
2074		 *
2075		 * Given that there is an exclusive client controlling this
2076		 * group, we enable the group level polling when available,
2077		 * so that SRSs get to turn on/off individual rings they's
2078		 * assigned to.
2079		 */
2080		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
2081
2082		if (grp->mrg_type == MAC_RING_TYPE_RX &&
2083		    GROUP_INTR_DISABLE_FUNC(grp) != NULL) {
2084			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
2085		}
2086		break;
2087
2088	case MAC_GROUP_STATE_SHARED:
2089		/*
2090		 * Set all rings of this group to software classified.
2091		 * If the group has an overriding interrupt, then re-enable it.
2092		 */
2093		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
2094
2095		if (grp->mrg_type == MAC_RING_TYPE_RX &&
2096		    GROUP_INTR_ENABLE_FUNC(grp) != NULL) {
2097			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
2098		}
2099		/* The ring is not available for reservations any more */
2100		break;
2101
2102	case MAC_GROUP_STATE_REGISTERED:
2103		/* Also callable from mac_register, perim is not held */
2104		break;
2105
2106	default:
2107		ASSERT(B_FALSE);
2108		break;
2109	}
2110
2111	grp->mrg_state = state;
2112}
2113
2114/*
2115 * Quiesce future hardware classified packets for the specified Rx ring
2116 */
2117static void
2118mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
2119{
2120	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
2121	ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
2122
2123	mutex_enter(&rx_ring->mr_lock);
2124	rx_ring->mr_flag |= ring_flag;
2125	while (rx_ring->mr_refcnt != 0)
2126		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
2127	mutex_exit(&rx_ring->mr_lock);
2128}
2129
2130/*
2131 * Please see mac_tx for details about the per cpu locking scheme
2132 */
2133static void
2134mac_tx_lock_all(mac_client_impl_t *mcip)
2135{
2136	int	i;
2137
2138	for (i = 0; i <= mac_tx_percpu_cnt; i++)
2139		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
2140}
2141
2142static void
2143mac_tx_unlock_all(mac_client_impl_t *mcip)
2144{
2145	int	i;
2146
2147	for (i = mac_tx_percpu_cnt; i >= 0; i--)
2148		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
2149}
2150
2151static void
2152mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
2153{
2154	int	i;
2155
2156	for (i = mac_tx_percpu_cnt; i > 0; i--)
2157		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
2158}
2159
2160static int
2161mac_tx_sum_refcnt(mac_client_impl_t *mcip)
2162{
2163	int	i;
2164	int	refcnt = 0;
2165
2166	for (i = 0; i <= mac_tx_percpu_cnt; i++)
2167		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
2168
2169	return (refcnt);
2170}
2171
2172/*
2173 * Stop future Tx packets coming down from the client in preparation for
2174 * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
2175 * of rings between clients
2176 */
2177void
2178mac_tx_client_block(mac_client_impl_t *mcip)
2179{
2180	mac_tx_lock_all(mcip);
2181	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
2182	while (mac_tx_sum_refcnt(mcip) != 0) {
2183		mac_tx_unlock_allbutzero(mcip);
2184		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
2185		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
2186		mac_tx_lock_all(mcip);
2187	}
2188	mac_tx_unlock_all(mcip);
2189}
2190
2191void
2192mac_tx_client_unblock(mac_client_impl_t *mcip)
2193{
2194	mac_tx_lock_all(mcip);
2195	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
2196	mac_tx_unlock_all(mcip);
2197	/*
2198	 * We may fail to disable flow control for the last MAC_NOTE_TX
2199	 * notification because the MAC client is quiesced. Send the
2200	 * notification again.
2201	 */
2202	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
2203}
2204
2205/*
2206 * Wait for an SRS to quiesce. The SRS worker will signal us when the
2207 * quiesce is done.
2208 */
2209static void
2210mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
2211{
2212	mutex_enter(&srs->srs_lock);
2213	while (!(srs->srs_state & srs_flag))
2214		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
2215	mutex_exit(&srs->srs_lock);
2216}
2217
2218/*
2219 * Quiescing an Rx SRS is achieved by the following sequence. The protocol
2220 * works bottom up by cutting off packet flow from the bottommost point in the
2221 * mac, then the SRS, and then the soft rings. There are 2 use cases of this
2222 * mechanism. One is a temporary quiesce of the SRS, such as say while changing
2223 * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
2224 * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
2225 * for the SRS and MR flags. In the former case the threads pause waiting for
2226 * a restart, while in the latter case the threads exit. The Tx SRS teardown
2227 * is also mostly similar to the above.
2228 *
2229 * 1. Stop future hardware classified packets at the lowest level in the mac.
2230 *    Remove any hardware classification rule (CONDEMNED case) and mark the
2231 *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
2232 *    from increasing. Upcalls from the driver that come through hardware
2233 *    classification will be dropped in mac_rx from now on. Then we wait for
2234 *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
2235 *    sure there aren't any upcall threads from the driver through hardware
2236 *    classification. In the case of SRS teardown we also remove the
2237 *    classification rule in the driver.
2238 *
2239 * 2. Stop future software classified packets by marking the flow entry with
2240 *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
2241 *    increasing. We also remove the flow entry from the table in the latter
2242 *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
2243 *    that indicates there aren't any active threads using that flow entry.
2244 *
2245 * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
2246 *    SRS worker thread, and the soft ring threads are quiesced in sequence
2247 *    with the SRS worker thread serving as a master controller. This
2248 *    mechansim is explained in mac_srs_worker_quiesce().
2249 *
2250 * The restart mechanism to reactivate the SRS and softrings is explained
2251 * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
2252 * restart sequence.
2253 */
2254void
2255mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2256{
2257	flow_entry_t	*flent = srs->srs_flent;
2258	uint_t	mr_flag, srs_done_flag;
2259
2260	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
2261	ASSERT(!(srs->srs_type & SRST_TX));
2262
2263	if (srs_quiesce_flag == SRS_CONDEMNED) {
2264		mr_flag = MR_CONDEMNED;
2265		srs_done_flag = SRS_CONDEMNED_DONE;
2266		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
2267			mac_srs_client_poll_disable(srs->srs_mcip, srs);
2268	} else {
2269		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
2270		mr_flag = MR_QUIESCE;
2271		srs_done_flag = SRS_QUIESCE_DONE;
2272		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
2273			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
2274	}
2275
2276	if (srs->srs_ring != NULL) {
2277		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
2278	} else {
2279		/*
2280		 * SRS is driven by software classification. In case
2281		 * of CONDEMNED, the top level teardown functions will
2282		 * deal with flow removal.
2283		 */
2284		if (srs_quiesce_flag != SRS_CONDEMNED) {
2285			FLOW_MARK(flent, FE_QUIESCE);
2286			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
2287		}
2288	}
2289
2290	/*
2291	 * Signal the SRS to quiesce itself, and then cv_wait for the
2292	 * SRS quiesce to complete. The SRS worker thread will wake us
2293	 * up when the quiesce is complete
2294	 */
2295	mac_srs_signal(srs, srs_quiesce_flag);
2296	mac_srs_quiesce_wait(srs, srs_done_flag);
2297}
2298
2299/*
2300 * Remove an SRS.
2301 */
2302void
2303mac_rx_srs_remove(mac_soft_ring_set_t *srs)
2304{
2305	flow_entry_t *flent = srs->srs_flent;
2306	int i;
2307
2308	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
2309	/*
2310	 * Locate and remove our entry in the fe_rx_srs[] array, and
2311	 * adjust the fe_rx_srs array entries and array count by
2312	 * moving the last entry into the vacated spot.
2313	 */
2314	mutex_enter(&flent->fe_lock);
2315	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2316		if (flent->fe_rx_srs[i] == srs)
2317			break;
2318	}
2319
2320	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
2321	if (i != flent->fe_rx_srs_cnt - 1) {
2322		flent->fe_rx_srs[i] =
2323		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
2324		i = flent->fe_rx_srs_cnt - 1;
2325	}
2326
2327	flent->fe_rx_srs[i] = NULL;
2328	flent->fe_rx_srs_cnt--;
2329	mutex_exit(&flent->fe_lock);
2330
2331	mac_srs_free(srs);
2332}
2333
2334static void
2335mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
2336{
2337	mutex_enter(&srs->srs_lock);
2338	srs->srs_state &= ~flag;
2339	mutex_exit(&srs->srs_lock);
2340}
2341
2342void
2343mac_rx_srs_restart(mac_soft_ring_set_t *srs)
2344{
2345	flow_entry_t	*flent = srs->srs_flent;
2346	mac_ring_t	*mr;
2347
2348	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
2349	ASSERT((srs->srs_type & SRST_TX) == 0);
2350
2351	/*
2352	 * This handles a change in the number of SRSs between the quiesce and
2353	 * and restart operation of a flow.
2354	 */
2355	if (!SRS_QUIESCED(srs))
2356		return;
2357
2358	/*
2359	 * Signal the SRS to restart itself. Wait for the restart to complete
2360	 * Note that we only restart the SRS if it is not marked as
2361	 * permanently quiesced.
2362	 */
2363	if (!SRS_QUIESCED_PERMANENT(srs)) {
2364		mac_srs_signal(srs, SRS_RESTART);
2365		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2366		mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2367
2368		mac_srs_client_poll_restart(srs->srs_mcip, srs);
2369	}
2370
2371	/* Finally clear the flags to let the packets in */
2372	mr = srs->srs_ring;
2373	if (mr != NULL) {
2374		MAC_RING_UNMARK(mr, MR_QUIESCE);
2375		/* In case the ring was stopped, safely restart it */
2376		if (mr->mr_state != MR_INUSE)
2377			(void) mac_start_ring(mr);
2378	} else {
2379		FLOW_UNMARK(flent, FE_QUIESCE);
2380	}
2381}
2382
2383/*
2384 * Temporary quiesce of a flow and associated Rx SRS.
2385 * Please see block comment above mac_rx_classify_flow_rem.
2386 */
2387/* ARGSUSED */
2388int
2389mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
2390{
2391	int		i;
2392
2393	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2394		mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
2395		    SRS_QUIESCE);
2396	}
2397	return (0);
2398}
2399
2400/*
2401 * Restart a flow and associated Rx SRS that has been quiesced temporarily
2402 * Please see block comment above mac_rx_classify_flow_rem
2403 */
2404/* ARGSUSED */
2405int
2406mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
2407{
2408	int		i;
2409
2410	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
2411		mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
2412
2413	return (0);
2414}
2415
2416void
2417mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
2418{
2419	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2420	flow_entry_t		*flent = mcip->mci_flent;
2421	mac_impl_t		*mip = mcip->mci_mip;
2422	mac_soft_ring_set_t	*mac_srs;
2423	int			i;
2424
2425	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2426
2427	if (flent == NULL)
2428		return;
2429
2430	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
2431		mac_srs = flent->fe_rx_srs[i];
2432		mutex_enter(&mac_srs->srs_lock);
2433		if (on)
2434			mac_srs->srs_state |= SRS_QUIESCE_PERM;
2435		else
2436			mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
2437		mutex_exit(&mac_srs->srs_lock);
2438	}
2439}
2440
2441void
2442mac_rx_client_quiesce(mac_client_handle_t mch)
2443{
2444	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2445	mac_impl_t		*mip = mcip->mci_mip;
2446
2447	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2448
2449	if (MCIP_DATAPATH_SETUP(mcip)) {
2450		(void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
2451		    NULL);
2452		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2453		    mac_rx_classify_flow_quiesce, NULL);
2454	}
2455}
2456
2457void
2458mac_rx_client_restart(mac_client_handle_t mch)
2459{
2460	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2461	mac_impl_t		*mip = mcip->mci_mip;
2462
2463	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
2464
2465	if (MCIP_DATAPATH_SETUP(mcip)) {
2466		(void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
2467		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2468		    mac_rx_classify_flow_restart, NULL);
2469	}
2470}
2471
2472/*
2473 * This function only quiesces the Tx SRS and softring worker threads. Callers
2474 * need to make sure that there aren't any mac client threads doing current or
2475 * future transmits in the mac before calling this function.
2476 */
2477void
2478mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2479{
2480	mac_client_impl_t	*mcip = srs->srs_mcip;
2481
2482	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2483
2484	ASSERT(srs->srs_type & SRST_TX);
2485	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2486	    srs_quiesce_flag == SRS_QUIESCE);
2487
2488	/*
2489	 * Signal the SRS to quiesce itself, and then cv_wait for the
2490	 * SRS quiesce to complete. The SRS worker thread will wake us
2491	 * up when the quiesce is complete
2492	 */
2493	mac_srs_signal(srs, srs_quiesce_flag);
2494	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2495	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2496}
2497
2498void
2499mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2500{
2501	/*
2502	 * Resizing the fanout could result in creation of new SRSs.
2503	 * They may not necessarily be in the quiesced state in which
2504	 * case it need be restarted
2505	 */
2506	if (!SRS_QUIESCED(srs))
2507		return;
2508
2509	mac_srs_signal(srs, SRS_RESTART);
2510	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2511	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2512}
2513
2514/*
2515 * Temporary quiesce of a flow and associated Rx SRS.
2516 * Please see block comment above mac_rx_srs_quiesce
2517 */
2518/* ARGSUSED */
2519int
2520mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2521{
2522	/*
2523	 * The fe_tx_srs is null for a subflow on an interface that is
2524	 * not plumbed
2525	 */
2526	if (flent->fe_tx_srs != NULL)
2527		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2528	return (0);
2529}
2530
2531/* ARGSUSED */
2532int
2533mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2534{
2535	/*
2536	 * The fe_tx_srs is null for a subflow on an interface that is
2537	 * not plumbed
2538	 */
2539	if (flent->fe_tx_srs != NULL)
2540		mac_tx_srs_restart(flent->fe_tx_srs);
2541	return (0);
2542}
2543
2544static void
2545i_mac_tx_client_quiesce(mac_client_handle_t mch, uint_t srs_quiesce_flag)
2546{
2547	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
2548
2549	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2550
2551	mac_tx_client_block(mcip);
2552	if (MCIP_TX_SRS(mcip) != NULL) {
2553		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2554		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2555		    mac_tx_flow_quiesce, NULL);
2556	}
2557}
2558
2559void
2560mac_tx_client_quiesce(mac_client_handle_t mch)
2561{
2562	i_mac_tx_client_quiesce(mch, SRS_QUIESCE);
2563}
2564
2565void
2566mac_tx_client_condemn(mac_client_handle_t mch)
2567{
2568	i_mac_tx_client_quiesce(mch, SRS_CONDEMNED);
2569}
2570
2571void
2572mac_tx_client_restart(mac_client_handle_t mch)
2573{
2574	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
2575
2576	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2577
2578	mac_tx_client_unblock(mcip);
2579	if (MCIP_TX_SRS(mcip) != NULL) {
2580		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2581		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2582		    mac_tx_flow_restart, NULL);
2583	}
2584}
2585
2586void
2587mac_tx_client_flush(mac_client_impl_t *mcip)
2588{
2589	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2590
2591	mac_tx_client_quiesce((mac_client_handle_t)mcip);
2592	mac_tx_client_restart((mac_client_handle_t)mcip);
2593}
2594
2595void
2596mac_client_quiesce(mac_client_impl_t *mcip)
2597{
2598	mac_rx_client_quiesce((mac_client_handle_t)mcip);
2599	mac_tx_client_quiesce((mac_client_handle_t)mcip);
2600}
2601
2602void
2603mac_client_restart(mac_client_impl_t *mcip)
2604{
2605	mac_rx_client_restart((mac_client_handle_t)mcip);
2606	mac_tx_client_restart((mac_client_handle_t)mcip);
2607}
2608
2609/*
2610 * Allocate a minor number.
2611 */
2612minor_t
2613mac_minor_hold(boolean_t sleep)
2614{
2615	id_t id;
2616
2617	/*
2618	 * Grab a value from the arena.
2619	 */
2620	atomic_inc_32(&minor_count);
2621
2622	if (sleep)
2623		return ((uint_t)id_alloc(minor_ids));
2624
2625	if ((id = id_alloc_nosleep(minor_ids)) == -1) {
2626		atomic_dec_32(&minor_count);
2627		return (0);
2628	}
2629
2630	return ((uint_t)id);
2631}
2632
2633/*
2634 * Release a previously allocated minor number.
2635 */
2636void
2637mac_minor_rele(minor_t minor)
2638{
2639	/*
2640	 * Return the value to the arena.
2641	 */
2642	id_free(minor_ids, minor);
2643	atomic_dec_32(&minor_count);
2644}
2645
2646uint32_t
2647mac_no_notification(mac_handle_t mh)
2648{
2649	mac_impl_t *mip = (mac_impl_t *)mh;
2650
2651	return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2652	    mip->mi_capab_legacy.ml_unsup_note : 0);
2653}
2654
2655/*
2656 * Prevent any new opens of this mac in preparation for unregister
2657 */
2658int
2659i_mac_disable(mac_impl_t *mip)
2660{
2661	mac_client_impl_t	*mcip;
2662
2663	rw_enter(&i_mac_impl_lock, RW_WRITER);
2664	if (mip->mi_state_flags & MIS_DISABLED) {
2665		/* Already disabled, return success */
2666		rw_exit(&i_mac_impl_lock);
2667		return (0);
2668	}
2669	/*
2670	 * See if there are any other references to this mac_t (e.g., VLAN's).
2671	 * If so return failure. If all the other checks below pass, then
2672	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
2673	 * any new VLAN's from being created or new mac client opens of this
2674	 * mac end point.
2675	 */
2676	if (mip->mi_ref > 0) {
2677		rw_exit(&i_mac_impl_lock);
2678		return (EBUSY);
2679	}
2680
2681	/*
2682	 * mac clients must delete all multicast groups they join before
2683	 * closing. bcast groups are reference counted, the last client
2684	 * to delete the group will wait till the group is physically
2685	 * deleted. Since all clients have closed this mac end point
2686	 * mi_bcast_ngrps must be zero at this point
2687	 */
2688	ASSERT(mip->mi_bcast_ngrps == 0);
2689
2690	/*
2691	 * Don't let go of this if it has some flows.
2692	 * All other code guarantees no flows are added to a disabled
2693	 * mac, therefore it is sufficient to check for the flow table
2694	 * only here.
2695	 */
2696	mcip = mac_primary_client_handle(mip);
2697	if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2698		rw_exit(&i_mac_impl_lock);
2699		return (ENOTEMPTY);
2700	}
2701
2702	mip->mi_state_flags |= MIS_DISABLED;
2703	rw_exit(&i_mac_impl_lock);
2704	return (0);
2705}
2706
2707int
2708mac_disable_nowait(mac_handle_t mh)
2709{
2710	mac_impl_t	*mip = (mac_impl_t *)mh;
2711	int err;
2712
2713	if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2714		return (err);
2715	err = i_mac_disable(mip);
2716	i_mac_perim_exit(mip);
2717	return (err);
2718}
2719
2720int
2721mac_disable(mac_handle_t mh)
2722{
2723	mac_impl_t	*mip = (mac_impl_t *)mh;
2724	int err;
2725
2726	i_mac_perim_enter(mip);
2727	err = i_mac_disable(mip);
2728	i_mac_perim_exit(mip);
2729
2730	/*
2731	 * Clean up notification thread and wait for it to exit.
2732	 */
2733	if (err == 0)
2734		i_mac_notify_exit(mip);
2735
2736	return (err);
2737}
2738
2739/*
2740 * Called when the MAC instance has a non empty flow table, to de-multiplex
2741 * incoming packets to the right flow.
2742 */
2743/* ARGSUSED */
2744static mblk_t *
2745mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2746{
2747	flow_entry_t	*flent = NULL;
2748	uint_t		flags = FLOW_INBOUND;
2749	int		err;
2750
2751	err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2752	if (err != 0) {
2753		/* no registered receive function */
2754		return (mp);
2755	} else {
2756		mac_client_impl_t	*mcip;
2757
2758		/*
2759		 * This flent might just be an additional one on the MAC client,
2760		 * i.e. for classification purposes (different fdesc), however
2761		 * the resources, SRS et. al., are in the mci_flent, so if
2762		 * this isn't the mci_flent, we need to get it.
2763		 */
2764		if ((mcip = flent->fe_mcip) != NULL &&
2765		    mcip->mci_flent != flent) {
2766			FLOW_REFRELE(flent);
2767			flent = mcip->mci_flent;
2768			FLOW_TRY_REFHOLD(flent, err);
2769			if (err != 0)
2770				return (mp);
2771		}
2772		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2773		    B_FALSE);
2774		FLOW_REFRELE(flent);
2775	}
2776	return (NULL);
2777}
2778
2779mblk_t *
2780mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2781{
2782	mac_impl_t	*mip = (mac_impl_t *)mh;
2783	mblk_t		*bp, *bp1, **bpp, *list = NULL;
2784
2785	/*
2786	 * We walk the chain and attempt to classify each packet.
2787	 * The packets that couldn't be classified will be returned
2788	 * back to the caller.
2789	 */
2790	bp = mp_chain;
2791	bpp = &list;
2792	while (bp != NULL) {
2793		bp1 = bp;
2794		bp = bp->b_next;
2795		bp1->b_next = NULL;
2796
2797		if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2798			*bpp = bp1;
2799			bpp = &bp1->b_next;
2800		}
2801	}
2802	return (list);
2803}
2804
2805static int
2806mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2807{
2808	mac_ring_handle_t ring = arg;
2809
2810	if (flent->fe_tx_srs)
2811		mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2812	return (0);
2813}
2814
2815void
2816i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2817{
2818	mac_client_impl_t	*cclient;
2819	mac_soft_ring_set_t	*mac_srs;
2820
2821	/*
2822	 * After grabbing the mi_rw_lock, the list of clients can't change.
2823	 * If there are any clients mi_disabled must be B_FALSE and can't
2824	 * get set since there are clients. If there aren't any clients we
2825	 * don't do anything. In any case the mip has to be valid. The driver
2826	 * must make sure that it goes single threaded (with respect to mac
2827	 * calls) and wait for all pending mac calls to finish before calling
2828	 * mac_unregister.
2829	 */
2830	rw_enter(&i_mac_impl_lock, RW_READER);
2831	if (mip->mi_state_flags & MIS_DISABLED) {
2832		rw_exit(&i_mac_impl_lock);
2833		return;
2834	}
2835
2836	/*
2837	 * Get MAC tx srs from walking mac_client_handle list.
2838	 */
2839	rw_enter(&mip->mi_rw_lock, RW_READER);
2840	for (cclient = mip->mi_clients_list; cclient != NULL;
2841	    cclient = cclient->mci_client_next) {
2842		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) {
2843			mac_tx_srs_wakeup(mac_srs, ring);
2844		} else {
2845			/*
2846			 * Aggr opens underlying ports in exclusive mode
2847			 * and registers flow control callbacks using
2848			 * mac_tx_client_notify(). When opened in
2849			 * exclusive mode, Tx SRS won't be created
2850			 * during mac_unicast_add().
2851			 */
2852			if (cclient->mci_state_flags & MCIS_EXCLUSIVE) {
2853				mac_tx_invoke_callbacks(cclient,
2854				    (mac_tx_cookie_t)ring);
2855			}
2856		}
2857		(void) mac_flow_walk(cclient->mci_subflow_tab,
2858		    mac_tx_flow_srs_wakeup, ring);
2859	}
2860	rw_exit(&mip->mi_rw_lock);
2861	rw_exit(&i_mac_impl_lock);
2862}
2863
2864/* ARGSUSED */
2865void
2866mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2867    boolean_t add)
2868{
2869	mac_impl_t *mip = (mac_impl_t *)mh;
2870
2871	i_mac_perim_enter((mac_impl_t *)mh);
2872	/*
2873	 * If no specific refresh function was given then default to the
2874	 * driver's m_multicst entry point.
2875	 */
2876	if (refresh == NULL) {
2877		refresh = mip->mi_multicst;
2878		arg = mip->mi_driver;
2879	}
2880
2881	mac_bcast_refresh(mip, refresh, arg, add);
2882	i_mac_perim_exit((mac_impl_t *)mh);
2883}
2884
2885void
2886mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2887{
2888	mac_impl_t	*mip = (mac_impl_t *)mh;
2889
2890	/*
2891	 * If no specific refresh function was given then default to the
2892	 * driver's m_promisc entry point.
2893	 */
2894	if (refresh == NULL) {
2895		refresh = mip->mi_setpromisc;
2896		arg = mip->mi_driver;
2897	}
2898	ASSERT(refresh != NULL);
2899
2900	/*
2901	 * Call the refresh function with the current promiscuity.
2902	 */
2903	refresh(arg, (mip->mi_devpromisc != 0));
2904}
2905
2906/*
2907 * The mac client requests that the mac not to change its margin size to
2908 * be less than the specified value.  If "current" is B_TRUE, then the client
2909 * requests the mac not to change its margin size to be smaller than the
2910 * current size. Further, return the current margin size value in this case.
2911 *
2912 * We keep every requested size in an ordered list from largest to smallest.
2913 */
2914int
2915mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2916{
2917	mac_impl_t		*mip = (mac_impl_t *)mh;
2918	mac_margin_req_t	**pp, *p;
2919	int			err = 0;
2920
2921	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2922	if (current)
2923		*marginp = mip->mi_margin;
2924
2925	/*
2926	 * If the current margin value cannot satisfy the margin requested,
2927	 * return ENOTSUP directly.
2928	 */
2929	if (*marginp > mip->mi_margin) {
2930		err = ENOTSUP;
2931		goto done;
2932	}
2933
2934	/*
2935	 * Check whether the given margin is already in the list. If so,
2936	 * bump the reference count.
2937	 */
2938	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2939		if (p->mmr_margin == *marginp) {
2940			/*
2941			 * The margin requested is already in the list,
2942			 * so just bump the reference count.
2943			 */
2944			p->mmr_ref++;
2945			goto done;
2946		}
2947		if (p->mmr_margin < *marginp)
2948			break;
2949	}
2950
2951
2952	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2953	p->mmr_margin = *marginp;
2954	p->mmr_ref++;
2955	p->mmr_nextp = *pp;
2956	*pp = p;
2957
2958done:
2959	rw_exit(&(mip->mi_rw_lock));
2960	return (err);
2961}
2962
2963/*
2964 * The mac client requests to cancel its previous mac_margin_add() request.
2965 * We remove the requested margin size from the list.
2966 */
2967int
2968mac_margin_remove(mac_handle_t mh, uint32_t margin)
2969{
2970	mac_impl_t		*mip = (mac_impl_t *)mh;
2971	mac_margin_req_t	**pp, *p;
2972	int			err = 0;
2973
2974	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2975	/*
2976	 * Find the entry in the list for the given margin.
2977	 */
2978	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2979		if (p->mmr_margin == margin) {
2980			if (--p->mmr_ref == 0)
2981				break;
2982
2983			/*
2984			 * There is still a reference to this address so
2985			 * there's nothing more to do.
2986			 */
2987			goto done;
2988		}
2989	}
2990
2991	/*
2992	 * We did not find an entry for the given margin.
2993	 */
2994	if (p == NULL) {
2995		err = ENOENT;
2996		goto done;
2997	}
2998
2999	ASSERT(p->mmr_ref == 0);
3000
3001	/*
3002	 * Remove it from the list.
3003	 */
3004	*pp = p->mmr_nextp;
3005	kmem_free(p, sizeof (mac_margin_req_t));
3006done:
3007	rw_exit(&(mip->mi_rw_lock));
3008	return (err);
3009}
3010
3011boolean_t
3012mac_margin_update(mac_handle_t mh, uint32_t margin)
3013{
3014	mac_impl_t	*mip = (mac_impl_t *)mh;
3015	uint32_t	margin_needed = 0;
3016
3017	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
3018
3019	if (mip->mi_mmrp != NULL)
3020		margin_needed = mip->mi_mmrp->mmr_margin;
3021
3022	if (margin_needed <= margin)
3023		mip->mi_margin = margin;
3024
3025	rw_exit(&(mip->mi_rw_lock));
3026
3027	if (margin_needed <= margin)
3028		i_mac_notify(mip, MAC_NOTE_MARGIN);
3029
3030	return (margin_needed <= margin);
3031}
3032
3033/*
3034 * MAC clients use this interface to request that a MAC device not change its
3035 * MTU below the specified amount. At this time, that amount must be within the
3036 * range of the device's current minimum and the device's current maximum. eg. a
3037 * client cannot request a 3000 byte MTU when the device's MTU is currently
3038 * 2000.
3039 *
3040 * If "current" is set to B_TRUE, then the request is to simply to reserve the
3041 * current underlying mac's maximum for this mac client and return it in mtup.
3042 */
3043int
3044mac_mtu_add(mac_handle_t mh, uint32_t *mtup, boolean_t current)
3045{
3046	mac_impl_t		*mip = (mac_impl_t *)mh;
3047	mac_mtu_req_t		*prev, *cur;
3048	mac_propval_range_t	mpr;
3049	int			err;
3050
3051	i_mac_perim_enter(mip);
3052	rw_enter(&mip->mi_rw_lock, RW_WRITER);
3053
3054	if (current == B_TRUE)
3055		*mtup = mip->mi_sdu_max;
3056	mpr.mpr_count = 1;
3057	err = mac_prop_info(mh, MAC_PROP_MTU, "mtu", NULL, 0, &mpr, NULL);
3058	if (err != 0) {
3059		rw_exit(&mip->mi_rw_lock);
3060		i_mac_perim_exit(mip);
3061		return (err);
3062	}
3063
3064	if (*mtup > mip->mi_sdu_max ||
3065	    *mtup < mpr.mpr_range_uint32[0].mpur_min) {
3066		rw_exit(&mip->mi_rw_lock);
3067		i_mac_perim_exit(mip);
3068		return (ENOTSUP);
3069	}
3070
3071	prev = NULL;
3072	for (cur = mip->mi_mtrp; cur != NULL; cur = cur->mtr_nextp) {
3073		if (*mtup == cur->mtr_mtu) {
3074			cur->mtr_ref++;
3075			rw_exit(&mip->mi_rw_lock);
3076			i_mac_perim_exit(mip);
3077			return (0);
3078		}
3079
3080		if (*mtup > cur->mtr_mtu)
3081			break;
3082
3083		prev = cur;
3084	}
3085
3086	cur = kmem_alloc(sizeof (mac_mtu_req_t), KM_SLEEP);
3087	cur->mtr_mtu = *mtup;
3088	cur->mtr_ref = 1;
3089	if (prev != NULL) {
3090		cur->mtr_nextp = prev->mtr_nextp;
3091		prev->mtr_nextp = cur;
3092	} else {
3093		cur->mtr_nextp = mip->mi_mtrp;
3094		mip->mi_mtrp = cur;
3095	}
3096
3097	rw_exit(&mip->mi_rw_lock);
3098	i_mac_perim_exit(mip);
3099	return (0);
3100}
3101
3102int
3103mac_mtu_remove(mac_handle_t mh, uint32_t mtu)
3104{
3105	mac_impl_t *mip = (mac_impl_t *)mh;
3106	mac_mtu_req_t *cur, *prev;
3107
3108	i_mac_perim_enter(mip);
3109	rw_enter(&mip->mi_rw_lock, RW_WRITER);
3110
3111	prev = NULL;
3112	for (cur = mip->mi_mtrp; cur != NULL; cur = cur->mtr_nextp) {
3113		if (cur->mtr_mtu == mtu) {
3114			ASSERT(cur->mtr_ref > 0);
3115			cur->mtr_ref--;
3116			if (cur->mtr_ref == 0) {
3117				if (prev == NULL) {
3118					mip->mi_mtrp = cur->mtr_nextp;
3119				} else {
3120					prev->mtr_nextp = cur->mtr_nextp;
3121				}
3122				kmem_free(cur, sizeof (mac_mtu_req_t));
3123			}
3124			rw_exit(&mip->mi_rw_lock);
3125			i_mac_perim_exit(mip);
3126			return (0);
3127		}
3128
3129		prev = cur;
3130	}
3131
3132	rw_exit(&mip->mi_rw_lock);
3133	i_mac_perim_exit(mip);
3134	return (ENOENT);
3135}
3136
3137/*
3138 * MAC Type Plugin functions.
3139 */
3140
3141mactype_t *
3142mactype_getplugin(const char *pname)
3143{
3144	mactype_t	*mtype = NULL;
3145	boolean_t	tried_modload = B_FALSE;
3146
3147	mutex_enter(&i_mactype_lock);
3148
3149find_registered_mactype:
3150	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
3151	    (mod_hash_val_t *)&mtype) != 0) {
3152		if (!tried_modload) {
3153			/*
3154			 * If the plugin has not yet been loaded, then
3155			 * attempt to load it now.  If modload() succeeds,
3156			 * the plugin should have registered using
3157			 * mactype_register(), in which case we can go back
3158			 * and attempt to find it again.
3159			 */
3160			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
3161				tried_modload = B_TRUE;
3162				goto find_registered_mactype;
3163			}
3164		}
3165	} else {
3166		/*
3167		 * Note that there's no danger that the plugin we've loaded
3168		 * could be unloaded between the modload() step and the
3169		 * reference count bump here, as we're holding
3170		 * i_mactype_lock, which mactype_unregister() also holds.
3171		 */
3172		atomic_inc_32(&mtype->mt_ref);
3173	}
3174
3175	mutex_exit(&i_mactype_lock);
3176	return (mtype);
3177}
3178
3179mactype_register_t *
3180mactype_alloc(uint_t mactype_version)
3181{
3182	mactype_register_t *mtrp;
3183
3184	/*
3185	 * Make sure there isn't a version mismatch between the plugin and
3186	 * the framework.  In the future, if multiple versions are
3187	 * supported, this check could become more sophisticated.
3188	 */
3189	if (mactype_version != MACTYPE_VERSION)
3190		return (NULL);
3191
3192	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
3193	mtrp->mtr_version = mactype_version;
3194	return (mtrp);
3195}
3196
3197void
3198mactype_free(mactype_register_t *mtrp)
3199{
3200	kmem_free(mtrp, sizeof (mactype_register_t));
3201}
3202
3203int
3204mactype_register(mactype_register_t *mtrp)
3205{
3206	mactype_t	*mtp;
3207	mactype_ops_t	*ops = mtrp->mtr_ops;
3208
3209	/* Do some sanity checking before we register this MAC type. */
3210	if (mtrp->mtr_ident == NULL || ops == NULL)
3211		return (EINVAL);
3212
3213	/*
3214	 * Verify that all mandatory callbacks are set in the ops
3215	 * vector.
3216	 */
3217	if (ops->mtops_unicst_verify == NULL ||
3218	    ops->mtops_multicst_verify == NULL ||
3219	    ops->mtops_sap_verify == NULL ||
3220	    ops->mtops_header == NULL ||
3221	    ops->mtops_header_info == NULL) {
3222		return (EINVAL);
3223	}
3224
3225	mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
3226	mtp->mt_ident = mtrp->mtr_ident;
3227	mtp->mt_ops = *ops;
3228	mtp->mt_type = mtrp->mtr_mactype;
3229	mtp->mt_nativetype = mtrp->mtr_nativetype;
3230	mtp->mt_addr_length = mtrp->mtr_addrlen;
3231	if (mtrp->mtr_brdcst_addr != NULL) {
3232		mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
3233		bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
3234		    mtrp->mtr_addrlen);
3235	}
3236
3237	mtp->mt_stats = mtrp->mtr_stats;
3238	mtp->mt_statcount = mtrp->mtr_statcount;
3239
3240	mtp->mt_mapping = mtrp->mtr_mapping;
3241	mtp->mt_mappingcount = mtrp->mtr_mappingcount;
3242
3243	if (mod_hash_insert(i_mactype_hash,
3244	    (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
3245		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
3246		kmem_free(mtp, sizeof (*mtp));
3247		return (EEXIST);
3248	}
3249	return (0);
3250}
3251
3252int
3253mactype_unregister(const char *ident)
3254{
3255	mactype_t	*mtp;
3256	mod_hash_val_t	val;
3257	int		err;
3258
3259	/*
3260	 * Let's not allow MAC drivers to use this plugin while we're
3261	 * trying to unregister it.  Holding i_mactype_lock also prevents a
3262	 * plugin from unregistering while a MAC driver is attempting to
3263	 * hold a reference to it in i_mactype_getplugin().
3264	 */
3265	mutex_enter(&i_mactype_lock);
3266
3267	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
3268	    (mod_hash_val_t *)&mtp)) != 0) {
3269		/* A plugin is trying to unregister, but it never registered. */
3270		err = ENXIO;
3271		goto done;
3272	}
3273
3274	if (mtp->mt_ref != 0) {
3275		err = EBUSY;
3276		goto done;
3277	}
3278
3279	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
3280	ASSERT(err == 0);
3281	if (err != 0) {
3282		/* This should never happen, thus the ASSERT() above. */
3283		err = EINVAL;
3284		goto done;
3285	}
3286	ASSERT(mtp == (mactype_t *)val);
3287
3288	if (mtp->mt_brdcst_addr != NULL)
3289		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
3290	kmem_free(mtp, sizeof (mactype_t));
3291done:
3292	mutex_exit(&i_mactype_lock);
3293	return (err);
3294}
3295
3296/*
3297 * Checks the size of the value size specified for a property as
3298 * part of a property operation. Returns B_TRUE if the size is
3299 * correct, B_FALSE otherwise.
3300 */
3301boolean_t
3302mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range)
3303{
3304	uint_t minsize = 0;
3305
3306	if (is_range)
3307		return (valsize >= sizeof (mac_propval_range_t));
3308
3309	switch (id) {
3310	case MAC_PROP_ZONE:
3311		minsize = sizeof (dld_ioc_zid_t);
3312		break;
3313	case MAC_PROP_AUTOPUSH:
3314		if (valsize != 0)
3315			minsize = sizeof (struct dlautopush);
3316		break;
3317	case MAC_PROP_TAGMODE:
3318		minsize = sizeof (link_tagmode_t);
3319		break;
3320	case MAC_PROP_RESOURCE:
3321	case MAC_PROP_RESOURCE_EFF:
3322		minsize = sizeof (mac_resource_props_t);
3323		break;
3324	case MAC_PROP_DUPLEX:
3325		minsize = sizeof (link_duplex_t);
3326		break;
3327	case MAC_PROP_SPEED:
3328		minsize = sizeof (uint64_t);
3329		break;
3330	case MAC_PROP_STATUS:
3331		minsize = sizeof (link_state_t);
3332		break;
3333	case MAC_PROP_AUTONEG:
3334	case MAC_PROP_EN_AUTONEG:
3335		minsize = sizeof (uint8_t);
3336		break;
3337	case MAC_PROP_MTU:
3338	case MAC_PROP_LLIMIT:
3339	case MAC_PROP_LDECAY:
3340		minsize = sizeof (uint32_t);
3341		break;
3342	case MAC_PROP_FLOWCTRL:
3343		minsize = sizeof (link_flowctrl_t);
3344		break;
3345	case MAC_PROP_ADV_FEC_CAP:
3346	case MAC_PROP_EN_FEC_CAP:
3347		minsize = sizeof (link_fec_t);
3348		break;
3349	case MAC_PROP_ADV_5000FDX_CAP:
3350	case MAC_PROP_EN_5000FDX_CAP:
3351	case MAC_PROP_ADV_2500FDX_CAP:
3352	case MAC_PROP_EN_2500FDX_CAP:
3353	case MAC_PROP_ADV_100GFDX_CAP:
3354	case MAC_PROP_EN_100GFDX_CAP:
3355	case MAC_PROP_ADV_50GFDX_CAP:
3356	case MAC_PROP_EN_50GFDX_CAP:
3357	case MAC_PROP_ADV_40GFDX_CAP:
3358	case MAC_PROP_EN_40GFDX_CAP:
3359	case MAC_PROP_ADV_25GFDX_CAP:
3360	case MAC_PROP_EN_25GFDX_CAP:
3361	case MAC_PROP_ADV_10GFDX_CAP:
3362	case MAC_PROP_EN_10GFDX_CAP:
3363	case MAC_PROP_ADV_1000HDX_CAP:
3364	case MAC_PROP_EN_1000HDX_CAP:
3365	case MAC_PROP_ADV_100FDX_CAP:
3366	case MAC_PROP_EN_100FDX_CAP:
3367	case MAC_PROP_ADV_100HDX_CAP:
3368	case MAC_PROP_EN_100HDX_CAP:
3369	case MAC_PROP_ADV_10FDX_CAP:
3370	case MAC_PROP_EN_10FDX_CAP:
3371	case MAC_PROP_ADV_10HDX_CAP:
3372	case MAC_PROP_EN_10HDX_CAP:
3373	case MAC_PROP_ADV_100T4_CAP:
3374	case MAC_PROP_EN_100T4_CAP:
3375		minsize = sizeof (uint8_t);
3376		break;
3377	case MAC_PROP_PVID:
3378		minsize = sizeof (uint16_t);
3379		break;
3380	case MAC_PROP_IPTUN_HOPLIMIT:
3381		minsize = sizeof (uint32_t);
3382		break;
3383	case MAC_PROP_IPTUN_ENCAPLIMIT:
3384		minsize = sizeof (uint32_t);
3385		break;
3386	case MAC_PROP_MAX_TX_RINGS_AVAIL:
3387	case MAC_PROP_MAX_RX_RINGS_AVAIL:
3388	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3389	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3390		minsize = sizeof (uint_t);
3391		break;
3392	case MAC_PROP_WL_ESSID:
3393		minsize = sizeof (wl_linkstatus_t);
3394		break;
3395	case MAC_PROP_WL_BSSID:
3396		minsize = sizeof (wl_bssid_t);
3397		break;
3398	case MAC_PROP_WL_BSSTYPE:
3399		minsize = sizeof (wl_bss_type_t);
3400		break;
3401	case MAC_PROP_WL_LINKSTATUS:
3402		minsize = sizeof (wl_linkstatus_t);
3403		break;
3404	case MAC_PROP_WL_DESIRED_RATES:
3405		minsize = sizeof (wl_rates_t);
3406		break;
3407	case MAC_PROP_WL_SUPPORTED_RATES:
3408		minsize = sizeof (wl_rates_t);
3409		break;
3410	case MAC_PROP_WL_AUTH_MODE:
3411		minsize = sizeof (wl_authmode_t);
3412		break;
3413	case MAC_PROP_WL_ENCRYPTION:
3414		minsize = sizeof (wl_encryption_t);
3415		break;
3416	case MAC_PROP_WL_RSSI:
3417		minsize = sizeof (wl_rssi_t);
3418		break;
3419	case MAC_PROP_WL_PHY_CONFIG:
3420		minsize = sizeof (wl_phy_conf_t);
3421		break;
3422	case MAC_PROP_WL_CAPABILITY:
3423		minsize = sizeof (wl_capability_t);
3424		break;
3425	case MAC_PROP_WL_WPA:
3426		minsize = sizeof (wl_wpa_t);
3427		break;
3428	case MAC_PROP_WL_SCANRESULTS:
3429		minsize = sizeof (wl_wpa_ess_t);
3430		break;
3431	case MAC_PROP_WL_POWER_MODE:
3432		minsize = sizeof (wl_ps_mode_t);
3433		break;
3434	case MAC_PROP_WL_RADIO:
3435		minsize = sizeof (wl_radio_t);
3436		break;
3437	case MAC_PROP_WL_ESS_LIST:
3438		minsize = sizeof (wl_ess_list_t);
3439		break;
3440	case MAC_PROP_WL_KEY_TAB:
3441		minsize = sizeof (wl_wep_key_tab_t);
3442		break;
3443	case MAC_PROP_WL_CREATE_IBSS:
3444		minsize = sizeof (wl_create_ibss_t);
3445		break;
3446	case MAC_PROP_WL_SETOPTIE:
3447		minsize = sizeof (wl_wpa_ie_t);
3448		break;
3449	case MAC_PROP_WL_DELKEY:
3450		minsize = sizeof (wl_del_key_t);
3451		break;
3452	case MAC_PROP_WL_KEY:
3453		minsize = sizeof (wl_key_t);
3454		break;
3455	case MAC_PROP_WL_MLME:
3456		minsize = sizeof (wl_mlme_t);
3457		break;
3458	case MAC_PROP_VN_PROMISC_FILTERED:
3459		minsize = sizeof (boolean_t);
3460		break;
3461	}
3462
3463	return (valsize >= minsize);
3464}
3465
3466/*
3467 * mac_set_prop() sets MAC or hardware driver properties:
3468 *
3469 * - MAC-managed properties such as resource properties include maxbw,
3470 *   priority, and cpu binding list, as well as the default port VID
3471 *   used by bridging. These properties are consumed by the MAC layer
3472 *   itself and not passed down to the driver. For resource control
3473 *   properties, this function invokes mac_set_resources() which will
3474 *   cache the property value in mac_impl_t and may call
3475 *   mac_client_set_resource() to update property value of the primary
3476 *   mac client, if it exists.
3477 *
3478 * - Properties which act on the hardware and must be passed to the
3479 *   driver, such as MTU, through the driver's mc_setprop() entry point.
3480 */
3481int
3482mac_set_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3483    uint_t valsize)
3484{
3485	int err = ENOTSUP;
3486	mac_impl_t *mip = (mac_impl_t *)mh;
3487
3488	ASSERT(MAC_PERIM_HELD(mh));
3489
3490	switch (id) {
3491	case MAC_PROP_RESOURCE: {
3492		mac_resource_props_t *mrp;
3493
3494		/* call mac_set_resources() for MAC properties */
3495		ASSERT(valsize >= sizeof (mac_resource_props_t));
3496		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3497		bcopy(val, mrp, sizeof (*mrp));
3498		err = mac_set_resources(mh, mrp);
3499		kmem_free(mrp, sizeof (*mrp));
3500		break;
3501	}
3502
3503	case MAC_PROP_PVID:
3504		ASSERT(valsize >= sizeof (uint16_t));
3505		if (mip->mi_state_flags & MIS_IS_VNIC)
3506			return (EINVAL);
3507		err = mac_set_pvid(mh, *(uint16_t *)val);
3508		break;
3509
3510	case MAC_PROP_MTU: {
3511		uint32_t mtu;
3512
3513		ASSERT(valsize >= sizeof (uint32_t));
3514		bcopy(val, &mtu, sizeof (mtu));
3515		err = mac_set_mtu(mh, mtu, NULL);
3516		break;
3517	}
3518
3519	case MAC_PROP_LLIMIT:
3520	case MAC_PROP_LDECAY: {
3521		uint32_t learnval;
3522
3523		if (valsize < sizeof (learnval) ||
3524		    (mip->mi_state_flags & MIS_IS_VNIC))
3525			return (EINVAL);
3526		bcopy(val, &learnval, sizeof (learnval));
3527		if (learnval == 0 && id == MAC_PROP_LDECAY)
3528			return (EINVAL);
3529		if (id == MAC_PROP_LLIMIT)
3530			mip->mi_llimit = learnval;
3531		else
3532			mip->mi_ldecay = learnval;
3533		err = 0;
3534		break;
3535	}
3536
3537	case MAC_PROP_ADV_FEC_CAP:
3538	case MAC_PROP_EN_FEC_CAP: {
3539		link_fec_t fec;
3540
3541		ASSERT(valsize >= sizeof (link_fec_t));
3542
3543		/*
3544		 * fec cannot be zero, and auto must be set exclusively.
3545		 */
3546		bcopy(val, &fec, sizeof (link_fec_t));
3547		if (fec == 0)
3548			return (EINVAL);
3549		if ((fec & LINK_FEC_AUTO) != 0 && (fec & ~LINK_FEC_AUTO) != 0)
3550			return (EINVAL);
3551
3552		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
3553			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
3554			    name, id, valsize, val);
3555		}
3556		break;
3557	}
3558
3559	default:
3560		/* For other driver properties, call driver's callback */
3561		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
3562			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
3563			    name, id, valsize, val);
3564		}
3565	}
3566	return (err);
3567}
3568
3569/*
3570 * mac_get_prop() gets MAC or device driver properties.
3571 *
3572 * If the property is a driver property, mac_get_prop() calls driver's callback
3573 * entry point to get it.
3574 * If the property is a MAC property, mac_get_prop() invokes mac_get_resources()
3575 * which returns the cached value in mac_impl_t.
3576 */
3577int
3578mac_get_prop(mac_handle_t mh, mac_prop_id_t id, char *name, void *val,
3579    uint_t valsize)
3580{
3581	int err = ENOTSUP;
3582	mac_impl_t *mip = (mac_impl_t *)mh;
3583	uint_t	rings;
3584	uint_t	vlinks;
3585
3586	bzero(val, valsize);
3587
3588	switch (id) {
3589	case MAC_PROP_RESOURCE: {
3590		mac_resource_props_t *mrp;
3591
3592		/* If mac property, read from cache */
3593		ASSERT(valsize >= sizeof (mac_resource_props_t));
3594		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3595		mac_get_resources(mh, mrp);
3596		bcopy(mrp, val, sizeof (*mrp));
3597		kmem_free(mrp, sizeof (*mrp));
3598		return (0);
3599	}
3600	case MAC_PROP_RESOURCE_EFF: {
3601		mac_resource_props_t *mrp;
3602
3603		/* If mac effective property, read from client */
3604		ASSERT(valsize >= sizeof (mac_resource_props_t));
3605		mrp = kmem_zalloc(sizeof (*mrp), KM_SLEEP);
3606		mac_get_effective_resources(mh, mrp);
3607		bcopy(mrp, val, sizeof (*mrp));
3608		kmem_free(mrp, sizeof (*mrp));
3609		return (0);
3610	}
3611
3612	case MAC_PROP_PVID:
3613		ASSERT(valsize >= sizeof (uint16_t));
3614		if (mip->mi_state_flags & MIS_IS_VNIC)
3615			return (EINVAL);
3616		*(uint16_t *)val = mac_get_pvid(mh);
3617		return (0);
3618
3619	case MAC_PROP_LLIMIT:
3620	case MAC_PROP_LDECAY:
3621		ASSERT(valsize >= sizeof (uint32_t));
3622		if (mip->mi_state_flags & MIS_IS_VNIC)
3623			return (EINVAL);
3624		if (id == MAC_PROP_LLIMIT)
3625			bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
3626		else
3627			bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
3628		return (0);
3629
3630	case MAC_PROP_MTU: {
3631		uint32_t sdu;
3632
3633		ASSERT(valsize >= sizeof (uint32_t));
3634		mac_sdu_get2(mh, NULL, &sdu, NULL);
3635		bcopy(&sdu, val, sizeof (sdu));
3636
3637		return (0);
3638	}
3639	case MAC_PROP_STATUS: {
3640		link_state_t link_state;
3641
3642		if (valsize < sizeof (link_state))
3643			return (EINVAL);
3644		link_state = mac_link_get(mh);
3645		bcopy(&link_state, val, sizeof (link_state));
3646
3647		return (0);
3648	}
3649
3650	case MAC_PROP_MAX_RX_RINGS_AVAIL:
3651	case MAC_PROP_MAX_TX_RINGS_AVAIL:
3652		ASSERT(valsize >= sizeof (uint_t));
3653		rings = id == MAC_PROP_MAX_RX_RINGS_AVAIL ?
3654		    mac_rxavail_get(mh) : mac_txavail_get(mh);
3655		bcopy(&rings, val, sizeof (uint_t));
3656		return (0);
3657
3658	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3659	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3660		ASSERT(valsize >= sizeof (uint_t));
3661		vlinks = id == MAC_PROP_MAX_RXHWCLNT_AVAIL ?
3662		    mac_rxhwlnksavail_get(mh) : mac_txhwlnksavail_get(mh);
3663		bcopy(&vlinks, val, sizeof (uint_t));
3664		return (0);
3665
3666	case MAC_PROP_RXRINGSRANGE:
3667	case MAC_PROP_TXRINGSRANGE:
3668		/*
3669		 * The value for these properties are returned through
3670		 * the MAC_PROP_RESOURCE property.
3671		 */
3672		return (0);
3673
3674	default:
3675		break;
3676
3677	}
3678
3679	/* If driver property, request from driver */
3680	if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
3681		err = mip->mi_callbacks->mc_getprop(mip->mi_driver, name, id,
3682		    valsize, val);
3683	}
3684
3685	return (err);
3686}
3687
3688/*
3689 * Helper function to initialize the range structure for use in
3690 * mac_get_prop. If the type can be other than uint32, we can
3691 * pass that as an arg.
3692 */
3693static void
3694_mac_set_range(mac_propval_range_t *range, uint32_t min, uint32_t max)
3695{
3696	range->mpr_count = 1;
3697	range->mpr_type = MAC_PROPVAL_UINT32;
3698	range->mpr_range_uint32[0].mpur_min = min;
3699	range->mpr_range_uint32[0].mpur_max = max;
3700}
3701
3702/*
3703 * Returns information about the specified property, such as default
3704 * values or permissions.
3705 */
3706int
3707mac_prop_info(mac_handle_t mh, mac_prop_id_t id, char *name,
3708    void *default_val, uint_t default_size, mac_propval_range_t *range,
3709    uint_t *perm)
3710{
3711	mac_prop_info_state_t state;
3712	mac_impl_t *mip = (mac_impl_t *)mh;
3713	uint_t	max;
3714
3715	/*
3716	 * A property is read/write by default unless the driver says
3717	 * otherwise.
3718	 */
3719	if (perm != NULL)
3720		*perm = MAC_PROP_PERM_RW;
3721
3722	if (default_val != NULL)
3723		bzero(default_val, default_size);
3724
3725	/*
3726	 * First, handle framework properties for which we don't need to
3727	 * involve the driver.
3728	 */
3729	switch (id) {
3730	case MAC_PROP_RESOURCE:
3731	case MAC_PROP_PVID:
3732	case MAC_PROP_LLIMIT:
3733	case MAC_PROP_LDECAY:
3734		return (0);
3735
3736	case MAC_PROP_MAX_RX_RINGS_AVAIL:
3737	case MAC_PROP_MAX_TX_RINGS_AVAIL:
3738	case MAC_PROP_MAX_RXHWCLNT_AVAIL:
3739	case MAC_PROP_MAX_TXHWCLNT_AVAIL:
3740		if (perm != NULL)
3741			*perm = MAC_PROP_PERM_READ;
3742		return (0);
3743
3744	case MAC_PROP_RXRINGSRANGE:
3745	case MAC_PROP_TXRINGSRANGE:
3746		/*
3747		 * Currently, we support range for RX and TX rings properties.
3748		 * When we extend this support to maxbw, cpus and priority,
3749		 * we should move this to mac_get_resources.
3750		 * There is no default value for RX or TX rings.
3751		 */
3752		if ((mip->mi_state_flags & MIS_IS_VNIC) &&
3753		    mac_is_vnic_primary(mh)) {
3754			/*
3755			 * We don't support setting rings for a VLAN
3756			 * data link because it shares its ring with the
3757			 * primary MAC client.
3758			 */
3759			if (perm != NULL)
3760				*perm = MAC_PROP_PERM_READ;
3761			if (range != NULL)
3762				range->mpr_count = 0;
3763		} else if (range != NULL) {
3764			if (mip->mi_state_flags & MIS_IS_VNIC)
3765				mh = mac_get_lower_mac_handle(mh);
3766			mip = (mac_impl_t *)mh;
3767			if ((id == MAC_PROP_RXRINGSRANGE &&
3768			    mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) ||
3769			    (id == MAC_PROP_TXRINGSRANGE &&
3770			    mip->mi_tx_group_type == MAC_GROUP_TYPE_STATIC)) {
3771				if (id == MAC_PROP_RXRINGSRANGE) {
3772					if ((mac_rxhwlnksavail_get(mh) +
3773					    mac_rxhwlnksrsvd_get(mh)) <= 1) {
3774						/*
3775						 * doesn't support groups or
3776						 * rings
3777						 */
3778						range->mpr_count = 0;
3779					} else {
3780						/*
3781						 * supports specifying groups,
3782						 * but not rings
3783						 */
3784						_mac_set_range(range, 0, 0);
3785					}
3786				} else {
3787					if ((mac_txhwlnksavail_get(mh) +
3788					    mac_txhwlnksrsvd_get(mh)) <= 1) {
3789						/*
3790						 * doesn't support groups or
3791						 * rings
3792						 */
3793						range->mpr_count = 0;
3794					} else {
3795						/*
3796						 * supports specifying groups,
3797						 * but not rings
3798						 */
3799						_mac_set_range(range, 0, 0);
3800					}
3801				}
3802			} else {
3803				max = id == MAC_PROP_RXRINGSRANGE ?
3804				    mac_rxavail_get(mh) + mac_rxrsvd_get(mh) :
3805				    mac_txavail_get(mh) + mac_txrsvd_get(mh);
3806				if (max <= 1) {
3807					/*
3808					 * doesn't support groups or
3809					 * rings
3810					 */
3811					range->mpr_count = 0;
3812				} else  {
3813					/*
3814					 * -1 because we have to leave out the
3815					 * default ring.
3816					 */
3817					_mac_set_range(range, 1, max - 1);
3818				}
3819			}
3820		}
3821		return (0);
3822
3823	case MAC_PROP_STATUS:
3824		if (perm != NULL)
3825			*perm = MAC_PROP_PERM_READ;
3826		return (0);
3827	}
3828
3829	/*
3830	 * Get the property info from the driver if it implements the
3831	 * property info entry point.
3832	 */
3833	bzero(&state, sizeof (state));
3834
3835	if (mip->mi_callbacks->mc_callbacks & MC_PROPINFO) {
3836		state.pr_default = default_val;
3837		state.pr_default_size = default_size;
3838
3839		/*
3840		 * The caller specifies the maximum number of ranges
3841		 * it can accomodate using mpr_count. We don't touch
3842		 * this value until the driver returns from its
3843		 * mc_propinfo() callback, and ensure we don't exceed
3844		 * this number of range as the driver defines
3845		 * supported range from its mc_propinfo().
3846		 *
3847		 * pr_range_cur_count keeps track of how many ranges
3848		 * were defined by the driver from its mc_propinfo()
3849		 * entry point.
3850		 *
3851		 * On exit, the user-specified range mpr_count returns
3852		 * the number of ranges specified by the driver on
3853		 * success, or the number of ranges it wanted to
3854		 * define if that number of ranges could not be
3855		 * accomodated by the specified range structure.  In
3856		 * the latter case, the caller will be able to
3857		 * allocate a larger range structure, and query the
3858		 * property again.
3859		 */
3860		state.pr_range_cur_count = 0;
3861		state.pr_range = range;
3862
3863		mip->mi_callbacks->mc_propinfo(mip->mi_driver, name, id,
3864		    (mac_prop_info_handle_t)&state);
3865
3866		if (state.pr_flags & MAC_PROP_INFO_RANGE)
3867			range->mpr_count = state.pr_range_cur_count;
3868
3869		/*
3870		 * The operation could fail if the buffer supplied by
3871		 * the user was too small for the range or default
3872		 * value of the property.
3873		 */
3874		if (state.pr_errno != 0)
3875			return (state.pr_errno);
3876
3877		if (perm != NULL && state.pr_flags & MAC_PROP_INFO_PERM)
3878			*perm = state.pr_perm;
3879	}
3880
3881	/*
3882	 * The MAC layer may want to provide default values or allowed
3883	 * ranges for properties if the driver does not provide a
3884	 * property info entry point, or that entry point exists, but
3885	 * it did not provide a default value or allowed ranges for
3886	 * that property.
3887	 */
3888	switch (id) {
3889	case MAC_PROP_MTU: {
3890		uint32_t sdu;
3891
3892		mac_sdu_get2(mh, NULL, &sdu, NULL);
3893
3894		if (range != NULL && !(state.pr_flags &
3895		    MAC_PROP_INFO_RANGE)) {
3896			/* MTU range */
3897			_mac_set_range(range, sdu, sdu);
3898		}
3899
3900		if (default_val != NULL && !(state.pr_flags &
3901		    MAC_PROP_INFO_DEFAULT)) {
3902			if (mip->mi_info.mi_media == DL_ETHER)
3903				sdu = ETHERMTU;
3904			/* default MTU value */
3905			bcopy(&sdu, default_val, sizeof (sdu));
3906		}
3907	}
3908	}
3909
3910	return (0);
3911}
3912
3913int
3914mac_fastpath_disable(mac_handle_t mh)
3915{
3916	mac_impl_t	*mip = (mac_impl_t *)mh;
3917
3918	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3919		return (0);
3920
3921	return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
3922}
3923
3924void
3925mac_fastpath_enable(mac_handle_t mh)
3926{
3927	mac_impl_t	*mip = (mac_impl_t *)mh;
3928
3929	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
3930		return;
3931
3932	mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
3933}
3934
3935void
3936mac_register_priv_prop(mac_impl_t *mip, char **priv_props)
3937{
3938	uint_t nprops, i;
3939
3940	if (priv_props == NULL)
3941		return;
3942
3943	nprops = 0;
3944	while (priv_props[nprops] != NULL)
3945		nprops++;
3946	if (nprops == 0)
3947		return;
3948
3949
3950	mip->mi_priv_prop = kmem_zalloc(nprops * sizeof (char *), KM_SLEEP);
3951
3952	for (i = 0; i < nprops; i++) {
3953		mip->mi_priv_prop[i] = kmem_zalloc(MAXLINKPROPNAME, KM_SLEEP);
3954		(void) strlcpy(mip->mi_priv_prop[i], priv_props[i],
3955		    MAXLINKPROPNAME);
3956	}
3957
3958	mip->mi_priv_prop_count = nprops;
3959}
3960
3961void
3962mac_unregister_priv_prop(mac_impl_t *mip)
3963{
3964	uint_t i;
3965
3966	if (mip->mi_priv_prop_count == 0) {
3967		ASSERT(mip->mi_priv_prop == NULL);
3968		return;
3969	}
3970
3971	for (i = 0; i < mip->mi_priv_prop_count; i++)
3972		kmem_free(mip->mi_priv_prop[i], MAXLINKPROPNAME);
3973	kmem_free(mip->mi_priv_prop, mip->mi_priv_prop_count *
3974	    sizeof (char *));
3975
3976	mip->mi_priv_prop = NULL;
3977	mip->mi_priv_prop_count = 0;
3978}
3979
3980/*
3981 * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
3982 * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
3983 * cases if MAC free's the ring structure after mac_stop_ring(), any
3984 * illegal access to the ring structure coming from the driver will panic
3985 * the system. In order to protect the system from such inadverent access,
3986 * we maintain a cache of rings in the mac_impl_t after they get free'd up.
3987 * When packets are received on free'd up rings, MAC (through the generation
3988 * count mechanism) will drop such packets.
3989 */
3990static mac_ring_t *
3991mac_ring_alloc(mac_impl_t *mip)
3992{
3993	mac_ring_t *ring;
3994
3995	mutex_enter(&mip->mi_ring_lock);
3996	if (mip->mi_ring_freelist != NULL) {
3997		ring = mip->mi_ring_freelist;
3998		mip->mi_ring_freelist = ring->mr_next;
3999		bzero(ring, sizeof (mac_ring_t));
4000		mutex_exit(&mip->mi_ring_lock);
4001	} else {
4002		mutex_exit(&mip->mi_ring_lock);
4003		ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
4004	}
4005	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
4006	return (ring);
4007}
4008
4009static void
4010mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
4011{
4012	ASSERT(ring->mr_state == MR_FREE);
4013
4014	mutex_enter(&mip->mi_ring_lock);
4015	ring->mr_state = MR_FREE;
4016	ring->mr_flag = 0;
4017	ring->mr_next = mip->mi_ring_freelist;
4018	ring->mr_mip = NULL;
4019	mip->mi_ring_freelist = ring;
4020	mac_ring_stat_delete(ring);
4021	mutex_exit(&mip->mi_ring_lock);
4022}
4023
4024static void
4025mac_ring_freeall(mac_impl_t *mip)
4026{
4027	mac_ring_t *ring_next;
4028	mutex_enter(&mip->mi_ring_lock);
4029	mac_ring_t *ring = mip->mi_ring_freelist;
4030	while (ring != NULL) {
4031		ring_next = ring->mr_next;
4032		kmem_cache_free(mac_ring_cache, ring);
4033		ring = ring_next;
4034	}
4035	mip->mi_ring_freelist = NULL;
4036	mutex_exit(&mip->mi_ring_lock);
4037}
4038
4039int
4040mac_start_ring(mac_ring_t *ring)
4041{
4042	int rv = 0;
4043
4044	ASSERT(ring->mr_state == MR_FREE);
4045
4046	if (ring->mr_start != NULL) {
4047		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
4048		if (rv != 0)
4049			return (rv);
4050	}
4051
4052	ring->mr_state = MR_INUSE;
4053	return (rv);
4054}
4055
4056void
4057mac_stop_ring(mac_ring_t *ring)
4058{
4059	ASSERT(ring->mr_state == MR_INUSE);
4060
4061	if (ring->mr_stop != NULL)
4062		ring->mr_stop(ring->mr_driver);
4063
4064	ring->mr_state = MR_FREE;
4065
4066	/*
4067	 * Increment the ring generation number for this ring.
4068	 */
4069	ring->mr_gen_num++;
4070}
4071
4072int
4073mac_start_group(mac_group_t *group)
4074{
4075	int rv = 0;
4076
4077	if (group->mrg_start != NULL)
4078		rv = group->mrg_start(group->mrg_driver);
4079
4080	return (rv);
4081}
4082
4083void
4084mac_stop_group(mac_group_t *group)
4085{
4086	if (group->mrg_stop != NULL)
4087		group->mrg_stop(group->mrg_driver);
4088}
4089
4090/*
4091 * Called from mac_start() on the default Rx group. Broadcast and multicast
4092 * packets are received only on the default group. Hence the default group
4093 * needs to be up even if the primary client is not up, for the other groups
4094 * to be functional. We do this by calling this function at mac_start time
4095 * itself. However the broadcast packets that are received can't make their
4096 * way beyond mac_rx until a mac client creates a broadcast flow.
4097 */
4098static int
4099mac_start_group_and_rings(mac_group_t *group)
4100{
4101	mac_ring_t	*ring;
4102	int		rv = 0;
4103
4104	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
4105	if ((rv = mac_start_group(group)) != 0)
4106		return (rv);
4107
4108	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
4109		ASSERT(ring->mr_state == MR_FREE);
4110
4111		if ((rv = mac_start_ring(ring)) != 0)
4112			goto error;
4113
4114		/*
4115		 * When aggr_set_port_sdu() is called, it will remove
4116		 * the port client's unicast address. This will cause
4117		 * MAC to stop the default group's rings on the port
4118		 * MAC. After it modifies the SDU, it will then re-add
4119		 * the unicast address. At which time, this function is
4120		 * called to start the default group's rings. Normally
4121		 * this function would set the classify type to
4122		 * MAC_SW_CLASSIFIER; but that will break aggr which
4123		 * relies on the passthru classify mode being set for
4124		 * correct delivery (see mac_rx_common()). To avoid
4125		 * that, we check for a passthru callback and set the
4126		 * classify type to MAC_PASSTHRU_CLASSIFIER; as it was
4127		 * before the rings were stopped.
4128		 */
4129		ring->mr_classify_type = (ring->mr_pt_fn != NULL) ?
4130		    MAC_PASSTHRU_CLASSIFIER : MAC_SW_CLASSIFIER;
4131	}
4132	return (0);
4133
4134error:
4135	mac_stop_group_and_rings(group);
4136	return (rv);
4137}
4138
4139/* Called from mac_stop on the default Rx group */
4140static void
4141mac_stop_group_and_rings(mac_group_t *group)
4142{
4143	mac_ring_t	*ring;
4144
4145	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
4146		if (ring->mr_state != MR_FREE) {
4147			mac_stop_ring(ring);
4148			ring->mr_flag = 0;
4149			ring->mr_classify_type = MAC_NO_CLASSIFIER;
4150		}
4151	}
4152	mac_stop_group(group);
4153}
4154
4155
4156static mac_ring_t *
4157mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
4158    mac_capab_rings_t *cap_rings)
4159{
4160	mac_ring_t *ring, *rnext;
4161	mac_ring_info_t ring_info;
4162	ddi_intr_handle_t ddi_handle;
4163
4164	ring = mac_ring_alloc(mip);
4165
4166	/* Prepare basic information of ring */
4167
4168	/*
4169	 * Ring index is numbered to be unique across a particular device.
4170	 * Ring index computation makes following assumptions:
4171	 *	- For drivers with static grouping (e.g. ixgbe, bge),
4172	 *	ring index exchanged with the driver (e.g. during mr_rget)
4173	 *	is unique only across the group the ring belongs to.
4174	 *	- Drivers with dynamic grouping (e.g. nxge), start
4175	 *	with single group (mrg_index = 0).
4176	 */
4177	ring->mr_index = group->mrg_index * group->mrg_info.mgi_count + index;
4178	ring->mr_type = group->mrg_type;
4179	ring->mr_gh = (mac_group_handle_t)group;
4180
4181	/* Insert the new ring to the list. */
4182	ring->mr_next = group->mrg_rings;
4183	group->mrg_rings = ring;
4184
4185	/* Zero to reuse the info data structure */
4186	bzero(&ring_info, sizeof (ring_info));
4187
4188	/* Query ring information from driver */
4189	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
4190	    index, &ring_info, (mac_ring_handle_t)ring);
4191
4192	ring->mr_info = ring_info;
4193
4194	/*
4195	 * The interrupt handle could be shared among multiple rings.
4196	 * Thus if there is a bunch of rings that are sharing an
4197	 * interrupt, then only one ring among the bunch will be made
4198	 * available for interrupt re-targeting; the rest will have
4199	 * ddi_shared flag set to TRUE and would not be available for
4200	 * be interrupt re-targeting.
4201	 */
4202	if ((ddi_handle = ring_info.mri_intr.mi_ddi_handle) != NULL) {
4203		rnext = ring->mr_next;
4204		while (rnext != NULL) {
4205			if (rnext->mr_info.mri_intr.mi_ddi_handle ==
4206			    ddi_handle) {
4207				/*
4208				 * If default ring (mr_index == 0) is part
4209				 * of a group of rings sharing an
4210				 * interrupt, then set ddi_shared flag for
4211				 * the default ring and give another ring
4212				 * the chance to be re-targeted.
4213				 */
4214				if (rnext->mr_index == 0 &&
4215				    !rnext->mr_info.mri_intr.mi_ddi_shared) {
4216					rnext->mr_info.mri_intr.mi_ddi_shared =
4217					    B_TRUE;
4218				} else {
4219					ring->mr_info.mri_intr.mi_ddi_shared =
4220					    B_TRUE;
4221				}
4222				break;
4223			}
4224			rnext = rnext->mr_next;
4225		}
4226		/*
4227		 * If rnext is NULL, then no matching ddi_handle was found.
4228		 * Rx rings get registered first. So if this is a Tx ring,
4229		 * then go through all the Rx rings and see if there is a
4230		 * matching ddi handle.
4231		 */
4232		if (rnext == NULL && ring->mr_type == MAC_RING_TYPE_TX) {
4233			mac_compare_ddi_handle(mip->mi_rx_groups,
4234			    mip->mi_rx_group_count, ring);
4235		}
4236	}
4237
4238	/* Update ring's status */
4239	ring->mr_state = MR_FREE;
4240	ring->mr_flag = 0;
4241
4242	/* Update the ring count of the group */
4243	group->mrg_cur_count++;
4244
4245	/* Create per ring kstats */
4246	if (ring->mr_stat != NULL) {
4247		ring->mr_mip = mip;
4248		mac_ring_stat_create(ring);
4249	}
4250
4251	return (ring);
4252}
4253
4254/*
4255 * Rings are chained together for easy regrouping.
4256 */
4257static void
4258mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
4259    mac_capab_rings_t *cap_rings)
4260{
4261	int index;
4262
4263	/*
4264	 * Initialize all ring members of this group. Size of zero will not
4265	 * enter the loop, so it's safe for initializing an empty group.
4266	 */
4267	for (index = size - 1; index >= 0; index--)
4268		(void) mac_init_ring(mip, group, index, cap_rings);
4269}
4270
4271int
4272mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
4273{
4274	mac_capab_rings_t	*cap_rings;
4275	mac_group_t		*group;
4276	mac_group_t		*groups;
4277	mac_group_info_t	group_info;
4278	uint_t			group_free = 0;
4279	uint_t			ring_left;
4280	mac_ring_t		*ring;
4281	int			g;
4282	int			err = 0;
4283	uint_t			grpcnt;
4284	boolean_t		pseudo_txgrp = B_FALSE;
4285
4286	switch (rtype) {
4287	case MAC_RING_TYPE_RX:
4288		ASSERT(mip->mi_rx_groups == NULL);
4289
4290		cap_rings = &mip->mi_rx_rings_cap;
4291		cap_rings->mr_type = MAC_RING_TYPE_RX;
4292		break;
4293	case MAC_RING_TYPE_TX:
4294		ASSERT(mip->mi_tx_groups == NULL);
4295
4296		cap_rings = &mip->mi_tx_rings_cap;
4297		cap_rings->mr_type = MAC_RING_TYPE_TX;
4298		break;
4299	default:
4300		ASSERT(B_FALSE);
4301	}
4302
4303	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, cap_rings))
4304		return (0);
4305	grpcnt = cap_rings->mr_gnum;
4306
4307	/*
4308	 * If we have multiple TX rings, but only one TX group, we can
4309	 * create pseudo TX groups (one per TX ring) in the MAC layer,
4310	 * except for an aggr. For an aggr currently we maintain only
4311	 * one group with all the rings (for all its ports), going
4312	 * forwards we might change this.
4313	 */
4314	if (rtype == MAC_RING_TYPE_TX &&
4315	    cap_rings->mr_gnum == 0 && cap_rings->mr_rnum >  0 &&
4316	    (mip->mi_state_flags & MIS_IS_AGGR) == 0) {
4317		/*
4318		 * The -1 here is because we create a default TX group
4319		 * with all the rings in it.
4320		 */
4321		grpcnt = cap_rings->mr_rnum - 1;
4322		pseudo_txgrp = B_TRUE;
4323	}
4324
4325	/*
4326	 * Allocate a contiguous buffer for all groups.
4327	 */
4328	groups = kmem_zalloc(sizeof (mac_group_t) * (grpcnt+ 1), KM_SLEEP);
4329
4330	ring_left = cap_rings->mr_rnum;
4331
4332	/*
4333	 * Get all ring groups if any, and get their ring members
4334	 * if any.
4335	 */
4336	for (g = 0; g < grpcnt; g++) {
4337		group = groups + g;
4338
4339		/* Prepare basic information of the group */
4340		group->mrg_index = g;
4341		group->mrg_type = rtype;
4342		group->mrg_state = MAC_GROUP_STATE_UNINIT;
4343		group->mrg_mh = (mac_handle_t)mip;
4344		group->mrg_next = group + 1;
4345
4346		/* Zero to reuse the info data structure */
4347		bzero(&group_info, sizeof (group_info));
4348
4349		if (pseudo_txgrp) {
4350			/*
4351			 * This is a pseudo group that we created, apart
4352			 * from setting the state there is nothing to be
4353			 * done.
4354			 */
4355			group->mrg_state = MAC_GROUP_STATE_REGISTERED;
4356			group_free++;
4357			continue;
4358		}
4359		/* Query group information from driver */
4360		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
4361		    (mac_group_handle_t)group);
4362
4363		switch (cap_rings->mr_group_type) {
4364		case MAC_GROUP_TYPE_DYNAMIC:
4365			if (cap_rings->mr_gaddring == NULL ||
4366			    cap_rings->mr_gremring == NULL) {
4367				DTRACE_PROBE3(
4368				    mac__init__rings_no_addremring,
4369				    char *, mip->mi_name,
4370				    mac_group_add_ring_t,
4371				    cap_rings->mr_gaddring,
4372				    mac_group_add_ring_t,
4373				    cap_rings->mr_gremring);
4374				err = EINVAL;
4375				goto bail;
4376			}
4377
4378			switch (rtype) {
4379			case MAC_RING_TYPE_RX:
4380				/*
4381				 * The first RX group must have non-zero
4382				 * rings, and the following groups must
4383				 * have zero rings.
4384				 */
4385				if (g == 0 && group_info.mgi_count == 0) {
4386					DTRACE_PROBE1(
4387					    mac__init__rings__rx__def__zero,
4388					    char *, mip->mi_name);
4389					err = EINVAL;
4390					goto bail;
4391				}
4392				if (g > 0 && group_info.mgi_count != 0) {
4393					DTRACE_PROBE3(
4394					    mac__init__rings__rx__nonzero,
4395					    char *, mip->mi_name,
4396					    int, g, int, group_info.mgi_count);
4397					err = EINVAL;
4398					goto bail;
4399				}
4400				break;
4401			case MAC_RING_TYPE_TX:
4402				/*
4403				 * All TX ring groups must have zero rings.
4404				 */
4405				if (group_info.mgi_count != 0) {
4406					DTRACE_PROBE3(
4407					    mac__init__rings__tx__nonzero,
4408					    char *, mip->mi_name,
4409					    int, g, int, group_info.mgi_count);
4410					err = EINVAL;
4411					goto bail;
4412				}
4413				break;
4414			}
4415			break;
4416		case MAC_GROUP_TYPE_STATIC:
4417			/*
4418			 * Note that an empty group is allowed, e.g., an aggr
4419			 * would start with an empty group.
4420			 */
4421			break;
4422		default:
4423			/* unknown group type */
4424			DTRACE_PROBE2(mac__init__rings__unknown__type,
4425			    char *, mip->mi_name,
4426			    int, cap_rings->mr_group_type);
4427			err = EINVAL;
4428			goto bail;
4429		}
4430
4431
4432		/*
4433		 * The driver must register some form of hardware MAC
4434		 * filter in order for Rx groups to support multiple
4435		 * MAC addresses.
4436		 */
4437		if (rtype == MAC_RING_TYPE_RX &&
4438		    (group_info.mgi_addmac == NULL ||
4439		    group_info.mgi_remmac == NULL)) {
4440			DTRACE_PROBE1(mac__init__rings__no__mac__filter,
4441			    char *, mip->mi_name);
4442			err = EINVAL;
4443			goto bail;
4444		}
4445
4446		/* Cache driver-supplied information */
4447		group->mrg_info = group_info;
4448
4449		/* Update the group's status and group count. */
4450		mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
4451		group_free++;
4452
4453		group->mrg_rings = NULL;
4454		group->mrg_cur_count = 0;
4455		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
4456		ring_left -= group_info.mgi_count;
4457
4458		/* The current group size should be equal to default value */
4459		ASSERT(group->mrg_cur_count == group_info.mgi_count);
4460	}
4461
4462	/* Build up a dummy group for free resources as a pool */
4463	group = groups + grpcnt;
4464
4465	/* Prepare basic information of the group */
4466	group->mrg_index = -1;
4467	group->mrg_type = rtype;
4468	group->mrg_state = MAC_GROUP_STATE_UNINIT;
4469	group->mrg_mh = (mac_handle_t)mip;
4470	group->mrg_next = NULL;
4471
4472	/*
4473	 * If there are ungrouped rings, allocate a continuous buffer for
4474	 * remaining resources.
4475	 */
4476	if (ring_left != 0) {
4477		group->mrg_rings = NULL;
4478		group->mrg_cur_count = 0;
4479		mac_init_group(mip, group, ring_left, cap_rings);
4480
4481		/* The current group size should be equal to ring_left */
4482		ASSERT(group->mrg_cur_count == ring_left);
4483
4484		ring_left = 0;
4485
4486		/* Update this group's status */
4487		mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED);
4488	} else {
4489		group->mrg_rings = NULL;
4490	}
4491
4492	ASSERT(ring_left == 0);
4493
4494bail:
4495
4496	/* Cache other important information to finalize the initialization */
4497	switch (rtype) {
4498	case MAC_RING_TYPE_RX:
4499		mip->mi_rx_group_type = cap_rings->mr_group_type;
4500		mip->mi_rx_group_count = cap_rings->mr_gnum;
4501		mip->mi_rx_groups = groups;
4502		mip->mi_rx_donor_grp = groups;
4503		if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
4504			/*
4505			 * The default ring is reserved since it is
4506			 * used for sending the broadcast etc. packets.
4507			 */
4508			mip->mi_rxrings_avail =
4509			    mip->mi_rx_groups->mrg_cur_count - 1;
4510			mip->mi_rxrings_rsvd = 1;
4511		}
4512		/*
4513		 * The default group cannot be reserved. It is used by
4514		 * all the clients that do not have an exclusive group.
4515		 */
4516		mip->mi_rxhwclnt_avail = mip->mi_rx_group_count - 1;
4517		mip->mi_rxhwclnt_used = 1;
4518		break;
4519	case MAC_RING_TYPE_TX:
4520		mip->mi_tx_group_type = pseudo_txgrp ? MAC_GROUP_TYPE_DYNAMIC :
4521		    cap_rings->mr_group_type;
4522		mip->mi_tx_group_count = grpcnt;
4523		mip->mi_tx_group_free = group_free;
4524		mip->mi_tx_groups = groups;
4525
4526		group = groups + grpcnt;
4527		ring = group->mrg_rings;
4528		/*
4529		 * The ring can be NULL in the case of aggr. Aggr will
4530		 * have an empty Tx group which will get populated
4531		 * later when pseudo Tx rings are added after
4532		 * mac_register() is done.
4533		 */
4534		if (ring == NULL) {
4535			ASSERT(mip->mi_state_flags & MIS_IS_AGGR);
4536			/*
4537			 * pass the group to aggr so it can add Tx
4538			 * rings to the group later.
4539			 */
4540			cap_rings->mr_gget(mip->mi_driver, rtype, 0, NULL,
4541			    (mac_group_handle_t)group);
4542			/*
4543			 * Even though there are no rings at this time
4544			 * (rings will come later), set the group
4545			 * state to registered.
4546			 */
4547			group->mrg_state = MAC_GROUP_STATE_REGISTERED;
4548		} else {
4549			/*
4550			 * Ring 0 is used as the default one and it could be
4551			 * assigned to a client as well.
4552			 */
4553			while ((ring->mr_index != 0) && (ring->mr_next != NULL))
4554				ring = ring->mr_next;
4555			ASSERT(ring->mr_index == 0);
4556			mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
4557		}
4558		if (mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
4559			mip->mi_txrings_avail = group->mrg_cur_count - 1;
4560			/*
4561			 * The default ring cannot be reserved.
4562			 */
4563			mip->mi_txrings_rsvd = 1;
4564		}
4565		/*
4566		 * The default group cannot be reserved. It will be shared
4567		 * by clients that do not have an exclusive group.
4568		 */
4569		mip->mi_txhwclnt_avail = mip->mi_tx_group_count;
4570		mip->mi_txhwclnt_used = 1;
4571		break;
4572	default:
4573		ASSERT(B_FALSE);
4574	}
4575
4576	if (err != 0)
4577		mac_free_rings(mip, rtype);
4578
4579	return (err);
4580}
4581
4582/*
4583 * The ddi interrupt handle could be shared amoung rings. If so, compare
4584 * the new ring's ddi handle with the existing ones and set ddi_shared
4585 * flag.
4586 */
4587void
4588mac_compare_ddi_handle(mac_group_t *groups, uint_t grpcnt, mac_ring_t *cring)
4589{
4590	mac_group_t *group;
4591	mac_ring_t *ring;
4592	ddi_intr_handle_t ddi_handle;
4593	int g;
4594
4595	ddi_handle = cring->mr_info.mri_intr.mi_ddi_handle;
4596	for (g = 0; g < grpcnt; g++) {
4597		group = groups + g;
4598		for (ring = group->mrg_rings; ring != NULL;
4599		    ring = ring->mr_next) {
4600			if (ring == cring)
4601				continue;
4602			if (ring->mr_info.mri_intr.mi_ddi_handle ==
4603			    ddi_handle) {
4604				if (cring->mr_type == MAC_RING_TYPE_RX &&
4605				    ring->mr_index == 0 &&
4606				    !ring->mr_info.mri_intr.mi_ddi_shared) {
4607					ring->mr_info.mri_intr.mi_ddi_shared =
4608					    B_TRUE;
4609				} else {
4610					cring->mr_info.mri_intr.mi_ddi_shared =
4611					    B_TRUE;
4612				}
4613				return;
4614			}
4615		}
4616	}
4617}
4618
4619/*
4620 * Called to free all groups of particular type (RX or TX). It's assumed that
4621 * no clients are using these groups.
4622 */
4623void
4624mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
4625{
4626	mac_group_t *group, *groups;
4627	uint_t group_count;
4628
4629	switch (rtype) {
4630	case MAC_RING_TYPE_RX:
4631		if (mip->mi_rx_groups == NULL)
4632			return;
4633
4634		groups = mip->mi_rx_groups;
4635		group_count = mip->mi_rx_group_count;
4636
4637		mip->mi_rx_groups = NULL;
4638		mip->mi_rx_donor_grp = NULL;
4639		mip->mi_rx_group_count = 0;
4640		break;
4641	case MAC_RING_TYPE_TX:
4642		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
4643
4644		if (mip->mi_tx_groups == NULL)
4645			return;
4646
4647		groups = mip->mi_tx_groups;
4648		group_count = mip->mi_tx_group_count;
4649
4650		mip->mi_tx_groups = NULL;
4651		mip->mi_tx_group_count = 0;
4652		mip->mi_tx_group_free = 0;
4653		mip->mi_default_tx_ring = NULL;
4654		break;
4655	default:
4656		ASSERT(B_FALSE);
4657	}
4658
4659	for (group = groups; group != NULL; group = group->mrg_next) {
4660		mac_ring_t *ring;
4661
4662		if (group->mrg_cur_count == 0)
4663			continue;
4664
4665		ASSERT(group->mrg_rings != NULL);
4666
4667		while ((ring = group->mrg_rings) != NULL) {
4668			group->mrg_rings = ring->mr_next;
4669			mac_ring_free(mip, ring);
4670		}
4671	}
4672
4673	/* Free all the cached rings */
4674	mac_ring_freeall(mip);
4675	/* Free the block of group data strutures */
4676	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
4677}
4678
4679/*
4680 * Associate the VLAN filter to the receive group.
4681 */
4682int
4683mac_group_addvlan(mac_group_t *group, uint16_t vlan)
4684{
4685	VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
4686	VERIFY3P(group->mrg_info.mgi_addvlan, !=, NULL);
4687
4688	if (vlan > VLAN_ID_MAX)
4689		return (EINVAL);
4690
4691	vlan = MAC_VLAN_UNTAGGED_VID(vlan);
4692	return (group->mrg_info.mgi_addvlan(group->mrg_info.mgi_driver, vlan));
4693}
4694
4695/*
4696 * Dissociate the VLAN from the receive group.
4697 */
4698int
4699mac_group_remvlan(mac_group_t *group, uint16_t vlan)
4700{
4701	VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
4702	VERIFY3P(group->mrg_info.mgi_remvlan, !=, NULL);
4703
4704	if (vlan > VLAN_ID_MAX)
4705		return (EINVAL);
4706
4707	vlan = MAC_VLAN_UNTAGGED_VID(vlan);
4708	return (group->mrg_info.mgi_remvlan(group->mrg_info.mgi_driver, vlan));
4709}
4710
4711/*
4712 * Associate a MAC address with a receive group.
4713 *
4714 * The return value of this function should always be checked properly, because
4715 * any type of failure could cause unexpected results. A group can be added
4716 * or removed with a MAC address only after it has been reserved. Ideally,
4717 * a successful reservation always leads to calling mac_group_addmac() to
4718 * steer desired traffic. Failure of adding an unicast MAC address doesn't
4719 * always imply that the group is functioning abnormally.
4720 *
4721 * Currently this function is called everywhere, and it reflects assumptions
4722 * about MAC addresses in the implementation. CR 6735196.
4723 */
4724int
4725mac_group_addmac(mac_group_t *group, const uint8_t *addr)
4726{
4727	VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
4728	VERIFY3P(group->mrg_info.mgi_addmac, !=, NULL);
4729
4730	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
4731}
4732
4733/*
4734 * Remove the association between MAC address and receive group.
4735 */
4736int
4737mac_group_remmac(mac_group_t *group, const uint8_t *addr)
4738{
4739	VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX);
4740	VERIFY3P(group->mrg_info.mgi_remmac, !=, NULL);
4741
4742	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
4743}
4744
4745/*
4746 * This is the entry point for packets transmitted through the bridge
4747 * code. If no bridge is in place, mac_ring_tx() transmits via the tx
4748 * ring. The 'rh' pointer may be NULL to select the default ring.
4749 */
4750mblk_t *
4751mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
4752{
4753	mac_handle_t mh;
4754
4755	/*
4756	 * Once we take a reference on the bridge link, the bridge
4757	 * module itself can't unload, so the callback pointers are
4758	 * stable.
4759	 */
4760	mutex_enter(&mip->mi_bridge_lock);
4761	if ((mh = mip->mi_bridge_link) != NULL)
4762		mac_bridge_ref_cb(mh, B_TRUE);
4763	mutex_exit(&mip->mi_bridge_lock);
4764	if (mh == NULL) {
4765		mp = mac_ring_tx((mac_handle_t)mip, rh, mp);
4766	} else {
4767		/*
4768		 * The bridge may place this mblk on a provider's Tx
4769		 * path, a mac's Rx path, or both. Since we don't have
4770		 * enough information at this point, we can't be sure
4771		 * that the destination(s) are capable of handling the
4772		 * hardware offloads requested by the mblk. We emulate
4773		 * them here as it is the safest choice. In the
4774		 * future, if bridge performance becomes a priority,
4775		 * we can elide the emulation here and leave the
4776		 * choice up to bridge.
4777		 *
4778		 * We don't clear the DB_CKSUMFLAGS here because
4779		 * HCK_IPV4_HDRCKSUM (Tx) and HCK_IPV4_HDRCKSUM_OK
4780		 * (Rx) still have the same value. If the bridge
4781		 * receives a packet from a HCKSUM_IPHDRCKSUM NIC then
4782		 * the mac(s) it is forwarded on may calculate the
4783		 * checksum again, but incorrectly (because the
4784		 * checksum field is not zero). Until the
4785		 * HCK_IPV4_HDRCKSUM/HCK_IPV4_HDRCKSUM_OK issue is
4786		 * resovled, we leave the flag clearing in bridge
4787		 * itself.
4788		 */
4789		if ((DB_CKSUMFLAGS(mp) & (HCK_TX_FLAGS | HW_LSO_FLAGS)) != 0) {
4790			mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS);
4791		}
4792
4793		mp = mac_bridge_tx_cb(mh, rh, mp);
4794		mac_bridge_ref_cb(mh, B_FALSE);
4795	}
4796
4797	return (mp);
4798}
4799
4800/*
4801 * Find a ring from its index.
4802 */
4803mac_ring_handle_t
4804mac_find_ring(mac_group_handle_t gh, int index)
4805{
4806	mac_group_t *group = (mac_group_t *)gh;
4807	mac_ring_t *ring = group->mrg_rings;
4808
4809	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
4810		if (ring->mr_index == index)
4811			break;
4812
4813	return ((mac_ring_handle_t)ring);
4814}
4815/*
4816 * Add a ring to an existing group.
4817 *
4818 * The ring must be either passed directly (for example if the ring
4819 * movement is initiated by the framework), or specified through a driver
4820 * index (for example when the ring is added by the driver.
4821 *
4822 * The caller needs to call mac_perim_enter() before calling this function.
4823 */
4824int
4825i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
4826{
4827	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
4828	mac_capab_rings_t *cap_rings;
4829	boolean_t driver_call = (ring == NULL);
4830	mac_group_type_t group_type;
4831	int ret = 0;
4832	flow_entry_t *flent;
4833
4834	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4835
4836	switch (group->mrg_type) {
4837	case MAC_RING_TYPE_RX:
4838		cap_rings = &mip->mi_rx_rings_cap;
4839		group_type = mip->mi_rx_group_type;
4840		break;
4841	case MAC_RING_TYPE_TX:
4842		cap_rings = &mip->mi_tx_rings_cap;
4843		group_type = mip->mi_tx_group_type;
4844		break;
4845	default:
4846		ASSERT(B_FALSE);
4847	}
4848
4849	/*
4850	 * There should be no ring with the same ring index in the target
4851	 * group.
4852	 */
4853	ASSERT(mac_find_ring((mac_group_handle_t)group,
4854	    driver_call ? index : ring->mr_index) == NULL);
4855
4856	if (driver_call) {
4857		/*
4858		 * The function is called as a result of a request from
4859		 * a driver to add a ring to an existing group, for example
4860		 * from the aggregation driver. Allocate a new mac_ring_t
4861		 * for that ring.
4862		 */
4863		ring = mac_init_ring(mip, group, index, cap_rings);
4864		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
4865	} else {
4866		/*
4867		 * The function is called as a result of a MAC layer request
4868		 * to add a ring to an existing group. In this case the
4869		 * ring is being moved between groups, which requires
4870		 * the underlying driver to support dynamic grouping,
4871		 * and the mac_ring_t already exists.
4872		 */
4873		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
4874		ASSERT(group->mrg_driver == NULL ||
4875		    cap_rings->mr_gaddring != NULL);
4876		ASSERT(ring->mr_gh == NULL);
4877	}
4878
4879	/*
4880	 * At this point the ring should not be in use, and it should be
4881	 * of the right for the target group.
4882	 */
4883	ASSERT(ring->mr_state < MR_INUSE);
4884	ASSERT(ring->mr_srs == NULL);
4885	ASSERT(ring->mr_type == group->mrg_type);
4886
4887	if (!driver_call) {
4888		/*
4889		 * Add the driver level hardware ring if the process was not
4890		 * initiated by the driver, and the target group is not the
4891		 * group.
4892		 */
4893		if (group->mrg_driver != NULL) {
4894			cap_rings->mr_gaddring(group->mrg_driver,
4895			    ring->mr_driver, ring->mr_type);
4896		}
4897
4898		/*
4899		 * Insert the ring ahead existing rings.
4900		 */
4901		ring->mr_next = group->mrg_rings;
4902		group->mrg_rings = ring;
4903		ring->mr_gh = (mac_group_handle_t)group;
4904		group->mrg_cur_count++;
4905	}
4906
4907	/*
4908	 * If the group has not been actively used, we're done.
4909	 */
4910	if (group->mrg_index != -1 &&
4911	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
4912		return (0);
4913
4914	/*
4915	 * Start the ring if needed. Failure causes to undo the grouping action.
4916	 */
4917	if (ring->mr_state != MR_INUSE) {
4918		if ((ret = mac_start_ring(ring)) != 0) {
4919			if (!driver_call) {
4920				cap_rings->mr_gremring(group->mrg_driver,
4921				    ring->mr_driver, ring->mr_type);
4922			}
4923			group->mrg_cur_count--;
4924			group->mrg_rings = ring->mr_next;
4925
4926			ring->mr_gh = NULL;
4927
4928			if (driver_call)
4929				mac_ring_free(mip, ring);
4930
4931			return (ret);
4932		}
4933	}
4934
4935	/*
4936	 * Set up SRS/SR according to the ring type.
4937	 */
4938	switch (ring->mr_type) {
4939	case MAC_RING_TYPE_RX:
4940		/*
4941		 * Setup an SRS on top of the new ring if the group is
4942		 * reserved for someone's exclusive use.
4943		 */
4944		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
4945			mac_client_impl_t *mcip =  MAC_GROUP_ONLY_CLIENT(group);
4946
4947			VERIFY3P(mcip, !=, NULL);
4948			flent = mcip->mci_flent;
4949			VERIFY3S(flent->fe_rx_srs_cnt, >, 0);
4950			mac_rx_srs_group_setup(mcip, flent, SRST_LINK);
4951			mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
4952			    mac_rx_deliver, mcip, NULL, NULL);
4953		} else {
4954			ring->mr_classify_type = MAC_SW_CLASSIFIER;
4955		}
4956		break;
4957	case MAC_RING_TYPE_TX:
4958	{
4959		mac_grp_client_t	*mgcp = group->mrg_clients;
4960		mac_client_impl_t	*mcip;
4961		mac_soft_ring_set_t	*mac_srs;
4962		mac_srs_tx_t		*tx;
4963
4964		if (MAC_GROUP_NO_CLIENT(group)) {
4965			if (ring->mr_state == MR_INUSE)
4966				mac_stop_ring(ring);
4967			ring->mr_flag = 0;
4968			break;
4969		}
4970		/*
4971		 * If the rings are being moved to a group that has
4972		 * clients using it, then add the new rings to the
4973		 * clients SRS.
4974		 */
4975		while (mgcp != NULL) {
4976			boolean_t	is_aggr;
4977
4978			mcip = mgcp->mgc_client;
4979			flent = mcip->mci_flent;
4980			is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT);
4981			mac_srs = MCIP_TX_SRS(mcip);
4982			tx = &mac_srs->srs_tx;
4983			mac_tx_client_quiesce((mac_client_handle_t)mcip);
4984			/*
4985			 * If we are  growing from 1 to multiple rings.
4986			 */
4987			if (tx->st_mode == SRS_TX_BW ||
4988			    tx->st_mode == SRS_TX_SERIALIZE ||
4989			    tx->st_mode == SRS_TX_DEFAULT) {
4990				mac_ring_t	*tx_ring = tx->st_arg2;
4991
4992				tx->st_arg2 = NULL;
4993				mac_tx_srs_stat_recreate(mac_srs, B_TRUE);
4994				mac_tx_srs_add_ring(mac_srs, tx_ring);
4995				if (mac_srs->srs_type & SRST_BW_CONTROL) {
4996					tx->st_mode = is_aggr ? SRS_TX_BW_AGGR :
4997					    SRS_TX_BW_FANOUT;
4998				} else {
4999					tx->st_mode = is_aggr ? SRS_TX_AGGR :
5000					    SRS_TX_FANOUT;
5001				}
5002				tx->st_func = mac_tx_get_func(tx->st_mode);
5003			}
5004			mac_tx_srs_add_ring(mac_srs, ring);
5005			mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
5006			    mac_rx_deliver, mcip, NULL, NULL);
5007			mac_tx_client_restart((mac_client_handle_t)mcip);
5008			mgcp = mgcp->mgc_next;
5009		}
5010		break;
5011	}
5012	default:
5013		ASSERT(B_FALSE);
5014	}
5015	/*
5016	 * For aggr, the default ring will be NULL to begin with. If it
5017	 * is NULL, then pick the first ring that gets added as the
5018	 * default ring. Any ring in an aggregation can be removed at
5019	 * any time (by the user action of removing a link) and if the
5020	 * current default ring gets removed, then a new one gets
5021	 * picked (see i_mac_group_rem_ring()).
5022	 */
5023	if (mip->mi_state_flags & MIS_IS_AGGR &&
5024	    mip->mi_default_tx_ring == NULL &&
5025	    ring->mr_type == MAC_RING_TYPE_TX) {
5026		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
5027	}
5028
5029	MAC_RING_UNMARK(ring, MR_INCIPIENT);
5030	return (0);
5031}
5032
5033/*
5034 * Remove a ring from it's current group. MAC internal function for dynamic
5035 * grouping.
5036 *
5037 * The caller needs to call mac_perim_enter() before calling this function.
5038 */
5039void
5040i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
5041    boolean_t driver_call)
5042{
5043	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
5044	mac_capab_rings_t *cap_rings = NULL;
5045	mac_group_type_t group_type;
5046
5047	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5048
5049	ASSERT(mac_find_ring((mac_group_handle_t)group,
5050	    ring->mr_index) == (mac_ring_handle_t)ring);
5051	ASSERT((mac_group_t *)ring->mr_gh == group);
5052	ASSERT(ring->mr_type == group->mrg_type);
5053
5054	if (ring->mr_state == MR_INUSE)
5055		mac_stop_ring(ring);
5056	switch (ring->mr_type) {
5057	case MAC_RING_TYPE_RX:
5058		group_type = mip->mi_rx_group_type;
5059		cap_rings = &mip->mi_rx_rings_cap;
5060
5061		/*
5062		 * Only hardware classified packets hold a reference to the
5063		 * ring all the way up the Rx path. mac_rx_srs_remove()
5064		 * will take care of quiescing the Rx path and removing the
5065		 * SRS. The software classified path neither holds a reference
5066		 * nor any association with the ring in mac_rx.
5067		 */
5068		if (ring->mr_srs != NULL) {
5069			mac_rx_srs_remove(ring->mr_srs);
5070			ring->mr_srs = NULL;
5071		}
5072
5073		break;
5074	case MAC_RING_TYPE_TX:
5075	{
5076		mac_grp_client_t	*mgcp;
5077		mac_client_impl_t	*mcip;
5078		mac_soft_ring_set_t	*mac_srs;
5079		mac_srs_tx_t		*tx;
5080		mac_ring_t		*rem_ring;
5081		mac_group_t		*defgrp;
5082		uint_t			ring_info = 0;
5083
5084		/*
5085		 * For TX this function is invoked in three
5086		 * cases:
5087		 *
5088		 * 1) In the case of a failure during the
5089		 * initial creation of a group when a share is
5090		 * associated with a MAC client. So the SRS is not
5091		 * yet setup, and will be setup later after the
5092		 * group has been reserved and populated.
5093		 *
5094		 * 2) From mac_release_tx_group() when freeing
5095		 * a TX SRS.
5096		 *
5097		 * 3) In the case of aggr, when a port gets removed,
5098		 * the pseudo Tx rings that it exposed gets removed.
5099		 *
5100		 * In the first two cases the SRS and its soft
5101		 * rings are already quiesced.
5102		 */
5103		if (driver_call) {
5104			mac_client_impl_t *mcip;
5105			mac_soft_ring_set_t *mac_srs;
5106			mac_soft_ring_t *sringp;
5107			mac_srs_tx_t *srs_tx;
5108
5109			if (mip->mi_state_flags & MIS_IS_AGGR &&
5110			    mip->mi_default_tx_ring ==
5111			    (mac_ring_handle_t)ring) {
5112				/* pick a new default Tx ring */
5113				mip->mi_default_tx_ring =
5114				    (group->mrg_rings != ring) ?
5115				    (mac_ring_handle_t)group->mrg_rings :
5116				    (mac_ring_handle_t)(ring->mr_next);
5117			}
5118			/* Presently only aggr case comes here */
5119			if (group->mrg_state != MAC_GROUP_STATE_RESERVED)
5120				break;
5121
5122			mcip = MAC_GROUP_ONLY_CLIENT(group);
5123			ASSERT(mcip != NULL);
5124			ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT);
5125			mac_srs = MCIP_TX_SRS(mcip);
5126			ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
5127			    mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
5128			srs_tx = &mac_srs->srs_tx;
5129			/*
5130			 * Wakeup any callers blocked on this
5131			 * Tx ring due to flow control.
5132			 */
5133			sringp = srs_tx->st_soft_rings[ring->mr_index];
5134			ASSERT(sringp != NULL);
5135			mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)sringp);
5136			mac_tx_client_quiesce((mac_client_handle_t)mcip);
5137			mac_tx_srs_del_ring(mac_srs, ring);
5138			mac_tx_client_restart((mac_client_handle_t)mcip);
5139			break;
5140		}
5141		ASSERT(ring != (mac_ring_t *)mip->mi_default_tx_ring);
5142		group_type = mip->mi_tx_group_type;
5143		cap_rings = &mip->mi_tx_rings_cap;
5144		/*
5145		 * See if we need to take it out of the MAC clients using
5146		 * this group
5147		 */
5148		if (MAC_GROUP_NO_CLIENT(group))
5149			break;
5150		mgcp = group->mrg_clients;
5151		defgrp = MAC_DEFAULT_TX_GROUP(mip);
5152		while (mgcp != NULL) {
5153			mcip = mgcp->mgc_client;
5154			mac_srs = MCIP_TX_SRS(mcip);
5155			tx = &mac_srs->srs_tx;
5156			mac_tx_client_quiesce((mac_client_handle_t)mcip);
5157			/*
5158			 * If we are here when removing rings from the
5159			 * defgroup, mac_reserve_tx_ring would have
5160			 * already deleted the ring from the MAC
5161			 * clients in the group.
5162			 */
5163			if (group != defgrp) {
5164				mac_tx_invoke_callbacks(mcip,
5165				    (mac_tx_cookie_t)
5166				    mac_tx_srs_get_soft_ring(mac_srs, ring));
5167				mac_tx_srs_del_ring(mac_srs, ring);
5168			}
5169			/*
5170			 * Additionally, if  we are left with only
5171			 * one ring in the group after this, we need
5172			 * to modify the mode etc. to. (We haven't
5173			 * yet taken the ring out, so we check with 2).
5174			 */
5175			if (group->mrg_cur_count == 2) {
5176				if (ring->mr_next == NULL)
5177					rem_ring = group->mrg_rings;
5178				else
5179					rem_ring = ring->mr_next;
5180				mac_tx_invoke_callbacks(mcip,
5181				    (mac_tx_cookie_t)
5182				    mac_tx_srs_get_soft_ring(mac_srs,
5183				    rem_ring));
5184				mac_tx_srs_del_ring(mac_srs, rem_ring);
5185				if (rem_ring->mr_state != MR_INUSE) {
5186					(void) mac_start_ring(rem_ring);
5187				}
5188				tx->st_arg2 = (void *)rem_ring;
5189				mac_tx_srs_stat_recreate(mac_srs, B_FALSE);
5190				ring_info = mac_hwring_getinfo(
5191				    (mac_ring_handle_t)rem_ring);
5192				/*
5193				 * We are  shrinking from multiple
5194				 * to 1 ring.
5195				 */
5196				if (mac_srs->srs_type & SRST_BW_CONTROL) {
5197					tx->st_mode = SRS_TX_BW;
5198				} else if (mac_tx_serialize ||
5199				    (ring_info & MAC_RING_TX_SERIALIZE)) {
5200					tx->st_mode = SRS_TX_SERIALIZE;
5201				} else {
5202					tx->st_mode = SRS_TX_DEFAULT;
5203				}
5204				tx->st_func = mac_tx_get_func(tx->st_mode);
5205			}
5206			mac_tx_client_restart((mac_client_handle_t)mcip);
5207			mgcp = mgcp->mgc_next;
5208		}
5209		break;
5210	}
5211	default:
5212		ASSERT(B_FALSE);
5213	}
5214
5215	/*
5216	 * Remove the ring from the group.
5217	 */
5218	if (ring == group->mrg_rings)
5219		group->mrg_rings = ring->mr_next;
5220	else {
5221		mac_ring_t *pre;
5222
5223		pre = group->mrg_rings;
5224		while (pre->mr_next != ring)
5225			pre = pre->mr_next;
5226		pre->mr_next = ring->mr_next;
5227	}
5228	group->mrg_cur_count--;
5229
5230	if (!driver_call) {
5231		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
5232		ASSERT(group->mrg_driver == NULL ||
5233		    cap_rings->mr_gremring != NULL);
5234
5235		/*
5236		 * Remove the driver level hardware ring.
5237		 */
5238		if (group->mrg_driver != NULL) {
5239			cap_rings->mr_gremring(group->mrg_driver,
5240			    ring->mr_driver, ring->mr_type);
5241		}
5242	}
5243
5244	ring->mr_gh = NULL;
5245	if (driver_call)
5246		mac_ring_free(mip, ring);
5247	else
5248		ring->mr_flag = 0;
5249}
5250
5251/*
5252 * Move a ring to the target group. If needed, remove the ring from the group
5253 * that it currently belongs to.
5254 *
5255 * The caller need to enter MAC's perimeter by calling mac_perim_enter().
5256 */
5257static int
5258mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
5259{
5260	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
5261	int rv;
5262
5263	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5264	ASSERT(d_group != NULL);
5265	ASSERT(s_group == NULL || s_group->mrg_mh == d_group->mrg_mh);
5266
5267	if (s_group == d_group)
5268		return (0);
5269
5270	/*
5271	 * Remove it from current group first.
5272	 */
5273	if (s_group != NULL)
5274		i_mac_group_rem_ring(s_group, ring, B_FALSE);
5275
5276	/*
5277	 * Add it to the new group.
5278	 */
5279	rv = i_mac_group_add_ring(d_group, ring, 0);
5280	if (rv != 0) {
5281		/*
5282		 * Failed to add ring back to source group. If
5283		 * that fails, the ring is stuck in limbo, log message.
5284		 */
5285		if (i_mac_group_add_ring(s_group, ring, 0)) {
5286			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
5287			    mip->mi_name, (void *)ring);
5288		}
5289	}
5290
5291	return (rv);
5292}
5293
5294/*
5295 * Find a MAC address according to its value.
5296 */
5297mac_address_t *
5298mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
5299{
5300	mac_address_t *map;
5301
5302	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5303
5304	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
5305		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
5306			break;
5307	}
5308
5309	return (map);
5310}
5311
5312/*
5313 * Check whether the MAC address is shared by multiple clients.
5314 */
5315boolean_t
5316mac_check_macaddr_shared(mac_address_t *map)
5317{
5318	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
5319
5320	return (map->ma_nusers > 1);
5321}
5322
5323/*
5324 * Remove the specified MAC address from the MAC address list and free it.
5325 */
5326static void
5327mac_free_macaddr(mac_address_t *map)
5328{
5329	mac_impl_t *mip = map->ma_mip;
5330
5331	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5332	VERIFY3P(mip->mi_addresses, !=, NULL);
5333
5334	VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr));
5335	VERIFY3P(map, !=, NULL);
5336	VERIFY3S(map->ma_nusers, ==, 0);
5337	VERIFY3P(map->ma_vlans, ==, NULL);
5338
5339	if (map == mip->mi_addresses) {
5340		mip->mi_addresses = map->ma_next;
5341	} else {
5342		mac_address_t *pre;
5343
5344		pre = mip->mi_addresses;
5345		while (pre->ma_next != map)
5346			pre = pre->ma_next;
5347		pre->ma_next = map->ma_next;
5348	}
5349
5350	kmem_free(map, sizeof (mac_address_t));
5351}
5352
5353static mac_vlan_t *
5354mac_find_vlan(mac_address_t *map, uint16_t vid)
5355{
5356	mac_vlan_t *mvp;
5357
5358	for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) {
5359		if (mvp->mv_vid == vid)
5360			return (mvp);
5361	}
5362
5363	return (NULL);
5364}
5365
5366static mac_vlan_t *
5367mac_add_vlan(mac_address_t *map, uint16_t vid)
5368{
5369	mac_vlan_t *mvp;
5370
5371	/*
5372	 * We should never add the same {addr, VID} tuple more
5373	 * than once, but let's be sure.
5374	 */
5375	for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next)
5376		VERIFY3U(mvp->mv_vid, !=, vid);
5377
5378	/* Add the VLAN to the head of the VLAN list. */
5379	mvp = kmem_zalloc(sizeof (mac_vlan_t), KM_SLEEP);
5380	mvp->mv_vid = vid;
5381	mvp->mv_next = map->ma_vlans;
5382	map->ma_vlans = mvp;
5383
5384	return (mvp);
5385}
5386
5387static void
5388mac_rem_vlan(mac_address_t *map, mac_vlan_t *mvp)
5389{
5390	mac_vlan_t *pre;
5391
5392	if (map->ma_vlans == mvp) {
5393		map->ma_vlans = mvp->mv_next;
5394	} else {
5395		pre = map->ma_vlans;
5396		while (pre->mv_next != mvp) {
5397			pre = pre->mv_next;
5398
5399			/*
5400			 * We've reached the end of the list without
5401			 * finding mvp.
5402			 */
5403			VERIFY3P(pre, !=, NULL);
5404		}
5405		pre->mv_next = mvp->mv_next;
5406	}
5407
5408	kmem_free(mvp, sizeof (mac_vlan_t));
5409}
5410
5411/*
5412 * Create a new mac_address_t if this is the first use of the address
5413 * or add a VID to an existing address. In either case, the
5414 * mac_address_t acts as a list of {addr, VID} tuples where each tuple
5415 * shares the same addr. If group is non-NULL then attempt to program
5416 * the MAC's HW filters for this group. Otherwise, if group is NULL,
5417 * then the MAC has no rings and there is nothing to program.
5418 */
5419int
5420mac_add_macaddr_vlan(mac_impl_t *mip, mac_group_t *group, uint8_t *addr,
5421    uint16_t vid, boolean_t use_hw)
5422{
5423	mac_address_t	*map;
5424	mac_vlan_t	*mvp;
5425	int		err = 0;
5426	boolean_t	allocated_map = B_FALSE;
5427	boolean_t	hw_mac = B_FALSE;
5428	boolean_t	hw_vlan = B_FALSE;
5429
5430	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5431
5432	map = mac_find_macaddr(mip, addr);
5433
5434	/*
5435	 * If this is the first use of this MAC address then allocate
5436	 * and initialize a new structure.
5437	 */
5438	if (map == NULL) {
5439		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
5440		map->ma_len = mip->mi_type->mt_addr_length;
5441		bcopy(addr, map->ma_addr, map->ma_len);
5442		map->ma_nusers = 0;
5443		map->ma_group = group;
5444		map->ma_mip = mip;
5445		map->ma_untagged = B_FALSE;
5446
5447		/* Add the new MAC address to the head of the address list. */
5448		map->ma_next = mip->mi_addresses;
5449		mip->mi_addresses = map;
5450
5451		allocated_map = B_TRUE;
5452	}
5453
5454	VERIFY(map->ma_group == NULL || map->ma_group == group);
5455	if (map->ma_group == NULL)
5456		map->ma_group = group;
5457
5458	if (vid == VLAN_ID_NONE) {
5459		map->ma_untagged = B_TRUE;
5460		mvp = NULL;
5461	} else {
5462		mvp = mac_add_vlan(map, vid);
5463	}
5464
5465	/*
5466	 * Set the VLAN HW filter if:
5467	 *
5468	 * o the MAC's VLAN HW filtering is enabled, and
5469	 * o the address does not currently rely on promisc mode.
5470	 *
5471	 * This is called even when the client specifies an untagged
5472	 * address (VLAN_ID_NONE) because some MAC providers require
5473	 * setting additional bits to accept untagged traffic when
5474	 * VLAN HW filtering is enabled.
5475	 */
5476	if (MAC_GROUP_HW_VLAN(group) &&
5477	    map->ma_type != MAC_ADDRESS_TYPE_UNICAST_PROMISC) {
5478		if ((err = mac_group_addvlan(group, vid)) != 0)
5479			goto bail;
5480
5481		hw_vlan = B_TRUE;
5482	}
5483
5484	VERIFY3S(map->ma_nusers, >=, 0);
5485	map->ma_nusers++;
5486
5487	/*
5488	 * If this MAC address already has a HW filter then simply
5489	 * increment the counter.
5490	 */
5491	if (map->ma_nusers > 1)
5492		return (0);
5493
5494	/*
5495	 * All logic from here on out is executed during initial
5496	 * creation only.
5497	 */
5498	VERIFY3S(map->ma_nusers, ==, 1);
5499
5500	/*
5501	 * Activate this MAC address by adding it to the reserved group.
5502	 */
5503	if (group != NULL) {
5504		err = mac_group_addmac(group, (const uint8_t *)addr);
5505
5506		/*
5507		 * If the driver is out of filters then we can
5508		 * continue and use promisc mode. For any other error,
5509		 * assume the driver is in a state where we can't
5510		 * program the filters or use promisc mode; so we must
5511		 * bail.
5512		 */
5513		if (err != 0 && err != ENOSPC) {
5514			map->ma_nusers--;
5515			goto bail;
5516		}
5517
5518		hw_mac = (err == 0);
5519	}
5520
5521	if (hw_mac) {
5522		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5523		return (0);
5524	}
5525
5526	/*
5527	 * The MAC address addition failed. If the client requires a
5528	 * hardware classified MAC address, fail the operation. This
5529	 * feature is only used by sun4v vsw.
5530	 */
5531	if (use_hw && !hw_mac) {
5532		err = ENOSPC;
5533		map->ma_nusers--;
5534		goto bail;
5535	}
5536
5537	/*
5538	 * If we reach this point then either the MAC doesn't have
5539	 * RINGS capability or we are out of MAC address HW filters.
5540	 * In any case we must put the MAC into promiscuous mode.
5541	 */
5542	VERIFY(group == NULL || !hw_mac);
5543
5544	/*
5545	 * The one exception is the primary address. A non-RINGS
5546	 * driver filters the primary address by default; promisc mode
5547	 * is not needed.
5548	 */
5549	if ((group == NULL) &&
5550	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
5551		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5552		return (0);
5553	}
5554
5555	/*
5556	 * Enable promiscuous mode in order to receive traffic to the
5557	 * new MAC address. All existing HW filters still send their
5558	 * traffic to their respective group/SRSes. But with promisc
5559	 * enabled all unknown traffic is delivered to the default
5560	 * group where it is SW classified via mac_rx_classify().
5561	 */
5562	if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
5563		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
5564		return (0);
5565	}
5566
5567	/*
5568	 * We failed to set promisc mode and we are about to free 'map'.
5569	 */
5570	map->ma_nusers = 0;
5571
5572bail:
5573	if (hw_vlan) {
5574		int err2 = mac_group_remvlan(group, vid);
5575
5576		if (err2 != 0) {
5577			cmn_err(CE_WARN, "Failed to remove VLAN %u from group"
5578			    " %d on MAC %s: %d.", vid, group->mrg_index,
5579			    mip->mi_name, err2);
5580		}
5581	}
5582
5583	if (mvp != NULL)
5584		mac_rem_vlan(map, mvp);
5585
5586	if (allocated_map)
5587		mac_free_macaddr(map);
5588
5589	return (err);
5590}
5591
5592int
5593mac_remove_macaddr_vlan(mac_address_t *map, uint16_t vid)
5594{
5595	mac_vlan_t	*mvp;
5596	mac_impl_t	*mip = map->ma_mip;
5597	mac_group_t	*group = map->ma_group;
5598	int		err = 0;
5599
5600	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5601	VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr));
5602
5603	if (vid == VLAN_ID_NONE) {
5604		map->ma_untagged = B_FALSE;
5605		mvp = NULL;
5606	} else {
5607		mvp = mac_find_vlan(map, vid);
5608		VERIFY3P(mvp, !=, NULL);
5609	}
5610
5611	if (MAC_GROUP_HW_VLAN(group) &&
5612	    map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED &&
5613	    ((err = mac_group_remvlan(group, vid)) != 0))
5614		return (err);
5615
5616	if (mvp != NULL)
5617		mac_rem_vlan(map, mvp);
5618
5619	/*
5620	 * If it's not the last client using this MAC address, only update
5621	 * the MAC clients count.
5622	 */
5623	map->ma_nusers--;
5624	if (map->ma_nusers > 0)
5625		return (0);
5626
5627	VERIFY3S(map->ma_nusers, ==, 0);
5628
5629	/*
5630	 * The MAC address is no longer used by any MAC client, so
5631	 * remove it from its associated group. Turn off promiscuous
5632	 * mode if this is the last address relying on it.
5633	 */
5634	switch (map->ma_type) {
5635	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
5636		/*
5637		 * Don't free the preset primary address for drivers that
5638		 * don't advertise RINGS capability.
5639		 */
5640		if (group == NULL)
5641			return (0);
5642
5643		if ((err = mac_group_remmac(group, map->ma_addr)) != 0) {
5644			if (vid == VLAN_ID_NONE)
5645				map->ma_untagged = B_TRUE;
5646			else
5647				(void) mac_add_vlan(map, vid);
5648
5649			/*
5650			 * If we fail to remove the MAC address HW
5651			 * filter but then also fail to re-add the
5652			 * VLAN HW filter then we are in a busted
5653			 * state. We do our best by logging a warning
5654			 * and returning the original 'err' that got
5655			 * us here. At this point, traffic for this
5656			 * address + VLAN combination will be dropped
5657			 * until the user reboots the system. In the
5658			 * future, it would be nice to have a system
5659			 * that can compare the state of expected
5660			 * classification according to mac to the
5661			 * actual state of the provider, and report
5662			 * and fix any inconsistencies.
5663			 */
5664			if (MAC_GROUP_HW_VLAN(group)) {
5665				int err2;
5666
5667				err2 = mac_group_addvlan(group, vid);
5668				if (err2 != 0) {
5669					cmn_err(CE_WARN, "Failed to readd VLAN"
5670					    " %u to group %d on MAC %s: %d.",
5671					    vid, group->mrg_index, mip->mi_name,
5672					    err2);
5673				}
5674			}
5675
5676			map->ma_nusers = 1;
5677			return (err);
5678		}
5679
5680		map->ma_group = NULL;
5681		break;
5682	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5683		err = i_mac_promisc_set(mip, B_FALSE);
5684		break;
5685	default:
5686		panic("Unexpected ma_type 0x%x, file: %s, line %d",
5687		    map->ma_type, __FILE__, __LINE__);
5688	}
5689
5690	if (err != 0) {
5691		map->ma_nusers = 1;
5692		return (err);
5693	}
5694
5695	/*
5696	 * We created MAC address for the primary one at registration, so we
5697	 * won't free it here. mac_fini_macaddr() will take care of it.
5698	 */
5699	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
5700		mac_free_macaddr(map);
5701
5702	return (0);
5703}
5704
5705/*
5706 * Update an existing MAC address. The caller need to make sure that the new
5707 * value has not been used.
5708 */
5709int
5710mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
5711{
5712	mac_impl_t *mip = map->ma_mip;
5713	int err = 0;
5714
5715	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5716	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5717
5718	switch (map->ma_type) {
5719	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
5720		/*
5721		 * Update the primary address for drivers that are not
5722		 * RINGS capable.
5723		 */
5724		if (mip->mi_rx_groups == NULL) {
5725			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
5726			    mac_addr);
5727			if (err != 0)
5728				return (err);
5729			break;
5730		}
5731
5732		/*
5733		 * If this MAC address is not currently in use,
5734		 * simply break out and update the value.
5735		 */
5736		if (map->ma_nusers == 0)
5737			break;
5738
5739		/*
5740		 * Need to replace the MAC address associated with a group.
5741		 */
5742		err = mac_group_remmac(map->ma_group, map->ma_addr);
5743		if (err != 0)
5744			return (err);
5745
5746		err = mac_group_addmac(map->ma_group, mac_addr);
5747
5748		/*
5749		 * Failure hints hardware error. The MAC layer needs to
5750		 * have error notification facility to handle this.
5751		 * Now, simply try to restore the value.
5752		 */
5753		if (err != 0)
5754			(void) mac_group_addmac(map->ma_group, map->ma_addr);
5755
5756		break;
5757	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
5758		/*
5759		 * Need to do nothing more if in promiscuous mode.
5760		 */
5761		break;
5762	default:
5763		ASSERT(B_FALSE);
5764	}
5765
5766	/*
5767	 * Successfully replaced the MAC address.
5768	 */
5769	if (err == 0)
5770		bcopy(mac_addr, map->ma_addr, map->ma_len);
5771
5772	return (err);
5773}
5774
5775/*
5776 * Freshen the MAC address with new value. Its caller must have updated the
5777 * hardware MAC address before calling this function.
5778 * This funcitons is supposed to be used to handle the MAC address change
5779 * notification from underlying drivers.
5780 */
5781void
5782mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
5783{
5784	mac_impl_t *mip = map->ma_mip;
5785
5786	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
5787	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
5788
5789	/*
5790	 * Freshen the MAC address with new value.
5791	 */
5792	bcopy(mac_addr, map->ma_addr, map->ma_len);
5793	bcopy(mac_addr, mip->mi_addr, map->ma_len);
5794
5795	/*
5796	 * Update all MAC clients that share this MAC address.
5797	 */
5798	mac_unicast_update_clients(mip, map);
5799}
5800
5801/*
5802 * Set up the primary MAC address.
5803 */
5804void
5805mac_init_macaddr(mac_impl_t *mip)
5806{
5807	mac_address_t *map;
5808
5809	/*
5810	 * The reference count is initialized to zero, until it's really
5811	 * activated.
5812	 */
5813	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
5814	map->ma_len = mip->mi_type->mt_addr_length;
5815	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
5816
5817	/*
5818	 * If driver advertises RINGS capability, it shouldn't have initialized
5819	 * its primary MAC address. For other drivers, including VNIC, the
5820	 * primary address must work after registration.
5821	 */
5822	if (mip->mi_rx_groups == NULL)
5823		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
5824
5825	map->ma_mip = mip;
5826
5827	mip->mi_addresses = map;
5828}
5829
5830/*
5831 * Clean up the primary MAC address. Note, only one primary MAC address
5832 * is allowed. All other MAC addresses must have been freed appropriately.
5833 */
5834void
5835mac_fini_macaddr(mac_impl_t *mip)
5836{
5837	mac_address_t *map = mip->mi_addresses;
5838
5839	if (map == NULL)
5840		return;
5841
5842	/*
5843	 * If mi_addresses is initialized, there should be exactly one
5844	 * entry left on the list with no users.
5845	 */
5846	VERIFY3S(map->ma_nusers, ==, 0);
5847	VERIFY3P(map->ma_next, ==, NULL);
5848	VERIFY3P(map->ma_vlans, ==, NULL);
5849
5850	kmem_free(map, sizeof (mac_address_t));
5851	mip->mi_addresses = NULL;
5852}
5853
5854/*
5855 * Logging related functions.
5856 *
5857 * Note that Kernel statistics have been extended to maintain fine
5858 * granularity of statistics viz. hardware lane, software lane, fanout
5859 * stats etc. However, extended accounting continues to support only
5860 * aggregate statistics like before.
5861 */
5862
5863/* Write the flow description to a netinfo_t record */
5864static netinfo_t *
5865mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
5866{
5867	netinfo_t		*ninfo;
5868	net_desc_t		*ndesc;
5869	flow_desc_t		*fdesc;
5870	mac_resource_props_t	*mrp;
5871
5872	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5873	if (ninfo == NULL)
5874		return (NULL);
5875	ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5876	if (ndesc == NULL) {
5877		kmem_free(ninfo, sizeof (netinfo_t));
5878		return (NULL);
5879	}
5880
5881	/*
5882	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
5883	 * Updates to the fe_flow_desc are done under the fe_lock
5884	 */
5885	mutex_enter(&flent->fe_lock);
5886	fdesc = &flent->fe_flow_desc;
5887	mrp = &flent->fe_resource_props;
5888
5889	ndesc->nd_name = flent->fe_flow_name;
5890	ndesc->nd_devname = mcip->mci_name;
5891	bcopy(fdesc->fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5892	bcopy(fdesc->fd_dst_mac, ndesc->nd_edest, ETHERADDRL);
5893	ndesc->nd_sap = htonl(fdesc->fd_sap);
5894	ndesc->nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
5895	ndesc->nd_bw_limit = mrp->mrp_maxbw;
5896	if (ndesc->nd_isv4) {
5897		ndesc->nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
5898		ndesc->nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
5899	} else {
5900		bcopy(&fdesc->fd_local_addr, ndesc->nd_saddr, IPV6_ADDR_LEN);
5901		bcopy(&fdesc->fd_remote_addr, ndesc->nd_daddr, IPV6_ADDR_LEN);
5902	}
5903	ndesc->nd_sport = htons(fdesc->fd_local_port);
5904	ndesc->nd_dport = htons(fdesc->fd_remote_port);
5905	ndesc->nd_protocol = (uint8_t)fdesc->fd_protocol;
5906	mutex_exit(&flent->fe_lock);
5907
5908	ninfo->ni_record = ndesc;
5909	ninfo->ni_size = sizeof (net_desc_t);
5910	ninfo->ni_type = EX_NET_FLDESC_REC;
5911
5912	return (ninfo);
5913}
5914
5915/* Write the flow statistics to a netinfo_t record */
5916static netinfo_t *
5917mac_write_flow_stats(flow_entry_t *flent)
5918{
5919	netinfo_t		*ninfo;
5920	net_stat_t		*nstat;
5921	mac_soft_ring_set_t	*mac_srs;
5922	mac_rx_stats_t		*mac_rx_stat;
5923	mac_tx_stats_t		*mac_tx_stat;
5924	int			i;
5925
5926	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5927	if (ninfo == NULL)
5928		return (NULL);
5929	nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
5930	if (nstat == NULL) {
5931		kmem_free(ninfo, sizeof (netinfo_t));
5932		return (NULL);
5933	}
5934
5935	nstat->ns_name = flent->fe_flow_name;
5936	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
5937		mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
5938		mac_rx_stat = &mac_srs->srs_rx.sr_stat;
5939
5940		nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
5941		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
5942		nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
5943		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
5944		nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
5945	}
5946
5947	mac_srs = (mac_soft_ring_set_t *)(flent->fe_tx_srs);
5948	if (mac_srs != NULL) {
5949		mac_tx_stat = &mac_srs->srs_tx.st_stat;
5950
5951		nstat->ns_obytes = mac_tx_stat->mts_obytes;
5952		nstat->ns_opackets = mac_tx_stat->mts_opackets;
5953		nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
5954	}
5955
5956	ninfo->ni_record = nstat;
5957	ninfo->ni_size = sizeof (net_stat_t);
5958	ninfo->ni_type = EX_NET_FLSTAT_REC;
5959
5960	return (ninfo);
5961}
5962
5963/* Write the link description to a netinfo_t record */
5964static netinfo_t *
5965mac_write_link_desc(mac_client_impl_t *mcip)
5966{
5967	netinfo_t		*ninfo;
5968	net_desc_t		*ndesc;
5969	flow_entry_t		*flent = mcip->mci_flent;
5970
5971	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
5972	if (ninfo == NULL)
5973		return (NULL);
5974	ndesc = kmem_zalloc(sizeof (net_desc_t), KM_NOSLEEP);
5975	if (ndesc == NULL) {
5976		kmem_free(ninfo, sizeof (netinfo_t));
5977		return (NULL);
5978	}
5979
5980	ndesc->nd_name = mcip->mci_name;
5981	ndesc->nd_devname = mcip->mci_name;
5982	ndesc->nd_isv4 = B_TRUE;
5983	/*
5984	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
5985	 * Updates to the fe_flow_desc are done under the fe_lock
5986	 * after removing the flent from the flow table.
5987	 */
5988	mutex_enter(&flent->fe_lock);
5989	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc->nd_ehost, ETHERADDRL);
5990	mutex_exit(&flent->fe_lock);
5991
5992	ninfo->ni_record = ndesc;
5993	ninfo->ni_size = sizeof (net_desc_t);
5994	ninfo->ni_type = EX_NET_LNDESC_REC;
5995
5996	return (ninfo);
5997}
5998
5999/* Write the link statistics to a netinfo_t record */
6000static netinfo_t *
6001mac_write_link_stats(mac_client_impl_t *mcip)
6002{
6003	netinfo_t		*ninfo;
6004	net_stat_t		*nstat;
6005	flow_entry_t		*flent;
6006	mac_soft_ring_set_t	*mac_srs;
6007	mac_rx_stats_t		*mac_rx_stat;
6008	mac_tx_stats_t		*mac_tx_stat;
6009	int			i;
6010
6011	ninfo = kmem_zalloc(sizeof (netinfo_t), KM_NOSLEEP);
6012	if (ninfo == NULL)
6013		return (NULL);
6014	nstat = kmem_zalloc(sizeof (net_stat_t), KM_NOSLEEP);
6015	if (nstat == NULL) {
6016		kmem_free(ninfo, sizeof (netinfo_t));
6017		return (NULL);
6018	}
6019
6020	nstat->ns_name = mcip->mci_name;
6021	flent = mcip->mci_flent;
6022	if (flent != NULL)  {
6023		for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
6024			mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i];
6025			mac_rx_stat = &mac_srs->srs_rx.sr_stat;
6026
6027			nstat->ns_ibytes += mac_rx_stat->mrs_intrbytes +
6028			    mac_rx_stat->mrs_pollbytes +
6029			    mac_rx_stat->mrs_lclbytes;
6030			nstat->ns_ipackets += mac_rx_stat->mrs_intrcnt +
6031			    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
6032			nstat->ns_oerrors += mac_rx_stat->mrs_ierrors;
6033		}
6034	}
6035
6036	mac_srs = (mac_soft_ring_set_t *)(mcip->mci_flent->fe_tx_srs);
6037	if (mac_srs != NULL) {
6038		mac_tx_stat = &mac_srs->srs_tx.st_stat;
6039
6040		nstat->ns_obytes = mac_tx_stat->mts_obytes;
6041		nstat->ns_opackets = mac_tx_stat->mts_opackets;
6042		nstat->ns_oerrors = mac_tx_stat->mts_oerrors;
6043	}
6044
6045	ninfo->ni_record = nstat;
6046	ninfo->ni_size = sizeof (net_stat_t);
6047	ninfo->ni_type = EX_NET_LNSTAT_REC;
6048
6049	return (ninfo);
6050}
6051
6052typedef struct i_mac_log_state_s {
6053	boolean_t	mi_last;
6054	int		mi_fenable;
6055	int		mi_lenable;
6056	list_t		*mi_list;
6057} i_mac_log_state_t;
6058
6059/*
6060 * For a given flow, if the description has not been logged before, do it now.
6061 * If it is a VNIC, then we have collected information about it from the MAC
6062 * table, so skip it.
6063 *
6064 * Called through mac_flow_walk_nolock()
6065 *
6066 * Return 0 if successful.
6067 */
6068static int
6069mac_log_flowinfo(flow_entry_t *flent, void *arg)
6070{
6071	mac_client_impl_t	*mcip = flent->fe_mcip;
6072	i_mac_log_state_t	*lstate = arg;
6073	netinfo_t		*ninfo;
6074
6075	if (mcip == NULL)
6076		return (0);
6077
6078	/*
6079	 * If the name starts with "vnic", and fe_user_generated is true (to
6080	 * exclude the mcast and active flow entries created implicitly for
6081	 * a vnic, it is a VNIC flow.  i.e. vnic1 is a vnic flow,
6082	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
6083	 */
6084	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
6085	    (flent->fe_type & FLOW_USER) != 0) {
6086		return (0);
6087	}
6088
6089	if (!flent->fe_desc_logged) {
6090		/*
6091		 * We don't return error because we want to continue the
6092		 * walk in case this is the last walk which means we
6093		 * need to reset fe_desc_logged in all the flows.
6094		 */
6095		if ((ninfo = mac_write_flow_desc(flent, mcip)) == NULL)
6096			return (0);
6097		list_insert_tail(lstate->mi_list, ninfo);
6098		flent->fe_desc_logged = B_TRUE;
6099	}
6100
6101	/*
6102	 * Regardless of the error, we want to proceed in case we have to
6103	 * reset fe_desc_logged.
6104	 */
6105	ninfo = mac_write_flow_stats(flent);
6106	if (ninfo == NULL)
6107		return (-1);
6108
6109	list_insert_tail(lstate->mi_list, ninfo);
6110
6111	if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
6112		flent->fe_desc_logged = B_FALSE;
6113
6114	return (0);
6115}
6116
6117/*
6118 * Log the description for each mac client of this mac_impl_t, if it
6119 * hasn't already been done. Additionally, log statistics for the link as
6120 * well. Walk the flow table and log information for each flow as well.
6121 * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
6122 * also fe_desc_logged, if flow logging is on) since we want to log the
6123 * description if and when logging is restarted.
6124 *
6125 * Return 0 upon success or -1 upon failure
6126 */
6127static int
6128i_mac_impl_log(mac_impl_t *mip, i_mac_log_state_t *lstate)
6129{
6130	mac_client_impl_t	*mcip;
6131	netinfo_t		*ninfo;
6132
6133	i_mac_perim_enter(mip);
6134	/*
6135	 * Only walk the client list for NIC and etherstub
6136	 */
6137	if ((mip->mi_state_flags & MIS_DISABLED) ||
6138	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
6139	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL))) {
6140		i_mac_perim_exit(mip);
6141		return (0);
6142	}
6143
6144	for (mcip = mip->mi_clients_list; mcip != NULL;
6145	    mcip = mcip->mci_client_next) {
6146		if (!MCIP_DATAPATH_SETUP(mcip))
6147			continue;
6148		if (lstate->mi_lenable) {
6149			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
6150				ninfo = mac_write_link_desc(mcip);
6151				if (ninfo == NULL) {
6152				/*
6153				 * We can't terminate it if this is the last
6154				 * walk, else there might be some links with
6155				 * mi_desc_logged set to true, which means
6156				 * their description won't be logged the next
6157				 * time logging is started (similarly for the
6158				 * flows within such links). We can continue
6159				 * without walking the flow table (i.e. to
6160				 * set fe_desc_logged to false) because we
6161				 * won't have written any flow stuff for this
6162				 * link as we haven't logged the link itself.
6163				 */
6164					i_mac_perim_exit(mip);
6165					if (lstate->mi_last)
6166						return (0);
6167					else
6168						return (-1);
6169				}
6170				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
6171				list_insert_tail(lstate->mi_list, ninfo);
6172			}
6173		}
6174
6175		ninfo = mac_write_link_stats(mcip);
6176		if (ninfo == NULL && !lstate->mi_last) {
6177			i_mac_perim_exit(mip);
6178			return (-1);
6179		}
6180		list_insert_tail(lstate->mi_list, ninfo);
6181
6182		if (lstate->mi_last)
6183			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
6184
6185		if (lstate->mi_fenable) {
6186			if (mcip->mci_subflow_tab != NULL) {
6187				(void) mac_flow_walk_nolock(
6188				    mcip->mci_subflow_tab, mac_log_flowinfo,
6189				    lstate);
6190			}
6191		}
6192	}
6193	i_mac_perim_exit(mip);
6194	return (0);
6195}
6196
6197/*
6198 * modhash walker function to add a mac_impl_t to a list
6199 */
6200/*ARGSUSED*/
6201static uint_t
6202i_mac_impl_list_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
6203{
6204	list_t			*list = (list_t *)arg;
6205	mac_impl_t		*mip = (mac_impl_t *)val;
6206
6207	if ((mip->mi_state_flags & MIS_DISABLED) == 0) {
6208		list_insert_tail(list, mip);
6209		mip->mi_ref++;
6210	}
6211
6212	return (MH_WALK_CONTINUE);
6213}
6214
6215void
6216i_mac_log_info(list_t *net_log_list, i_mac_log_state_t *lstate)
6217{
6218	list_t			mac_impl_list;
6219	mac_impl_t		*mip;
6220	netinfo_t		*ninfo;
6221
6222	/* Create list of mac_impls */
6223	ASSERT(RW_LOCK_HELD(&i_mac_impl_lock));
6224	list_create(&mac_impl_list, sizeof (mac_impl_t), offsetof(mac_impl_t,
6225	    mi_node));
6226	mod_hash_walk(i_mac_impl_hash, i_mac_impl_list_walker, &mac_impl_list);
6227	rw_exit(&i_mac_impl_lock);
6228
6229	/* Create log entries for each mac_impl */
6230	for (mip = list_head(&mac_impl_list); mip != NULL;
6231	    mip = list_next(&mac_impl_list, mip)) {
6232		if (i_mac_impl_log(mip, lstate) != 0)
6233			continue;
6234	}
6235
6236	/* Remove elements and destroy list of mac_impls */
6237	rw_enter(&i_mac_impl_lock, RW_WRITER);
6238	while ((mip = list_remove_tail(&mac_impl_list)) != NULL) {
6239		mip->mi_ref--;
6240	}
6241	rw_exit(&i_mac_impl_lock);
6242	list_destroy(&mac_impl_list);
6243
6244	/*
6245	 * Write log entries to files outside of locks, free associated
6246	 * structures, and remove entries from the list.
6247	 */
6248	while ((ninfo = list_head(net_log_list)) != NULL) {
6249		(void) exacct_commit_netinfo(ninfo->ni_record, ninfo->ni_type);
6250		list_remove(net_log_list, ninfo);
6251		kmem_free(ninfo->ni_record, ninfo->ni_size);
6252		kmem_free(ninfo, sizeof (*ninfo));
6253	}
6254	list_destroy(net_log_list);
6255}
6256
6257/*
6258 * The timer thread that runs every mac_logging_interval seconds and logs
6259 * link and/or flow information.
6260 */
6261/* ARGSUSED */
6262void
6263mac_log_linkinfo(void *arg)
6264{
6265	i_mac_log_state_t	lstate;
6266	list_t			net_log_list;
6267
6268	list_create(&net_log_list, sizeof (netinfo_t),
6269	    offsetof(netinfo_t, ni_link));
6270
6271	rw_enter(&i_mac_impl_lock, RW_READER);
6272	if (!mac_flow_log_enable && !mac_link_log_enable) {
6273		rw_exit(&i_mac_impl_lock);
6274		return;
6275	}
6276	lstate.mi_fenable = mac_flow_log_enable;
6277	lstate.mi_lenable = mac_link_log_enable;
6278	lstate.mi_last = B_FALSE;
6279	lstate.mi_list = &net_log_list;
6280
6281	/* Write log entries for each mac_impl in the list */
6282	i_mac_log_info(&net_log_list, &lstate);
6283
6284	if (mac_flow_log_enable || mac_link_log_enable) {
6285		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
6286		    SEC_TO_TICK(mac_logging_interval));
6287	}
6288}
6289
6290typedef struct i_mac_fastpath_state_s {
6291	boolean_t	mf_disable;
6292	int		mf_err;
6293} i_mac_fastpath_state_t;
6294
6295/* modhash walker function to enable or disable fastpath */
6296/*ARGSUSED*/
6297static uint_t
6298i_mac_fastpath_walker(mod_hash_key_t key, mod_hash_val_t *val,
6299    void *arg)
6300{
6301	i_mac_fastpath_state_t	*state = arg;
6302	mac_handle_t		mh = (mac_handle_t)val;
6303
6304	if (state->mf_disable)
6305		state->mf_err = mac_fastpath_disable(mh);
6306	else
6307		mac_fastpath_enable(mh);
6308
6309	return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
6310}
6311
6312/*
6313 * Start the logging timer.
6314 */
6315int
6316mac_start_logusage(mac_logtype_t type, uint_t interval)
6317{
6318	i_mac_fastpath_state_t	dstate = {B_TRUE, 0};
6319	i_mac_fastpath_state_t	estate = {B_FALSE, 0};
6320	int			err;
6321
6322	rw_enter(&i_mac_impl_lock, RW_WRITER);
6323	switch (type) {
6324	case MAC_LOGTYPE_FLOW:
6325		if (mac_flow_log_enable) {
6326			rw_exit(&i_mac_impl_lock);
6327			return (0);
6328		}
6329		/* FALLTHRU */
6330	case MAC_LOGTYPE_LINK:
6331		if (mac_link_log_enable) {
6332			rw_exit(&i_mac_impl_lock);
6333			return (0);
6334		}
6335		break;
6336	default:
6337		ASSERT(0);
6338	}
6339
6340	/* Disable fastpath */
6341	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &dstate);
6342	if ((err = dstate.mf_err) != 0) {
6343		/* Reenable fastpath  */
6344		mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
6345		rw_exit(&i_mac_impl_lock);
6346		return (err);
6347	}
6348
6349	switch (type) {
6350	case MAC_LOGTYPE_FLOW:
6351		mac_flow_log_enable = B_TRUE;
6352		/* FALLTHRU */
6353	case MAC_LOGTYPE_LINK:
6354		mac_link_log_enable = B_TRUE;
6355		break;
6356	}
6357
6358	mac_logging_interval = interval;
6359	rw_exit(&i_mac_impl_lock);
6360	mac_log_linkinfo(NULL);
6361	return (0);
6362}
6363
6364/*
6365 * Stop the logging timer if both link and flow logging are turned off.
6366 */
6367void
6368mac_stop_logusage(mac_logtype_t type)
6369{
6370	i_mac_log_state_t	lstate;
6371	i_mac_fastpath_state_t	estate = {B_FALSE, 0};
6372	list_t			net_log_list;
6373
6374	list_create(&net_log_list, sizeof (netinfo_t),
6375	    offsetof(netinfo_t, ni_link));
6376
6377	rw_enter(&i_mac_impl_lock, RW_WRITER);
6378
6379	lstate.mi_fenable = mac_flow_log_enable;
6380	lstate.mi_lenable = mac_link_log_enable;
6381	lstate.mi_list = &net_log_list;
6382
6383	/* Last walk */
6384	lstate.mi_last = B_TRUE;
6385
6386	switch (type) {
6387	case MAC_LOGTYPE_FLOW:
6388		if (lstate.mi_fenable) {
6389			ASSERT(mac_link_log_enable);
6390			mac_flow_log_enable = B_FALSE;
6391			mac_link_log_enable = B_FALSE;
6392			break;
6393		}
6394		/* FALLTHRU */
6395	case MAC_LOGTYPE_LINK:
6396		if (!lstate.mi_lenable || mac_flow_log_enable) {
6397			rw_exit(&i_mac_impl_lock);
6398			return;
6399		}
6400		mac_link_log_enable = B_FALSE;
6401		break;
6402	default:
6403		ASSERT(0);
6404	}
6405
6406	/* Reenable fastpath */
6407	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate);
6408
6409	(void) untimeout(mac_logging_timer);
6410	mac_logging_timer = NULL;
6411
6412	/* Write log entries for each mac_impl in the list */
6413	i_mac_log_info(&net_log_list, &lstate);
6414}
6415
6416/*
6417 * Walk the rx and tx SRS/SRs for a flow and update the priority value.
6418 */
6419void
6420mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
6421{
6422	pri_t			pri;
6423	int			count;
6424	mac_soft_ring_set_t	*mac_srs;
6425
6426	if (flent->fe_rx_srs_cnt <= 0)
6427		return;
6428
6429	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
6430	    SRST_FLOW) {
6431		pri = FLOW_PRIORITY(mcip->mci_min_pri,
6432		    mcip->mci_max_pri,
6433		    flent->fe_resource_props.mrp_priority);
6434	} else {
6435		pri = mcip->mci_max_pri;
6436	}
6437
6438	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
6439		mac_srs = flent->fe_rx_srs[count];
6440		mac_update_srs_priority(mac_srs, pri);
6441	}
6442	/*
6443	 * If we have a Tx SRS, we need to modify all the threads associated
6444	 * with it.
6445	 */
6446	if (flent->fe_tx_srs != NULL)
6447		mac_update_srs_priority(flent->fe_tx_srs, pri);
6448}
6449
6450/*
6451 * RX and TX rings are reserved according to different semantics depending
6452 * on the requests from the MAC clients and type of rings:
6453 *
6454 * On the Tx side, by default we reserve individual rings, independently from
6455 * the groups.
6456 *
6457 * On the Rx side, the reservation is at the granularity of the group
6458 * of rings, and used for v12n level 1 only. It has a special case for the
6459 * primary client.
6460 *
6461 * If a share is allocated to a MAC client, we allocate a TX group and an
6462 * RX group to the client, and assign TX rings and RX rings to these
6463 * groups according to information gathered from the driver through
6464 * the share capability.
6465 *
6466 * The foreseable evolution of Rx rings will handle v12n level 2 and higher
6467 * to allocate individual rings out of a group and program the hw classifier
6468 * based on IP address or higher level criteria.
6469 */
6470
6471/*
6472 * mac_reserve_tx_ring()
6473 * Reserve a unused ring by marking it with MR_INUSE state.
6474 * As reserved, the ring is ready to function.
6475 *
6476 * Notes for Hybrid I/O:
6477 *
6478 * If a specific ring is needed, it is specified through the desired_ring
6479 * argument. Otherwise that argument is set to NULL.
6480 * If the desired ring was previous allocated to another client, this
6481 * function swaps it with a new ring from the group of unassigned rings.
6482 */
6483mac_ring_t *
6484mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
6485{
6486	mac_group_t		*group;
6487	mac_grp_client_t	*mgcp;
6488	mac_client_impl_t	*mcip;
6489	mac_soft_ring_set_t	*srs;
6490
6491	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
6492
6493	/*
6494	 * Find an available ring and start it before changing its status.
6495	 * The unassigned rings are at the end of the mi_tx_groups
6496	 * array.
6497	 */
6498	group = MAC_DEFAULT_TX_GROUP(mip);
6499
6500	/* Can't take the default ring out of the default group */
6501	ASSERT(desired_ring != (mac_ring_t *)mip->mi_default_tx_ring);
6502
6503	if (desired_ring->mr_state == MR_FREE) {
6504		ASSERT(MAC_GROUP_NO_CLIENT(group));
6505		if (mac_start_ring(desired_ring) != 0)
6506			return (NULL);
6507		return (desired_ring);
6508	}
6509	/*
6510	 * There are clients using this ring, so let's move the clients
6511	 * away from using this ring.
6512	 */
6513	for (mgcp = group->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
6514		mcip = mgcp->mgc_client;
6515		mac_tx_client_quiesce((mac_client_handle_t)mcip);
6516		srs = MCIP_TX_SRS(mcip);
6517		ASSERT(mac_tx_srs_ring_present(srs, desired_ring));
6518		mac_tx_invoke_callbacks(mcip,
6519		    (mac_tx_cookie_t)mac_tx_srs_get_soft_ring(srs,
6520		    desired_ring));
6521		mac_tx_srs_del_ring(srs, desired_ring);
6522		mac_tx_client_restart((mac_client_handle_t)mcip);
6523	}
6524	return (desired_ring);
6525}
6526
6527/*
6528 * For a non-default group with multiple clients, return the primary client.
6529 */
6530static mac_client_impl_t *
6531mac_get_grp_primary(mac_group_t *grp)
6532{
6533	mac_grp_client_t	*mgcp = grp->mrg_clients;
6534	mac_client_impl_t	*mcip;
6535
6536	while (mgcp != NULL) {
6537		mcip = mgcp->mgc_client;
6538		if (mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC)
6539			return (mcip);
6540		mgcp = mgcp->mgc_next;
6541	}
6542	return (NULL);
6543}
6544
6545/*
6546 * Hybrid I/O specifies the ring that should be given to a share.
6547 * If the ring is already used by clients, then we need to release
6548 * the ring back to the default group so that we can give it to
6549 * the share. This means the clients using this ring now get a
6550 * replacement ring. If there aren't any replacement rings, this
6551 * function returns a failure.
6552 */
6553static int
6554mac_reclaim_ring_from_grp(mac_impl_t *mip, mac_ring_type_t ring_type,
6555    mac_ring_t *ring, mac_ring_t **rings, int nrings)
6556{
6557	mac_group_t		*group = (mac_group_t *)ring->mr_gh;
6558	mac_resource_props_t	*mrp;
6559	mac_client_impl_t	*mcip;
6560	mac_group_t		*defgrp;
6561	mac_ring_t		*tring;
6562	mac_group_t		*tgrp;
6563	int			i;
6564	int			j;
6565
6566	mcip = MAC_GROUP_ONLY_CLIENT(group);
6567	if (mcip == NULL)
6568		mcip = mac_get_grp_primary(group);
6569	ASSERT(mcip != NULL);
6570	ASSERT(mcip->mci_share == 0);
6571
6572	mrp = MCIP_RESOURCE_PROPS(mcip);
6573	if (ring_type == MAC_RING_TYPE_RX) {
6574		defgrp = mip->mi_rx_donor_grp;
6575		if ((mrp->mrp_mask & MRP_RX_RINGS) == 0) {
6576			/* Need to put this mac client in the default group */
6577			if (mac_rx_switch_group(mcip, group, defgrp) != 0)
6578				return (ENOSPC);
6579		} else {
6580			/*
6581			 * Switch this ring with some other ring from
6582			 * the default group.
6583			 */
6584			for (tring = defgrp->mrg_rings; tring != NULL;
6585			    tring = tring->mr_next) {
6586				if (tring->mr_index == 0)
6587					continue;
6588				for (j = 0; j < nrings; j++) {
6589					if (rings[j] == tring)
6590						break;
6591				}
6592				if (j >= nrings)
6593					break;
6594			}
6595			if (tring == NULL)
6596				return (ENOSPC);
6597			if (mac_group_mov_ring(mip, group, tring) != 0)
6598				return (ENOSPC);
6599			if (mac_group_mov_ring(mip, defgrp, ring) != 0) {
6600				(void) mac_group_mov_ring(mip, defgrp, tring);
6601				return (ENOSPC);
6602			}
6603		}
6604		ASSERT(ring->mr_gh == (mac_group_handle_t)defgrp);
6605		return (0);
6606	}
6607
6608	defgrp = MAC_DEFAULT_TX_GROUP(mip);
6609	if (ring == (mac_ring_t *)mip->mi_default_tx_ring) {
6610		/*
6611		 * See if we can get a spare ring to replace the default
6612		 * ring.
6613		 */
6614		if (defgrp->mrg_cur_count == 1) {
6615			/*
6616			 * Need to get a ring from another client, see if
6617			 * there are any clients that can be moved to
6618			 * the default group, thereby freeing some rings.
6619			 */
6620			for (i = 0; i < mip->mi_tx_group_count; i++) {
6621				tgrp = &mip->mi_tx_groups[i];
6622				if (tgrp->mrg_state ==
6623				    MAC_GROUP_STATE_REGISTERED) {
6624					continue;
6625				}
6626				mcip = MAC_GROUP_ONLY_CLIENT(tgrp);
6627				if (mcip == NULL)
6628					mcip = mac_get_grp_primary(tgrp);
6629				ASSERT(mcip != NULL);
6630				mrp = MCIP_RESOURCE_PROPS(mcip);
6631				if ((mrp->mrp_mask & MRP_TX_RINGS) == 0) {
6632					ASSERT(tgrp->mrg_cur_count == 1);
6633					/*
6634					 * If this ring is part of the
6635					 * rings asked by the share we cannot
6636					 * use it as the default ring.
6637					 */
6638					for (j = 0; j < nrings; j++) {
6639						if (rings[j] == tgrp->mrg_rings)
6640							break;
6641					}
6642					if (j < nrings)
6643						continue;
6644					mac_tx_client_quiesce(
6645					    (mac_client_handle_t)mcip);
6646					mac_tx_switch_group(mcip, tgrp,
6647					    defgrp);
6648					mac_tx_client_restart(
6649					    (mac_client_handle_t)mcip);
6650					break;
6651				}
6652			}
6653			/*
6654			 * All the rings are reserved, can't give up the
6655			 * default ring.
6656			 */
6657			if (defgrp->mrg_cur_count <= 1)
6658				return (ENOSPC);
6659		}
6660		/*
6661		 * Swap the default ring with another.
6662		 */
6663		for (tring = defgrp->mrg_rings; tring != NULL;
6664		    tring = tring->mr_next) {
6665			/*
6666			 * If this ring is part of the rings asked by the
6667			 * share we cannot use it as the default ring.
6668			 */
6669			for (j = 0; j < nrings; j++) {
6670				if (rings[j] == tring)
6671					break;
6672			}
6673			if (j >= nrings)
6674				break;
6675		}
6676		ASSERT(tring != NULL);
6677		mip->mi_default_tx_ring = (mac_ring_handle_t)tring;
6678		return (0);
6679	}
6680	/*
6681	 * The Tx ring is with a group reserved by a MAC client. See if
6682	 * we can swap it.
6683	 */
6684	ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
6685	mcip = MAC_GROUP_ONLY_CLIENT(group);
6686	if (mcip == NULL)
6687		mcip = mac_get_grp_primary(group);
6688	ASSERT(