xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision f1ccfd86d00d3a46fa8dc85b88860e10a3ad1019)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Data-Link Driver
27  */
28 
29 #include	<inet/common.h>
30 #include	<sys/strsubr.h>
31 #include	<sys/stropts.h>
32 #include	<sys/strsun.h>
33 #include	<sys/vlan.h>
34 #include	<sys/dld_impl.h>
35 #include	<sys/cpuvar.h>
36 #include	<sys/callb.h>
37 #include	<sys/list.h>
38 #include	<sys/mac_client.h>
39 #include	<sys/mac_client_priv.h>
40 #include	<sys/mac_flow.h>
41 
42 static int	str_constructor(void *, void *, int);
43 static void	str_destructor(void *, void *);
44 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
45 static void	str_notify_promisc_on_phys(dld_str_t *);
46 static void	str_notify_promisc_off_phys(dld_str_t *);
47 static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
48 static void	str_notify_link_up(dld_str_t *);
49 static void	str_notify_link_down(dld_str_t *);
50 static void	str_notify_capab_reneg(dld_str_t *);
51 static void	str_notify_speed(dld_str_t *, uint32_t);
52 
53 static void	ioc_native(dld_str_t *,  mblk_t *);
54 static void	ioc_margin(dld_str_t *, mblk_t *);
55 static void	ioc_raw(dld_str_t *, mblk_t *);
56 static void	ioc_fast(dld_str_t *,  mblk_t *);
57 static void	ioc_lowlink(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_ioc(dld_str_t *, mblk_t *);
60 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
61 
62 static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
63 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
64     link_tagmode_t);
65 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);
66 
/*
 * Count of dld_str_t objects currently held; bumped in dld_str_create() and
 * dropped in dld_str_destroy().  dld_str_fini() refuses to tear down while
 * this is non-zero.
 */
static uint32_t		str_count;
/* kmem cache of dld_str_t objects, created in dld_str_init(). */
static kmem_cache_t	*str_cachep;
/* Hash of every dld_str_t, keyed by its clone minor (ds_minor). */
static mod_hash_t	*str_hashp;

/* Size argument handed to mod_hash_create_idhash() for str_hashp. */
#define	STR_HASHSZ		64
/* Convert an integral key (a minor number) into a mod_hash key. */
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* dld has no private taskq; it is an alias for the system taskq. */
#define	dld_taskq	system_taskq

/*
 * State shared with the dld_taskq_dispatch() thread started by
 * dld_str_init().  dld_str_fini() sets dld_taskq_quit and then waits on
 * dld_taskq_cv (under dld_taskq_lock) until dld_taskq_done becomes true.
 */
static kmutex_t		dld_taskq_lock;
static kcondvar_t	dld_taskq_cv;
static list_t		dld_taskq_list;		/* List of dld_str_t */
boolean_t		dld_taskq_quit;
boolean_t		dld_taskq_done;

static void		dld_taskq_dispatch(void);
83 
84 /*
85  * Some notes on entry points, flow-control, queueing.
86  *
87  * This driver exports the traditional STREAMS put entry point as well as
88  * the non-STREAMS fast-path transmit routine which is provided to IP via
89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
90  * and data operations, while the fast-path routine deals only with M_DATA
91  * fast-path packets.  Regardless of the entry point, all outbound packets
92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
93  *
94  * The transmit logic operates in the following way: All packets coming
95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
96  * happens when the MAC layer indicates the packets couldn't be
97  * transmitted due to 1) lack of resources (e.g. running out of
98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
99  * particular flow. The indication comes in the form of a Tx cookie that
100  * identifies the blocked ring. In such case, DLD will place a
101  * dummy message on its write-side STREAMS queue so that the queue is
 * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer, where they are either queued in the Tx
 * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
106  * When the write service procedure runs, it will remove the dummy
107  * message from the write-side STREAMS queue; in effect this will trigger
108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
109  * respectively, due to the above reasons.
110  *
111  * All non-data operations, both DLPI and ioctls are single threaded on a per
112  * dld_str_t endpoint. This is done using a taskq so that the control operation
113  * has kernel context and can cv_wait for resources. In addition all set type
114  * operations that involve mac level state modification are serialized on a
115  * per mac end point using the perimeter mechanism provided by the mac layer.
116  * This serializes all mac clients trying to modify a single mac end point over
117  * the entire sequence of mac calls made by that client as an atomic unit. The
118  * mac framework locking is described in mac.c. A critical element is that
119  * DLD/DLS does not hold any locks across the mac perimeter.
120  *
121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
124  * is returned. If the mac handle is non-null, it can be safely accessed
125  * below. The mac handle won't be freed until the mac_unregister which
126  * won't happen until the driver detaches. The DDI framework ensures that
127  * the detach won't happen while a getinfo is in progress.
128  */
/*
 * Lookup state handed to i_dld_str_walker() through mod_hash_walk().
 * ds_major/ds_minor identify the stream being searched for; ds_dip and
 * ds_instance receive the result.  Callers preset the result fields to
 * NULL and -1 respectively so that "not found" can be told apart.
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* in: major number to match */
	minor_t		ds_minor;	/* in: clone minor to match (non-zero) */
	int		ds_instance;	/* out: instance of the matching mac */
	dev_info_t	*ds_dip;	/* out: devinfo of the matching mac */
} i_dld_str_state_t;
135 
136 /* ARGSUSED */
137 static uint_t
138 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
139 {
140 	i_dld_str_state_t	*statep = arg;
141 	dld_str_t		*dsp = (dld_str_t *)val;
142 	mac_handle_t		mh;
143 
144 	if (statep->ds_major != dsp->ds_major)
145 		return (MH_WALK_CONTINUE);
146 
147 	ASSERT(statep->ds_minor != 0);
148 	mh = dsp->ds_mh;
149 
150 	if (statep->ds_minor == dsp->ds_minor) {
151 		/*
152 		 * Clone: a clone minor is unique. we can terminate the
153 		 * walk if we find a matching stream -- even if we fail
154 		 * to obtain the devinfo.
155 		 */
156 		if (mh != NULL) {
157 			statep->ds_dip = mac_devinfo_get(mh);
158 			statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
159 		}
160 		return (MH_WALK_TERMINATE);
161 	}
162 	return (MH_WALK_CONTINUE);
163 }
164 
165 static dev_info_t *
166 dld_finddevinfo(dev_t dev)
167 {
168 	dev_info_t		*dip;
169 	i_dld_str_state_t	state;
170 
171 	if (getminor(dev) == 0)
172 		return (NULL);
173 
174 	/*
175 	 * See if it's a minor node of a link
176 	 */
177 	if ((dip = dls_link_devinfo(dev)) != NULL)
178 		return (dip);
179 
180 	state.ds_minor = getminor(dev);
181 	state.ds_major = getmajor(dev);
182 	state.ds_dip = NULL;
183 	state.ds_instance = -1;
184 
185 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
186 	return (state.ds_dip);
187 }
188 
189 int
190 dld_devt_to_instance(dev_t dev)
191 {
192 	minor_t			minor;
193 	i_dld_str_state_t	state;
194 
195 	/*
196 	 * GLDv3 numbers DLPI style 1 node as the instance number + 1.
197 	 * Minor number 0 is reserved for the DLPI style 2 unattached
198 	 * node.
199 	 */
200 
201 	if ((minor = getminor(dev)) == 0)
202 		return (-1);
203 
204 	/*
205 	 * Check for unopened style 1 node.
206 	 * Note that this doesn't *necessarily* work for legacy
207 	 * devices, but this code is only called within the
208 	 * getinfo(9e) implementation for true GLDv3 devices, so it
209 	 * doesn't matter.
210 	 */
211 	if (minor > 0 && minor <= DLS_MAX_MINOR) {
212 		return (DLS_MINOR2INST(minor));
213 	}
214 
215 	state.ds_minor = getminor(dev);
216 	state.ds_major = getmajor(dev);
217 	state.ds_dip = NULL;
218 	state.ds_instance = -1;
219 
220 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
221 	return (state.ds_instance);
222 }
223 
224 /*
225  * devo_getinfo: getinfo(9e)
226  *
227  * NB: This may be called for a provider before the provider's
228  * instances are attached.  Hence, if a particular provider needs a
229  * special mapping (the mac instance != ddi_get_instance()), then it
230  * may need to provide its own implmentation using the
231  * mac_devt_to_instance() function, and translating the returned mac
232  * instance to a devinfo instance.  For dev_t's where the minor number
233  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
234  * function indirectly via the mac_getinfo() function.
235  */
236 /*ARGSUSED*/
237 int
238 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
239 {
240 	dev_info_t	*devinfo;
241 	minor_t		minor = getminor((dev_t)arg);
242 	int		rc = DDI_FAILURE;
243 
244 	switch (cmd) {
245 	case DDI_INFO_DEVT2DEVINFO:
246 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
247 			*(dev_info_t **)resp = devinfo;
248 			rc = DDI_SUCCESS;
249 		}
250 		break;
251 	case DDI_INFO_DEVT2INSTANCE:
252 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
253 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
254 			rc = DDI_SUCCESS;
255 		} else if (minor > DLS_MAX_MINOR &&
256 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
257 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
258 			rc = DDI_SUCCESS;
259 		}
260 		break;
261 	}
262 	return (rc);
263 }
264 
265 void *
266 dld_str_private(queue_t *q)
267 {
268 	return (((dld_str_t *)(q->q_ptr))->ds_private);
269 }
270 
271 int
272 dld_str_open(queue_t *rq, dev_t *devp, void *private)
273 {
274 	dld_str_t	*dsp;
275 	major_t		major;
276 	minor_t		minor;
277 	int		err;
278 
279 	major = getmajor(*devp);
280 	minor = getminor(*devp);
281 
282 	/*
283 	 * Create a new dld_str_t for the stream. This will grab a new minor
284 	 * number that will be handed back in the cloned dev_t.  Creation may
285 	 * fail if we can't allocate the dummy mblk used for flow-control.
286 	 */
287 	dsp = dld_str_create(rq, DLD_DLPI, major,
288 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
289 	if (dsp == NULL)
290 		return (ENOSR);
291 
292 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
293 	dsp->ds_private = private;
294 	if (minor != 0) {
295 		/*
296 		 * Style 1 open
297 		 */
298 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
299 			goto failed;
300 
301 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
302 	} else {
303 		(void) qassociate(rq, -1);
304 	}
305 
306 	/*
307 	 * Enable the queue srv(9e) routine.
308 	 */
309 	qprocson(rq);
310 
311 	/*
312 	 * Construct a cloned dev_t to hand back.
313 	 */
314 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
315 	return (0);
316 
317 failed:
318 	dld_str_destroy(dsp);
319 	return (err);
320 }
321 
322 int
323 dld_str_close(queue_t *rq)
324 {
325 	dld_str_t	*dsp = rq->q_ptr;
326 
327 	/*
328 	 * All modules on top have been popped off. So there can't be any
329 	 * threads from the top.
330 	 */
331 	ASSERT(dsp->ds_datathr_cnt == 0);
332 
333 	/*
334 	 * Wait until pending DLPI requests are processed.
335 	 */
336 	mutex_enter(&dsp->ds_lock);
337 	while (dsp->ds_dlpi_pending)
338 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
339 	mutex_exit(&dsp->ds_lock);
340 
341 
342 	/*
343 	 * This stream was open to a provider node. Check to see
344 	 * if it has been cleanly shut down.
345 	 */
346 	if (dsp->ds_dlstate != DL_UNATTACHED) {
347 		/*
348 		 * The stream is either open to a style 1 provider or
349 		 * this is not clean shutdown. Detach from the PPA.
350 		 * (This is still ok even in the style 1 case).
351 		 */
352 		dld_str_detach(dsp);
353 	}
354 
355 	dld_str_destroy(dsp);
356 	return (0);
357 }
358 
359 /*
360  * qi_qopen: open(9e)
361  */
362 /*ARGSUSED*/
363 int
364 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
365 {
366 	if (sflag == MODOPEN)
367 		return (ENOTSUP);
368 
369 	/*
370 	 * This is a cloning driver and therefore each queue should only
371 	 * ever get opened once.
372 	 */
373 	if (rq->q_ptr != NULL)
374 		return (EBUSY);
375 
376 	return (dld_str_open(rq, devp, NULL));
377 }
378 
379 /*
380  * qi_qclose: close(9e)
381  */
382 /* ARGSUSED */
383 int
384 dld_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
385 {
386 	/*
387 	 * Disable the queue srv(9e) routine.
388 	 */
389 	qprocsoff(rq);
390 
391 	return (dld_str_close(rq));
392 }
393 
394 /*
395  * qi_qputp: put(9e)
396  */
397 int
398 dld_wput(queue_t *wq, mblk_t *mp)
399 {
400 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
401 	dld_str_mode_t	mode;
402 
403 	switch (DB_TYPE(mp)) {
404 	case M_DATA:
405 		mutex_enter(&dsp->ds_lock);
406 		mode = dsp->ds_mode;
407 		if ((dsp->ds_dlstate != DL_IDLE) ||
408 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
409 			mutex_exit(&dsp->ds_lock);
410 			freemsg(mp);
411 			break;
412 		}
413 
414 		DLD_DATATHR_INC(dsp);
415 		mutex_exit(&dsp->ds_lock);
416 		if (mode == DLD_FASTPATH) {
417 			if (dsp->ds_mip->mi_media == DL_ETHER &&
418 			    (MBLKL(mp) < sizeof (struct ether_header))) {
419 				freemsg(mp);
420 			} else {
421 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
422 			}
423 		} else {
424 			str_mdata_raw_put(dsp, mp);
425 		}
426 		DLD_DATATHR_DCR(dsp);
427 		break;
428 	case M_PROTO:
429 	case M_PCPROTO: {
430 		t_uscalar_t	prim;
431 
432 		if (MBLKL(mp) < sizeof (t_uscalar_t))
433 			break;
434 
435 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
436 
437 		if (prim == DL_UNITDATA_REQ) {
438 			proto_unitdata_req(dsp, mp);
439 		} else {
440 			dld_wput_nondata(dsp, mp);
441 		}
442 		break;
443 	}
444 
445 	case M_IOCTL:
446 		dld_wput_nondata(dsp, mp);
447 		break;
448 
449 	case M_FLUSH:
450 		if (*mp->b_rptr & FLUSHW) {
451 			DLD_CLRQFULL(dsp);
452 			*mp->b_rptr &= ~FLUSHW;
453 		}
454 
455 		if (*mp->b_rptr & FLUSHR) {
456 			qreply(wq, mp);
457 		} else {
458 			freemsg(mp);
459 		}
460 		break;
461 
462 	default:
463 		freemsg(mp);
464 		break;
465 	}
466 	return (0);
467 }
468 
469 /*
470  * qi_srvp: srv(9e)
471  */
472 int
473 dld_wsrv(queue_t *wq)
474 {
475 	dld_str_t	*dsp = wq->q_ptr;
476 
477 	DLD_CLRQFULL(dsp);
478 	return (0);
479 }
480 
481 void
482 dld_init_ops(struct dev_ops *ops, const char *name)
483 {
484 	struct streamtab *stream;
485 	struct qinit *rq, *wq;
486 	struct module_info *modinfo;
487 
488 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
489 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
490 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
491 	modinfo->mi_minpsz = 0;
492 	modinfo->mi_maxpsz = 64*1024;
493 	modinfo->mi_hiwat  = 1;
494 	modinfo->mi_lowat = 0;
495 
496 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
497 	rq->qi_qopen = dld_open;
498 	rq->qi_qclose = dld_close;
499 	rq->qi_minfo = modinfo;
500 
501 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
502 	wq->qi_putp = (pfi_t)dld_wput;
503 	wq->qi_srvp = (pfi_t)dld_wsrv;
504 	wq->qi_minfo = modinfo;
505 
506 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
507 	stream->st_rdinit = rq;
508 	stream->st_wrinit = wq;
509 	ops->devo_cb_ops->cb_str = stream;
510 
511 	if (ops->devo_getinfo == NULL)
512 		ops->devo_getinfo = &dld_getinfo;
513 }
514 
515 void
516 dld_fini_ops(struct dev_ops *ops)
517 {
518 	struct streamtab *stream;
519 	struct qinit *rq, *wq;
520 	struct module_info *modinfo;
521 
522 	stream = ops->devo_cb_ops->cb_str;
523 	rq = stream->st_rdinit;
524 	wq = stream->st_wrinit;
525 	modinfo = rq->qi_minfo;
526 	ASSERT(wq->qi_minfo == modinfo);
527 
528 	kmem_free(stream, sizeof (struct streamtab));
529 	kmem_free(wq, sizeof (struct qinit));
530 	kmem_free(rq, sizeof (struct qinit));
531 	kmem_free(modinfo->mi_idname, FMNAMESZ);
532 	kmem_free(modinfo, sizeof (struct module_info));
533 }
534 
535 /*
536  * Initialize this module's data structures.
537  */
538 void
539 dld_str_init(void)
540 {
541 	/*
542 	 * Create dld_str_t object cache.
543 	 */
544 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
545 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
546 	ASSERT(str_cachep != NULL);
547 
548 	/*
549 	 * Create a hash table for maintaining dld_str_t's.
550 	 * The ds_minor field (the clone minor number) of a dld_str_t
551 	 * is used as a key for this hash table because this number is
552 	 * globally unique (allocated from "dls_minor_arena").
553 	 */
554 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
555 	    mod_hash_null_valdtor);
556 
557 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
558 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
559 
560 	dld_taskq_quit = B_FALSE;
561 	dld_taskq_done = B_FALSE;
562 	list_create(&dld_taskq_list, sizeof (dld_str_t),
563 	    offsetof(dld_str_t, ds_tqlist));
564 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
565 	    &p0, TS_RUN, minclsyspri);
566 }
567 
568 /*
569  * Tear down this module's data structures.
570  */
571 int
572 dld_str_fini(void)
573 {
574 	/*
575 	 * Make sure that there are no objects in use.
576 	 */
577 	if (str_count != 0)
578 		return (EBUSY);
579 
580 	/*
581 	 * Ask the dld_taskq thread to quit and wait for it to be done
582 	 */
583 	mutex_enter(&dld_taskq_lock);
584 	dld_taskq_quit = B_TRUE;
585 	cv_signal(&dld_taskq_cv);
586 	while (!dld_taskq_done)
587 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
588 	mutex_exit(&dld_taskq_lock);
589 	list_destroy(&dld_taskq_list);
590 	/*
591 	 * Destroy object cache.
592 	 */
593 	kmem_cache_destroy(str_cachep);
594 	mod_hash_destroy_idhash(str_hashp);
595 	return (0);
596 }
597 
598 /*
599  * Create a new dld_str_t object.
600  */
601 dld_str_t *
602 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
603 {
604 	dld_str_t	*dsp;
605 	int		err;
606 
607 	/*
608 	 * Allocate an object from the cache.
609 	 */
610 	atomic_inc_32(&str_count);
611 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
612 
613 	/*
614 	 * Allocate the dummy mblk for flow-control.
615 	 */
616 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
617 	if (dsp->ds_tx_flow_mp == NULL) {
618 		kmem_cache_free(str_cachep, dsp);
619 		atomic_dec_32(&str_count);
620 		return (NULL);
621 	}
622 	dsp->ds_type = type;
623 	dsp->ds_major = major;
624 	dsp->ds_style = style;
625 
626 	/*
627 	 * Initialize the queue pointers.
628 	 */
629 	ASSERT(RD(rq) == rq);
630 	dsp->ds_rq = rq;
631 	dsp->ds_wq = WR(rq);
632 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
633 
634 	/*
635 	 * We want explicit control over our write-side STREAMS queue
636 	 * where the dummy mblk gets added/removed for flow-control.
637 	 */
638 	noenable(WR(rq));
639 
640 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
641 	    (mod_hash_val_t)dsp);
642 	ASSERT(err == 0);
643 	return (dsp);
644 }
645 
/*
 * Destroy a dld_str_t object.
 *
 * Unlinks dsp from its queue pair, verifies (on DEBUG kernels) that the
 * stream has been fully detached and quiesced, resets the per-open flags so
 * the object goes back to the cache in its constructed state, frees the
 * dummy flow-control mblk if dsp still owns it, removes dsp from str_hashp,
 * and finally returns it to str_cachep and drops str_count.
 */
void
dld_str_destroy(dld_str_t *dsp)
{
	queue_t		*rq;
	queue_t		*wq;
	mod_hash_val_t	val;

	/*
	 * Clear the queue pointers.
	 */
	rq = dsp->ds_rq;
	wq = dsp->ds_wq;
	ASSERT(wq == WR(rq));
	rq->q_ptr = wq->q_ptr = NULL;
	dsp->ds_rq = dsp->ds_wq = NULL;

	/* The stream must already be detached from its PPA. */
	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
	ASSERT(dsp->ds_sap == 0);
	ASSERT(dsp->ds_mh == NULL);
	ASSERT(dsp->ds_mch == NULL);
	ASSERT(dsp->ds_promisc == 0);
	ASSERT(dsp->ds_mph == NULL);
	ASSERT(dsp->ds_mip == NULL);
	ASSERT(dsp->ds_mnh == NULL);

	/* No capability may still be enabled. */
	ASSERT(dsp->ds_polling == B_FALSE);
	ASSERT(dsp->ds_direct == B_FALSE);
	ASSERT(dsp->ds_lso == B_FALSE);
	ASSERT(dsp->ds_lso_max == 0);
	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);

	/*
	 * Reinitialize all the flags.
	 */
	dsp->ds_notifications = 0;
	dsp->ds_passivestate = DLD_UNINITIALIZED;
	dsp->ds_mode = DLD_UNITDATA;
	dsp->ds_native = B_FALSE;
	dsp->ds_nonip = B_FALSE;

	/* No data threads and no queued or in-progress DLPI requests. */
	ASSERT(dsp->ds_datathr_cnt == 0);
	ASSERT(dsp->ds_pending_head == NULL);
	ASSERT(dsp->ds_pending_tail == NULL);
	ASSERT(!dsp->ds_dlpi_pending);

	ASSERT(dsp->ds_dlp == NULL);
	ASSERT(dsp->ds_dmap == NULL);
	ASSERT(dsp->ds_rx == NULL);
	ASSERT(dsp->ds_rx_arg == NULL);
	ASSERT(dsp->ds_next == NULL);
	ASSERT(dsp->ds_head == NULL);

	/*
	 * Free the dummy mblk if exists.
	 */
	if (dsp->ds_tx_flow_mp != NULL) {
		freeb(dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	/* Undo the mod_hash_insert() done by dld_str_create(). */
	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
	ASSERT(dsp == (dld_str_t *)val);

	/*
	 * Free the object back to the cache.
	 */
	kmem_cache_free(str_cachep, dsp);
	atomic_dec_32(&str_count);
}
718 
719 /*
720  * kmem_cache contructor function: see kmem_cache_create(9f).
721  */
722 /*ARGSUSED*/
723 static int
724 str_constructor(void *buf, void *cdrarg, int kmflags)
725 {
726 	dld_str_t	*dsp = buf;
727 
728 	bzero(buf, sizeof (dld_str_t));
729 
730 	/*
731 	 * Allocate a new minor number.
732 	 */
733 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
734 		return (-1);
735 
736 	/*
737 	 * Initialize the DLPI state machine.
738 	 */
739 	dsp->ds_dlstate = DL_UNATTACHED;
740 
741 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
742 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
743 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
744 
745 	return (0);
746 }
747 
748 /*
749  * kmem_cache destructor function.
750  */
751 /*ARGSUSED*/
752 static void
753 str_destructor(void *buf, void *cdrarg)
754 {
755 	dld_str_t	*dsp = buf;
756 
757 	/*
758 	 * Release the minor number.
759 	 */
760 	mac_minor_rele(dsp->ds_minor);
761 
762 	ASSERT(dsp->ds_tx_flow_mp == NULL);
763 
764 	mutex_destroy(&dsp->ds_lock);
765 	cv_destroy(&dsp->ds_datathr_cv);
766 	cv_destroy(&dsp->ds_dlpi_pending_cv);
767 }
768 
/*
 * Update the priority bits and VID (may need to insert a tag if mp points
 * to an untagged packet).
 * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
 * If pri is 0, use the priority encoded in the packet.
 *
 * At least one of pri and vid must be set by the caller.
 *
 * Returns the message to transmit, which may be a different mblk from the
 * one passed in.  Returns NULL on allocation failure (or if the pulled-up
 * message is too short for a VLAN header); in that case the original
 * message has NOT been freed and remains the caller's to dispose of.  An
 * untagged packet is returned unchanged when no VID is given and tagmode is
 * LINK_TAGMODE_VLANONLY.
 */
static mblk_t *
i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
    link_tagmode_t tagmode)
{
	mblk_t *hmp;
	struct ether_vlan_header *evhp;
	struct ether_header *ehp;
	uint16_t old_tci = 0;	/* stays 0 for a packet with no tag */
	size_t len;

	ASSERT(pri != 0 || vid != VLAN_ID_NONE);

	evhp = (struct ether_vlan_header *)mp->b_rptr;
	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
		/*
		 * Tagged packet, update the priority bits.
		 */
		len = sizeof (struct ether_vlan_header);

		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
			/*
			 * In case some drivers only check the db_ref
			 * count of the first mblk, we pullup the
			 * message into a single mblk.
			 */
			hmp = msgpullup(mp, -1);
			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
				/* Only the copy is freed; mp is untouched. */
				freemsg(hmp);
				return (NULL);
			} else {
				freemsg(mp);
				mp = hmp;
			}
		}

		/* mp may have been replaced above; re-derive the header. */
		evhp = (struct ether_vlan_header *)mp->b_rptr;
		old_tci = ntohs(evhp->ether_tci);
	} else {
		/*
		 * Untagged packet.  Two factors will cause us to insert a
		 * VLAN header:
		 * - This is a VLAN link (vid is specified)
		 * - The link supports user priority tagging and the priority
		 *   is non-zero.
		 */
		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
			return (mp);

		/* A new leading mblk holds the complete VLAN header. */
		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
		if (hmp == NULL)
			return (NULL);

		evhp = (struct ether_vlan_header *)hmp->b_rptr;
		ehp = (struct ether_header *)mp->b_rptr;

		/*
		 * Copy the MAC addresses and typelen
		 */
		bcopy(ehp, evhp, (ETHERADDRL * 2));
		evhp->ether_type = ehp->ether_type;
		evhp->ether_tpid = htons(ETHERTYPE_VLAN);

		/* Skip the old header in mp; hmp now supplies it. */
		hmp->b_wptr += sizeof (struct ether_vlan_header);
		mp->b_rptr += sizeof (struct ether_header);

		/*
		 * Free the original message if it's now empty. Link the
		 * rest of the messages to the header message.
		 */
		if (MBLKL(mp) == 0) {
			hmp->b_cont = mp->b_cont;
			freeb(mp);
		} else {
			hmp->b_cont = mp;
		}
		mp = hmp;
	}

	/* Fall back to the packet's own values for anything left unset. */
	if (pri == 0)
		pri = VLAN_PRI(old_tci);
	if (vid == VLAN_ID_NONE)
		vid = VLAN_ID(old_tci);
	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
	return (mp);
}
859 
860 /*
861  * M_DATA put (IP fast-path mode)
862  */
863 mac_tx_cookie_t
864 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
865     uint16_t flag)
866 {
867 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
868 	mblk_t *newmp;
869 	uint_t pri;
870 	mac_tx_cookie_t cookie;
871 
872 	if (is_ethernet) {
873 		/*
874 		 * Update the priority bits to the assigned priority.
875 		 */
876 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
877 
878 		if (pri != 0) {
879 			newmp = i_dld_ether_header_update_tag(mp, pri,
880 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
881 			if (newmp == NULL)
882 				goto discard;
883 			mp = newmp;
884 		}
885 	}
886 
887 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
888 		DLD_SETQFULL(dsp);
889 	}
890 	return (cookie);
891 
892 discard:
893 	/* TODO: bump kstat? */
894 	freemsg(mp);
895 	return (NULL);
896 }
897 
/*
 * M_DATA put (DLIOCRAW mode)
 *
 * Validates a raw packet supplied by the consumer and transmits it with
 * DLD_TX().  The packet is dropped (freed) if the MAC plugin cannot cook
 * its header, if any continuation block is not M_DATA, if its header cannot
 * be parsed, if it is too large, or if its VLAN tag is not acceptable on
 * this stream.  On Ethernet the header is updated with the stream's
 * priority and/or VID where required.
 */
static void
str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
{
	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t *bp, *newmp;
	size_t size;
	mac_header_info_t mhi;
	uint_t pri, vid, dvid;
	uint_t max_sdu;

	/*
	 * Certain MAC type plugins provide an illusion for raw DLPI
	 * consumers.  They pretend that the MAC layer is something that
	 * it's not for the benefit of observability tools.  For example,
	 * mac_wifi pretends that it's Ethernet for such consumers.
	 * Here, unless native mode is enabled, we call into the MAC layer so
	 * that this illusion can be maintained.  The plugin will optionally
	 * transform the MAC header here into something that can be passed
	 * down.  The header goes from raw mode to "cooked" mode.
	 */
	if (!dsp->ds_native) {
		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
			goto discard;
		mp = newmp;
	}

	size = MBLKL(mp);

	/*
	 * Check the packet is not too big and that any remaining
	 * fragment list is composed entirely of M_DATA messages. (We
	 * know the first fragment was M_DATA otherwise we could not
	 * have got here).
	 */
	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
		if (DB_TYPE(bp) != M_DATA)
			goto discard;
		size += MBLKL(bp);
	}

	/* Parse the header; mhi supplies the header size and tag info. */
	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
		goto discard;

	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
	/*
	 * If LSO is enabled, check the size against lso_max. Otherwise,
	 * compare the packet size with max_sdu.
	 */
	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
	if (size > max_sdu + mhi.mhi_hdrsize)
		goto discard;

	if (is_ethernet) {
		/* dvid is the VID of this stream's MAC client. */
		dvid = mac_client_vid(dsp->ds_mch);

		/*
		 * Discard the packet if this is a VLAN stream and the packet
		 * already carries a non-zero VID of its own.
		 */
		vid = VLAN_ID(mhi.mhi_tci);
		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
			goto discard;

		/*
		 * Discard the packet if this packet is a tagged packet
		 * but both pri and VID are 0.
		 */
		pri = VLAN_PRI(mhi.mhi_tci);
		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
		    vid == VLAN_ID_NONE)
			goto discard;

		/*
		 * Update the priority bits to the per-stream priority if
		 * priority is not set in the packet. Update the VID for
		 * packets on a VLAN stream.
		 *
		 * A priority already present in the packet is left alone:
		 * pri becomes 0 here, which tells
		 * i_dld_ether_header_update_tag() to keep the packet's value.
		 */
		pri = (pri == 0) ? dsp->ds_pri : 0;
		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
				goto discard;
			}
			mp = newmp;
		}
	}

	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
		/* Turn on flow-control for dld */
		DLD_SETQFULL(dsp);
	}
	return;

discard:
	/* TODO: bump kstat? */
	freemsg(mp);
}
998 
/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 *
 * Attaches dsp to the link identified by (ds_major, ppa): takes a hold on
 * the devnet entry, enters the mac perimeter, takes a hold on the dls link,
 * opens it, and registers str_notify() for MAC notifications.  On success
 * the stream moves to DL_UNBOUND and 0 is returned.  On failure every
 * resource acquired so far is released and an errno value is returned.
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	dev_t			dev;
	int			err;
	const char		*drvname;
	mac_perim_handle_t	mph = NULL;
	boolean_t		qassociated = B_FALSE;
	dls_link_t		*dlp = NULL;
	dls_dl_handle_t		ddp = NULL;

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
		return (ENOTSUP);

	/*
	 * /dev node access. This will still be supported for backward
	 * compatibility reason.
	 */
	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
	    (strcmp(drvname, "vnic") != 0)) {
		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
			return (EINVAL);
		qassociated = B_TRUE;
	}

	/* The link's dev_t uses minor ppa + 1. */
	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
		goto failed;

	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
		goto failed;

	/*
	 * Open a channel.
	 */
	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
		goto failed;

	if ((err = dls_open(dlp, ddp, dsp)) != 0)
		goto failed;

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
	dsp->ds_dlstate = DL_UNBOUND;
	mac_perim_exit(mph);
	return (0);

failed:
	/* Only the resources actually acquired above are released. */
	if (dlp != NULL)
		dls_link_rele(dlp);
	if (mph != NULL)
		mac_perim_exit(mph);
	if (ddp != NULL)
		dls_devnet_rele(ddp);
	if (qassociated)
		(void) qassociate(dsp->ds_wq, -1);

	return (err);
}
1071 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Undoes dld_str_attach(): removes the MAC notification callback, disables
 * capabilities, clears the LSO state and closes the dls link inside the mac
 * perimeter, then drops the devnet hold and returns the stream to
 * DL_UNATTACHED.  No data threads may be active on the stream.
 */
void
dld_str_detach(dld_str_t *dsp)
{
	mac_perim_handle_t	mph;
	int			err;

	ASSERT(dsp->ds_datathr_cnt == 0);

	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
	/*
	 * Remove the notify function.
	 *
	 * Note that we cannot wait for the notification callback to be removed
	 * since it could cause the deadlock with str_notify() since they both
	 * need the mac perimeter. Continue if we cannot remove the
	 * notification callback right now and wait after we leave the
	 * perimeter.
	 */
	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
	dsp->ds_mnh = NULL;

	/*
	 * Disable the capabilities
	 */
	dld_capabilities_disable(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	dls_close(dsp);
	mac_perim_exit(mph);

	/*
	 * Now we leave the mac perimeter. If mac_notify_remove() failed
	 * because the notification callback was in progress, wait for
	 * it to finish before we proceed.
	 */
	if (err != 0)
		mac_notify_remove_wait(dsp->ds_mh);

	/*
	 * An unreferenced tagged (non-persistent) vlan gets destroyed
	 * automatically in the call to dls_devnet_rele.
	 */
	dls_devnet_rele(dsp->ds_ddh);

	/* Forget the per-attachment handles; dld_str_destroy() asserts these. */
	dsp->ds_sap = 0;
	dsp->ds_mh = NULL;
	dsp->ds_mch = NULL;
	dsp->ds_mip = NULL;

	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;
}
1138 
1139 /*
1140  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1141  * tags before sending packets up to the DLS clients, with the exception of
1142  * special priority tagged packets, in that case, we set the VID to 0.
1143  * mp must be a VLAN tagged packet.
1144  */
1145 static mblk_t *
1146 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1147 {
1148 	mblk_t *newmp;
1149 	struct ether_vlan_header *evhp;
1150 	uint16_t tci, new_tci;
1151 
1152 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1153 	if (DB_REF(mp) > 1) {
1154 		newmp = copymsg(mp);
1155 		if (newmp == NULL)
1156 			return (NULL);
1157 		freemsg(mp);
1158 		mp = newmp;
1159 	}
1160 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1161 
1162 	tci = ntohs(evhp->ether_tci);
1163 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1164 		/*
1165 		 * Priority is 0, strip the tag.
1166 		 */
1167 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1168 		mp->b_rptr += VLAN_TAGSZ;
1169 	} else {
1170 		/*
1171 		 * Priority is not 0, update the VID to 0.
1172 		 */
1173 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1174 		evhp->ether_tci = htons(new_tci);
1175 	}
1176 	return (mp);
1177 }
1178 
1179 /*
1180  * Raw mode receive function.
1181  */
1182 /*ARGSUSED*/
1183 void
1184 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1185     mac_header_info_t *mhip)
1186 {
1187 	dld_str_t *dsp = (dld_str_t *)arg;
1188 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1189 	mblk_t *next, *newmp;
1190 
1191 	ASSERT(mp != NULL);
1192 	do {
1193 		/*
1194 		 * Get the pointer to the next packet in the chain and then
1195 		 * clear b_next before the packet gets passed on.
1196 		 */
1197 		next = mp->b_next;
1198 		mp->b_next = NULL;
1199 
1200 		/*
1201 		 * Wind back b_rptr to point at the MAC header.
1202 		 */
1203 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1204 		mp->b_rptr -= mhip->mhi_hdrsize;
1205 
1206 		/*
1207 		 * Certain MAC type plugins provide an illusion for raw
1208 		 * DLPI consumers.  They pretend that the MAC layer is
1209 		 * something that it's not for the benefit of observability
1210 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1211 		 * for such consumers.	Here, unless native mode is enabled,
1212 		 * we call into the MAC layer so that this illusion can be
1213 		 * maintained.	The plugin will optionally transform the MAC
1214 		 * header here into something that can be passed up to raw
1215 		 * consumers.  The header goes from "cooked" mode to raw mode.
1216 		 */
1217 		if (!dsp->ds_native) {
1218 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1219 			if (newmp == NULL) {
1220 				freemsg(mp);
1221 				goto next;
1222 			}
1223 			mp = newmp;
1224 		}
1225 
1226 		/*
1227 		 * Strip the VLAN tag for VLAN streams.
1228 		 */
1229 		if (is_ethernet &&
1230 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1231 			/*
1232 			 * The priority should be kept only for VLAN
1233 			 * data-links.
1234 			 */
1235 			newmp = i_dld_ether_header_strip_tag(mp,
1236 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1237 			if (newmp == NULL) {
1238 				freemsg(mp);
1239 				goto next;
1240 			}
1241 			mp = newmp;
1242 		}
1243 
1244 		/*
1245 		 * Pass the packet on.
1246 		 */
1247 		if (canputnext(dsp->ds_rq))
1248 			putnext(dsp->ds_rq, mp);
1249 		else
1250 			freemsg(mp);
1251 
1252 next:
1253 		/*
1254 		 * Move on to the next packet in the chain.
1255 		 */
1256 		mp = next;
1257 	} while (mp != NULL);
1258 }
1259 
1260 /*
1261  * Fast-path receive function.
1262  */
1263 /*ARGSUSED*/
1264 void
1265 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1266     mac_header_info_t *mhip)
1267 {
1268 	dld_str_t *dsp = (dld_str_t *)arg;
1269 	mblk_t *next;
1270 	size_t offset = 0;
1271 
1272 	/*
1273 	 * MAC header stripping rules:
1274 	 *    - Tagged packets:
1275 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1276 	 *	b. Physical streams
1277 	 *	- VLAN packets (non-zero VID). The stream must be either a
1278 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1279 	 *	  Strip the Ethernet header but keep the VLAN header.
1280 	 *	- Special tagged packets (zero VID)
1281 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1282 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1283 	 *	    keep the VLAN header.
1284 	 *	  * Otherwise, strip the whole VLAN header.
1285 	 *    - Untagged packets. Strip the whole MAC header.
1286 	 */
1287 	if (mhip->mhi_istagged &&
1288 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1289 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1290 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1291 		offset = VLAN_TAGSZ;
1292 	}
1293 
1294 	ASSERT(mp != NULL);
1295 	do {
1296 		/*
1297 		 * Get the pointer to the next packet in the chain and then
1298 		 * clear b_next before the packet gets passed on.
1299 		 */
1300 		next = mp->b_next;
1301 		mp->b_next = NULL;
1302 
1303 		/*
1304 		 * Wind back b_rptr to point at the VLAN header.
1305 		 */
1306 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1307 		mp->b_rptr -= offset;
1308 
1309 		/*
1310 		 * Pass the packet on.
1311 		 */
1312 		if (canputnext(dsp->ds_rq))
1313 			putnext(dsp->ds_rq, mp);
1314 		else
1315 			freemsg(mp);
1316 		/*
1317 		 * Move on to the next packet in the chain.
1318 		 */
1319 		mp = next;
1320 	} while (mp != NULL);
1321 }
1322 
1323 /*
1324  * Default receive function (send DL_UNITDATA_IND messages).
1325  */
1326 /*ARGSUSED*/
1327 void
1328 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1329     mac_header_info_t *mhip)
1330 {
1331 	dld_str_t		*dsp = (dld_str_t *)arg;
1332 	mblk_t			*ud_mp;
1333 	mblk_t			*next;
1334 	size_t			offset = 0;
1335 	boolean_t		strip_vlan = B_TRUE;
1336 
1337 	/*
1338 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1339 	 */
1340 	if (mhip->mhi_istagged &&
1341 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1342 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1343 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1344 		offset = VLAN_TAGSZ;
1345 		strip_vlan = B_FALSE;
1346 	}
1347 
1348 	ASSERT(mp != NULL);
1349 	do {
1350 		/*
1351 		 * Get the pointer to the next packet in the chain and then
1352 		 * clear b_next before the packet gets passed on.
1353 		 */
1354 		next = mp->b_next;
1355 		mp->b_next = NULL;
1356 
1357 		/*
1358 		 * Wind back b_rptr to point at the MAC header.
1359 		 */
1360 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1361 		mp->b_rptr -= mhip->mhi_hdrsize;
1362 
1363 		/*
1364 		 * Create the DL_UNITDATA_IND M_PROTO.
1365 		 */
1366 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1367 			freemsgchain(mp);
1368 			return;
1369 		}
1370 
1371 		/*
1372 		 * Advance b_rptr to point at the payload (or the VLAN header).
1373 		 */
1374 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1375 
1376 		/*
1377 		 * Prepend the DL_UNITDATA_IND.
1378 		 */
1379 		ud_mp->b_cont = mp;
1380 
1381 		/*
1382 		 * Send the message.
1383 		 */
1384 		if (canputnext(dsp->ds_rq))
1385 			putnext(dsp->ds_rq, ud_mp);
1386 		else
1387 			freemsg(ud_mp);
1388 
1389 		/*
1390 		 * Move on to the next packet in the chain.
1391 		 */
1392 		mp = next;
1393 	} while (mp != NULL);
1394 }
1395 
1396 /*
1397  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1398  */
1399 static void
1400 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu, uint_t multicast_sdu)
1401 {
1402 	mblk_t		*mp;
1403 	dl_notify_ind_t *dlip;
1404 
1405 	if (!(dsp->ds_notifications & (DL_NOTE_SDU_SIZE|DL_NOTE_SDU_SIZE2)))
1406 		return;
1407 
1408 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1409 	    M_PROTO, 0)) == NULL)
1410 		return;
1411 
1412 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1413 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1414 	dlip->dl_primitive = DL_NOTIFY_IND;
1415 	if (dsp->ds_notifications & DL_NOTE_SDU_SIZE2) {
1416 		dlip->dl_notification = DL_NOTE_SDU_SIZE2;
1417 		dlip->dl_data1 = max_sdu;
1418 		dlip->dl_data2 = multicast_sdu;
1419 	} else {
1420 		dlip->dl_notification = DL_NOTE_SDU_SIZE;
1421 		dlip->dl_data = max_sdu;
1422 	}
1423 
1424 	qreply(dsp->ds_wq, mp);
1425 }
1426 
1427 /*
1428  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1429  * current state of the interface.
1430  */
1431 void
1432 dld_str_notify_ind(dld_str_t *dsp)
1433 {
1434 	mac_notify_type_t	type;
1435 
1436 	for (type = 0; type < MAC_NNOTE; type++)
1437 		str_notify(dsp, type);
1438 }
1439 
/*
 * Layout of the DL_UNITDATA_IND message built by str_unitdata_ind(): the
 * fixed DLPI header followed by one buffer each for the destination and
 * source DLSAP address.  Each buffer has room for a MAC address of up to
 * MAXMACADDRLEN bytes plus a 16-bit SAP value.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;	/* must stay the first member */
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1445 
1446 /*
1447  * Create a DL_UNITDATA_IND M_PROTO message.
1448  */
1449 static mblk_t *
1450 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1451 {
1452 	mblk_t				*nmp;
1453 	dl_unitdata_ind_wrapper_t	*dlwp;
1454 	dl_unitdata_ind_t		*dlp;
1455 	mac_header_info_t		mhi;
1456 	uint_t				addr_length;
1457 	uint8_t				*daddr;
1458 	uint8_t				*saddr;
1459 
1460 	/*
1461 	 * Get the packet header information.
1462 	 */
1463 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
1464 		return (NULL);
1465 
1466 	/*
1467 	 * Allocate a message large enough to contain the wrapper structure
1468 	 * defined above.
1469 	 */
1470 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1471 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1472 	    DL_UNITDATA_IND)) == NULL)
1473 		return (NULL);
1474 
1475 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1476 
1477 	dlp = &(dlwp->dl_unitdata);
1478 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1479 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1480 
1481 	/*
1482 	 * Copy in the destination address.
1483 	 */
1484 	addr_length = dsp->ds_mip->mi_addr_length;
1485 	daddr = dlwp->dl_dest_addr;
1486 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1487 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1488 
1489 	/*
1490 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1491 	 */
1492 	if (mhi.mhi_istagged && !strip_vlan)
1493 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1494 	else
1495 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1496 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1497 
1498 	/*
1499 	 * If the destination address was multicast or broadcast then the
1500 	 * dl_group_address field should be non-zero.
1501 	 */
1502 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1503 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1504 
1505 	/*
1506 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1507 	 * for example) may not have access to source information.
1508 	 */
1509 	if (mhi.mhi_saddr == NULL) {
1510 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1511 	} else {
1512 		saddr = dlwp->dl_src_addr;
1513 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1514 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1515 
1516 		/*
1517 		 * Set the source DLSAP to the packet ethertype.
1518 		 */
1519 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1520 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1521 	}
1522 
1523 	return (nmp);
1524 }
1525 
1526 /*
1527  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1528  */
1529 static void
1530 str_notify_promisc_on_phys(dld_str_t *dsp)
1531 {
1532 	mblk_t		*mp;
1533 	dl_notify_ind_t	*dlip;
1534 
1535 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1536 		return;
1537 
1538 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1539 	    M_PROTO, 0)) == NULL)
1540 		return;
1541 
1542 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1543 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1544 	dlip->dl_primitive = DL_NOTIFY_IND;
1545 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1546 
1547 	qreply(dsp->ds_wq, mp);
1548 }
1549 
1550 /*
1551  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1552  */
1553 static void
1554 str_notify_promisc_off_phys(dld_str_t *dsp)
1555 {
1556 	mblk_t		*mp;
1557 	dl_notify_ind_t	*dlip;
1558 
1559 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1560 		return;
1561 
1562 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1563 	    M_PROTO, 0)) == NULL)
1564 		return;
1565 
1566 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1567 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1568 	dlip->dl_primitive = DL_NOTIFY_IND;
1569 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1570 
1571 	qreply(dsp->ds_wq, mp);
1572 }
1573 
1574 /*
1575  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1576  */
1577 static void
1578 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1579 {
1580 	mblk_t		*mp;
1581 	dl_notify_ind_t	*dlip;
1582 	uint_t		addr_length;
1583 	uint16_t	ethertype;
1584 
1585 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1586 		return;
1587 
1588 	addr_length = dsp->ds_mip->mi_addr_length;
1589 	if ((mp = mexchange(dsp->ds_wq, NULL,
1590 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1591 	    M_PROTO, 0)) == NULL)
1592 		return;
1593 
1594 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1595 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1596 	dlip->dl_primitive = DL_NOTIFY_IND;
1597 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1598 	dlip->dl_data = addr_type;
1599 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1600 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1601 
1602 	bcopy(addr, &dlip[1], addr_length);
1603 
1604 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1605 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1606 
1607 	qreply(dsp->ds_wq, mp);
1608 }
1609 
1610 /*
1611  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1612  */
1613 static void
1614 str_notify_link_up(dld_str_t *dsp)
1615 {
1616 	mblk_t		*mp;
1617 	dl_notify_ind_t	*dlip;
1618 
1619 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1620 		return;
1621 
1622 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1623 	    M_PROTO, 0)) == NULL)
1624 		return;
1625 
1626 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1627 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1628 	dlip->dl_primitive = DL_NOTIFY_IND;
1629 	dlip->dl_notification = DL_NOTE_LINK_UP;
1630 
1631 	qreply(dsp->ds_wq, mp);
1632 }
1633 
1634 /*
1635  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1636  */
1637 static void
1638 str_notify_link_down(dld_str_t *dsp)
1639 {
1640 	mblk_t		*mp;
1641 	dl_notify_ind_t	*dlip;
1642 
1643 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1644 		return;
1645 
1646 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1647 	    M_PROTO, 0)) == NULL)
1648 		return;
1649 
1650 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1651 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1652 	dlip->dl_primitive = DL_NOTIFY_IND;
1653 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1654 
1655 	qreply(dsp->ds_wq, mp);
1656 }
1657 
1658 /*
1659  * DL_NOTIFY_IND: DL_NOTE_SPEED
1660  */
1661 static void
1662 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1663 {
1664 	mblk_t		*mp;
1665 	dl_notify_ind_t	*dlip;
1666 
1667 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1668 		return;
1669 
1670 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1671 	    M_PROTO, 0)) == NULL)
1672 		return;
1673 
1674 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1675 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1676 	dlip->dl_primitive = DL_NOTIFY_IND;
1677 	dlip->dl_notification = DL_NOTE_SPEED;
1678 	dlip->dl_data = speed;
1679 
1680 	qreply(dsp->ds_wq, mp);
1681 }
1682 
1683 /*
1684  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1685  */
1686 static void
1687 str_notify_capab_reneg(dld_str_t *dsp)
1688 {
1689 	mblk_t		*mp;
1690 	dl_notify_ind_t	*dlip;
1691 
1692 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1693 		return;
1694 
1695 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1696 	    M_PROTO, 0)) == NULL)
1697 		return;
1698 
1699 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1700 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1701 	dlip->dl_primitive = DL_NOTIFY_IND;
1702 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1703 
1704 	qreply(dsp->ds_wq, mp);
1705 }
1706 
1707 /*
1708  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1709  */
1710 static void
1711 str_notify_fastpath_flush(dld_str_t *dsp)
1712 {
1713 	mblk_t		*mp;
1714 	dl_notify_ind_t	*dlip;
1715 
1716 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1717 		return;
1718 
1719 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1720 	    M_PROTO, 0)) == NULL)
1721 		return;
1722 
1723 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1724 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1725 	dlip->dl_primitive = DL_NOTIFY_IND;
1726 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1727 
1728 	qreply(dsp->ds_wq, mp);
1729 }
1730 
1731 static void
1732 str_notify_allowed_ips(dld_str_t *dsp)
1733 {
1734 	mblk_t		*mp;
1735 	dl_notify_ind_t	*dlip;
1736 	size_t		mp_size;
1737 	mac_protect_t	*mrp;
1738 
1739 	if (!(dsp->ds_notifications & DL_NOTE_ALLOWED_IPS))
1740 		return;
1741 
1742 	mp_size = sizeof (mac_protect_t) + sizeof (dl_notify_ind_t);
1743 	if ((mp = mexchange(dsp->ds_wq, NULL, mp_size, M_PROTO, 0)) == NULL)
1744 		return;
1745 
1746 	mrp = mac_protect_get(dsp->ds_mh);
1747 	bzero(mp->b_rptr, mp_size);
1748 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1749 	dlip->dl_primitive = DL_NOTIFY_IND;
1750 	dlip->dl_notification = DL_NOTE_ALLOWED_IPS;
1751 	dlip->dl_data = 0;
1752 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1753 	dlip->dl_addr_length = sizeof (mac_protect_t);
1754 	bcopy(mrp, mp->b_rptr + sizeof (dl_notify_ind_t),
1755 	    sizeof (mac_protect_t));
1756 
1757 	qreply(dsp->ds_wq, mp);
1758 }
1759 
1760 /*
1761  * MAC notification callback.
1762  */
1763 void
1764 str_notify(void *arg, mac_notify_type_t type)
1765 {
1766 	dld_str_t		*dsp = (dld_str_t *)arg;
1767 	queue_t			*q = dsp->ds_wq;
1768 	mac_handle_t		mh = dsp->ds_mh;
1769 	mac_client_handle_t	mch = dsp->ds_mch;
1770 	uint8_t			addr[MAXMACADDRLEN];
1771 
1772 	switch (type) {
1773 	case MAC_NOTE_TX:
1774 		qenable(q);
1775 		break;
1776 
1777 	case MAC_NOTE_DEVPROMISC:
1778 		/*
1779 		 * Send the appropriate DL_NOTIFY_IND.
1780 		 */
1781 		if (mac_promisc_get(mh))
1782 			str_notify_promisc_on_phys(dsp);
1783 		else
1784 			str_notify_promisc_off_phys(dsp);
1785 		break;
1786 
1787 	case MAC_NOTE_UNICST:
1788 		/*
1789 		 * This notification is sent whenever the MAC unicast
1790 		 * address changes.
1791 		 */
1792 		mac_unicast_primary_get(mh, addr);
1793 
1794 		/*
1795 		 * Send the appropriate DL_NOTIFY_IND.
1796 		 */
1797 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1798 		break;
1799 
1800 	case MAC_NOTE_DEST:
1801 		/*
1802 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
1803 		 * destination address.
1804 		 */
1805 		if (mac_dst_get(dsp->ds_mh, addr))
1806 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1807 		break;
1808 
1809 	case MAC_NOTE_LOWLINK:
1810 	case MAC_NOTE_LINK:
1811 		/*
1812 		 * LOWLINK refers to the actual link status. For links that
1813 		 * are not part of a bridge instance LOWLINK and LINK state
1814 		 * are the same. But for a link part of a bridge instance
1815 		 * LINK state refers to the aggregate link status: "up" when
1816 		 * at least one link part of the bridge is up and is "down"
1817 		 * when all links part of the bridge are down.
1818 		 *
1819 		 * Clients can request to be notified of the LOWLINK state
1820 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1821 		 * daemon request lowlink state changes and upper layer clients
1822 		 * receive notifications of the aggregate link state changes
1823 		 * which is the default when requesting LINK UP/DOWN state
1824 		 * notifications.
1825 		 */
1826 
1827 		/*
1828 		 * Check that the notification type matches the one that we
1829 		 * want.  If we want lower-level link notifications, and this
1830 		 * is upper, or if we want upper and this is lower, then
1831 		 * ignore.
1832 		 */
1833 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1834 			break;
1835 		/*
1836 		 * This notification is sent every time the MAC driver
1837 		 * updates the link state.
1838 		 */
1839 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1840 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1841 		case LINK_STATE_UP: {
1842 			uint64_t speed;
1843 			/*
1844 			 * The link is up so send the appropriate
1845 			 * DL_NOTIFY_IND.
1846 			 */
1847 			str_notify_link_up(dsp);
1848 
1849 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1850 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1851 			break;
1852 		}
1853 		case LINK_STATE_DOWN:
1854 			/*
1855 			 * The link is down so send the appropriate
1856 			 * DL_NOTIFY_IND.
1857 			 */
1858 			str_notify_link_down(dsp);
1859 			break;
1860 
1861 		default:
1862 			break;
1863 		}
1864 		break;
1865 
1866 	case MAC_NOTE_CAPAB_CHG:
1867 		/*
1868 		 * This notification is sent whenever the MAC resources
1869 		 * change or capabilities change. We need to renegotiate
1870 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1871 		 */
1872 		str_notify_capab_reneg(dsp);
1873 		break;
1874 
1875 	case MAC_NOTE_SDU_SIZE: {
1876 		uint_t  max_sdu;
1877 		uint_t	multicast_sdu;
1878 		mac_sdu_get2(dsp->ds_mh, NULL, &max_sdu, &multicast_sdu);
1879 		str_notify_sdu_size(dsp, max_sdu, multicast_sdu);
1880 		break;
1881 	}
1882 
1883 	case MAC_NOTE_FASTPATH_FLUSH:
1884 		str_notify_fastpath_flush(dsp);
1885 		break;
1886 
1887 	/* Unused notifications */
1888 	case MAC_NOTE_MARGIN:
1889 		break;
1890 
1891 	case MAC_NOTE_ALLOWED_IPS:
1892 		str_notify_allowed_ips(dsp);
1893 		break;
1894 
1895 	default:
1896 		ASSERT(B_FALSE);
1897 		break;
1898 	}
1899 }
1900 
1901 /*
1902  * This function is called via a taskq mechansim to process all control
1903  * messages on a per 'dsp' end point.
1904  */
1905 static void
1906 dld_wput_nondata_task(void *arg)
1907 {
1908 	dld_str_t	*dsp = arg;
1909 	mblk_t		*mp;
1910 
1911 	mutex_enter(&dsp->ds_lock);
1912 	while (dsp->ds_pending_head != NULL) {
1913 		mp = dsp->ds_pending_head;
1914 		dsp->ds_pending_head = mp->b_next;
1915 		mp->b_next = NULL;
1916 		if (dsp->ds_pending_head == NULL)
1917 			dsp->ds_pending_tail = NULL;
1918 		mutex_exit(&dsp->ds_lock);
1919 
1920 		switch (DB_TYPE(mp)) {
1921 		case M_PROTO:
1922 		case M_PCPROTO:
1923 			dld_proto(dsp, mp);
1924 			break;
1925 		case M_IOCTL:
1926 			dld_ioc(dsp, mp);
1927 			break;
1928 		default:
1929 			ASSERT(0);
1930 		}
1931 
1932 		mutex_enter(&dsp->ds_lock);
1933 	}
1934 	ASSERT(dsp->ds_pending_tail == NULL);
1935 	dsp->ds_dlpi_pending = 0;
1936 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1937 	mutex_exit(&dsp->ds_lock);
1938 }
1939 
1940 /*
1941  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1942  * thread is started at boot time.
1943  */
1944 static void
1945 dld_taskq_dispatch(void)
1946 {
1947 	callb_cpr_t	cprinfo;
1948 	dld_str_t	*dsp;
1949 
1950 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1951 	    "dld_taskq_dispatch");
1952 	mutex_enter(&dld_taskq_lock);
1953 
1954 	while (!dld_taskq_quit) {
1955 		dsp = list_head(&dld_taskq_list);
1956 		while (dsp != NULL) {
1957 			list_remove(&dld_taskq_list, dsp);
1958 			mutex_exit(&dld_taskq_lock);
1959 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1960 			    dsp, TQ_SLEEP) != 0);
1961 			mutex_enter(&dld_taskq_lock);
1962 			dsp = list_head(&dld_taskq_list);
1963 		}
1964 
1965 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1966 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1967 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1968 	}
1969 
1970 	dld_taskq_done = B_TRUE;
1971 	cv_signal(&dld_taskq_cv);
1972 	CALLB_CPR_EXIT(&cprinfo);
1973 	thread_exit();
1974 }
1975 
1976 /*
1977  * All control operations are serialized on the 'dsp' and are also funneled
1978  * through a taskq mechanism to ensure that subsequent processing has kernel
1979  * context and can safely use cv_wait.
1980  *
1981  * Mechanisms to handle taskq dispatch failures
1982  *
1983  * The only way to be sure that taskq dispatch does not fail is to either
1984  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1985  * some number of entries and make sure that the number of outstanding requests
1986  * are less than that number. We can't use TQ_SLEEP since we don't know the
1987  * context. Nor can we bound the total number of 'dsp' end points. So we are
1988  * unable to use either of the above schemes, and are forced to deal with
1989  * taskq dispatch failures. Note that even dynamic taskq could fail in
1990  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1991  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1992  * framework.
1993  *
1994  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1995  * We also have a single global thread to retry the taskq dispatch. This
1996  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1997  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1998  */
1999 static void
2000 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
2001 {
2002 	ASSERT(mp->b_next == NULL);
2003 	mutex_enter(&dsp->ds_lock);
2004 	if (dsp->ds_pending_head != NULL) {
2005 		ASSERT(dsp->ds_dlpi_pending);
2006 		dsp->ds_pending_tail->b_next = mp;
2007 		dsp->ds_pending_tail = mp;
2008 		mutex_exit(&dsp->ds_lock);
2009 		return;
2010 	}
2011 	ASSERT(dsp->ds_pending_tail == NULL);
2012 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
2013 	/*
2014 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
2015 	 * thread is still active and is processing the last message, though
2016 	 * the pending queue has been emptied.
2017 	 */
2018 	if (dsp->ds_dlpi_pending) {
2019 		mutex_exit(&dsp->ds_lock);
2020 		return;
2021 	}
2022 
2023 	dsp->ds_dlpi_pending = 1;
2024 	mutex_exit(&dsp->ds_lock);
2025 
2026 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
2027 	    TQ_NOSLEEP) != 0)
2028 		return;
2029 
2030 	mutex_enter(&dld_taskq_lock);
2031 	list_insert_tail(&dld_taskq_list, dsp);
2032 	cv_signal(&dld_taskq_cv);
2033 	mutex_exit(&dld_taskq_lock);
2034 }
2035 
2036 /*
2037  * Process an M_IOCTL message.
2038  */
2039 static void
2040 dld_ioc(dld_str_t *dsp, mblk_t *mp)
2041 {
2042 	uint_t			cmd;
2043 
2044 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2045 	ASSERT(dsp->ds_type == DLD_DLPI);
2046 
2047 	switch (cmd) {
2048 	case DLIOCNATIVE:
2049 		ioc_native(dsp, mp);
2050 		break;
2051 	case DLIOCMARGININFO:
2052 		ioc_margin(dsp, mp);
2053 		break;
2054 	case DLIOCRAW:
2055 		ioc_raw(dsp, mp);
2056 		break;
2057 	case DLIOCHDRINFO:
2058 		ioc_fast(dsp, mp);
2059 		break;
2060 	case DLIOCLOWLINK:
2061 		ioc_lowlink(dsp, mp);
2062 		break;
2063 	default:
2064 		ioc(dsp, mp);
2065 	}
2066 }
2067 
2068 /*
2069  * DLIOCNATIVE
2070  */
2071 static void
2072 ioc_native(dld_str_t *dsp, mblk_t *mp)
2073 {
2074 	queue_t *q = dsp->ds_wq;
2075 	const mac_info_t *mip = dsp->ds_mip;
2076 
2077 	/*
2078 	 * Native mode can be enabled if it's disabled and if the
2079 	 * native media type is different.
2080 	 */
2081 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2082 		dsp->ds_native = B_TRUE;
2083 
2084 	if (dsp->ds_native)
2085 		miocack(q, mp, 0, mip->mi_nativemedia);
2086 	else
2087 		miocnak(q, mp, 0, ENOTSUP);
2088 }
2089 
2090 /*
2091  * DLIOCMARGININFO
2092  */
2093 static void
2094 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2095 {
2096 	queue_t *q = dsp->ds_wq;
2097 	uint32_t margin;
2098 	int err;
2099 
2100 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2101 		err = EINVAL;
2102 		goto failed;
2103 	}
2104 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2105 		goto failed;
2106 
2107 	mac_margin_get(dsp->ds_mh, &margin);
2108 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2109 	miocack(q, mp, sizeof (uint32_t), 0);
2110 	return;
2111 
2112 failed:
2113 	miocnak(q, mp, 0, err);
2114 }
2115 
2116 /*
2117  * DLIOCRAW
2118  */
2119 static void
2120 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2121 {
2122 	queue_t *q = dsp->ds_wq;
2123 	mac_perim_handle_t	mph;
2124 
2125 	if (dsp->ds_mh == NULL) {
2126 		dsp->ds_mode = DLD_RAW;
2127 		miocack(q, mp, 0, 0);
2128 		return;
2129 	}
2130 
2131 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2132 	if (dsp->ds_polling || dsp->ds_direct) {
2133 		mac_perim_exit(mph);
2134 		miocnak(q, mp, 0, EPROTO);
2135 		return;
2136 	}
2137 
2138 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2139 		/*
2140 		 * Set the receive callback.
2141 		 */
2142 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2143 	}
2144 
2145 	/*
2146 	 * Note that raw mode is enabled.
2147 	 */
2148 	dsp->ds_mode = DLD_RAW;
2149 	mac_perim_exit(mph);
2150 
2151 	miocack(q, mp, 0, 0);
2152 }
2153 
2154 /*
2155  * DLIOCHDRINFO
2156  */
2157 static void
2158 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2159 {
2160 	dl_unitdata_req_t *dlp;
2161 	off_t		off;
2162 	size_t		len;
2163 	const uint8_t	*addr;
2164 	uint16_t	sap;
2165 	mblk_t		*nmp;
2166 	mblk_t		*hmp;
2167 	uint_t		addr_length;
2168 	queue_t		*q = dsp->ds_wq;
2169 	int		err;
2170 	mac_perim_handle_t	mph;
2171 
2172 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2173 		err = ENOTSUP;
2174 		goto failed;
2175 	}
2176 
2177 	/*
2178 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2179 	 * user-land should not be allowed.
2180 	 */
2181 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2182 		err = EINVAL;
2183 		goto failed;
2184 	}
2185 
2186 	nmp = mp->b_cont;
2187 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2188 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2189 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2190 		err = EINVAL;
2191 		goto failed;
2192 	}
2193 
2194 	off = dlp->dl_dest_addr_offset;
2195 	len = dlp->dl_dest_addr_length;
2196 
2197 	if (!MBLKIN(nmp, off, len)) {
2198 		err = EINVAL;
2199 		goto failed;
2200 	}
2201 
2202 	if (dsp->ds_dlstate != DL_IDLE) {
2203 		err = ENOTSUP;
2204 		goto failed;
2205 	}
2206 
2207 	addr_length = dsp->ds_mip->mi_addr_length;
2208 	if (len != addr_length + sizeof (uint16_t)) {
2209 		err = EINVAL;
2210 		goto failed;
2211 	}
2212 
2213 	addr = nmp->b_rptr + off;
2214 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2215 
2216 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2217 		err = ENOMEM;
2218 		goto failed;
2219 	}
2220 
2221 	/*
2222 	 * This ioctl might happen concurrently with a direct call to dld_capab
2223 	 * that tries to enable direct and/or poll capabilities. Since the
2224 	 * stack does not serialize them, we do so here to avoid mixing
2225 	 * the callbacks.
2226 	 */
2227 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2228 	if (dsp->ds_mode != DLD_FASTPATH) {
2229 		/*
2230 		 * Set the receive callback (unless polling is enabled).
2231 		 */
2232 		if (!dsp->ds_polling && !dsp->ds_direct)
2233 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2234 
2235 		/*
2236 		 * Note that fast-path mode is enabled.
2237 		 */
2238 		dsp->ds_mode = DLD_FASTPATH;
2239 	}
2240 	mac_perim_exit(mph);
2241 
2242 	freemsg(nmp->b_cont);
2243 	nmp->b_cont = hmp;
2244 
2245 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2246 	return;
2247 failed:
2248 	miocnak(q, mp, 0, err);
2249 }
2250 
2251 /*
2252  * DLIOCLOWLINK: request actual link state changes. When the
2253  * link is part of a bridge instance the client receives actual
2254  * link state changes and not the aggregate link status. Used by
2255  * the bridging daemon (bridged) for proper RSTP operation.
2256  */
2257 static void
2258 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2259 {
2260 	queue_t *q = dsp->ds_wq;
2261 	int err;
2262 
2263 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
2264 		miocnak(q, mp, 0, err);
2265 	} else {
2266 		/* LINTED: alignment */
2267 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2268 		miocack(q, mp, 0, 0);
2269 	}
2270 }
2271 
2272 /*
2273  * Catch-all handler.
2274  */
2275 static void
2276 ioc(dld_str_t *dsp, mblk_t *mp)
2277 {
2278 	queue_t	*q = dsp->ds_wq;
2279 
2280 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2281 		miocnak(q, mp, 0, EINVAL);
2282 		return;
2283 	}
2284 	mac_ioctl(dsp->ds_mh, q, mp);
2285 }
2286