1 /*
2 * Copyright (c) 2013 Chris Torek <torek @ torek net>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 /*
27 * This file and its contents are supplied under the terms of the
28 * Common Development and Distribution License ("CDDL"), version 1.0.
29 * You may only use this file in accordance with the terms of version
30 * 1.0 of the CDDL.
31 *
32 * A full copy of the text of the CDDL should have accompanied this
33 * source. A copy of the CDDL is also available via the Internet at
34 * http://www.illumos.org/license/CDDL.
35 *
36 * Copyright 2015 Pluribus Networks Inc.
37 * Copyright 2019 Joyent, Inc.
38 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
39 * Copyright 2023 Oxide Computer Company
40 */
41
42 /*
43 * viona - VirtIO-Net, Accelerated
44 *
45 * The purpose of viona is to provide high performance virtio-net devices to
46 * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
47 * DLS/DLD stack.
48 *
49 * --------------------
50 * General Architecture
51 * --------------------
52 *
53 * A single viona instance is comprised of a "link" handle and two "rings".
54 * After opening the viona device, it must be associated with a MAC network
55 * interface and a bhyve (vmm) instance to form its link resource. This is
56 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
57 * passed in to perform the initialization. With the MAC client opened, and a
58 * driver handle to the vmm instance established, the device is ready to be
59 * configured by the guest.
60 *
61 * The userspace portion of bhyve, which interfaces with the PCI device
62 * emulation framework, is meant to stay out of the datapath if at all
63 * possible. Configuration changes made via PCI are mapped to actions which
64 * will steer the operation of the in-kernel logic.
65 *
66 *
67 * -----------
68 * Ring Basics
69 * -----------
70 *
71 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
72 * transfers to and from the guest. They represent an interface to the
73 * standard virtio ring structures. When initialized and active, each ring is
74 * backed by a kernel worker thread (parented to the bhyve process for the
75 * instance) which handles ring events. The RX worker has the simple task of
76 * watching for ring shutdown conditions. The TX worker does that in addition
77 * to processing all requests to transmit data. Data destined for the guest is
78 * delivered directly by MAC to viona_rx() when the ring is active.
79 *
80 *
81 * -----------
82 * Ring States
83 * -----------
84 *
85 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
87 *
88 * +<--------------------------------------------+
89 * | |
90 * V ^
91 * +-----------+ This is the initial state when a link is created or
92 * | VRS_RESET | when the ring has been explicitly reset.
93 * +-----------+
94 * | ^
95 * |---* ioctl(VNA_IOC_RING_INIT) issued |
96 * | |
97 * | ^
98 * V
99 * +-----------+ The ring parameters (size, guest physical addresses)
100 * | VRS_SETUP | have been set and start-up of the ring worker thread
101 * +-----------+ has begun.
102 * | ^
103 * | |
104 * |---* ring worker thread begins execution |
105 * | |
106 * +-------------------------------------------->+
107 * | | ^
108 * | |
109 * | * If ring shutdown is requested (by ioctl or impending
110 * | bhyve process death) while the worker thread is
111 * | starting, the worker will transition the ring to
112 * | VRS_RESET and exit.
113 * | ^
114 * | |
115 * |<-------------------------------------------<+
116 * | | |
117 * | | ^
 * | * If ring is requested to pause (but not stop) from the
119 * | VRS_RUN state, it will return to the VRS_INIT state.
120 * |
121 * | ^
122 * | |
123 * | ^
124 * V
125 * +-----------+ The worker thread associated with the ring has started
126 * | VRS_INIT | executing. It has allocated any extra resources needed
127 * +-----------+ for the ring to operate.
128 * | ^
129 * | |
130 * +-------------------------------------------->+
131 * | | ^
132 * | |
133 * | * If ring shutdown is requested while the worker is
134 * | waiting in VRS_INIT, it will free any extra resources
135 * | and transition to VRS_RESET.
136 * | ^
137 * | |
138 * |--* ioctl(VNA_IOC_RING_KICK) issued |
139 * | ^
140 * V
141 * +-----------+ The worker thread associated with the ring is executing
142 * | VRS_RUN | workload specific to that ring.
143 * +-----------+
144 * | ^
145 * |---* ioctl(VNA_IOC_RING_RESET) issued |
146 * | (or bhyve process begins exit) ^
147 * |
148 * +-----------+ The worker thread associated with the ring is in the
149 * | VRS_STOP | process of exiting. All outstanding TX and RX
150 * +-----------+ requests are allowed to complete, but new requests
151 * | must be ignored.
152 * | ^
153 * | |
154 * +-------------------------------------------->+
155 *
156 *
157 * While the worker thread is not running, changes to vr_state are only made by
158 * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
159 * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
160 * has been started, only it may perform ring state transitions (still under
161 * the protection of vr_lock), when requested by outside consumers via
162 * vr_state_flags or when the containing bhyve process initiates an exit.
163 *
164 *
165 * ----------------------------
166 * Transmission mblk_t Handling
167 * ----------------------------
168 *
169 * For incoming frames destined for a bhyve guest, the data must first land in
170 * a host OS buffer from the physical NIC before it is copied into the awaiting
171 * guest buffer(s). Outbound frames transmitted by the guest are not bound by
172 * this limitation and can avoid extra copying before the buffers are accessed
173 * directly by the NIC. When a guest designates buffers to be transmitted,
174 * viona translates the guest-physical addresses contained in the ring
175 * descriptors to host-virtual addresses via viona_hold_page(). That pointer is
176 * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
177 * Doing so increments vr_xfer_outstanding, preventing the ring from being
178 * reset (allowing the link to drop its vmm handle to the guest) until all
179 * transmit mblks referencing guest memory have been processed. Allocation of
180 * the viona_desb_t entries is done during the VRS_INIT stage of the ring
181 * worker thread. The ring size informs that allocation as the number of
182 * concurrent transmissions is limited by the number of descriptors in the
183 * ring. This minimizes allocation in the transmit hot-path by acquiring those
184 * fixed-size resources during initialization.
185 *
186 * This optimization depends on the underlying NIC driver freeing the mblks in
187 * a timely manner after they have been transmitted by the hardware. Some
188 * drivers have been found to flush TX descriptors only when new transmissions
189 * are initiated. This means that there is no upper bound to the time needed
190 * for an mblk to be flushed and can stall bhyve guests from shutting down
191 * since their memory must be free of viona TX references prior to clean-up.
192 *
193 * This expectation of deterministic mblk_t processing is likely the reason
194 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
195 * loaded will copy transmit data into fresh buffers rather than passing up
196 * zero-copy mblks. It is a hold-over from the original viona sources provided
197 * by Pluribus and its continued necessity has not been confirmed.
198 *
199 *
200 * ----------------------------
201 * Ring Notification Fast-paths
202 * ----------------------------
203 *
204 * Device operation for viona requires that notifications flow to and from the
205 * guest to indicate certain ring conditions. In order to minimize latency and
206 * processing overhead, the notification procedures are kept in-kernel whenever
207 * possible.
208 *
209 * Guest-to-host notifications, when new available descriptors have been placed
210 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
211 * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
212 * install a callback hook on an ioport address. Guest exits for accesses to
213 * viona-hooked ioport addresses will result in direct calls to notify the
214 * appropriate ring worker without a trip to userland.
215 *
216 * Host-to-guest notifications in the form of interrupts enjoy similar
217 * acceleration. Each viona ring can be configured to send MSI notifications
218 * to the guest as virtio conditions dictate. This in-kernel interrupt
219 * configuration is kept synchronized through viona ioctls which are utilized
220 * during writes to the associated PCI config registers or MSI-X BAR.
221 *
222 * Guests which do not utilize MSI-X will result in viona falling back to the
223 * slow path for interrupts. It will poll(2) the viona handle, receiving
224 * notification when ring events necessitate the assertion of an interrupt.
225 *
226 *
227 * ---------------
228 * Nethook Support
229 * ---------------
230 *
231 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
232 * to intercept packets as they go up or down the stack. Unfortunately,
233 * the nethook framework does not understand raw packets, so we can only
234 * generate events (in, out) for IPv4 and IPv6 packets. At driver attach,
235 * we register callbacks with the neti (netinfo) module that will be invoked
236 * for each netstack already present, as well as for any additional netstack
237 * instances created as the system operates. These callbacks will
238 * register/unregister the hooks with the nethook framework for each
239 * netstack instance. This registration occurs prior to creating any
240 * viona instances for a given netstack, and the unregistration for a netstack
241 * instance occurs after all viona instances of the netstack instance have
242 * been deleted.
243 */
244
245 #include <sys/conf.h>
246 #include <sys/file.h>
247 #include <sys/stat.h>
248
249 #include <sys/dlpi.h>
250 #include <sys/vlan.h>
251
252 #include "viona_impl.h"
253
254
#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_CLI_NAME		"viona"	/* MAC client name */


/*
 * Host capabilities: the virtio-net feature bits which viona itself can
 * always satisfy, independent of the capabilities of the underlying MAC.
 * (Features gated on MAC capabilities are discovered in
 * viona_get_mac_capab() and tracked in l_features_hw.)
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

/* Module-global state, established in _init()/viona_attach() */
static void *viona_state;	/* soft-state handle for per-minor data */
static dev_info_t *viona_dip;	/* devinfo node recorded at attach */
static id_space_t *viona_minors;	/* allocator for per-open minors */
281
282
/* DDI/character-device entry points */
static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

/* Link lifecycle ioctl handlers */
static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

/* Per-link and per-ring ioctl handlers */
static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_set_promisc(viona_link_t *, viona_promisc_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);
308
static struct cb_ops viona_cb_ops = {
	viona_open,
	viona_close,
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	viona_ioctl,
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	viona_chpoll,
	ddi_prop_op,
	0,			/* streamtab */
	D_MP | D_NEW | D_HOTPLUG,
	CB_REV,
	nodev,			/* aread */
	nodev			/* awrite */
};
329
static struct dev_ops viona_ops = {
	DEVO_REV,
	0,			/* refcnt */
	viona_info,
	nulldev,		/* identify */
	nulldev,		/* probe */
	viona_attach,
	viona_detach,
	nodev,			/* reset */
	&viona_cb_ops,
	NULL,			/* bus_ops */
	ddi_power,
	ddi_quiesce_not_needed
};
344
/* Loadable-module (driver) description consumed by _init/_fini/_info */
static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
354
355 int
_init(void)356 _init(void)
357 {
358 int ret;
359
360 ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
361 if (ret != 0) {
362 return (ret);
363 }
364
365 viona_minors = id_space_create("viona_minors",
366 VIONA_CTL_MINOR + 1, UINT16_MAX);
367 viona_rx_init();
368 mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);
369
370 ret = mod_install(&modlinkage);
371 if (ret != 0) {
372 ddi_soft_state_fini(&viona_state);
373 id_space_destroy(viona_minors);
374 viona_rx_fini();
375 mutex_destroy(&viona_force_copy_lock);
376 }
377
378 return (ret);
379 }
380
381 int
_fini(void)382 _fini(void)
383 {
384 int ret;
385
386 ret = mod_remove(&modlinkage);
387 if (ret != 0) {
388 return (ret);
389 }
390
391 ddi_soft_state_fini(&viona_state);
392 id_space_destroy(viona_minors);
393 viona_rx_fini();
394 mutex_destroy(&viona_force_copy_lock);
395
396 return (ret);
397 }
398
/* Report module information through the standard linkage */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
404
405 /* ARGSUSED */
406 static int
viona_info(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** result)407 viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
408 {
409 int error;
410
411 switch (cmd) {
412 case DDI_INFO_DEVT2DEVINFO:
413 *result = (void *)viona_dip;
414 error = DDI_SUCCESS;
415 break;
416 case DDI_INFO_DEVT2INSTANCE:
417 *result = (void *)0;
418 error = DDI_SUCCESS;
419 break;
420 default:
421 error = DDI_FAILURE;
422 break;
423 }
424 return (error);
425 }
426
427 static int
viona_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)428 viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
429 {
430 if (cmd != DDI_ATTACH) {
431 return (DDI_FAILURE);
432 }
433
434 if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
435 DDI_PSEUDO, 0) != DDI_SUCCESS) {
436 return (DDI_FAILURE);
437 }
438
439 viona_neti_attach();
440
441 viona_dip = dip;
442 ddi_report_dev(viona_dip);
443
444 return (DDI_SUCCESS);
445 }
446
447 static int
viona_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)448 viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
449 {
450 dev_info_t *old_dip = viona_dip;
451
452 if (cmd != DDI_DETACH) {
453 return (DDI_FAILURE);
454 }
455
456 VERIFY(old_dip != NULL);
457
458 viona_neti_detach();
459 viona_dip = NULL;
460 ddi_remove_minor_node(old_dip, NULL);
461
462 return (DDI_SUCCESS);
463 }
464
/*
 * Open the viona control node.  Each successful open is "cloned" onto a
 * freshly allocated minor with its own soft state, so every consumer gets an
 * independent handle from which a link can be created.
 */
static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	/* Opens must come through the control minor */
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	/* Hand the caller back the unique per-open minor */
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}
503
504 static int
viona_close(dev_t dev,int flag,int otype,cred_t * credp)505 viona_close(dev_t dev, int flag, int otype, cred_t *credp)
506 {
507 int minor;
508 viona_soft_state_t *ss;
509
510 if (otype != OTYP_CHR) {
511 return (EINVAL);
512 }
513
514 minor = getminor(dev);
515
516 ss = ddi_get_soft_state(viona_state, minor);
517 if (ss == NULL) {
518 return (ENXIO);
519 }
520
521 VERIFY0(viona_ioc_delete(ss, B_TRUE));
522 VERIFY(!list_link_active(&ss->ss_node));
523 ddi_soft_state_free(viona_state, minor);
524 id_free(viona_minors, minor);
525
526 return (0);
527 }
528
/*
 * Top-level ioctl dispatch.  Link lifecycle commands (create/delete/version)
 * are handled before the link checks; everything else requires a live link
 * whose vmm hold is not pending release, and runs under ss_lock.
 */
static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	/* Commands which are valid without an established link */
	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		/* Advertise viona capabilities plus MAC-derived ones */
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		/* Constrain the request to what can actually be satisfied */
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		/* TSO in either direction requires its checksum feature */
		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		/* The ioport argument is passed by value; bounds-check it */
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	case VNA_IOC_SET_PROMISC:
		err = viona_ioc_set_promisc(link, (viona_promisc_t)data);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}
629
/*
 * chpoll(9E) entry point, serving the slow-path interrupt mechanism:
 * POLLRDBAND is asserted when any ring has an interrupt pending
 * (vr_intr_enabled set).
 */
static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		/* Any one ring with a pending interrupt satisfies the poll */
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	/*
	 * Hand back the pollhead when nothing is pending (unless another
	 * fd already satisfied the poll), or always for edge-triggered
	 * (POLLET) callers.
	 */
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}
664
/*
 * Query the underlying MAC for checksum and LSO capabilities, recording the
 * virtio-net features (l_features_hw) which those capabilities allow viona
 * to offer to the guest.
 */
static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	/* LSO is only offered when checksum offload is also available */
	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size. We have to assume that the guest may
		 * send a maximum length IP packet. Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}
697
/*
 * Create the link for this open handle: take a hold on the vmm instance
 * identified by the passed fd, open the MAC identified by the datalink ID,
 * establish a MAC client with a primary unicast address, and allocate the
 * RX/TX rings.  On any failure the bail path unwinds whatever was
 * established, in reverse order, while still holding ss_lock.
 */
static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t kvc;
	viona_link_t *link = NULL;
	char cli_name[MAXNAMELEN];
	int err = 0;
	file_t *fp;
	vmm_hold_t *hold = NULL;
	viona_neti_t *nip = NULL;
	zoneid_t zid;
	mac_diag_t mac_diag = MAC_DIAG_NONE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	/* The link is bound to the netstack of the caller's zone */
	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	/* Refuse creation until the nethooks have been registered */
	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		/* Only one link per open handle */
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	/* Convert the vmm fd into a driver hold on the VM instance */
	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);

	/* MAC client name is unique per datalink: "viona-<linkid>" */
	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);

	/*
	 * Default to passing up all multicast traffic in addition to
	 * classified unicast. Guests which have support will change this
	 * if they need to via the virtio net control queue; guests without
	 * support generally still want to see multicast.
	 */
	link->l_promisc = VIONA_PROMISC_MULTI;
	if ((err = viona_rx_set(link, link->l_promisc)) != 0) {
		viona_rx_clear(link);
		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;
	mutex_exit(&ss->ss_lock);

	/* Track this device on the netstack's list for later teardown */
	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	/* Unwind partial construction; ss_lock is held on every bail path */
	if (link != NULL) {
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		kmem_free(link, sizeof (viona_link_t));
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}
819
/*
 * Destroy the link on this handle (if any), releasing the MAC client, the
 * vmm hold, and the rings.  Once l_destroyed is set the teardown cannot
 * fail; ss_lock is dropped around the (potentially blocking) ring resets.
 * When `on_close` is set, racing accessors are impossible and any EAGAIN
 * condition is asserted against rather than returned.
 */
static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed. This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	/* Remove this device from the netstack's tracking list */
	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}
900
901 static int
viona_ioc_ring_init(viona_link_t * link,void * udata,int md)902 viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
903 {
904 vioc_ring_init_t kri;
905 int err;
906
907 if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
908 return (EFAULT);
909 }
910 const struct viona_ring_params params = {
911 .vrp_pa = kri.ri_qaddr,
912 .vrp_size = kri.ri_qsize,
913 .vrp_avail_idx = 0,
914 .vrp_used_idx = 0,
915 };
916
917 err = viona_ring_init(link, kri.ri_index, ¶ms);
918
919 return (err);
920 }
921
922 static int
viona_ioc_ring_set_state(viona_link_t * link,void * udata,int md)923 viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
924 {
925 vioc_ring_state_t krs;
926 int err;
927
928 if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
929 return (EFAULT);
930 }
931 const struct viona_ring_params params = {
932 .vrp_pa = krs.vrs_qaddr,
933 .vrp_size = krs.vrs_qsize,
934 .vrp_avail_idx = krs.vrs_avail_idx,
935 .vrp_used_idx = krs.vrs_used_idx,
936 };
937
938 err = viona_ring_init(link, krs.vrs_index, ¶ms);
939
940 return (err);
941 }
942
943 static int
viona_ioc_ring_get_state(viona_link_t * link,void * udata,int md)944 viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
945 {
946 vioc_ring_state_t krs;
947
948 if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
949 return (EFAULT);
950 }
951
952 struct viona_ring_params params;
953 int err = viona_ring_get_state(link, krs.vrs_index, ¶ms);
954 if (err != 0) {
955 return (err);
956 }
957 krs.vrs_qsize = params.vrp_size;
958 krs.vrs_qaddr = params.vrp_pa;
959 krs.vrs_avail_idx = params.vrp_avail_idx;
960 krs.vrs_used_idx = params.vrp_used_idx;
961
962 if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
963 return (EFAULT);
964 }
965 return (0);
966 }
967
968 static int
viona_ioc_ring_reset(viona_link_t * link,uint_t idx)969 viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
970 {
971 viona_vring_t *ring;
972
973 if (idx >= VIONA_VQ_MAX) {
974 return (EINVAL);
975 }
976 ring = &link->l_vrings[idx];
977
978 return (viona_ring_reset(ring, B_TRUE));
979 }
980
/*
 * Kick the worker thread of the given ring, requesting start-up if the ring
 * has not yet reached VRS_RUN.  Returns EBUSY for rings in states where a
 * kick is not meaningful (e.g. VRS_RESET, VRS_STOP).
 */
static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine. Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		/* Wake the worker to act on the kick (or start request) */
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}
1016
1017 static int
viona_ioc_ring_pause(viona_link_t * link,uint_t idx)1018 viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
1019 {
1020 if (idx >= VIONA_VQ_MAX) {
1021 return (EINVAL);
1022 }
1023
1024 viona_vring_t *ring = &link->l_vrings[idx];
1025 return (viona_ring_pause(ring));
1026 }
1027
1028 static int
viona_ioc_ring_set_msi(viona_link_t * link,void * data,int md)1029 viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
1030 {
1031 vioc_ring_msi_t vrm;
1032 viona_vring_t *ring;
1033
1034 if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
1035 return (EFAULT);
1036 }
1037 if (vrm.rm_index >= VIONA_VQ_MAX) {
1038 return (EINVAL);
1039 }
1040
1041 ring = &link->l_vrings[vrm.rm_index];
1042 mutex_enter(&ring->vr_lock);
1043 ring->vr_msi_addr = vrm.rm_addr;
1044 ring->vr_msi_msg = vrm.rm_msg;
1045 mutex_exit(&ring->vr_lock);
1046
1047 return (0);
1048 }
1049
/*
 * ioport hook callback: invoked on guest exits for the registered 'queue
 * notify' port.  The written value selects the ring; a running ring's worker
 * is woken directly, keeping the notification entirely in-kernel.  ESRCH
 * punts the access back for userspace handling.
 */
static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other than
	 * what we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		/* Wake the ring worker to process the notification */
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}
1083
1084 static int
viona_ioc_set_notify_ioport(viona_link_t * link,uint16_t ioport)1085 viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
1086 {
1087 int err = 0;
1088
1089 if (link->l_notify_ioport != 0) {
1090 vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
1091 link->l_notify_ioport = 0;
1092 }
1093
1094 if (ioport != 0) {
1095 err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
1096 viona_notify_iop, (void *)link, &link->l_notify_cookie);
1097 if (err == 0) {
1098 link->l_notify_ioport = ioport;
1099 }
1100 }
1101 return (err);
1102 }
1103
1104 static int
viona_ioc_set_promisc(viona_link_t * link,viona_promisc_t mode)1105 viona_ioc_set_promisc(viona_link_t *link, viona_promisc_t mode)
1106 {
1107 int err;
1108
1109 if (mode >= VIONA_PROMISC_MAX) {
1110 return (EINVAL);
1111 }
1112
1113 if (mode == link->l_promisc) {
1114 return (0);
1115 }
1116
1117 if ((err = viona_rx_set(link, mode)) != 0) {
1118 return (err);
1119 }
1120
1121 link->l_promisc = mode;
1122 return (0);
1123 }
1124
1125 static int
viona_ioc_ring_intr_clear(viona_link_t * link,uint_t idx)1126 viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
1127 {
1128 if (idx >= VIONA_VQ_MAX) {
1129 return (EINVAL);
1130 }
1131
1132 link->l_vrings[idx].vr_intr_enabled = 0;
1133 return (0);
1134 }
1135
/*
 * Snapshot the pending-interrupt status of all rings out to userspace.
 * Returns (via *rv) the count of rings with an interrupt pending.
 */
static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		/*
		 * NOTE(review): vr_intr_enabled is read without holding
		 * vr_lock here — presumably a momentarily stale snapshot is
		 * acceptable for polling purposes; confirm against the ring
		 * interrupt logic.
		 */
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}
1157