/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests. It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is comprised of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource. This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization. With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible. Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
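 *
 * As an illustrative sketch only (not code from this driver), the userspace
 * sequence for standing up a link looks roughly like the following.  The
 * vioc_create_t fields match those consumed by viona_ioc_create() below;
 * the device path and surrounding details are assumptions:
 *
 *	int ctl = open("/dev/viona", O_RDWR);	(clones a fresh minor)
 *	vioc_create_t vc = {
 *		.c_linkid = linkid,	(datalink ID of the MAC interface)
 *		.c_vmfd = vmfd,		(open fd to the bhyve vmm instance)
 *	};
 *	(void) ioctl(ctl, VNA_IOC_CREATE, &vc);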
 *
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest. They represent an interface to the
 * standard virtio ring structures. When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events. The RX worker has the simple task of
 * watching for ring shutdown conditions. The TX worker does that in addition
 * to processing all requests to transmit data. Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in viona_vring_t`vr_state:
 *
 *            +<--------------------------------------------+
 *            |                                              |
 *            V                                              ^
 *      +-----------+  This is the initial state when a link is created or
 *      | VRS_RESET |  when the ring has been explicitly reset.
 *      +-----------+
 *            |                                              ^
 *            |---* ioctl(VNA_IOC_RING_INIT) issued          |
 *            |                                              |
 *            V                                              ^
 *      +-----------+  The ring parameters (size, guest physical addresses)
 *      | VRS_SETUP |  have been set and start-up of the ring worker thread
 *      +-----------+  has begun.
 *            |                                              ^
 *            |                                              |
 *            |---* ring worker thread begins execution      |
 *            |                                              |
 *            +--------------------------------------------->+
 *            |                                              |  ^
 *            |                                              |  |
 *            |      * If ring shutdown is requested (by ioctl or impending
 *            |        bhyve process death) while the worker thread is
 *            |        starting, the worker will transition the ring to
 *            |        VRS_RESET and exit.                   |
 *            |                                              ^
 *            V                                              |
 *      +-----------+  The worker thread associated with the ring has started
 *      | VRS_INIT  |  executing. It has allocated any extra resources needed
 *      +-----------+  for the ring to operate.
 *            |                                              ^
 *            |                                              |
 *            +--------------------------------------------->+
 *            |                                              |  ^
 *            |                                              |  |
 *            |      * If ring shutdown is requested while the worker is
 *            |        waiting in VRS_INIT, it will free any extra resources
 *            |        and transition to VRS_RESET.          |
 *            |                                              ^
 *            |                                              |
 *            |--* ioctl(VNA_IOC_RING_KICK) issued           |
 *            |                                              ^
 *            V                                              |
 *      +-----------+  The worker thread associated with the ring is executing
 *      | VRS_RUN   |  workload specific to that ring.
 *      +-----------+
 *            |                                              ^
 *            |---* ioctl(VNA_IOC_RING_RESET) issued         |
 *            |     (or bhyve process begins exit)           ^
 *            V                                              |
 *      +-----------+  The worker thread associated with the ring is in the
 *      | VRS_STOP  |  process of exiting. All outstanding TX and RX
 *      +-----------+  requests are allowed to complete, but new requests
 *            |        must be ignored.
 *            |                                              ^
 *            |                                              |
 *            +--------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made by
 * viona_ioc_ring_init() under vr_lock. There, it initializes the ring, starts
 * the worker, and sets the ring state to VRS_SETUP. Once the worker thread
 * has been started, only it may perform ring state transitions (still under
 * the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
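 *
 * Tying the diagram above to the ioctl interface, a hypothetical userspace
 * consumer would walk a ring through its states like so.  The field names
 * are those consumed by viona_ioc_ring_init() below; error handling and
 * the surrounding code are elided:
 *
 *	vioc_ring_init_t ri = {
 *		.ri_index = VIONA_VQ_TX,	(ring to initialize)
 *		.ri_qsize = qsize,		(virtio queue size)
 *		.ri_qaddr = qaddr,		(guest-physical queue address)
 *	};
 *	(void) ioctl(ctl, VNA_IOC_RING_INIT, &ri);   (VRS_RESET -> VRS_SETUP)
 *	(void) ioctl(ctl, VNA_IOC_RING_KICK, VIONA_VQ_TX);
 *	    (requests start; the worker transitions the ring to VRS_RUN)
 *	...
 *	(void) ioctl(ctl, VNA_IOC_RING_RESET, VIONA_VQ_TX);  (-> VRS_RESET)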
 *
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land in
 * a host OS buffer from the physical NIC before it is copied into the awaiting
 * guest buffer(s). Outbound frames transmitted by the guest are not bound by
 * this limitation and can avoid extra copying before the buffers are accessed
 * directly by the NIC. When a guest designates buffers to be transmitted,
 * viona translates the guest-physical addresses contained in the ring
 * descriptors to host-virtual addresses via viona_hold_page(). That pointer is
 * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
 * Doing so increments vr_xfer_outstanding, preventing the ring from being
 * reset (allowing the link to drop its vmm handle to the guest) until all
 * transmit mblks referencing guest memory have been processed. Allocation of
 * the viona_desb_t entries is done during the VRS_INIT stage of the ring
 * worker thread. The ring size informs that allocation as the number of
 * concurrent transmissions is limited by the number of descriptors in the
 * ring. This minimizes allocation in the transmit hot-path by acquiring those
 * fixed-size resources during initialization.
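 *
 * A sketch of that wrapping step, assuming a viona_desb_t 'dp' which embeds
 * a frtn_t (the member and callback names here are illustrative; the actual
 * TX code lives alongside this file):
 *
 *	dp->d_frtn.free_func = viona_desb_release;
 *	dp->d_frtn.free_arg = (caddr_t)dp;
 *	mp = desballoc(va, len, BPRI_MED, &dp->d_frtn);
 *
 * The free function runs once the NIC driver is finished with the mblk,
 * at which point vr_xfer_outstanding can be decremented.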
 *
 * This optimization depends on the underlying NIC driver freeing the mblks in
 * a timely manner after they have been transmitted by the hardware. Some
 * drivers have been found to flush TX descriptors only when new transmissions
 * are initiated. This means that there is no upper bound on the time needed
 * for an mblk to be flushed, which can stall bhyve guests from shutting down,
 * since their memory must be free of viona TX references prior to clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks. It is a hold-over from the original viona sources provided
 * by Pluribus, and its continued necessity has not been confirmed.
 *
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from the
 * guest to indicate certain ring conditions. In order to minimize latency and
 * processing overhead, the notification procedures are kept in-kernel whenever
 * possible.
 *
 * Guest-to-host notifications, when new available descriptors have been placed
 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
 * The vmm_drv_ioport_hook() interface was added to bhyve which allows viona to
 * install a callback hook on an ioport address. Guest exits for accesses to
 * viona-hooked ioport addresses will result in direct calls to notify the
 * appropriate ring worker without a trip to userland.
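 *
 * For example (a hypothetical userspace fragment), once the guest has been
 * told which ioport to use for queue notification, the device emulation only
 * needs to register that port once:
 *
 *	(void) ioctl(ctl, VNA_IOC_SET_NOTIFY_IOP, notify_ioport);
 *
 * after which guest writes to the port are serviced entirely in-kernel by
 * viona_notify_iop() below.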
 *
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration. Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate. This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * For guests which do not utilize MSI-X, viona falls back to the slow path
 * for interrupts: the userspace portion of bhyve will poll(2) the viona
 * handle, receiving notification when ring events necessitate the assertion
 * of an interrupt.
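 *
 * A minimal sketch of that slow path, as the userspace consumer might
 * implement it (error handling elided):
 *
 *	struct pollfd pfd = { .fd = ctl, .events = POLLRDBAND };
 *	(void) poll(&pfd, 1, -1);
 *	vioc_intr_poll_t vip;
 *	(void) ioctl(ctl, VNA_IOC_INTR_POLL, &vip);
 *	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
 *		if (vip.vip_status[i] != 0) {
 *			(assert the virtio interrupt for ring i, then:)
 *			(void) ioctl(ctl, VNA_IOC_RING_INTR_CLR, i);
 *		}
 *	}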
 *
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
 * to intercept packets as they go up or down the stack. Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets. At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates. These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance. This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a
 * netstack instance occurs after all viona instances in that netstack have
 * been deleted.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_CLI_NAME		"viona"		/* MAC client name */

/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)
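
/*
 * The device emulation negotiates features against these capabilities via
 * VNA_IOC_GET_FEATURES and VNA_IOC_SET_FEATURES (see viona_ioctl() below).
 * A hypothetical userspace fragment, with error handling elided:
 *
 *	int feat;
 *	(void) ioctl(ctl, VNA_IOC_GET_FEATURES, &feat);
 *	feat &= guest_acked_features;
 *	(void) ioctl(ctl, VNA_IOC_SET_FEATURES, &feat);
 */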

static void		*viona_state;
static dev_info_t	*viona_dip;
static id_space_t	*viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static struct cb_ops viona_cb_ops = {
	viona_open,			/* cb_open */
	viona_close,			/* cb_close */
	nodev,				/* cb_strategy */
	nodev,				/* cb_print */
	nodev,				/* cb_dump */
	nodev,				/* cb_read */
	nodev,				/* cb_write */
	viona_ioctl,			/* cb_ioctl */
	nodev,				/* cb_devmap */
	nodev,				/* cb_mmap */
	nodev,				/* cb_segmap */
	viona_chpoll,			/* cb_chpoll */
	ddi_prop_op,			/* cb_prop_op */
	0,				/* cb_str */
	D_MP | D_NEW | D_HOTPLUG,	/* cb_flag */
	CB_REV,				/* cb_rev */
	nodev,				/* cb_aread */
	nodev				/* cb_awrite */
};

static struct dev_ops viona_ops = {
	DEVO_REV,			/* devo_rev */
	0,				/* devo_refcnt */
	viona_info,			/* devo_getinfo */
	nulldev,			/* devo_identify */
	nulldev,			/* devo_probe */
	viona_attach,			/* devo_attach */
	viona_detach,			/* devo_detach */
	nodev,				/* devo_reset */
	&viona_cb_ops,			/* devo_cb_ops */
	NULL,				/* devo_bus_ops */
	ddi_power,			/* devo_power */
	ddi_quiesce_not_needed		/* devo_quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	default:
		break;
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size. We have to assume that the guest may
		 * send a maximum length IP packet. Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t kvc;
	viona_link_t *link = NULL;
	char cli_name[MAXNAMELEN];
	int err = 0;
	file_t *fp;
	vmm_hold_t *hold = NULL;
	viona_neti_t *nip = NULL;
	zoneid_t zid;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);

	if ((err = viona_rx_set(link)) != 0) {
		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		if (link->l_mch != NULL) {
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		kmem_free(link, sizeof (viona_link_t));
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed. This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}

	err = viona_ring_init(link, kri.ri_index, kri.ri_qsize, kri.ri_qaddr);

	return (err);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine. Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

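/*
 * Handler installed on the virtio 'queue notify' ioport via
 * vmm_drv_ioport_hook(). The value the guest writes to the port is the
 * index of the ring being kicked; a guest driver's notification amounts to
 * something like the following (hypothetical guest-side fragment):
 *
 *	outw(notify_ioport, queue_index);
 */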
static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;
	uint16_t vq = *val;

	if (in) {
		/*
		 * Do not service read (in/ins) requests on this ioport.
		 * Instead, indicate that the handler is not found, causing a
		 * fallback to userspace processing.
		 */
		return (ESRCH);
	}

	if (port != link->l_notify_ioport) {
		return (EINVAL);
	}
	return (viona_ioc_ring_kick(link, vq));
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}