1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 #include <sys/types.h>
25 #include <sys/stream.h>
26 #include <sys/dlpi.h>
27 #include <sys/stropts.h>
28 #include <sys/strsun.h>
29 #include <sys/sysmacros.h>
30 #include <sys/strlog.h>
31 #include <sys/ddi.h>
32 #include <sys/cmn_err.h>
33 #include <sys/socket.h>
34 #include <net/if.h>
35 #include <net/if_types.h>
36 #include <netinet/in.h>
37 #include <sys/ethernet.h>
38 #include <inet/arp.h>
39 #include <inet/ip.h>
40 #include <inet/ip6.h>
41 #include <inet/ip_ire.h>
42 #include <inet/ip_if.h>
43 #include <inet/ip_ftable.h>
44 
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 
48 #include <sys/rds.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sockio.h>
52 #include <sys/sysmacros.h>
53 #include <inet/common.h>
54 #include <inet/ip.h>
55 #include <net/if_types.h>
56 
57 #include <sys/ib/clients/rdsv3/rdsv3.h>
58 #include <sys/ib/clients/rdsv3/rdma.h>
59 #include <sys/ib/clients/rdsv3/ib.h>
60 #include <sys/ib/clients/rdsv3/rdsv3_impl.h>
61 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
62 
63 #include <sys/dls.h>
64 #include <sys/mac.h>
65 #include <sys/mac_client.h>
66 #include <sys/mac_provider.h>
67 #include <sys/mac_client_priv.h>
68 
/*
 * Clock ticks per second; assigned from drv_usectohz(1000000) in
 * rdsv3_create_task_workqueue() and used to scale delayed-work timeouts.
 */
uint_t			rdsv3_one_sec_in_hz;
/*
 * Task queue on which rdsv3_worker_thread() is dispatched.  Created by
 * rdsv3_create_task_workqueue(), destroyed by
 * rdsv3_destroy_task_workqueue().
 */
ddi_taskq_t		*rdsv3_taskq = NULL;
/* Object cache used by rdsv3_sk_alloc(); defined outside this file. */
extern kmem_cache_t	*rdsv3_alloc_cache;

/* Checksum routine used by rdsv3_ip_fast_csum(); defined elsewhere. */
extern unsigned int	ip_ocsum(ushort_t *address, int halfword_count,
    unsigned int sum);
75 
76 /*
77  * Check if the IP interface named by `lifrp' is RDS-capable.
78  */
79 boolean_t
rdsv3_capable_interface(struct lifreq * lifrp)80 rdsv3_capable_interface(struct lifreq *lifrp)
81 {
82 	char	ifname[LIFNAMSIZ];
83 	char	drv[MAXLINKNAMELEN];
84 	uint_t	ppa;
85 	char	*cp;
86 
87 	RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter");
88 
89 	if (lifrp->lifr_type == IFT_IB)
90 		return (B_TRUE);
91 
92 	/*
93 	 * Strip off the logical interface portion before getting
94 	 * intimate with the name.
95 	 */
96 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
97 	if ((cp = strchr(ifname, ':')) != NULL)
98 		*cp = '\0';
99 
100 	if (strcmp("lo0", ifname) == 0) {
101 		/*
102 		 * loopback is considered RDS-capable
103 		 */
104 		return (B_TRUE);
105 	}
106 
107 	return (
108 	    ddi_parse_dlen(ifname, drv, MAXLINKNAMELEN, &ppa) == DDI_SUCCESS &&
109 	    rdsv3_if_lookup_by_name(drv));
110 }
111 
112 int
rdsv3_do_ip_ioctl(ksocket_t so4,void ** ipaddrs,int * size,int * nifs)113 rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
114 {
115 	struct lifnum		lifn;
116 	struct lifconf		lifc;
117 	struct lifreq		*lp, *rlp, lifr;
118 	int			rval = 0;
119 	int			numifs;
120 	int			bufsize, rbufsize;
121 	void			*buf, *rbuf;
122 	int			i, j, n, rc;
123 
124 	*ipaddrs = NULL;
125 	*size = 0;
126 	*nifs = 0;
127 
128 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter");
129 
130 retry_count:
131 	/* snapshot the current number of interfaces */
132 	lifn.lifn_family = PF_UNSPEC;
133 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
134 	lifn.lifn_count = 0;
135 	rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval,
136 	    CRED());
137 	if (rval != 0) {
138 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
139 		    "ksocket_ioctl returned: %d", rval);
140 		return (rval);
141 	}
142 
143 	numifs = lifn.lifn_count;
144 	if (numifs <= 0) {
145 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found");
146 		return (0);
147 	}
148 
149 	/* allocate extra room in case more interfaces appear */
150 	numifs += 10;
151 
152 	/* get the interface names and ip addresses */
153 	bufsize = numifs * sizeof (struct lifreq);
154 	buf = kmem_alloc(bufsize, KM_SLEEP);
155 
156 	lifc.lifc_family = AF_UNSPEC;
157 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
158 	lifc.lifc_len = bufsize;
159 	lifc.lifc_buf = buf;
160 	rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
161 	if (rc != 0) {
162 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed");
163 		kmem_free(buf, bufsize);
164 		return (rc);
165 	}
166 	/* if our extra room is used up, try again */
167 	if (bufsize <= lifc.lifc_len) {
168 		kmem_free(buf, bufsize);
169 		buf = NULL;
170 		goto retry_count;
171 	}
172 	/* calc actual number of ifconfs */
173 	n = lifc.lifc_len / sizeof (struct lifreq);
174 
175 	/*
176 	 * Count the RDS interfaces
177 	 */
178 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
179 
180 		/*
181 		 * Copy as the SIOCGLIFFLAGS ioctl is destructive
182 		 */
183 		bcopy(lp, &lifr, sizeof (struct lifreq));
184 		/*
185 		 * fetch the flags using the socket of the correct family
186 		 */
187 		switch (lifr.lifr_addr.ss_family) {
188 		case AF_INET:
189 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
190 			    &rval, CRED());
191 			break;
192 		default:
193 			continue;
194 		}
195 
196 		if (rc != 0) continue;
197 
198 		/*
199 		 * If we got the flags, skip uninteresting
200 		 * interfaces based on flags
201 		 */
202 		if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
203 			continue;
204 		if (lifr.lifr_flags &
205 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
206 			continue;
207 		if (!rdsv3_capable_interface(&lifr))
208 			continue;
209 		j++;
210 	}
211 
212 	if (j <= 0) {
213 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces");
214 		kmem_free(buf, bufsize);
215 		return (rval);
216 	}
217 
218 	numifs = j;
219 
220 	/* This is the buffer we pass back */
221 	rbufsize = numifs * sizeof (struct lifreq);
222 	rbuf = kmem_alloc(rbufsize, KM_SLEEP);
223 	rlp = (struct lifreq *)rbuf;
224 
225 	/*
226 	 * Examine the array of interfaces and filter uninteresting ones
227 	 */
228 	for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
229 
230 		/*
231 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
232 		 */
233 		bcopy(lp, &lifr, sizeof (struct lifreq));
234 		/*
235 		 * fetch the flags using the socket of the correct family
236 		 */
237 		switch (lifr.lifr_addr.ss_family) {
238 		case AF_INET:
239 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
240 			    &rval, CRED());
241 			break;
242 		default:
243 			continue;
244 		}
245 
246 
247 		if (rc != 0) {
248 			RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
249 			    "ksocket_ioctl failed" " for %s", lifr.lifr_name);
250 			continue;
251 		}
252 
253 		/*
254 		 * If we got the flags, skip uninteresting
255 		 * interfaces based on flags
256 		 */
257 		if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
258 			continue;
259 		if (lifr.lifr_flags &
260 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
261 			continue;
262 		if (!rdsv3_capable_interface(&lifr))
263 			continue;
264 
265 		/* save the record */
266 		bcopy(lp, rlp, sizeof (struct lifreq));
267 		rlp->lifr_addr.ss_family = AF_INET_OFFLOAD;
268 		rlp++;
269 	}
270 
271 	kmem_free(buf, bufsize);
272 
273 	*ipaddrs = rbuf;
274 	*size = rbufsize;
275 	*nifs = numifs;
276 
277 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return");
278 
279 	return (rval);
280 }
281 
282 /*
283  * Check if the IP interface named by `ifrp' is RDS-capable.
284  */
285 boolean_t
rdsv3_capable_interface_old(struct ifreq * ifrp)286 rdsv3_capable_interface_old(struct ifreq *ifrp)
287 {
288 	char	ifname[IFNAMSIZ];
289 	char	drv[MAXLINKNAMELEN];
290 	uint_t	ppa;
291 	char	*cp;
292 
293 	RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter");
294 
295 	/*
296 	 * Strip off the logical interface portion before getting
297 	 * intimate with the name.
298 	 */
299 	(void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ);
300 	if ((cp = strchr(ifname, ':')) != NULL)
301 		*cp = '\0';
302 
303 	RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname);
304 
305 	if ((strcmp("lo0", ifname) == 0) ||
306 	    (strncmp("ibd", ifname, 3) == 0)) {
307 		/*
308 		 * loopback and IB are considered RDS-capable
309 		 */
310 		return (B_TRUE);
311 	}
312 
313 	return (
314 	    ddi_parse_dlen(ifname, drv, MAXLINKNAMELEN, &ppa) == DDI_SUCCESS &&
315 	    rdsv3_if_lookup_by_name(drv));
316 }
317 
318 int
rdsv3_do_ip_ioctl_old(ksocket_t so4,void ** ipaddrs,int * size,int * nifs)319 rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
320 {
321 	uint_t			ifn;
322 	struct ifconf		ifc;
323 	struct ifreq		*lp, *rlp, ifr;
324 	int			rval = 0;
325 	int			numifs;
326 	int			bufsize, rbufsize;
327 	void			*buf, *rbuf;
328 	int			i, j, n, rc;
329 
330 	*ipaddrs = NULL;
331 	*size = 0;
332 	*nifs = 0;
333 
334 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter");
335 
336 retry_count:
337 	rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval,
338 	    CRED());
339 	if (rval != 0) {
340 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
341 		    "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval);
342 		return (rval);
343 	}
344 
345 	numifs = ifn;
346 	if (numifs <= 0) {
347 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found");
348 		return (0);
349 	}
350 
351 	/* allocate extra room in case more interfaces appear */
352 	numifs += 10;
353 
354 	/* get the interface names and ip addresses */
355 	bufsize = numifs * sizeof (struct ifreq);
356 	buf = kmem_alloc(bufsize, KM_SLEEP);
357 
358 	ifc.ifc_len = bufsize;
359 	ifc.ifc_buf = buf;
360 	rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED());
361 	if (rc != 0) {
362 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
363 		    "SIOCGLIFCONF failed: %d", rc);
364 		kmem_free(buf, bufsize);
365 		return (rc);
366 	}
367 	/* if our extra room is used up, try again */
368 	if (bufsize <= ifc.ifc_len) {
369 		kmem_free(buf, bufsize);
370 		buf = NULL;
371 		goto retry_count;
372 	}
373 	/* calc actual number of ifconfs */
374 	n = ifc.ifc_len / sizeof (struct ifreq);
375 
376 	/*
377 	 * Count the RDS interfaces
378 	 */
379 	for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
380 
381 		/*
382 		 * Copy as the SIOCGIFFLAGS ioctl is destructive
383 		 */
384 		bcopy(lp, &ifr, sizeof (struct ifreq));
385 		/*
386 		 * fetch the flags using the socket of the correct family
387 		 */
388 		switch (ifr.ifr_addr.sa_family) {
389 		case AF_INET:
390 			rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
391 			    &rval, CRED());
392 			break;
393 		default:
394 			continue;
395 		}
396 
397 		if (rc != 0) continue;
398 
399 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
400 		    "1. ifr_name: %s, flags: %d", ifr.ifr_name,
401 		    (ushort_t)ifr.ifr_flags);
402 
403 		/*
404 		 * If we got the flags, skip uninteresting
405 		 * interfaces based on flags
406 		 */
407 		if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
408 			continue;
409 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
410 		    "2. ifr_name: %s, flags: %d", ifr.ifr_name,
411 		    (ushort_t)ifr.ifr_flags);
412 		if (((ushort_t)ifr.ifr_flags) &
413 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
414 			continue;
415 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
416 		    "3. ifr_name: %s, flags: %d", ifr.ifr_name,
417 		    (ushort_t)ifr.ifr_flags);
418 		if (!rdsv3_capable_interface_old(&ifr))
419 			continue;
420 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
421 		    "4. ifr_name: %s, flags: %d", ifr.ifr_name,
422 		    (ushort_t)ifr.ifr_flags);
423 		j++;
424 	}
425 
426 	if (j <= 0) {
427 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces");
428 		kmem_free(buf, bufsize);
429 		return (rval);
430 	}
431 
432 	numifs = j;
433 
434 	/* This is the buffer we pass back */
435 	rbufsize = numifs * sizeof (struct ifreq);
436 	rbuf = kmem_alloc(rbufsize, KM_SLEEP);
437 	rlp = (struct ifreq *)rbuf;
438 
439 	/*
440 	 * Examine the array of interfaces and filter uninteresting ones
441 	 */
442 	for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
443 
444 		/*
445 		 * Copy the address as the SIOCGIFFLAGS ioctl is destructive
446 		 */
447 		bcopy(lp, &ifr, sizeof (struct ifreq));
448 		/*
449 		 * fetch the flags using the socket of the correct family
450 		 */
451 		switch (ifr.ifr_addr.sa_family) {
452 		case AF_INET:
453 			rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
454 			    &rval, CRED());
455 			break;
456 		default:
457 			continue;
458 		}
459 
460 
461 		if (rc != 0) {
462 			RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
463 			    "ksocket_ioctl failed: %d for %s",
464 			    rc, ifr.ifr_name);
465 			continue;
466 		}
467 
468 		/*
469 		 * If we got the flags, skip uninteresting
470 		 * interfaces based on flags
471 		 */
472 		if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
473 			continue;
474 		if (((ushort_t)ifr.ifr_flags) &
475 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
476 			continue;
477 		if (!rdsv3_capable_interface_old(&ifr))
478 			continue;
479 
480 		/* save the record */
481 		bcopy(lp, rlp, sizeof (struct ifreq));
482 		rlp->ifr_addr.sa_family = AF_INET_OFFLOAD;
483 		rlp++;
484 	}
485 
486 	kmem_free(buf, bufsize);
487 
488 	*ipaddrs = rbuf;
489 	*size = rbufsize;
490 	*nifs = numifs;
491 
492 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return");
493 
494 	return (rval);
495 }
496 
497 boolean_t
rdsv3_isloopback(ipaddr_t addr)498 rdsv3_isloopback(ipaddr_t addr)
499 {
500 	ip_stack_t *ipst;
501 
502 	ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip;
503 	ASSERT(ipst != NULL);
504 	if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) {
505 		netstack_rele(ipst->ips_netstack);
506 		return (B_FALSE);
507 	}
508 	netstack_rele(ipst->ips_netstack);
509 	return (B_TRUE);
510 }
511 
/*
 * Work Queue Implementation
 *
 * The values below are stored in wq_state of an rdsv3_workqueue_struct_t
 * and are read and written under wq_lock:
 *
 *	IDLE		no worker is dispatched; rdsv3_queue_work() dispatches
 *			one when it queues an item
 *	RUNNING		a worker has been dispatched; new items are appended
 *			to wq_queue without another dispatch
 *	FLUSHING	rdsv3_flush_workqueue() is waiting for the worker to
 *			leave this state
 *	EXITING		set by rdsv3_destroy_task_workqueue(); new work is
 *			not accepted
 */

#define	RDSV3_WQ_THREAD_IDLE		0
#define	RDSV3_WQ_THREAD_RUNNING		1
#define	RDSV3_WQ_THREAD_FLUSHING	2
#define	RDSV3_WQ_THREAD_EXITING		3
520 
521 /* worker thread */
522 void
rdsv3_worker_thread(void * arg)523 rdsv3_worker_thread(void *arg)
524 {
525 	rdsv3_workqueue_struct_t *wq = arg;
526 	rdsv3_work_t *work;
527 
528 	RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq);
529 
530 	mutex_enter(&wq->wq_lock);
531 	work = list_remove_head(&wq->wq_queue);
532 	while (work) {
533 		mutex_exit(&wq->wq_lock);
534 
535 		/* process work */
536 		work->func(work);
537 
538 		mutex_enter(&wq->wq_lock);
539 		work = list_remove_head(&wq->wq_queue);
540 	}
541 
542 	/* No more work, go home, until called again */
543 	if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) {
544 		wq->wq_state = RDSV3_WQ_THREAD_IDLE;
545 	}
546 	mutex_exit(&wq->wq_lock);
547 
548 	RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq);
549 }
550 
551 /* XXX */
552 void
rdsv3_flush_workqueue(rdsv3_workqueue_struct_t * wq)553 rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq)
554 {
555 	RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq);
556 
557 	mutex_enter(&wq->wq_lock);
558 	switch (wq->wq_state) {
559 	case RDSV3_WQ_THREAD_IDLE:
560 		/* nothing to do */
561 		ASSERT(list_is_empty(&wq->wq_queue));
562 		break;
563 
564 	case RDSV3_WQ_THREAD_RUNNING:
565 		wq->wq_state = RDSV3_WQ_THREAD_FLUSHING;
566 		/* FALLTHRU */
567 	case RDSV3_WQ_THREAD_FLUSHING:
568 		/* already flushing, wait until the flushing is complete */
569 		do {
570 			mutex_exit(&wq->wq_lock);
571 			delay(drv_usectohz(1000000));
572 			mutex_enter(&wq->wq_lock);
573 		} while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
574 		break;
575 	case RDSV3_WQ_THREAD_EXITING:
576 		mutex_exit(&wq->wq_lock);
577 		rdsv3_worker_thread(wq);
578 		return;
579 	}
580 	mutex_exit(&wq->wq_lock);
581 
582 	RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq);
583 }
584 
585 void
rdsv3_queue_work(rdsv3_workqueue_struct_t * wq,rdsv3_work_t * wp)586 rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp)
587 {
588 	RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp);
589 
590 	mutex_enter(&wq->wq_lock);
591 
592 	if (list_link_active(&wp->work_item)) {
593 		/* This is already in the queue, ignore this call */
594 		mutex_exit(&wq->wq_lock);
595 		RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp);
596 		return;
597 	}
598 
599 	switch (wq->wq_state) {
600 	case RDSV3_WQ_THREAD_RUNNING:
601 		list_insert_tail(&wq->wq_queue, wp);
602 		mutex_exit(&wq->wq_lock);
603 		break;
604 
605 	case RDSV3_WQ_THREAD_FLUSHING:
606 		do {
607 			mutex_exit(&wq->wq_lock);
608 			delay(drv_usectohz(1000000));
609 			mutex_enter(&wq->wq_lock);
610 		} while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
611 
612 		if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) {
613 			list_insert_tail(&wq->wq_queue, wp);
614 			mutex_exit(&wq->wq_lock);
615 			break;
616 		}
617 		/* FALLTHRU */
618 
619 	case RDSV3_WQ_THREAD_IDLE:
620 		list_insert_tail(&wq->wq_queue, wp);
621 		wq->wq_state = RDSV3_WQ_THREAD_RUNNING;
622 		mutex_exit(&wq->wq_lock);
623 
624 		(void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq,
625 		    DDI_SLEEP);
626 		break;
627 
628 	case RDSV3_WQ_THREAD_EXITING:
629 		mutex_exit(&wq->wq_lock);
630 		break;
631 	}
632 
633 	RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp);
634 }
635 
636 /* timeout handler for delayed work queuing */
637 void
rdsv3_work_timeout_handler(void * arg)638 rdsv3_work_timeout_handler(void *arg)
639 {
640 	rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg;
641 
642 	RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
643 	    "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work);
644 
645 	mutex_enter(&dwp->lock);
646 	dwp->timeid = 0;
647 	mutex_exit(&dwp->lock);
648 
649 	mutex_enter(&dwp->wq->wq_lock);
650 	dwp->wq->wq_pending--;
651 	if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
652 		mutex_exit(&dwp->wq->wq_lock);
653 		return;
654 	}
655 	mutex_exit(&dwp->wq->wq_lock);
656 
657 	rdsv3_queue_work(dwp->wq, &dwp->work);
658 
659 	RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
660 	    "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work);
661 }
662 
663 void
rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t * wq,rdsv3_delayed_work_t * dwp,uint_t delay)664 rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq,
665     rdsv3_delayed_work_t *dwp, uint_t delay)
666 {
667 	RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
668 	    "Enter(wq: %p, wp: %p)", wq, dwp);
669 
670 	if (delay == 0) {
671 		rdsv3_queue_work(wq, &dwp->work);
672 		return;
673 	}
674 
675 	mutex_enter(&wq->wq_lock);
676 	if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
677 		mutex_exit(&wq->wq_lock);
678 		RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
679 		    "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp);
680 		return;
681 	}
682 	wq->wq_pending++;
683 	mutex_exit(&wq->wq_lock);
684 
685 	mutex_enter(&dwp->lock);
686 	if (dwp->timeid == 0) {
687 		dwp->wq = wq;
688 		dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp,
689 		    jiffies + (delay * rdsv3_one_sec_in_hz));
690 		mutex_exit(&dwp->lock);
691 	} else {
692 		mutex_exit(&dwp->lock);
693 		RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p",
694 		    dwp);
695 		mutex_enter(&wq->wq_lock);
696 		wq->wq_pending--;
697 		mutex_exit(&wq->wq_lock);
698 	}
699 
700 	RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
701 	    "Return(wq: %p, wp: %p)", wq, dwp);
702 }
703 
704 void
rdsv3_cancel_delayed_work(rdsv3_delayed_work_t * dwp)705 rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp)
706 {
707 	RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
708 	    "Enter(wq: %p, dwp: %p)", dwp->wq, dwp);
709 
710 	mutex_enter(&dwp->lock);
711 	if (dwp->timeid != 0) {
712 		(void) untimeout(dwp->timeid);
713 		dwp->timeid = 0;
714 	} else {
715 		RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
716 		    "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp);
717 		mutex_exit(&dwp->lock);
718 		return;
719 	}
720 	mutex_exit(&dwp->lock);
721 
722 	mutex_enter(&dwp->wq->wq_lock);
723 	dwp->wq->wq_pending--;
724 	mutex_exit(&dwp->wq->wq_lock);
725 
726 	RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
727 	    "Return(wq: %p, dwp: %p)", dwp->wq, dwp);
728 }
729 
730 void
rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t * wq)731 rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq)
732 {
733 	RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter");
734 
735 	ASSERT(wq);
736 
737 	mutex_enter(&wq->wq_lock);
738 	wq->wq_state = RDSV3_WQ_THREAD_EXITING;
739 
740 	while (wq->wq_pending > 0) {
741 		mutex_exit(&wq->wq_lock);
742 		delay(drv_usectohz(1000000));
743 		mutex_enter(&wq->wq_lock);
744 	};
745 	mutex_exit(&wq->wq_lock);
746 
747 	rdsv3_flush_workqueue(wq);
748 
749 	list_destroy(&wq->wq_queue);
750 	mutex_destroy(&wq->wq_lock);
751 	kmem_free(wq, sizeof (rdsv3_workqueue_struct_t));
752 
753 	ASSERT(rdsv3_taskq);
754 	ddi_taskq_destroy(rdsv3_taskq);
755 
756 	wq = NULL;
757 	rdsv3_taskq = NULL;
758 
759 	RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return");
760 }
761 
/*
 * Work-item handler that runs rdsv3_rdma_init().  The `work' argument is
 * required by the handler signature and is not used.
 */
/* ARGSUSED */
void
rdsv3_rdma_init_worker(struct rdsv3_work_s *work)
{
	rdsv3_rdma_init();
}
768 
769 #define	RDSV3_NUM_TASKQ_THREADS	1
770 rdsv3_workqueue_struct_t *
rdsv3_create_task_workqueue(char * name)771 rdsv3_create_task_workqueue(char *name)
772 {
773 	rdsv3_workqueue_struct_t	*wq;
774 
775 	RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)",
776 	    rdsv3_dev_info);
777 
778 	rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name,
779 	    RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0);
780 	if (rdsv3_taskq == NULL) {
781 		RDSV3_DPRINTF2(__FILE__,
782 		    "ddi_taskq_create failed for rdsv3_taskq");
783 		return (NULL);
784 	}
785 
786 	wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP);
787 	if (wq == NULL) {
788 		RDSV3_DPRINTF2(__FILE__, "kmem_zalloc failed for wq");
789 		ddi_taskq_destroy(rdsv3_taskq);
790 		return (NULL);
791 	}
792 
793 	list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s),
794 	    offsetof(struct rdsv3_work_s, work_item));
795 	mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL);
796 	wq->wq_state = RDSV3_WQ_THREAD_IDLE;
797 	wq->wq_pending = 0;
798 	rdsv3_one_sec_in_hz = drv_usectohz(1000000);
799 
800 	RDSV3_DPRINTF2("create_singlethread_workqueue", "Return");
801 
802 	return (wq);
803 }
804 
805 /*
806  * Implementation for struct sock
807  */
808 
809 void
rdsv3_sock_exit_data(struct rsock * sk)810 rdsv3_sock_exit_data(struct rsock *sk)
811 {
812 	struct rdsv3_sock *rs = sk->sk_protinfo;
813 
814 	RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
815 
816 	ASSERT(rs != NULL);
817 	ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD));
818 
819 	rs->rs_sk = NULL;
820 
821 	list_destroy(&rs->rs_send_queue);
822 	list_destroy(&rs->rs_notify_queue);
823 	list_destroy(&rs->rs_recv_queue);
824 
825 	rw_destroy(&rs->rs_recv_lock);
826 	mutex_destroy(&rs->rs_lock);
827 
828 	mutex_destroy(&rs->rs_rdma_lock);
829 	avl_destroy(&rs->rs_rdma_keys);
830 
831 	mutex_destroy(&rs->rs_conn_lock);
832 	mutex_destroy(&rs->rs_congested_lock);
833 	cv_destroy(&rs->rs_congested_cv);
834 
835 	rdsv3_exit_waitqueue(sk->sk_sleep);
836 	kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t));
837 	mutex_destroy(&sk->sk_lock);
838 
839 	kmem_cache_free(rdsv3_alloc_cache, sk);
840 	RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
841 }
842 
/*
 * Socket buffer watermarks, in bytes.  rdsv3_sock_init_data() uses the
 * two HIWATER values as the initial sk_rcvbuf and sk_sndbuf.
 */
/* XXX - figure out right values */
#define	RDSV3_RECV_HIWATER	(256 * 1024)
#define	RDSV3_RECV_LOWATER	128
#define	RDSV3_XMIT_HIWATER	(256 * 1024)
#define	RDSV3_XMIT_LOWATER	1024
848 
849 struct rsock *
rdsv3_sk_alloc()850 rdsv3_sk_alloc()
851 {
852 	struct rsock *sk;
853 
854 	sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP);
855 	if (sk == NULL) {
856 		RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed");
857 		return (NULL);
858 	}
859 
860 	bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock));
861 	return (sk);
862 }
863 
864 void
rdsv3_sock_init_data(struct rsock * sk)865 rdsv3_sock_init_data(struct rsock *sk)
866 {
867 	sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP);
868 	rdsv3_init_waitqueue(sk->sk_sleep);
869 
870 	mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL);
871 	sk->sk_refcount = 1;
872 	sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1);
873 	sk->sk_sndbuf = RDSV3_XMIT_HIWATER;
874 	sk->sk_rcvbuf = RDSV3_RECV_HIWATER;
875 }
876 
877 /*
878  * Connection cache
879  */
880 /* ARGSUSED */
881 int
rdsv3_conn_constructor(void * buf,void * arg,int kmflags)882 rdsv3_conn_constructor(void *buf, void *arg, int kmflags)
883 {
884 	struct rdsv3_connection *conn = buf;
885 
886 	bzero(conn, sizeof (struct rdsv3_connection));
887 
888 	conn->c_next_tx_seq = 1;
889 	mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL);
890 	mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL);
891 	conn->c_send_generation = 1;
892 	conn->c_senders = 0;
893 
894 	list_create(&conn->c_send_queue, sizeof (struct rdsv3_message),
895 	    offsetof(struct rdsv3_message, m_conn_item));
896 	list_create(&conn->c_retrans, sizeof (struct rdsv3_message),
897 	    offsetof(struct rdsv3_message, m_conn_item));
898 	return (0);
899 }
900 
901 /* ARGSUSED */
902 void
rdsv3_conn_destructor(void * buf,void * arg)903 rdsv3_conn_destructor(void *buf, void *arg)
904 {
905 	struct rdsv3_connection *conn = buf;
906 
907 	ASSERT(list_is_empty(&conn->c_send_queue));
908 	ASSERT(list_is_empty(&conn->c_retrans));
909 	list_destroy(&conn->c_send_queue);
910 	list_destroy(&conn->c_retrans);
911 	mutex_destroy(&conn->c_send_lock);
912 	mutex_destroy(&conn->c_lock);
913 }
914 
915 int
rdsv3_conn_compare(const void * conn1,const void * conn2)916 rdsv3_conn_compare(const void *conn1, const void *conn2)
917 {
918 	uint32_be_t	laddr1, faddr1, laddr2, faddr2;
919 
920 	laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr;
921 	laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr;
922 
923 	if (laddr1 == laddr2) {
924 		faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr;
925 		faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr;
926 		if (faddr1 == faddr2)
927 			return (0);
928 		if (faddr1 < faddr2)
929 			return (-1);
930 		return (1);
931 	}
932 
933 	if (laddr1 < laddr2)
934 		return (-1);
935 
936 	return (1);
937 }
938 
939 /* rdsv3_ib_incoming cache */
940 /* ARGSUSED */
941 int
rdsv3_ib_inc_constructor(void * buf,void * arg,int kmflags)942 rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags)
943 {
944 	list_create(&((struct rdsv3_ib_incoming *)buf)->ii_frags,
945 	    sizeof (struct rdsv3_page_frag),
946 	    offsetof(struct rdsv3_page_frag, f_item));
947 
948 	return (0);
949 }
950 
951 /* ARGSUSED */
952 void
rdsv3_ib_inc_destructor(void * buf,void * arg)953 rdsv3_ib_inc_destructor(void *buf, void *arg)
954 {
955 	list_destroy(&((struct rdsv3_ib_incoming *)buf)->ii_frags);
956 }
957 
958 /* ib_frag_slab cache */
959 /* ARGSUSED */
960 int
rdsv3_ib_frag_constructor(void * buf,void * arg,int kmflags)961 rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags)
962 {
963 	struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf;
964 	struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg;
965 	ibt_iov_attr_t iov_attr;
966 	ibt_iov_t iov_arr[1];
967 	ibt_all_wr_t wr;
968 
969 	bzero(frag, sizeof (struct rdsv3_page_frag));
970 	list_link_init(&frag->f_item);
971 
972 	frag->f_page = kmem_alloc(PAGE_SIZE, kmflags);
973 	if (frag->f_page == NULL) {
974 		RDSV3_DPRINTF2("rdsv3_ib_frag_constructor",
975 		    "kmem_alloc for %d failed", PAGE_SIZE);
976 		return (-1);
977 	}
978 	frag->f_offset = 0;
979 
980 	iov_attr.iov_as = NULL;
981 	iov_attr.iov = &iov_arr[0];
982 	iov_attr.iov_buf = NULL;
983 	iov_attr.iov_list_len = 1;
984 	iov_attr.iov_wr_nds = 1;
985 	iov_attr.iov_lso_hdr_sz = 0;
986 	iov_attr.iov_flags = IBT_IOV_SLEEP | IBT_IOV_RECV;
987 
988 	iov_arr[0].iov_addr = frag->f_page;
989 	iov_arr[0].iov_len = PAGE_SIZE;
990 
991 	wr.recv.wr_nds = 1;
992 	wr.recv.wr_sgl = &frag->f_sge;
993 
994 	if (ibt_map_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev),
995 	    &iov_attr, &wr, &frag->f_mapped) != IBT_SUCCESS) {
996 		RDSV3_DPRINTF2("rdsv3_ib_frag_constructor",
997 		    "ibt_map_mem_iov failed");
998 		kmem_free(frag->f_page, PAGE_SIZE);
999 		return (-1);
1000 	}
1001 
1002 	return (0);
1003 }
1004 
1005 /* ARGSUSED */
1006 void
rdsv3_ib_frag_destructor(void * buf,void * arg)1007 rdsv3_ib_frag_destructor(void *buf, void *arg)
1008 {
1009 	struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf;
1010 	struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg;
1011 
1012 	/* unmap the page */
1013 	if (ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev),
1014 	    frag->f_mapped) != IBT_SUCCESS)
1015 		RDSV3_DPRINTF2("rdsv3_ib_frag_destructor",
1016 		    "ibt_unmap_mem_iov failed");
1017 
1018 	/* free the page */
1019 	kmem_free(frag->f_page, PAGE_SIZE);
1020 }
1021 
/* loop.c */
/*
 * Lock and list defined outside this file (presumably in loop.c); both
 * are initialized by rdsv3_loop_init().
 */
extern kmutex_t loop_conns_lock;
extern list_t loop_conns;

/*
 * Element type of the loop_conns list.  loop_node is the list linkage
 * named in rdsv3_loop_init(); conn points at a struct rdsv3_connection.
 */
struct rdsv3_loop_connection
{
	struct list_node loop_node;
	struct rdsv3_connection *conn;
};
1031 
1032 void
rdsv3_loop_init(void)1033 rdsv3_loop_init(void)
1034 {
1035 	list_create(&loop_conns, sizeof (struct rdsv3_loop_connection),
1036 	    offsetof(struct rdsv3_loop_connection, loop_node));
1037 	mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL);
1038 }
1039 
1040 /* rdma.c */
1041 /* IB Rkey is used here for comparison */
1042 int
rdsv3_mr_compare(const void * mr1,const void * mr2)1043 rdsv3_mr_compare(const void *mr1, const void *mr2)
1044 {
1045 	uint32_t key1 = *(uint32_t *)mr1;
1046 	uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key;
1047 
1048 	if (key1 < key2)
1049 		return (-1);
1050 	if (key1 > key2)
1051 		return (1);
1052 	return (0);
1053 }
1054 
/* transport.c */
/*
 * Table of registered transports and the reader/writer lock taken around
 * it; both are defined outside this file (presumably in transport.c).
 * rdsv3_trans_init() initializes the lock and rdsv3_trans_exit() destroys
 * it.
 */
extern struct rdsv3_transport *transports[];
extern krwlock_t		trans_sem;
1058 
1059 void
rdsv3_trans_exit(void)1060 rdsv3_trans_exit(void)
1061 {
1062 	struct rdsv3_transport *trans;
1063 	int i;
1064 
1065 	RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter");
1066 
1067 	/* currently, only IB transport */
1068 	rw_enter(&trans_sem, RW_READER);
1069 	trans = NULL;
1070 	for (i = 0; i < RDS_TRANS_COUNT; i++) {
1071 		if (transports[i]) {
1072 			trans = transports[i];
1073 			break;
1074 		}
1075 	}
1076 	rw_exit(&trans_sem);
1077 
1078 	/* trans->exit() will remove the trans from the list */
1079 	if (trans)
1080 		trans->exit();
1081 
1082 	rw_destroy(&trans_sem);
1083 
1084 	RDSV3_DPRINTF2("rdsv3_trans_exit", "Return");
1085 }
1086 
1087 void
rdsv3_trans_init()1088 rdsv3_trans_init()
1089 {
1090 	RDSV3_DPRINTF2("rdsv3_trans_init", "Enter");
1091 
1092 	rw_init(&trans_sem, NULL, RW_DRIVER, NULL);
1093 
1094 	RDSV3_DPRINTF2("rdsv3_trans_init", "Return");
1095 }
1096 
1097 int
rdsv3_put_cmsg(struct nmsghdr * msg,int level,int type,size_t size,void * payload)1098 rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size,
1099     void *payload)
1100 {
1101 	struct cmsghdr *cp;
1102 	char *bp;
1103 	size_t cmlen;
1104 	size_t cmspace;
1105 	size_t bufsz;
1106 
1107 	RDSV3_DPRINTF4("rdsv3_put_cmsg",
1108 	    "Enter(msg: %p level: %d type: %d sz: %d)",
1109 	    msg, level, type, size);
1110 
1111 	if (msg == NULL || msg->msg_controllen == 0) {
1112 		return (0);
1113 	}
1114 	/* check for first cmsg or this is another cmsg to be appended */
1115 	if (msg->msg_control == NULL)
1116 		msg->msg_controllen = 0;
1117 
1118 	cmlen = CMSG_LEN(size);
1119 	cmspace = CMSG_SPACE(size);
1120 	bufsz = msg->msg_controllen + cmspace;
1121 
1122 	/* extend the existing cmsg to append the next cmsg */
1123 	bp = kmem_alloc(bufsz, KM_SLEEP);
1124 	if (msg->msg_control) {
1125 		bcopy(msg->msg_control, bp, msg->msg_controllen);
1126 		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
1127 	}
1128 
1129 	/* assign payload the proper cmsg location */
1130 	cp = (struct cmsghdr *)(bp + msg->msg_controllen);
1131 	cp->cmsg_len = cmlen;
1132 	cp->cmsg_level = level;
1133 	cp->cmsg_type = type;
1134 
1135 	bcopy(payload, CMSG_DATA(cp), cmlen -
1136 	    (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr)));
1137 
1138 	msg->msg_control = bp;
1139 	msg->msg_controllen = bufsz;
1140 
1141 	RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len);
1142 
1143 	return (0);
1144 }
1145 
1146 /* ARGSUSED */
1147 int
rdsv3_verify_bind_address(ipaddr_t addr)1148 rdsv3_verify_bind_address(ipaddr_t addr)
1149 {
1150 	return (1);
1151 }
1152 
1153 /* checksum */
1154 uint16_t
rdsv3_ip_fast_csum(void * hdr,size_t length)1155 rdsv3_ip_fast_csum(void *hdr, size_t length)
1156 {
1157 	return (0xffff &
1158 	    (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length <<1, 0)));
1159 }
1160 
1161 /* scatterlist implementation */
1162 /* ARGSUSED */
1163 caddr_t
rdsv3_ib_sg_dma_address(ib_device_t * dev,struct rdsv3_scatterlist * scat,uint_t offset)1164 rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat,
1165     uint_t offset)
1166 {
1167 	return (0);
1168 }
1169 
1170 uint_t
rdsv3_ib_dma_map_sg(struct ib_device * dev,struct rdsv3_scatterlist * scat,uint_t num)1171 rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat,
1172     uint_t num)
1173 {
1174 	struct rdsv3_scatterlist *s, *first;
1175 	ibt_iov_t *iov;
1176 	ibt_wr_ds_t *sgl;
1177 	ibt_iov_attr_t iov_attr;
1178 	ibt_send_wr_t swr;
1179 	uint_t i;
1180 
1181 	RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num);
1182 
1183 	s = first = &scat[0];
1184 	ASSERT(first->mihdl == NULL);
1185 
1186 	iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP);
1187 	sgl = kmem_zalloc((num * 2) *  sizeof (ibt_wr_ds_t), KM_SLEEP);
1188 
1189 	for (i = 0; i < num; i++, s++) {
1190 		iov[i].iov_addr = s->vaddr;
1191 		iov[i].iov_len = s->length;
1192 	}
1193 
1194 	iov_attr.iov_as = NULL;
1195 	iov_attr.iov = iov;
1196 	iov_attr.iov_buf = NULL;
1197 	iov_attr.iov_list_len = num;
1198 	iov_attr.iov_wr_nds = num * 2;
1199 	iov_attr.iov_lso_hdr_sz = 0;
1200 	iov_attr.iov_flags = IBT_IOV_SLEEP;
1201 
1202 	swr.wr_sgl = sgl;
1203 
1204 	i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev),
1205 	    &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl);
1206 	kmem_free(iov, num * sizeof (ibt_iov_t));
1207 	if (i != IBT_SUCCESS) {
1208 		RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg",
1209 		    "ibt_map_mem_iov returned: %d", i);
1210 		return (0);
1211 	}
1212 
1213 	s = first;
1214 	for (i = 0; i < num; i++, s++, sgl++) {
1215 		s->sgl = sgl;
1216 	}
1217 
1218 	return (num);
1219 }
1220 
1221 void
rdsv3_ib_dma_unmap_sg(ib_device_t * dev,struct rdsv3_scatterlist * scat,uint_t num)1222 rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat,
1223     uint_t num)
1224 {
1225 	/* Zero length messages have no scatter gather entries */
1226 	if (num != 0) {
1227 		ASSERT(scat->mihdl != NULL);
1228 		ASSERT(scat->sgl != NULL);
1229 
1230 		(void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl);
1231 
1232 		kmem_free(scat->sgl, (num * 2)  * sizeof (ibt_wr_ds_t));
1233 		scat->sgl = NULL;
1234 		scat->mihdl = NULL;
1235 	}
1236 }
1237 
1238 int
rdsv3_ib_alloc_hdrs(ib_device_t * dev,struct rdsv3_ib_connection * ic)1239 rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
1240 {
1241 	caddr_t addr;
1242 	size_t size;
1243 	ibt_mr_attr_t mr_attr;
1244 	ibt_mr_desc_t mr_desc;
1245 	ibt_mr_hdl_t mr_hdl;
1246 	int ret;
1247 
1248 	RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev);
1249 
1250 	ASSERT(ic->i_mr == NULL);
1251 
1252 	size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) *
1253 	    sizeof (struct rdsv3_header);
1254 
1255 	addr = kmem_zalloc(size, KM_NOSLEEP);
1256 	if (addr == NULL)
1257 		return (-1);
1258 
1259 	mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr;
1260 	mr_attr.mr_len = size;
1261 	mr_attr.mr_as = NULL;
1262 	mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
1263 	ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd),
1264 	    &mr_attr, &mr_hdl, &mr_desc);
1265 	if (ret != IBT_SUCCESS) {
1266 		RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs",
1267 		    "ibt_register_mr returned: " "%d", ret);
1268 		return (-1);
1269 	}
1270 
1271 	ic->i_mr =
1272 	    (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr),
1273 	    KM_SLEEP);
1274 	ic->i_mr->addr = addr;
1275 	ic->i_mr->size = size;
1276 	ic->i_mr->hdl =	mr_hdl;
1277 	ic->i_mr->lkey = mr_desc.md_lkey;
1278 
1279 	ic->i_send_hdrs = (struct rdsv3_header *)addr;
1280 	ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr;
1281 
1282 	ic->i_recv_hdrs = (struct rdsv3_header *)(addr +
1283 	    (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
1284 	ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr +
1285 	    (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
1286 
1287 	ic->i_ack = (struct rdsv3_header *)(addr +
1288 	    ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
1289 	    sizeof (struct rdsv3_header)));
1290 	ic->i_ack_dma = (uint64_t)(uintptr_t)(addr +
1291 	    ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
1292 	    sizeof (struct rdsv3_header)));
1293 
1294 	RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev);
1295 
1296 	return (0);
1297 }
1298 
1299 void
rdsv3_ib_free_hdrs(ib_device_t * dev,struct rdsv3_ib_connection * ic)1300 rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
1301 {
1302 	RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev);
1303 	ASSERT(ic->i_mr != NULL);
1304 
1305 	ic->i_send_hdrs = NULL;
1306 	ic->i_send_hdrs_dma = 0;
1307 
1308 	ic->i_recv_hdrs = NULL;
1309 	ic->i_recv_hdrs_dma = 0;
1310 
1311 	ic->i_ack = NULL;
1312 	ic->i_ack_dma = 0;
1313 
1314 	(void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl);
1315 
1316 	kmem_free(ic->i_mr->addr, ic->i_mr->size);
1317 	kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr));
1318 
1319 	ic->i_mr = NULL;
1320 	RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev);
1321 }
1322 
1323 /*
1324  * atomic_add_unless - add unless the number is a given value
1325  * @v: pointer of type atomic_t
1326  * @a: the amount to add to v...
1327  * @u: ...unless v is equal to u.
1328  *
1329  * Atomically adds @a to @v, so long as it was not @u.
1330  * Returns non-zero if @v was not @u, and zero otherwise.
1331  */
1332 int
atomic_add_unless(atomic_t * v,uint_t a,ulong_t u)1333 atomic_add_unless(atomic_t *v, uint_t a, ulong_t u)
1334 {
1335 	uint_t c, old;
1336 
1337 	c = *v;
1338 	while (c != u && (old = atomic_cas_uint(v, c, c + a)) != c) {
1339 		c = old;
1340 	}
1341 	return ((ulong_t)c != u);
1342 }
1343