xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c (revision b27516f55237249607f754e6e42e865f12456675)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 #include <sys/types.h>
25 #include <sys/stream.h>
26 #include <sys/dlpi.h>
27 #include <sys/stropts.h>
28 #include <sys/strsun.h>
29 #include <sys/sysmacros.h>
30 #include <sys/strlog.h>
31 #include <sys/ddi.h>
32 #include <sys/cmn_err.h>
33 #include <sys/socket.h>
34 #include <net/if.h>
35 #include <net/if_types.h>
36 #include <netinet/in.h>
37 #include <sys/ethernet.h>
38 #include <inet/arp.h>
39 #include <inet/ip.h>
40 #include <inet/ip6.h>
41 #include <inet/ip_ire.h>
42 #include <inet/ip_if.h>
43 #include <inet/ip_ftable.h>
44 
45 #include <sys/sunddi.h>
46 #include <sys/ksynch.h>
47 
48 #include <sys/rds.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sockio.h>
52 #include <sys/sysmacros.h>
53 #include <inet/common.h>
54 #include <inet/ip.h>
55 #include <net/if_types.h>
56 
57 #include <sys/ib/clients/rdsv3/rdsv3.h>
58 #include <sys/ib/clients/rdsv3/rdma.h>
59 #include <sys/ib/clients/rdsv3/ib.h>
60 #include <sys/ib/clients/rdsv3/rdsv3_impl.h>
61 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
62 
63 #include <sys/dls.h>
64 #include <sys/mac.h>
65 #include <sys/mac_client.h>
66 #include <sys/mac_provider.h>
67 #include <sys/mac_client_priv.h>
68 
69 ddi_taskq_t		*rdsv3_taskq = NULL;
70 extern kmem_cache_t	*rdsv3_alloc_cache;
71 
72 extern unsigned int 	ip_ocsum(ushort_t *address, int halfword_count,
73     unsigned int sum);
74 
75 /*
76  * Check if the IP interface named by `lifrp' is RDS-capable.
77  */
78 boolean_t
79 rdsv3_capable_interface(struct lifreq *lifrp)
80 {
81 	char	ifname[LIFNAMSIZ];
82 	char	drv[MAXLINKNAMELEN];
83 	uint_t	ppa;
84 	char 	*cp;
85 
86 	RDSV3_DPRINTF4("rdsv3_capable_interface", "Enter");
87 
88 	if (lifrp->lifr_type == IFT_IB)
89 		return (B_TRUE);
90 
91 	/*
92 	 * Strip off the logical interface portion before getting
93 	 * intimate with the name.
94 	 */
95 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
96 	if ((cp = strchr(ifname, ':')) != NULL)
97 		*cp = '\0';
98 
99 	if (strcmp("lo0", ifname) == 0) {
100 		/*
101 		 * loopback is considered RDS-capable
102 		 */
103 		return (B_TRUE);
104 	}
105 
106 	return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
107 	    rdsv3_if_lookup_by_name(drv));
108 }
109 
110 int
111 rdsv3_do_ip_ioctl(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
112 {
113 	struct lifnum		lifn;
114 	struct lifconf		lifc;
115 	struct lifreq		*lp, *rlp, lifr;
116 	int			rval = 0;
117 	int			numifs;
118 	int			bufsize, rbufsize;
119 	void			*buf, *rbuf;
120 	int			i, j, n, rc;
121 
122 	*ipaddrs = NULL;
123 	*size = 0;
124 	*nifs = 0;
125 
126 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Enter");
127 
128 retry_count:
129 	/* snapshot the current number of interfaces */
130 	lifn.lifn_family = PF_UNSPEC;
131 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
132 	lifn.lifn_count = 0;
133 	rval = ksocket_ioctl(so4, SIOCGLIFNUM, (intptr_t)&lifn, &rval,
134 	    CRED());
135 	if (rval != 0) {
136 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
137 		    "ksocket_ioctl returned: %d", rval);
138 		return (rval);
139 	}
140 
141 	numifs = lifn.lifn_count;
142 	if (numifs <= 0) {
143 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No interfaces found");
144 		return (0);
145 	}
146 
147 	/* allocate extra room in case more interfaces appear */
148 	numifs += 10;
149 
150 	/* get the interface names and ip addresses */
151 	bufsize = numifs * sizeof (struct lifreq);
152 	buf = kmem_alloc(bufsize, KM_SLEEP);
153 
154 	lifc.lifc_family = AF_UNSPEC;
155 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
156 	lifc.lifc_len = bufsize;
157 	lifc.lifc_buf = buf;
158 	rc = ksocket_ioctl(so4, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
159 	if (rc != 0) {
160 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "SIOCGLIFCONF failed");
161 		kmem_free(buf, bufsize);
162 		return (rc);
163 	}
164 	/* if our extra room is used up, try again */
165 	if (bufsize <= lifc.lifc_len) {
166 		kmem_free(buf, bufsize);
167 		buf = NULL;
168 		goto retry_count;
169 	}
170 	/* calc actual number of ifconfs */
171 	n = lifc.lifc_len / sizeof (struct lifreq);
172 
173 	/*
174 	 * Count the RDS interfaces
175 	 */
176 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
177 
178 		/*
179 		 * Copy as the SIOCGLIFFLAGS ioctl is destructive
180 		 */
181 		bcopy(lp, &lifr, sizeof (struct lifreq));
182 		/*
183 		 * fetch the flags using the socket of the correct family
184 		 */
185 		switch (lifr.lifr_addr.ss_family) {
186 		case AF_INET:
187 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
188 			    &rval, CRED());
189 			break;
190 		default:
191 			continue;
192 		}
193 
194 		if (rc != 0) continue;
195 
196 		/*
197 		 * If we got the flags, skip uninteresting
198 		 * interfaces based on flags
199 		 */
200 		if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
201 			continue;
202 		if (lifr.lifr_flags &
203 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
204 			continue;
205 		if (!rdsv3_capable_interface(&lifr))
206 			continue;
207 		j++;
208 	}
209 
210 	if (j <= 0) {
211 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl", "No RDS interfaces");
212 		kmem_free(buf, bufsize);
213 		return (rval);
214 	}
215 
216 	numifs = j;
217 
218 	/* This is the buffer we pass back */
219 	rbufsize = numifs * sizeof (struct lifreq);
220 	rbuf = kmem_alloc(rbufsize, KM_SLEEP);
221 	rlp = (struct lifreq *)rbuf;
222 
223 	/*
224 	 * Examine the array of interfaces and filter uninteresting ones
225 	 */
226 	for (i = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
227 
228 		/*
229 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
230 		 */
231 		bcopy(lp, &lifr, sizeof (struct lifreq));
232 		/*
233 		 * fetch the flags using the socket of the correct family
234 		 */
235 		switch (lifr.lifr_addr.ss_family) {
236 		case AF_INET:
237 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)&lifr,
238 			    &rval, CRED());
239 			break;
240 		default:
241 			continue;
242 		}
243 
244 
245 		if (rc != 0) {
246 			RDSV3_DPRINTF2("rdsv3_do_ip_ioctl",
247 			    "ksocket_ioctl failed" " for %s", lifr.lifr_name);
248 			continue;
249 		}
250 
251 		/*
252 		 * If we got the flags, skip uninteresting
253 		 * interfaces based on flags
254 		 */
255 		if ((lifr.lifr_flags & IFF_UP) != IFF_UP)
256 			continue;
257 		if (lifr.lifr_flags &
258 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
259 			continue;
260 		if (!rdsv3_capable_interface(&lifr))
261 			continue;
262 
263 		/* save the record */
264 		bcopy(lp, rlp, sizeof (struct lifreq));
265 		rlp->lifr_addr.ss_family = AF_INET_OFFLOAD;
266 		rlp++;
267 	}
268 
269 	kmem_free(buf, bufsize);
270 
271 	*ipaddrs = rbuf;
272 	*size = rbufsize;
273 	*nifs = numifs;
274 
275 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl", "Return");
276 
277 	return (rval);
278 }
279 
280 /*
281  * Check if the IP interface named by `ifrp' is RDS-capable.
282  */
283 boolean_t
284 rdsv3_capable_interface_old(struct ifreq *ifrp)
285 {
286 	char	ifname[IFNAMSIZ];
287 	char	drv[MAXLINKNAMELEN];
288 	uint_t	ppa;
289 	char 	*cp;
290 
291 	RDSV3_DPRINTF4("rdsv3_capable_interface_old", "Enter");
292 
293 	/*
294 	 * Strip off the logical interface portion before getting
295 	 * intimate with the name.
296 	 */
297 	(void) strlcpy(ifname, ifrp->ifr_name, IFNAMSIZ);
298 	if ((cp = strchr(ifname, ':')) != NULL)
299 		*cp = '\0';
300 
301 	RDSV3_DPRINTF4("rdsv3_capable_interface_old", "ifname: %s", ifname);
302 
303 	if ((strcmp("lo0", ifname) == 0) ||
304 	    (strncmp("ibd", ifname, 3) == 0)) {
305 		/*
306 		 * loopback and IB are considered RDS-capable
307 		 */
308 		return (B_TRUE);
309 	}
310 
311 	return (ddi_parse(ifname, drv, &ppa) == DDI_SUCCESS &&
312 	    rdsv3_if_lookup_by_name(drv));
313 }
314 
315 int
316 rdsv3_do_ip_ioctl_old(ksocket_t so4, void **ipaddrs, int *size, int *nifs)
317 {
318 	uint_t			ifn;
319 	struct ifconf		ifc;
320 	struct ifreq		*lp, *rlp, ifr;
321 	int			rval = 0;
322 	int			numifs;
323 	int			bufsize, rbufsize;
324 	void			*buf, *rbuf;
325 	int			i, j, n, rc;
326 
327 	*ipaddrs = NULL;
328 	*size = 0;
329 	*nifs = 0;
330 
331 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Enter");
332 
333 retry_count:
334 	rval = ksocket_ioctl(so4, SIOCGIFNUM, (intptr_t)&ifn, &rval,
335 	    CRED());
336 	if (rval != 0) {
337 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
338 		    "ksocket_ioctl(SIOCGIFNUM) returned: %d", rval);
339 		return (rval);
340 	}
341 
342 	numifs = ifn;
343 	if (numifs <= 0) {
344 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No interfaces found");
345 		return (0);
346 	}
347 
348 	/* allocate extra room in case more interfaces appear */
349 	numifs += 10;
350 
351 	/* get the interface names and ip addresses */
352 	bufsize = numifs * sizeof (struct ifreq);
353 	buf = kmem_alloc(bufsize, KM_SLEEP);
354 
355 	ifc.ifc_len = bufsize;
356 	ifc.ifc_buf = buf;
357 	rc = ksocket_ioctl(so4, SIOCGIFCONF, (intptr_t)&ifc, &rval, CRED());
358 	if (rc != 0) {
359 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
360 		    "SIOCGLIFCONF failed: %d", rc);
361 		kmem_free(buf, bufsize);
362 		return (rc);
363 	}
364 	/* if our extra room is used up, try again */
365 	if (bufsize <= ifc.ifc_len) {
366 		kmem_free(buf, bufsize);
367 		buf = NULL;
368 		goto retry_count;
369 	}
370 	/* calc actual number of ifconfs */
371 	n = ifc.ifc_len / sizeof (struct ifreq);
372 
373 	/*
374 	 * Count the RDS interfaces
375 	 */
376 	for (i = 0, j = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
377 
378 		/*
379 		 * Copy as the SIOCGIFFLAGS ioctl is destructive
380 		 */
381 		bcopy(lp, &ifr, sizeof (struct ifreq));
382 		/*
383 		 * fetch the flags using the socket of the correct family
384 		 */
385 		switch (ifr.ifr_addr.sa_family) {
386 		case AF_INET:
387 			rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
388 			    &rval, CRED());
389 			break;
390 		default:
391 			continue;
392 		}
393 
394 		if (rc != 0) continue;
395 
396 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
397 		    "1. ifr_name: %s, flags: %d", ifr.ifr_name,
398 		    (ushort_t)ifr.ifr_flags);
399 
400 		/*
401 		 * If we got the flags, skip uninteresting
402 		 * interfaces based on flags
403 		 */
404 		if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
405 			continue;
406 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
407 		    "2. ifr_name: %s, flags: %d", ifr.ifr_name,
408 		    (ushort_t)ifr.ifr_flags);
409 		if (((ushort_t)ifr.ifr_flags) &
410 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
411 			continue;
412 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
413 		    "3. ifr_name: %s, flags: %d", ifr.ifr_name,
414 		    (ushort_t)ifr.ifr_flags);
415 		if (!rdsv3_capable_interface_old(&ifr))
416 			continue;
417 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
418 		    "4. ifr_name: %s, flags: %d", ifr.ifr_name,
419 		    (ushort_t)ifr.ifr_flags);
420 		j++;
421 	}
422 
423 	if (j <= 0) {
424 		RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old", "No RDS interfaces");
425 		kmem_free(buf, bufsize);
426 		return (rval);
427 	}
428 
429 	numifs = j;
430 
431 	/* This is the buffer we pass back */
432 	rbufsize = numifs * sizeof (struct ifreq);
433 	rbuf = kmem_alloc(rbufsize, KM_SLEEP);
434 	rlp = (struct ifreq *)rbuf;
435 
436 	/*
437 	 * Examine the array of interfaces and filter uninteresting ones
438 	 */
439 	for (i = 0, lp = ifc.ifc_req; i < n; i++, lp++) {
440 
441 		/*
442 		 * Copy the address as the SIOCGIFFLAGS ioctl is destructive
443 		 */
444 		bcopy(lp, &ifr, sizeof (struct ifreq));
445 		/*
446 		 * fetch the flags using the socket of the correct family
447 		 */
448 		switch (ifr.ifr_addr.sa_family) {
449 		case AF_INET:
450 			rc = ksocket_ioctl(so4, SIOCGIFFLAGS, (intptr_t)&ifr,
451 			    &rval, CRED());
452 			break;
453 		default:
454 			continue;
455 		}
456 
457 
458 		if (rc != 0) {
459 			RDSV3_DPRINTF2("rdsv3_do_ip_ioctl_old",
460 			    "ksocket_ioctl failed: %d for %s",
461 			    rc, ifr.ifr_name);
462 			continue;
463 		}
464 
465 		/*
466 		 * If we got the flags, skip uninteresting
467 		 * interfaces based on flags
468 		 */
469 		if ((((ushort_t)ifr.ifr_flags) & IFF_UP) != IFF_UP)
470 			continue;
471 		if (((ushort_t)ifr.ifr_flags) &
472 		    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
473 			continue;
474 		if (!rdsv3_capable_interface_old(&ifr))
475 			continue;
476 
477 		/* save the record */
478 		bcopy(lp, rlp, sizeof (struct ifreq));
479 		rlp->ifr_addr.sa_family = AF_INET_OFFLOAD;
480 		rlp++;
481 	}
482 
483 	kmem_free(buf, bufsize);
484 
485 	*ipaddrs = rbuf;
486 	*size = rbufsize;
487 	*nifs = numifs;
488 
489 	RDSV3_DPRINTF4("rdsv3_do_ip_ioctl_old", "Return");
490 
491 	return (rval);
492 }
493 
494 boolean_t
495 rdsv3_isloopback(ipaddr_t addr)
496 {
497 	ip_stack_t *ipst;
498 
499 	ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip;
500 	ASSERT(ipst != NULL);
501 	if (ip_type_v4(addr, ipst) != IRE_LOOPBACK) {
502 		netstack_rele(ipst->ips_netstack);
503 		return (B_FALSE);
504 	}
505 	netstack_rele(ipst->ips_netstack);
506 	return (B_TRUE);
507 }
508 
509 /*
510  * Work Queue Implementation
511  */
512 
513 #define	RDSV3_WQ_THREAD_IDLE		0
514 #define	RDSV3_WQ_THREAD_RUNNING		1
515 #define	RDSV3_WQ_THREAD_FLUSHING	2
516 #define	RDSV3_WQ_THREAD_EXITING		3
517 
518 /* worker thread */
519 void
520 rdsv3_worker_thread(void *arg)
521 {
522 	rdsv3_workqueue_struct_t *wq = arg;
523 	rdsv3_work_t *work;
524 
525 	RDSV3_DPRINTF4("rdsv3_worker_thread", "Enter(wq: 0x%p)", wq);
526 
527 	mutex_enter(&wq->wq_lock);
528 	work = list_remove_head(&wq->wq_queue);
529 	while (work) {
530 		mutex_exit(&wq->wq_lock);
531 
532 		/* process work */
533 		work->func(work);
534 
535 		mutex_enter(&wq->wq_lock);
536 		work = list_remove_head(&wq->wq_queue);
537 	}
538 
539 	/* No more work, go home, until called again */
540 	if (wq->wq_state != RDSV3_WQ_THREAD_EXITING) {
541 		wq->wq_state = RDSV3_WQ_THREAD_IDLE;
542 	}
543 	mutex_exit(&wq->wq_lock);
544 
545 	RDSV3_DPRINTF4("rdsv3_worker_thread", "Return(wq: 0x%p)", wq);
546 }
547 
548 /* XXX */
549 void
550 rdsv3_flush_workqueue(rdsv3_workqueue_struct_t *wq)
551 {
552 	RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Enter(wq: %p)", wq);
553 
554 	mutex_enter(&wq->wq_lock);
555 	switch (wq->wq_state) {
556 	case RDSV3_WQ_THREAD_IDLE:
557 		/* nothing to do */
558 		ASSERT(list_is_empty(&wq->wq_queue));
559 		break;
560 
561 	case RDSV3_WQ_THREAD_RUNNING:
562 		wq->wq_state = RDSV3_WQ_THREAD_FLUSHING;
563 		/* FALLTHRU */
564 	case RDSV3_WQ_THREAD_FLUSHING:
565 		/* already flushing, wait until the flushing is complete */
566 		do {
567 			mutex_exit(&wq->wq_lock);
568 			delay(drv_usectohz(1000000));
569 			mutex_enter(&wq->wq_lock);
570 		} while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
571 		break;
572 	case RDSV3_WQ_THREAD_EXITING:
573 		mutex_exit(&wq->wq_lock);
574 		rdsv3_worker_thread(wq);
575 		return;
576 	}
577 	mutex_exit(&wq->wq_lock);
578 
579 	RDSV3_DPRINTF4("rdsv3_flush_workqueue", "Return(wq: %p)", wq);
580 }
581 
582 void
583 rdsv3_queue_work(rdsv3_workqueue_struct_t *wq, rdsv3_work_t *wp)
584 {
585 	RDSV3_DPRINTF4("rdsv3_queue_work", "Enter(wq: %p, wp: %p)", wq, wp);
586 
587 	mutex_enter(&wq->wq_lock);
588 
589 	if (list_link_active(&wp->work_item)) {
590 		/* This is already in the queue, ignore this call */
591 		mutex_exit(&wq->wq_lock);
592 		RDSV3_DPRINTF3("rdsv3_queue_work", "already queued: %p", wp);
593 		return;
594 	}
595 
596 	switch (wq->wq_state) {
597 	case RDSV3_WQ_THREAD_RUNNING:
598 		list_insert_tail(&wq->wq_queue, wp);
599 		mutex_exit(&wq->wq_lock);
600 		break;
601 
602 	case RDSV3_WQ_THREAD_FLUSHING:
603 		do {
604 			mutex_exit(&wq->wq_lock);
605 			delay(drv_usectohz(1000000));
606 			mutex_enter(&wq->wq_lock);
607 		} while (wq->wq_state == RDSV3_WQ_THREAD_FLUSHING);
608 
609 		if (wq->wq_state == RDSV3_WQ_THREAD_RUNNING) {
610 			list_insert_tail(&wq->wq_queue, wp);
611 			mutex_exit(&wq->wq_lock);
612 			break;
613 		}
614 		/* FALLTHRU */
615 
616 	case RDSV3_WQ_THREAD_IDLE:
617 		list_insert_tail(&wq->wq_queue, wp);
618 		wq->wq_state = RDSV3_WQ_THREAD_RUNNING;
619 		mutex_exit(&wq->wq_lock);
620 
621 		(void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_worker_thread, wq,
622 		    DDI_SLEEP);
623 		break;
624 
625 	case RDSV3_WQ_THREAD_EXITING:
626 		mutex_exit(&wq->wq_lock);
627 		break;
628 	}
629 
630 	RDSV3_DPRINTF4("rdsv3_queue_work", "Return(wq: %p, wp: %p)", wq, wp);
631 }
632 
633 /* timeout handler for delayed work queuing */
634 void
635 rdsv3_work_timeout_handler(void *arg)
636 {
637 	rdsv3_delayed_work_t *dwp = (rdsv3_delayed_work_t *)arg;
638 
639 	RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
640 	    "Enter(wq: %p, wp: %p)", dwp->wq, &dwp->work);
641 
642 	mutex_enter(&dwp->lock);
643 	dwp->timeid = 0;
644 	mutex_exit(&dwp->lock);
645 
646 	mutex_enter(&dwp->wq->wq_lock);
647 	dwp->wq->wq_pending--;
648 	if (dwp->wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
649 		mutex_exit(&dwp->wq->wq_lock);
650 		return;
651 	}
652 	mutex_exit(&dwp->wq->wq_lock);
653 
654 	rdsv3_queue_work(dwp->wq, &dwp->work);
655 
656 	RDSV3_DPRINTF4("rdsv3_work_timeout_handler",
657 	    "Return(wq: %p, wp: %p)", dwp->wq, &dwp->work);
658 }
659 
660 void
661 rdsv3_queue_delayed_work(rdsv3_workqueue_struct_t *wq,
662     rdsv3_delayed_work_t *dwp, uint_t delay)
663 {
664 	RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
665 	    "Enter(wq: %p, wp: %p)", wq, dwp);
666 
667 	if (delay == 0) {
668 		rdsv3_queue_work(wq, &dwp->work);
669 		return;
670 	}
671 
672 	mutex_enter(&wq->wq_lock);
673 	if (wq->wq_state == RDSV3_WQ_THREAD_EXITING) {
674 		mutex_exit(&wq->wq_lock);
675 		RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
676 		    "WQ exiting - don't queue (wq: %p, wp: %p)", wq, dwp);
677 		return;
678 	}
679 	wq->wq_pending++;
680 	mutex_exit(&wq->wq_lock);
681 
682 	mutex_enter(&dwp->lock);
683 	if (dwp->timeid == 0) {
684 		dwp->wq = wq;
685 		dwp->timeid = timeout(rdsv3_work_timeout_handler, dwp,
686 		    jiffies + (delay * rdsv3_one_sec_in_hz));
687 		mutex_exit(&dwp->lock);
688 	} else {
689 		mutex_exit(&dwp->lock);
690 		RDSV3_DPRINTF4("rdsv3_queue_delayed_work", "Already queued: %p",
691 		    dwp);
692 		mutex_enter(&wq->wq_lock);
693 		wq->wq_pending--;
694 		mutex_exit(&wq->wq_lock);
695 	}
696 
697 	RDSV3_DPRINTF4("rdsv3_queue_delayed_work",
698 	    "Return(wq: %p, wp: %p)", wq, dwp);
699 }
700 
701 void
702 rdsv3_cancel_delayed_work(rdsv3_delayed_work_t *dwp)
703 {
704 	RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
705 	    "Enter(wq: %p, dwp: %p)", dwp->wq, dwp);
706 
707 	mutex_enter(&dwp->lock);
708 	if (dwp->timeid != 0) {
709 		(void) untimeout(dwp->timeid);
710 		dwp->timeid = 0;
711 	} else {
712 		RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
713 		    "Nothing to cancel (wq: %p, dwp: %p)", dwp->wq, dwp);
714 		mutex_exit(&dwp->lock);
715 		return;
716 	}
717 	mutex_exit(&dwp->lock);
718 
719 	mutex_enter(&dwp->wq->wq_lock);
720 	dwp->wq->wq_pending--;
721 	mutex_exit(&dwp->wq->wq_lock);
722 
723 	RDSV3_DPRINTF4("rdsv3_cancel_delayed_work",
724 	    "Return(wq: %p, dwp: %p)", dwp->wq, dwp);
725 }
726 
727 void
728 rdsv3_destroy_task_workqueue(rdsv3_workqueue_struct_t *wq)
729 {
730 	RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Enter");
731 
732 	ASSERT(wq);
733 
734 	mutex_enter(&wq->wq_lock);
735 	wq->wq_state = RDSV3_WQ_THREAD_EXITING;
736 
737 	while (wq->wq_pending > 0) {
738 		mutex_exit(&wq->wq_lock);
739 		delay(drv_usectohz(1000000));
740 		mutex_enter(&wq->wq_lock);
741 	};
742 	mutex_exit(&wq->wq_lock);
743 
744 	rdsv3_flush_workqueue(wq);
745 
746 	list_destroy(&wq->wq_queue);
747 	mutex_destroy(&wq->wq_lock);
748 	kmem_free(wq, sizeof (rdsv3_workqueue_struct_t));
749 
750 	ASSERT(rdsv3_taskq);
751 	ddi_taskq_destroy(rdsv3_taskq);
752 
753 	wq = NULL;
754 	rdsv3_taskq = NULL;
755 
756 	RDSV3_DPRINTF2("rdsv3_destroy_workqueue", "Return");
757 }
758 
/*
 * Work-item handler that runs rdsv3_rdma_init().  The `work' argument is
 * not used.
 */
/* ARGSUSED */
void
rdsv3_rdma_init_worker(struct rdsv3_work_s *work)
{
	rdsv3_rdma_init();
}
765 
766 #define	RDSV3_NUM_TASKQ_THREADS	4
767 rdsv3_workqueue_struct_t *
768 rdsv3_create_task_workqueue(char *name)
769 {
770 	rdsv3_workqueue_struct_t	*wq;
771 
772 	RDSV3_DPRINTF2("create_singlethread_workqueue", "Enter (dip: %p)",
773 	    rdsv3_dev_info);
774 
775 	rdsv3_taskq = ddi_taskq_create(rdsv3_dev_info, name,
776 	    RDSV3_NUM_TASKQ_THREADS, TASKQ_DEFAULTPRI, 0);
777 	if (rdsv3_taskq == NULL) {
778 		RDSV3_DPRINTF2(__FILE__,
779 		    "ddi_taskq_create failed for rdsv3_taskq");
780 		return (NULL);
781 	}
782 
783 	wq = kmem_zalloc(sizeof (rdsv3_workqueue_struct_t), KM_NOSLEEP);
784 	if (wq == NULL) {
785 		RDSV3_DPRINTF2(__FILE__, "kmem_zalloc failed for wq");
786 		ddi_taskq_destroy(rdsv3_taskq);
787 		return (NULL);
788 	}
789 
790 	list_create(&wq->wq_queue, sizeof (struct rdsv3_work_s),
791 	    offsetof(struct rdsv3_work_s, work_item));
792 	mutex_init(&wq->wq_lock, NULL, MUTEX_DRIVER, NULL);
793 	wq->wq_state = RDSV3_WQ_THREAD_IDLE;
794 	wq->wq_pending = 0;
795 	rdsv3_one_sec_in_hz = drv_usectohz(1000000);
796 
797 	RDSV3_DPRINTF2("create_singlethread_workqueue", "Return");
798 
799 	return (wq);
800 }
801 
802 /*
803  * Implementation for struct sock
804  */
805 
806 void
807 rdsv3_sock_exit_data(struct rsock *sk)
808 {
809 	struct rdsv3_sock *rs = sk->sk_protinfo;
810 
811 	RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
812 
813 	ASSERT(rs != NULL);
814 	ASSERT(rdsv3_sk_sock_flag(sk, SOCK_DEAD));
815 
816 	rs->rs_sk = NULL;
817 
818 	list_destroy(&rs->rs_send_queue);
819 	list_destroy(&rs->rs_notify_queue);
820 	list_destroy(&rs->rs_recv_queue);
821 
822 	rw_destroy(&rs->rs_recv_lock);
823 	mutex_destroy(&rs->rs_lock);
824 
825 	mutex_destroy(&rs->rs_rdma_lock);
826 	avl_destroy(&rs->rs_rdma_keys);
827 
828 	rdsv3_exit_waitqueue(sk->sk_sleep);
829 	kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t));
830 	mutex_destroy(&sk->sk_lock);
831 
832 	kmem_cache_free(rdsv3_alloc_cache, sk);
833 	RDSV3_DPRINTF4("rdsv3_sock_exit_data", "rs: %p sk: %p", rs, sk);
834 }
835 
836 /* XXX - figure out right values */
837 #define	RDSV3_RECV_HIWATER	(256 * 1024)
838 #define	RDSV3_RECV_LOWATER	128
839 #define	RDSV3_XMIT_HIWATER	(256 * 1024)
840 #define	RDSV3_XMIT_LOWATER	1024
841 
842 struct rsock *
843 rdsv3_sk_alloc()
844 {
845 	struct rsock *sk;
846 
847 	sk = kmem_cache_alloc(rdsv3_alloc_cache, KM_SLEEP);
848 	if (sk == NULL) {
849 		RDSV3_DPRINTF2("rdsv3_create", "kmem_cache_alloc failed");
850 		return (NULL);
851 	}
852 
853 	bzero(sk, sizeof (struct rsock) + sizeof (struct rdsv3_sock));
854 	return (sk);
855 }
856 
857 void
858 rdsv3_sock_init_data(struct rsock *sk)
859 {
860 	sk->sk_sleep = kmem_zalloc(sizeof (rdsv3_wait_queue_t), KM_SLEEP);
861 	rdsv3_init_waitqueue(sk->sk_sleep);
862 
863 	mutex_init(&sk->sk_lock, NULL, MUTEX_DRIVER, NULL);
864 	sk->sk_refcount = 1;
865 	sk->sk_protinfo = (struct rdsv3_sock *)(sk + 1);
866 	sk->sk_sndbuf = RDSV3_XMIT_HIWATER;
867 	sk->sk_rcvbuf = RDSV3_RECV_HIWATER;
868 }
869 
870 /*
871  * Connection cache
872  */
873 /* ARGSUSED */
874 int
875 rdsv3_conn_constructor(void *buf, void *arg, int kmflags)
876 {
877 	struct rdsv3_connection *conn = buf;
878 
879 	bzero(conn, sizeof (struct rdsv3_connection));
880 
881 	conn->c_next_tx_seq = 1;
882 	mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL);
883 	mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL);
884 	list_create(&conn->c_send_queue, sizeof (struct rdsv3_message),
885 	    offsetof(struct rdsv3_message, m_conn_item));
886 	list_create(&conn->c_retrans, sizeof (struct rdsv3_message),
887 	    offsetof(struct rdsv3_message, m_conn_item));
888 	return (0);
889 }
890 
891 /* ARGSUSED */
892 void
893 rdsv3_conn_destructor(void *buf, void *arg)
894 {
895 	struct rdsv3_connection *conn = buf;
896 
897 	ASSERT(list_is_empty(&conn->c_send_queue));
898 	ASSERT(list_is_empty(&conn->c_retrans));
899 	list_destroy(&conn->c_send_queue);
900 	list_destroy(&conn->c_retrans);
901 	mutex_destroy(&conn->c_send_lock);
902 	mutex_destroy(&conn->c_lock);
903 }
904 
905 int
906 rdsv3_conn_compare(const void *conn1, const void *conn2)
907 {
908 	uint32_be_t	laddr1, faddr1, laddr2, faddr2;
909 
910 	laddr1 = ((rdsv3_conn_info_t *)conn1)->c_laddr;
911 	laddr2 = ((struct rdsv3_connection *)conn2)->c_laddr;
912 
913 	if (laddr1 == laddr2) {
914 		faddr1 = ((rdsv3_conn_info_t *)conn1)->c_faddr;
915 		faddr2 = ((struct rdsv3_connection *)conn2)->c_faddr;
916 		if (faddr1 == faddr2)
917 			return (0);
918 		if (faddr1 < faddr2)
919 			return (-1);
920 		return (1);
921 	}
922 
923 	if (laddr1 < laddr2)
924 		return (-1);
925 
926 	return (1);
927 }
928 
929 /* rdsv3_ib_incoming cache */
930 /* ARGSUSED */
931 int
932 rdsv3_ib_inc_constructor(void *buf, void *arg, int kmflags)
933 {
934 	list_create(&((struct rdsv3_ib_incoming *)buf)->ii_frags,
935 	    sizeof (struct rdsv3_page_frag),
936 	    offsetof(struct rdsv3_page_frag, f_item));
937 
938 	return (0);
939 }
940 
941 /* ARGSUSED */
942 void
943 rdsv3_ib_inc_destructor(void *buf, void *arg)
944 {
945 	list_destroy(&((struct rdsv3_ib_incoming *)buf)->ii_frags);
946 }
947 
948 /* ib_frag_slab cache */
949 /* ARGSUSED */
950 int
951 rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags)
952 {
953 	struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf;
954 	struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg;
955 	ibt_iov_attr_t iov_attr;
956 	ibt_iov_t iov_arr[1];
957 	ibt_all_wr_t wr;
958 
959 	bzero(frag, sizeof (struct rdsv3_page_frag));
960 	list_link_init(&frag->f_item);
961 
962 	frag->f_page = kmem_alloc(PAGE_SIZE, kmflags);
963 	if (frag->f_page == NULL) {
964 		RDSV3_DPRINTF2("rdsv3_ib_frag_constructor",
965 		    "kmem_alloc for %d failed", PAGE_SIZE);
966 		return (-1);
967 	}
968 	frag->f_offset = 0;
969 
970 	iov_attr.iov_as = NULL;
971 	iov_attr.iov = &iov_arr[0];
972 	iov_attr.iov_buf = NULL;
973 	iov_attr.iov_list_len = 1;
974 	iov_attr.iov_wr_nds = 1;
975 	iov_attr.iov_lso_hdr_sz = 0;
976 	iov_attr.iov_flags = IBT_IOV_SLEEP | IBT_IOV_RECV;
977 
978 	iov_arr[0].iov_addr = frag->f_page;
979 	iov_arr[0].iov_len = PAGE_SIZE;
980 
981 	wr.recv.wr_nds = 1;
982 	wr.recv.wr_sgl = &frag->f_sge;
983 
984 	if (ibt_map_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev),
985 	    &iov_attr, &wr, &frag->f_mapped) != IBT_SUCCESS) {
986 		RDSV3_DPRINTF2("rdsv3_ib_frag_constructor",
987 		    "ibt_map_mem_iov failed");
988 		kmem_free(frag->f_page, PAGE_SIZE);
989 		return (-1);
990 	}
991 
992 	return (0);
993 }
994 
995 /* ARGSUSED */
996 void
997 rdsv3_ib_frag_destructor(void *buf, void *arg)
998 {
999 	struct rdsv3_page_frag *frag = (struct rdsv3_page_frag *)buf;
1000 	struct rdsv3_ib_device *rds_ibdev = (struct rdsv3_ib_device *)arg;
1001 
1002 	/* unmap the page */
1003 	if (ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(rds_ibdev->dev),
1004 	    frag->f_mapped) != IBT_SUCCESS)
1005 		RDSV3_DPRINTF2("rdsv3_ib_frag_destructor",
1006 		    "ibt_unmap_mem_iov failed");
1007 
1008 	/* free the page */
1009 	kmem_free(frag->f_page, PAGE_SIZE);
1010 }
1011 
1012 /* loop.c */
1013 extern kmutex_t loop_conns_lock;
1014 extern list_t loop_conns;
1015 
1016 struct rdsv3_loop_connection
1017 {
1018 	struct list_node loop_node;
1019 	struct rdsv3_connection *conn;
1020 };
1021 
1022 void
1023 rdsv3_loop_init(void)
1024 {
1025 	list_create(&loop_conns, sizeof (struct rdsv3_loop_connection),
1026 	    offsetof(struct rdsv3_loop_connection, loop_node));
1027 	mutex_init(&loop_conns_lock, NULL, MUTEX_DRIVER, NULL);
1028 }
1029 
1030 /* rdma.c */
1031 /* IB Rkey is used here for comparison */
1032 int
1033 rdsv3_mr_compare(const void *mr1, const void *mr2)
1034 {
1035 	uint32_t key1 = *(uint32_t *)mr1;
1036 	uint32_t key2 = ((struct rdsv3_mr *)mr2)->r_key;
1037 
1038 	if (key1 < key2)
1039 		return (-1);
1040 	if (key1 > key2)
1041 		return (1);
1042 	return (0);
1043 }
1044 
1045 /* transport.c */
1046 extern struct rdsv3_transport *transports[];
1047 extern krwlock_t		trans_sem;
1048 
1049 void
1050 rdsv3_trans_exit(void)
1051 {
1052 	struct rdsv3_transport *trans;
1053 	int i;
1054 
1055 	RDSV3_DPRINTF2("rdsv3_trans_exit", "Enter");
1056 
1057 	/* currently, only IB transport */
1058 	rw_enter(&trans_sem, RW_READER);
1059 	trans = NULL;
1060 	for (i = 0; i < RDS_TRANS_COUNT; i++) {
1061 		if (transports[i]) {
1062 			trans = transports[i];
1063 			break;
1064 		}
1065 	}
1066 	rw_exit(&trans_sem);
1067 
1068 	/* trans->exit() will remove the trans from the list */
1069 	if (trans)
1070 		trans->exit();
1071 
1072 	rw_destroy(&trans_sem);
1073 
1074 	RDSV3_DPRINTF2("rdsv3_trans_exit", "Return");
1075 }
1076 
1077 void
1078 rdsv3_trans_init()
1079 {
1080 	RDSV3_DPRINTF2("rdsv3_trans_init", "Enter");
1081 
1082 	rw_init(&trans_sem, NULL, RW_DRIVER, NULL);
1083 
1084 	RDSV3_DPRINTF2("rdsv3_trans_init", "Return");
1085 }
1086 
1087 int
1088 rdsv3_put_cmsg(struct nmsghdr *msg, int level, int type, size_t size,
1089 	void *payload)
1090 {
1091 	struct cmsghdr *cp;
1092 	char *bp;
1093 	size_t cmlen;
1094 	size_t cmspace;
1095 	size_t bufsz;
1096 
1097 	RDSV3_DPRINTF4("rdsv3_put_cmsg",
1098 	    "Enter(msg: %p level: %d type: %d sz: %d)",
1099 	    msg, level, type, size);
1100 
1101 	if (msg == NULL || msg->msg_controllen == 0 || payload == NULL) {
1102 		return (0);
1103 	}
1104 	/* check for first cmsg or this is another cmsg to be appended */
1105 	if (msg->msg_control == NULL)
1106 		msg->msg_controllen = 0;
1107 
1108 	cmlen = CMSG_LEN(size);
1109 	cmspace = CMSG_SPACE(size);
1110 	bufsz = msg->msg_controllen + cmspace;
1111 
1112 	/* extend the existing cmsg to append the next cmsg */
1113 	bp = kmem_alloc(bufsz, KM_SLEEP);
1114 	if (msg->msg_control) {
1115 		bcopy(msg->msg_control, bp, msg->msg_controllen);
1116 		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
1117 	}
1118 
1119 	/* assign payload the proper cmsg location */
1120 	cp = (struct cmsghdr *)(bp + msg->msg_controllen);
1121 	cp->cmsg_len = cmlen;
1122 	cp->cmsg_level = level;
1123 	cp->cmsg_type = type;
1124 
1125 	bcopy(payload, CMSG_DATA(cp), cmlen -
1126 	    (unsigned int)_CMSG_DATA_ALIGN(sizeof (struct cmsghdr)));
1127 
1128 	msg->msg_control = bp;
1129 	msg->msg_controllen = bufsz;
1130 
1131 	RDSV3_DPRINTF4("rdsv3_put_cmsg", "Return(cmsg_len: %d)", cp->cmsg_len);
1132 
1133 	return (0);
1134 }
1135 
1136 /* bind.c */
1137 extern kmutex_t rdsv3_bind_lock;
1138 extern avl_tree_t rdsv3_bind_tree;
1139 
1140 /* ARGSUSED */
1141 int
1142 rdsv3_verify_bind_address(ipaddr_t addr)
1143 {
1144 	return (1);
1145 }
1146 
1147 int
1148 rdsv3_bind_node_compare(const void *a, const void *b)
1149 {
1150 	uint64_t			needle = *(uint64_t *)a;
1151 	struct rdsv3_sock		*rs = (struct rdsv3_sock *)b;
1152 
1153 	if (needle > (((uint64_t)rs->rs_bound_addr << 32) | rs->rs_bound_port))
1154 		return (+1);
1155 	else if (needle <
1156 	    (((uint64_t)rs->rs_bound_addr << 32) | rs->rs_bound_port))
1157 		return (-1);
1158 
1159 	return (0);
1160 }
1161 
1162 void
1163 rdsv3_bind_tree_init()
1164 {
1165 	RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Enter");
1166 
1167 	mutex_init(&rdsv3_bind_lock, NULL, MUTEX_DRIVER, NULL);
1168 	avl_create(&rdsv3_bind_tree, rdsv3_bind_node_compare,
1169 	    sizeof (struct rdsv3_sock),
1170 	    offsetof(struct rdsv3_sock, rs_bound_node));
1171 
1172 	RDSV3_DPRINTF4("rdsv3_bind_tree_init", "Return");
1173 }
1174 
1175 void
1176 rdsv3_bind_tree_exit()
1177 {
1178 	RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Enter");
1179 
1180 	ASSERT(avl_is_empty(&rdsv3_bind_tree));
1181 	avl_destroy(&rdsv3_bind_tree);
1182 	mutex_destroy(&rdsv3_bind_lock);
1183 
1184 	RDSV3_DPRINTF2("rdsv3_bind_tree_exit", "Return");
1185 }
1186 
1187 /* checksum */
1188 uint16_t
1189 rdsv3_ip_fast_csum(void *hdr, size_t length)
1190 {
1191 	return (0xffff &
1192 	    (uint16_t)(~ip_ocsum((ushort_t *)hdr, (int)length <<1, 0)));
1193 }
1194 
1195 /* scatterlist implementation */
1196 /* ARGSUSED */
1197 caddr_t
1198 rdsv3_ib_sg_dma_address(ib_device_t *dev, struct rdsv3_scatterlist *scat,
1199     uint_t offset)
1200 {
1201 	return (0);
1202 }
1203 
1204 uint_t
1205 rdsv3_ib_dma_map_sg(struct ib_device *dev, struct rdsv3_scatterlist *scat,
1206     uint_t num)
1207 {
1208 	struct rdsv3_scatterlist *s, *first;
1209 	ibt_iov_t *iov;
1210 	ibt_wr_ds_t *sgl;
1211 	ibt_iov_attr_t iov_attr;
1212 	ibt_send_wr_t swr;
1213 	uint_t i;
1214 
1215 	RDSV3_DPRINTF4("rdsv3_ib_dma_map_sg", "scat %p, num: %d", scat, num);
1216 
1217 	s = first = &scat[0];
1218 	ASSERT(first->mihdl == NULL);
1219 
1220 	iov = kmem_alloc(num * sizeof (ibt_iov_t), KM_SLEEP);
1221 	sgl = kmem_zalloc((num * 2) *  sizeof (ibt_wr_ds_t), KM_SLEEP);
1222 
1223 	for (i = 0; i < num; i++, s++) {
1224 		iov[i].iov_addr = s->vaddr;
1225 		iov[i].iov_len = s->length;
1226 	}
1227 
1228 	iov_attr.iov_as = NULL;
1229 	iov_attr.iov = iov;
1230 	iov_attr.iov_buf = NULL;
1231 	iov_attr.iov_list_len = num;
1232 	iov_attr.iov_wr_nds = num * 2;
1233 	iov_attr.iov_lso_hdr_sz = 0;
1234 	iov_attr.iov_flags = IBT_IOV_SLEEP;
1235 
1236 	swr.wr_sgl = sgl;
1237 
1238 	i = ibt_map_mem_iov(ib_get_ibt_hca_hdl(dev),
1239 	    &iov_attr, (ibt_all_wr_t *)&swr, &first->mihdl);
1240 	kmem_free(iov, num * sizeof (ibt_iov_t));
1241 	if (i != IBT_SUCCESS) {
1242 		RDSV3_DPRINTF2("rdsv3_ib_dma_map_sg",
1243 		    "ibt_map_mem_iov returned: %d", i);
1244 		return (0);
1245 	}
1246 
1247 	s = first;
1248 	for (i = 0; i < num; i++, s++, sgl++) {
1249 		s->sgl = sgl;
1250 	}
1251 
1252 	return (num);
1253 }
1254 
1255 void
1256 rdsv3_ib_dma_unmap_sg(ib_device_t *dev, struct rdsv3_scatterlist *scat,
1257     uint_t num)
1258 {
1259 	/* Zero length messages have no scatter gather entries */
1260 	if (num != 0) {
1261 		ASSERT(scat->mihdl != NULL);
1262 		ASSERT(scat->sgl != NULL);
1263 
1264 		(void) ibt_unmap_mem_iov(ib_get_ibt_hca_hdl(dev), scat->mihdl);
1265 
1266 		kmem_free(scat->sgl, (num * 2)  * sizeof (ibt_wr_ds_t));
1267 		scat->sgl = NULL;
1268 		scat->mihdl = NULL;
1269 	}
1270 }
1271 
1272 int
1273 rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
1274 {
1275 	caddr_t addr;
1276 	size_t size;
1277 	ibt_mr_attr_t mr_attr;
1278 	ibt_mr_desc_t mr_desc;
1279 	ibt_mr_hdl_t mr_hdl;
1280 	int ret;
1281 
1282 	RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Enter(dev: %p)", dev);
1283 
1284 	ASSERT(ic->i_mr == NULL);
1285 
1286 	size = (ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr + 1) *
1287 	    sizeof (struct rdsv3_header);
1288 
1289 	addr = kmem_zalloc(size, KM_NOSLEEP);
1290 	if (addr == NULL)
1291 		return (-1);
1292 
1293 	mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)addr;
1294 	mr_attr.mr_len = size;
1295 	mr_attr.mr_as = NULL;
1296 	mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
1297 	ret = ibt_register_mr(ib_get_ibt_hca_hdl(dev), RDSV3_PD2PDHDL(ic->i_pd),
1298 	    &mr_attr, &mr_hdl, &mr_desc);
1299 	if (ret != IBT_SUCCESS) {
1300 		RDSV3_DPRINTF2("rdsv3_ib_alloc_hdrs",
1301 		    "ibt_register_mr returned: " "%d", ret);
1302 		return (-1);
1303 	}
1304 
1305 	ic->i_mr =
1306 	    (struct rdsv3_hdrs_mr *)kmem_alloc(sizeof (struct rdsv3_hdrs_mr),
1307 	    KM_SLEEP);
1308 	ic->i_mr->addr = addr;
1309 	ic->i_mr->size = size;
1310 	ic->i_mr->hdl =	mr_hdl;
1311 	ic->i_mr->lkey = mr_desc.md_lkey;
1312 
1313 	ic->i_send_hdrs = (struct rdsv3_header *)addr;
1314 	ic->i_send_hdrs_dma = (uint64_t)(uintptr_t)addr;
1315 
1316 	ic->i_recv_hdrs = (struct rdsv3_header *)(addr +
1317 	    (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
1318 	ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr +
1319 	    (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header)));
1320 	ic->i_recv_tasklet_cpuid = -1;
1321 
1322 	ic->i_ack = (struct rdsv3_header *)(addr +
1323 	    ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
1324 	    sizeof (struct rdsv3_header)));
1325 	ic->i_ack_dma = (uint64_t)(uintptr_t)(addr +
1326 	    ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) *
1327 	    sizeof (struct rdsv3_header)));
1328 
1329 	RDSV3_DPRINTF4("rdsv3_ib_alloc_hdrs", "Return(dev: %p)", dev);
1330 
1331 	return (0);
1332 }
1333 
1334 void
1335 rdsv3_ib_free_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic)
1336 {
1337 	RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Enter(dev: %p)", dev);
1338 	ASSERT(ic->i_mr != NULL);
1339 
1340 	ic->i_send_hdrs = NULL;
1341 	ic->i_send_hdrs_dma = NULL;
1342 
1343 	ic->i_recv_hdrs = NULL;
1344 	ic->i_recv_hdrs_dma = NULL;
1345 
1346 	ic->i_ack = NULL;
1347 	ic->i_ack_dma = NULL;
1348 
1349 	(void) ibt_deregister_mr(ib_get_ibt_hca_hdl(dev), ic->i_mr->hdl);
1350 
1351 	kmem_free(ic->i_mr->addr, ic->i_mr->size);
1352 	kmem_free(ic->i_mr, sizeof (struct rdsv3_hdrs_mr));
1353 
1354 	ic->i_mr = NULL;
1355 	RDSV3_DPRINTF4("rdsv3_ib_free_hdrs", "Return(dev: %p)", dev);
1356 }
1357 
1358 /*
1359  * atomic_add_unless - add unless the number is a given value
1360  * @v: pointer of type atomic_t
1361  * @a: the amount to add to v...
1362  * @u: ...unless v is equal to u.
1363  *
1364  * Atomically adds @a to @v, so long as it was not @u.
1365  * Returns non-zero if @v was not @u, and zero otherwise.
1366  */
1367 int
1368 atomic_add_unless(atomic_t *v, uint_t a, ulong_t u)
1369 {
1370 	uint_t c, old;
1371 
1372 	c = *v;
1373 	while (c != u && (old = atomic_cas_uint(v, c, c + a)) != c) {
1374 		c = old;
1375 	}
1376 	return ((ulong_t)c != u);
1377 }
1378